deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/aws/s3u.py CHANGED
@@ -1,51 +1,58 @@
-import ray
-import deltacat.aws.clients as aws_utils
 import logging
 import multiprocessing
-import s3fs
-import pyarrow as pa
-
 from functools import partial
+from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
 
-from ray.types import ObjectRef
-from ray.data.datasource import BlockWritePathProvider
+import pyarrow as pa
+import ray
+import s3fs
+from boto3.resources.base import ServiceResource
+from botocore.client import BaseClient
+from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
+from ray.data.datasource import BlockWritePathProvider
+from ray.types import ObjectRef
+from tenacity import (
+    Retrying,
+    retry_if_exception_type,
+    retry_if_not_exception_type,
+    stop_after_delay,
+    wait_random_exponential,
+)
 
+import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.storage import LocalTable, LocalDataset, DistributedDataset, \
-    Manifest, ManifestEntry, ManifestEntryList
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
-from deltacat.exceptions import RetryableError, NonRetryableError
-from deltacat.types.media import ContentType, ContentEncoding
-from deltacat.types.tables import TABLE_TYPE_TO_READER_FUNC, \
-    TABLE_CLASS_TO_SIZE_FUNC, get_table_length
-from deltacat.types.media import TableType
+from deltacat.exceptions import NonRetryableError, RetryableError
+from deltacat.storage import (
+    DistributedDataset,
+    LocalDataset,
+    LocalTable,
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+)
+from deltacat.types.media import ContentEncoding, ContentType, TableType
+from deltacat.types.tables import (
+    TABLE_CLASS_TO_SIZE_FUNC,
+    TABLE_TYPE_TO_READER_FUNC,
+    get_table_length,
+)
 from deltacat.utils.common import ReadKwargsProvider
 
-from boto3.resources.base import ServiceResource
-from botocore.client import BaseClient
-from botocore.exceptions import ClientError
-from tenacity import Retrying
-from tenacity import wait_random_exponential
-from tenacity import stop_after_delay
-from tenacity import retry_if_exception_type, retry_if_not_exception_type
-
-from typing import Any, Callable, Dict, List, Optional, Generator, Union, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+# TODO(raghumdani): refactor redshift datasource to reuse the
+# same module for writing output files.
+
 
-@ray.remote
 class CapturedBlockWritePaths:
     def __init__(self):
         self._write_paths: List[str] = []
         self._block_refs: List[ObjectRef[Block]] = []
 
-    def extend(
-            self,
-            write_paths: List[str],
-            block_refs: List[ObjectRef[Block]]):
+    def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
         try:
             iter(write_paths)
         except TypeError:
@@ -70,27 +77,29 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
     """Block write path provider implementation that writes each
    dataset block out to a file of the form: {base_path}/{uuid}
    """
-    def __init__(self, capture_actor: CapturedBlockWritePaths):
+
+    def __init__(self, capture_object: CapturedBlockWritePaths):
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
-        self.capture_actor = capture_actor
+        self.capture_object = capture_object
 
     def __del__(self):
         if self.write_paths or self.block_refs:
-            self.capture_actor.extend.remote(
+            self.capture_object.extend(
                 self.write_paths,
                 self.block_refs,
             )
 
     def _get_write_path_for_block(
-            self,
-            base_path: str,
-            *,
-            filesystem: Optional[pa.filesystem.FileSystem] = None,
-            dataset_uuid: Optional[str] = None,
-            block: Optional[ObjectRef[Block]] = None,
-            block_index: Optional[int] = None,
-            file_format: Optional[str] = None) -> str:
+        self,
+        base_path: str,
+        *,
+        filesystem: Optional[pa.filesystem.FileSystem] = None,
+        dataset_uuid: Optional[str] = None,
+        block: Optional[ObjectRef[Block]] = None,
+        block_index: Optional[int] = None,
+        file_format: Optional[str] = None,
+    ) -> str:
         write_path = f"{base_path}/{str(uuid4())}"
         self.write_paths.append(write_path)
         if block:
@@ -99,24 +108,18 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
 
 
 class S3Url:
-    def __init__(
-            self,
-            url: str):
+    def __init__(self, url: str):
 
         from urllib.parse import urlparse
 
-        self._parsed = urlparse(
-            url,
-            allow_fragments=False  # support '#' in path
-        )
+        self._parsed = urlparse(url, allow_fragments=False)  # support '#' in path
         if not self._parsed.scheme:  # support paths w/o 's3://' scheme
             url = f"s3://{url}"
             self._parsed = urlparse(url, allow_fragments=False)
         if self._parsed.query:  # support '?' in path
-            self.key = \
-                f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
+            self.key = f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
         else:
-            self.key = self._parsed.path.lstrip('/')
+            self.key = self._parsed.path.lstrip("/")
         self.bucket = self._parsed.netloc
         self.url = self._parsed.geturl()
 
@@ -125,9 +128,7 @@ def parse_s3_url(url: str) -> S3Url:
     return S3Url(url)
 
 
-def s3_resource_cache(
-        region: Optional[str],
-        **kwargs) -> ServiceResource:
+def s3_resource_cache(region: Optional[str], **kwargs) -> ServiceResource:
 
     return aws_utils.resource_cache(
         "s3",
@@ -136,36 +137,20 @@ def s3_resource_cache(
     )
 
 
-def s3_client_cache(
-        region: Optional[str],
-        **kwargs) -> BaseClient:
+def s3_client_cache(region: Optional[str], **kwargs) -> BaseClient:
 
-    return aws_utils.client_cache(
-        "s3",
-        region,
-        **kwargs
-    )
+    return aws_utils.client_cache("s3", region, **kwargs)
 
 
-def get_object_at_url(
-        url: str,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def get_object_at_url(url: str, **s3_client_kwargs) -> Dict[str, Any]:
 
-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs)
+    s3 = s3_client_cache(None, **s3_client_kwargs)
 
     parsed_s3_url = parse_s3_url(url)
-    return s3.get_object(
-        Bucket=parsed_s3_url.bucket,
-        Key=parsed_s3_url.key
-    )
+    return s3.get_object(Bucket=parsed_s3_url.bucket, Key=parsed_s3_url.key)
 
 
-def delete_files_by_prefix(
-        bucket: str,
-        prefix: str,
-        **s3_client_kwargs) -> None:
+def delete_files_by_prefix(bucket: str, prefix: str, **s3_client_kwargs) -> None:
 
     s3 = s3_resource_cache(None, **s3_client_kwargs)
     bucket = s3.Bucket(bucket)
@@ -189,14 +174,10 @@ def get_path_from_object(bucket, obj):
 
 
 def filter_objects_by_prefix(
-        bucket: str,
-        prefix: str,
-        **s3_client_kwargs) -> Generator[Dict[str, Any], None, None]:
+    bucket: str, prefix: str, **s3_client_kwargs
+) -> Generator[Dict[str, Any], None, None]:
 
-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs
-    )
+    s3 = s3_client_cache(None, **s3_client_kwargs)
     params = {"Bucket": bucket, "Prefix": prefix}
     more_objects_to_list = True
     while more_objects_to_list:
@@ -209,14 +190,15 @@ def filter_objects_by_prefix(
 
 
 def read_file(
-        s3_url: str,
-        content_type: ContentType,
-        content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
-        table_type: TableType = TableType.PYARROW,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        **s3_client_kwargs) -> LocalTable:
+    s3_url: str,
+    content_type: ContentType,
+    content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **s3_client_kwargs,
+) -> LocalTable:
 
     reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
     try:
@@ -227,34 +209,33 @@ def read_file(
             column_names,
             include_columns,
             file_reader_kwargs_provider,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
         return table
     except ClientError as e:
         if e.response["Error"]["Code"] in TIMEOUT_ERROR_CODES:
             # Timeout error not caught by botocore
-            raise RetryableError(f"Retry table download from: {s3_url}") \
-                from e
-        raise NonRetryableError(f"Failed table download from: {s3_url}") \
-            from e
+            raise RetryableError(f"Retry table download from: {s3_url}") from e
+        raise NonRetryableError(f"Failed table download from: {s3_url}") from e
 
 
 def upload_sliced_table(
-        table: Union[LocalTable, DistributedDataset],
-        s3_url_prefix: str,
-        s3_file_system: s3fs.S3FileSystem,
-        max_records_per_entry: Optional[int],
-        s3_table_writer_func: Callable,
-        table_slicer_func: Callable,
-        s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
-        content_type: ContentType = ContentType.PARQUET,
-        **s3_client_kwargs) -> ManifestEntryList:
+    table: Union[LocalTable, DistributedDataset],
+    s3_url_prefix: str,
+    s3_file_system: s3fs.S3FileSystem,
+    max_records_per_entry: Optional[int],
+    s3_table_writer_func: Callable,
+    table_slicer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:
 
     # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_exception_type(RetryableError)
+        retry=retry_if_exception_type(RetryableError),
     )
 
     manifest_entries = ManifestEntryList()
@@ -270,14 +251,11 @@ def upload_sliced_table(
             s3_table_writer_func,
             s3_table_writer_kwargs,
             content_type,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
     else:
         # iteratively write table slices
-        table_slices = table_slicer_func(
-            table,
-            max_records_per_entry
-        )
+        table_slices = table_slicer_func(table, max_records_per_entry)
         for table_slice in table_slices:
             slice_entries = retrying(
                 upload_table,
@@ -287,7 +265,7 @@ def upload_sliced_table(
                 s3_table_writer_func,
                 s3_table_writer_kwargs,
                 content_type,
-                **s3_client_kwargs
+                **s3_client_kwargs,
             )
             manifest_entries.extend(slice_entries)
 
@@ -303,15 +281,17 @@ def _block_metadata(block: Block) -> BlockMetadata:
 
 
 def _get_metadata(
-        table: Union[LocalTable, DistributedDataset],
-        write_paths: List[str],
-        block_refs: List[ObjectRef[Block]])-> List[BlockMetadata]:
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
     metadata: List[BlockMetadata] = []
     if not block_refs:
         # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, \
-            f"Expected table of type '{type(table)}' to be written to 1 " \
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
             f"file, but found {len(write_paths)} files."
+        )
         table_size = None
         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
         if table_size_func:
@@ -333,23 +313,27 @@ def _get_metadata(
         # metadata = dataset._blocks.get_metadata()
         # ray 2.0.0dev
         metadata = table._plan.execute().get_metadata()
-        if not metadata or metadata[0].size_bytes is None or \
-                metadata[0].num_rows is None:
-            metadata_futures = [_block_metadata.remote(block_ref)
-                                for block_ref
-                                in block_refs]
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
             metadata = ray.get(metadata_futures)
     return metadata
 
 
 def upload_table(
-        table: Union[LocalTable, DistributedDataset],
-        s3_base_url: str,
-        s3_file_system: s3fs.S3FileSystem,
-        s3_table_writer_func: Callable,
-        s3_table_writer_kwargs: Optional[Dict[str, Any]],
-        content_type: ContentType = ContentType.PARQUET,
-        **s3_client_kwargs) -> ManifestEntryList:
+    table: Union[LocalTable, DistributedDataset],
+    s3_base_url: str,
+    s3_file_system: s3fs.S3FileSystem,
+    s3_table_writer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:
     """
     Writes the given table to 1 or more S3 files and return Redshift
     manifest entries describing the uploaded files.
@@ -357,20 +341,20 @@ def upload_table(
     if s3_table_writer_kwargs is None:
         s3_table_writer_kwargs = {}
 
-    capture_actor = CapturedBlockWritePaths.remote()
-    block_write_path_provider = UuidBlockWritePathProvider(capture_actor)
+    capture_object = CapturedBlockWritePaths()
+    block_write_path_provider = UuidBlockWritePathProvider(capture_object)
     s3_table_writer_func(
         table,
         s3_base_url,
         s3_file_system,
         block_write_path_provider,
         content_type.value,
-        **s3_table_writer_kwargs
+        **s3_table_writer_kwargs,
     )
     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
     del block_write_path_provider
-    block_refs = ray.get(capture_actor.block_refs.remote())
-    write_paths = ray.get(capture_actor.write_paths.remote())
+    block_refs = capture_object.block_refs()
+    write_paths = capture_object.write_paths()
     metadata = _get_metadata(table, write_paths, block_refs)
     manifest_entries = ManifestEntryList()
     for block_idx, s3_url in enumerate(write_paths):
@@ -385,37 +369,42 @@ def upload_table(
         except ClientError as e:
             if e.response["Error"]["Code"] == "NoSuchKey":
                 # s3fs may swallow S3 errors - we were probably throttled
-                raise RetryableError(f"Retry table upload to: {s3_url}") \
-                    from e
-            raise NonRetryableError(f"Failed table upload to: {s3_url}") \
-                from e
+                raise RetryableError(f"Retry table upload to: {s3_url}") from e
+            raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
     return manifest_entries
 
 
 def download_manifest_entry(
-        manifest_entry: ManifestEntry,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        content_type: Optional[ContentType] = None,
-        content_encoding: Optional[ContentEncoding] = None) -> LocalTable:
-
-    s3_client_kwargs = {
-        "aws_access_key_id": token_holder["accessKeyId"],
-        "aws_secret_access_key": token_holder["secretAccessKey"],
-        "aws_session_token": token_holder["sessionToken"]
-    } if token_holder else {}
+    manifest_entry: ManifestEntry,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    content_type: Optional[ContentType] = None,
+    content_encoding: Optional[ContentEncoding] = None,
+) -> LocalTable:
+
+    s3_client_kwargs = (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+        }
+        if token_holder
+        else {}
+    )
     if not content_type:
         content_type = manifest_entry.meta.content_type
-        assert content_type, \
-            f"Unknown content type for manifest entry: {manifest_entry}"
+        assert (
+            content_type
+        ), f"Unknown content type for manifest entry: {manifest_entry}"
     content_type = ContentType(content_type)
     if not content_encoding:
         content_encoding = manifest_entry.meta.content_encoding
-        assert content_encoding, \
-            f"Unknown content encoding for manifest entry: {manifest_entry}"
+        assert (
+            content_encoding
+        ), f"Unknown content encoding for manifest entry: {manifest_entry}"
     content_encoding = ContentEncoding(content_encoding)
     s3_url = manifest_entry.uri
     if s3_url is None:
@@ -424,7 +413,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_not_exception_type(NonRetryableError)
+        retry=retry_if_not_exception_type(NonRetryableError),
     )
     table = retrying(
         read_file,
@@ -441,46 +430,36 @@ def download_manifest_entry(
 
 
 def _download_manifest_entries(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> Tuple[LocalDataset,Optional[List[int]]]:
-
-    if ignore_missing_manifest:
-        result = []
-        missing = []
-        for ide, e in enumerate(manifest.entries):
-            try:
-                tmp = download_manifest_entry(e, token_holder, table_type, column_names,
-                                              include_columns, file_reader_kwargs_provider)
-                result.append(tmp)
-            except Exception as e:
-                missing.append(ide)
-                logger.info(f"missing {len(missing)} manifest_entry")
-                pass
-
-        return result, missing
-    else:
-        return [
-            download_manifest_entry(e, token_holder, table_type, column_names,
-                                    include_columns, file_reader_kwargs_provider)
-            for e in manifest.entries
-        ]
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    return [
+        download_manifest_entry(
+            e,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+        )
+        for e in manifest.entries
+    ]
+
 
 def _download_manifest_entries_parallel(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        max_parallelism: Optional[int] = None,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> LocalDataset:
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
 
     tables = []
     pool = multiprocessing.Pool(max_parallelism)
@@ -498,22 +477,20 @@ def _download_manifest_entries_parallel(
 
 
 def download_manifest_entries(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        max_parallelism: Optional[int] = 1,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> Tuple[LocalDataset,Optional[List[int]]]:
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
 
     if max_parallelism and max_parallelism <= 1:
         return _download_manifest_entries(
             manifest,
             token_holder,
             table_type,
-            ignore_missing_manifest,
             column_names,
             include_columns,
             file_reader_kwargs_provider,
@@ -523,7 +500,6 @@ def download_manifest_entries(
         manifest,
         token_holder,
         table_type,
-        ignore_missing_manifest,
         max_parallelism,
         column_names,
         include_columns,
@@ -531,10 +507,7 @@ def download_manifest_entries(
     )
 
 
-def upload(
-        s3_url: str,
-        body,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
 
     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -547,9 +520,8 @@ def upload(
 
 
 def download(
-        s3_url: str,
-        fail_if_not_found: bool = True,
-        **s3_client_kwargs) -> Optional[Dict[str, Any]]:
+    s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
+) -> Optional[Dict[str, Any]]:
 
     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -563,15 +535,13 @@ def download(
         if fail_if_not_found:
             raise
         else:
-            if e.response['Error']['Code'] != "404":
-                if e.response['Error']['Code'] != 'NoSuchKey':
+            if e.response["Error"]["Code"] != "404":
+                if e.response["Error"]["Code"] != "NoSuchKey":
                     raise
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     except s3.exceptions.NoSuchKey:
         if fail_if_not_found:
             raise
         else:
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     return None