deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/aws/s3u.py CHANGED
@@ -1,171 +1,37 @@
  import logging
- import multiprocessing
- from functools import partial
- from typing import Any, Callable, Dict, Generator, List, Optional, Union
- from uuid import uuid4
+ from typing import Any, Dict, Generator, Optional
  from botocore.config import Config
  from deltacat.aws.constants import (
      BOTO_MAX_RETRIES,
-     UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY,
      BOTO_THROTTLING_ERROR_CODES,
+ )
+ from deltacat.constants import (
+     UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY,
      RETRYABLE_TRANSIENT_ERRORS,
-     BOTO_TIMEOUT_ERROR_CODES,
-     UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY,
-     DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
  )

- import pyarrow.fs
- import ray
- import s3fs
  from boto3.resources.base import ServiceResource
  from botocore.client import BaseClient
  from botocore.exceptions import ClientError
- from ray.data.block import Block, BlockAccessor, BlockMetadata
- from ray.data.datasource import FilenameProvider
- from ray.types import ObjectRef
  from tenacity import (
      Retrying,
      retry_if_exception_type,
      stop_after_delay,
      wait_random_exponential,
  )
- from deltacat.utils.ray_utils.concurrency import invoke_parallel
  import deltacat.aws.clients as aws_utils
  from deltacat import logs
- from deltacat.storage import (
-     DistributedDataset,
-     LocalDataset,
-     LocalTable,
-     Manifest,
-     ManifestEntry,
-     ManifestEntryList,
- )
- from deltacat.types.media import (
-     ContentEncoding,
-     ContentType,
-     TableType,
-     DistributedDatasetType,
- )
- from deltacat.types.tables import (
-     TABLE_CLASS_TO_SIZE_FUNC,
-     TABLE_TYPE_TO_S3_READER_FUNC,
-     TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
-     DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
-     get_table_length,
- )
  from deltacat.exceptions import (
      RetryableError,
-     RetryableUploadTableError,
-     RetryableDownloadTableError,
      RetryableDownloadFileError,
      RetryableUploadFileError,
      NonRetryableDownloadFileError,
      NonRetryableUploadFileError,
-     NonRetryableUploadTableError,
-     NonRetryableDownloadTableError,
  )
- from deltacat.types.partial_download import PartialFileDownloadParams
- from deltacat.utils.common import ReadKwargsProvider
- from deltacat.exceptions import categorize_errors

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- class CapturedBlockWritePaths:
-     def __init__(self):
-         self._write_paths: List[str] = []
-         self._block_refs: List[ObjectRef[Block]] = []
-
-     def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
-         try:
-             iter(write_paths)
-         except TypeError:
-             pass
-         else:
-             self._write_paths.extend(write_paths)
-         try:
-             iter(block_refs)
-         except TypeError:
-             pass
-         else:
-             self._block_refs.extend(block_refs)
-
-     def write_paths(self) -> List[str]:
-         return self._write_paths
-
-     def block_refs(self) -> List[ObjectRef[Block]]:
-         return self._block_refs
-
-
- class UuidBlockWritePathProvider(FilenameProvider):
-     """Block write path provider implementation that writes each
-     dataset block out to a file of the form: {base_path}/{uuid}
-     """
-
-     def __init__(
-         self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
-     ):
-         self.base_path = base_path
-         self.write_paths: List[str] = []
-         self.block_refs: List[ObjectRef[Block]] = []
-         self.capture_object = capture_object
-
-     def __del__(self):
-         if self.write_paths or self.block_refs:
-             self.capture_object.extend(
-                 self.write_paths,
-                 self.block_refs,
-             )
-
-     def get_filename_for_block(
-         self, block: Any, task_index: int, block_index: int
-     ) -> str:
-         if self.base_path is None:
-             raise ValueError(
-                 "Base path must be provided to UuidBlockWritePathProvider",
-             )
-         return self._get_write_path_for_block(
-             base_path=self.base_path,
-             block=block,
-             block_index=block_index,
-         )
-
-     def _get_write_path_for_block(
-         self,
-         base_path: str,
-         *,
-         filesystem: Optional[pyarrow.fs.FileSystem] = None,
-         dataset_uuid: Optional[str] = None,
-         block: Optional[ObjectRef[Block]] = None,
-         block_index: Optional[int] = None,
-         file_format: Optional[str] = None,
-     ) -> str:
-         write_path = f"{base_path}/{str(uuid4())}"
-         self.write_paths.append(write_path)
-         if block:
-             self.block_refs.append(block)
-         return write_path
-
-     def __call__(
-         self,
-         base_path: str,
-         *,
-         filesystem: Optional[pyarrow.fs.FileSystem] = None,
-         dataset_uuid: Optional[str] = None,
-         block: Optional[ObjectRef[Block]] = None,
-         block_index: Optional[int] = None,
-         file_format: Optional[str] = None,
-     ) -> str:
-         return self._get_write_path_for_block(
-             base_path,
-             filesystem=filesystem,
-             dataset_uuid=dataset_uuid,
-             block=block,
-             block_index=block_index,
-             file_format=file_format,
-         )
-
-
  class S3Url:
      def __init__(self, url: str):

@@ -248,312 +114,6 @@ def filter_objects_by_prefix(
          more_objects_to_list = params["ContinuationToken"] is not None


- @categorize_errors
- def read_file(
-     s3_url: str,
-     content_type: ContentType,
-     content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
-     table_type: TableType = TableType.PYARROW,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-     **s3_client_kwargs,
- ) -> LocalTable:
-
-     reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value]
-     try:
-         table = reader(
-             s3_url,
-             content_type.value,
-             content_encoding.value,
-             column_names,
-             include_columns,
-             file_reader_kwargs_provider,
-             partial_file_download_params,
-             **s3_client_kwargs,
-         )
-         return table
-     except ClientError as e:
-         if (
-             e.response["Error"]["Code"]
-             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
-         ):
-             # Timeout error not caught by botocore
-             raise RetryableDownloadTableError(
-                 f"Retry table download from: {s3_url} after receiving {type(e).__name__}",
-             ) from e
-         raise NonRetryableDownloadTableError(
-             f"Failed table download from: {s3_url} after receiving {type(e).__name__}"
-         ) from e
-     except RETRYABLE_TRANSIENT_ERRORS as e:
-         raise RetryableDownloadTableError(
-             f"Retry download for: {s3_url} after receiving {type(e).__name__}"
-         ) from e
-     except BaseException as e:
-         logger.warning(
-             f"Read has failed for {s3_url} and content_type={content_type} "
-             f"and encoding={content_encoding}. Error: {e}",
-             exc_info=True,
-         )
-         raise NonRetryableDownloadTableError(
-             f"Read has failed for {s3_url} and content_type={content_type} "
-             f"and encoding={content_encoding}",
-         ) from e
-
-
- def upload_sliced_table(
-     table: Union[LocalTable, DistributedDataset],
-     s3_url_prefix: str,
-     s3_file_system: s3fs.S3FileSystem,
-     max_records_per_entry: Optional[int],
-     s3_table_writer_func: Callable,
-     table_slicer_func: Callable,
-     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
-     content_type: ContentType = ContentType.PARQUET,
-     **s3_client_kwargs,
- ) -> ManifestEntryList:
-
-     # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
-     retrying = Retrying(
-         wait=wait_random_exponential(multiplier=1, max=60),
-         stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
-         retry=retry_if_exception_type(RetryableError),
-     )
-
-     manifest_entries = ManifestEntryList()
-     table_record_count = get_table_length(table)
-
-     if max_records_per_entry is None or not table_record_count:
-         # write the whole table to a single s3 file
-         manifest_entries = retrying(
-             upload_table,
-             table,
-             f"{s3_url_prefix}",
-             s3_file_system,
-             s3_table_writer_func,
-             s3_table_writer_kwargs,
-             content_type,
-             **s3_client_kwargs,
-         )
-     else:
-         # iteratively write table slices
-         table_slices = table_slicer_func(table, max_records_per_entry)
-         for table_slice in table_slices:
-             slice_entries = retrying(
-                 upload_table,
-                 table_slice,
-                 f"{s3_url_prefix}",
-                 s3_file_system,
-                 s3_table_writer_func,
-                 s3_table_writer_kwargs,
-                 content_type,
-                 **s3_client_kwargs,
-             )
-             manifest_entries.extend(slice_entries)
-     return manifest_entries
-
-
- def upload_table(
-     table: Union[LocalTable, DistributedDataset],
-     s3_base_url: str,
-     s3_file_system: s3fs.S3FileSystem,
-     s3_table_writer_func: Callable,
-     s3_table_writer_kwargs: Optional[Dict[str, Any]],
-     content_type: ContentType = ContentType.PARQUET,
-     **s3_client_kwargs,
- ) -> ManifestEntryList:
-     """
-     Writes the given table to 1 or more S3 files and return
-     manifest entries describing the uploaded files.
-     """
-     if s3_table_writer_kwargs is None:
-         s3_table_writer_kwargs = {}
-
-     capture_object = CapturedBlockWritePaths()
-     block_write_path_provider = UuidBlockWritePathProvider(capture_object)
-     s3_table_writer_func(
-         table,
-         s3_base_url,
-         s3_file_system,
-         block_write_path_provider,
-         content_type.value,
-         **s3_table_writer_kwargs,
-     )
-     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
-     del block_write_path_provider
-     block_refs = capture_object.block_refs()
-     write_paths = capture_object.write_paths()
-     metadata = _get_metadata(table, write_paths, block_refs)
-     manifest_entries = ManifestEntryList()
-     for block_idx, s3_url in enumerate(write_paths):
-         try:
-             manifest_entry = ManifestEntry.from_s3_obj_url(
-                 s3_url,
-                 metadata[block_idx].num_rows,
-                 metadata[block_idx].size_bytes,
-                 **s3_client_kwargs,
-             )
-             manifest_entries.append(manifest_entry)
-         except ClientError as e:
-             if e.response["Error"]["Code"] == "NoSuchKey":
-                 # s3fs may swallow S3 errors - we were probably throttled
-                 raise RetryableUploadTableError(
-                     f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
-                 ) from e
-             if (
-                 e.response["Error"]["Code"]
-                 in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
-             ):
-                 raise RetryableUploadTableError(
-                     f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
-                 ) from e
-             raise NonRetryableUploadTableError(
-                 f"Failed table upload to: {s3_url} after receiving {type(e).__name__}",
-             ) from e
-         except RETRYABLE_TRANSIENT_ERRORS as e:
-             raise RetryableUploadTableError(
-                 f"Retry upload for: {s3_url} after receiving {type(e).__name__}",
-             ) from e
-         except BaseException as e:
-             logger.warning(
-                 f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
-                 exc_info=True,
-             )
-             raise NonRetryableUploadTableError(
-                 f"Upload has failed for {s3_url} and content_type={content_type} because of {type(e).__name__}",
-             ) from e
-     return manifest_entries
-
-
- def download_manifest_entry(
-     manifest_entry: ManifestEntry,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     content_type: Optional[ContentType] = None,
-     content_encoding: Optional[ContentEncoding] = None,
- ) -> LocalTable:
-
-     s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
-     if not content_type:
-         content_type = manifest_entry.meta.content_type
-         assert (
-             content_type
-         ), f"Unknown content type for manifest entry: {manifest_entry}"
-     content_type = ContentType(content_type)
-     if not content_encoding:
-         content_encoding = manifest_entry.meta.content_encoding
-         assert (
-             content_encoding
-         ), f"Unknown content encoding for manifest entry: {manifest_entry}"
-     content_encoding = ContentEncoding(content_encoding)
-     s3_url = manifest_entry.uri
-     if s3_url is None:
-         s3_url = manifest_entry.url
-
-     partial_file_download_params = None
-     if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
-         for type_params in manifest_entry.meta.content_type_parameters:
-             if isinstance(type_params, PartialFileDownloadParams):
-                 partial_file_download_params = type_params
-                 break
-
-     # @retry decorator can't be pickled by Ray, so wrap download in Retrying
-     retrying = Retrying(
-         wait=wait_random_exponential(multiplier=1, max=60),
-         stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
-         retry=retry_if_exception_type(RetryableError),
-     )
-     table = retrying(
-         read_file,
-         s3_url,
-         content_type,
-         content_encoding,
-         table_type,
-         column_names,
-         include_columns,
-         file_reader_kwargs_provider,
-         partial_file_download_params,
-         **s3_client_kwargs,
-     )
-     return table
-
-
- @ray.remote
- def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
-     return download_manifest_entry(*args, **kwargs)
-
-
- def download_manifest_entries(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = 1,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
- ) -> LocalDataset:
-
-     if max_parallelism and max_parallelism <= 1:
-         return _download_manifest_entries(
-             manifest,
-             token_holder,
-             table_type,
-             column_names,
-             include_columns,
-             file_reader_kwargs_provider,
-         )
-     else:
-         return _download_manifest_entries_parallel(
-             manifest,
-             token_holder,
-             table_type,
-             max_parallelism,
-             column_names,
-             include_columns,
-             file_reader_kwargs_provider,
-         )
-
-
- def download_manifest_entries_distributed(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = 1000,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
-     distributed_dataset_type: Optional[
-         DistributedDatasetType
-     ] = DistributedDatasetType.RAY_DATASET,
- ) -> DistributedDataset:
-
-     params = {
-         "manifest": manifest,
-         "token_holder": token_holder,
-         "table_type": table_type,
-         "max_parallelism": max_parallelism,
-         "column_names": column_names,
-         "include_columns": include_columns,
-         "file_reader_kwargs_provider": file_reader_kwargs_provider,
-         "ray_options_provider": ray_options_provider,
-         "distributed_dataset_type": distributed_dataset_type,
-     }
-
-     if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
-         return _download_manifest_entries_ray_data_distributed(**params)
-     elif distributed_dataset_type is not None:
-         return _download_manifest_entries_all_dataset_distributed(**params)
-     else:
-         raise ValueError(
-             f"Distributed dataset type {distributed_dataset_type} not supported."
-         )
-
-
  def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

      parsed_s3_url = parse_s3_url(s3_url)
@@ -643,61 +203,6 @@ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True
      return None


- def _download_manifest_entries_parallel(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = None,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
- ) -> LocalDataset:
-
-     tables = []
-     pool = multiprocessing.Pool(max_parallelism)
-     downloader = partial(
-         download_manifest_entry,
-         token_holder=token_holder,
-         table_type=table_type,
-         column_names=column_names,
-         include_columns=include_columns,
-         file_reader_kwargs_provider=file_reader_kwargs_provider,
-     )
-     for table in pool.map(downloader, [e for e in manifest.entries]):
-         tables.append(table)
-     return tables
-
-
- def _download_manifest_entries(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
- ) -> LocalDataset:
-
-     return [
-         download_manifest_entry(
-             manifest_entry=e,
-             token_holder=token_holder,
-             table_type=table_type,
-             column_names=column_names,
-             include_columns=include_columns,
-             file_reader_kwargs_provider=file_reader_kwargs_provider,
-         )
-         for e in manifest.entries
-     ]
-
-
- @ray.remote
- def _block_metadata(block: Block) -> BlockMetadata:
-     return BlockAccessor.for_block(block).get_metadata(
-         input_files=None,
-         exec_stats=None,
-     )
-
-
  def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
      conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
      return (
@@ -710,135 +215,3 @@ def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
          if token_holder
          else {"config": conf}
      )
-
-
- def _get_metadata(
-     table: Union[LocalTable, DistributedDataset],
-     write_paths: List[str],
-     block_refs: List[ObjectRef[Block]],
- ) -> List[BlockMetadata]:
-     metadata: List[BlockMetadata] = []
-     if not block_refs:
-         # this must be a local table - ensure it was written to only 1 file
-         assert len(write_paths) == 1, (
-             f"Expected table of type '{type(table)}' to be written to 1 "
-             f"file, but found {len(write_paths)} files."
-         )
-         table_size = None
-         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-         if table_size_func:
-             table_size = table_size_func(table)
-         else:
-             logger.warning(f"Unable to estimate '{type(table)}' table size.")
-         metadata.append(
-             BlockMetadata(
-                 num_rows=get_table_length(table),
-                 size_bytes=table_size,
-                 schema=None,
-                 input_files=None,
-                 exec_stats=None,
-             )
-         )
-     else:
-         # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-         # ray 1.10
-         # metadata = dataset._blocks.get_metadata()
-         # ray 2.0.0dev
-         metadata = table._plan.execute().get_metadata()
-         if (
-             not metadata
-             or metadata[0].size_bytes is None
-             or metadata[0].num_rows is None
-         ):
-             metadata_futures = [
-                 _block_metadata.remote(block_ref) for block_ref in block_refs
-             ]
-             metadata = ray.get(metadata_futures)
-     return metadata
-
-
- def _download_manifest_entries_ray_data_distributed(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = 1000,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
- ) -> DistributedDataset:
-
-     table_pending_ids = []
-     manifest_entries = manifest.entries
-     if manifest_entries:
-         table_pending_ids = invoke_parallel(
-             manifest_entries,
-             download_manifest_entry_ray,
-             token_holder,
-             table_type,
-             column_names,
-             include_columns,
-             file_reader_kwargs_provider,
-             max_parallelism=max_parallelism,
-             options_provider=ray_options_provider,
-         )
-     return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
-
-
- def _download_manifest_entries_all_dataset_distributed(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = 1000,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
-     distributed_dataset_type: Optional[
-         DistributedDatasetType
-     ] = DistributedDatasetType.RAY_DATASET,
- ) -> DistributedDataset:
-
-     entry_content_type = None
-     entry_content_encoding = None
-     uris = []
-     for entry in manifest.entries or []:
-         if (
-             entry_content_type is not None
-             and entry_content_type != entry.meta.content_type
-         ):
-             raise ValueError(
-                 f"Mixed content types of ({entry_content_type},"
-                 f" {entry.meta.content_type}) is not supported."
-             )
-
-         if (
-             entry_content_encoding is not None
-             and entry_content_encoding != entry.meta.content_encoding
-         ):
-             raise ValueError(
-                 f"Mixed content encoding of {entry_content_encoding},"
-                 f" {entry.meta.content_encoding} is not supported."
-             )
-
-         entry_content_type = entry.meta.content_type
-         entry_content_encoding = entry.meta.content_encoding
-         uris.append(entry.uri)
-
-     s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
-
-     if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
-         return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
-             uris=uris,
-             content_type=entry_content_type,
-             content_encoding=entry_content_encoding,
-             column_names=column_names,
-             include_columns=include_columns,
-             read_func_kwargs_provider=file_reader_kwargs_provider,
-             ray_options_provider=ray_options_provider,
-             s3_client_kwargs=s3_client_kwargs,
-         )
-     else:
-         raise ValueError(
-             f"Unsupported distributed dataset type={distributed_dataset_type}"
-         )