deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -8,6 +8,11 @@ from deltacat.api import (
|
|
8
8
|
list,
|
9
9
|
put,
|
10
10
|
)
|
11
|
+
from deltacat.storage.model.transaction import (
|
12
|
+
transaction,
|
13
|
+
transactions,
|
14
|
+
read_transaction,
|
15
|
+
)
|
11
16
|
from deltacat.catalog import ( # noqa: F401
|
12
17
|
alter_namespace,
|
13
18
|
alter_table,
|
@@ -28,6 +33,7 @@ from deltacat.catalog import ( # noqa: F401
|
|
28
33
|
truncate_table,
|
29
34
|
write_to_table,
|
30
35
|
init,
|
36
|
+
init_local,
|
31
37
|
is_initialized,
|
32
38
|
clear_catalogs,
|
33
39
|
get_catalog,
|
@@ -44,6 +50,12 @@ from deltacat.compute import (
|
|
44
50
|
local_job_client,
|
45
51
|
)
|
46
52
|
from deltacat.storage import (
|
53
|
+
BucketingStrategy,
|
54
|
+
BucketTransform,
|
55
|
+
BucketTransformParameters,
|
56
|
+
DayTransform,
|
57
|
+
HourTransform,
|
58
|
+
IdentityTransform,
|
47
59
|
Dataset,
|
48
60
|
DistributedDataset,
|
49
61
|
Field,
|
@@ -59,6 +71,17 @@ from deltacat.storage import (
|
|
59
71
|
SortKey,
|
60
72
|
SortOrder,
|
61
73
|
SortScheme,
|
74
|
+
TableProperties,
|
75
|
+
TransactionStatus,
|
76
|
+
Transform,
|
77
|
+
TransformName,
|
78
|
+
TransformParameters,
|
79
|
+
TruncateTransform,
|
80
|
+
TruncateTransformParameters,
|
81
|
+
TruncateStrategy,
|
82
|
+
UnknownTransform,
|
83
|
+
VoidTransform,
|
84
|
+
YearTransform,
|
62
85
|
NullOrder,
|
63
86
|
)
|
64
87
|
from deltacat.types.media import (
|
@@ -67,10 +90,26 @@ from deltacat.types.media import (
|
|
67
90
|
DatasetType,
|
68
91
|
DatastoreType,
|
69
92
|
)
|
70
|
-
|
71
|
-
|
93
|
+
from deltacat.types.tables import (
|
94
|
+
TableWriteMode,
|
95
|
+
TableProperty,
|
96
|
+
TableReadOptimizationLevel,
|
97
|
+
SchemaEvolutionMode,
|
98
|
+
from_pandas,
|
99
|
+
from_pyarrow,
|
100
|
+
from_manifest_table,
|
101
|
+
to_pyarrow,
|
102
|
+
to_pandas,
|
103
|
+
dataset_length,
|
104
|
+
dataset_size,
|
105
|
+
dataset_column_names,
|
106
|
+
dataset_schema,
|
107
|
+
)
|
72
108
|
from deltacat.utils.url import DeltaCatUrl
|
73
109
|
|
110
|
+
write = write_to_table
|
111
|
+
read = read_table
|
112
|
+
|
74
113
|
__iceberg__ = []
|
75
114
|
if importlib.util.find_spec("pyiceberg") is not None:
|
76
115
|
from deltacat.experimental.catalog.iceberg import ( # noqa: F401
|
@@ -83,7 +122,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
|
|
83
122
|
|
84
123
|
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
|
85
124
|
|
86
|
-
__version__ = "2.0.
|
125
|
+
__version__ = "2.0.0b12"
|
87
126
|
|
88
127
|
|
89
128
|
__all__ = [
|
@@ -94,6 +133,8 @@ __all__ = [
|
|
94
133
|
"get",
|
95
134
|
"list",
|
96
135
|
"put",
|
136
|
+
"transaction",
|
137
|
+
"transactions",
|
97
138
|
"alter_table",
|
98
139
|
"create_table",
|
99
140
|
"drop_table",
|
@@ -110,9 +151,13 @@ __all__ = [
|
|
110
151
|
"create_namespace",
|
111
152
|
"drop_namespace",
|
112
153
|
"default_namespace",
|
154
|
+
"write",
|
113
155
|
"write_to_table",
|
156
|
+
"read",
|
114
157
|
"read_table",
|
158
|
+
"read_transaction",
|
115
159
|
"init",
|
160
|
+
"init_local",
|
116
161
|
"is_initialized",
|
117
162
|
"clear_catalogs",
|
118
163
|
"get_catalog",
|
@@ -120,6 +165,18 @@ __all__ = [
|
|
120
165
|
"pop_catalog",
|
121
166
|
"put_catalog",
|
122
167
|
"raise_if_not_initialized",
|
168
|
+
"dataset_length",
|
169
|
+
"dataset_size",
|
170
|
+
"dataset_column_names",
|
171
|
+
"dataset_schema",
|
172
|
+
"from_pandas",
|
173
|
+
"from_pyarrow",
|
174
|
+
"from_manifest_table",
|
175
|
+
"to_pandas",
|
176
|
+
"to_pyarrow",
|
177
|
+
"BucketingStrategy",
|
178
|
+
"BucketTransform",
|
179
|
+
"BucketTransformParameters",
|
123
180
|
"Catalog",
|
124
181
|
"CatalogProperties",
|
125
182
|
"ContentType",
|
@@ -127,13 +184,17 @@ __all__ = [
|
|
127
184
|
"Dataset",
|
128
185
|
"DatasetType",
|
129
186
|
"DatastoreType",
|
187
|
+
"DayTransform",
|
130
188
|
"DeltaCatUrl",
|
131
189
|
"DistributedDataset",
|
132
190
|
"Field",
|
191
|
+
"HourTransform",
|
192
|
+
"IdentityTransform",
|
133
193
|
"LifecycleState",
|
134
194
|
"ListResult",
|
135
195
|
"LocalDataset",
|
136
196
|
"LocalTable",
|
197
|
+
"MonthTransform",
|
137
198
|
"Namespace",
|
138
199
|
"NullOrder",
|
139
200
|
"PartitionKey",
|
@@ -145,6 +206,20 @@ __all__ = [
|
|
145
206
|
"SortScheme",
|
146
207
|
"TableDefinition",
|
147
208
|
"TableWriteMode",
|
209
|
+
"TableProperties",
|
210
|
+
"TableProperty",
|
211
|
+
"TableReadOptimizationLevel",
|
212
|
+
"SchemaEvolutionMode",
|
213
|
+
"TransactionStatus",
|
214
|
+
"Transform",
|
215
|
+
"TransformName",
|
216
|
+
"TransformParameters",
|
217
|
+
"TruncateTransform",
|
218
|
+
"TruncateTransformParameters",
|
219
|
+
"TruncateStrategy",
|
220
|
+
"UnknownTransform",
|
221
|
+
"VoidTransform",
|
222
|
+
"YearTransform",
|
148
223
|
]
|
149
224
|
|
150
225
|
__all__ += __iceberg__
|
deltacat/api.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import time
|
2
2
|
from dataclasses import dataclass
|
3
3
|
from typing import Any, Union, List, Optional, Dict, Callable, Tuple
|
4
|
+
import logging
|
4
5
|
|
5
6
|
import ray
|
6
7
|
import deltacat as dc
|
@@ -15,6 +16,12 @@ from deltacat.io import (
|
|
15
16
|
DeltacatReadType,
|
16
17
|
)
|
17
18
|
from deltacat.storage import (
|
19
|
+
Namespace,
|
20
|
+
Table,
|
21
|
+
TableVersion,
|
22
|
+
Stream,
|
23
|
+
Partition,
|
24
|
+
Delta,
|
18
25
|
Dataset,
|
19
26
|
DistributedDataset,
|
20
27
|
ListResult,
|
@@ -44,6 +51,9 @@ from deltacat.utils.ray_utils.runtime import (
|
|
44
51
|
other_live_node_resource_keys,
|
45
52
|
find_max_single_node_resource_type,
|
46
53
|
)
|
54
|
+
from deltacat import logs
|
55
|
+
|
56
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
47
57
|
|
48
58
|
"""
|
49
59
|
# CLI Example of Copying from Source to Dest without file conversion
|
@@ -72,38 +82,6 @@ from deltacat.utils.ray_utils.runtime import (
|
|
72
82
|
"""
|
73
83
|
|
74
84
|
|
75
|
-
def _copy_dc(
|
76
|
-
source: DeltaCatUrl,
|
77
|
-
destination: DeltaCatUrl,
|
78
|
-
recursive: bool = False,
|
79
|
-
) -> Metafile:
|
80
|
-
if recursive:
|
81
|
-
src_obj = list(source, recursive=True)
|
82
|
-
else:
|
83
|
-
src_obj = get(source) if not source.url.endswith("/*") else list(source)
|
84
|
-
"""
|
85
|
-
dc_dest_url = DeltacatUrl(destination)
|
86
|
-
# TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
|
87
|
-
# Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
|
88
|
-
# is lazy, and only indexes metadata about the objects at source instead
|
89
|
-
# of eagerly converting them to PyArrow-based Blocks.
|
90
|
-
dc_dest_url.writer(src_obj, recursive=recursive)
|
91
|
-
"""
|
92
|
-
|
93
|
-
src_parts = source.url.split("/")
|
94
|
-
src_parts = [part for part in src_parts if part]
|
95
|
-
dst_parts = destination.url.split("/")
|
96
|
-
dst_parts = [part for part in dst_parts if part]
|
97
|
-
dc.raise_if_not_initialized()
|
98
|
-
if len(src_parts) != len(dst_parts):
|
99
|
-
# TODO(pdames): Better error message.
|
100
|
-
raise ValueError(
|
101
|
-
f"Cannot copy {source} to {destination}. "
|
102
|
-
f"Source and destination must share the same type."
|
103
|
-
)
|
104
|
-
return put(destination, metafile=src_obj)
|
105
|
-
|
106
|
-
|
107
85
|
def copy(
|
108
86
|
src: DeltaCatUrl,
|
109
87
|
dst: DeltaCatUrl,
|
@@ -123,6 +101,7 @@ def copy(
|
|
123
101
|
"gz": 35,
|
124
102
|
"bz2": 35,
|
125
103
|
"zip": 35,
|
104
|
+
"zst": 35,
|
126
105
|
"7z": 35,
|
127
106
|
"*": 2.5,
|
128
107
|
},
|
@@ -135,11 +114,10 @@ def copy(
|
|
135
114
|
Copies data from the source datastore to the destination datastore. By
|
136
115
|
default, this method launches one parallel Ray process to read/transform
|
137
116
|
each input file found in the source followed by one parallel Ray process
|
138
|
-
to write each output file to the destination.
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
before starting parallel processing.
|
117
|
+
to write each output file to the destination. To ensure that adequate
|
118
|
+
resources are available to complete the operation, you may optionally
|
119
|
+
specify minimum cluster and/or worker CPUs to wait for before starting
|
120
|
+
parallel processing.
|
143
121
|
|
144
122
|
Args:
|
145
123
|
src: DeltaCAT URL of the source datastore to read.
|
@@ -190,6 +168,73 @@ def copy(
|
|
190
168
|
)
|
191
169
|
|
192
170
|
|
171
|
+
def _copy_objects_in_order(
|
172
|
+
src_objects: List[Metafile],
|
173
|
+
destination: DeltaCatUrl,
|
174
|
+
) -> Union[Metafile, List[Metafile]]:
|
175
|
+
dc_dest_url = DeltaCatUrl(destination.url)
|
176
|
+
catalog_name = dc_dest_url.catalog_name
|
177
|
+
|
178
|
+
copied_results = []
|
179
|
+
|
180
|
+
# Group objects by type for hierarchical copying
|
181
|
+
# Copy objects in strict hierarchical order
|
182
|
+
# Namespace -> Table -> TableVersion -> Stream -> Partition -> Delta
|
183
|
+
ordered_objects_by_type = {
|
184
|
+
Namespace: [],
|
185
|
+
Table: [],
|
186
|
+
TableVersion: [],
|
187
|
+
Stream: [],
|
188
|
+
Partition: [],
|
189
|
+
Delta: [],
|
190
|
+
}
|
191
|
+
|
192
|
+
for obj in src_objects:
|
193
|
+
obj_class = Metafile.get_class(obj.to_serializable())
|
194
|
+
ordered_objects_by_type[obj_class].append(obj)
|
195
|
+
|
196
|
+
# TODO(pdames): Support copying uncommitted streams/partitions.
|
197
|
+
# TODO(pdames): Support parallel/distributed copies.
|
198
|
+
for obj_class, objects in ordered_objects_by_type.items():
|
199
|
+
if objects:
|
200
|
+
logger.info(f"Copying {len(objects)} {obj_class} objects...")
|
201
|
+
if obj_class == TableVersion:
|
202
|
+
# sort table versions by ascending table version
|
203
|
+
objects.sort(key=lambda x: x.current_version_number())
|
204
|
+
if obj_class == Delta:
|
205
|
+
# sort deltas by ascending stream position
|
206
|
+
objects.sort(key=lambda x: x.stream_position)
|
207
|
+
for i, obj in enumerate(objects):
|
208
|
+
logger.info(f"Copying object {i+1}/{len(objects)}: {obj.url}")
|
209
|
+
dest_url = DeltaCatUrl(obj.url(catalog_name=catalog_name))
|
210
|
+
logger.info(f"Destination URL for object {i+1}/{len(objects)}: {dest_url}")
|
211
|
+
result = put(dest_url, metafile=obj)
|
212
|
+
copied_results.append(result)
|
213
|
+
logger.info(f"Successfully copied object {i+1}/{len(objects)}")
|
214
|
+
return copied_results[0] if len(copied_results) == 1 else copied_results
|
215
|
+
|
216
|
+
|
217
|
+
def _copy_dc(
|
218
|
+
source: DeltaCatUrl,
|
219
|
+
destination: DeltaCatUrl,
|
220
|
+
recursive: bool = False,
|
221
|
+
) -> Union[Metafile, List[Metafile]]:
|
222
|
+
dc.raise_if_not_initialized()
|
223
|
+
if len(source.url.split("/")) != len(destination.url.split("/")):
|
224
|
+
# TODO(pdames): Better error message.
|
225
|
+
raise ValueError(
|
226
|
+
f"Cannot copy {source} to {destination}. "
|
227
|
+
f"Source and destination must share the same type."
|
228
|
+
)
|
229
|
+
if recursive:
|
230
|
+
src_objects = list(DeltaCatUrl(source.url.rstrip("/**")), recursive=True)
|
231
|
+
elif source.url.endswith("/*"):
|
232
|
+
src_objects = list(DeltaCatUrl(source.url.rstrip("/*")))
|
233
|
+
else:
|
234
|
+
src_objects = [get(source)]
|
235
|
+
return _copy_objects_in_order(src_objects, destination)
|
236
|
+
|
237
|
+
|
193
238
|
def concat(source, destination):
|
194
239
|
raise NotImplementedError
|
195
240
|
|
@@ -214,9 +259,13 @@ def _list_all_metafiles(
|
|
214
259
|
metafiles: ListResult[Metafile] = lister(**kwargs)
|
215
260
|
list_results.append(metafiles)
|
216
261
|
if recursive:
|
262
|
+
# Process each level of the hierarchy
|
263
|
+
current_level_metafiles = [mf for mf in metafiles.all_items()]
|
264
|
+
|
217
265
|
for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
|
266
|
+
next_level_metafiles = []
|
218
267
|
# each subsequent lister needs to inject missing keyword args from the parent metafile
|
219
|
-
for metafile in
|
268
|
+
for metafile in current_level_metafiles:
|
220
269
|
kwargs_update = (
|
221
270
|
{kwarg_name: kwarg_val_resolver_fn(metafile)}
|
222
271
|
if kwarg_name and kwarg_val_resolver_fn
|
@@ -226,8 +275,11 @@ def _list_all_metafiles(
|
|
226
275
|
**kwargs,
|
227
276
|
**kwargs_update,
|
228
277
|
}
|
229
|
-
|
230
|
-
list_results.append(
|
278
|
+
child_metafiles = lister(**lister_kwargs)
|
279
|
+
list_results.append(child_metafiles)
|
280
|
+
next_level_metafiles.extend(child_metafiles.all_items())
|
281
|
+
# Move to the next level for the next iteration
|
282
|
+
current_level_metafiles = next_level_metafiles
|
231
283
|
return [
|
232
284
|
metafile for list_result in list_results for metafile in list_result.all_items()
|
233
285
|
]
|
@@ -308,7 +360,7 @@ def put(
|
|
308
360
|
*args,
|
309
361
|
**kwargs,
|
310
362
|
) -> Union[Metafile, str]:
|
311
|
-
writer = DeltaCatUrlWriter(url, metafile)
|
363
|
+
writer = DeltaCatUrlWriter(url, metafile=metafile)
|
312
364
|
return writer.write(*args, **kwargs)
|
313
365
|
|
314
366
|
|
@@ -351,6 +403,7 @@ def _copy_external_ray(
|
|
351
403
|
"gz": 35,
|
352
404
|
"bz2": 35,
|
353
405
|
"zip": 35,
|
406
|
+
"zst": 35,
|
354
407
|
"7z": 35,
|
355
408
|
"*": 2.5,
|
356
409
|
},
|
@@ -359,7 +412,7 @@ def _copy_external_ray(
|
|
359
412
|
writer_args: Dict[str, Any] = {},
|
360
413
|
filesystem: pafs.FileSystem = None,
|
361
414
|
) -> str:
|
362
|
-
|
415
|
+
logger.info(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
|
363
416
|
|
364
417
|
if not isinstance(src, DeltaCatUrl):
|
365
418
|
raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")
|
@@ -367,30 +420,32 @@ def _copy_external_ray(
|
|
367
420
|
# wait for required resources
|
368
421
|
head_cpu_count = int(current_node_resources()["CPU"])
|
369
422
|
if minimum_worker_cpus > 0:
|
370
|
-
|
423
|
+
logger.info(f"Waiting for {minimum_worker_cpus} worker CPUs...")
|
371
424
|
live_cpu_waiter(
|
372
425
|
min_live_cpus=minimum_worker_cpus + head_cpu_count,
|
373
426
|
)
|
374
|
-
|
427
|
+
logger.info(f"{minimum_worker_cpus} worker CPUs found!")
|
375
428
|
# start job execution
|
376
429
|
cluster_resources = ray.cluster_resources()
|
377
|
-
|
378
|
-
|
430
|
+
logger.info(f"Cluster Resources: {cluster_resources}")
|
431
|
+
logger.info(f"Available Cluster Resources: {ray.available_resources()}")
|
379
432
|
cluster_cpus = int(cluster_resources["CPU"])
|
380
|
-
|
433
|
+
logger.info(f"Cluster CPUs: {cluster_cpus}")
|
381
434
|
all_node_resource_keys = live_node_resource_keys()
|
382
|
-
|
435
|
+
logger.info(
|
436
|
+
f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}"
|
437
|
+
)
|
383
438
|
worker_node_resource_keys = other_live_node_resource_keys()
|
384
|
-
|
439
|
+
logger.info(
|
385
440
|
f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
|
386
441
|
)
|
387
442
|
worker_cpu_count = cluster_cpus - head_cpu_count
|
388
|
-
|
443
|
+
logger.info(f"Total worker CPUs: {worker_cpu_count}")
|
389
444
|
|
390
445
|
# estimate memory requirements based on file extension
|
391
446
|
estimated_memory_bytes = 0
|
392
447
|
if extension_to_memory_multiplier:
|
393
|
-
|
448
|
+
logger.info(f"Resolving stats collection filesystem for: {src.url_path}.")
|
394
449
|
path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
|
395
450
|
if isinstance(filesystem, pafs.GcsFileSystem):
|
396
451
|
from datetime import timedelta
|
@@ -402,7 +457,7 @@ def _copy_external_ray(
|
|
402
457
|
anonymous=True,
|
403
458
|
retry_time_limit=timedelta(seconds=10),
|
404
459
|
)
|
405
|
-
|
460
|
+
logger.info(f"Using filesystem {type(filesystem)} to get file size of: {path}")
|
406
461
|
file_info = get_file_info(path, filesystem)
|
407
462
|
if file_info.type != FileType.File:
|
408
463
|
raise ValueError(
|
@@ -413,11 +468,11 @@ def _copy_external_ray(
|
|
413
468
|
if inflation_multiplier is None:
|
414
469
|
inflation_multiplier = extension_to_memory_multiplier.get("*")
|
415
470
|
estimated_memory_bytes = inflation_multiplier * file_info.size
|
416
|
-
|
471
|
+
logger.info(
|
417
472
|
f"Estimated Memory Required for Copy: "
|
418
473
|
f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
|
419
474
|
)
|
420
|
-
|
475
|
+
logger.info(f"Starting DeltaCAT Copy at: {time.time_ns()}")
|
421
476
|
|
422
477
|
index_result = None
|
423
478
|
num_cpus = 1
|
@@ -436,31 +491,31 @@ def _copy_external_ray(
|
|
436
491
|
reader_args=reader_args,
|
437
492
|
writer_args=writer_args,
|
438
493
|
)
|
439
|
-
|
494
|
+
logger.info(f"Time to Launch Copy Task: {latency} seconds")
|
440
495
|
try:
|
441
496
|
index_result, latency = timed_invocation(
|
442
497
|
ray.get,
|
443
498
|
copy_task_pending,
|
444
499
|
)
|
445
500
|
except OutOfMemoryError as e:
|
446
|
-
|
501
|
+
logger.warning(f"Copy Task Ran Out of Memory: {e}")
|
447
502
|
max_single_node_cpus = min(
|
448
503
|
max_allowed_cpus, find_max_single_node_resource_type("CPU")
|
449
504
|
)
|
450
505
|
num_cpus += 1
|
451
506
|
if num_cpus > max_single_node_cpus:
|
452
507
|
raise e
|
453
|
-
|
508
|
+
logger.info(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
|
454
509
|
|
455
|
-
|
456
|
-
|
510
|
+
logger.info(f"Time to Launch Copy Task: {latency} seconds")
|
511
|
+
logger.info(f"Time to Complete Copy Task: {latency} seconds")
|
457
512
|
|
458
513
|
total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE
|
459
514
|
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
515
|
+
logger.info(f"Records Copied: {index_result.table_length}")
|
516
|
+
logger.info(f"Bytes Copied: {total_gib_indexed} GiB")
|
517
|
+
logger.info(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
|
518
|
+
logger.info(f"Finished Copy at: {time.time_ns()}")
|
464
519
|
|
465
520
|
return dst.url
|
466
521
|
|
@@ -484,13 +539,13 @@ def copy_task(
|
|
484
539
|
transforms=transforms,
|
485
540
|
reader_args=reader_args,
|
486
541
|
)
|
487
|
-
|
542
|
+
logger.debug(f"Time to read {src.url_path}: {latency} seconds")
|
488
543
|
|
489
544
|
table_size = get_table_size(table)
|
490
|
-
|
545
|
+
logger.debug(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
|
491
546
|
|
492
547
|
table_length = get_table_length(table)
|
493
|
-
|
548
|
+
logger.debug(f"Table Records: {table_length}")
|
494
549
|
|
495
550
|
writer = DeltaCatUrlWriter(dest, dataset_type)
|
496
551
|
written_file_path, latency = timed_invocation(
|
@@ -499,7 +554,7 @@ def copy_task(
|
|
499
554
|
table,
|
500
555
|
**writer_args,
|
501
556
|
)
|
502
|
-
|
557
|
+
logger.debug(f"Time to write {written_file_path}: {latency}")
|
503
558
|
|
504
559
|
return CopyResult(table_size, table_length)
|
505
560
|
|
deltacat/aws/constants.py
CHANGED
@@ -1,32 +1,9 @@
|
|
1
|
-
import botocore
|
2
1
|
from typing import Set
|
3
|
-
from daft.exceptions import DaftTransientError
|
4
2
|
from deltacat.utils.common import env_integer, env_string
|
5
3
|
|
6
4
|
|
7
5
|
DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
|
8
|
-
DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
|
9
|
-
"DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
|
10
|
-
) # 5 mins
|
11
6
|
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
|
12
7
|
BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
|
13
8
|
BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
|
14
|
-
RETRYABLE_TRANSIENT_ERRORS = (
|
15
|
-
OSError,
|
16
|
-
botocore.exceptions.ConnectionError,
|
17
|
-
botocore.exceptions.HTTPClientError,
|
18
|
-
botocore.exceptions.NoCredentialsError,
|
19
|
-
botocore.exceptions.ConnectTimeoutError,
|
20
|
-
botocore.exceptions.ReadTimeoutError,
|
21
|
-
DaftTransientError,
|
22
|
-
)
|
23
9
|
AWS_REGION = env_string("AWS_REGION", "us-east-1")
|
24
|
-
UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
|
25
|
-
"UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
|
26
|
-
)
|
27
|
-
UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
|
28
|
-
"UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
|
29
|
-
)
|
30
|
-
DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
|
31
|
-
"DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
|
32
|
-
)
|