deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/catalog/main/impl.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import Any, Dict, List, Optional, Union, Tuple
 import logging

+import deltacat as dc
+
 from deltacat.catalog import CatalogProperties
 from deltacat.exceptions import (
     NamespaceAlreadyExistsError,
@@ -34,17 +36,17 @@ from deltacat.types.tables import TableWriteMode
 from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
 from deltacat import logs
 from deltacat.constants import DEFAULT_NAMESPACE
-from deltacat.storage import metastore as storage_impl

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 """
-
+Default Catalog interface implementation using DeltaCAT native storage.

-
-will be
+When this is used by `delegate.py` the `Catalog` implementation `inner`
+property will be set to the value returned from `intialize`.

-`CatalogProperties` has all state required to implement catalog functions,
+`CatalogProperties` has all state required to implement catalog functions,
+such as metastore root URI.
 """


@@ -56,6 +58,10 @@ def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProp
     returns CatalogProperties as the "inner" state value for a DC native catalog
     """
     if config is not None:
+        if not isinstance(config, CatalogProperties):
+            raise ValueError(
+                f"Expected `CatalogProperties` but found `{type(config)}`."
+            )
         return config
     else:
         return CatalogProperties(*args, **kwargs)
@@ -717,4 +723,4 @@ def _get_storage(**kwargs):
    if properties is not None and properties.storage is not None:
        return properties.storage
    else:
-        return
+        return dc.storage.metastore
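Taken together, the `initialize` type guard and the new `_get_storage` fallback mean a DeltaCAT-native catalog accepts either an explicit `CatalogProperties` or bare keyword arguments. A minimal sketch (the `root` value is illustrative, not from this diff):

from deltacat.catalog import CatalogProperties
from deltacat.catalog.main import impl as DeltaCatCatalog

# An explicit CatalogProperties is validated and returned as-is; any other
# config type now fails fast with ValueError instead of deep in storage calls.
props = DeltaCatCatalog.initialize(config=CatalogProperties(root="/tmp/.deltacat"))

# With no config, kwargs are forwarded to the CatalogProperties constructor.
props = DeltaCatCatalog.initialize(root="/tmp/.deltacat")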
deltacat/catalog/model/catalog.py
CHANGED
@@ -10,7 +10,7 @@ import ray

 from deltacat import logs
 from deltacat.annotations import ExperimentalAPI
-from deltacat.catalog.main import impl as
+from deltacat.catalog.main import impl as DeltaCatCatalog
 from deltacat.catalog.iceberg import impl as IcebergCatalog
 from deltacat.catalog import CatalogProperties
 from deltacat.catalog.iceberg import IcebergCatalogConfig
@@ -22,17 +22,14 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 class Catalog:
-    def __init__(self, impl: ModuleType =
+    def __init__(self, impl: ModuleType = DeltaCatCatalog, *args, **kwargs):
         """
         Constructor for a Catalog.

-
-
-
-
-        Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
-        Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
-        See `ray.util.register_serializer` in Catalogs class.
+        Invokes `impl.initialize(*args, **kwargs)` and stores its return value
+        in the `inner` property, which captures all state required to
+        deterministically reconstruct this Catalog instance on any node (and
+        must therefore be pickleable by Ray cloudpickle).
         """
         if not isinstance(self, Catalog):
             # self may contain the tuple returned from __reduce__ (ray pickle bug?)
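The rewritten docstring is the contract that Ray serialization relies on. A sketch of the round trip it describes, with an illustrative root path (and assuming `Catalog` is re-exported from the package root):

import deltacat as dc

# Pickling a Catalog captures only impl/args/kwargs; __init__ then re-runs
# impl.initialize(*args, **kwargs) to rebuild `inner` on the receiving node.
catalog = dc.Catalog(root="/tmp/.deltacat")
rebuilt = dc.Catalog(catalog.impl, root="/tmp/.deltacat")  # equivalent inner state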
@@ -68,7 +65,7 @@ class Catalog:

         Uses CatalogProperties as configuration
         """
-        return cls(impl=
+        return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})

     @property
     def impl(self):
@@ -104,25 +101,17 @@ class Catalogs:
         self,
         catalogs: Union[Catalog, Dict[str, Catalog]],
         default: Optional[str] = None,
-        *args,
-        **kwargs,
     ):
-        if default and default not in catalogs:
-            raise ValueError(
-                f"Catalog {default} not found " f"in catalogs to register: {catalogs}"
-            )
-        if not catalogs:
-            raise ValueError(
-                f"No catalogs given to register. "
-                f"Please specify one or more catalogs."
-            )
-
-        # if user only provides single Catalog, override it to be a map with default key
         if isinstance(catalogs, Catalog):
             catalogs = {DEFAULT_CATALOG: catalogs}
-
+        elif not isinstance(catalogs, dict):
+            raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
         self.catalogs: Dict[str, Catalog] = catalogs
         if default:
+            if default not in catalogs:
+                raise ValueError(
+                    f"Default catalog `{default}` not found in: {catalogs}"
+                )
             self.default_catalog = self.catalogs[default]
         elif len(catalogs) == 1:
             self.default_catalog = list(self.catalogs.values())[0]
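The constructor now normalizes its input before validating the default, instead of checking `default` against an unnormalized value. A sketch of the resulting behavior (plain calls for brevity; `Catalogs` is normally constructed for you via `deltacat.init`, which deploys it as a Ray actor):

Catalogs(catalogs=Catalog())                          # wrapped as {DEFAULT_CATALOG: catalog}
Catalogs(catalogs="oops")                             # ValueError: Expected Catalog or dict
Catalogs(catalogs={"a": Catalog()}, default="b")      # ValueError: default `b` not found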
@@ -140,7 +129,7 @@ class Catalogs:
         if set_default:
             self.default_catalog = catalog

-    def get(self, name) -> Catalog:
+    def get(self, name) -> Optional[Catalog]:
         return self.catalogs.get(name)

     def default(self) -> Optional[Catalog]:
@@ -149,7 +138,7 @@

 def is_initialized(*args, **kwargs) -> bool:
     """
-    Check if DeltaCAT is initialized
+    Check if DeltaCAT is initialized.
     """
     global all_catalogs

@@ -162,22 +151,36 @@ def is_initialized(*args, **kwargs) -> bool:
     return all_catalogs is not None


+def raise_if_not_initialized(
+    err_msg: str = "DeltaCAT is not initialized. Please call `deltacat.init()` and try again.",
+) -> None:
+    """
+    Raises a RuntimeError with the given error message if DeltaCAT is not
+    initialized.
+
+    :param err_msg: Custom error message to raise if DeltaCAT is not
+        initialized. If unspecified, the default error message is used.
+    """
+    if not is_initialized():
+        raise RuntimeError(err_msg)
+
+
 def init(
-    catalogs: Union[Dict[str, Catalog], Catalog],
+    catalogs: Union[Dict[str, Catalog], Catalog] = {},
     default: Optional[str] = None,
     ray_init_args: Dict[str, Any] = None,
-
+    *,
     force_reinitialize=False,
-    **kwargs,
 ) -> None:
     """
     Initialize DeltaCAT catalogs.

-    :param catalogs:
-
-
-
-    :param
+    :param catalogs: A single Catalog instance or a map of catalog names to
+        Catalog instances.
+    :param default: The name of the default Catalog. If only one Catalog is
+        provided, it will always be the default.
+    :param ray_init_args: Keyword arguments to pass to `ray.init()`.
+    :param force_reinitialize: Whether to force Ray reinitialization.
     """
     global all_catalogs

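The `catalogs` argument is now optional, and `raise_if_not_initialized` gives callers a one-line guard. A sketch, assuming both `init` and the new helper are re-exported at the package root:

import deltacat as dc

dc.init()                          # catalogs defaults to {}; Ray is initialized if needed
dc.raise_if_not_initialized()      # no-op after init(); RuntimeError otherwise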
@@ -195,6 +198,8 @@ def init(
     ray.util.register_serializer(
         Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
     )
+    # TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
+    # with all catalogs from the last session
     all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)


@@ -216,7 +221,6 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
             "`deltacat.init(catalogs={...})` to register one or more "
             "catalogs then retry."
         )
-
     if name is not None:
         catalog = ray.get(all_catalogs.get.remote(name))
         if not catalog:
@@ -225,17 +229,16 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
                 f"Catalog '{name}' not found. Available catalogs: "
                 f"{available_catalogs}."
             )
-        return catalog
-
     else:
         catalog = ray.get(all_catalogs.default.remote())
         if not catalog:
             available_catalogs = ray.get(all_catalogs.all.remote()).values()
             raise ValueError(
-                f"Call to get_catalog without name set failed because there
+                f"Call to get_catalog without name set failed because there "
+                f"is no default Catalog set. Available catalogs: "
                 f"{available_catalogs}."
             )
-
+    return catalog


 def put_catalog(
@@ -246,23 +249,37 @@ def put_catalog(
     ray_init_args: Dict[str, Any] = None,
     fail_if_exists: bool = False,
     **kwargs,
-) ->
+) -> Catalog:
     """
     Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.

     Args:
-        name:
-        catalog:
-
-
-
-
+        name: Name of the catalog.
+        catalog: Catalog instance to use. If none is provided, then all
+            additional keyword arguments will be forwarded to
+            `CatalogProperties` for a default DeltaCAT native Catalog.
+        default: Make this the default catalog if multiple catalogs are
+            available. If only one catalog is available, it will always be the
+            default.
+        ray_init_args: Ray initialization args (used only if ray not already
+            initialized)
+        fail_if_exists: if True, raises an error if a catalog with the given
+            name already exists. If False, inserts or replaces the given
+            catalog name.
+        kwargs: Additional keyword arguments to forward to `CatalogProperties`
+            for a default DeltaCAT native Catalog.
+
+    Returns:
+        The catalog put in the named catalog map.
     """
     global all_catalogs

+    if not catalog:
+        catalog = Catalog(**kwargs)
+
     # Initialize, if necessary
     if not is_initialized():
-        #
+        # We are initializing a single catalog - make it the default
         if not default:
             logger.info(
                 f"Calling put_catalog with set_as_default=False, "
@@ -288,3 +305,4 @@

     # Add the catalog (which may overwrite existing if fail_if_exists=False)
     ray.get(all_catalogs.put.remote(name, catalog, default))
+    return catalog
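With the new default-construction path and return value, registering a DeltaCAT-native catalog collapses to one call. A sketch (root path illustrative, and assuming `put_catalog`/`get_catalog` are re-exported at the package root):

import deltacat as dc

# No Catalog instance given, so kwargs build a CatalogProperties-backed default.
catalog = dc.put_catalog("my_catalog", root="/tmp/.deltacat")
same = dc.get_catalog("my_catalog")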
deltacat/catalog/model/properties.py
CHANGED
@@ -45,7 +45,7 @@ class CatalogProperties:
     Attributes:
         root (str): URI string The root path where catalog metadata and data
             files are stored. Root is determined (in prededence order) by:
-            1. check
+            1. check "root" input argument
             2. check env variable "DELTACAT_ROOT"
             3. default to ${cwd}/.deltacat

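The documented precedence, as a sketch (paths illustrative; any normalization the constructor applies is ignored here):

import os
from deltacat.catalog import CatalogProperties

props = CatalogProperties(root="/data/cat")   # 1. explicit "root" argument wins
os.environ["DELTACAT_ROOT"] = "/env/cat"
props = CatalogProperties()                   # 2. falls back to $DELTACAT_ROOT
del os.environ["DELTACAT_ROOT"]
props = CatalogProperties()                   # 3. defaults to ${cwd}/.deltacat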
@@ -61,8 +61,6 @@ class CatalogProperties:
         root: Optional[str] = None,
         filesystem: Optional[pyarrow.fs.FileSystem] = None,
         storage=None,
-        *args,
-        **kwargs,
     ):
         """
         Initialize a CatalogProperties instance.
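Dropping `*args`/`**kwargs` from the signature means misspelled or unsupported options now fail fast with a standard TypeError (the bad keyword below is deliberately made up):

from deltacat.catalog import CatalogProperties

CatalogProperties(root="/tmp/.deltacat")    # ok
CatalogProperties(roott="/tmp/.deltacat")   # TypeError: unexpected keyword argument 'roott'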
deltacat/compute/converter/constants.py
CHANGED
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096

 # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
 DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
+
+
+# Unique identifier delimiter to ensure different primary key don't end up with same hash when concatenated.
+# e.g.: pk column a with value: 1, 12; pk column b with value: 12, 1; Without delimiter will both become "121".
+IDENTIFIER_FIELD_DELIMITER = "c303282d"
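Why the delimiter matters, as a self-contained sketch: without a separator, two distinct multi-column keys can concatenate to the same string (and therefore the same hash); joining on the delimiter keeps them distinct.

IDENTIFIER_FIELD_DELIMITER = "c303282d"

row1 = ("1", "21")   # primary key columns (a, b)
row2 = ("12", "1")
assert "".join(row1) == "".join(row2) == "121"   # collision without a delimiter
assert (IDENTIFIER_FIELD_DELIMITER.join(row1)
        != IDENTIFIER_FIELD_DELIMITER.join(row2))  # distinct with it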
deltacat/compute/converter/converter_session.py
CHANGED
@@ -1,4 +1,3 @@
-# from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     task_resource_options_provider,
@@ -20,7 +19,6 @@ from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     construct_iceberg_table_prefix,
@@ -48,32 +46,46 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     table_name = params.iceberg_table_name
     iceberg_table = load_table(catalog, table_name)
     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+    iceberg_namespace = params.iceberg_namespace
+    merge_keys = params.merge_keys
+    compact_previous_position_delete_files = (
+        params.compact_previous_position_delete_files
+    )
+    task_max_parallelism = params.task_max_parallelism
+    s3_client_kwargs = params.s3_client_kwargs
+    s3_file_system = params.s3_file_system
+    location_provider_prefix_override = params.location_provider_prefix_override
+    position_delete_for_multiple_data_files = (
+        params.position_delete_for_multiple_data_files
+    )
+
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
         iceberg_table
     )
+
     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
         equality_delete_dict=equality_delete_dict,
         pos_delete_dict=pos_delete_dict,
     )
-
-
-
-
-
-
-
-
+
+    if not location_provider_prefix_override:
+        iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            table_name=table_name,
+            iceberg_namespace=iceberg_namespace,
+        )
+    else:
+        iceberg_table_warehouse_prefix = location_provider_prefix_override
+
     # Using table identifier fields as merge keys if merge keys not provided
     if not merge_keys:
         identifier_fields_set = iceberg_table.schema().identifier_field_names()
         identifier_fields = list(identifier_fields_set)
     else:
         identifier_fields = merge_keys
-
-        raise NotImplementedError(
-            f"Multiple identifier fields lookup not supported yet."
-        )
+
     convert_options_provider = functools.partial(
         task_resource_options_provider,
         resource_amount_provider=convert_resource_options_provider,
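The override short-circuits prefix construction; otherwise the prefix is derived from the warehouse bucket, namespace, and table name. A sketch with illustrative values (the exact prefix layout is an assumption, not from this diff):

prefix = construct_iceberg_table_prefix(
    iceberg_warehouse_bucket_name="s3://my-warehouse",
    table_name="events",
    iceberg_namespace="analytics",
)  # e.g. something like "s3://my-warehouse/analytics/events/data"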
@@ -86,58 +98,88 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD

-    compact_small_files = params.compact_small_files
-    position_delete_for_multiple_data_files = (
-        params.position_delete_for_multiple_data_files
-    )
-    task_max_parallelism = params.task_max_parallelism
-
     def convert_input_provider(index, item):
         return {
             "convert_input": ConvertInput.of(
-
+                convert_input_files=item,
                 convert_task_index=index,
                 iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
                 identifier_fields=identifier_fields,
-
+                compact_previous_position_delete_files=compact_previous_position_delete_files,
+                table_io=iceberg_table.io,
+                table_metadata=iceberg_table.metadata,
                 enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                 max_parallel_data_file_download=max_parallel_data_file_download,
+                s3_client_kwargs=s3_client_kwargs,
+                s3_file_system=s3_file_system,
             )
         }

+    logger.info(f"Getting remote convert tasks...")
     # Ray remote task: convert
-    # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
     # TODO: Add split mechanism to split large buckets
     convert_tasks_pending = invoke_parallel(
-        items=convert_input_files_for_all_buckets
+        items=convert_input_files_for_all_buckets,
         ray_task=convert,
         max_parallelism=task_max_parallelism,
         options_provider=convert_options_provider,
         kwargs_provider=convert_input_provider,
     )
+
     to_be_deleted_files_list = []
-
+    logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+
     convert_results = ray.get(convert_tasks_pending)
-
-    to_be_deleted_files_list.extend(convert_result[0].values())
-    to_be_added_files_dict_list.append(convert_result[1])
+    logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")

-
-
-
-
+    total_position_delete_record_count = sum(
+        convert_result.position_delete_record_count
+        for convert_result in convert_results
+    )
+    total_input_data_file_record_count = sum(
+        convert_result.input_data_files_record_count
+        for convert_result in convert_results
+    )
+    total_data_file_hash_columns_in_memory_sizes = sum(
+        convert_result.input_data_files_hash_columns_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_file_in_memory_sizes = sum(
+        convert_result.position_delete_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_on_disk_sizes = sum(
+        convert_result.position_delete_on_disk_sizes
+        for convert_result in convert_results
     )

-
+    to_be_added_files_list = []
+    for convert_result in convert_results:
+        to_be_added_files = convert_result.to_be_added_files
+        to_be_deleted_files = convert_result.to_be_deleted_files
+
+        to_be_deleted_files_list.extend(to_be_deleted_files.values())
+        to_be_added_files_list.extend(to_be_added_files)
+
+    if not to_be_deleted_files_list and to_be_added_files_list:
         commit_append_snapshot(
             iceberg_table=iceberg_table,
-            new_position_delete_files=
+            new_position_delete_files=to_be_added_files_list,
         )
     else:
         commit_replace_snapshot(
             iceberg_table=iceberg_table,
-            # equality_delete_files + data file that all rows are deleted
             to_be_deleted_files_list=to_be_deleted_files_list,
-            new_position_delete_files=
+            new_position_delete_files=to_be_added_files_list,
         )
+    logger.info(
+        f"Aggregated stats for {table_name}: "
+        f"total position delete record count: {total_position_delete_record_count}, "
+        f"total input data file record_count: {total_input_data_file_record_count}, "
+        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+    )
+
+    logger.info(f"Committed new Iceberg snapshot.")
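The aggregation works because each convert task returns a dict-backed `ConvertResult` (added below) whose properties read plain keys, so summing a stat across task results is a generator expression. A toy sketch with illustrative numbers:

results = [
    {"position_delete_record_count": 10},
    {"position_delete_record_count": 5},
]
total = sum(r["position_delete_record_count"] for r in results)
assert total == 15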
deltacat/compute/converter/model/convert_input.py
CHANGED
@@ -10,11 +10,14 @@ class ConvertInput(Dict):
         convert_task_index,
         iceberg_table_warehouse_prefix,
         identifier_fields,
-
+        table_io,
+        table_metadata,
+        compact_previous_position_delete_files,
         enforce_primary_key_uniqueness,
         position_delete_for_multiple_data_files,
         max_parallel_data_file_download,
         s3_file_system,
+        s3_client_kwargs,
     ) -> ConvertInput:

         result = ConvertInput()
@@ -22,13 +25,18 @@ class ConvertInput(Dict):
         result["convert_task_index"] = convert_task_index
         result["identifier_fields"] = identifier_fields
         result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
-        result["
+        result["table_io"] = table_io
+        result["table_metadata"] = table_metadata
+        result[
+            "compact_previous_position_delete_files"
+        ] = compact_previous_position_delete_files
         result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
         result[
             "position_delete_for_multiple_data_files"
         ] = position_delete_for_multiple_data_files
         result["max_parallel_data_file_download"] = max_parallel_data_file_download
         result["s3_file_system"] = s3_file_system
+        result["s3_client_kwargs"] = s3_client_kwargs

         return result

@@ -49,8 +57,16 @@ class ConvertInput(Dict):
         return self["iceberg_table_warehouse_prefix"]

     @property
-    def
-        return self["
+    def table_io(self):
+        return self["table_io"]
+
+    @property
+    def table_metadata(self):
+        return self["table_metadata"]
+
+    @property
+    def compact_previous_position_delete_files(self) -> bool:
+        return self["compact_previous_position_delete_files"]

     @property
     def enforce_primary_key_uniqueness(self) -> bool:
@@ -67,3 +83,7 @@ class ConvertInput(Dict):
     @property
     def s3_file_system(self):
         return self["s3_file_system"]
+
+    @property
+    def s3_client_kwargs(self):
+        return self["s3_client_kwargs"]
deltacat/compute/converter/model/convert_result.py
ADDED
@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Dict
+
+
+class ConvertResult(Dict):
+    @staticmethod
+    def of(
+        convert_task_index,
+        to_be_added_files,
+        to_be_deleted_files,
+        position_delete_record_count,
+        input_data_files_record_count,
+        input_data_files_hash_columns_in_memory_sizes,
+        position_delete_in_memory_sizes,
+        position_delete_on_disk_sizes,
+    ) -> ConvertResult:
+
+        result = ConvertResult()
+        result["convert_task_index"] = convert_task_index
+        result["to_be_added_files"] = to_be_added_files
+        result["to_be_deleted_files"] = to_be_deleted_files
+        result["position_delete_record_count"] = position_delete_record_count
+        result["input_data_files_record_count"] = input_data_files_record_count
+        result[
+            "input_data_files_hash_columns_in_memory_sizes"
+        ] = input_data_files_hash_columns_in_memory_sizes
+        result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
+        result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+        return result
+
+    @property
+    def convert_task_index(self) -> int:
+        return self["convert_task_index"]
+
+    @property
+    def to_be_added_files(self):
+        return self["to_be_added_files"]
+
+    @property
+    def to_be_deleted_files(self):
+        return self["to_be_deleted_files"]
+
+    @property
+    def position_delete_record_count(self):
+        return self["position_delete_record_count"]
+
+    @property
+    def input_data_files_record_count(self):
+        return self["input_data_files_record_count"]
+
+    @property
+    def input_data_files_hash_columns_in_memory_sizes(self):
+        return self["input_data_files_hash_columns_in_memory_sizes"]
+
+    @property
+    def position_delete_in_memory_sizes(self):
+        return self["position_delete_in_memory_sizes"]
+
+    @property
+    def position_delete_on_disk_sizes(self):
+        return self["position_delete_on_disk_sizes"]
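A usage sketch for the new model (all values illustrative): `of` stores every field in the underlying dict, and each property reads the matching key back.

result = ConvertResult.of(
    convert_task_index=0,
    to_be_added_files=["pos-delete-0.parquet"],
    to_be_deleted_files={"bucket-0": ["stale-equality-delete.parquet"]},
    position_delete_record_count=5,
    input_data_files_record_count=100,
    input_data_files_hash_columns_in_memory_sizes=2048,
    position_delete_in_memory_sizes=512,
    position_delete_on_disk_sizes=256,
)
assert result.position_delete_record_count == 5   # property reads the dict key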