deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/catalog/model/properties.py
CHANGED
@@ -1,6 +1,9 @@
 from __future__ import annotations
+
 from typing import Optional, Any
 
+import os
+
 import pyarrow
 from deltacat.constants import DELTACAT_ROOT
 
@@ -8,18 +11,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
 
 
 def get_catalog_properties(
-
+    *,
     catalog: Optional[CatalogProperties] = None,
     inner: Optional[CatalogProperties] = None,
     **kwargs,
 ) -> CatalogProperties:
     """
-    Helper function to fetch CatalogProperties instance.
-    kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
+    Helper function to fetch CatalogProperties instance.
 
-    This will look
-
-    constructor.
+    This will look first look for CatalogProperties in either "catalog"
+    or "inner" and otherwise passes all keyword arguments to the
+    CatalogProperties constructor.
     """
     properties = catalog if catalog is not None else inner
     if properties is not None and isinstance(properties, CatalogProperties):
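As a quick, hedged sketch of the lookup the docstring above describes (the passthrough return of an existing instance is assumed from the truncated context; only the keyword names "catalog", "inner", and "root" are taken from this diff):

import tempfile
from deltacat.catalog.model.properties import CatalogProperties, get_catalog_properties

root = tempfile.mkdtemp()
existing = CatalogProperties(root=root)
# An instance passed as "catalog" (or "inner") is used directly (assumed behavior).
resolved = get_catalog_properties(catalog=existing)
# Otherwise, keyword arguments such as "root" are forwarded to the constructor.
fresh = get_catalog_properties(root=root)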
@@ -39,21 +41,22 @@ class CatalogProperties:
     DeltaCAT catalog instance. Properties are set from system environment
     variables unless explicit overrides are provided during initialization.
 
-    Catalog and storage APIs rely on the property catalog to retrieve durable
-    working against.
+    Catalog and storage APIs rely on the property catalog to retrieve durable
+    state about the catalog they're working against.
 
     Attributes:
-        root
-
-        1.
-        2.
-        3. default to
+        root: The root path for catalog metadata and data storage. Resolved by
+            searching for the root path in the following order:
+            1. "root" constructor input argument
+            2. "DELTACAT_ROOT" system environment variable
+            3. default to "./.deltacat/"
 
         filesystem: The filesystem implementation that should be used for
             reading/writing files. If None, a filesystem will be inferred from
             the catalog root path.
 
-        storage: Storage class implementation (overrides default filesystem
+        storage: Storage class implementation (overrides default filesystem
+            storage impl)
     """
 
     def __init__(
@@ -61,28 +64,26 @@ class CatalogProperties:
         root: Optional[str] = None,
         filesystem: Optional[pyarrow.fs.FileSystem] = None,
         storage=None,
-        *args,
-        **kwargs,
     ):
         """
         Initialize a CatalogProperties instance.
 
         Args:
-            root:
+            root: Catalog root directory path. Uses the "DELTACAT_ROOT"
+                system environment variable if not set, and defaults to
+                "./.deltacat/" if this environment variable is not set.
             filesystem: The filesystem implementation that should be used for
                 reading these files. If None, a filesystem will be inferred.
-                If
-
+                If provided, this will be validated for compatibility with the
+                catalog root path.
         """
         # set root, using precedence rules described in pydoc
         if root is None:
             # Check environment variables
-            # This is set or defaulted in constants.py
             root = DELTACAT_ROOT
-            if root
-
-
-            )
+        if not root:
+            # Default to "./.deltacat/"
+            root = os.path.join(os.getcwd(), ".deltacat")
 
         resolved_root, resolved_filesystem = resolve_path_and_filesystem(
             path=root,
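The constructor change above drops *args/**kwargs and makes the root fallback explicit. A minimal hedged sketch of the documented precedence (explicit argument, then the DELTACAT_ROOT value that constants.py sets or defaults from the environment, then "./.deltacat"); paths shown are placeholders:

import os
from deltacat.catalog.model.properties import CatalogProperties

# 1. An explicit "root" argument always wins.
props = CatalogProperties(root="/tmp/dc-explicit-root")

# 2./3. With no argument, DELTACAT_ROOT (imported from deltacat.constants,
# which reads the DELTACAT_ROOT environment variable) is used; set it before
# deltacat is imported. If that resolves to an empty value, the constructor
# falls back to os.path.join(os.getcwd(), ".deltacat").
os.environ.setdefault("DELTACAT_ROOT", "/tmp/dc-env-root")
default_props = CatalogProperties()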
deltacat/compute/__init__.py
CHANGED
deltacat/compute/converter/constants.py
CHANGED
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
 
 # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
 DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
+
+
+# Unique identifier delimiter to ensure different primary key don't end up with same hash when concatenated.
+# e.g.: pk column a with value: 1, 12; pk column b with value: 12, 1; Without delimiter will both become "121".
+IDENTIFIER_FIELD_DELIMITER = "c303282d"
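A tiny illustration of the collision the new comment describes (a sketch only, not the converter's actual hashing code):

DELIM = "c303282d"  # same value as IDENTIFIER_FIELD_DELIMITER above

row_a = ("1", "12")  # pk column a = 1, pk column b = 12
row_b = ("12", "1")  # pk column a = 12, pk column b = 1

assert "".join(row_a) == "".join(row_b) == "121"  # undelimited concatenation collides
assert DELIM.join(row_a) != DELIM.join(row_b)     # delimited keys stay distinct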
deltacat/compute/converter/converter_session.py
CHANGED
@@ -1,4 +1,3 @@
-# from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     task_resource_options_provider,
@@ -20,7 +19,6 @@ from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     construct_iceberg_table_prefix,
@@ -48,32 +46,46 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     table_name = params.iceberg_table_name
     iceberg_table = load_table(catalog, table_name)
     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+    iceberg_namespace = params.iceberg_namespace
+    merge_keys = params.merge_keys
+    compact_previous_position_delete_files = (
+        params.compact_previous_position_delete_files
+    )
+    task_max_parallelism = params.task_max_parallelism
+    s3_client_kwargs = params.s3_client_kwargs
+    s3_file_system = params.s3_file_system
+    location_provider_prefix_override = params.location_provider_prefix_override
+    position_delete_for_multiple_data_files = (
+        params.position_delete_for_multiple_data_files
+    )
+
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
         iceberg_table
     )
+
     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
         equality_delete_dict=equality_delete_dict,
         pos_delete_dict=pos_delete_dict,
     )
-
-
-
-
-
-
-
-
+
+    if not location_provider_prefix_override:
+        iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            table_name=table_name,
+            iceberg_namespace=iceberg_namespace,
+        )
+    else:
+        iceberg_table_warehouse_prefix = location_provider_prefix_override
+
     # Using table identifier fields as merge keys if merge keys not provided
     if not merge_keys:
         identifier_fields_set = iceberg_table.schema().identifier_field_names()
         identifier_fields = list(identifier_fields_set)
     else:
         identifier_fields = merge_keys
-
-        raise NotImplementedError(
-            f"Multiple identifier fields lookup not supported yet."
-        )
+
     convert_options_provider = functools.partial(
         task_resource_options_provider,
         resource_amount_provider=convert_resource_options_provider,
@@ -86,58 +98,88 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
 
-    compact_small_files = params.compact_small_files
-    position_delete_for_multiple_data_files = (
-        params.position_delete_for_multiple_data_files
-    )
-    task_max_parallelism = params.task_max_parallelism
-
     def convert_input_provider(index, item):
         return {
             "convert_input": ConvertInput.of(
-
+                convert_input_files=item,
                 convert_task_index=index,
                 iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
                 identifier_fields=identifier_fields,
-
+                compact_previous_position_delete_files=compact_previous_position_delete_files,
+                table_io=iceberg_table.io,
+                table_metadata=iceberg_table.metadata,
                 enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                 max_parallel_data_file_download=max_parallel_data_file_download,
+                s3_client_kwargs=s3_client_kwargs,
+                s3_file_system=s3_file_system,
             )
         }
 
+    logger.info(f"Getting remote convert tasks...")
     # Ray remote task: convert
-    # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
     # TODO: Add split mechanism to split large buckets
     convert_tasks_pending = invoke_parallel(
-        items=convert_input_files_for_all_buckets
+        items=convert_input_files_for_all_buckets,
         ray_task=convert,
         max_parallelism=task_max_parallelism,
         options_provider=convert_options_provider,
         kwargs_provider=convert_input_provider,
     )
+
     to_be_deleted_files_list = []
-
+    logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+
     convert_results = ray.get(convert_tasks_pending)
-
-        to_be_deleted_files_list.extend(convert_result[0].values())
-        to_be_added_files_dict_list.append(convert_result[1])
+    logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
 
-
-
-
-
+    total_position_delete_record_count = sum(
+        convert_result.position_delete_record_count
+        for convert_result in convert_results
+    )
+    total_input_data_file_record_count = sum(
+        convert_result.input_data_files_record_count
+        for convert_result in convert_results
+    )
+    total_data_file_hash_columns_in_memory_sizes = sum(
+        convert_result.input_data_files_hash_columns_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_file_in_memory_sizes = sum(
+        convert_result.position_delete_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_on_disk_sizes = sum(
+        convert_result.position_delete_on_disk_sizes
+        for convert_result in convert_results
     )
 
-
+    to_be_added_files_list = []
+    for convert_result in convert_results:
+        to_be_added_files = convert_result.to_be_added_files
+        to_be_deleted_files = convert_result.to_be_deleted_files
+
+        to_be_deleted_files_list.extend(to_be_deleted_files.values())
+        to_be_added_files_list.extend(to_be_added_files)
+
+    if not to_be_deleted_files_list and to_be_added_files_list:
         commit_append_snapshot(
             iceberg_table=iceberg_table,
-            new_position_delete_files=
+            new_position_delete_files=to_be_added_files_list,
         )
     else:
         commit_replace_snapshot(
             iceberg_table=iceberg_table,
-            # equality_delete_files + data file that all rows are deleted
             to_be_deleted_files_list=to_be_deleted_files_list,
-            new_position_delete_files=
+            new_position_delete_files=to_be_added_files_list,
         )
+    logger.info(
+        f"Aggregated stats for {table_name}: "
+        f"total position delete record count: {total_position_delete_record_count}, "
+        f"total input data file record_count: {total_input_data_file_record_count}, "
+        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+    )
+
+    logger.info(f"Committed new Iceberg snapshot.")
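For context, a hedged sketch of how the reworked converter_session above could be driven end to end. It assumes the same .of(...) factory pattern that ConvertInput and ConvertResult use, and the "catalog" and "iceberg_table_name" keys are assumptions inferred from the values this diff reads; only iceberg_warehouse_bucket_name is asserted as required in ConverterSessionParams below.

from deltacat.compute.converter.converter_session import converter_session
from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)


def compact_deletes(catalog, table_name: str, warehouse_bucket: str) -> None:
    # "catalog" is assumed to be a pyiceberg catalog handle used by
    # load_table() in converter_session above.
    params = ConverterSessionParams.of(
        {
            "catalog": catalog,
            "iceberg_table_name": table_name,
            "iceberg_warehouse_bucket_name": warehouse_bucket,  # required
            "merge_keys": ["id"],  # optional; identifier fields are used if omitted
            "enforce_primary_key_uniqueness": True,
            "compact_previous_position_delete_files": False,
            "s3_client_kwargs": {},  # forwarded to each convert task
        }
    )
    converter_session(params=params)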
deltacat/compute/converter/model/convert_input.py
CHANGED
@@ -10,11 +10,14 @@ class ConvertInput(Dict):
         convert_task_index,
         iceberg_table_warehouse_prefix,
         identifier_fields,
-
+        table_io,
+        table_metadata,
+        compact_previous_position_delete_files,
         enforce_primary_key_uniqueness,
         position_delete_for_multiple_data_files,
         max_parallel_data_file_download,
         s3_file_system,
+        s3_client_kwargs,
     ) -> ConvertInput:
 
         result = ConvertInput()
@@ -22,13 +25,18 @@ class ConvertInput(Dict):
         result["convert_task_index"] = convert_task_index
         result["identifier_fields"] = identifier_fields
         result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
-        result["
+        result["table_io"] = table_io
+        result["table_metadata"] = table_metadata
+        result[
+            "compact_previous_position_delete_files"
+        ] = compact_previous_position_delete_files
         result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
         result[
             "position_delete_for_multiple_data_files"
         ] = position_delete_for_multiple_data_files
         result["max_parallel_data_file_download"] = max_parallel_data_file_download
         result["s3_file_system"] = s3_file_system
+        result["s3_client_kwargs"] = s3_client_kwargs
 
         return result
 
@@ -49,8 +57,16 @@ class ConvertInput(Dict):
         return self["iceberg_table_warehouse_prefix"]
 
     @property
-    def
-        return self["
+    def table_io(self):
+        return self["table_io"]
+
+    @property
+    def table_metadata(self):
+        return self["table_metadata"]
+
+    @property
+    def compact_previous_position_delete_files(self) -> bool:
+        return self["compact_previous_position_delete_files"]
 
     @property
     def enforce_primary_key_uniqueness(self) -> bool:
@@ -67,3 +83,7 @@ class ConvertInput(Dict):
     @property
     def s3_file_system(self):
         return self["s3_file_system"]
+
+    @property
+    def s3_client_kwargs(self):
+        return self["s3_client_kwargs"]
deltacat/compute/converter/model/convert_result.py
ADDED
@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Dict
+
+
+class ConvertResult(Dict):
+    @staticmethod
+    def of(
+        convert_task_index,
+        to_be_added_files,
+        to_be_deleted_files,
+        position_delete_record_count,
+        input_data_files_record_count,
+        input_data_files_hash_columns_in_memory_sizes,
+        position_delete_in_memory_sizes,
+        position_delete_on_disk_sizes,
+    ) -> ConvertResult:
+
+        result = ConvertResult()
+        result["convert_task_index"] = convert_task_index
+        result["to_be_added_files"] = to_be_added_files
+        result["to_be_deleted_files"] = to_be_deleted_files
+        result["position_delete_record_count"] = position_delete_record_count
+        result["input_data_files_record_count"] = input_data_files_record_count
+        result[
+            "input_data_files_hash_columns_in_memory_sizes"
+        ] = input_data_files_hash_columns_in_memory_sizes
+        result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
+        result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+        return result
+
+    @property
+    def convert_task_index(self) -> int:
+        return self["convert_task_index"]
+
+    @property
+    def to_be_added_files(self):
+        return self["to_be_added_files"]
+
+    @property
+    def to_be_deleted_files(self):
+        return self["to_be_deleted_files"]
+
+    @property
+    def position_delete_record_count(self):
+        return self["position_delete_record_count"]
+
+    @property
+    def input_data_files_record_count(self):
+        return self["input_data_files_record_count"]
+
+    @property
+    def input_data_files_hash_columns_in_memory_sizes(self):
+        return self["input_data_files_hash_columns_in_memory_sizes"]
+
+    @property
+    def position_delete_in_memory_sizes(self):
+        return self["position_delete_in_memory_sizes"]
+
+    @property
+    def position_delete_on_disk_sizes(self):
+        return self["position_delete_on_disk_sizes"]
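The new ConvertResult container is what each convert task returns and what converter_session aggregates. A small hedged usage sketch with placeholder values (file references and counts below are illustrative only):

from deltacat.compute.converter.model.convert_result import ConvertResult

results = [
    ConvertResult.of(
        convert_task_index=0,
        to_be_added_files=["pos-delete-0.parquet"],                 # placeholder file refs
        to_be_deleted_files={"bucket-0": ["eq-delete-0.parquet"]},  # placeholder mapping
        position_delete_record_count=10,
        input_data_files_record_count=1_000,
        input_data_files_hash_columns_in_memory_sizes=4_096,
        position_delete_in_memory_sizes=512,
        position_delete_on_disk_sizes=256,
    )
]

# Mirrors the session-level aggregation shown in converter_session above.
total_position_deletes = sum(r.position_delete_record_count for r in results)
total_input_records = sum(r.input_data_files_record_count for r in results)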
deltacat/compute/converter/model/converter_session_params.py
CHANGED
@@ -1,6 +1,10 @@
 from __future__ import annotations
 from typing import Optional, Dict
-from deltacat.compute.converter.constants import
+from deltacat.compute.converter.constants import (
+    DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
+)
+from deltacat.constants import DEFAULT_NAMESPACE
+from fsspec import AbstractFileSystem
 
 
 class ConverterSessionParams(dict):
@@ -18,15 +22,15 @@ class ConverterSessionParams(dict):
         assert (
             params.get("iceberg_warehouse_bucket_name") is not None
         ), "iceberg_warehouse_bucket_name is a required arg"
-        assert (
-            params.get("iceberg_namespace") is not None
-        ), "iceberg_namespace is a required arg"
         result = ConverterSessionParams(params)
 
+        result.iceberg_namespace = params.get("iceberg_namespace", DEFAULT_NAMESPACE)
         result.enforce_primary_key_uniqueness = params.get(
             "enforce_primary_key_uniqueness", False
         )
-        result.
+        result.compact_previous_position_delete_files = params.get(
+            "compact_previous_position_delete_files", False
+        )
 
         # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
         result.position_delete_for_multiple_data_files = params.get(
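With the change above, iceberg_namespace is no longer asserted as required and falls back to DEFAULT_NAMESPACE. A minimal hedged check; the other keys shown are assumptions about what the factory validates, and the catalog value is a stand-in rather than a real pyiceberg catalog:

from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)

params = ConverterSessionParams.of(
    {
        "catalog": object(),  # placeholder; a real pyiceberg catalog in practice
        "iceberg_table_name": "ns.tbl",
        "iceberg_warehouse_bucket_name": "my-warehouse-bucket",
        # "iceberg_namespace" intentionally omitted
    }
)
print(params.iceberg_namespace)  # DEFAULT_NAMESPACE from deltacat.constants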
@@ -36,6 +40,10 @@ class ConverterSessionParams(dict):
             "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
         )
         result.merge_keys = params.get("merge_keys", None)
+        result.s3_client_kwargs = params.get("s3_client_kwargs", {})
+        result.s3_file_system = params.get("s3_file_system", None)
+        result.s3_prefix_override = params.get("s3_prefix_override", None)
+
         return result
 
     @property
@@ -54,6 +62,10 @@ class ConverterSessionParams(dict):
     def iceberg_namespace(self) -> str:
         return self["iceberg_namespace"]
 
+    @iceberg_namespace.setter
+    def iceberg_namespace(self, iceberg_namespace) -> None:
+        self["iceberg_namespace"] = iceberg_namespace
+
     @property
     def enforce_primary_key_uniqueness(self) -> bool:
         return self["enforce_primary_key_uniqueness"]
@@ -63,12 +75,16 @@ class ConverterSessionParams(dict):
         self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
 
     @property
-    def
-        return self["
+    def compact_previous_position_delete_files(self) -> bool:
+        return self["compact_previous_position_delete_files"]
 
-    @
-    def
-        self
+    @compact_previous_position_delete_files.setter
+    def compact_previous_position_delete_files(
+        self, compact_previous_position_delete_files
+    ) -> None:
+        self[
+            "compact_previous_position_delete_files"
+        ] = compact_previous_position_delete_files
 
     @property
     def position_delete_for_multiple_data_files(self) -> bool:
@@ -97,3 +113,29 @@ class ConverterSessionParams(dict):
     @merge_keys.setter
     def merge_keys(self, merge_keys) -> None:
         self["merge_keys"] = merge_keys
+
+    @property
+    def s3_client_kwargs(self) -> Dict:
+        return self["s3_client_kwargs"]
+
+    @s3_client_kwargs.setter
+    def s3_client_kwargs(self, s3_client_kwargs) -> None:
+        self["s3_client_kwargs"] = s3_client_kwargs
+
+    @property
+    def s3_file_system(self) -> AbstractFileSystem:
+        return self["s3_file_system"]
+
+    @s3_file_system.setter
+    def s3_file_system(self, s3_file_system) -> None:
+        self["s3_file_system"] = s3_file_system
+
+    @property
+    def location_provider_prefix_override(self) -> str:
+        return self["location_provider_prefix_override"]
+
+    @location_provider_prefix_override.setter
+    def location_provider_prefix_override(
+        self, location_provider_prefix_override
+    ) -> None:
+        self["location_provider_prefix_override"] = location_provider_prefix_override