deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
--- a/deltacat/compute/converter/model/convert_input.py
+++ b/deltacat/compute/converter/model/convert_input.py
@@ -1,23 +1,25 @@
 from __future__ import annotations
-from typing import Dict, List
+from typing import Dict, List, Any, Optional
 from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+from fsspec import AbstractFileSystem
 
 
 class ConvertInput(Dict):
     @staticmethod
     def of(
-        convert_input_files,
-        convert_task_index,
-        iceberg_table_warehouse_prefix,
-        identifier_fields,
-        table_io,
-        table_metadata,
-        compact_previous_position_delete_files,
-        enforce_primary_key_uniqueness,
-        position_delete_for_multiple_data_files,
-        max_parallel_data_file_download,
-
-        s3_client_kwargs,
+        convert_input_files: ConvertInputFiles,
+        convert_task_index: int,
+        iceberg_table_warehouse_prefix: str,
+        identifier_fields: List[str],
+        table_io: Any,
+        table_metadata: Any,
+        compact_previous_position_delete_files: bool,
+        enforce_primary_key_uniqueness: bool,
+        position_delete_for_multiple_data_files: bool,
+        max_parallel_data_file_download: int,
+        filesystem: Optional[AbstractFileSystem],
+        s3_client_kwargs: Optional[Dict[str, Any]],
+        task_memory: float,
     ) -> ConvertInput:
 
         result = ConvertInput()
@@ -35,8 +37,9 @@ class ConvertInput(Dict):
             "position_delete_for_multiple_data_files"
         ] = position_delete_for_multiple_data_files
         result["max_parallel_data_file_download"] = max_parallel_data_file_download
-        result["
+        result["filesystem"] = filesystem
         result["s3_client_kwargs"] = s3_client_kwargs
+        result["task_memory"] = task_memory
 
         return result
@@ -57,11 +60,11 @@ class ConvertInput(Dict):
         return self["iceberg_table_warehouse_prefix"]
 
     @property
-    def table_io(self):
+    def table_io(self) -> Any:
         return self["table_io"]
 
     @property
-    def table_metadata(self):
+    def table_metadata(self) -> Any:
         return self["table_metadata"]
 
     @property
@@ -81,9 +84,13 @@ class ConvertInput(Dict):
         return self["max_parallel_data_file_download"]
 
     @property
-    def
-        return self["
+    def filesystem(self) -> Optional[AbstractFileSystem]:
+        return self["filesystem"]
 
     @property
-    def s3_client_kwargs(self):
+    def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
         return self["s3_client_kwargs"]
+
+    @property
+    def task_memory(self) -> float:
+        return self["task_memory"]
--- a/deltacat/compute/converter/model/convert_input_files.py
+++ b/deltacat/compute/converter/model/convert_input_files.py
@@ -1,15 +1,21 @@
 from __future__ import annotations
-from typing import Dict
+from typing import Dict, List, Any, Optional, Tuple
+from pyiceberg.manifest import DataFile
+
+# Type aliases to simplify nested types
+DataFileWithSequence = Tuple[int, DataFile]  # (sequence_number, data_file)
+DataFileList = List[DataFileWithSequence]  # List of data files with sequence numbers
+DataFileListGroup = List[DataFileList]  # Group of data file lists
 
 
 class ConvertInputFiles(Dict):
     @staticmethod
     def of(
-        partition_value,
-        all_data_files_for_dedupe=None,
-        applicable_data_files=None,
-        applicable_equality_delete_files=None,
-        existing_position_delete_files=None,
+        partition_value: Any,
+        all_data_files_for_dedupe: Optional[DataFileList] = None,
+        applicable_data_files: Optional[DataFileListGroup] = None,
+        applicable_equality_delete_files: Optional[DataFileListGroup] = None,
+        existing_position_delete_files: Optional[DataFileList] = None,
     ) -> ConvertInputFiles:
 
         result = ConvertInputFiles()
@@ -21,41 +27,52 @@ class ConvertInputFiles(Dict):
         return result
 
     @property
-    def partition_value(self):
+    def partition_value(self) -> Any:
         return self["partition_value"]
 
     @property
-    def all_data_files_for_dedupe(self):
+    def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
         return self["all_data_files_for_dedupe"]
 
     @property
-    def applicable_data_files(
+    def applicable_data_files(self) -> Optional[DataFileListGroup]:
         return self["applicable_data_files"]
 
     @property
-    def applicable_equality_delete_files(
+    def applicable_equality_delete_files(
+        self,
+    ) -> Optional[DataFileListGroup]:
         return self["applicable_equality_delete_files"]
 
     @property
-    def existing_position_delete_files(self):
+    def existing_position_delete_files(self) -> Optional[DataFileList]:
         return self["existing_position_delete_files"]
 
     @partition_value.setter
-    def partition_value(self, partition_value):
+    def partition_value(self, partition_value: Any) -> None:
         self["partition_value"] = partition_value
 
     @all_data_files_for_dedupe.setter
-    def all_data_files_for_dedupe(
+    def all_data_files_for_dedupe(
+        self, all_data_files_for_dedupe: Optional[DataFileList]
+    ) -> None:
         self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
 
     @applicable_data_files.setter
-    def applicable_data_files(
+    def applicable_data_files(
+        self, applicable_data_files: Optional[DataFileListGroup]
+    ) -> None:
         self["applicable_data_files"] = applicable_data_files
 
     @applicable_equality_delete_files.setter
-    def applicable_equality_delete_files(
+    def applicable_equality_delete_files(
+        self,
+        applicable_equality_delete_files: Optional[DataFileListGroup],
+    ) -> None:
         self["applicable_equality_delete_files"] = applicable_equality_delete_files
 
     @existing_position_delete_files.setter
-    def existing_position_delete_files(
+    def existing_position_delete_files(
+        self, existing_position_delete_files: Optional[DataFileList]
+    ) -> None:
         self["existing_position_delete_files"] = existing_position_delete_files
--- a/deltacat/compute/converter/model/convert_result.py
+++ b/deltacat/compute/converter/model/convert_result.py
@@ -1,18 +1,22 @@
 from __future__ import annotations
-from typing import Dict
+from typing import Dict, List, Any
+from pyiceberg.manifest import DataFile
 
 
 class ConvertResult(Dict):
     @staticmethod
     def of(
-        convert_task_index,
-        to_be_added_files,
-        to_be_deleted_files,
-        position_delete_record_count,
-        input_data_files_record_count,
-        input_data_files_hash_columns_in_memory_sizes,
-        position_delete_in_memory_sizes,
-        position_delete_on_disk_sizes,
+        convert_task_index: int,
+        to_be_added_files: List[DataFile],
+        to_be_deleted_files: Dict[Any, List[DataFile]],
+        position_delete_record_count: int,
+        input_data_files_record_count: int,
+        input_data_files_hash_columns_in_memory_sizes: int,
+        position_delete_in_memory_sizes: int,
+        position_delete_on_disk_sizes: int,
+        input_data_files_on_disk_size: int,
+        peak_memory_usage_bytes: int,
+        memory_usage_percentage: float,
     ) -> ConvertResult:
 
         result = ConvertResult()
@@ -26,6 +30,9 @@ class ConvertResult(Dict):
         ] = input_data_files_hash_columns_in_memory_sizes
         result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
         result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+        result["input_data_files_on_disk_size"] = input_data_files_on_disk_size
+        result["peak_memory_usage_bytes"] = peak_memory_usage_bytes
+        result["memory_usage_percentage"] = memory_usage_percentage
         return result
 
     @property
@@ -33,29 +40,41 @@ class ConvertResult(Dict):
         return self["convert_task_index"]
 
     @property
-    def to_be_added_files(self):
+    def to_be_added_files(self) -> List[DataFile]:
         return self["to_be_added_files"]
 
     @property
-    def to_be_deleted_files(self):
+    def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
         return self["to_be_deleted_files"]
 
     @property
-    def position_delete_record_count(self):
+    def position_delete_record_count(self) -> int:
         return self["position_delete_record_count"]
 
     @property
-    def input_data_files_record_count(self):
+    def input_data_files_record_count(self) -> int:
         return self["input_data_files_record_count"]
 
     @property
-    def input_data_files_hash_columns_in_memory_sizes(self):
+    def input_data_files_hash_columns_in_memory_sizes(self) -> int:
         return self["input_data_files_hash_columns_in_memory_sizes"]
 
     @property
-    def position_delete_in_memory_sizes(self):
+    def position_delete_in_memory_sizes(self) -> int:
         return self["position_delete_in_memory_sizes"]
 
     @property
-    def position_delete_on_disk_sizes(self):
+    def position_delete_on_disk_sizes(self) -> int:
         return self["position_delete_on_disk_sizes"]
+
+    @property
+    def input_data_files_on_disk_size(self) -> int:
+        return self["input_data_files_on_disk_size"]
+
+    @property
+    def peak_memory_usage_bytes(self) -> int:
+        return self["peak_memory_usage_bytes"]
+
+    @property
+    def memory_usage_percentage(self) -> float:
+        return self["memory_usage_percentage"]
--- a/deltacat/compute/converter/model/converter_session_params.py
+++ b/deltacat/compute/converter/model/converter_session_params.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
-from typing import Optional, Dict
+from typing import Optional, Dict, Any, List
 from deltacat.compute.converter.constants import (
     DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
 )
 from deltacat.constants import DEFAULT_NAMESPACE
 from fsspec import AbstractFileSystem
+from pyiceberg.catalog import Catalog
 
 
 class ConverterSessionParams(dict):
@@ -13,7 +14,7 @@ class ConverterSessionParams(dict):
     """
 
     @staticmethod
-    def of(params: Optional[Dict]) -> ConverterSessionParams:
+    def of(params: Optional[Dict[str, Any]]) -> ConverterSessionParams:
         params = {} if params is None else params
         assert params.get("catalog") is not None, "catalog is a required arg"
         assert (
@@ -41,13 +42,13 @@ class ConverterSessionParams(dict):
         )
         result.merge_keys = params.get("merge_keys", None)
         result.s3_client_kwargs = params.get("s3_client_kwargs", {})
-        result.
+        result.filesystem = params.get("filesystem", None)
         result.s3_prefix_override = params.get("s3_prefix_override", None)
 
         return result
 
     @property
-    def catalog(self):
+    def catalog(self) -> Catalog:
         return self["catalog"]
 
     @property
@@ -63,7 +64,7 @@ class ConverterSessionParams(dict):
         return self["iceberg_namespace"]
 
     @iceberg_namespace.setter
-    def iceberg_namespace(self, iceberg_namespace) -> None:
+    def iceberg_namespace(self, iceberg_namespace: str) -> None:
         self["iceberg_namespace"] = iceberg_namespace
 
     @property
@@ -71,7 +72,9 @@ class ConverterSessionParams(dict):
         return self["enforce_primary_key_uniqueness"]
 
     @enforce_primary_key_uniqueness.setter
-    def enforce_primary_key_uniqueness(
+    def enforce_primary_key_uniqueness(
+        self, enforce_primary_key_uniqueness: bool
+    ) -> None:
         self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
 
     @property
@@ -80,7 +83,7 @@ class ConverterSessionParams(dict):
 
     @compact_previous_position_delete_files.setter
     def compact_previous_position_delete_files(
-        self, compact_previous_position_delete_files
+        self, compact_previous_position_delete_files: bool
     ) -> None:
         self[
             "compact_previous_position_delete_files"
@@ -92,50 +95,50 @@ class ConverterSessionParams(dict):
 
     @position_delete_for_multiple_data_files.setter
     def position_delete_for_multiple_data_files(
-        self, position_delete_for_multiple_data_files
+        self, position_delete_for_multiple_data_files: bool
    ) -> None:
         self[
             "position_delete_for_multiple_data_files"
         ] = position_delete_for_multiple_data_files
 
     @property
-    def task_max_parallelism(self) ->
+    def task_max_parallelism(self) -> int:
         return self["task_max_parallelism"]
 
     @task_max_parallelism.setter
-    def task_max_parallelism(self, task_max_parallelism) -> None:
+    def task_max_parallelism(self, task_max_parallelism: int) -> None:
         self["task_max_parallelism"] = task_max_parallelism
 
     @property
-    def merge_keys(self) -> str:
+    def merge_keys(self) -> Optional[List[str]]:
         return self["merge_keys"]
 
     @merge_keys.setter
-    def merge_keys(self, merge_keys) -> None:
+    def merge_keys(self, merge_keys: Optional[List[str]]) -> None:
         self["merge_keys"] = merge_keys
 
     @property
-    def s3_client_kwargs(self) -> Dict:
+    def s3_client_kwargs(self) -> Dict[str, Any]:
         return self["s3_client_kwargs"]
 
     @s3_client_kwargs.setter
-    def s3_client_kwargs(self, s3_client_kwargs) -> None:
+    def s3_client_kwargs(self, s3_client_kwargs: Dict[str, Any]) -> None:
         self["s3_client_kwargs"] = s3_client_kwargs
 
     @property
-    def
-        return self["
+    def filesystem(self) -> Optional[AbstractFileSystem]:
+        return self["filesystem"]
 
-    @
-    def
-        self["
+    @filesystem.setter
+    def filesystem(self, filesystem: Optional[AbstractFileSystem]) -> None:
+        self["filesystem"] = filesystem
 
     @property
-    def location_provider_prefix_override(self) -> str:
+    def location_provider_prefix_override(self) -> Optional[str]:
         return self["location_provider_prefix_override"]
 
     @location_provider_prefix_override.setter
     def location_provider_prefix_override(
-        self, location_provider_prefix_override
+        self, location_provider_prefix_override: Optional[str]
     ) -> None:
         self["location_provider_prefix_override"] = location_provider_prefix_override
--- a/deltacat/compute/converter/pyiceberg/catalog.py
+++ b/deltacat/compute/converter/pyiceberg/catalog.py
@@ -1,8 +1,15 @@
-from typing import Optional
-
-
-
-
+from typing import Optional, Dict, Any
+from pyiceberg.table import Table
+from pyiceberg.catalog import Catalog, load_catalog as pyiceberg_load_catalog
+from botocore.credentials import Credentials
+import boto3
+from boto3.session import Session
+
+
+def load_catalog(
+    iceberg_catalog_name: str, iceberg_catalog_properties: Dict[str, Any]
+) -> Catalog:
+    catalog = pyiceberg_load_catalog(
         name=iceberg_catalog_name,
         **iceberg_catalog_properties,
     )
@@ -23,25 +30,21 @@ def get_s3_path(
     return result_path
 
 
-def get_bucket_name():
-    return "
+def get_bucket_name() -> str:
+    return "test-bucket"
 
 
-def get_s3_prefix():
+def get_s3_prefix() -> str:
     return get_s3_path(get_bucket_name())
 
 
-def get_credential():
-
-
-    boto3_session = boto3.Session()
-    credentials = boto3_session.get_credentials()
+def get_credential() -> Credentials:
+    boto3_session: Session = boto3.Session()
+    credentials: Credentials = boto3_session.get_credentials()
     return credentials
 
 
-def get_glue_catalog():
-    from pyiceberg.catalog import load_catalog
-
+def get_glue_catalog() -> Catalog:
     credential = get_credential()
     # Credentials are refreshable, so accessing your access key / secret key
     # separately can lead to a race condition. Use this to get an actual matched
@@ -51,7 +54,7 @@ def get_glue_catalog():
     secret_access_key = credential.secret_key
     session_token = credential.token
     s3_path = get_s3_prefix()
-    glue_catalog =
+    glue_catalog = pyiceberg_load_catalog(
         "glue",
         **{
             "warehouse": s3_path,
@@ -70,6 +73,6 @@ def get_glue_catalog():
     return glue_catalog
 
 
-def load_table(catalog, table_name):
+def load_table(catalog: Catalog, table_name: str) -> Table:
     loaded_table = catalog.load_table(table_name)
     return loaded_table
--- a/deltacat/compute/converter/pyiceberg/overrides.py
+++ b/deltacat/compute/converter/pyiceberg/overrides.py
@@ -11,7 +11,7 @@ from pyiceberg.io.pyarrow import (
     MetricsMode,
     StatsAggregator,
 )
-from typing import Dict, List, Set
+from typing import Dict, List, Set, Any, Tuple
 from deltacat.compute.converter.utils.iceberg_columns import (
     ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
     ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
@@ -24,18 +24,23 @@ from pyiceberg.manifest import (
     DataFileContent,
     FileFormat,
 )
-from pyiceberg.table import _min_sequence_number, _open_manifest
+from pyiceberg.table import _min_sequence_number, _open_manifest, Table
 from pyiceberg.utils.concurrent import ExecutorFactory
 from itertools import chain
 from pyiceberg.typedef import (
     KeyDefaultDict,
 )
+from pyiceberg.schema import Schema
+from pyiceberg.io import FileIO
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+)
 
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def parquet_path_to_id_mapping_override(schema):
+def parquet_path_to_id_mapping_override(schema: Schema) -> Dict[str, int]:
     res = parquet_path_to_id_mapping(schema)
     # Override here to insert position delete reserved column field IDs
     res["file_path"] = ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN
@@ -155,13 +160,16 @@ def data_file_statistics_from_parquet_metadata(
     )
 
 
-def parquet_files_dict_to_iceberg_data_files(
-
+def parquet_files_dict_to_iceberg_data_files(
+    io: FileIO,
+    table_metadata: Any,
+    files_dict: Dict[Any, List[str]],
+    file_content_type: DataFileContent,
+) -> List[DataFile]:
     iceberg_files = []
     schema = table_metadata.schema()
     for partition_value, file_paths in files_dict.items():
         for file_path in file_paths:
-            logger.info(f"DEBUG_file_path:{file_path}")
             input_file = io.new_input(file_path)
             with input_file.open() as input_stream:
                 parquet_metadata = pq.read_metadata(input_stream)
@@ -177,7 +185,7 @@ def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict):
             )
 
             data_file = DataFile(
-                content=
+                content=file_content_type,
                 file_path=file_path,
                 file_format=FileFormat.PARQUET,
                 partition=partition_value,
@@ -192,10 +200,11 @@ def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict):
     return iceberg_files
 
 
-def fetch_all_bucket_files(
+def fetch_all_bucket_files(
+    table: Table,
+) -> Tuple[Dict[Any, DataFileList], Dict[Any, DataFileList], Dict[Any, DataFileList]]:
     # step 1: filter manifests using partition summaries
     # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
-
     data_scan = table.scan()
     snapshot = data_scan.snapshot()
     if not snapshot: