deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py

@@ -201,7 +201,7 @@ def _timed_hash_bucket(
     with memray.Tracker(
         f"hash_bucket_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        sort_key_names = [key.key for key in sort_keys]
+        sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
         if not round_completion_info:
             is_src_delta = True
         else:
deltacat/compute/compactor/steps/materialize.py

@@ -25,9 +25,10 @@ from deltacat.storage import (
     DeltaType,
     Partition,
     PartitionLocator,
-    Manifest,
     ManifestEntry,
+    ManifestEntryList,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -82,7 +83,10 @@ def materialize(
         assert (
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
-        manifest = Manifest.of(manifest_entry_list_reference)
+        manifest = Manifest.of(
+            entries=ManifestEntryList.of(manifest_entry_list_reference),
+            uuid=str(uuid4()),
+        )
         delta = Delta.of(
             locator=DeltaLocator.of(partition.locator),
             delta_type=delta_type,
deltacat/compute/compactor/utils/io.py

@@ -358,7 +358,7 @@ def fit_input_deltas(
 def _discover_deltas(
     source_partition_locator: PartitionLocator,
     start_position_exclusive: Optional[int],
-    end_position_inclusive: int,
+    end_position_inclusive: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
     list_deltas_kwargs: Optional[Dict[str, Any]] = {},
deltacat/compute/compactor/utils/sort_key.py

@@ -1,6 +1,7 @@
 import pyarrow as pa
 from typing import List
-from deltacat.storage import PartitionLocator, SortKey
+from itertools import chain
+from deltacat.storage import PartitionLocator, SortKey, TransformName

 MAX_SORT_KEYS_BIT_WIDTH = 256

@@ -22,7 +23,13 @@ def validate_sort_keys(
         deltacat_storage_kwargs = {}
     total_sort_keys_bit_width = 0
     if sort_keys:
-        sort_key_names = [key.key for key in sort_keys]
+        sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
+        assert all(
+            [
+                key.transform is None or key.transform.name == TransformName.IDENTITY
+                for key in sort_keys
+            ]
+        ), f"Sort key transforms are not supported: {sort_keys}"
         assert len(sort_key_names) == len(
             set(sort_key_names)
         ), f"Sort key names must be unique: {sort_key_names}"
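The flattening change above reflects that a 2.0 sort key can span multiple columns: `key.key` is now a list of field names rather than a single string, so the names are chained into one flat list before the uniqueness check. A minimal sketch of that behavior (the `FakeSortKey` shape is an illustrative stand-in, not the real deltacat class):

```python
from itertools import chain
from typing import List, NamedTuple


class FakeSortKey(NamedTuple):
    # Illustrative stand-in: in 2.0 a sort key may reference several columns.
    key: List[str]


def flatten_sort_key_names(sort_keys: List[FakeSortKey]) -> List[str]:
    # chain.from_iterable concatenates the per-key column-name lists into
    # one flat list, which is then checked for duplicates downstream.
    return list(chain.from_iterable([key.key for key in sort_keys]))


keys = [FakeSortKey(key=["event_time"]), FakeSortKey(key=["region", "user_id"])]
print(flatten_sort_key_names(keys))  # ['event_time', 'region', 'user_id']
```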
deltacat/compute/compactor_v2/compaction_session.py

@@ -27,9 +27,8 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
 from deltacat.storage import (
     Delta,
     DeltaLocator,
-    Manifest,
-    Partition,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
@@ -139,7 +138,7 @@ def _execute_compaction(
         logger.info("No input deltas found to compact.")
         return ExecutionCompactionResult(None, None, None, False)
     build_uniform_deltas_result: tuple[
-       List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
+        List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
     ] = _build_uniform_deltas(
         params, compaction_audit, input_deltas, delta_discovery_start
     )
deltacat/compute/compactor_v2/constants.py

@@ -1,5 +1,3 @@
-from deltacat.utils.common import env_bool, env_integer, env_string
-
 TOTAL_BYTES_IN_SHA1_HASH = 20

 PK_DELIMITER = "L6kl7u5f"
@@ -33,9 +31,7 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
-    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
-)
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024

 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -82,28 +78,3 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
-
-# Whether to perform sha1 hashing when required to
-# optimize memory. For example, hashing is always
-# required for bucketing where it's not mandatory
-# when dropping duplicates. Setting this to True
-# will disable sha1 hashing in cases where it isn't
-# mandatory. This flag is False by default.
-SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
-    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
-)
-
-# This env variable specifies whether to check bucketing spec
-# compliance of the existing compacted table.
-# PRINT_LOG: Enable logging if any partition is found
-# to be non-compliant with the bucketing spec.
-# ASSERT: Fail the job with ValidationError if the
-# current compacted partition is found to be non-compliant
-# with bucketing spec. Note, logging is implicitly enabled
-# in this case.
-BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
-    "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
-)
-
-BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
-BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
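The deleted constants were runtime-tunable through small environment-variable helpers (`env_bool`, `env_integer`, `env_string` in `deltacat.utils.common`, per the removed import); 2.0 pins MAX_SIZE_OF_RECORD_BATCH_IN_GIB to a fixed value and drops the SHA1 and bucketing-spec toggles outright. A sketch of the env-override pattern being removed (the helper body is an assumption that mirrors the removed names):

```python
import os


def env_integer(name: str, default: int) -> int:
    # Assumed behavior of deltacat.utils.common.env_integer: fall back
    # to `default` unless the environment variable is set.
    value = os.environ.get(name)
    return int(value) if value is not None else default


# 1.1.x style: overridable at deployment time...
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
)
# ...2.0 style: a plain constant.
# MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
```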
deltacat/compute/compactor_v2/deletes/utils.py

@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
     ] = [
         (is_delete, list(delete_delta_group))
         for (is_delete, _), delete_delta_group in itertools.groupby(
-            input_deltas, lambda d: (d.type is DeltaType.DELETE, d.delete_parameters)
+            input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
         )
     ]
     for (
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
     consecutive_delete_tables: List[pa.Table] = []
     for delete_delta in delete_delta_sequence:
         assert (
-            delete_delta.delete_parameters is not None
+            delete_delta.meta.entry_params is not None
         ), "Delete type deltas are required to have delete parameters defined"
         delete_columns: Optional[
             List[str]
-        ] = delete_delta.delete_parameters.equality_column_names
+        ] = delete_delta.meta.entry_params.equality_field_locators
         assert len(delete_columns) > 0, "At least 1 delete column is required"
         # delete columns should exist in underlying table
         delete_dataset = params.deltacat_storage.download_delta(
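For context, itertools.groupby only merges adjacent items with an equal key, which is why the deltas are keyed on the (is DELETE, delete parameters) pair: consecutive DELETE deltas sharing the same parameters collapse into one group, while any non-DELETE delta acts as a boundary. A standalone sketch of that grouping (the `FakeDelta` shape is an illustrative stand-in):

```python
import itertools
from typing import List, NamedTuple, Optional, Tuple


class FakeDelta(NamedTuple):
    # Illustrative stand-in for a deltacat Delta.
    is_delete: bool
    entry_params: Optional[Tuple[str, ...]]


def aggregate_consecutive_deletes(
    deltas: List[FakeDelta],
) -> List[Tuple[bool, List[FakeDelta]]]:
    # groupby only merges *adjacent* items with an equal key, so
    # non-DELETE deltas end the current run of DELETEs.
    return [
        (is_delete, list(group))
        for (is_delete, _), group in itertools.groupby(
            deltas, lambda d: (d.is_delete, d.entry_params)
        )
    ]


deltas = [
    FakeDelta(True, ("pk",)),
    FakeDelta(True, ("pk",)),  # grouped with the one above
    FakeDelta(False, None),    # boundary: ends the DELETE run
    FakeDelta(True, ("pk",)),  # starts a new DELETE group
]
for is_delete, group in aggregate_consecutive_deletes(deltas):
    print(is_delete, len(group))  # True 2 / False 1 / True 1
```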
deltacat/compute/compactor_v2/model/merge_input.py

@@ -43,7 +43,7 @@ class MergeInput(Dict):
         round_completion_info: Optional[RoundCompletionInfo] = None,
         object_store: Optional[IObjectStore] = None,
         delete_strategy: Optional[DeleteStrategy] = None,
-        delete_file_envelopes: Optional[List] = None,
+        delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
         memory_logs_enabled: Optional[bool] = None,
deltacat/compute/compactor_v2/private/compaction_utils.py

@@ -63,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
 from deltacat.compute.compactor_v2.steps import hash_bucket as hb
 from deltacat.compute.compactor_v2.utils import io

-from typing import List, Optional
+from typing import List, Optional, Union
 from collections import defaultdict
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -83,7 +83,7 @@ def _fetch_compaction_metadata(

     # read the results from any previously completed compaction round
     round_completion_info: Optional[RoundCompletionInfo] = None
-    high_watermark: Optional[HighWatermark] = None
+    high_watermark: Optional[Union[HighWatermark, int]] = None
     previous_compacted_delta_manifest: Optional[Manifest] = None

     if not params.rebase_source_partition_locator:
@@ -129,7 +129,7 @@ def _build_uniform_deltas(
     mutable_compaction_audit: CompactionSessionAuditInfo,
     input_deltas: List[Delta],
     delta_discovery_start: float,
-) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
+) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:

     delete_strategy: Optional[DeleteStrategy] = None
     delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
@@ -222,7 +222,7 @@ def _run_hash_and_merge(
     uniform_deltas: List[DeltaAnnotated],
     round_completion_info: RoundCompletionInfo,
     delete_strategy: Optional[DeleteStrategy],
-    delete_file_envelopes: Optional[DeleteFileEnvelope],
+    delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
     mutable_compaction_audit: CompactionSessionAuditInfo,
     previous_compacted_delta_manifest: Optional[Manifest],
     compacted_partition: Partition,
@@ -389,7 +389,7 @@ def _merge(
     all_hash_group_idx_to_obj_id: dict,
     compacted_partition: Partition,
     delete_strategy: DeleteStrategy,
-    delete_file_envelopes: DeltaFileEnvelope,
+    delete_file_envelopes: List[DeleteFileEnvelope],
 ) -> tuple[List[MergeResult], float]:
     merge_options_provider = functools.partial(
         task_resource_options_provider,
deltacat/compute/compactor_v2/steps/merge.py

@@ -7,7 +7,6 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
-from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -32,25 +31,21 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
-    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
     DeltaLocator,
     DeltaType,
-    Manifest,
     Partition,
     interface as unimplemented_deltacat_storage,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
 from deltacat.constants import BYTES_PER_GIBIBYTE
 from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
-    BUCKETING_SPEC_COMPLIANCE_PROFILE,
-    BUCKETING_SPEC_COMPLIANCE_ASSERT,
-    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -152,32 +147,10 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]

-        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
-        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
-
-        logger.info(
-            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
-            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
-        )
-
-        if (
-            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
-            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
-        ):
-            logger.info("Casting compacted and incremental pk hash to large_string...")
-            # is_in combines the chunks of the chunked array passed which can cause
-            # ArrowCapacityError if the total size of string array is over 2GB.
-            # Using a large_string would resolve this issue.
-            # The cast here should be zero-copy in most cases.
-            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
-            incremental_pk_hash_str = pc.cast(
-                incremental_pk_hash_str, pa.large_string()
-            )
-
         records_to_keep = pc.invert(
             pc.is_in(
-                compacted_pk_hash_str,
-                incremental_pk_hash_str,
+                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
             )
         )

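The simplified _merge_tables path drops the large_string casting and keeps the anti-join at its core: compacted records survive only if their primary-key hash does not appear in the incremental table, so incremental rows win every overlap. A self-contained sketch of that UPSERT pattern with PyArrow (table contents and column names are illustrative):

```python
import pyarrow as pa
import pyarrow.compute as pc

# Compacted (old) and incremental (new) rows, keyed by a pk-hash column.
compacted = pa.table({"pk_hash": ["a", "b", "c"], "v": [1, 2, 3]})
incremental = pa.table({"pk_hash": ["b", "d"], "v": [20, 40]})

# Keep only compacted records whose key does NOT appear in the
# incremental table; incremental rows then win for overlapping keys.
records_to_keep = pc.invert(
    pc.is_in(compacted["pk_hash"], value_set=incremental["pk_hash"])
)
merged = pa.concat_tables([compacted.filter(records_to_keep), incremental])
print(merged.sort_by([("pk_hash", "ascending")]).to_pydict())
# {'pk_hash': ['a', 'b', 'c', 'd'], 'v': [1, 20, 3, 40]}
```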
@@ -192,34 +165,9 @@ def _merge_tables(
     return final_table


-def _validate_bucketing_spec_compliance(
-    table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
-) -> None:
-    pki_table = generate_pk_hash_column(
-        [table], primary_keys=primary_keys, requires_hash=True
-    )[0]
-    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
-        hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
-        if hash_bucket != hb_index:
-            logger.info(
-                f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
-                f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
-                f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
-                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
-            )
-            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
-                raise AssertionError(
-                    "Hash bucket drift detected. Expected hash bucket index"
-                    f" to be {hb_index} but found {hash_bucket}"
-                )
-            # No further checks necessary
-            break
-
-
 def _download_compacted_table(
     hb_index: int,
     rcf: RoundCompletionInfo,
-    primary_keys: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[dict] = None,
@@ -243,23 +191,7 @@ def _download_compacted_table(

         tables.append(table)

-
-    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
-        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
-        BUCKETING_SPEC_COMPLIANCE_ASSERT,
-    ]
-
-    logger.debug(
-        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
-        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
-    )
-
-    # Bucketing spec compliance isn't required without primary keys
-    if primary_keys and check_bucketing_spec:
-        _validate_bucketing_spec_compliance(
-            compacted_table, rcf, hb_index, primary_keys
-        )
-    return compacted_table
+    return pa.concat_tables(tables)


 def _copy_all_manifest_files_from_old_hash_buckets(
@@ -518,7 +450,9 @@ def _apply_upserts(
     # on non event based sort key does not produce consistent
     # compaction results. E.g., compaction(delta1, delta2, delta3)
     # will not be equal to compaction(compaction(delta1, delta2), delta3).
-    table = table.sort_by(input.sort_keys)
+    table = table.sort_by(
+        [pa_key for key in input.sort_keys for pa_key in key.arrow]
+    )
     hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
     table, merge_time = timed_invocation(
         func=_merge_tables,
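The sort_by change suggests that 1.x sort keys were already directly consumable as (column, order) pairs, while the 2.0 SortKey exposes an `arrow` property yielding one or more such pairs per key, hence the nested comprehension that flattens them. A quick illustration of the flattened form that PyArrow's Table.sort_by accepts (column names are illustrative):

```python
import pyarrow as pa

table = pa.table({"region": ["b", "a", "a"], "ts": [2, 3, 1]})

# Table.sort_by accepts a list of (column_name, "ascending"|"descending")
# tuples; multi-column sort keys are flattened into this form.
sort_keys = [("region", "ascending"), ("ts", "descending")]
print(table.sort_by(sort_keys).to_pydict())
# {'region': ['a', 'a', 'b'], 'ts': [3, 1, 2]}
```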
@@ -560,11 +494,9 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with (
-        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
-        if input.enable_profiler
-        else nullcontext()
-    ):
+    with memray.Tracker(
+        f"merge_{worker_id}_{task_id}.bin"
+    ) if input.enable_profiler else nullcontext():
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
@@ -588,7 +520,6 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         compacted_table = _download_compacted_table(
             hb_index=merge_file_group.hb_index,
             rcf=input.round_completion_info,
-            primary_keys=input.primary_keys,
             read_kwargs_provider=input.read_kwargs_provider,
             deltacat_storage=input.deltacat_storage,
             deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -673,5 +604,5 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[3],
         merge_result[4],
         np.double(emit_metrics_time),
-        merge_result[
+        merge_result[4],
     )
deltacat/compute/compactor_v2/utils/content_type_params.py

@@ -5,7 +5,6 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
-from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
@@ -76,21 +75,11 @@ def _download_parquet_metadata_for_manifest_entry(
     entry_index: int,
     deltacat_storage: unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
-    logger.info(
-        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
-    )
-    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
-        logger.info(
-            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
-        )
-        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
         table_type=TableType.PYARROW_PARQUET,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
         **deltacat_storage_kwargs,
     )

@@ -108,15 +97,11 @@ def append_content_type_params(
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
-    logger.info(
-        f"Appending the content type params for Delta with locator {delta.locator}..."
-    )

     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -174,7 +159,6 @@ def append_content_type_params(

     def input_provider(index, item) -> Dict:
         return {
-            "file_reader_kwargs_provider": file_reader_kwargs_provider,
             "deltacat_storage_kwargs": deltacat_storage_kwargs,
             "deltacat_storage": deltacat_storage,
             "delta": delta,
@@ -184,7 +168,6 @@ def append_content_type_params(
     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
-
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,
deltacat/compute/compactor_v2/utils/dedupe.py

@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")

     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths)
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
     return result
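What this helper appears to compute is a global row index over a chunked array: each chunk gets a local 0..len-1 range, then every chunk's range is shifted by the cumulative length of the chunks before it. A standalone sketch of that cumsum-offset trick (chunk lengths are illustrative):

```python
import numpy as np
import pyarrow as pa

# Per-chunk local indices 0..len(chunk)-1, then shift each chunk by the
# number of elements before it so the result enumerates rows globally.
chunk_lengths = [3, 2, 4]
local = [np.arange(cl, dtype="int32") for cl in chunk_lengths]
offsets = np.cumsum(([0] + chunk_lengths)[:-1])  # [0, 3, 5]
global_indices = pa.chunked_array([arr + off for arr, off in zip(local, offsets)])
print(global_indices.to_pylist())  # [0, 1, 2, 3, 4, 5, 6, 7, 8]
```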
deltacat/compute/compactor_v2/utils/io.py

@@ -101,6 +101,7 @@ def create_uniform_input_deltas(
     delta_manifest_entries_count = 0
     estimated_da_bytes = 0
     input_da_list = []
+
     for delta in input_deltas:
         if (
             compact_partition_params.enable_input_split
@@ -117,7 +118,6 @@ def create_uniform_input_deltas(
             deltacat_storage_kwargs=deltacat_storage_kwargs,
             task_max_parallelism=compact_partition_params.task_max_parallelism,
             max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
-            file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
         )

     manifest_entries = delta.manifest.entries
deltacat/compute/compactor_v2/utils/primary_key_index.py

@@ -10,7 +10,6 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
-    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -49,13 +48,6 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )

-    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
-        logger.info(
-            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
-            f"Returning False for is_sha1_desired"
-        )
-        return False
-
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


@@ -116,10 +108,9 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        if (
-            current_bytes + record_batch.nbytes
-            >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
-        ):
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -137,9 +128,6 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()

-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
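The batching loop now appends each record batch before testing the size threshold, so the batch that crosses the limit is flushed with the group it arrived in rather than deferred to the next one. A minimal sketch of this accumulate-then-flush pattern (the byte threshold and batch representation are simplified):

```python
from typing import Iterable, Iterator, List

MAX_GROUP_BYTES = 64  # stand-in for MAX_SIZE_OF_RECORD_BATCH_IN_GIB


def group_batches(batch_sizes: Iterable[int]) -> Iterator[List[int]]:
    group: List[int] = []
    current_bytes = 0
    for size in batch_sizes:
        # Append first, then flush once the accumulated size crosses the
        # threshold, so every batch ships with the group it arrived in.
        current_bytes += size
        group.append(size)
        if current_bytes >= MAX_GROUP_BYTES:
            yield group
            group, current_bytes = [], 0
    if group:  # flush the remainder
        yield group


print(list(group_batches([30, 30, 10, 50, 20])))
# [[30, 30, 10], [50, 20]]
```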
deltacat/compute/compactor_v2/utils/task_options.py

@@ -1,16 +1,11 @@
 import logging
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
-from deltacat.compute.compactor_v2.constants import (
-    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
-)
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     LocalMergeFileGroupsProvider,
 )
 from deltacat.storage import (
     Manifest,
-    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -86,27 +81,16 @@ def _get_merge_task_options(
         and compacted_delta_manifest
         and round_completion_info.hb_index_to_entry_range
     ):
-
-
-
-
-        previous_inflation: float = (
-            (
-                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                / round_completion_info.compacted_pyarrow_write_result.file_bytes
-            )
-            if round_completion_info.compacted_pyarrow_write_result.file_bytes
-            else PYARROW_INFLATION_MULTIPLIER
+
+        previous_inflation = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.file_bytes
         )
         debug_memory_params["previous_inflation"] = previous_inflation

-        average_record_size: float = (
-            (
-                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                / round_completion_info.compacted_pyarrow_write_result.records
-            )
-            if round_completion_info.compacted_pyarrow_write_result.records
-            else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
+        average_record_size = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.records
         )
         debug_memory_params["average_record_size"] = average_record_size
@@ -122,36 +106,31 @@ def _get_merge_task_options(
             str(hb_idx)
         ]
         for entry_index in range(entry_start, entry_end):
-            entry = compacted_delta_manifest.entries[entry_index]
-            current_entry_size = (
-                estimate_manifest_entry_size_bytes(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
-                )
-                or 0.0
+            entry = compacted_delta_manifest.entries[entry_index]
+
+            current_entry_size = estimate_manifest_entry_size_bytes(
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=estimate_resources_params,
             )
-            current_entry_rows = (
-                estimate_manifest_entry_num_rows(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
-                )
-                or 0
+            current_entry_rows = estimate_manifest_entry_num_rows(
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=estimate_resources_params,
            )

             data_size += current_entry_size
             num_rows += current_entry_rows

             if primary_keys:
-                pk_size: Optional[
-                    float
-                ] = estimate_manifest_entry_column_size_bytes(
+                pk_size = estimate_manifest_entry_column_size_bytes(
                     entry=entry,
                     columns=primary_keys,
                     operation_type=OperationType.PYARROW_DOWNLOAD,
                     estimate_resources_params=estimate_resources_params,
                 )
-                if pk_size is None:
+
+                if pk_size is None:
                     pk_size_bytes += current_entry_size
                 else:
                     pk_size_bytes += pk_size
@@ -180,6 +159,7 @@ def _get_merge_task_options(
         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
         memory_logs_enabled,
     )
+
     return _get_task_options(0.01, total_memory, ray_custom_resources)
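Both removed fallbacks give way to straight ratios here: previous_inflation is the in-memory (PyArrow) size per on-disk byte of the last compacted output, and average_record_size is PyArrow bytes per record. A toy illustration of how such ratios can feed a merge task's memory budget (the numbers and the final formula are invented for illustration; deltacat's actual estimator is more involved):

```python
# Toy numbers, not deltacat defaults.
pyarrow_bytes = 8 * 1024**3   # in-memory size of the last compacted output
file_bytes = 2 * 1024**3      # its compressed size on disk
records = 64_000_000

previous_inflation = pyarrow_bytes / file_bytes   # 4.0x growth when loaded
average_record_size = pyarrow_bytes / records     # ~134 bytes per record

# Rough budget for a task that downloads 1.5 GiB of compacted files and
# holds 10M incremental records in memory at once:
estimated_memory = (
    1.5 * 1024**3 * previous_inflation
    + 10_000_000 * average_record_size
)
print(f"{estimated_memory / 1024**3:.2f} GiB")  # 7.25 GiB
```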