deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py
@@ -201,7 +201,7 @@ def _timed_hash_bucket(
     with memray.Tracker(
         f"hash_bucket_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        sort_key_names = [key.
+        sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
         if not round_completion_info:
             is_src_delta = True
         else:
deltacat/compute/compactor/steps/materialize.py
@@ -25,9 +25,10 @@ from deltacat.storage import (
     DeltaType,
     Partition,
     PartitionLocator,
-    Manifest,
     ManifestEntry,
+    ManifestEntryList,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -82,7 +83,10 @@ def materialize(
         assert (
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
-        manifest = Manifest.of(
+        manifest = Manifest.of(
+            entries=ManifestEntryList.of(manifest_entry_list_reference),
+            uuid=str(uuid4()),
+        )
         delta = Delta.of(
             locator=DeltaLocator.of(partition.locator),
             delta_type=delta_type,
deltacat/compute/compactor/utils/io.py
@@ -358,7 +358,7 @@ def fit_input_deltas(
 def _discover_deltas(
     source_partition_locator: PartitionLocator,
     start_position_exclusive: Optional[int],
-    end_position_inclusive: int,
+    end_position_inclusive: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
     list_deltas_kwargs: Optional[Dict[str, Any]] = {},
deltacat/compute/compactor/utils/sort_key.py
@@ -1,6 +1,7 @@
 import pyarrow as pa
 from typing import List
-from
+from itertools import chain
+from deltacat.storage import PartitionLocator, SortKey, TransformName
 
 MAX_SORT_KEYS_BIT_WIDTH = 256
 
@@ -22,7 +23,13 @@ def validate_sort_keys(
         deltacat_storage_kwargs = {}
     total_sort_keys_bit_width = 0
     if sort_keys:
-        sort_key_names = [key.
+        sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
+        assert all(
+            [
+                key.transform is None or key.transform.name == TransformName.IDENTITY
+                for key in sort_keys
+            ]
+        ), f"Sort key transforms are not supported: {sort_keys}"
         assert len(sort_key_names) == len(
             set(sort_key_names)
         ), f"Sort key names must be unique: {sort_key_names}"
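For context on the sort-key change above: in the new storage model a single SortKey can cover multiple field names, so `key.key` is a list and the validator flattens it with `itertools.chain` before checking uniqueness, and only identity transforms are accepted. A minimal, self-contained sketch of that flattening logic (using a stand-in dataclass rather than DeltaCat's real SortKey class):

```python
from itertools import chain
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeSortKey:
    # stand-in for deltacat's SortKey: `key` holds one or more column names
    key: List[str]
    transform: Optional[str] = None  # only identity (None) transforms allowed here


def flatten_sort_key_names(sort_keys: List[FakeSortKey]) -> List[str]:
    # flatten [["a"], ["b", "c"]] -> ["a", "b", "c"]
    names = list(chain.from_iterable(k.key for k in sort_keys))
    assert len(names) == len(set(names)), f"Sort key names must be unique: {names}"
    assert all(k.transform is None for k in sort_keys), "Sort key transforms are not supported"
    return names


print(flatten_sort_key_names([FakeSortKey(["ts"]), FakeSortKey(["region", "id"])]))
# ['ts', 'region', 'id']
```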
deltacat/compute/compactor_v2/compaction_session.py
@@ -27,9 +27,8 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
 from deltacat.storage import (
     Delta,
     DeltaLocator,
-    Manifest,
-    Partition,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
@@ -69,17 +68,14 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
     ), "hash_bucket_count is a required arg for compactor v2"
-    assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
     if params.num_rounds > 1:
         assert (
             not params.drop_duplicates
         ), "num_rounds > 1, drop_duplicates must be False but is True"
 
-    with (
-
-
-        else nullcontext()
-    ):
+    with memray.Tracker(
+        "compaction_partition.bin"
+    ) if params.enable_profiler else nullcontext():
         execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
             params,
             **kwargs,
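The `with` change above restores the single-expression conditional context manager for optional memray profiling. A hedged sketch of the same pattern in isolation, assuming memray may or may not be installed (the file name and workload here are placeholders):

```python
import importlib.util
from contextlib import nullcontext

# memray is optional; fall back to a no-op context when it is absent or profiling is off
memray = importlib.import_module("memray") if importlib.util.find_spec("memray") else None


def run_step(enable_profiler: bool = False) -> int:
    tracker = memray.Tracker("step.bin") if (enable_profiler and memray) else nullcontext()
    with tracker:
        return sum(range(1000))  # placeholder for the real hash/merge work


print(run_step(enable_profiler=False))
```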
@@ -142,7 +138,7 @@ def _execute_compaction(
         logger.info("No input deltas found to compact.")
         return ExecutionCompactionResult(None, None, None, False)
     build_uniform_deltas_result: tuple[
-        List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
+        List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
     ] = _build_uniform_deltas(
         params, compaction_audit, input_deltas, delta_discovery_start
     )
deltacat/compute/compactor_v2/constants.py
@@ -1,5 +1,3 @@
-from deltacat.utils.common import env_bool, env_integer, env_string
-
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -33,9 +31,7 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
-    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
-)
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -82,28 +78,3 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
-
-# Whether to perform sha1 hashing when required to
-# optimize memory. For example, hashing is always
-# required for bucketing where it's not mandatory
-# when dropping duplicates. Setting this to True
-# will disable sha1 hashing in cases where it isn't
-# mandatory. This flag is False by default.
-SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
-    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
-)
-
-# This env variable specifies whether to check bucketing spec
-# compliance of the existing compacted table.
-# PRINT_LOG: Enable logging if any partition is found
-# to be non-compliant with the bucketing spec.
-# ASSERT: Fail the job with ValidationError if the
-# current compacted partition is found to be non-compliant
-# with bucketing spec. Note, logging is implicitly enabled
-# in this case.
-BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
-    "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
-)
-
-BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
-BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
deltacat/compute/compactor_v2/deletes/utils.py
@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
     ] = [
         (is_delete, list(delete_delta_group))
         for (is_delete, _), delete_delta_group in itertools.groupby(
-            input_deltas, lambda d: (d.type is DeltaType.DELETE, d.
+            input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
         )
     ]
     for (
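Note that `itertools.groupby` only merges consecutive elements that share a key, which is why `_aggregate_delete_deltas` can key on `(is_delete, entry_params)` to split an ordered delta sequence into runs. A small self-contained illustration with tuples standing in for deltas:

```python
import itertools

# (delta_type, params) pairs standing in for an ordered sequence of deltas
deltas = [
    ("UPSERT", None),
    ("DELETE", ("col_a",)),
    ("DELETE", ("col_a",)),
    ("UPSERT", None),
    ("DELETE", ("col_b",)),
]

runs = [
    (key, list(group))
    for key, group in itertools.groupby(deltas, key=lambda d: (d[0] == "DELETE", d[1]))
]
for (is_delete, params), group in runs:
    print(is_delete, params, len(group))
# False None 1 / True ('col_a',) 2 / False None 1 / True ('col_b',) 1
```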
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
     consecutive_delete_tables: List[pa.Table] = []
     for delete_delta in delete_delta_sequence:
         assert (
-            delete_delta.
+            delete_delta.meta.entry_params is not None
         ), "Delete type deltas are required to have delete parameters defined"
         delete_columns: Optional[
             List[str]
-        ] = delete_delta.
+        ] = delete_delta.meta.entry_params.equality_field_locators
         assert len(delete_columns) > 0, "At least 1 delete column is required"
         # delete columns should exist in underlying table
         delete_dataset = params.deltacat_storage.download_delta(
deltacat/compute/compactor_v2/model/merge_input.py
@@ -43,12 +43,11 @@ class MergeInput(Dict):
         round_completion_info: Optional[RoundCompletionInfo] = None,
         object_store: Optional[IObjectStore] = None,
         delete_strategy: Optional[DeleteStrategy] = None,
-        delete_file_envelopes: Optional[List] = None,
+        delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
         memory_logs_enabled: Optional[bool] = None,
         disable_copy_by_reference: Optional[bool] = None,
-        hash_bucket_count: Optional[int] = None,
     ) -> MergeInput:
 
         result = MergeInput()
@@ -72,7 +71,6 @@ class MergeInput(Dict):
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
         result["memory_logs_enabled"] = memory_logs_enabled
         result["disable_copy_by_reference"] = disable_copy_by_reference
-        result["hash_bucket_count"] = hash_bucket_count
         return result
 
     @property
@@ -156,7 +154,3 @@ class MergeInput(Dict):
     @property
     def disable_copy_by_reference(self) -> bool:
         return self["disable_copy_by_reference"]
-
-    @property
-    def hash_bucket_count(self) -> int:
-        return self["hash_bucket_count"]
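MergeInput, like other DeltaCat model classes touched in this diff, is a dict subclass whose `of(...)` factory stores arguments as dictionary entries and whose read-only properties read them back, which is why dropping `hash_bucket_count` removes both the `result[...]` assignment and the matching `@property`. A toy sketch of that convention with made-up fields:

```python
from __future__ import annotations
from typing import Optional


class ExampleInput(dict):
    """Toy dict-backed parameter object in the style of MergeInput."""

    @staticmethod
    def of(drop_duplicates: bool = True, memory_logs_enabled: Optional[bool] = None) -> "ExampleInput":
        result = ExampleInput()
        result["drop_duplicates"] = drop_duplicates
        result["memory_logs_enabled"] = memory_logs_enabled
        return result

    @property
    def drop_duplicates(self) -> bool:
        return self["drop_duplicates"]

    @property
    def memory_logs_enabled(self) -> Optional[bool]:
        return self["memory_logs_enabled"]


params = ExampleInput.of(drop_duplicates=False)
print(params.drop_duplicates, dict(params))
```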
deltacat/compute/compactor_v2/private/compaction_utils.py
@@ -63,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
 from deltacat.compute.compactor_v2.steps import hash_bucket as hb
 from deltacat.compute.compactor_v2.utils import io
 
-from typing import List, Optional
+from typing import List, Optional, Union
 from collections import defaultdict
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -83,7 +83,7 @@ def _fetch_compaction_metadata(
 
     # read the results from any previously completed compaction round
     round_completion_info: Optional[RoundCompletionInfo] = None
-    high_watermark: Optional[HighWatermark] = None
+    high_watermark: Optional[Union[HighWatermark, int]] = None
     previous_compacted_delta_manifest: Optional[Manifest] = None
 
     if not params.rebase_source_partition_locator:
@@ -129,7 +129,7 @@ def _build_uniform_deltas(
     mutable_compaction_audit: CompactionSessionAuditInfo,
     input_deltas: List[Delta],
     delta_discovery_start: float,
-) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
+) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:
 
     delete_strategy: Optional[DeleteStrategy] = None
     delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
@@ -222,7 +222,7 @@ def _run_hash_and_merge(
     uniform_deltas: List[DeltaAnnotated],
     round_completion_info: RoundCompletionInfo,
     delete_strategy: Optional[DeleteStrategy],
-    delete_file_envelopes: Optional[DeleteFileEnvelope],
+    delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
     mutable_compaction_audit: CompactionSessionAuditInfo,
     previous_compacted_delta_manifest: Optional[Manifest],
     compacted_partition: Partition,
@@ -389,7 +389,7 @@ def _merge(
     all_hash_group_idx_to_obj_id: dict,
     compacted_partition: Partition,
     delete_strategy: DeleteStrategy,
-    delete_file_envelopes: DeleteFileEnvelope,
+    delete_file_envelopes: List[DeleteFileEnvelope],
 ) -> tuple[List[MergeResult], float]:
     merge_options_provider = functools.partial(
         task_resource_options_provider,
@@ -438,7 +438,6 @@ def _merge(
             delete_file_envelopes=delete_file_envelopes,
             memory_logs_enabled=params.memory_logs_enabled,
             disable_copy_by_reference=params.disable_copy_by_reference,
-            hash_bucket_count=params.hash_bucket_count,
         )
     }
 
deltacat/compute/compactor_v2/steps/merge.py
@@ -7,7 +7,6 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
-from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -32,25 +31,21 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
-    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
     DeltaLocator,
     DeltaType,
-    Manifest,
     Partition,
     interface as unimplemented_deltacat_storage,
 )
+from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
 from deltacat.constants import BYTES_PER_GIBIBYTE
 from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
-    BUCKETING_SPEC_COMPLIANCE_PROFILE,
-    BUCKETING_SPEC_COMPLIANCE_ASSERT,
-    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -62,10 +57,6 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
-_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
-
-
 def _append_delta_type_column(table: pa.Table, value: np.bool_):
     return table.append_column(
         sc._DELTA_TYPE_COLUMN_FIELD,
@@ -116,8 +107,6 @@ def _merge_tables(
     table: pa.Table,
     primary_keys: List[str],
     can_drop_duplicates: bool,
-    hb_index: int,
-    num_buckets: int,
     compacted_table: Optional[pa.Table] = None,
 ) -> pa.Table:
     """
@@ -136,20 +125,6 @@ def _merge_tables(
 
     all_tables.append(table)
 
-    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
-        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
-        BUCKETING_SPEC_COMPLIANCE_ASSERT,
-    ]
-
-    if primary_keys and check_bucketing_spec:
-        _validate_bucketing_spec_compliance(
-            table=all_tables[incremental_idx],
-            num_buckets=num_buckets,
-            primary_keys=primary_keys,
-            hb_index=hb_index,
-            log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
-        )
-
     if not primary_keys or not can_drop_duplicates:
         logger.info(
             f"Not dropping duplicates for primary keys={primary_keys} "
@@ -172,32 +147,10 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
-        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
-        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
-
-        logger.info(
-            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
-            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
-        )
-
-        if (
-            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
-            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
-        ):
-            logger.info("Casting compacted and incremental pk hash to large_string...")
-            # is_in combines the chunks of the chunked array passed which can cause
-            # ArrowCapacityError if the total size of string array is over 2GB.
-            # Using a large_string would resolve this issue.
-            # The cast here should be zero-copy in most cases.
-            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
-            incremental_pk_hash_str = pc.cast(
-                incremental_pk_hash_str, pa.large_string()
-            )
-
         records_to_keep = pc.invert(
             pc.is_in(
-
-
+                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
             )
         )
 
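The simplification above leaves only the core anti-join: compacted rows whose primary-key hash also appears in the incremental table are dropped before the tables are combined. A hedged, self-contained PyArrow sketch of that idea (column names here are placeholders, not the real `sc._PK_HASH_STRING_COLUMN_NAME`):

```python
import pyarrow as pa
import pyarrow.compute as pc

compacted = pa.table({"pk_hash": ["a", "b", "c"], "val": [1, 2, 3]})
incremental = pa.table({"pk_hash": ["b", "d"], "val": [20, 40]})

# keep compacted rows whose pk hash is NOT overwritten by the incremental table
records_to_keep = pc.invert(pc.is_in(compacted["pk_hash"], incremental["pk_hash"]))
survivors = compacted.filter(records_to_keep)

# final result: surviving old rows plus the new rows
merged = pa.concat_tables([survivors, incremental])
print(merged.to_pydict())
# {'pk_hash': ['a', 'c', 'b', 'd'], 'val': [1, 3, 20, 40]}
```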
@@ -212,47 +165,9 @@ def _merge_tables(
     return final_table
 
 
-def _validate_bucketing_spec_compliance(
-    table: pa.Table,
-    num_buckets: int,
-    hb_index: int,
-    primary_keys: List[str],
-    rcf: RoundCompletionInfo = None,
-    log_prefix=None,
-) -> None:
-    if rcf is not None:
-        message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
-    else:
-        message_prefix = f"{log_prefix}"
-    pki_table = generate_pk_hash_column(
-        [table], primary_keys=primary_keys, requires_hash=True
-    )[0]
-    is_not_compliant: bool = False
-    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
-        hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
-        if hash_bucket != hb_index:
-            is_not_compliant = True
-            logger.info(
-                f"{message_prefix} has non-compliant bucketing spec at index: {index} "
-                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
-            )
-            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
-                raise AssertionError(
-                    f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
-                    f" to be {hb_index} but found {hash_bucket}"
-                )
-            # No further checks necessary
-            break
-    if not is_not_compliant:
-        logger.debug(
-            f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
-        )
-
-
 def _download_compacted_table(
     hb_index: int,
     rcf: RoundCompletionInfo,
-    primary_keys: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[dict] = None,
@@ -276,28 +191,7 @@ def _download_compacted_table(
 
         tables.append(table)
 
-
-    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
-        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
-        BUCKETING_SPEC_COMPLIANCE_ASSERT,
-    ]
-
-    logger.debug(
-        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
-        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
-    )
-
-    # Bucketing spec compliance isn't required without primary keys
-    if primary_keys and check_bucketing_spec:
-        _validate_bucketing_spec_compliance(
-            compacted_table,
-            rcf.hash_bucket_count,
-            hb_index,
-            primary_keys,
-            rcf=rcf,
-            log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
-        )
-    return compacted_table
+    return pa.concat_tables(tables)
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
@@ -500,12 +394,12 @@ def _compact_tables(
         _group_sequence_by_delta_type(reordered_all_dfes)
     ):
         if delta_type is DeltaType.UPSERT:
-            (
-
-
-
-
-            )
+            (
+                table,
+                incremental_len,
+                deduped_records,
+                merge_time,
+            ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
             logger.info(
                 f" [Merge task index {input.merge_task_index}] Merged"
                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -556,7 +450,9 @@ def _apply_upserts(
     # on non event based sort key does not produce consistent
    # compaction results. E.g., compaction(delta1, delta2, delta3)
     # will not be equal to compaction(compaction(delta1, delta2), delta3).
-    table = table.sort_by(
+    table = table.sort_by(
+        [pa_key for key in input.sort_keys for pa_key in key.arrow]
+    )
     hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
     table, merge_time = timed_invocation(
         func=_merge_tables,
@@ -564,8 +460,6 @@ def _apply_upserts(
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
         compacted_table=prev_table,
-        hb_index=hb_idx,
-        num_buckets=input.hash_bucket_count,
     )
     deduped_records = hb_table_record_count - len(table)
     return table, incremental_len, deduped_records, merge_time
@@ -600,11 +494,9 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with (
-
-
-        else nullcontext()
-    ):
+    with memray.Tracker(
+        f"merge_{worker_id}_{task_id}.bin"
+    ) if input.enable_profiler else nullcontext():
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
@@ -628,7 +520,6 @@ def _timed_merge(input: MergeInput) -> MergeResult:
             compacted_table = _download_compacted_table(
                 hb_index=merge_file_group.hb_index,
                 rcf=input.round_completion_info,
-                primary_keys=input.primary_keys,
                 read_kwargs_provider=input.read_kwargs_provider,
                 deltacat_storage=input.deltacat_storage,
                 deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -713,5 +604,5 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[3],
         merge_result[4],
         np.double(emit_metrics_time),
-        merge_result[
+        merge_result[4],
     )
deltacat/compute/compactor_v2/utils/content_type_params.py
@@ -5,7 +5,6 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
-from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
@@ -76,21 +75,11 @@ def _download_parquet_metadata_for_manifest_entry(
     entry_index: int,
     deltacat_storage: unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
-    logger.info(
-        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
-    )
-    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
-        logger.info(
-            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
-        )
-        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
         table_type=TableType.PYARROW_PARQUET,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
         **deltacat_storage_kwargs,
     )
 
@@ -108,15 +97,11 @@ def append_content_type_params(
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
-    logger.info(
-        f"Appending the content type params for Delta with locator {delta.locator}..."
-    )
 
     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -174,7 +159,6 @@ append_content_type_params(
 
     def input_provider(index, item) -> Dict:
         return {
-            "file_reader_kwargs_provider": file_reader_kwargs_provider,
             "deltacat_storage_kwargs": deltacat_storage_kwargs,
             "deltacat_storage": deltacat_storage,
             "delta": delta,
@@ -184,7 +168,6 @@ append_content_type_params(
     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
-
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,
deltacat/compute/compactor_v2/utils/dedupe.py
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths)
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
     return result
 
 
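The missing parenthesis fixed above sits in `_create_chunked_index_array`, which builds a local row index per chunk and shifts each chunk by the cumulative length of the preceding chunks. A small sketch of that offsetting idea with hard-coded chunk lengths:

```python
import numpy as np
import pyarrow as pa

chunk_lengths = [3, 2, 4]  # row counts of an imaginary chunked column

# per-chunk local indices: [0,1,2], [0,1], [0,1,2,3]
local = np.empty(len(chunk_lengths), dtype="object")
for i, cl in enumerate(chunk_lengths):
    local[i] = np.arange(cl, dtype="int32")

# offset each chunk by the total length of the chunks before it
offsets = np.cumsum(([0] + chunk_lengths)[:-1])
global_index = pa.chunked_array(list(local + offsets))
print(global_index.to_pylist())
# [0, 1, 2, 3, 4, 5, 6, 7, 8]
```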
deltacat/compute/compactor_v2/utils/io.py
@@ -101,6 +101,7 @@ def create_uniform_input_deltas(
     delta_manifest_entries_count = 0
     estimated_da_bytes = 0
     input_da_list = []
+
     for delta in input_deltas:
         if (
             compact_partition_params.enable_input_split
@@ -117,7 +118,6 @@ create_uniform_input_deltas(
             deltacat_storage_kwargs=deltacat_storage_kwargs,
             task_max_parallelism=compact_partition_params.task_max_parallelism,
             max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
-            file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
         )
 
     manifest_entries = delta.manifest.entries
deltacat/compute/compactor_v2/utils/primary_key_index.py
@@ -10,7 +10,6 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
-    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -49,13 +48,6 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
-    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
-        logger.info(
-            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
-            f"Returning False for is_sha1_desired"
-        )
-        return False
-
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -116,10 +108,9 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-
-
-
-        ):
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
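The rewritten loop above accumulates each record batch before testing the size threshold, so no batch is skipped between flushes. A generic accumulate-then-flush sketch using a hypothetical 1 KiB threshold in place of MAX_SIZE_OF_RECORD_BATCH_IN_GIB:

```python
import pyarrow as pa

MAX_BYTES = 1024  # hypothetical flush threshold (the real constant is 2 GiB)
table = pa.table({"x": list(range(2000))})

flushed, current, current_bytes = [], [], 0
for record_batch in table.to_batches(max_chunksize=256):
    # accumulate first, then decide whether to flush
    current_bytes += record_batch.nbytes
    current.append(record_batch)
    if current_bytes >= MAX_BYTES:
        flushed.append(pa.Table.from_batches(current))
        current, current_bytes = [], 0

if current:  # flush the remainder
    flushed.append(pa.Table.from_batches(current))

print(len(flushed), sum(len(t) for t in flushed))
```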
@@ -137,9 +128,6 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
137
128
|
current_bytes = 0
|
138
129
|
record_batches.clear()
|
139
130
|
|
140
|
-
current_bytes += record_batch.nbytes
|
141
|
-
record_batches.append(record_batch)
|
142
|
-
|
143
131
|
if record_batches:
|
144
132
|
appended_len, append_latency = timed_invocation(
|
145
133
|
_append_table_by_hash_bucket,
|