deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,266 @@
|
|
1
|
+
import ray
|
2
|
+
from deltacat.types.media import ContentType
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
import tempfile
|
7
|
+
from deltacat.storage import metastore
|
8
|
+
from deltacat.tests.test_utils.pyarrow import (
|
9
|
+
stage_partition_from_file_paths,
|
10
|
+
commit_delta_to_staged_partition,
|
11
|
+
create_table_from_csv_file_paths,
|
12
|
+
)
|
13
|
+
from deltacat.storage.model.schema import Schema
|
14
|
+
from deltacat.utils.pyarrow import (
|
15
|
+
ReadKwargsProviderPyArrowCsvPureUtf8,
|
16
|
+
ReadKwargsProviderPyArrowSchemaOverride,
|
17
|
+
)
|
18
|
+
|
19
|
+
|
20
|
+
class TestContentTypeParamsMain:
    """Tests for ``_download_parquet_metadata_for_manifest_entry`` on main storage.

    Every test commits a CSV fixture as a delta into a catalog rooted in a
    temporary directory, invokes the Ray remote task for manifest entry 0 of
    that delta, and checks the dictionary the task returns.
    """

    TEST_NAMESPACE = "test_content_type_params_main"
    TEST_ENTRY_INDEX = 0
    DEDUPE_BASE_COMPACTED_TABLE_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_base_compacted_table_string_pk.csv"
    DEDUPE_NO_DUPLICATION_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_table_no_duplication_string_pk.csv"

    @pytest.fixture(scope="module", autouse=True)
    def setup_ray_cluster(self):
        """Initialize Ray in local mode for the module and shut it down afterwards."""
        ray.init(local_mode=True, ignore_reinit_error=True)
        yield
        ray.shutdown()

    @pytest.fixture(scope="function")
    def main_deltacat_storage_kwargs(self):
        """Yield storage kwargs whose catalog lives in a fresh temporary directory."""
        scratch_root = tempfile.mkdtemp()
        from deltacat.catalog import CatalogProperties

        yield {"catalog": CatalogProperties(root=scratch_root)}

        # Remove the scratch directory once the test has finished with it;
        # errors during removal are ignored.
        import shutil

        shutil.rmtree(scratch_root, ignore_errors=True)

    def _commit_csv_delta(self, csv_path, storage_kwargs):
        """Stage a partition from ``csv_path`` and commit its contents as a delta.

        The partition schema is derived from the table loaded from the CSV
        file. Returns whatever ``commit_delta_to_staged_partition`` returns.
        """
        source_table = create_table_from_csv_file_paths([csv_path])
        staged_partition = stage_partition_from_file_paths(
            self.TEST_NAMESPACE,
            [csv_path],
            Schema.of(source_table.schema),
            **storage_kwargs,
        )
        return commit_delta_to_staged_partition(
            staged_partition,
            source_table,
            **storage_kwargs,
        )

    @staticmethod
    def _assert_result_envelope(result, expected_entry_index):
        """Check that ``result`` is a dict carrying the expected keys and entry index."""
        assert isinstance(result, dict)
        assert "entry_index" in result
        assert "partial_parquet_params" in result
        assert result["entry_index"] == expected_entry_index

    def test__download_parquet_metadata_for_manifest_entry_sanity(
        self, main_deltacat_storage_kwargs
    ):
        """Without a read-kwargs provider the task reports the base fixture's metadata."""
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )
        from deltacat.types.partial_download import PartialParquetParameters

        delta = self._commit_csv_delta(
            self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        pending = _download_parquet_metadata_for_manifest_entry.remote(
            delta,
            entry_index,
            ["pk", "value"],
            metastore,
            main_deltacat_storage_kwargs,
        )
        result = ray.get(pending)
        params = result["partial_parquet_params"]

        # validate the returned envelope
        self._assert_result_envelope(result, entry_index)
        assert isinstance(params, PartialParquetParameters)

        # validate the partial parquet parameters
        assert params.row_groups_to_download == [0]
        assert params.num_row_groups == 1
        assert params.num_rows == 8
        assert isinstance(params.in_memory_size_bytes, float)
        assert params.in_memory_size_bytes > 0

        # validate the embedded parquet file metadata
        file_metadata = params.pq_metadata
        assert file_metadata.num_columns == 2
        assert file_metadata.num_rows == 8
        assert file_metadata.num_row_groups == 1
        assert file_metadata.format_version == "2.6"

        committed_entry = delta.manifest.entries[self.TEST_ENTRY_INDEX]
        assert committed_entry.meta.content_type == ContentType.PARQUET.value

    @pytest.mark.parametrize(
        "read_kwargs_provider,expected_values",
        [
            (
                ReadKwargsProviderPyArrowCsvPureUtf8(),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": [pa.string(), pa.string()],
                },
            ),
            (
                ReadKwargsProviderPyArrowSchemaOverride(
                    schema=pa.schema(
                        [
                            ("id", pa.string()),
                            ("value", pa.int64()),
                        ]
                    )
                ),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": [pa.string(), pa.int64()],
                },
            ),
            (
                ReadKwargsProviderPyArrowSchemaOverride(
                    schema=None,
                    pq_coerce_int96_timestamp_unit="ms",
                    parquet_reader_type="daft",
                ),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": None,  # Will use default type inference
                },
            ),
        ],
    )
    def test__download_parquet_metadata_for_manifest_entry_with_read_kwargs_provider(
        self, read_kwargs_provider, expected_values, main_deltacat_storage_kwargs
    ):
        """Passing a read-kwargs provider still yields the expected metadata values.

        Note: the ``column_types`` entry of ``expected_values`` is not asserted
        by this test.
        """
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )

        delta = self._commit_csv_delta(
            self.DEDUPE_NO_DUPLICATION_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        pending = _download_parquet_metadata_for_manifest_entry.remote(
            delta,
            entry_index,
            ["pk", "value"],
            metastore,
            main_deltacat_storage_kwargs,
            read_kwargs_provider,
        )
        result = ray.get(pending)
        params = result["partial_parquet_params"]

        # validate the returned envelope
        self._assert_result_envelope(result, self.TEST_ENTRY_INDEX)

        # validate the partial parquet parameters
        assert params.row_groups_to_download == [0]
        assert params.num_row_groups == expected_values["num_row_groups"]
        assert params.num_rows == expected_values["num_rows"]
        assert isinstance(params.in_memory_size_bytes, float)
        assert params.in_memory_size_bytes > 0

        # validate the embedded parquet file metadata
        file_metadata = params.pq_metadata
        assert file_metadata.num_columns == expected_values["num_columns"]
        assert file_metadata.num_rows == expected_values["num_rows"]
        assert file_metadata.num_row_groups == expected_values["num_row_groups"]
        assert file_metadata.format_version == expected_values["format_version"]

        committed_entry = delta.manifest.entries[self.TEST_ENTRY_INDEX]
        assert committed_entry.meta.content_type == ContentType.PARQUET.value

    def test_download_parquet_metadata_for_manifest_entry_file_reader_kwargs_present_top_level_and_deltacat_storage_kwarg(
        self, main_deltacat_storage_kwargs, caplog
    ):
        """A provider supplied both as an argument and inside the storage kwargs still works."""
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )

        top_level_provider = ReadKwargsProviderPyArrowCsvPureUtf8()

        # Put a second provider into the storage kwargs so that both places
        # carry a "file_reader_kwargs_provider".
        main_deltacat_storage_kwargs[
            "file_reader_kwargs_provider"
        ] = ReadKwargsProviderPyArrowCsvPureUtf8()

        delta = self._commit_csv_delta(
            self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        pending = _download_parquet_metadata_for_manifest_entry.remote(
            delta,
            entry_index,
            ["pk", "value"],
            metastore,
            main_deltacat_storage_kwargs,
            top_level_provider,
        )
        result = ray.get(pending)

        # validate the returned envelope
        self._assert_result_envelope(result, entry_index)

        # A warning about the duplicate file_reader_kwargs_provider may or may
        # not be captured here because the task runs through Ray remote
        # execution, so the log check only applies when records were captured.
        # Successful metadata retrieval above is the main thing being verified.
        print(f"Captured {len(caplog.records)} log records")
        if caplog.records:
            assert any(
                "file_reader_kwargs_provider" in record.message
                for record in caplog.records
            )
|
@@ -0,0 +1,45 @@
|
|
1
|
+
import pyarrow as pa
|
2
|
+
from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
3
|
+
group_by_pk_hash_bucket,
|
4
|
+
)
|
5
|
+
|
6
|
+
|
7
|
+
class TestGroupByPkHashBucket:
    """Tests for ``group_by_pk_hash_bucket`` using three hash buckets keyed on ``pk``."""

    @staticmethod
    def _two_batch_pk_table(repetitions):
        """Build a table of two identical single-row batches.

        The only column, ``pk``, holds the string "12bytestring" repeated
        ``repetitions`` times.
        """
        values = pa.array(["12bytestring" * repetitions])
        batch = pa.RecordBatch.from_arrays([values], names=["pk"])
        return pa.Table.from_batches([batch, batch])

    def test_sanity(self):
        """Every input row ends up in exactly one of the returned groups."""
        record = pa.array(list(range(10)))
        pk = pa.array(list("abcdefghij"))
        batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
        table = pa.Table.from_batches([batch])

        groups = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(groups) == 3
        # Groups may be None; count rows via column 1 of each group present.
        total_records = sum(len(group[1]) for group in groups if group is not None)

        assert total_records == len(table)

    def test_when_record_batches_exceed_int_max_size(self):
        """Batches stay separate when combining them would exceed 2GB."""
        table = self._two_batch_pk_table(90_000_000)

        groups = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(groups) == 3
        # two record batches are preserved as combining them
        # would exceed 2GB.
        assert len(groups[2].to_batches()) == 2

    def test_when_record_batches_less_than_int_max_size(self):
        """Batches are merged into one when the combined size stays under 2GB."""
        table = self._two_batch_pk_table(90_000)

        groups = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(groups) == 3
        # Combined the arrays into one record batch as the size
        # would not exceed 2GB.
        assert len(groups[1].to_batches()) == 1
|
@@ -1,6 +1,36 @@
|
|
1
1
|
import unittest
|
2
2
|
import ray
|
3
|
-
from deltacat.compute.compactor_v2.utils.task_options import
|
3
|
+
from deltacat.compute.compactor_v2.utils.task_options import (
|
4
|
+
_get_task_options,
|
5
|
+
_get_merge_task_options,
|
6
|
+
logger,
|
7
|
+
)
|
8
|
+
from deltacat.compute.resource_estimation.model import (
|
9
|
+
EstimateResourcesParams,
|
10
|
+
ResourceEstimationMethod,
|
11
|
+
)
|
12
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
13
|
+
from deltacat.compute.compactor import (
|
14
|
+
PyArrowWriteResult,
|
15
|
+
RoundCompletionInfo,
|
16
|
+
)
|
17
|
+
from deltacat.types.media import (
|
18
|
+
ContentType,
|
19
|
+
ContentEncoding,
|
20
|
+
)
|
21
|
+
from deltacat.storage import (
|
22
|
+
DeltaLocator,
|
23
|
+
Manifest,
|
24
|
+
ManifestMeta,
|
25
|
+
ManifestEntry,
|
26
|
+
ManifestEntryList,
|
27
|
+
)
|
28
|
+
from unittest.mock import MagicMock
|
29
|
+
from typing import Optional
|
30
|
+
|
31
|
+
from deltacat.compute.compactor_v2.constants import (
|
32
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
33
|
+
)
|
4
34
|
|
5
35
|
|
6
36
|
@ray.remote
|
@@ -14,11 +44,93 @@ def throwing_func():
|
|
14
44
|
|
15
45
|
|
16
46
|
class TestTaskOptions(unittest.TestCase):
|
47
|
+
# Shared sample values for the task-option tests.
# NOTE(review): none of the test methods visible in this chunk reference
# these attributes (they define their own locals) -- confirm they are used.
TEST_INDEX = 0
TEST_HB_GROUP_IDX = 0
TEST_STREAM_POSITION = 1_000_000
TEST_NUM_HASH_GROUPS = 1
|
51
|
+
|
17
52
|
@classmethod
def setUpClass(cls):
    """Initialize Ray in local mode before the tests of this class run.

    ``ignore_reinit_error=True`` is passed so that the call does not raise
    when Ray has already been initialized earlier in the process.
    """
    ray_init_kwargs = {"local_mode": True, "ignore_reinit_error": True}
    ray.init(**ray_init_kwargs)
    super().setUpClass()
|
21
56
|
|
57
|
+
@classmethod
def tearDownClass(cls) -> None:
    """Shut Ray down once the tests of this class have finished."""
    ray.shutdown()
|
60
|
+
|
61
|
+
def _make_estimate_resource_params(
    self,
    resource_estimation_method: Optional[
        ResourceEstimationMethod
    ] = ResourceEstimationMethod.DEFAULT,
    previous_inflation: Optional[int] = 7,
    average_record_size_bytes: Optional[int] = 1000,
):
    """Build the ``EstimateResourcesParams`` passed to the merge option tests.

    This helper carries no ``@classmethod`` decorator and is invoked through
    ``self``, so its receiver is named ``self`` like the other helpers in
    this class.

    Args:
        resource_estimation_method: Estimation method forwarded unchanged.
        previous_inflation: Inflation value forwarded unchanged.
        average_record_size_bytes: Average record size forwarded unchanged.

    Returns:
        The object produced by ``EstimateResourcesParams.of``.
    """
    return EstimateResourcesParams.of(
        resource_estimation_method=resource_estimation_method,
        previous_inflation=previous_inflation,
        average_record_size_bytes=average_record_size_bytes,
    )
|
74
|
+
|
75
|
+
def _make_manifest(
    self,
    source_content_length: Optional[int] = 1000,
    content_type: Optional[ContentType] = ContentType.PARQUET,
    content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
    uri: Optional[str] = "test",
    url: Optional[str] = "test",
    author: Optional[str] = "foo",
    entry_uuid: Optional[str] = "foo",
    manifest_uuid: Optional[str] = "bar",
) -> Manifest:
    """Create a ``Manifest`` that contains exactly one mandatory entry.

    Args:
        source_content_length: Forwarded to ``ManifestMeta.of``.
        content_type: Forwarded to ``ManifestMeta.of``.
        content_encoding: Forwarded to ``ManifestMeta.of``.
        uri: URI of the single manifest entry.
        url: URL of the single manifest entry.
        author: Author recorded on the manifest.
        entry_uuid: UUID of the single manifest entry.
        manifest_uuid: UUID of the manifest itself.

    Returns:
        The manifest built from the single entry.
    """
    # Both leading positional arguments are fixed at 10.
    # NOTE(review): presumably record count and content length -- confirm
    # against ManifestMeta.of.
    entry_meta = ManifestMeta.of(
        10,
        10,
        content_type=content_type,
        content_encoding=content_encoding,
        source_content_length=source_content_length,
    )
    single_entry = ManifestEntry.of(
        uri=uri,
        url=url,
        meta=entry_meta,
        mandatory=True,
        uuid=entry_uuid,
    )
    entry_list = ManifestEntryList.of([single_entry])
    return Manifest.of(entries=entry_list, author=author, uuid=manifest_uuid)
|
105
|
+
|
106
|
+
def make_round_completion_info(
    self,
    high_watermark: Optional[int] = 1_000_000,
    compacted_delta_locator: Optional[DeltaLocator] = None,
    records_written: Optional[int] = 10,
    bytes_written: Optional[int] = 10,
    files_written: Optional[int] = 10,
    rows_dropped: Optional[int] = 10,
    sort_keys_bit_width: Optional[int] = 0,
    hash_bucket_count: Optional[int] = 1,
    hb_index_to_entry_range: Optional[dict] = None,
) -> RoundCompletionInfo:
    """Create a ``RoundCompletionInfo`` populated with small sample values.

    Args:
        high_watermark: Forwarded to ``RoundCompletionInfo.of``.
        compacted_delta_locator: Locator to use; when ``None`` a
            ``MagicMock`` specced on ``DeltaLocator`` is substituted.
        records_written: First argument of the ``PyArrowWriteResult``.
        bytes_written: Second argument of the ``PyArrowWriteResult``.
        files_written: Third argument of the ``PyArrowWriteResult``.
        rows_dropped: Fourth argument of the ``PyArrowWriteResult``.
        sort_keys_bit_width: Forwarded to ``RoundCompletionInfo.of``.
        hash_bucket_count: Forwarded to ``RoundCompletionInfo.of``.
        hb_index_to_entry_range: Mapping to use; any falsy value (``None``
            or an empty dict) is replaced by ``{"0": (0, 1)}``.

    Returns:
        The round completion info built from the values above.
    """
    if not hb_index_to_entry_range:
        hb_index_to_entry_range = {"0": (0, 1)}
    if compacted_delta_locator is None:
        compacted_delta_locator = MagicMock(spec=DeltaLocator)

    write_result = PyArrowWriteResult.of(
        records_written, bytes_written, files_written, rows_dropped
    )
    return RoundCompletionInfo.of(
        compacted_delta_locator=compacted_delta_locator,
        high_watermark=high_watermark,
        compacted_pyarrow_write_result=write_result,
        sort_keys_bit_width=sort_keys_bit_width,
        hb_index_to_entry_range=hb_index_to_entry_range,
        hash_bucket_count=hash_bucket_count,
    )
|
133
|
+
|
22
134
|
def test_get_task_options_sanity(self):
|
23
135
|
opts = _get_task_options(0.01, 0.01)
|
24
136
|
result_ref = valid_func.options(**opts).remote()
|
@@ -31,3 +143,160 @@ class TestTaskOptions(unittest.TestCase):
|
|
31
143
|
result_ref = throwing_func.options(**opts).remote()
|
32
144
|
|
33
145
|
self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
|
146
|
+
|
147
|
+
def test_get_merge_task_options_memory_logs_enabled_sanity(self):
    """Check the DEBUG logs of ``_get_merge_task_options`` with default inputs.

    The round completion info comes from ``make_round_completion_info()``
    with its defaults (10 records and 10 bytes written). The second log
    record is expected to report a previous inflation of 1.0 and an average
    record size of 1.0.
    """
    test_index = 0
    expected_previous_inflation = 1.0
    expected_average_record_size = 1.0
    # NOTE(review): only the keys of this dict are used below; its values
    # are never compared with the options actually returned.
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=self._make_estimate_resource_params(),
        round_completion_info=self.make_round_completion_info(),
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    with self.assertLogs(logger=logger.name, level="DEBUG") as captured:
        # assertLogs fails unless at least one DEBUG record is emitted.
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # Raises KeyError when an expected key is missing from the result.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        round_info_message = captured.records[0].getMessage()
        memory_params_message = captured.records[1].getMessage()
        expected_fragments = (
            (
                f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
                round_info_message,
            ),
            (
                f"[Merge task {test_index}]: Params used for calculating merge memory",
                memory_params_message,
            ),
            (
                f"'previous_inflation': {expected_previous_inflation}",
                memory_params_message,
            ),
            (
                f"'average_record_size': {expected_average_record_size}",
                memory_params_message,
            ),
        )
        for fragment, message in expected_fragments:
            self.assertIn(fragment, message)
|
200
|
+
|
201
|
+
def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
    self,
):
    """Check the logged fallback values when the previous round wrote nothing.

    The round completion info is built with zero bytes, records, files and
    dropped rows. The second log record is then expected to report
    ``PYARROW_INFLATION_MULTIPLIER`` as previous inflation and
    ``DEFAULT_AVERAGE_RECORD_SIZE_BYTES`` as average record size.
    """
    test_index = 0
    expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
    expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
    # NOTE(review): only the keys of this dict are used below; its values
    # are never compared with the options actually returned.
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=self._make_estimate_resource_params(),
        round_completion_info=self.make_round_completion_info(
            bytes_written=0, records_written=0, files_written=0, rows_dropped=0
        ),
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    with self.assertLogs(logger=logger.name, level="DEBUG") as captured:
        # assertLogs fails unless at least one DEBUG record is emitted.
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # Raises KeyError when an expected key is missing from the result.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        round_info_message = captured.records[0].getMessage()
        memory_params_message = captured.records[1].getMessage()
        expected_fragments = (
            (
                f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
                round_info_message,
            ),
            (
                f"[Merge task {test_index}]: Params used for calculating merge memory",
                memory_params_message,
            ),
            (
                f"'previous_inflation': {expected_previous_inflation}",
                memory_params_message,
            ),
            (
                f"'average_record_size': {expected_average_record_size}",
                memory_params_message,
            ),
        )
        for fragment, message in expected_fragments:
            self.assertIn(fragment, message)
|
258
|
+
|
259
|
+
def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
    self,
):
    """Check the DEBUG log when no round completion info is supplied.

    ``round_completion_info`` is ``None`` here. The first log record is
    expected to be the memory-params message, and it must not mention
    ``'average_record_size'``.
    """
    test_index = 0
    # NOTE(review): only the keys of this dict are used below; its values
    # are never compared with the options actually returned.
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=self._make_estimate_resource_params(),
        round_completion_info=None,
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    with self.assertLogs(logger=logger.name, level="DEBUG") as captured:
        # assertLogs fails unless at least one DEBUG record is emitted.
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # Raises KeyError when an expected key is missing from the result.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        memory_params_message = captured.records[0].getMessage()
        self.assertIn(
            f"[Merge task {test_index}]: Params used for calculating merge memory",
            memory_params_message,
        )
        self.assertNotIn("'average_record_size'", memory_params_message)
|
@@ -1,9 +1,8 @@
|
|
1
|
-
import os
|
2
1
|
import tempfile
|
3
2
|
import shutil
|
4
|
-
from typing import Dict
|
5
3
|
|
6
4
|
import pytest
|
5
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
7
6
|
|
8
7
|
|
9
8
|
@pytest.fixture
|
@@ -25,51 +24,16 @@ def temp_dir():
|
|
25
24
|
|
26
25
|
|
27
26
|
@pytest.fixture(scope="function")
|
28
|
-
def
|
27
|
+
def main_deltacat_storage_kwargs(temp_dir):
|
29
28
|
"""
|
30
|
-
Fixture that creates a
|
31
|
-
and
|
29
|
+
Fixture that creates a CatalogProperties object for each test function
|
30
|
+
using the main metastore implementation and cleans up afterwards.
|
32
31
|
|
33
32
|
Returns:
|
34
|
-
dict: A dictionary with
|
33
|
+
dict: A dictionary with 'inner' key pointing to CatalogProperties
|
35
34
|
"""
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
# Return kwargs dictionary ready to use
|
40
|
-
kwargs = {"db_file_path": db_file_path}
|
35
|
+
catalog = CatalogProperties(root=temp_dir)
|
36
|
+
kwargs = {"inner": catalog}
|
41
37
|
yield kwargs
|
42
38
|
|
43
|
-
# Cleanup
|
44
|
-
if os.path.exists(db_file_path):
|
45
|
-
os.remove(db_file_path)
|
46
|
-
|
47
|
-
|
48
|
-
def create_local_deltacat_storage_file() -> Dict[str, str]:
|
49
|
-
"""
|
50
|
-
Helper function to create a local deltacat storage file
|
51
|
-
|
52
|
-
Essentially uses the same approach as local_deltacat_storage_kwargs, but more flexible
|
53
|
-
if the consumer does not want to use a function scoped fixture
|
54
|
-
|
55
|
-
Returns: kwargs to use for local deltacat storage, i.e. {"db_file_path": $db_file}
|
56
|
-
"""
|
57
|
-
temp_dir = tempfile.mkdtemp()
|
58
|
-
db_file_path = os.path.join(temp_dir, "db_test.sqlite")
|
59
|
-
return {"db_file_path": db_file_path}
|
60
|
-
|
61
|
-
|
62
|
-
def clean_up_local_deltacat_storage_file(local_storage_kwargs: Dict[str, str]):
|
63
|
-
"""
|
64
|
-
Cleans up local file and directory created by create_local_deltacat_storage_file
|
65
|
-
"""
|
66
|
-
db_file = local_storage_kwargs["db_file_path"]
|
67
|
-
dir_path = os.path.dirname(db_file)
|
68
|
-
|
69
|
-
# Remove the database file if it exists
|
70
|
-
if os.path.exists(db_file):
|
71
|
-
os.remove(db_file)
|
72
|
-
|
73
|
-
# Remove the temporary directory if it exists
|
74
|
-
if os.path.exists(dir_path):
|
75
|
-
shutil.rmtree(dir_path)
|
39
|
+
# Cleanup happens automatically via temp_dir fixture
|