deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,236 @@
|
|
1
|
+
import os
|
2
|
+
import time
|
3
|
+
import posixpath
|
4
|
+
from pyarrow.fs import FileSelector
|
5
|
+
|
6
|
+
from deltacat.storage import (
|
7
|
+
Transaction,
|
8
|
+
TransactionOperation,
|
9
|
+
TransactionOperationType,
|
10
|
+
)
|
11
|
+
|
12
|
+
from deltacat.constants import (
|
13
|
+
TXN_DIR_NAME,
|
14
|
+
RUNNING_TXN_DIR_NAME,
|
15
|
+
FAILED_TXN_DIR_NAME,
|
16
|
+
TXN_PART_SEPARATOR,
|
17
|
+
SUCCESS_TXN_DIR_NAME,
|
18
|
+
)
|
19
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
20
|
+
from deltacat.tests.test_utils.storage import (
|
21
|
+
create_test_namespace,
|
22
|
+
create_test_table,
|
23
|
+
)
|
24
|
+
from deltacat.compute.janitor import (
|
25
|
+
janitor_delete_timed_out_transaction,
|
26
|
+
janitor_remove_files_in_failed,
|
27
|
+
)
|
28
|
+
|
29
|
+
|
30
|
+
class TestJanitorJob:
    """Tests for the transaction janitor functions in ``deltacat.compute.janitor``.

    Every test receives a ``temp_dir`` argument (a pytest fixture defined
    outside this file; presumably a path to a fresh temporary directory that
    is used as the catalog root -- confirm against the fixture).
    """

    @staticmethod
    def _make_txn_dirs(catalog_root):
        """Create the running and failed transaction directories.

        Returns:
            A ``(running_txn_dir, failed_txn_dir)`` tuple of paths located
            under ``<catalog_root>/<TXN_DIR_NAME>``.
        """
        running_txn_dir = posixpath.join(
            catalog_root, TXN_DIR_NAME, RUNNING_TXN_DIR_NAME
        )
        failed_txn_dir = posixpath.join(catalog_root, TXN_DIR_NAME, FAILED_TXN_DIR_NAME)
        os.makedirs(running_txn_dir, exist_ok=True)
        os.makedirs(failed_txn_dir, exist_ok=True)
        return running_txn_dir, failed_txn_dir

    @staticmethod
    def _write_timed_out_txn(catalog_root, running_txn_dir, txn_id, start_time):
        """Write a mock running transaction log plus a metafile named after it.

        The log file name is ``<start_time><sep><txn_id><sep><now>``. Callers
        pass a ``start_time`` in the past so that the janitor is expected to
        treat the transaction as timed out.

        Returns:
            A ``(txn_filename, txn_path, metafile_path)`` tuple.
        """
        txn_filename = (
            f"{start_time}{TXN_PART_SEPARATOR}{txn_id}"
            f"{TXN_PART_SEPARATOR}{time.time_ns()}"
        )
        txn_path = posixpath.join(running_txn_dir, txn_filename)
        with open(txn_path, "w") as f:
            f.write("mock transaction content")

        # The metafile name embeds the transaction id so that the janitor's
        # search for files belonging to the transaction can find it.
        metafile_path = posixpath.join(catalog_root, f"test_metafile_{txn_id}.json")
        with open(metafile_path, "w") as f:
            f.write("mock metafile content")

        return txn_filename, txn_path, metafile_path

    def test_janitor_delete_timed_out_transaction(self, temp_dir):
        """A timed-out running transaction is moved to failed and its metafile removed."""
        catalog_root, _ = resolve_path_and_filesystem(temp_dir)
        running_txn_dir, failed_txn_dir = self._make_txn_dirs(catalog_root)

        # Use a first filename token one second in the past so the
        # transaction qualifies as timed out.
        start_time = time.time_ns() - 1_000_000_000
        txn_filename, txn_path, test_metafile_path = self._write_timed_out_txn(
            catalog_root, running_txn_dir, "test_transaction_id", start_time
        )

        # Sanity-check the setup before exercising the janitor.
        assert os.path.exists(test_metafile_path), "Test metafile was not created."
        assert os.path.exists(
            txn_path
        ), "Transaction file was not created in running directory."

        # The janitor is expected to:
        # 1. Move the running txn file to the failed directory (same file name).
        # 2. Delete the metafiles that belong to the timed-out transaction.
        janitor_delete_timed_out_transaction(temp_dir)

        new_failed_txn_path = posixpath.join(failed_txn_dir, txn_filename)

        # Verify that the transaction file now exists in the failed directory.
        assert os.path.exists(
            new_failed_txn_path
        ), f"Expected {new_failed_txn_path} to exist."
        # Verify that the transaction file is no longer in the running directory.
        assert not os.path.exists(
            txn_path
        ), "Transaction file still exists in running directory."
        # Verify that the metafile was deleted.
        assert not os.path.exists(test_metafile_path), "Test metafile was not deleted."

    def test_janitor_handles_empty_directories(self, temp_dir):
        """Both janitor functions run cleanly when there is nothing to clean up."""
        catalog_root, filesystem = resolve_path_and_filesystem(temp_dir)
        running_txn_dir, failed_txn_dir = self._make_txn_dirs(catalog_root)

        # Ensure directories are empty before running the janitor.
        assert not os.listdir(
            running_txn_dir
        ), "Running transaction directory is not empty."
        assert not os.listdir(
            failed_txn_dir
        ), "Failed transaction directory is not empty."

        # Janitor functions should not fail on empty directories. Any exception
        # propagates so that pytest reports it with its full traceback.
        janitor_delete_timed_out_transaction(temp_dir)
        janitor_remove_files_in_failed(temp_dir, filesystem)

        # Verify that directories are still empty after running janitor functions.
        assert not os.listdir(
            running_txn_dir
        ), "Running transaction directory should still be empty."
        assert not os.listdir(
            failed_txn_dir
        ), "Failed transaction directory should still be empty."

    def test_janitor_handles_multiple_timed_out_transactions(self, temp_dir):
        """Every timed-out running transaction is moved and cleaned up in one pass."""
        catalog_root, _ = resolve_path_and_filesystem(temp_dir)
        running_txn_dir, failed_txn_dir = self._make_txn_dirs(catalog_root)

        # All transactions share the same first token: 1 second in the past.
        start_time = time.time_ns() - 1_000_000_000
        txn_files = [
            self._write_timed_out_txn(catalog_root, running_txn_dir, txn_id, start_time)
            for txn_id in ("txn_one", "txn_two", "txn_three")
        ]

        # Run the janitor function to move timed-out transactions to the failed directory.
        janitor_delete_timed_out_transaction(temp_dir)

        for txn_filename, txn_path, test_metafile_path in txn_files:
            new_failed_txn_path = posixpath.join(failed_txn_dir, txn_filename)

            # The transaction file must now exist in the failed directory...
            assert os.path.exists(
                new_failed_txn_path
            ), f"Expected {new_failed_txn_path} to exist."

            # ...and no longer in the running directory.
            assert not os.path.exists(
                txn_path
            ), f"Transaction file {txn_path} still exists in running directory."

            # The corresponding metafile must have been deleted.
            assert not os.path.exists(
                test_metafile_path
            ), f"Metafile {test_metafile_path} was not deleted."

    def test_janitor_remove_files_failed(self, temp_dir):
        """Files written by a transaction whose log sits in failed are deleted."""
        catalog_root, filesystem = resolve_path_and_filesystem(temp_dir)
        running_txn_dir, failed_txn_dir = self._make_txn_dirs(catalog_root)
        # The success directory is not created here; it is only read after
        # the commit below.
        success_txn_dir = posixpath.join(
            catalog_root, TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME
        )

        # Commit a real transaction that creates a namespace and a table.
        txn_operations = [
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=meta,
            )
            for meta in (create_test_namespace(), create_test_table())
        ]
        transaction = Transaction.of(txn_operations)
        write_paths, _ = transaction.commit(temp_dir)

        # Locate the committed transaction log: take the first entry of the
        # success directory and then the first file listed inside that entry.
        success_entries = filesystem.get_file_info(
            FileSelector(success_txn_dir, recursive=False)
        )
        success_files = filesystem.get_file_info(
            FileSelector(success_entries[0].path, recursive=False)
        )
        filename = posixpath.basename(success_entries[0].path)

        # Simulate a failed transaction by copying the committed log into both
        # the failed and the running directories (the success copy is kept).
        failed_txn_path = posixpath.join(failed_txn_dir, filename)
        running_txn_path = posixpath.join(running_txn_dir, filename)
        filesystem.copy_file(success_files[0].path, failed_txn_path)
        filesystem.copy_file(success_files[0].path, running_txn_path)

        # Verify that the write path files exist before cleanup.
        for path in write_paths:
            assert os.path.exists(
                path
            ), f"Expected write path {path} to exist before cleanup."

        # Run the cleanup function.
        janitor_remove_files_in_failed(temp_dir, filesystem)

        # Check that all write paths have been deleted.
        for path in write_paths:
            assert not os.path.exists(
                path
            ), f"Write path {path} should have been deleted after cleanup."
|