deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,408 @@
|
|
1
|
+
import posixpath
|
2
|
+
from deltacat.utils.metafile_locator import _find_partition_path
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
import pyarrow as pa
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def sample_schema():
    """Rivulet Schema with an int32 merge key ``id`` plus ``name``/``age``."""
    fields = [
        Field("id", Datatype.int32(), is_merge_key=True),
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
    ]
    return Schema(fields=fields)
|
22
|
+
|
23
|
+
|
24
|
+
@pytest.fixture
def sample_pydict():
    """Three-row column dict matching the fields of ``sample_schema``."""
    return {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
    }
|
27
|
+
|
28
|
+
|
29
|
+
@pytest.fixture
def sample_parquet_data(tmp_path, sample_pydict):
    """Write ``sample_pydict`` to a parquet file and return its path.

    Returns:
        pathlib.Path: location of the written ``test.parquet`` file.
    """
    # The `pyarrow.parquet` submodule is not guaranteed to be loaded by
    # `import pyarrow as pa` alone, so `pa.parquet.write_table` can raise
    # AttributeError. Import the submodule explicitly.
    import pyarrow.parquet as pq

    parquet_path = tmp_path / "test.parquet"
    table = pa.Table.from_pydict(sample_pydict)
    pq.write_table(table, parquet_path)
    return parquet_path
|
35
|
+
|
36
|
+
|
37
|
+
# Updated Tests
|
38
|
+
|
39
|
+
|
40
|
+
def test_dataset_creation_with_schema(tmp_path, sample_schema):
    """A Dataset built from a schema exposes all fields, with merge key intact."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    assert len(ds.fields) == 3
    assert "id" in ds.fields
    assert ds.fields["id"].is_merge_key
|
45
|
+
|
46
|
+
|
47
|
+
def test_dataset_initialization_with_metadata(tmp_path):
    """Constructing with an explicit metadata_uri keeps name and meta folder."""
    ds = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))

    assert ds.dataset_name == "test_dataset"
    assert ds._metadata_folder.startswith(".riv-meta")
|
51
|
+
|
52
|
+
|
53
|
+
def test_invalid_dataset_initialization():
    """An empty dataset name is rejected with a ValueError."""
    with pytest.raises(ValueError, match="Name must be a non-empty string"):
        Dataset(dataset_name="")
|
56
|
+
|
57
|
+
|
58
|
+
def test_dataset_creation_metadata_structure(tmp_path):
    """Creating a Dataset materializes its on-disk metadata hierarchy."""
    ds = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))

    assert ds._metadata_folder.startswith(".riv-meta")
    assert ds._namespace == "default"
    assert ds.dataset_name == "test_dataset"
    assert ds._metadata_path == str(tmp_path / ".riv-meta-test_dataset")

    # Resolve the partition directory under the metadata root and confirm the
    # namespace -> table -> table_version -> stream_id -> partition_id chain
    # of directories was created.
    partition_path = _find_partition_path(ds._metadata_path, ds._locator)
    assert posixpath.exists(partition_path)
|
73
|
+
|
74
|
+
|
75
|
+
def test_fields_accessor_add_field(tmp_path, sample_schema):
    """Fields can be added via .add() or item assignment; bad values raise."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # Add through the accessor method.
    ds.fields.add("new_field", Datatype.float())
    assert "new_field" in ds.fields
    assert ds.fields["new_field"].datatype == Datatype.float()

    # Add through item assignment; the field also appears in the "all" schema.
    ds.fields["new_field2"] = Field("new_field2", Datatype.int32())
    assert "new_field2" in ds.fields
    assert "new_field2" in ds.schemas["all"]

    # Assigning a non-Field value is rejected.
    with pytest.raises(TypeError):
        ds.fields["new_field3"] = 2
|
86
|
+
|
87
|
+
|
88
|
+
def test_field_removal(tmp_path, sample_schema):
    """Deleting a field removes it; re-deleting or reading it raises."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    del ds.fields["age"]
    assert "age" not in ds.fields

    # Deleting a field that is already gone raises ValueError.
    with pytest.raises(ValueError):
        del ds.fields["age"]

    # Reading a removed field raises KeyError.
    with pytest.raises(KeyError):
        _ = ds.fields["age"]
|
96
|
+
|
97
|
+
|
98
|
+
def test_fields_accessor_repr(tmp_path, sample_schema):
    """repr() of the fields accessor mentions every field name."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    rendered = repr(ds.fields)
    for field_name in ("id", "name", "age"):
        assert field_name in rendered, f"Field '{field_name}' missing in repr output"
|
103
|
+
|
104
|
+
|
105
|
+
def test_schemas_accessor_add_group(tmp_path, sample_schema):
    """Assigning a field-name list creates a named schema group."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    ds.schemas["analytics"] = ["id", "name"]

    assert "analytics" in ds.schemas
    assert len(ds.schemas["analytics"]) == 2
|
110
|
+
|
111
|
+
|
112
|
+
def test_schema_removal(tmp_path, sample_schema):
    """The 'all' schema and unknown schemas cannot be deleted; others can."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # The built-in "all" schema is protected.
    with pytest.raises(ValueError):
        del ds.schemas["all"]

    # Deleting a schema that was never created raises too.
    with pytest.raises(ValueError):
        del ds.schemas["does_not_exist"]

    # A user-created schema can be deleted, after which reads raise KeyError.
    ds.schemas["new"] = ["id", "name"]
    del ds.schemas["new"]
    with pytest.raises(KeyError):
        _ = ds.schemas["new"]
|
122
|
+
|
123
|
+
|
124
|
+
def test_dataset_from_parquet(tmp_path, sample_parquet_data):
    """Dataset.from_parquet infers all fields and honors the merge key."""
    ds = Dataset.from_parquet(
        name="test_dataset",
        file_uri=str(sample_parquet_data),
        metadata_uri=str(tmp_path),
        merge_keys="id",
    )

    assert len(ds.fields) == 3
    assert "id" in ds.fields
    assert ds.fields["id"].is_merge_key
|
134
|
+
|
135
|
+
|
136
|
+
def test_parquet_schema_modes(tmp_path, sample_pydict):
    """Verify 'union' vs 'intersect' schema inference over multiple parquet files.

    Two files share only the 'id' column: union mode should surface all three
    fields, intersect mode only the common one.
    """
    # The `pyarrow.parquet` submodule is not guaranteed to be loaded by
    # `import pyarrow as pa` alone, so `pa.parquet.write_table` can raise
    # AttributeError. Import the submodule explicitly.
    import pyarrow.parquet as pq

    # Create two parquet files with overlapping and unique schemas.
    data_1 = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
    data_2 = {"id": [4, 5, 6], "age": [25, 30, 35]}

    path_1 = tmp_path / "data1.parquet"
    path_2 = tmp_path / "data2.parquet"
    pq.write_table(pa.Table.from_pydict(data_1), path_1)
    pq.write_table(pa.Table.from_pydict(data_2), path_2)

    dataset_union = Dataset.from_parquet(
        name="test_dataset_union",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="union",
    )
    assert len(dataset_union.fields) == 3  # id, name, age

    dataset_intersect = Dataset.from_parquet(
        name="test_dataset_intersect",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="intersect",
    )
    assert len(dataset_intersect.fields) == 1  # Only id
|
161
|
+
|
162
|
+
|
163
|
+
def test_merge_all_schemas():
    """Schema.merge_all unions field sets that share a merge key."""
    left = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    right = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("age", Datatype.int32()),
        ]
    )

    merged = Schema.merge_all([left, right])

    # The shared "id" key is deduplicated; unique fields from both survive.
    assert len(merged) == 3
    assert "id" in merged
    assert "name" in merged
    assert "age" in merged
|
181
|
+
|
182
|
+
|
183
|
+
def test_writer_creation_with_custom_format(tmp_path, sample_schema):
    """A writer can be created for a non-default file format ('feather')."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    assert ds.writer(file_format="feather") is not None
|
187
|
+
|
188
|
+
|
189
|
+
def test_scan_with_query(tmp_path, sample_schema):
    """Dataset.scan accepts an (empty placeholder) QueryExpression."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    placeholder_query = QueryExpression()
    assert ds.scan(placeholder_query) is not None
|
194
|
+
|
195
|
+
|
196
|
+
def test_add_schema_to_new_schemas(tmp_path):
    """add_schema with a new name creates that schema group with all fields."""
    ds = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    new_schema = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(new_schema, schema_name="new_group")

    # The group exists and carries every field with its declared datatype.
    assert "new_group" in ds.schemas
    assert len(ds.schemas["new_group"]) == 3
    assert ds.schemas["new_group"]["id"].datatype == Datatype.int32()
    assert ds.schemas["new_group"]["name"].datatype == Datatype.string()
    assert ds.schemas["new_group"]["age"].datatype == Datatype.int32()
|
218
|
+
|
219
|
+
|
220
|
+
def test_add_schema_to_existing_schemas(tmp_path):
    """add_schema onto an existing group merges the two field sets."""
    ds = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    first = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(first, schema_name="existing_group")

    second = Schema(
        [
            ("age", Datatype.int32()),
            ("email", Datatype.string()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(second, schema_name="existing_group")

    # The group now holds the union of both schemas' fields.
    assert "existing_group" in ds.schemas
    assert len(ds.schemas["existing_group"]) == 4
    assert ds.schemas["existing_group"]["id"].datatype == Datatype.int32()
    assert ds.schemas["existing_group"]["name"].datatype == Datatype.string()
    assert ds.schemas["existing_group"]["age"].datatype == Datatype.int32()
    assert ds.schemas["existing_group"]["email"].datatype == Datatype.string()
|
252
|
+
|
253
|
+
|
254
|
+
def test_add_schema_conflicting_fields(tmp_path):
    """A datatype conflict on an existing field is rejected; a matching
    redefinition merges cleanly."""
    ds = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    base = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(base, schema_name="conflicting_group")

    # "id" redefined with a different datatype -> rejected.
    conflicting = Schema(
        [
            ("id", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    with pytest.raises(ValueError, match="already exists"):
        ds.add_schema(conflicting, schema_name="conflicting_group")

    # "id" redefined with the SAME datatype -> no conflict; "age" is merged in.
    compatible = Schema(
        [
            ("id", Datatype.int32()),
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(compatible, schema_name="conflicting_group")

    assert "conflicting_group" in ds.schemas
    assert len(ds.schemas["conflicting_group"]) == 3
    assert ds.schemas["conflicting_group"]["id"].datatype == Datatype.int32()
    assert ds.schemas["conflicting_group"]["name"].datatype == Datatype.string()
    assert ds.schemas["conflicting_group"]["age"].datatype == Datatype.int32()
|
294
|
+
|
295
|
+
|
296
|
+
def test_add_fields_with_merge_key_field(tmp_path):
    """A field flagged as a merge key becomes the default schema's merge key."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))
    merge_key_field = Field("my_merge_key", Datatype.string(), True)
    dataset.add_fields([merge_key_field])
    assert dataset.schemas["default"].get_merge_key() == "my_merge_key"
|
301
|
+
|
302
|
+
|
303
|
+
def test_add_schema_to_nonexistent_schemas(tmp_path):
    """Adding a schema under an unknown name creates that field group."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    new_schema = Schema(
        [("id", Datatype.int32()), ("name", Datatype.string())],
        merge_keys=["id"],
    )

    # The target field group does not exist yet; add_schema should create it.
    dataset.add_schema(new_schema, schema_name="nonexistent_group")

    assert "nonexistent_group" in dataset.schemas
    assert len(dataset.schemas["nonexistent_group"]) == 2
|
322
|
+
|
323
|
+
|
324
|
+
def test_add_missing_field_to_schema_raises_error(tmp_path, sample_schema):
    """Assigning a field name the dataset does not define raises ValueError."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # "missing_field" was never added to the dataset, so the 'all' schema
    # assignment must be rejected.
    expected_message = "Field 'missing_field' does not exist in the dataset."
    with pytest.raises(ValueError, match=expected_message):
        dataset.schemas["all"] = ["missing_field"]
|
337
|
+
|
338
|
+
|
339
|
+
def test_schemas_accessor_methods(tmp_path, sample_schema):
    """Exercise __iter__, __len__, and __repr__ on the SchemasAccessor."""
    # The "default" and "all" schemas are created automatically.
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    dataset.schemas["schema_1"] = ["id", "name"]
    dataset.schemas["schema_2"] = ["age"]

    # __iter__ yields every schema name.
    expected_names = {"schema_1", "schema_2", "all", "default"}
    assert (
        set(dataset.schemas) == expected_names
    ), "Schema names do not match expected values"

    # __len__ counts all four schemas.
    assert len(dataset.schemas) == 4, "Length of schemas accessor is incorrect"

    # __repr__ mentions each user-visible schema by name.
    repr_output = repr(dataset.schemas)
    for schema_name in ("schema_1", "schema_2", "all"):
        assert (
            schema_name in repr_output
        ), f"Schema '{schema_name}' missing in repr output"
|
367
|
+
|
368
|
+
|
369
|
+
def test_get_merge_keys(tmp_path, sample_schema):
    """get_merge_keys returns every merge key across all schemas."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # A second schema contributes an additional merge key ("id2").
    extra_schema = Schema(
        [("id2", Datatype.int32()), ("zip", Datatype.string())], merge_keys=["id2"]
    )
    dataset.add_schema(extra_schema, "id2+zip")

    merge_keys = dataset.get_merge_keys()
    assert merge_keys == [
        "id",
        "id2",
    ], f"Expected merge keys ['id', 'id2'], got {merge_keys}"
|
388
|
+
|
389
|
+
|
390
|
+
def test_add_fields_no_fields_raises_error(tmp_path, sample_schema):
    """add_fields rejects an empty field list."""
    empty_dataset = Dataset(dataset_name="test_dataset")
    with pytest.raises(ValueError):
        empty_dataset.add_fields(fields=[])
|
394
|
+
|
395
|
+
|
396
|
+
def test_add_fields_mismatched_merge_keys_raises_error(tmp_path, sample_schema):
    """add_fields validates the merge_keys argument against the fields."""
    dataset = Dataset(dataset_name="test_dataset")

    # Merge key name that is absent from the provided fields.
    with pytest.raises(
        ValueError,
        match="The following merge keys were not found in the provided fields: does_not_exist",
    ):
        dataset.add_fields(fields=sample_schema.values(), merge_keys=["does_not_exist"])

    # Merge key names a field that was not declared as a merge key.
    with pytest.raises(TypeError, match="Merge key status conflict"):
        dataset.add_fields(
            fields=[Field("id", Datatype.int32()), Field("name", Datatype.string())],
            merge_keys=["id"],
        )
|
@@ -0,0 +1,67 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
8
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
10
|
+
import pyarrow as pa
|
11
|
+
import pyarrow.parquet
|
12
|
+
|
13
|
+
|
14
|
+
@pytest.fixture
def sample_schema():
    """Three-field schema whose merge key is "id"."""
    fields = [
        Field("id", Datatype.int32(), is_merge_key=True),
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
    ]
    return Schema(fields=fields)
|
23
|
+
|
24
|
+
|
25
|
+
@pytest.fixture
def sample_pydict():
    """Three sample rows as column-oriented data."""
    return {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
    }
|
28
|
+
|
29
|
+
|
30
|
+
@pytest.fixture
def path(tmp_path):
    """Alias for pytest's per-test temporary directory fixture."""
    return tmp_path
|
33
|
+
|
34
|
+
|
35
|
+
@pytest.fixture
def sample_parquet_data(path, sample_pydict):
    """Write the sample rows to a Parquet file and return its path."""
    parquet_path = path / "test.parquet"
    pyarrow.parquet.write_table(pa.Table.from_pydict(sample_pydict), parquet_path)
    return parquet_path
|
41
|
+
|
42
|
+
|
43
|
+
def test_write_manifest_round_trip(sample_parquet_data, sample_schema):
    """A manifest written by DeltacatManifestIO reads back unchanged."""
    dataset = Dataset.from_parquet(
        file_uri=sample_parquet_data, name="dataset", merge_keys="id"
    )

    root, filesystem = FileStore.filesystem(dataset._metadata_path)
    store = FileStore(root, filesystem=filesystem)
    io = DeltacatManifestIO(root, dataset._locator)

    sst_files = ["sst1.sst", "sst2.sst"]
    manifest_schema = Schema(
        {("id", Datatype.int32()), ("name", Datatype.string())},
        "id",
    )
    level = 2

    store.create_output_file(os.path.join(root, "manifest.json"))
    written = io.write(sst_files, manifest_schema, level)
    manifest = io.read(written)

    assert manifest.context.schema == manifest_schema
    assert manifest.context.level == level
    assert manifest.sst_files == sst_files
|
@@ -0,0 +1,232 @@
|
|
1
|
+
from typing import List, FrozenSet, Dict
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
8
|
+
BlockIntervalTree,
|
9
|
+
BlockGroup,
|
10
|
+
OrderedBlockGroups,
|
11
|
+
Block,
|
12
|
+
)
|
13
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
14
|
+
from deltacat.experimental.storage.rivulet import Schema
|
15
|
+
|
16
|
+
|
17
|
+
@pytest.fixture
def schema1() -> Schema:
    """Schema for the first field group (id/name/age), keyed on "id"."""
    columns = {
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
    }
    return Schema(columns, "id")
|
27
|
+
|
28
|
+
|
29
|
+
@pytest.fixture
def schema2() -> Schema:
    """Schema for the second field group (id/address/zip), keyed on "id"."""
    columns = {
        ("id", Datatype.int32()),
        ("address", Datatype.string()),
        ("zip", Datatype.string()),
    }
    return Schema(columns, "id")
|
39
|
+
|
40
|
+
|
41
|
+
@pytest.fixture
def sst_row_list() -> List[SSTableRow]:
    """Five SSTable rows with deliberately overlapping key ranges."""
    ranges = [
        (0, 100, "block1"),
        (3, 90, "block2"),
        (10, 95, "block3"),
        (0, 10, "block4"),
        (0, 100, "block5"),
    ]
    return [SSTableRow(lo, hi, uri, 0, 1) for lo, hi, uri in ranges]
|
50
|
+
|
51
|
+
|
52
|
+
@pytest.fixture
def sst1(sst_row_list) -> SSTable:
    """SSTable over the first three rows, spanning keys [0, 100]."""
    return SSTable(sst_row_list[:3], 0, 100)
|
55
|
+
|
56
|
+
|
57
|
+
@pytest.fixture
def sst2(sst_row_list) -> SSTable:
    """SSTable over the last two rows, spanning keys [0, 100]."""
    return SSTable(sst_row_list[3:5], 0, 100)
|
60
|
+
|
61
|
+
|
62
|
+
@pytest.fixture
def manifest_context1(schema1) -> DeltaContext:
    """Delta context for the first manifest (level 0)."""
    return DeltaContext(schema1, "manifest-001", 0)
|
65
|
+
|
66
|
+
|
67
|
+
@pytest.fixture
def manifest_context2(schema2) -> DeltaContext:
    """Delta context for the second manifest (level 1)."""
    return DeltaContext(schema2, "manifest-002", 1)
|
70
|
+
|
71
|
+
|
72
|
+
def with_field_group(
    context: DeltaContext, rows: List[SSTableRow], indexes: List[int]
) -> Dict[Schema, FrozenSet[Block]]:
    """Construct a BlockGroup dict for a singular field group."""
    blocks = frozenset(Block(rows[i], context) for i in indexes)
    return {context.schema: blocks}
|
78
|
+
|
79
|
+
|
80
|
+
@pytest.fixture
def expected_block_groups(
    manifest_context1, manifest_context2, sst_row_list
) -> List[BlockGroup]:
    """The block groups the interval tree should produce for sst1 + sst2.

    Each spec entry is (key_min, key_max, row indexes for field group 1,
    row indexes for field group 2).
    """
    spec = [
        (0, 3, [0], [3, 4]),
        (3, 10, [0, 1], [3, 4]),
        (10, 90, [0, 1, 2], [3, 4]),
        (90, 95, [0, 1, 2], [4]),
        (95, 100, [0, 2], [4]),
    ]
    return [
        BlockGroup(
            key_min,
            key_max,
            with_field_group(manifest_context1, sst_row_list, group1_indexes)
            | with_field_group(manifest_context2, sst_row_list, group2_indexes),
        )
        for key_min, key_max, group1_indexes, group2_indexes in spec
    ]
|
116
|
+
|
117
|
+
|
118
|
+
def test_build_sst(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """Two overlapping SSTables produce the expected sorted block groups."""
    tree = BlockIntervalTree()
    tree.add_sst_table(sst1, manifest_context1)
    tree.add_sst_table(sst2, manifest_context2)

    actual = tree.get_sorted_block_groups()
    assert _build_ordered_block_groups(expected_block_groups) == actual
|
133
|
+
|
134
|
+
|
135
|
+
def test_build_sst_with_bounds(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """get_sorted_block_groups honors min/max key bounds (max key inclusive).

    Fix: the original repeated the (0, 10) assertion pair verbatim; the
    duplicate is removed and each remaining case is documented.
    """
    t = BlockIntervalTree()
    t.add_sst_table(sst1, manifest_context1)
    t.add_sst_table(sst2, manifest_context2)

    # Lower bound 20 excludes the first two groups ([0, 3) and [3, 10)).
    block_groups_filtered = t.get_sorted_block_groups(20, 100)
    assert _build_ordered_block_groups(expected_block_groups[2:]) == block_groups_filtered

    # Lower bound 96 keeps only the final [95, 100] group.
    block_groups_filtered = t.get_sorted_block_groups(96, 100)
    assert _build_ordered_block_groups(expected_block_groups[4:]) == block_groups_filtered

    # Upper bound 10 keeps the first three groups.
    block_groups_filtered = t.get_sorted_block_groups(0, 10)
    assert _build_ordered_block_groups(expected_block_groups[0:3]) == block_groups_filtered

    # Max key of 95 is inclusive of last range so it is included.
    block_groups_filtered = t.get_sorted_block_groups(None, 95)
    assert _build_ordered_block_groups(expected_block_groups) == block_groups_filtered

    # 94 excludes the final [95, 100] group.
    block_groups_filtered = t.get_sorted_block_groups(None, 94)
    assert _build_ordered_block_groups(expected_block_groups[0:4]) == block_groups_filtered

    # A degenerate [0, 0] range still matches the first group.
    block_groups_filtered = t.get_sorted_block_groups(0, 0)
    assert _build_ordered_block_groups(expected_block_groups[0:1]) == block_groups_filtered
|
175
|
+
|
176
|
+
|
177
|
+
def test_build_sst_with_non_zero_min_key_matching_global_min_key(manifest_context1):
    """Querying with a lower bound equal to the tree's global min key matches.

    Uses a non-zero min key because 0 is falsy and could mask bound-handling
    bugs.
    """
    min_key, max_key = 1, 95

    row = SSTableRow(min_key, max_key, "row-with-non-zero-min-key", 0, 1)
    tree = BlockIntervalTree()
    tree.add_sst_table(SSTable([row], min_key, max_key), manifest_context1)

    actual = tree.get_sorted_block_groups(min_key, min_key + 1)
    expected = _build_ordered_block_groups(
        [
            BlockGroup(
                min_key,
                max_key,
                {manifest_context1.schema: frozenset([Block(row, manifest_context1)])},
            )
        ]
    )
    assert expected == actual
|
201
|
+
|
202
|
+
|
203
|
+
def test_build_sst_invalid_bounds(
    sst1, sst2, schema1, schema2, sst_row_list, expected_block_groups
):
    """A lower bound greater than the upper bound is rejected."""
    tree = BlockIntervalTree()
    with pytest.raises(ValueError):
        tree.get_sorted_block_groups(10, 0)
|
210
|
+
|
211
|
+
|
212
|
+
def _build_ordered_block_groups(block_groups: List[BlockGroup]) -> OrderedBlockGroups:
    """Build OrderedBlockGroups from a sorted list of block groups.

    The last group is re-created with its trailing flag set, and the boundary
    table holds every group's key_min plus the final group's key_max.
    """
    last = block_groups[-1]
    # Re-create the final group with the last-group flag enabled.
    groups = list(block_groups[:-1])
    groups.append(BlockGroup(last.key_min, last.key_max, last.field_group_to_blocks, True))

    boundaries = [bg.key_min for bg in groups]
    boundaries.append(groups[-1].key_max)

    return OrderedBlockGroups(
        groups[0].key_min,
        groups[-1].key_max,
        groups,
        boundaries,
    )
|