deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
import daft
|
2
|
+
from daft import Table, Identifier
|
3
|
+
import pytest
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from deltacat.catalog import Catalog as DeltaCATCatalog
|
7
|
+
from deltacat.catalog import CatalogProperties
|
8
|
+
from deltacat.experimental.daft.daft_catalog import DaftCatalog
|
9
|
+
import shutil
|
10
|
+
import tempfile
|
11
|
+
|
12
|
+
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
|
13
|
+
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
|
14
|
+
|
15
|
+
from pyiceberg.catalog import CatalogType
|
16
|
+
|
17
|
+
|
18
|
+
class TestCatalogIntegration:
    """Integration tests for exposing a native DeltaCAT catalog through Daft.

    Each test works against its own temporary catalog root, which is created
    in ``setup_method`` and deleted again in ``teardown_method``.
    """

    def setup_method(self):
        # Per-test hook: keep the directory on the test instance instead of
        # sharing it through a class attribute.
        self.tmpdir = tempfile.mkdtemp()

    def teardown_method(self):
        shutil.rmtree(self.tmpdir)

    def test_create_table(self):
        """Demonstrate DeltaCAT-Daft integration."""
        # Create a DeltaCAT catalog
        catalog_props = CatalogProperties(root=self.tmpdir)
        dc_catalog = DeltaCATCatalog(catalog_props)

        # Use a random catalog name to prevent namespacing conflicts with other tests
        catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"

        # Convert the DeltaCAT catalog to a Daft catalog
        daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)

        # Register the catalog with Daft's catalog system
        daft.attach_catalog(daft_catalog, catalog_name)

        # Create a sample DataFrame
        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})

        # Create then get table
        daft_catalog.create_table(Identifier("example_table"), df)
        table: Table = daft_catalog.get_table(Identifier("example_table"))
        assert table.name == "example_table"

    def test_get_table(self):
        """Test getting a table from the DeltaCAT-Daft catalog."""
        # Create a DeltaCAT catalog using the existing tmpdir
        catalog_props = CatalogProperties(root=self.tmpdir)
        dc_catalog = DeltaCATCatalog(catalog_props)

        # Convert to DaftCatalog and attach to Daft
        catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
        daft_catalog = DaftCatalog(dc_catalog, catalog_name)
        daft.attach_catalog(daft_catalog, catalog_name)

        # Create a sample DataFrame and table
        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
        table_name = "test_get_table"
        daft_catalog.create_table(Identifier(table_name), df)

        # Get the table using different forms of identifiers
        # 1. Table name only
        table2 = daft_catalog.get_table(Identifier(table_name))
        assert table2 is not None
        assert table2.name == table_name

        # 2. With namespace. DeltaCAT used the default namespace since it was not provided
        table3 = daft_catalog.get_table(Identifier("default", table_name))
        assert table3 is not None
        assert table3.name == table_name

        # Test non-existent table raises an appropriate error
        with pytest.raises(ValueError, match="Table nonexistent_table not found"):
            daft_catalog.get_table(Identifier("nonexistent_table"))
|
78
|
+
|
79
|
+
|
80
|
+
class TestIcebergCatalogIntegration:
    """Integration test for exposing an Iceberg-backed DeltaCAT catalog through Daft.

    The test uses a temporary directory as the Iceberg warehouse; it is
    created in ``setup_method`` and deleted again in ``teardown_method``.
    """

    def setup_method(self):
        # Per-test hook: keep the directory on the test instance instead of
        # sharing it through a class attribute.
        self.tmpdir = tempfile.mkdtemp()

    def teardown_method(self):
        shutil.rmtree(self.tmpdir)

    def test_iceberg_catalog_integration(self):
        """Create a table through Daft and verify it through the wrapped Iceberg catalog."""
        # Create a unique warehouse path for this test
        warehouse_path = self.tmpdir

        # Configure an Iceberg catalog with the warehouse path
        # NOTE(review): mkdtemp() returns an absolute path, so this URI ends up
        # with five slashes after "sqlite:" - confirm that is intended.
        config = IcebergCatalogConfig(
            type=CatalogType.SQL,
            properties={
                "warehouse": warehouse_path,
                "uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
            },
        )
        dc_catalog = IcebergCatalog.from_config(config)

        # Convert the DeltaCAT catalog to a Daft catalog
        catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
        daft_catalog = DaftCatalog(dc_catalog, catalog_name)
        daft.attach_catalog(daft_catalog, catalog_name)

        # Create a sample DataFrame
        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})

        # Create a table with the Daft catalog
        table_name = "example_table"
        namespace = "example_namespace"
        daft_catalog.create_table(Identifier(namespace, table_name), df)

        # Query that Iceberg table exists using PyIceberg
        iceberg_catalog = dc_catalog.inner

        # Verify the table exists in the Iceberg catalog
        tables = iceberg_catalog.list_tables(namespace)

        assert any(
            t[0] == namespace and t[1] == table_name for t in tables
        ), f"Table {table_name} not found in Iceberg catalog"

        # Load the table from Iceberg catalog and verify its properties
        iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")

        # Check that the schema matches our DataFrame
        schema = iceberg_table.schema()
        assert (
            schema.find_field("id") is not None
        ), "Field 'id' not found in table schema"
        assert (
            schema.find_field("value") is not None
        ), "Field 'value' not found in table schema"
|
File without changes
|
File without changes
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import io
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
from faker import Faker
|
5
|
+
|
6
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
9
|
+
import random
|
10
|
+
import string
|
11
|
+
from PIL import Image
|
12
|
+
|
13
|
+
# Number of rows generated for each of the ds1/ds2 fixture tables.
FIXTURE_ROW_COUNT = 10_000
|
14
|
+
|
15
|
+
|
16
|
+
@pytest.fixture
def ds1_dataset() -> MvpTable:
    """
    Table with FIXTURE_ROW_COUNT rows and the columns ``id``, ``name``, ``age``.

    ``id`` contains every integer from 1 to FIXTURE_ROW_COUNT exactly once, in
    shuffled order; ``name`` is a random string of 3-10 ASCII letters and
    ``age`` a random integer between 18 and 100.

    TODO change to use Faker instead of int ranges
    """
    alphabet = string.ascii_uppercase + string.ascii_lowercase

    def generate_random_name():
        # Draw the length first, then that many letters.
        length = random.randint(3, 10)
        return "".join(random.choices(alphabet, k=length))

    # Shuffled ids 1..FIXTURE_ROW_COUNT
    ids = list(range(1, FIXTURE_ROW_COUNT + 1))
    random.shuffle(ids)

    # Build the remaining columns, one value per row
    names = [generate_random_name() for _ in range(FIXTURE_ROW_COUNT)]
    ages = [random.randint(18, 100) for _ in range(FIXTURE_ROW_COUNT)]

    return MvpTable({"id": ids, "name": names, "age": ages})
|
45
|
+
|
46
|
+
|
47
|
+
@pytest.fixture
def ds1_schema():
    """Schema with the fields ``id``, ``name`` and ``age``.

    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    merge key - confirm against ``Schema``).
    """
    # A list keeps the field order deterministic; iteration order of a set
    # literal holding these tuples is not guaranteed to be stable across runs.
    return Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        "id",
    )
|
57
|
+
|
58
|
+
|
59
|
+
@pytest.fixture
def ds2_dataset():
    """
    Table with FIXTURE_ROW_COUNT rows and the columns ``id``, ``address``, ``zip``.

    ``id`` contains every integer from 1 to FIXTURE_ROW_COUNT exactly once, in
    shuffled order, so the table shares its id range with any table built the
    same way; ``address`` and ``zip`` are generated with Faker.
    """
    # Shuffled ids 1..FIXTURE_ROW_COUNT
    ids = list(range(1, FIXTURE_ROW_COUNT + 1))
    random.shuffle(ids)

    fake = Faker()

    # Build the remaining columns, one value per row
    addresses = [fake.address() for _ in range(FIXTURE_ROW_COUNT)]
    zip_codes = [fake.zipcode() for _ in range(FIXTURE_ROW_COUNT)]

    return MvpTable({"id": ids, "address": addresses, "zip": zip_codes})
|
79
|
+
|
80
|
+
|
81
|
+
@pytest.fixture
def ds2_schema():
    """Schema with the fields ``id``, ``address`` and ``zip``.

    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    merge key - confirm against ``Schema``).
    """
    # A list keeps the field order deterministic; iteration order of a set
    # literal holding these tuples is not guaranteed to be stable across runs.
    return Schema(
        [
            ("id", Datatype.int32()),
            ("address", Datatype.string()),
            ("zip", Datatype.string()),
        ],
        "id",
    )
|
91
|
+
|
92
|
+
|
93
|
+
@pytest.fixture
def combined_schema(ds1_schema, ds2_schema):
    """Schema with the fields ``id``, ``address``, ``zip``, ``name`` and ``age``.

    The field list is spelled out literally; the ``ds1_schema`` and
    ``ds2_schema`` fixtures are requested but their values are not read here.
    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    merge key - confirm against ``Schema``).
    """
    # A list keeps the field order deterministic; iteration order of a set
    # literal holding these tuples is not guaranteed to be stable across runs.
    return Schema(
        [
            ("id", Datatype.int32()),
            ("address", Datatype.string()),
            ("zip", Datatype.string()),
            ("name", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        "id",
    )
|
105
|
+
|
106
|
+
|
107
|
+
@pytest.fixture
def dataset_images_with_label() -> "tuple[MvpTable, Schema]":
    """
    Table of 1,000 labelled images together with its schema.

    ``id`` contains every integer from 1 to 1,000 exactly once, in shuffled
    order. Every row reuses the same image payload (one randomly coloured
    512x512 image), and ``label`` is a Faker-generated name.

    Returns:
        A ``(table, schema)`` pair.
    """
    ROW_COUNT = 1000
    fake = Faker()
    # A list keeps the field order deterministic; iteration order of a set
    # literal holding these tuples is not guaranteed to be stable across runs.
    schema = Schema(
        [
            ("id", Datatype.int32()),
            # NOTE(review): declared as "jpg" while the payload below is
            # PNG-encoded - confirm which one is intended.
            ("image", Datatype.image("jpg")),
            ("label", Datatype.string()),
        ],
        "id",
    )

    # Create a list of numbers from 1 to ROW_COUNT
    ids = list(range(1, ROW_COUNT + 1))
    random.shuffle(ids)

    fake_image = Image.new(
        "RGB",
        (512, 512),
        color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
    )
    # Encode the image as PNG into memory; getvalue() returns the whole
    # buffer regardless of the current stream position.
    buffer = io.BytesIO()
    fake_image.save(buffer, format="PNG")
    image_bytes = buffer.getvalue()

    # Generate ROW_COUNT rows, all sharing the same image bytes
    return (
        MvpTable(
            {
                "id": ids,
                "image": [image_bytes] * ROW_COUNT,
                "label": [fake.name() for _ in range(ROW_COUNT)],
            }
        ),
        schema,
    )
|
File without changes
|
@@ -0,0 +1,94 @@
|
|
1
|
+
import pytest
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
import pyarrow.parquet as pq
|
5
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
6
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
8
|
+
from deltacat.utils.metafile_locator import _find_partition_path
|
9
|
+
|
10
|
+
|
11
|
+
@pytest.fixture
|
12
|
+
def sample_schema():
    """Return a Schema with an int32 merge key ``id`` plus ``name`` and ``age`` fields."""
    id_field = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[id_field, name_field, age_field])
|
20
|
+
|
21
|
+
|
22
|
+
@pytest.fixture
|
23
|
+
def sample_pydict():
    """Return three sample rows in column-oriented form (``id``, ``name``, ``age``)."""
    return dict(
        id=[1, 2, 3],
        name=["Alice", "Bob", "Charlie"],
        age=[25, 30, 35],
    )
|
25
|
+
|
26
|
+
|
27
|
+
@pytest.fixture
|
28
|
+
def temp_storage_path(tmp_path):
    """Expose pytest's ``tmp_path`` under a storage-specific fixture name."""
    storage_root = tmp_path
    return storage_root
|
30
|
+
|
31
|
+
|
32
|
+
@pytest.fixture
|
33
|
+
def sample_parquet_data(temp_storage_path, sample_pydict):
    """Write the sample rows to ``test.parquet`` in the temp directory and return its path."""
    destination = temp_storage_path / "test.parquet"
    pq.write_table(pa.Table.from_pydict(sample_pydict), destination)
    return destination
|
38
|
+
|
39
|
+
|
40
|
+
@pytest.fixture
|
41
|
+
def dataset(sample_parquet_data):
    """Create a Dataset named ``dataset`` from the sample parquet file, merging on ``id``."""
    parquet_options = {
        "file_uri": sample_parquet_data,
        "name": "dataset",
        "merge_keys": "id",
    }
    return Dataset.from_parquet(**parquet_options)
|
45
|
+
|
46
|
+
|
47
|
+
@pytest.fixture
|
48
|
+
def file_provider(dataset):
    """Return the dataset's private ``_file_provider`` so tests can exercise it directly."""
    provider = dataset._file_provider
    return provider
|
50
|
+
|
51
|
+
|
52
|
+
def test_provide_data_file(file_provider):
    """Two consecutive parquet data files must each be a ``data`` path and must differ."""
    first = file_provider.provide_data_file("parquet")
    assert "data" in first.location
    assert first.location.endswith(".parquet")

    second = file_provider.provide_data_file("parquet")
    assert "data" in second.location
    assert second.location.endswith(".parquet")

    # The provider is expected to hand out a fresh location on every call.
    assert (
        first.location != second.location
    ), "Two output files should have different locations."
|
64
|
+
|
65
|
+
|
66
|
+
def test_provide_manifest_file(file_provider):
    """A manifest file is expected under ``metadata/manifests`` with a ``.json`` suffix."""
    manifest_file = file_provider.provide_manifest_file()
    assert "metadata/manifests" in manifest_file.location
    assert manifest_file.location.endswith(".json")
|
70
|
+
|
71
|
+
|
72
|
+
def test_provide_l0_sst_file(file_provider):
    """A level-0 SST file is expected under ``metadata/ssts/0`` with a ``.json`` suffix."""
    sst_file = file_provider.provide_l0_sst_file()
    assert "metadata/ssts/0" in sst_file.location
    assert sst_file.location.endswith(".json")
|
76
|
+
|
77
|
+
|
78
|
+
def test_provide_input_file(file_provider, sample_parquet_data):
    """An input file built from a path string must report that same string as its location."""
    source = str(sample_parquet_data)
    provided = file_provider.provide_input_file(source)
    assert provided.location == str(sample_parquet_data)
|
81
|
+
|
82
|
+
|
83
|
+
def test_generate_sst_uris(file_provider):
    """Every generated SST URI must live under ``metadata/ssts/0`` and end in ``.json``."""
    # NOTE(review): if no URIs are generated the loop body never runs and the
    # test passes without asserting anything - confirm that is acceptable.
    sst_files = list(file_provider.generate_sst_uris())
    for sst_file in sst_files:
        assert "metadata/ssts/0" in sst_file.location
        assert sst_file.location.endswith(".json")
|
88
|
+
|
89
|
+
|
90
|
+
def test_get_scan_directories(file_provider):
    """The only SST scan directory must be ``<partition path>/metadata/ssts/0/``."""
    partition_path = _find_partition_path(file_provider.uri, file_provider._locator)
    actual_directories = file_provider.get_sst_scan_directories()
    expected_directories = [f"{partition_path}/metadata/ssts/0/"]
    assert actual_directories == expected_directories
|
File without changes
|
@@ -0,0 +1,80 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
3
|
+
QueryExpression,
|
4
|
+
)
|
5
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def sample_range_shard():
    """Return an integer RangeShard with ``min_key`` 5 and ``max_key`` 15."""
    low, high = 5, 15
    return RangeShard(min_key=low, max_key=high)
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def sample_string_shard():
    """Return a string RangeShard with ``min_key`` "apple" and ``max_key`` "zebra"."""
    low, high = "apple", "zebra"
    return RangeShard(min_key=low, max_key=high)
|
16
|
+
|
17
|
+
|
18
|
+
def test_with_key():
    """``with_key`` must pin both bounds to the key and reject a second key."""
    expr = QueryExpression[int]()
    expr.with_key(5)
    assert expr.min_key == 5
    assert expr.max_key == 5

    # A key is already set, so setting another one is expected to fail.
    with pytest.raises(ValueError):
        expr.with_key(10)
|
25
|
+
|
26
|
+
|
27
|
+
def test_with_range():
    """``with_range`` must order reversed bounds and reject a second range."""
    expr = QueryExpression[int]()
    # Bounds are deliberately passed high-first; min/max must come out ordered.
    expr.with_range(10, 5)
    assert expr.min_key == 5
    assert expr.max_key == 10

    # A range is already set, so setting another one is expected to fail.
    with pytest.raises(ValueError):
        expr.with_range(20, 25)
|
34
|
+
|
35
|
+
|
36
|
+
def test_matches_query():
    """An unbounded query matches every key; a ranged query matches only keys inside it."""
    expr = QueryExpression[int]()
    # No range configured yet: every key is expected to match.
    for unbounded_key in (5, -999):
        assert expr.matches_query(unbounded_key)

    expr.with_range(10, 20)
    assert expr.matches_query(15)
    # Keys above and below the range must be rejected.
    for outside_key in (25, 5):
        assert not expr.matches_query(outside_key)
|
44
|
+
|
45
|
+
|
46
|
+
def test_below_query_range():
    """Only keys smaller than the configured minimum count as below the query range."""
    expr = QueryExpression[int]()
    # Without a range nothing is considered below it.
    assert not expr.below_query_range(5)

    expr.with_range(10, 20)
    assert expr.below_query_range(5)
    # Keys inside or above the range are not below it.
    for key in (15, 25):
        assert not expr.below_query_range(key)
|
53
|
+
|
54
|
+
|
55
|
+
def test_with_shard_existing_query(sample_range_shard):
    """Combining a 10..20 query with the 5..15 shard is expected to give bounds 5..20."""
    base_expr = QueryExpression[int]()
    ranged_expr = base_expr.with_range(10, 20)
    combined = QueryExpression.with_shard(ranged_expr, sample_range_shard)
    assert combined.min_key == 5
    assert combined.max_key == 20
|
60
|
+
|
61
|
+
|
62
|
+
def test_with_shard_none_shard():
    """Passing ``None`` as the shard must leave the query's 10..20 bounds unchanged."""
    base_expr = QueryExpression[int]()
    ranged_expr = base_expr.with_range(10, 20)
    unchanged = QueryExpression.with_shard(ranged_expr, None)
    assert unchanged.min_key == 10
    assert unchanged.max_key == 20
|
67
|
+
|
68
|
+
|
69
|
+
def test_with_shard_existing_query_string(sample_string_shard):
    """Combining a "banana".."yellow" query with the "apple".."zebra" shard must give the shard's bounds."""
    base_expr = QueryExpression[str]()
    ranged_expr = base_expr.with_range("banana", "yellow")
    combined = QueryExpression.with_shard(ranged_expr, sample_string_shard)
    assert combined.min_key == "apple"
    assert combined.max_key == "zebra"
|
74
|
+
|
75
|
+
|
76
|
+
def test_query_expression_string_matches():
    """A string range "apple".."cat" must match keys inside it and reject "dog"."""
    base_expr = QueryExpression[str]()
    expr = base_expr.with_range("apple", "cat")
    for inside_key in ("apple", "banana"):
        assert expr.matches_query(inside_key)
    assert not expr.matches_query("dog")
|
@@ -0,0 +1,119 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
|
3
|
+
import pyarrow as pa
|
4
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def combined_schema():
    """Return the schema expected after merging both schemas: int64 ``id`` merge key plus all other fields."""
    merged_fields = [
        Field("id", Datatype.int64(), is_merge_key=True),
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
        Field("height", Datatype.int64()),
        Field("gender", Datatype.string()),
    ]
    return Schema(fields=merged_fields)
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def initial_schema():
    """Return the first schema: int32 ``id`` merge key with ``name`` and ``age``."""
    starting_fields = [
        Field("id", Datatype.int32(), is_merge_key=True),
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
    ]
    return Schema(fields=starting_fields)
|
30
|
+
|
31
|
+
|
32
|
+
@pytest.fixture
|
33
|
+
def extended_schema():
    """Return the second schema: int64 ``id`` merge key with ``height`` and ``gender``."""
    additional_fields = [
        Field("id", Datatype.int64(), is_merge_key=True),
        Field("height", Datatype.int64()),
        Field("gender", Datatype.string()),
    ]
    return Schema(fields=additional_fields)
|
41
|
+
|
42
|
+
|
43
|
+
@pytest.fixture
|
44
|
+
def sample_data():
    """Return column-oriented rows for the initial schema (``id``, ``name``, ``age``)."""
    column_names = ("id", "name", "age")
    column_values = ([1, 2, 3], ["Alice", "Bob", "Charlie"], [25, 30, 35])
    return dict(zip(column_names, column_values))
|
50
|
+
|
51
|
+
|
52
|
+
@pytest.fixture
|
53
|
+
def extended_data():
    """Return column-oriented rows for the extended schema (``id``, ``height``, ``gender``)."""
    return dict(
        id=[1, 2, 3],
        height=[150, 160, 159],
        gender=["male", "female", "male"],
    )
|
59
|
+
|
60
|
+
|
61
|
+
@pytest.fixture
|
62
|
+
def combined_data(sample_data, extended_data):
    """Return a new dict with the columns of both inputs; ``extended_data`` wins on shared keys."""
    # Unpacking builds a fresh dict, so neither input is mutated.
    return {**sample_data, **extended_data}
|
66
|
+
|
67
|
+
|
68
|
+
@pytest.fixture
|
69
|
+
def parquet_data(tmp_path, sample_data):
    """
    Write ``sample_data`` to ``test.parquet`` under ``tmp_path`` and return the file path.

    Args:
        tmp_path: Directory in which the parquet file is created.
        sample_data: Column-oriented dict passed to ``pa.Table.from_pydict``.

    Returns:
        The path of the written parquet file (``tmp_path / "test.parquet"``).
    """
    # Import the submodule explicitly: this file only does `import pyarrow as pa`,
    # so `pa.parquet` resolves only if some other module already imported
    # `pyarrow.parquet`. The local import removes that hidden dependency.
    import pyarrow.parquet as pq

    parquet_path = tmp_path / "test.parquet"
    table = pa.Table.from_pydict(sample_data)
    pq.write_table(table, parquet_path)
    return parquet_path
|
74
|
+
|
75
|
+
|
76
|
+
@pytest.fixture
|
77
|
+
def sample_dataset(parquet_data, tmp_path):
    """Create ``test_dataset`` from the parquet file, with metadata under ``tmp_path`` and merge key ``id``."""
    source_uri = str(parquet_data)
    metadata_root = str(tmp_path)
    return Dataset.from_parquet(
        name="test_dataset",
        file_uri=source_uri,
        metadata_uri=metadata_root,
        merge_keys="id",
    )
|
84
|
+
|
85
|
+
|
86
|
+
def test_end_to_end_scan_with_multiple_schemas(
    sample_dataset,
    initial_schema,
    extended_schema,
    combined_schema,
    sample_data,
    extended_data,
    combined_data,
):
    """Scan a dataset before and after adding a second schema and check each view of the data."""
    # The freshly created dataset must expose only the initial schema and data.
    initial_table = sample_dataset.scan().to_arrow()
    verify_pyarrow_scan(initial_table, initial_schema, sample_data)

    # Register a second schema and write rows that use it.
    sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
    extended_rows = [
        {"id": 1, "height": 150, "gender": "male"},
        {"id": 2, "height": 160, "gender": "female"},
        {"id": 3, "height": 159, "gender": "male"},
    ]
    schema2_writer = sample_dataset.writer(schema_name="schema2")
    schema2_writer.write(extended_rows)
    schema2_writer.flush()

    # A scan restricted to the extended schema must retrieve only extended data.
    extended_table = sample_dataset.scan(schema_name="schema2").to_arrow()
    verify_pyarrow_scan(extended_table, extended_schema, extended_data)

    # An unrestricted scan must retrieve data matching the combined schema.
    combined_table = sample_dataset.scan().to_arrow()
    verify_pyarrow_scan(combined_table, combined_schema, combined_data)
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import pytest
|
2
|
+
import os
|
3
|
+
|
4
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
5
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
6
|
+
DatasetMetastore,
|
7
|
+
)
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
9
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
10
|
+
from deltacat.experimental.storage.rivulet import Schema
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def sample_schema():
    """Return a Schema with an int32 ``id`` and a string ``name``; ``id`` is passed as the second argument."""
    # The columns are passed as a set of (name, datatype) tuples, exactly as the
    # original does, so their iteration order is not guaranteed.
    columns = {("id", Datatype.int32()), ("name", Datatype.string())}
    return Schema(columns, "id")
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def sample_pydict():
    """Return three sample rows in column-oriented form (``id`` and ``name``)."""
    return dict(
        id=[1, 2, 3],
        name=["Alice", "Bob", "Charlie"],
    )
|
24
|
+
|
25
|
+
|
26
|
+
def test_dataset_metastore_e2e(sample_schema, tmp_path):
    """
    Write two manifests through DeltacatManifestIO and read them back via DatasetMetastore.

    One manifest is written per level (1 and 2), each referencing two SST
    files created on disk. The test then checks that the metastore yields one
    accessor per written manifest, that each accessor carries the sample
    schema and the SST files written for its level, and that every written
    level is seen exactly once.
    """
    # Setup
    dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
    file_provider = dataset._file_provider
    manifest_io = DeltacatManifestIO(file_provider.uri, dataset._locator)

    # Create multiple manifests
    manifests_data = [
        {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
        {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
    ]

    # Create SST files and manifests
    for manifest_data in manifests_data:
        sst_files = manifest_data["sst_files"]
        for sst in sst_files:
            with open(os.path.join(file_provider.uri, sst), "w") as f:
                f.write("test data")

        manifest_io.write(sst_files, sample_schema, manifest_data["level"])

    # Initialize DatasetMetastore
    metastore = DatasetMetastore(
        file_provider.uri,
        file_provider,
        file_provider._locator,
        manifest_io=manifest_io,
    )

    # Test manifest generation
    manifest_accessors = list(metastore.generate_manifests())
    assert len(manifest_accessors) == len(manifests_data)

    # Look expectations up by level instead of assuming a fixed list position.
    expected_sst_files = {
        entry["level"]: entry["sst_files"] for entry in manifests_data
    }

    # Verify each manifest accessor
    seen_levels = []
    for accessor in manifest_accessors:
        assert accessor.context.schema == sample_schema
        level = accessor.context.level
        assert level in expected_sst_files, f"unexpected manifest level {level}"
        assert accessor.manifest.sst_files == expected_sst_files[level]
        seen_levels.append(level)

    # Every written level must be read back exactly once; a duplicated or
    # missing level would otherwise go unnoticed by the per-accessor checks.
    assert sorted(seen_levels) == sorted(expected_sst_files)
|
File without changes
|