deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import pyarrow
|
5
|
+
from deltacat.storage.model.scan.push_down import PartitionFilter
|
6
|
+
|
7
|
+
import deltacat.logs as logs
|
8
|
+
from deltacat.storage.model.expression import Reference, Literal
|
9
|
+
from deltacat.storage.model.expression.visitor import ExpressionVisitor
|
10
|
+
from pyiceberg.expressions import (
|
11
|
+
And,
|
12
|
+
Or,
|
13
|
+
Not,
|
14
|
+
EqualTo,
|
15
|
+
NotEqualTo,
|
16
|
+
GreaterThan,
|
17
|
+
GreaterThanOrEqual,
|
18
|
+
LessThan,
|
19
|
+
LessThanOrEqual,
|
20
|
+
IsNull,
|
21
|
+
In,
|
22
|
+
)
|
23
|
+
|
24
|
+
# Initialize DeltaCAT logger
|
25
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
|
+
|
27
|
+
|
28
|
+
class IcebergExpressionVisitor(ExpressionVisitor[None, Any]):
|
29
|
+
"""
|
30
|
+
Visitor that translates DeltaCAT expressions to PyIceberg expressions.
|
31
|
+
"""
|
32
|
+
|
33
|
+
def visit(self, expr, context=None):
|
34
|
+
# Handle PartitionFilter by extracting and visiting the inner expression
|
35
|
+
if isinstance(expr, PartitionFilter):
|
36
|
+
return self.visit(expr.expr, context)
|
37
|
+
# Handle all other expressions using the parent's visit method
|
38
|
+
return super().visit(expr, context)
|
39
|
+
|
40
|
+
def visit_reference(self, expr: Reference, context=None) -> str:
|
41
|
+
return expr.field
|
42
|
+
|
43
|
+
def visit_literal(self, expr: Literal, context=None) -> Any:
|
44
|
+
# Convert PyArrow scalar to Python native type
|
45
|
+
return (
|
46
|
+
expr.value.as_py() if isinstance(expr.value, pyarrow.Scalar) else expr.value
|
47
|
+
)
|
48
|
+
|
49
|
+
def visit_and(self, expr, context=None):
|
50
|
+
left = self.visit(expr.left, context)
|
51
|
+
right = self.visit(expr.right, context)
|
52
|
+
return And(left, right)
|
53
|
+
|
54
|
+
def visit_or(self, expr, context=None):
|
55
|
+
left = self.visit(expr.left, context)
|
56
|
+
right = self.visit(expr.right, context)
|
57
|
+
return Or(left, right)
|
58
|
+
|
59
|
+
def visit_not(self, expr, context=None):
|
60
|
+
operand = self.visit(expr.operand, context)
|
61
|
+
return Not(operand)
|
62
|
+
|
63
|
+
def visit_equal(self, expr, context=None):
|
64
|
+
left = self.visit(expr.left, context)
|
65
|
+
right = self.visit(expr.right, context)
|
66
|
+
return EqualTo(left, right)
|
67
|
+
|
68
|
+
def visit_not_equal(self, expr, context=None):
|
69
|
+
left = self.visit(expr.left, context)
|
70
|
+
right = self.visit(expr.right, context)
|
71
|
+
return NotEqualTo(left, right)
|
72
|
+
|
73
|
+
def visit_greater_than(self, expr, context=None):
|
74
|
+
left = self.visit(expr.left, context)
|
75
|
+
right = self.visit(expr.right, context)
|
76
|
+
return GreaterThan(left, right)
|
77
|
+
|
78
|
+
def visit_greater_than_equal(self, expr, context=None):
|
79
|
+
left = self.visit(expr.left, context)
|
80
|
+
right = self.visit(expr.right, context)
|
81
|
+
return GreaterThanOrEqual(left, right)
|
82
|
+
|
83
|
+
def visit_less_than(self, expr, context=None):
|
84
|
+
left = self.visit(expr.left, context)
|
85
|
+
right = self.visit(expr.right, context)
|
86
|
+
return LessThan(left, right)
|
87
|
+
|
88
|
+
def visit_less_than_equal(self, expr, context=None):
|
89
|
+
left = self.visit(expr.left, context)
|
90
|
+
right = self.visit(expr.right, context)
|
91
|
+
return LessThanOrEqual(left, right)
|
92
|
+
|
93
|
+
def visit_is_null(self, expr, context=None):
|
94
|
+
operand = self.visit(expr.operand, context)
|
95
|
+
return IsNull(operand)
|
96
|
+
|
97
|
+
def visit_in(self, expr, context=None):
|
98
|
+
value = self.visit(expr.value, context)
|
99
|
+
values = [self.visit(v, context) for v in expr.values]
|
100
|
+
return In(value, values)
|
101
|
+
|
102
|
+
def visit_between(self, expr, context=None):
|
103
|
+
value = self.visit(expr.value, context)
|
104
|
+
lower = self.visit(expr.lower, context)
|
105
|
+
upper = self.visit(expr.upper, context)
|
106
|
+
return And(GreaterThanOrEqual(value, lower), LessThanOrEqual(value, upper))
|
107
|
+
|
108
|
+
# PyIceberg does not have a direct equivalent of LIKE
|
109
|
+
def visit_like(self, expr, context=None):
|
110
|
+
value = self.visit(expr.value, context)
|
111
|
+
pattern = self.visit(expr.pattern, context)
|
112
|
+
logger.warning(
|
113
|
+
f"LIKE operation is not supported in PyIceberg. Ignoring LIKE filter: {value} LIKE '{pattern}'. "
|
114
|
+
"This may result in more data being returned than expected."
|
115
|
+
)
|
116
|
+
# Return None or a default expression that won't filter anything
|
117
|
+
return (
|
118
|
+
None # or return NotEqualTo(value, None) # matches everything except NULL
|
119
|
+
)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
2
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Field
|
3
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
4
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Datatype
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"Schema",
|
8
|
+
"Field",
|
9
|
+
"Dataset",
|
10
|
+
"Datatype",
|
11
|
+
]
|
File without changes
|
@@ -0,0 +1,78 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Iterator, List, Any
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema
|
7
|
+
from deltacat.experimental.storage.rivulet.serializer import (
|
8
|
+
DataSerializer,
|
9
|
+
MEMTABLE_DATA,
|
10
|
+
)
|
11
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
12
|
+
|
13
|
+
|
14
|
+
class ArrowSerializer(DataSerializer, ABC):
|
15
|
+
"""
|
16
|
+
Utility class which can serialize data by first converting to arrow as intermediate format
|
17
|
+
and then using the provided serialization function
|
18
|
+
"""
|
19
|
+
|
20
|
+
def __init__(self, file_provider: FileProvider, schema: Schema):
|
21
|
+
self.schema = schema
|
22
|
+
self.file_provider = file_provider
|
23
|
+
self.arrow_schema = self.schema.to_pyarrow()
|
24
|
+
|
25
|
+
@abstractmethod
|
26
|
+
def serialize(self, table: pa.Table) -> List[SSTableRow]:
|
27
|
+
"""
|
28
|
+
Write an Arrow table to the specified output file
|
29
|
+
|
30
|
+
:param table: PyArrow table to write
|
31
|
+
:return: Number of row groups in the written file
|
32
|
+
"""
|
33
|
+
pass
|
34
|
+
|
35
|
+
def _to_arrow_table(self, sorted_records: MEMTABLE_DATA) -> pa.Table:
|
36
|
+
"""
|
37
|
+
Convert input records to an Arrow table
|
38
|
+
"""
|
39
|
+
if isinstance(sorted_records, pa.Table):
|
40
|
+
return sorted_records
|
41
|
+
elif isinstance(sorted_records, (Iterator, List)):
|
42
|
+
return pa.Table.from_pylist(sorted_records, schema=self.arrow_schema)
|
43
|
+
else:
|
44
|
+
raise ValueError(f"Unsupported record type: {type(sorted_records)}")
|
45
|
+
|
46
|
+
def _get_min_max_key(self, table: pa.Table) -> (Any, Any):
|
47
|
+
"""
|
48
|
+
Get min and max values for the merge key from the table
|
49
|
+
"""
|
50
|
+
key_col = table[self.schema.get_merge_key()]
|
51
|
+
return key_col[0].as_py(), key_col[len(key_col) - 1].as_py()
|
52
|
+
|
53
|
+
def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
|
54
|
+
"""
|
55
|
+
Write records to new parquet file as row group
|
56
|
+
For now, we will depend on pyarrow to write to parquet
|
57
|
+
|
58
|
+
:param sorted_records: record batch in SORTED ORDER
|
59
|
+
:return: metadata for constructing SSTable
|
60
|
+
"""
|
61
|
+
if not sorted_records:
|
62
|
+
return []
|
63
|
+
|
64
|
+
table = self._to_arrow_table(sorted_records)
|
65
|
+
return self.serialize(table)
|
66
|
+
|
67
|
+
def write(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
|
68
|
+
"""
|
69
|
+
Write records using the provided serialization function
|
70
|
+
|
71
|
+
:param sorted_records: record batch in SORTED ORDER
|
72
|
+
:return: metadata for constructing SSTable
|
73
|
+
"""
|
74
|
+
if not sorted_records:
|
75
|
+
return []
|
76
|
+
|
77
|
+
table = self._to_arrow_table(sorted_records)
|
78
|
+
return self.serialize(table)
|