deltacat 2.0.0.post1__tar.gz → 2.0.0.post3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deltacat-2.0.0.post1/deltacat.egg-info → deltacat-2.0.0.post3}/PKG-INFO +409 -94
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/README.md +408 -93
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/__init__.py +10 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/api.py +127 -22
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/__init__.py +6 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/delegate.py +170 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/interface.py +35 -2
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/main/impl.py +159 -207
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/catalog.py +150 -35
- deltacat-2.0.0.post3/deltacat/catalog/model/properties.py +333 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/private/compaction_utils.py +8 -2
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/merge.py +9 -7
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/converter_session.py +15 -10
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +7 -5
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/io.py +22 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/janitor.py +38 -15
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/constants.py +11 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/bootstrap.py +3 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/explorer.py +0 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/utils/common.py +0 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +0 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/hello_world.py +10 -4
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/indexer.py +3 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/job_runner.py +6 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/exceptions.py +15 -0
- deltacat-2.0.0.post3/deltacat/experimental/compatibility/backfill_transaction_partitions.py +513 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/table_monitor.py +2 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/daft/daft_catalog.py +1 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +7 -2
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/interface.py +6 -7
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/main/impl.py +209 -121
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/delta.py +22 -8
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/manifest.py +81 -9
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/metafile.py +113 -30
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/namespace.py +11 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/partition.py +19 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/schema.py +17 -4
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/stream.py +10 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/table.py +10 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/table_version.py +10 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/transaction.py +259 -108
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/types.py +1 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/test_s3u.py +9 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +57 -6
- deltacat-2.0.0.post3/deltacat/tests/catalog/model/test_properties_transaction_migration.py +232 -0
- deltacat-2.0.0.post3/deltacat/tests/catalog/test_catalogs.py +651 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/test_default_catalog_impl.py +1382 -46
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -18
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/test_convert_session.py +2 -2
- deltacat-2.0.0.post3/deltacat/tests/compute/converter/test_converter_commit_conflict_resolution.py +626 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_janitor.py +60 -38
- deltacat-2.0.0.post3/deltacat/tests/conftest.py +56 -0
- deltacat-2.0.0.post3/deltacat/tests/experimental/compatibility/test_backfill_transaction_partitions.py +477 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/main/test_main_storage.py +17 -8
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_metafile_io.py +142 -18
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_transaction_history.py +128 -68
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_deltacat_api.py +334 -25
- deltacat-2.0.0.post3/deltacat/tests/utils/test_filesystem.py +3319 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/media.py +278 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/tables.py +116 -124
- deltacat-2.0.0.post3/deltacat/utils/filesystem.py +1590 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/pandas.py +11 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/polars.py +3 -1
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/pyarrow.py +7 -3
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/url.py +111 -18
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3/deltacat.egg-info}/PKG-INFO +409 -94
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/SOURCES.txt +5 -2
- deltacat-2.0.0.post1/deltacat/catalog/model/properties.py +0 -155
- deltacat-2.0.0.post1/deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +0 -201
- deltacat-2.0.0.post1/deltacat/tests/catalog/test_catalogs.py +0 -321
- deltacat-2.0.0.post1/deltacat/tests/conftest.py +0 -25
- deltacat-2.0.0.post1/deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +0 -582
- deltacat-2.0.0.post1/deltacat/utils/filesystem.py +0 -450
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/LICENSE +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/MANIFEST.in +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/annotations.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/clients.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/constants.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/s3u.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_engine.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_report.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_suite.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/conftest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/random_row_generator.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/row_generator.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/test_benchmark_pipeline.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/main/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/compaction_session.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compactor_version.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/table_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/dedupe.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/materialize.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/io.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/round_completion_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/sort_key.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/system_columns.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/constants.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/io.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/constants.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_input.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_input_files.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/converter_session_params.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/catalog.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/overrides.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/convert.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/dedupe.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/convert_task_options.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/converter_session_utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/iceberg_columns.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/s3u.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/jobs/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/jobs/client.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/delta.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/manifest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/model.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/parquet.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/types.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/generate_type_mappings.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/env.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/basic_logging.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/aws/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/compactor.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/gcp/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/job_runner.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/app.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/main.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/iceberg_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/aws/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/gcp/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/impl.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/overrides.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/compatibility/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/beam/managed.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/daft/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/impl.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/model.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/visitor.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/arrow/serializer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/dataset.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/dataset_executor.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/file_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/serializer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/file_provider.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/file_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/input_file.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/output_file.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/logical_plan.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/delta.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/json_sst.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/sst.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/mvp/Table.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/mvp/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/file_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/serializer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/block_scanner.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/data_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/data_scan.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/dataset_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/query_expression.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/datatype.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/schema.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/serializer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/serializer_factory.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/shard/range_shard.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/dataset/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/dataset/deltacat_dataset.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasink/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasink/deltacat_datasink.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasource/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasource/deltacat_datasource.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/file_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/memcached_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/ray_plasma_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/reader/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/reader/deltacat_read_api.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/redis_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/s3_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/logs.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/main/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/expression.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/visitor.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/interop.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/locator.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/push_down.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/scan_plan.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/scan_task.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/shard.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/sort_key.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/transform.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/util/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/util/scan_planner.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/reader/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_cloudpickle_bug_fix.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_file_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_memcached_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_ray_plasma_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_redis_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_s3_object_store.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/test_clients.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/data/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/model/test_table_definition.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/conftest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/conftest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_util_common.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_util_constant.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/daft/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/daft/test_model.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/compatibility/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/daft/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/conftest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_dataset.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_manifest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/main/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_delete_parameters.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_expression.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_manifest.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_partition_scheme.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_schema.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_schema_update.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_shard.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_sort_scheme.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_table_version.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_transaction.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_exceptions.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_logs.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/filesystem.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/message_pack_utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/pyarrow.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/storage.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/utils.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/types/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/types/test_tables.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/data/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/exceptions.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/main_deltacat_storage_mock.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_cloudpickle.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_daft.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_metrics.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_numpy.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_pandas.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_placement.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_polars.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_pyarrow.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_resources.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/partial_download.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/arguments.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/cloudpickle.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/common.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/daft.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/export.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/metafile_locator.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/metrics.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/numpy.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/performance.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/placement.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/concurrency.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/reader_compatibility_mapping.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/resources.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/schema.py +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/requires.txt +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/setup.cfg +0 -0
- {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deltacat
|
3
|
-
Version: 2.0.0.post1
|
3
|
+
Version: 2.0.0.post3
|
4
4
|
Summary: DeltaCAT is a portable Pythonic Data Lakehouse powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
@@ -53,22 +53,20 @@ Dynamic: summary
|
|
53
53
|
<img src="https://github.com/ray-project/deltacat/raw/2.0/media/deltacat-logo-alpha-750.png" alt="deltacat logo" style="width:55%; height:auto; text-align: center;">
|
54
54
|
</p>
|
55
55
|
|
56
|
-
DeltaCAT is a portable
|
57
|
-
fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
|
58
|
-
data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
|
56
|
+
DeltaCAT is a portable Multimodal Lakehouse powered by [Ray](https://github.com/ray-project/ray), [Apache Arrow](https://github.com/apache/arrow), and [Daft](https://github.com/Eventual-Inc/Daft). It lets you create ACID-compliant multimodal data lakes [that efficiently scale to manage exabytes of production data](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
|
59
57
|
|
60
|
-
It
|
61
|
-
|
62
|
-
|
58
|
+
It provides data lake level transactions & time travel, zero-copy schema evolution, zero-copy multimodal file processing (image, audio, video, text, etc.), and transparent dataset optimization. It runs locally for rapid development or in the cloud for production workloads. It runs on any filesystem for easy setup and sharing - no external catalog services, lock managers, or key value stores required.
|
59
|
+
|
60
|
+
|
61
|
+
## Overview
|
63
62
|
|
64
63
|
DeltaCAT provides the following high-level components:
|
65
|
-
1. [**Catalog**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/catalog/interface.py):
|
66
|
-
2. [**Compute**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/compute/): Distributed data management procedures
|
67
|
-
3. [**Storage**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/storage/):
|
64
|
+
1. [**Catalog**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/catalog/interface.py): Pythonic APIs to discover, read, write, and manage datasets.
|
65
|
+
2. [**Compute**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/compute/): Distributed data management procedures that automatically optimize your datasets.
|
66
|
+
3. [**Storage**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/storage/): A portable multimodal data lake format useable with any filesystem.
|
68
67
|
4. **Sync** (in development): Synchronize DeltaCAT datasets to data warehouses and other table formats.
|
69
68
|
|
70
|
-
|
71
|
-
DeltaCAT's **Catalog**, **Compute**, and **Storage** layers work together to bring ACID-compliant data management to any Ray application. These components automate data indexing, change management, dataset read/write optimization, schema evolution, and other common data management tasks across any set of data files readable by Ray Data, Daft, Pandas, Polars, PyArrow, or NumPy.
|
69
|
+
DeltaCAT's **Catalog**, **Compute**, and **Storage** layers work together to bring ACID-compliant data management to any Ray application. These components automate data indexing, change management, dataset read/write optimization, schema evolution, and other common data management tasks across any set of data files readable by [Pandas](https://github.com/pandas-dev/pandas), [NumPy](https://github.com/numpy/numpy), [Polars](https://github.com/pola-rs/polars), [PyArrow](https://arrow.apache.org/docs/python/index.html), [Ray Data](https://docs.ray.io/en/latest/data/data.html), and [Daft](https://docs.daft.ai/en/stable/api/dataframe/).
|
72
70
|
|
73
71
|
<p align="center">
|
74
72
|
<img src="https://github.com/ray-project/deltacat/raw/2.0/media/deltacat-tech-overview.png" alt="deltacat tech overview" style="width:100%; height:auto; text-align: center;">
|
@@ -79,10 +77,15 @@ Data consumers that prefer to stay within the ecosystem of Pythonic data managem
|
|
79
77
|
## Getting Started
|
80
78
|
DeltaCAT applications run anywhere that Ray runs, including your local laptop, cloud computing cluster, or on-premise cluster.
|
81
79
|
|
82
|
-
DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of
|
80
|
+
DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of data files. A **Catalog** can be thought of as a named data lake that contains a set of **Tables**. A **Catalog** provides a root location (e.g., a local file path or S3 Bucket) to store information about all your **Tables**, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
|
81
|
+
|
83
82
|
|
84
83
|
### Quick Start
|
85
84
|
|
85
|
+
Install DeltaCAT with: `pip install deltacat`
|
86
|
+
|
87
|
+
Then run this script to create and read your first table:
|
88
|
+
|
86
89
|
```python
|
87
90
|
import deltacat as dc
|
88
91
|
import pandas as pd
|
@@ -108,8 +111,8 @@ dc.write(data, "users")
|
|
108
111
|
daft_df = dc.read("users") # Returns Daft DataFrame (default)
|
109
112
|
daft_df.show() # Materialize and print the DataFrame
|
110
113
|
|
111
|
-
#
|
112
|
-
# Compaction and schema evolution are handled automatically.
|
114
|
+
# Add more data and add a new column.
|
115
|
+
# Compaction and zero-copy schema evolution are handled automatically.
|
113
116
|
data = pd.DataFrame({
|
114
117
|
"id": [4, 5, 6],
|
115
118
|
"name": ["Tom", "Simpkin", "Delta"],
|
@@ -125,13 +128,13 @@ daft_df.select("name", "age", "city").show()
|
|
125
128
|
```
|
126
129
|
|
127
130
|
### Core Concepts
|
128
|
-
DeltaCAT can do much more than just
|
131
|
+
DeltaCAT can do much more than just add data to tables and read it back again. Expand the sections below to see examples of other core DeltaCAT concepts and APIs.
|
129
132
|
|
130
133
|
<details>
|
131
134
|
|
132
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
135
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Idempotent Writes</span></summary>
|
133
136
|
|
134
|
-
If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **
|
137
|
+
If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **adds** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
|
135
138
|
|
136
139
|
```python
|
137
140
|
import deltacat as dc
|
@@ -165,7 +168,7 @@ dc.write(data, "users", mode=write_mode)
|
|
165
168
|
daft_df = dc.read("users") # Returns Daft DataFrame (default)
|
166
169
|
daft_df.show() # Materialize and print the DataFrame
|
167
170
|
|
168
|
-
# Explicitly
|
171
|
+
# Explicitly add more data and add a new column.
|
169
172
|
# Compaction and schema evolution are handled automatically.
|
170
173
|
data = pd.DataFrame({
|
171
174
|
"id": [4, 5, 6],
|
@@ -173,7 +176,7 @@ data = pd.DataFrame({
|
|
173
176
|
"age": [2, 12, 4],
|
174
177
|
"city": ["Hollywood", "Gloucester", "San Francisco"]
|
175
178
|
})
|
176
|
-
dc.write(data, "users", mode=dc.TableWriteMode.
|
179
|
+
dc.write(data, "users", mode=dc.TableWriteMode.ADD)
|
177
180
|
|
178
181
|
# Read the full table back into a Daft DataFrame.
|
179
182
|
daft_df = dc.read("users")
|
@@ -217,7 +220,7 @@ dc.write(data, "users", mode=dc.TableWriteMode.CREATE)
|
|
217
220
|
daft_df = dc.read("users") # Returns Daft DataFrame (default)
|
218
221
|
daft_df.show() # Materialize and print the DataFrame
|
219
222
|
|
220
|
-
# Explicitly
|
223
|
+
# Explicitly add more data and add a new column.
|
221
224
|
# Compaction and schema evolution are handled automatically.
|
222
225
|
data = pd.DataFrame({
|
223
226
|
"id": [4, 5, 6],
|
@@ -225,7 +228,7 @@ data = pd.DataFrame({
|
|
225
228
|
"age": [2, 12, 4],
|
226
229
|
"city": ["Hollywood", "Gloucester", "San Francisco"]
|
227
230
|
})
|
228
|
-
dc.write(data, "users", mode=dc.TableWriteMode.
|
231
|
+
dc.write(data, "users", mode=dc.TableWriteMode.ADD)
|
229
232
|
|
230
233
|
# Read the full table back into a Daft DataFrame.
|
231
234
|
daft_df = dc.read("users")
|
@@ -237,9 +240,117 @@ assert dc.dataset_length(daft_df) == 6
|
|
237
240
|
|
238
241
|
</details>
|
239
242
|
|
243
|
+
|
240
244
|
<details>
|
241
245
|
|
242
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
246
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Ordered Writes</span></summary>
|
247
|
+
DeltaCAT writes are unordered by default, which means that the order in which data is read back isn't guaranteed to match the order in which it was written. While this is useful for preventing conflicts between concurrent writers, you can also use the **APPEND** write mode to preserve write order and raise explicit concurrency conflicts between parallel writers:
|
248
|
+
|
249
|
+
```python
|
250
|
+
import deltacat as dc
|
251
|
+
import pandas as pd
|
252
|
+
|
253
|
+
# Initialize DeltaCAT with a default local catalog.
|
254
|
+
# Ray will be initialized automatically.
|
255
|
+
# Catalog files will be stored in .deltacat/ in the current working directory.
|
256
|
+
dc.init_local()
|
257
|
+
|
258
|
+
# Create data to write.
|
259
|
+
data = pd.DataFrame({
|
260
|
+
"id": [1, 2],
|
261
|
+
"name": ["Cheshire", "Dinah"],
|
262
|
+
"age": [3, 7]
|
263
|
+
})
|
264
|
+
|
265
|
+
# Derive a DeltaCAT schema for the data.
|
266
|
+
schema = dc.Schema.of(dc.dataset_schema(data))
|
267
|
+
|
268
|
+
# Create an empty table to hold ordered user data.
|
269
|
+
if not dc.table_exists("users_ordered"):
|
270
|
+
dc.create_table("users_ordered", schema=schema)
|
271
|
+
|
272
|
+
# Write the first ordered delta to the table.
|
273
|
+
dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
|
274
|
+
|
275
|
+
# Write the second ordered delta to the table.
|
276
|
+
data = pd.DataFrame({
|
277
|
+
"id": [3, 4],
|
278
|
+
"name": ["Felix", "Tom"],
|
279
|
+
"age": [2, 12],
|
280
|
+
"city": ["Hollywood", "Gloucester"]
|
281
|
+
})
|
282
|
+
dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
|
283
|
+
|
284
|
+
# Write the third ordered delta to the table.
|
285
|
+
data = pd.DataFrame({
|
286
|
+
"id": [5, 6],
|
287
|
+
"name": ["Simpkin", "Delta"],
|
288
|
+
"age": [12, 4],
|
289
|
+
"city": ["San Francisco", "San Francisco"]
|
290
|
+
})
|
291
|
+
dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
|
292
|
+
|
293
|
+
# Read the data back as a Pandas DataFrame, and ensure that the
|
294
|
+
# order of the records returned matches the order they were written.
|
295
|
+
pandas_df = dc.read("users_ordered", read_as=dc.DatasetType.PANDAS)
|
296
|
+
print(pandas_df)
|
297
|
+
```
|
298
|
+
|
299
|
+
</details>
|
300
|
+
|
301
|
+
<details>
|
302
|
+
|
303
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Schemaless Tables</span></summary>
|
304
|
+
Tables created automatically via `dc.write` have a schema inferred from the data written by default. However, if you create an empty table without providing a schema, it defaults to schemaless. Writes to schemaless tables are more efficient and flexible, since they simply track the location and basic metadata associated with the data files written to the table. However, if you know that a unified schema can be derived for your schemaless data, then you can still read it back as a structured dataset:
|
305
|
+
|
306
|
+
```python
|
307
|
+
import deltacat as dc
|
308
|
+
import pandas as pd
|
309
|
+
|
310
|
+
# Initialize DeltaCAT with a default local catalog.
|
311
|
+
# Ray will be initialized automatically.
|
312
|
+
# Catalog files will be stored in .deltacat/ in the current working directory.
|
313
|
+
dc.init_local()
|
314
|
+
|
315
|
+
# Create data to write.
|
316
|
+
data = pd.DataFrame({
|
317
|
+
"id": [1, 2],
|
318
|
+
"name": ["Cheshire", "Dinah"],
|
319
|
+
"age": [3, 7]
|
320
|
+
})
|
321
|
+
|
322
|
+
# Create an empty schemaless table to hold ordered user data.
|
323
|
+
if not dc.table_exists("users_schemaless"):
|
324
|
+
dc.create_table("users_schemaless")
|
325
|
+
|
326
|
+
# Write the first ordered delta to the table.
|
327
|
+
dc.write(data, "users_schemaless", mode=dc.TableWriteMode.APPEND)
|
328
|
+
|
329
|
+
# Write the second ordered delta to the table.
|
330
|
+
data = pd.DataFrame({
|
331
|
+
"id": [3, 4],
|
332
|
+
"name": ["Felix", "Tom"],
|
333
|
+
"age": [2, 12],
|
334
|
+
"city": ["Hollywood", "Gloucester"]
|
335
|
+
})
|
336
|
+
dc.write(data, "users_schemaless", mode=dc.TableWriteMode.APPEND)
|
337
|
+
|
338
|
+
# Read back the file manifest of the schemaless table.
|
339
|
+
# Notice that file paths, sizes, etc. are returned instead of the dataframes written.
|
340
|
+
manifest_df = dc.read("users_schemaless", read_as=dc.DatasetType.PANDAS)
|
341
|
+
print(manifest_df)
|
342
|
+
|
343
|
+
# Use from_manifest_table to convert the manifest table to a structured dataset.
|
344
|
+
structured_daft_df = dc.from_manifest_table(manifest_df)
|
345
|
+
structured_daft_df.show()
|
346
|
+
```
|
347
|
+
|
348
|
+
</details>
|
349
|
+
|
350
|
+
|
351
|
+
<details>
|
352
|
+
|
353
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Working Across Dataset and File Types</span></summary>
|
243
354
|
|
244
355
|
DeltaCAT natively supports a variety of open dataset and file formats already integrated with Ray and Arrow. You can use `dc.read` to read tables back as a Daft DataFrame, Ray Dataset, Pandas DataFrame, PyArrow Table, Polars DataFrame, NumPy Array, or list of PyArrow ParquetFile objects:
|
245
356
|
|
@@ -329,7 +440,7 @@ print("\n=== NumPy Table ===")
|
|
329
440
|
dc.read("my_numpy_table").show()
|
330
441
|
```
|
331
442
|
|
332
|
-
|
443
|
+
DeltaCAT tables also support persisting data in heterogeneous table file formats like Avro, ORC, or Feather:
|
333
444
|
|
334
445
|
```python
|
335
446
|
data = pd.DataFrame({"id": [1], "name": ["Cheshire"], "age": [3]})
|
@@ -372,9 +483,9 @@ print(pandas_df)
|
|
372
483
|
|
373
484
|
<details>
|
374
485
|
|
375
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
486
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Live Feature Enrichment</span></summary>
|
376
487
|
|
377
|
-
DeltaCAT can
|
488
|
+
DeltaCAT can update your datasets on-the-fly to keep up with a continuous stream of new insights, and support common ML use-cases like feature enrichment. Just define a table schema with one or more merge keys to start updating and deleting existing records:
|
378
489
|
|
379
490
|
```python
|
380
491
|
import deltacat as dc
|
@@ -385,53 +496,50 @@ import tempfile
|
|
385
496
|
# Initialize DeltaCAT with a fresh temporary catalog
|
386
497
|
dc.init_local(tempfile.mkdtemp())
|
387
498
|
|
388
|
-
#
|
389
|
-
|
499
|
+
# Start with minimal schema - just user_id as merge key and name
|
500
|
+
initial_schema = dc.Schema.of([
|
390
501
|
dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
|
391
502
|
dc.Field.of(pa.field("name", pa.string())),
|
392
|
-
dc.Field.of(pa.field("age", pa.int32())),
|
393
|
-
dc.Field.of(pa.field("status", pa.string())),
|
394
503
|
])
|
395
504
|
|
396
|
-
# Initial user data
|
505
|
+
# Initial user data - just basic info
|
397
506
|
initial_users = pd.DataFrame({
|
398
507
|
"user_id": [1, 2, 3],
|
399
|
-
"name": ["
|
400
|
-
"age": [3, 7, 2],
|
401
|
-
"status": ["active", "active", "inactive"]
|
508
|
+
"name": ["Jim", "Dinah", "Bob"],
|
402
509
|
})
|
403
510
|
|
404
|
-
# Write initial data with
|
405
|
-
dc.write(initial_users, "users", schema=
|
511
|
+
# Write initial data with minimal schema
|
512
|
+
dc.write(initial_users, "users", schema=initial_schema)
|
406
513
|
|
407
|
-
# Read the data back as a Pandas DataFrame
|
514
|
+
# Read the data back as a Pandas DataFrame
|
408
515
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
409
|
-
print("=== Initial Users ===")
|
516
|
+
print("=== Initial Users (Basic Info) ===")
|
410
517
|
print(df.sort_values("user_id"))
|
411
518
|
|
412
|
-
#
|
413
|
-
|
414
|
-
"user_id": [
|
415
|
-
"name": ["
|
416
|
-
"age": [
|
417
|
-
"
|
519
|
+
# Later, enrich with new insights: add age/job features + new users
|
520
|
+
enriched_data = pd.DataFrame({
|
521
|
+
"user_id": [1, 3, 4, 5, 6],
|
522
|
+
"name": ["Cheshire", "Felix", "Tom", "Simpkin", "Delta"],
|
523
|
+
"age": [3, 2, 5, 12, 4],
|
524
|
+
"job": ["Tour Guide", "Drifter", "Housekeeper", "Mouser", "Engineer"]
|
418
525
|
})
|
419
526
|
|
420
|
-
#
|
421
|
-
# 1.
|
422
|
-
# 2.
|
423
|
-
|
527
|
+
# DeltaCAT automatically evolves the schema and merges by user_id:
|
528
|
+
# 1. Enriches existing users (Jim -> Cheshire age=3, job="Tour Guide"; Bob -> Felix)
|
529
|
+
# 2. Adds new age/job columns with automatic schema evolution
|
530
|
+
# 3. Inserts new users (Tom, Simpkin, Delta) with full feature set
|
531
|
+
dc.write(enriched_data, "users")
|
424
532
|
|
425
|
-
# Read back to see
|
533
|
+
# Read back to see live feature enrichment results
|
426
534
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
427
|
-
print("\n===
|
535
|
+
print("\n=== Enriched Users (Age & Job) ===")
|
428
536
|
print(df.sort_values("user_id"))
|
429
537
|
|
430
|
-
# - Cheshire (user_id=1)
|
431
|
-
# - Dinah (user_id=2)
|
432
|
-
# - Felix (user_id=3) updated
|
433
|
-
# - New users (4,5,6)
|
434
|
-
# -
|
538
|
+
# - Cheshire (user_id=1) name updated from Jim, gets age=3, job="Tour Guide"
|
539
|
+
# - Dinah (user_id=2) keeps original name, gets null age/job (missing features)
|
540
|
+
# - Felix (user_id=3) name updated from Bob, gets age=2, job="Drifter"
|
541
|
+
# - New users (4,5,6) added with complete feature set
|
542
|
+
# - Schema automatically evolved to include age/job columns
|
435
543
|
|
436
544
|
# Specify the users to delete.
|
437
545
|
# We only need to specify matching merge key values.
|
@@ -440,7 +548,7 @@ users_to_delete = pd.DataFrame({
|
|
440
548
|
})
|
441
549
|
|
442
550
|
# Delete the records that match our merge keys.
|
443
|
-
dc.write(users_to_delete, "users",
|
551
|
+
dc.write(users_to_delete, "users", mode=dc.TableWriteMode.DELETE)
|
444
552
|
|
445
553
|
# Read the table back to confirm target users have been deleted.
|
446
554
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
@@ -456,6 +564,117 @@ print(df.sort_values("user_id"))
|
|
456
564
|
|
457
565
|
<details>
|
458
566
|
|
567
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Zero-Copy Multimodal URL Processing</span></summary>
|
568
|
+
|
569
|
+
DeltaCAT can register and process existing multimodal datasets from local or remote URLs. This enables zero-copy distributed processing of images, audio, text, and other file formats:
|
570
|
+
|
571
|
+
```python
|
572
|
+
import deltacat as dc
|
573
|
+
import pandas as pd
|
574
|
+
import pyarrow as pa
|
575
|
+
import tempfile
|
576
|
+
import ray
|
577
|
+
|
578
|
+
# Initialize DeltaCAT with a fresh temporary catalog
|
579
|
+
dc.init_local(tempfile.mkdtemp())
|
580
|
+
|
581
|
+
# Create dataset with DeltaCAT URLs pointing to existing files
|
582
|
+
urls_df = pd.DataFrame({
|
583
|
+
"file_id": [1, 2, 3, 4, 5, 6],
|
584
|
+
"url": [
|
585
|
+
# URLs with common file extensions will have their content type inferred.
|
586
|
+
"https://picsum.photos/id/237/400/300.jpg",
|
587
|
+
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
|
588
|
+
"https://raw.githubusercontent.com/SergLam/Audio-Sample-files/master/sample.mp3",
|
589
|
+
"https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
|
590
|
+
"https://raw.githubusercontent.com/microsoft/vscode/main/package.json",
|
591
|
+
# URLs without common file extensions will be read as binary by default.
|
592
|
+
"https://picsum.photos/200"
|
593
|
+
]
|
594
|
+
})
|
595
|
+
|
596
|
+
# Create empty table with merge key to efficiently add insights about each file
|
597
|
+
dc.create_table(
|
598
|
+
"multimodal_files",
|
599
|
+
schema=dc.Schema.of([
|
600
|
+
dc.Field.of(pa.field("file_id", pa.int64()), is_merge_key=True),
|
601
|
+
dc.Field.of(pa.field("url", pa.string()))
|
602
|
+
])
|
603
|
+
)
|
604
|
+
|
605
|
+
# Write URLs to DeltaCAT table
|
606
|
+
dc.write(urls_df, "multimodal_files")
|
607
|
+
|
608
|
+
# UDF to process each file in parallel using Ray Dataset map method
|
609
|
+
def analyze_file(row):
|
610
|
+
file_id = row["file_id"]
|
611
|
+
url = row["url"]
|
612
|
+
|
613
|
+
# DeltaCAT automatically infers the right Ray Data reader for the URL
|
614
|
+
dataset = dc.get(url)
|
615
|
+
records = dataset.take_all()
|
616
|
+
url_type = dc.DatastoreType.from_url(url)
|
617
|
+
|
618
|
+
# Extract standard Ray Dataset fields for each file type
|
619
|
+
if url_type == dc.DatastoreType.IMAGES:
|
620
|
+
image = records[0]["image"]
|
621
|
+
analysis = f"Image {image.shape[1]}x{image.shape[0]} pixels"
|
622
|
+
elif url_type == dc.DatastoreType.CSV:
|
623
|
+
analysis = f"CSV with {len(records)} rows, {len(records[0].keys())} columns"
|
624
|
+
elif url_type == dc.DatastoreType.AUDIO:
|
625
|
+
sample_rate = records[0]["sample_rate"]
|
626
|
+
duration = len(records[0]["amplitude"][0]) / sample_rate
|
627
|
+
analysis = f"Audio {duration:.1f}s, {sample_rate}Hz"
|
628
|
+
elif url_type == dc.DatastoreType.JSON:
|
629
|
+
analysis = f"JSON with {len(records[0].keys())} fields"
|
630
|
+
elif url_type == dc.DatastoreType.TEXT:
|
631
|
+
analysis = f"Text with {len(records)} records"
|
632
|
+
else:
|
633
|
+
analysis = f"Binary with {len(records[0]['bytes'])} bytes"
|
634
|
+
|
635
|
+
return {"file_id": file_id, "analysis": analysis}
|
636
|
+
|
637
|
+
# Read the multimodal_files table as a Ray Dataset
|
638
|
+
ray_dataset = dc.read("multimodal_files", read_as=dc.DatasetType.RAY_DATASET)
|
639
|
+
# Download and analyze each URL in parallel using map
|
640
|
+
results_dataset = ray_dataset.map(analyze_file)
|
641
|
+
|
642
|
+
# Write results back to the multimodal_files table
|
643
|
+
dc.write(results_dataset, "multimodal_files", mode=dc.TableWriteMode.MERGE)
|
644
|
+
|
645
|
+
# Read final results and compare to initial dataset
|
646
|
+
print("\n=== Initial Dataset ===")
|
647
|
+
print(dc.to_pandas(ray_dataset))
|
648
|
+
|
649
|
+
print("\n=== Final Results with Analysis ===")
|
650
|
+
print(dc.read("multimodal_files", read_as=dc.DatasetType.PANDAS))
|
651
|
+
```
|
652
|
+
|
653
|
+
The default dataset type used by `dc.get` is a Ray Dataset but, similar to `dc.read`, `dc.get` can also read URLs into other dataset types like Daft:
|
654
|
+
|
655
|
+
```python
|
656
|
+
import deltacat as dc
|
657
|
+
|
658
|
+
# Create dataset with DeltaCAT URLs pointing to existing files
|
659
|
+
urls = [
|
660
|
+
# URLs with common file extensions will have their content type inferred.
|
661
|
+
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
|
662
|
+
"https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
|
663
|
+
# URLs without common file extensions will be read as binary by default.
|
664
|
+
"https://picsum.photos/200"
|
665
|
+
]
|
666
|
+
|
667
|
+
# Download each URL into a Daft DataFrame serially
|
668
|
+
for url in urls:
|
669
|
+
dataset = dc.get(url, read_as=dc.DatasetType.DAFT)
|
670
|
+
print(f"\n=== {url} ===")
|
671
|
+
print(dataset.show())
|
672
|
+
```
|
673
|
+
|
674
|
+
</details>
|
675
|
+
|
676
|
+
<details>
|
677
|
+
|
459
678
|
<summary><span style="font-size: 1.25em; font-weight: bold;">Organizing Tables with Namespaces</span></summary>
|
460
679
|
|
461
680
|
In DeltaCAT, table **Namespaces** are optional but useful for organizing related tables within a catalog:
|
@@ -486,6 +705,10 @@ order_data = pd.DataFrame({
|
|
486
705
|
"product_id": [101, 102, 103],
|
487
706
|
"quantity": [2, 1, 2]
|
488
707
|
})
|
708
|
+
# Create identity, inventory, and sales namespaces
|
709
|
+
dc.create_namespace("identity")
|
710
|
+
dc.create_namespace("inventory")
|
711
|
+
dc.create_namespace("sales")
|
489
712
|
|
490
713
|
# Write tables to different namespaces to organize them by domain
|
491
714
|
dc.write(user_data, "users", namespace="identity")
|
@@ -511,7 +734,10 @@ finance_users = pd.DataFrame({
|
|
511
734
|
"preferred_payment_method": ["credit", "cash", "paypal"]
|
512
735
|
})
|
513
736
|
|
737
|
+
dc.create_namespace("marketing")
|
514
738
|
dc.write(marketing_users, "users", namespace="marketing")
|
739
|
+
|
740
|
+
dc.create_namespace("finance")
|
515
741
|
dc.write(finance_users, "users", namespace="finance")
|
516
742
|
|
517
743
|
# Each namespace maintains its own "users" table with different schemas
|
@@ -534,9 +760,9 @@ print(finance_df)
|
|
534
760
|
|
535
761
|
<details>
|
536
762
|
|
537
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
763
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Transactions</span></summary>
|
538
764
|
|
539
|
-
DeltaCAT transactions can span multiple tables and namespaces. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
|
765
|
+
DeltaCAT transactions can span multiple tables and namespaces. Since transaction history is maintained at the catalog level, every transaction operates against a consistent snapshot of every object in your data lake. And because all operations within a transaction either succeed or fail together, it becomes much simpler to keep related datasets in sync across your entire catalog.
|
540
766
|
|
541
767
|
Consider the previous example that organized tables with namespaces. One table tracked customer orders, and another table tracked the lifetime payments of each customer. If one table was updated but not the other, then it would result in an accounting discrepancy. This edge case can be eliminated by using multi-table transactions:
|
542
768
|
|
@@ -557,6 +783,7 @@ product_data = pd.DataFrame({
|
|
557
783
|
})
|
558
784
|
|
559
785
|
# The product catalog can be created independently.
|
786
|
+
dc.create_namespace("inventory")
|
560
787
|
dc.write(product_data, "catalog", namespace="inventory")
|
561
788
|
|
562
789
|
print(f"\n=== Initial Product Data ===")
|
@@ -583,7 +810,9 @@ finance_schema = dc.Schema.of([
|
|
583
810
|
# Create user identities and user finance data within a single transaction.
|
584
811
|
# Since transactions are atomic, this prevents accounting discrepancies.
|
585
812
|
with dc.transaction():
|
813
|
+
dc.create_namespace("identity")
|
586
814
|
dc.write(user_data, "users", namespace="identity")
|
815
|
+
dc.create_namespace("finance")
|
587
816
|
dc.write(initial_finance, "users", namespace="finance", schema=finance_schema)
|
588
817
|
|
589
818
|
print(f"\n=== Initial User Data ===")
|
@@ -602,6 +831,7 @@ new_orders = pd.DataFrame({
|
|
602
831
|
# Process new orders and update lifetime payment totals within a single transaction.
|
603
832
|
with dc.transaction():
|
604
833
|
# Step 1: Write the new orders
|
834
|
+
dc.create_namespace("sales")
|
605
835
|
dc.write(new_orders, "transactions", namespace="sales")
|
606
836
|
|
607
837
|
# Step 2: Read back transactions and products to compute actual totals
|
@@ -617,6 +847,7 @@ with dc.transaction():
|
|
617
847
|
finance_updates.columns = ["user_id", "lifetime_payments"]
|
618
848
|
|
619
849
|
# Step 4: Write the computed totals
|
850
|
+
dc.create_namespace("finance")
|
620
851
|
dc.write(finance_updates, "users", namespace="finance", mode=dc.TableWriteMode.MERGE)
|
621
852
|
|
622
853
|
# Verify that orders and lifetime payments are kept in sync.
|
@@ -630,7 +861,7 @@ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
|
|
630
861
|
|
631
862
|
<details>
|
632
863
|
|
633
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
864
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Managing Multiple Data Lakes</span></summary>
|
634
865
|
|
635
866
|
DeltaCAT lets you work with multiple catalogs in a single application. All catalogs registered with DeltaCAT are tracked by a Ray Actor to make them available to all workers in your Ray application.
|
636
867
|
|
@@ -646,16 +877,14 @@ import tempfile
|
|
646
877
|
from decimal import Decimal
|
647
878
|
|
648
879
|
# Initialize catalogs with separate names and catalog roots.
|
649
|
-
dc.init(
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
))
|
658
|
-
})
|
880
|
+
dc.init(
|
881
|
+
catalogs={
|
882
|
+
# Use temporary directory for staging
|
883
|
+
"staging": dc.Catalog(dc.CatalogProperties(tempfile.mkdtemp())),
|
884
|
+
# Use S3 for prod
|
885
|
+
"prod": dc.Catalog(dc.CatalogProperties("s3://example/deltacat"))
|
886
|
+
}
|
887
|
+
)
|
659
888
|
|
660
889
|
# Create a PyArrow table with decimal256 data
|
661
890
|
decimal_table = pa.table({
|
@@ -705,9 +934,95 @@ print(dc.read("financial_data", catalog="prod", read_as=dc.DatasetType.PANDAS))
|
|
705
934
|
|
706
935
|
<details>
|
707
936
|
|
708
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
937
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Sharing & Portability</span></summary>
|
938
|
+
|
939
|
+
DeltaCAT catalogs are self-contained directories on a filesystem, so you can easily share your data lake with others. A local catalog on your laptop can be compressed and sent anywhere. A cloud catalog in S3, GCS, or Azure Blob Storage can be shared via URL. The read/write permissions of your catalog are the read/write permissions of your filesystem.
|
940
|
+
|
941
|
+
For example, you can zip up your local catalog and upload it to S3 via:
|
942
|
+
```bash
|
943
|
+
# zip a local catalog
|
944
|
+
zip -r catalog.zip .deltacat/
|
945
|
+
|
946
|
+
# copy the catalog to a cloud bucket
|
947
|
+
aws s3 cp catalog.zip s3://my-bucket/catalog.zip
|
948
|
+
```
|
949
|
+
|
950
|
+
The person you shared it with can retrieve and decompress it via:
|
951
|
+
```bash
|
952
|
+
# copy the cloud catalog to local disk
|
953
|
+
aws s3 cp s3://my-bucket/catalog.zip .
|
954
|
+
|
955
|
+
# unzip the catalog to a local directory
|
956
|
+
unzip catalog.zip -d .deltacat_copy/
|
957
|
+
```
|
958
|
+
|
959
|
+
And then initialize it together with any other catalogs they're working with:
|
960
|
+
```python
|
961
|
+
import deltacat as dc
|
962
|
+
|
963
|
+
# Initialize catalogs with separate names and catalog roots.
|
964
|
+
dc.init(
|
965
|
+
catalogs={
|
966
|
+
"original": dc.Catalog(dc.CatalogProperties(".deltacat")),
|
967
|
+
"copy": dc.Catalog(dc.CatalogProperties(".deltacat_copy")),
|
968
|
+
"prod_aws": dc.Catalog(dc.CatalogProperties("s3://prod/deltacat")),
|
969
|
+
"prod_gcp": dc.Catalog(dc.CatalogProperties("gs://prod/deltacat")),
|
970
|
+
"prod_azure": dc.Catalog(dc.CatalogProperties("az://prod/deltacat")),
|
971
|
+
}
|
972
|
+
)
|
973
|
+
|
974
|
+
# List all namespaces in the original catalog
|
975
|
+
namespaces = dc.list("dc://original")
|
976
|
+
print([namespace.name for namespace in namespaces])
|
977
|
+
|
978
|
+
# List all namespaces in the copy catalog
|
979
|
+
namespaces = dc.list("dc://copy")
|
980
|
+
print([namespace.name for namespace in namespaces])
|
981
|
+
|
982
|
+
# List all tables in the default namespace of the original catalog
|
983
|
+
tables = dc.list("dc://original/default")
|
984
|
+
print([table.name for table in tables])
|
985
|
+
|
986
|
+
# List all tables in the default namespace of the copy catalog
|
987
|
+
tables = dc.list("dc://copy/default")
|
988
|
+
print([table.name for table in tables])
|
989
|
+
```
|
990
|
+
|
991
|
+
`dc.copy` can also be used to copy namespaces and tables between catalogs:
|
992
|
+
```python
|
993
|
+
# Copy the "default" namespace from the original local catalog over to the "myspace" namespace in the copy catalog
|
994
|
+
dc.copy("dc://original/default", "dc://copy/default/myspace")
|
995
|
+
|
996
|
+
# By default, no tables are copied from the source namespace to the destination
|
997
|
+
tables = dc.list("dc://copy/myspace")
|
998
|
+
print(f"{len(tables)} tables in myspace.")
|
999
|
+
|
1000
|
+
# Copy the "users" table from the original local catalog over to "local_users" in the prod_aws catalog
|
1001
|
+
dc.copy("dc://original/default/users", "dc://prod_aws/default/local_users")
|
1002
|
+
|
1003
|
+
# Read the copied table back
|
1004
|
+
df = dc.read("local_users", catalog="prod_aws")
|
1005
|
+
df.show()
|
1006
|
+
|
1007
|
+
# We can also copy all tables in the default namespace using **
|
1008
|
+
dc.copy("dc://original/default/**", "dc://copy/default/myspace")
|
1009
|
+
tables = dc.list("dc://copy/myspace")
|
1010
|
+
print(f"{len(tables)} tables in myspace.")
|
1011
|
+
|
1012
|
+
# Or we can copy all namespaces from the original catalog using *
|
1013
|
+
dc.copy("dc://original/*", "dc://copy")
|
1014
|
+
namespaces = dc.list("dc://copy")
|
1015
|
+
print([namespace.name for namespace in namespaces])
|
1016
|
+
```
|
1017
|
+
|
1018
|
+
</details>
|
1019
|
+
|
1020
|
+
|
1021
|
+
<details>
|
1022
|
+
|
1023
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Time Travel</span></summary>
|
709
1024
|
|
710
|
-
DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with
|
1025
|
+
DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with catalog-level transactions, this enables consistent point-in-time views across your entire data lake.
|
711
1026
|
|
712
1027
|
```python
|
713
1028
|
import deltacat as dc
|
@@ -744,10 +1059,10 @@ initial_finance = pd.DataFrame({
|
|
744
1059
|
|
745
1060
|
# Write initial state atomically with a commit message
|
746
1061
|
with dc.transaction(commit_message="Initial data load: users, products, orders, and finance"):
|
747
|
-
dc.write(initial_users, "users", namespace="identity")
|
748
|
-
dc.write(initial_products, "catalog", namespace="inventory")
|
749
|
-
dc.write(initial_orders, "transactions", namespace="sales")
|
750
|
-
dc.write(initial_finance, "users", namespace="finance")
|
1062
|
+
dc.write(initial_users, "users", namespace="identity", auto_create_namespace=True)
|
1063
|
+
dc.write(initial_products, "catalog", namespace="inventory", auto_create_namespace=True)
|
1064
|
+
dc.write(initial_orders, "transactions", namespace="sales", auto_create_namespace=True)
|
1065
|
+
dc.write(initial_finance, "users", namespace="finance", auto_create_namespace=True)
|
751
1066
|
|
752
1067
|
# Sleep briefly to ensure transaction timestamp separation
|
753
1068
|
time.sleep(0.1)
|
@@ -847,7 +1162,7 @@ print("\nTime travel validation successful!")
|
|
847
1162
|
|
848
1163
|
<summary><span style="font-size: 1.25em; font-weight: bold;">Multimodal Batch Inference</span></summary>
|
849
1164
|
|
850
|
-
DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed
|
1165
|
+
DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed predictions for each image:
|
851
1166
|
|
852
1167
|
> **Requirements**: This example requires PyTorch ≥ 2.8.0 and torchvision ≥ 0.23.0. Install via: `pip install "torch>=2.8.0" "torchvision>=0.23.0"`
|
853
1168
|
|
@@ -938,7 +1253,7 @@ final_df.show()
|
|
938
1253
|
|
939
1254
|
<summary><span style="font-size: 1.25em; font-weight: bold;">LLM Batch Inference</span></summary>
|
940
1255
|
|
941
|
-
DeltaCAT multi-table transactions, time travel
|
1256
|
+
DeltaCAT multi-table transactions, data lake time travel, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
|
942
1257
|
|
943
1258
|
```python
|
944
1259
|
import deltacat as dc
|
@@ -963,7 +1278,7 @@ daft_docs = daft_docs.with_column("content", daft_docs["path"].url.download().de
|
|
963
1278
|
# Capture basic feedback sentiment analysis in a parallel multi-table transaction
|
964
1279
|
with dc.transaction():
|
965
1280
|
# Write the full customer feedback to a new "documents" table.
|
966
|
-
dc.write(daft_docs, "documents"
|
1281
|
+
dc.write(daft_docs, "documents")
|
967
1282
|
|
968
1283
|
# Define a UDF to analyze customer feedback sentiment.
|
969
1284
|
@daft.udf(return_dtype=daft.DataType.struct({
|
@@ -1000,14 +1315,14 @@ with dc.transaction():
|
|
1000
1315
|
dc.Field.of(pa.field("confidence", pa.float64())),
|
1001
1316
|
dc.Field.of(pa.field("model_version", pa.large_string())),
|
1002
1317
|
])
|
1003
|
-
dc.write(daft_results, "insights",
|
1318
|
+
dc.write(daft_results, "insights", schema=initial_schema)
|
1004
1319
|
|
1005
1320
|
# Write to a new audit trail table.
|
1006
1321
|
audit_df = pd.DataFrame([{
|
1007
1322
|
"version": "v1.0",
|
1008
1323
|
"docs_processed": dc.dataset_length(daft_docs),
|
1009
1324
|
}])
|
1010
|
-
dc.write(audit_df, "audit"
|
1325
|
+
dc.write(audit_df, "audit")
|
1011
1326
|
|
1012
1327
|
print("=== V1.0: Customer feedback sentiment analysis processing complete! ===")
|
1013
1328
|
|
@@ -1048,9 +1363,9 @@ with dc.transaction():
|
|
1048
1363
|
)
|
1049
1364
|
|
1050
1365
|
# Merge new V2.0 insights into the existing V1.0 insights table.
|
1051
|
-
dc.write(daft_emotions, "insights"
|
1366
|
+
dc.write(daft_emotions, "insights")
|
1052
1367
|
audit_df = pd.DataFrame([{"version": "v2.0", "docs_processed": dc.dataset_length(daft_docs)}])
|
1053
|
-
dc.write(audit_df, "audit"
|
1368
|
+
dc.write(audit_df, "audit")
|
1054
1369
|
|
1055
1370
|
print("=== V2.0: Customer feedback emotion analysis processing complete! ===")
|
1056
1371
|
|
@@ -1062,7 +1377,7 @@ time.sleep(0.1)
|
|
1062
1377
|
# Generate customer service responses based on emotion analysis results.
|
1063
1378
|
with dc.transaction():
|
1064
1379
|
# First, read the current insights table with emotion analysis
|
1065
|
-
current_insights = dc.read("insights"
|
1380
|
+
current_insights = dc.read("insights")
|
1066
1381
|
|
1067
1382
|
# Define a UDF to generate customer service responses based on analysis results.
|
1068
1383
|
@daft.udf(return_dtype=daft.DataType.struct({
|
@@ -1109,39 +1424,39 @@ with dc.transaction():
|
|
1109
1424
|
)
|
1110
1425
|
# Merge new V3.0 responses into the existing V2.0 insights table.
|
1111
1426
|
# The new response columns are automatically joined by document ID.
|
1112
|
-
dc.write(daft_responses, "insights"
|
1427
|
+
dc.write(daft_responses, "insights")
|
1113
1428
|
audit_df = pd.DataFrame([{"version": "v3.0", "docs_processed": dc.dataset_length(current_insights)}])
|
1114
|
-
dc.write(audit_df, "audit"
|
1429
|
+
dc.write(audit_df, "audit")
|
1115
1430
|
|
1116
1431
|
print("=== V3.0: Customer service response generation processing complete! ===")
|
1117
1432
|
|
1118
1433
|
print("\n=== Time Travel Comparison of all Versions ===")
|
1119
1434
|
with dc.transaction(as_of=checkpoint_v1):
|
1120
1435
|
print(f"== V1.0 Insights (sentiment) ==")
|
1121
|
-
print(dc.read("insights"
|
1436
|
+
print(dc.read("insights").show())
|
1122
1437
|
print(f"== V1.0 Audit ==")
|
1123
|
-
print(dc.read("audit"
|
1438
|
+
print(dc.read("audit").show())
|
1124
1439
|
|
1125
1440
|
with dc.transaction(as_of=checkpoint_v2):
|
1126
1441
|
print(f"== V2.0 Insights (emotion) ==")
|
1127
|
-
print(dc.read("insights"
|
1442
|
+
print(dc.read("insights").show())
|
1128
1443
|
print(f"== V2.0 Audit ==")
|
1129
|
-
print(dc.read("audit"
|
1444
|
+
print(dc.read("audit").show())
|
1130
1445
|
|
1131
|
-
v3_results = dc.read("insights"
|
1446
|
+
v3_results = dc.read("insights")
|
1132
1447
|
print(f"== V3.0 Insights (customer service response) ==")
|
1133
|
-
print(dc.read("insights"
|
1448
|
+
print(dc.read("insights").show())
|
1134
1449
|
print(f"== V3.0 Audit ==")
|
1135
|
-
print(dc.read("audit"
|
1450
|
+
print(dc.read("audit").show())
|
1136
1451
|
```
|
1137
1452
|
|
1138
1453
|
</details>
|
1139
1454
|
|
1140
1455
|
## Runtime Environment Requirements
|
1141
1456
|
|
1142
|
-
DeltaCAT's transaction system assumes that the host machine provides strong system clock accuracy guarantees, and that the filesystem hosting the catalog root directory offers strong consistency.
|
1457
|
+
DeltaCAT's transaction system assumes that the host machine provides strong system clock accuracy guarantees, and that the filesystem hosting the catalog root directory offers strong read-after-write consistency.
|
1143
1458
|
|
1144
|
-
Taken together, these requirements make DeltaCAT suitable for production use on most major cloud computing hosts (e.g., EC2, GCE, Azure VMs) and storage systems (e.g., S3, GCS, Azure Blob Storage), but local laptops should typically be limited to testing/experimental purposes.
|
1459
|
+
Taken together, these requirements make DeltaCAT suitable for production use on most major cloud computing hosts (e.g., EC2, GCE, Azure VMs) and storage systems (e.g., S3, GCS, Azure Blob Storage), but local laptops should typically be limited to testing/experimental purposes (e.g., due to potential system clock drift).
|
1145
1460
|
|
1146
1461
|
## Additional Resources
|
1147
1462
|
### Table Documentation
|