deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -9,11 +9,8 @@ from functools import partial
|
|
9
9
|
import ray
|
10
10
|
|
11
11
|
from deltacat import logs
|
12
|
-
from deltacat.
|
13
|
-
from deltacat.catalog.
|
14
|
-
from deltacat.catalog.iceberg import impl as IcebergCatalog
|
15
|
-
from deltacat.catalog import CatalogProperties
|
16
|
-
from deltacat.catalog.iceberg import IcebergCatalogConfig
|
12
|
+
from deltacat.catalog.main import impl as dcat
|
13
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
17
14
|
from deltacat.constants import DEFAULT_CATALOG
|
18
15
|
|
19
16
|
all_catalogs: Optional[ray.actor.ActorHandle] = None
|
@@ -22,17 +19,20 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
22
19
|
|
23
20
|
|
24
21
|
class Catalog:
|
25
|
-
def __init__(
|
22
|
+
def __init__(
|
23
|
+
self,
|
24
|
+
config: Optional[Union[CatalogProperties, Any]] = None,
|
25
|
+
impl: ModuleType = dcat,
|
26
|
+
*args,
|
27
|
+
**kwargs,
|
28
|
+
):
|
26
29
|
"""
|
27
30
|
Constructor for a Catalog.
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
|
34
|
-
Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
|
35
|
-
See `ray.util.register_serializer` in Catalogs class.
|
32
|
+
Invokes `impl.initialize(config, *args, **kwargs)` and stores its
|
33
|
+
return value in the `inner` property. This captures all state required
|
34
|
+
to deterministically reconstruct this Catalog instance on any node, and
|
35
|
+
must be pickleable by Ray cloudpickle.
|
36
36
|
"""
|
37
37
|
if not isinstance(self, Catalog):
|
38
38
|
# self may contain the tuple returned from __reduce__ (ray pickle bug?)
|
@@ -43,32 +43,15 @@ class Catalog:
|
|
43
43
|
err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
|
44
44
|
raise RuntimeError(err_msg)
|
45
45
|
|
46
|
+
self._config = config
|
46
47
|
self._impl = impl
|
47
|
-
self._inner = self._impl.initialize(*args, **kwargs)
|
48
|
+
self._inner = self._impl.initialize(config=config, *args, **kwargs)
|
48
49
|
self._args = args
|
49
50
|
self._kwargs = kwargs
|
50
51
|
|
51
|
-
@
|
52
|
-
|
53
|
-
|
54
|
-
"""
|
55
|
-
!!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
|
56
|
-
|
57
|
-
Factory method to construct a catalog from Iceberg catalog params
|
58
|
-
|
59
|
-
This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
|
60
|
-
plumbing __params__ through as kwargs
|
61
|
-
"""
|
62
|
-
return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
|
63
|
-
|
64
|
-
@classmethod
|
65
|
-
def default(cls, config: CatalogProperties, *args, **kwargs):
|
66
|
-
"""
|
67
|
-
Factory method to construct a catalog with the default implementation
|
68
|
-
|
69
|
-
Uses CatalogProperties as configuration
|
70
|
-
"""
|
71
|
-
return cls(impl=DeltacatCatalog, *args, **{"config": config, **kwargs})
|
52
|
+
@property
|
53
|
+
def config(self):
|
54
|
+
return self._config
|
72
55
|
|
73
56
|
@property
|
74
57
|
def impl(self):
|
@@ -82,7 +65,11 @@ class Catalog:
|
|
82
65
|
def __reduce__(self):
|
83
66
|
# instantiated catalogs may fail to pickle, so exclude _inner
|
84
67
|
# (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
|
85
|
-
return partial(self.__class__, **self._kwargs), (
|
68
|
+
return partial(self.__class__, **self._kwargs), (
|
69
|
+
self._config,
|
70
|
+
self._impl,
|
71
|
+
*self._args,
|
72
|
+
)
|
86
73
|
|
87
74
|
def __str__(self):
|
88
75
|
string_rep = f"{self.__class__.__name__}("
|
@@ -104,101 +91,165 @@ class Catalogs:
|
|
104
91
|
self,
|
105
92
|
catalogs: Union[Catalog, Dict[str, Catalog]],
|
106
93
|
default: Optional[str] = None,
|
107
|
-
*args,
|
108
|
-
**kwargs,
|
109
94
|
):
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
if not catalogs:
|
115
|
-
raise ValueError(
|
116
|
-
f"No catalogs given to register. "
|
117
|
-
f"Please specify one or more catalogs."
|
118
|
-
)
|
95
|
+
self._catalogs = {}
|
96
|
+
self._default_catalog_name = None
|
97
|
+
self._default_catalog = None
|
98
|
+
self.update(catalogs, default)
|
119
99
|
|
120
|
-
|
100
|
+
def all(self) -> Dict[str, Catalog]:
|
101
|
+
return self._catalogs
|
102
|
+
|
103
|
+
def update(
|
104
|
+
self,
|
105
|
+
catalogs: Union[Catalog, Dict[str, Catalog]],
|
106
|
+
default: Optional[str] = None,
|
107
|
+
) -> None:
|
121
108
|
if isinstance(catalogs, Catalog):
|
122
109
|
catalogs = {DEFAULT_CATALOG: catalogs}
|
123
|
-
|
124
|
-
|
110
|
+
elif not isinstance(catalogs, dict):
|
111
|
+
raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
|
112
|
+
self._catalogs.update(catalogs)
|
125
113
|
if default:
|
126
|
-
|
114
|
+
if default not in catalogs:
|
115
|
+
raise ValueError(
|
116
|
+
f"Default catalog `{default}` not found in: {catalogs}"
|
117
|
+
)
|
118
|
+
self._default_catalog = self._catalogs[default]
|
119
|
+
self._default_catalog_name = default
|
127
120
|
elif len(catalogs) == 1:
|
128
|
-
self.
|
121
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
129
122
|
else:
|
130
|
-
self.
|
131
|
-
|
132
|
-
def all(self) -> Dict[str, Catalog]:
|
133
|
-
return self.catalogs
|
123
|
+
self._default_catalog = None
|
134
124
|
|
135
125
|
def names(self) -> List[str]:
|
136
|
-
return list(self.
|
126
|
+
return list(self._catalogs.keys())
|
137
127
|
|
138
128
|
def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
|
139
|
-
self.
|
140
|
-
if set_default:
|
141
|
-
self.
|
129
|
+
self._catalogs[name] = catalog
|
130
|
+
if set_default or len(self._catalogs) == 1:
|
131
|
+
self._default_catalog = catalog
|
132
|
+
|
133
|
+
def get(self, name) -> Optional[Catalog]:
|
134
|
+
return self._catalogs.get(name)
|
135
|
+
|
136
|
+
def pop(self, name) -> Optional[Catalog]:
|
137
|
+
catalog = self._catalogs.pop(name, None)
|
138
|
+
if catalog and self._default_catalog_name == name:
|
139
|
+
if len(self._catalogs) == 1:
|
140
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
141
|
+
else:
|
142
|
+
self._default_catalog = None
|
143
|
+
return catalog
|
142
144
|
|
143
|
-
def
|
144
|
-
|
145
|
+
def clear(self) -> None:
|
146
|
+
self._catalogs.clear()
|
147
|
+
self._default_catalog = None
|
145
148
|
|
146
149
|
def default(self) -> Optional[Catalog]:
|
147
|
-
return self.
|
150
|
+
return self._default_catalog
|
148
151
|
|
149
152
|
|
150
153
|
def is_initialized(*args, **kwargs) -> bool:
|
151
154
|
"""
|
152
|
-
Check if DeltaCAT is initialized
|
155
|
+
Check if DeltaCAT is initialized.
|
153
156
|
"""
|
154
157
|
global all_catalogs
|
155
158
|
|
156
|
-
# If ray is not initialized, then Catalogs cannot be initialized
|
157
159
|
if not ray.is_initialized():
|
158
|
-
# Any existing actor reference
|
160
|
+
# Any existing Catalogs actor reference must be stale - reset it
|
159
161
|
all_catalogs = None
|
160
|
-
return False
|
161
|
-
|
162
162
|
return all_catalogs is not None
|
163
163
|
|
164
164
|
|
165
|
+
def raise_if_not_initialized(
|
166
|
+
err_msg: str = "DeltaCAT is not initialized. Please call `deltacat.init()` and try again.",
|
167
|
+
) -> None:
|
168
|
+
"""
|
169
|
+
Raises a RuntimeError with the given error message if DeltaCAT is not
|
170
|
+
initialized.
|
171
|
+
|
172
|
+
:param err_msg: Custom error message to raise if DeltaCAT is not
|
173
|
+
initialized. If unspecified, the default error message is used.
|
174
|
+
"""
|
175
|
+
if not is_initialized():
|
176
|
+
raise RuntimeError(err_msg)
|
177
|
+
|
178
|
+
|
165
179
|
def init(
|
166
|
-
catalogs: Union[Dict[str, Catalog], Catalog],
|
180
|
+
catalogs: Union[Dict[str, Catalog], Catalog] = {},
|
167
181
|
default: Optional[str] = None,
|
168
|
-
ray_init_args: Dict[str, Any] =
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
) -> None:
|
182
|
+
ray_init_args: Dict[str, Any] = {},
|
183
|
+
*,
|
184
|
+
force=False,
|
185
|
+
) -> Optional[ray.runtime.BaseContext]:
|
173
186
|
"""
|
174
187
|
Initialize DeltaCAT catalogs.
|
175
188
|
|
176
|
-
:param catalogs:
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
:param
|
189
|
+
:param catalogs: A single Catalog instance or a map of catalog names to
|
190
|
+
Catalog instances.
|
191
|
+
:param default: The name of the default Catalog. If only one Catalog is
|
192
|
+
provided, it will always be the default.
|
193
|
+
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
194
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
195
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
196
|
+
catalogs.
|
197
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
181
198
|
"""
|
182
199
|
global all_catalogs
|
183
200
|
|
184
|
-
if is_initialized() and not
|
201
|
+
if is_initialized() and not force:
|
185
202
|
logger.warning("DeltaCAT already initialized.")
|
186
|
-
return
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
ray.init()
|
203
|
+
return None
|
204
|
+
|
205
|
+
# initialize ray (and ignore reinitialization errors)
|
206
|
+
ray_init_args["ignore_reinit_error"] = True
|
207
|
+
context = ray.init(**ray_init_args)
|
192
208
|
|
193
209
|
# register custom serializer for catalogs since these may contain
|
194
210
|
# unserializable objects like boto3 clients with SSLContext
|
195
211
|
ray.util.register_serializer(
|
196
212
|
Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
|
197
213
|
)
|
214
|
+
# TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
|
215
|
+
# with all catalogs from the last session
|
198
216
|
all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
|
217
|
+
return context
|
218
|
+
|
219
|
+
|
220
|
+
def init_local(
|
221
|
+
path: Optional[str] = None,
|
222
|
+
ray_init_args: Dict[str, Any] = {},
|
223
|
+
*,
|
224
|
+
force=False,
|
225
|
+
) -> Optional[ray.runtime.BaseContext]:
|
226
|
+
"""
|
227
|
+
Initialize DeltaCAT with a default local catalog.
|
228
|
+
|
229
|
+
This is a convenience function that creates a default catalog for local usage.
|
230
|
+
Equivalent to calling init(catalogs={"default": Catalog()}).
|
231
|
+
|
232
|
+
:param path: Optional path for catalog root directory. If not provided, uses
|
233
|
+
the default behavior of CatalogProperties (DELTACAT_ROOT env var or
|
234
|
+
"./.deltacat/").
|
235
|
+
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
236
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
237
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
238
|
+
catalogs.
|
239
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
240
|
+
"""
|
241
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
242
|
+
|
243
|
+
config = CatalogProperties(root=path) if path is not None else None
|
244
|
+
return init(
|
245
|
+
catalogs={"default": Catalog(config=config)},
|
246
|
+
default="default",
|
247
|
+
ray_init_args=ray_init_args,
|
248
|
+
force=force,
|
249
|
+
)
|
199
250
|
|
200
251
|
|
201
|
-
def get_catalog(name: Optional[str] = None
|
252
|
+
def get_catalog(name: Optional[str] = None) -> Catalog:
|
202
253
|
"""
|
203
254
|
Get a catalog by name, or the default catalog if no name is provided.
|
204
255
|
|
@@ -216,7 +267,6 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
|
|
216
267
|
"`deltacat.init(catalogs={...})` to register one or more "
|
217
268
|
"catalogs then retry."
|
218
269
|
)
|
219
|
-
|
220
270
|
if name is not None:
|
221
271
|
catalog = ray.get(all_catalogs.get.remote(name))
|
222
272
|
if not catalog:
|
@@ -225,17 +275,42 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
|
|
225
275
|
f"Catalog '{name}' not found. Available catalogs: "
|
226
276
|
f"{available_catalogs}."
|
227
277
|
)
|
228
|
-
return catalog
|
229
|
-
|
230
278
|
else:
|
231
279
|
catalog = ray.get(all_catalogs.default.remote())
|
232
280
|
if not catalog:
|
233
|
-
available_catalogs = ray.get(all_catalogs.all.remote()).
|
281
|
+
available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
|
234
282
|
raise ValueError(
|
235
|
-
f"Call to get_catalog without name set failed because there
|
283
|
+
f"Call to get_catalog without name set failed because there "
|
284
|
+
f"is no default Catalog set. Available catalogs: "
|
236
285
|
f"{available_catalogs}."
|
237
286
|
)
|
238
|
-
|
287
|
+
return catalog
|
288
|
+
|
289
|
+
|
290
|
+
def clear_catalogs() -> None:
    """
    Remove every catalog from the global map of named catalogs.

    Does nothing when the module-level `all_catalogs` handle is falsy.
    """
    if not all_catalogs:
        return
    # Block until the remote clear call has completed.
    ray.get(all_catalogs.clear.remote())
|
296
|
+
|
297
|
+
|
298
|
+
def pop_catalog(name: str) -> Optional[Catalog]:
    """
    Remove one named catalog from the global map of named catalogs.

    Args:
        name: Name of the catalog to remove.

    Returns:
        The removed catalog, or None if not found. None is also returned
        when the module-level `all_catalogs` handle is falsy.
    """
    global all_catalogs

    # Only issue the remote pop when a catalogs handle is available.
    return ray.get(all_catalogs.pop.remote(name)) if all_catalogs else None
|
239
314
|
|
240
315
|
|
241
316
|
def put_catalog(
|
@@ -243,48 +318,68 @@ def put_catalog(
|
|
243
318
|
catalog: Catalog = None,
|
244
319
|
*,
|
245
320
|
default: bool = False,
|
246
|
-
ray_init_args: Dict[str, Any] =
|
321
|
+
ray_init_args: Dict[str, Any] = {},
|
247
322
|
fail_if_exists: bool = False,
|
248
323
|
**kwargs,
|
249
|
-
) ->
|
324
|
+
) -> Catalog:
|
250
325
|
"""
|
251
|
-
Add a named catalog to the global map of named catalogs. Initializes
|
326
|
+
Add a named catalog to the global map of named catalogs. Initializes
|
327
|
+
DeltaCAT if not already initialized.
|
252
328
|
|
253
329
|
Args:
|
254
|
-
name:
|
255
|
-
catalog:
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
330
|
+
name: Name of the catalog.
|
331
|
+
catalog: Catalog instance to use. If none is provided, then all
|
332
|
+
additional keyword arguments will be forwarded to
|
333
|
+
`CatalogProperties` for a default DeltaCAT native Catalog.
|
334
|
+
default: Make this the default catalog if multiple catalogs are
|
335
|
+
available. If only one catalog is available, it will always be the
|
336
|
+
default.
|
337
|
+
ray_init_args: Ray initialization args (used only if ray is not already
|
338
|
+
initialized).
|
339
|
+
fail_if_exists: if True, raises an error if a catalog with the given
|
340
|
+
name already exists. If False, inserts or replaces the given
|
341
|
+
catalog name.
|
342
|
+
kwargs: Additional keyword arguments to forward to `CatalogProperties`
|
343
|
+
for a default DeltaCAT native Catalog.
|
344
|
+
|
345
|
+
Returns:
|
346
|
+
The catalog put in the named catalog map.
|
260
347
|
"""
|
261
348
|
global all_catalogs
|
262
349
|
|
350
|
+
if not catalog:
|
351
|
+
catalog = Catalog(**kwargs)
|
352
|
+
if name is None:
|
353
|
+
raise ValueError("Catalog name cannot be None")
|
354
|
+
|
263
355
|
# Initialize, if necessary
|
264
356
|
if not is_initialized():
|
265
|
-
#
|
357
|
+
# We are initializing a single catalog - make it the default
|
266
358
|
if not default:
|
267
359
|
logger.info(
|
268
360
|
f"Calling put_catalog with set_as_default=False, "
|
269
|
-
f"but still setting Catalog {catalog} as default since it is
|
361
|
+
f"but still setting Catalog {catalog} as default since it is "
|
362
|
+
f"the only catalog."
|
270
363
|
)
|
271
364
|
init({name: catalog}, ray_init_args=ray_init_args)
|
272
|
-
return
|
365
|
+
return catalog
|
273
366
|
|
274
367
|
# Fail if fail_if_exists and catalog already exists
|
275
368
|
if fail_if_exists:
|
276
|
-
catalog_already_exists = False
|
277
369
|
try:
|
278
370
|
get_catalog(name)
|
279
|
-
#
|
280
|
-
# caught in the except block which is meant to catch the ValueError from get_catalog
|
281
|
-
catalog_already_exists = True
|
282
|
-
except ValueError:
|
283
|
-
pass
|
284
|
-
if catalog_already_exists:
|
371
|
+
# If we get here, catalog exists - raise error
|
285
372
|
raise ValueError(
|
286
|
-
f"Failed to put catalog {name} because it already exists and
|
373
|
+
f"Failed to put catalog {name} because it already exists and "
|
374
|
+
f"fail_if_exists={fail_if_exists}"
|
287
375
|
)
|
376
|
+
except ValueError as e:
|
377
|
+
if "not found" not in str(e):
|
378
|
+
# Re-raise if it's not a "catalog not found" error
|
379
|
+
raise
|
380
|
+
# If catalog doesn't exist, continue normally
|
381
|
+
pass
|
288
382
|
|
289
383
|
# Add the catalog (which may overwrite existing if fail_if_exists=False)
|
290
384
|
ray.get(all_catalogs.put.remote(name, catalog, default))
|
385
|
+
return catalog
|
@@ -1,5 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
|
2
3
|
from typing import Optional, Any
|
4
|
+
import urllib.parse
|
5
|
+
|
6
|
+
import os
|
3
7
|
|
4
8
|
import pyarrow
|
5
9
|
from deltacat.constants import DELTACAT_ROOT
|
@@ -8,18 +12,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
|
|
8
12
|
|
9
13
|
|
10
14
|
def get_catalog_properties(
|
11
|
-
|
15
|
+
*,
|
12
16
|
catalog: Optional[CatalogProperties] = None,
|
13
17
|
inner: Optional[CatalogProperties] = None,
|
14
18
|
**kwargs,
|
15
19
|
) -> CatalogProperties:
|
16
20
|
"""
|
17
|
-
Helper function to fetch CatalogProperties instance.
|
18
|
-
kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
|
21
|
+
Helper function to fetch CatalogProperties instance.
|
19
22
|
|
20
|
-
This will look
|
21
|
-
|
22
|
-
constructor.
|
23
|
+
This will first look for CatalogProperties in either "catalog"
|
24
|
+
or "inner" and otherwise passes all keyword arguments to the
|
25
|
+
CatalogProperties constructor.
|
23
26
|
"""
|
24
27
|
properties = catalog if catalog is not None else inner
|
25
28
|
if properties is not None and isinstance(properties, CatalogProperties):
|
@@ -39,21 +42,22 @@ class CatalogProperties:
|
|
39
42
|
DeltaCAT catalog instance. Properties are set from system environment
|
40
43
|
variables unless explicit overrides are provided during initialization.
|
41
44
|
|
42
|
-
Catalog and storage APIs rely on the property catalog to retrieve durable
|
43
|
-
working against.
|
45
|
+
Catalog and storage APIs rely on the property catalog to retrieve durable
|
46
|
+
state about the catalog they're working against.
|
44
47
|
|
45
48
|
Attributes:
|
46
|
-
root
|
47
|
-
|
48
|
-
1.
|
49
|
-
2.
|
50
|
-
3. default to
|
49
|
+
root: The root path for catalog metadata and data storage. Resolved by
|
50
|
+
searching for the root path in the following order:
|
51
|
+
1. "root" constructor input argument
|
52
|
+
2. "DELTACAT_ROOT" system environment variable
|
53
|
+
3. default to "./.deltacat/"
|
51
54
|
|
52
55
|
filesystem: The filesystem implementation that should be used for
|
53
56
|
reading/writing files. If None, a filesystem will be inferred from
|
54
57
|
the catalog root path.
|
55
58
|
|
56
|
-
storage: Storage class implementation (overrides default filesystem
|
59
|
+
storage: Storage class implementation (overrides default filesystem
|
60
|
+
storage impl)
|
57
61
|
"""
|
58
62
|
|
59
63
|
def __init__(
|
@@ -61,28 +65,31 @@ class CatalogProperties:
|
|
61
65
|
root: Optional[str] = None,
|
62
66
|
filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
63
67
|
storage=None,
|
64
|
-
*args,
|
65
|
-
**kwargs,
|
66
68
|
):
|
67
69
|
"""
|
68
70
|
Initialize a CatalogProperties instance.
|
69
71
|
|
70
72
|
Args:
|
71
|
-
root:
|
73
|
+
root: Catalog root directory path. Uses the "DELTACAT_ROOT"
|
74
|
+
system environment variable if not set, and defaults to
|
75
|
+
"./.deltacat/" if this environment variable is not set.
|
72
76
|
filesystem: The filesystem implementation that should be used for
|
73
77
|
reading these files. If None, a filesystem will be inferred.
|
74
|
-
If
|
75
|
-
|
78
|
+
If provided, this will be validated for compatibility with the
|
79
|
+
catalog root path.
|
80
|
+
storage: DeltaCAT storage implementation override.
|
76
81
|
"""
|
77
82
|
# set root, using precedence rules described in pydoc
|
78
83
|
if root is None:
|
79
84
|
# Check environment variables
|
80
|
-
# This is set or defaulted in constants.py
|
81
85
|
root = DELTACAT_ROOT
|
82
|
-
if root
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
+
if not root:
|
87
|
+
# Default to "./.deltacat/"
|
88
|
+
root = os.path.join(os.getcwd(), ".deltacat")
|
89
|
+
|
90
|
+
# Store the original root with its scheme for reconstruction later
|
91
|
+
self._original_root = root
|
92
|
+
self._original_scheme = urllib.parse.urlparse(root).scheme
|
86
93
|
|
87
94
|
resolved_root, resolved_filesystem = resolve_path_and_filesystem(
|
88
95
|
path=root,
|
@@ -107,6 +114,38 @@ class CatalogProperties:
|
|
107
114
|
"""
|
108
115
|
return self._storage
|
109
116
|
|
117
|
+
def reconstruct_full_path(self, path: str) -> str:
    """
    Reconstruct a full path with the original scheme for external readers.

    This addresses GitHub issue #567 by ensuring that cloud storage URIs
    include the relevant scheme prefix (e.g., s3://) that some file readers
    require regardless of the filesystem being used to read the file
    (e.g., Daft).

    Args:
        path: A path relative to the catalog root or absolute path

    Returns:
        Full path with appropriate scheme prefix for external readers. The
        input is returned unchanged when it already carries a scheme or when
        the catalog root was given without one (local filesystem).
    """
    # If the path already has a scheme, return it as-is
    if urllib.parse.urlparse(path).scheme:
        return path

    scheme = self._original_scheme
    # If we don't have an original scheme (local filesystem), return as-is
    if not scheme:
        return path

    if path.startswith("/"):
        if scheme == "file":
            # A local file URI has an empty authority, so the absolute path
            # must follow "file://" intact ("file:///tmp/x"). Dropping one
            # slash would turn the first path segment into a host name.
            return f"file://{path}"
        # Absolute path - this shouldn't happen normally but handle it by
        # reusing the path's own leading slash as part of "://".
        return f"{scheme}:/{path}"

    # Relative path - prepend the original root's scheme (e.g. "s3://")
    return f"{scheme}://{path}"
|
148
|
+
|
110
149
|
def __str__(self):
|
111
150
|
return (
|
112
151
|
f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
|
deltacat/compute/__init__.py
CHANGED