deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -9,11 +9,8 @@ from functools import partial
|
|
9
9
|
import ray
|
10
10
|
|
11
11
|
from deltacat import logs
|
12
|
-
from deltacat.
|
13
|
-
from deltacat.catalog.
|
14
|
-
from deltacat.catalog.iceberg import impl as IcebergCatalog
|
15
|
-
from deltacat.catalog import CatalogProperties
|
16
|
-
from deltacat.catalog.iceberg import IcebergCatalogConfig
|
12
|
+
from deltacat.catalog.main import impl as dcat
|
13
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
17
14
|
from deltacat.constants import DEFAULT_CATALOG
|
18
15
|
|
19
16
|
all_catalogs: Optional[ray.actor.ActorHandle] = None
|
@@ -22,14 +19,20 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
22
19
|
|
23
20
|
|
24
21
|
class Catalog:
|
25
|
-
def __init__(
|
22
|
+
def __init__(
|
23
|
+
self,
|
24
|
+
config: Optional[Union[CatalogProperties, Any]] = None,
|
25
|
+
impl: ModuleType = dcat,
|
26
|
+
*args,
|
27
|
+
**kwargs,
|
28
|
+
):
|
26
29
|
"""
|
27
30
|
Constructor for a Catalog.
|
28
31
|
|
29
|
-
Invokes `impl.initialize(*args, **kwargs)` and stores its
|
30
|
-
in the `inner` property
|
31
|
-
deterministically reconstruct this Catalog instance on any node
|
32
|
-
must
|
32
|
+
Invokes `impl.initialize(config, *args, **kwargs)` and stores its
|
33
|
+
return value in the `inner` property. This captures all state required
|
34
|
+
to deterministically reconstruct this Catalog instance on any node, and
|
35
|
+
must be pickleable by Ray cloudpickle.
|
33
36
|
"""
|
34
37
|
if not isinstance(self, Catalog):
|
35
38
|
# self may contain the tuple returned from __reduce__ (ray pickle bug?)
|
@@ -40,32 +43,15 @@ class Catalog:
|
|
40
43
|
err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
|
41
44
|
raise RuntimeError(err_msg)
|
42
45
|
|
46
|
+
self._config = config
|
43
47
|
self._impl = impl
|
44
|
-
self._inner = self._impl.initialize(*args, **kwargs)
|
48
|
+
self._inner = self._impl.initialize(config=config, *args, **kwargs)
|
45
49
|
self._args = args
|
46
50
|
self._kwargs = kwargs
|
47
51
|
|
48
|
-
@
|
49
|
-
|
50
|
-
|
51
|
-
"""
|
52
|
-
!!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
|
53
|
-
|
54
|
-
Factory method to construct a catalog from Iceberg catalog params
|
55
|
-
|
56
|
-
This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
|
57
|
-
plumbing __params__ through as kwargs
|
58
|
-
"""
|
59
|
-
return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
|
60
|
-
|
61
|
-
@classmethod
|
62
|
-
def default(cls, config: CatalogProperties, *args, **kwargs):
|
63
|
-
"""
|
64
|
-
Factory method to construct a catalog with the default implementation
|
65
|
-
|
66
|
-
Uses CatalogProperties as configuration
|
67
|
-
"""
|
68
|
-
return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})
|
52
|
+
@property
|
53
|
+
def config(self):
|
54
|
+
return self._config
|
69
55
|
|
70
56
|
@property
|
71
57
|
def impl(self):
|
@@ -79,7 +65,11 @@ class Catalog:
|
|
79
65
|
def __reduce__(self):
|
80
66
|
# instantiated catalogs may fail to pickle, so exclude _inner
|
81
67
|
# (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
|
82
|
-
return partial(self.__class__, **self._kwargs), (
|
68
|
+
return partial(self.__class__, **self._kwargs), (
|
69
|
+
self._config,
|
70
|
+
self._impl,
|
71
|
+
*self._args,
|
72
|
+
)
|
83
73
|
|
84
74
|
def __str__(self):
|
85
75
|
string_rep = f"{self.__class__.__name__}("
|
@@ -102,38 +92,62 @@ class Catalogs:
|
|
102
92
|
catalogs: Union[Catalog, Dict[str, Catalog]],
|
103
93
|
default: Optional[str] = None,
|
104
94
|
):
|
95
|
+
self._catalogs = {}
|
96
|
+
self._default_catalog_name = None
|
97
|
+
self._default_catalog = None
|
98
|
+
self.update(catalogs, default)
|
99
|
+
|
100
|
+
def all(self) -> Dict[str, Catalog]:
|
101
|
+
return self._catalogs
|
102
|
+
|
103
|
+
def update(
|
104
|
+
self,
|
105
|
+
catalogs: Union[Catalog, Dict[str, Catalog]],
|
106
|
+
default: Optional[str] = None,
|
107
|
+
) -> None:
|
105
108
|
if isinstance(catalogs, Catalog):
|
106
109
|
catalogs = {DEFAULT_CATALOG: catalogs}
|
107
110
|
elif not isinstance(catalogs, dict):
|
108
111
|
raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
|
109
|
-
self.catalogs
|
112
|
+
self._catalogs.update(catalogs)
|
110
113
|
if default:
|
111
114
|
if default not in catalogs:
|
112
115
|
raise ValueError(
|
113
116
|
f"Default catalog `{default}` not found in: {catalogs}"
|
114
117
|
)
|
115
|
-
self.
|
118
|
+
self._default_catalog = self._catalogs[default]
|
119
|
+
self._default_catalog_name = default
|
116
120
|
elif len(catalogs) == 1:
|
117
|
-
self.
|
121
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
118
122
|
else:
|
119
|
-
self.
|
120
|
-
|
121
|
-
def all(self) -> Dict[str, Catalog]:
|
122
|
-
return self.catalogs
|
123
|
+
self._default_catalog = None
|
123
124
|
|
124
125
|
def names(self) -> List[str]:
|
125
|
-
return list(self.
|
126
|
+
return list(self._catalogs.keys())
|
126
127
|
|
127
128
|
def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
|
128
|
-
self.
|
129
|
-
if set_default:
|
130
|
-
self.
|
129
|
+
self._catalogs[name] = catalog
|
130
|
+
if set_default or len(self._catalogs) == 1:
|
131
|
+
self._default_catalog = catalog
|
131
132
|
|
132
133
|
def get(self, name) -> Optional[Catalog]:
|
133
|
-
return self.
|
134
|
+
return self._catalogs.get(name)
|
135
|
+
|
136
|
+
def pop(self, name) -> Optional[Catalog]:
|
137
|
+
catalog = self._catalogs.pop(name, None)
|
138
|
+
if catalog and self._default_catalog_name == name:
|
139
|
+
if len(self._catalogs) == 1:
|
140
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
141
|
+
else:
|
142
|
+
self._default_catalog = None
|
143
|
+
return catalog
|
144
|
+
|
145
|
+
def clear(self) -> None:
|
146
|
+
self._catalogs.clear()
|
147
|
+
self._default_catalog = None
|
134
148
|
|
135
149
|
def default(self) -> Optional[Catalog]:
|
136
|
-
return self.
|
150
|
+
return self._default_catalog
|
137
151
|
|
138
152
|
|
139
153
|
def is_initialized(*args, **kwargs) -> bool:
|
@@ -142,12 +156,9 @@ def is_initialized(*args, **kwargs) -> bool:
|
|
142
156
|
"""
|
143
157
|
global all_catalogs
|
144
158
|
|
145
|
-
# If ray is not initialized, then Catalogs cannot be initialized
|
146
159
|
if not ray.is_initialized():
|
147
|
-
# Any existing actor reference
|
160
|
+
# Any existing Catalogs actor reference must be stale - reset it
|
148
161
|
all_catalogs = None
|
149
|
-
return False
|
150
|
-
|
151
162
|
return all_catalogs is not None
|
152
163
|
|
153
164
|
|
@@ -168,10 +179,10 @@ def raise_if_not_initialized(
|
|
168
179
|
def init(
|
169
180
|
catalogs: Union[Dict[str, Catalog], Catalog] = {},
|
170
181
|
default: Optional[str] = None,
|
171
|
-
ray_init_args: Dict[str, Any] =
|
182
|
+
ray_init_args: Dict[str, Any] = {},
|
172
183
|
*,
|
173
|
-
|
174
|
-
) ->
|
184
|
+
force=False,
|
185
|
+
) -> Optional[ray.runtime.BaseContext]:
|
175
186
|
"""
|
176
187
|
Initialize DeltaCAT catalogs.
|
177
188
|
|
@@ -180,18 +191,20 @@ def init(
|
|
180
191
|
:param default: The name of the default Catalog. If only one Catalog is
|
181
192
|
provided, it will always be the default.
|
182
193
|
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
183
|
-
:param
|
194
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
195
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
196
|
+
catalogs.
|
197
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
184
198
|
"""
|
185
199
|
global all_catalogs
|
186
200
|
|
187
|
-
if is_initialized() and not
|
201
|
+
if is_initialized() and not force:
|
188
202
|
logger.warning("DeltaCAT already initialized.")
|
189
|
-
return
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
ray.init()
|
203
|
+
return None
|
204
|
+
|
205
|
+
# initialize ray (and ignore reinitialization errors)
|
206
|
+
ray_init_args["ignore_reinit_error"] = True
|
207
|
+
context = ray.init(**ray_init_args)
|
195
208
|
|
196
209
|
# register custom serializer for catalogs since these may contain
|
197
210
|
# unserializable objects like boto3 clients with SSLContext
|
@@ -201,9 +214,42 @@ def init(
|
|
201
214
|
# TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
|
202
215
|
# with all catalogs from the last session
|
203
216
|
all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
|
217
|
+
return context
|
204
218
|
|
205
219
|
|
206
|
-
def
|
220
|
+
def init_local(
|
221
|
+
path: Optional[str] = None,
|
222
|
+
ray_init_args: Dict[str, Any] = {},
|
223
|
+
*,
|
224
|
+
force=False,
|
225
|
+
) -> Optional[ray.runtime.BaseContext]:
|
226
|
+
"""
|
227
|
+
Initialize DeltaCAT with a default local catalog.
|
228
|
+
|
229
|
+
This is a convenience function that creates a default catalog for local usage.
|
230
|
+
Equivalent to calling init(catalogs={"default": Catalog()}).
|
231
|
+
|
232
|
+
:param path: Optional path for catalog root directory. If not provided, uses
|
233
|
+
the default behavior of CatalogProperties (DELTACAT_ROOT env var or
|
234
|
+
"./.deltacat/").
|
235
|
+
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
236
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
237
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
238
|
+
catalogs.
|
239
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
240
|
+
"""
|
241
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
242
|
+
|
243
|
+
config = CatalogProperties(root=path) if path is not None else None
|
244
|
+
return init(
|
245
|
+
catalogs={"default": Catalog(config=config)},
|
246
|
+
default="default",
|
247
|
+
ray_init_args=ray_init_args,
|
248
|
+
force=force,
|
249
|
+
)
|
250
|
+
|
251
|
+
|
252
|
+
def get_catalog(name: Optional[str] = None) -> Catalog:
|
207
253
|
"""
|
208
254
|
Get a catalog by name, or the default catalog if no name is provided.
|
209
255
|
|
@@ -232,7 +278,7 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
|
|
232
278
|
else:
|
233
279
|
catalog = ray.get(all_catalogs.default.remote())
|
234
280
|
if not catalog:
|
235
|
-
available_catalogs = ray.get(all_catalogs.all.remote()).
|
281
|
+
available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
|
236
282
|
raise ValueError(
|
237
283
|
f"Call to get_catalog without name set failed because there "
|
238
284
|
f"is no default Catalog set. Available catalogs: "
|
@@ -241,17 +287,44 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
|
|
241
287
|
return catalog
|
242
288
|
|
243
289
|
|
290
|
+
def clear_catalogs() -> None:
|
291
|
+
"""
|
292
|
+
Clear all catalogs from the global map of named catalogs.
|
293
|
+
"""
|
294
|
+
if all_catalogs:
|
295
|
+
ray.get(all_catalogs.clear.remote())
|
296
|
+
|
297
|
+
|
298
|
+
def pop_catalog(name: str) -> Optional[Catalog]:
|
299
|
+
"""
|
300
|
+
Remove a named catalog from the global map of named catalogs.
|
301
|
+
|
302
|
+
Args:
|
303
|
+
name: Name of the catalog to remove.
|
304
|
+
|
305
|
+
Returns:
|
306
|
+
The removed catalog, or None if not found.
|
307
|
+
"""
|
308
|
+
global all_catalogs
|
309
|
+
|
310
|
+
if not all_catalogs:
|
311
|
+
return None
|
312
|
+
catalog = ray.get(all_catalogs.pop.remote(name))
|
313
|
+
return catalog
|
314
|
+
|
315
|
+
|
244
316
|
def put_catalog(
|
245
317
|
name: str,
|
246
318
|
catalog: Catalog = None,
|
247
319
|
*,
|
248
320
|
default: bool = False,
|
249
|
-
ray_init_args: Dict[str, Any] =
|
321
|
+
ray_init_args: Dict[str, Any] = {},
|
250
322
|
fail_if_exists: bool = False,
|
251
323
|
**kwargs,
|
252
324
|
) -> Catalog:
|
253
325
|
"""
|
254
|
-
Add a named catalog to the global map of named catalogs. Initializes
|
326
|
+
Add a named catalog to the global map of named catalogs. Initializes
|
327
|
+
DeltaCAT if not already initialized.
|
255
328
|
|
256
329
|
Args:
|
257
330
|
name: Name of the catalog.
|
@@ -261,8 +334,8 @@ def put_catalog(
|
|
261
334
|
default: Make this the default catalog if multiple catalogs are
|
262
335
|
available. If only one catalog is available, it will always be the
|
263
336
|
default.
|
264
|
-
ray_init_args: Ray initialization args (used only if ray not already
|
265
|
-
initialized)
|
337
|
+
ray_init_args: Ray initialization args (used only if ray is not already
|
338
|
+
initialized).
|
266
339
|
fail_if_exists: if True, raises an error if a catalog with the given
|
267
340
|
name already exists. If False, inserts or replaces the given
|
268
341
|
catalog name.
|
@@ -276,6 +349,8 @@ def put_catalog(
|
|
276
349
|
|
277
350
|
if not catalog:
|
278
351
|
catalog = Catalog(**kwargs)
|
352
|
+
if name is None:
|
353
|
+
raise ValueError("Catalog name cannot be None")
|
279
354
|
|
280
355
|
# Initialize, if necessary
|
281
356
|
if not is_initialized():
|
@@ -283,25 +358,27 @@ def put_catalog(
|
|
283
358
|
if not default:
|
284
359
|
logger.info(
|
285
360
|
f"Calling put_catalog with set_as_default=False, "
|
286
|
-
f"but still setting Catalog {catalog} as default since it is
|
361
|
+
f"but still setting Catalog {catalog} as default since it is "
|
362
|
+
f"the only catalog."
|
287
363
|
)
|
288
364
|
init({name: catalog}, ray_init_args=ray_init_args)
|
289
|
-
return
|
365
|
+
return catalog
|
290
366
|
|
291
367
|
# Fail if fail_if_exists and catalog already exists
|
292
368
|
if fail_if_exists:
|
293
|
-
catalog_already_exists = False
|
294
369
|
try:
|
295
370
|
get_catalog(name)
|
296
|
-
#
|
297
|
-
# caught in the except block which is meant to catch the ValueError from get_catalog
|
298
|
-
catalog_already_exists = True
|
299
|
-
except ValueError:
|
300
|
-
pass
|
301
|
-
if catalog_already_exists:
|
371
|
+
# If we get here, catalog exists - raise error
|
302
372
|
raise ValueError(
|
303
|
-
f"Failed to put catalog {name} because it already exists and
|
373
|
+
f"Failed to put catalog {name} because it already exists and "
|
374
|
+
f"fail_if_exists={fail_if_exists}"
|
304
375
|
)
|
376
|
+
except ValueError as e:
|
377
|
+
if "not found" not in str(e):
|
378
|
+
# Re-raise if it's not a "catalog not found" error
|
379
|
+
raise
|
380
|
+
# If catalog doesn't exist, continue normally
|
381
|
+
pass
|
305
382
|
|
306
383
|
# Add the catalog (which may overwrite existing if fail_if_exists=False)
|
307
384
|
ray.get(all_catalogs.put.remote(name, catalog, default))
|
@@ -1,5 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
|
2
3
|
from typing import Optional, Any
|
4
|
+
import urllib.parse
|
5
|
+
|
6
|
+
import os
|
3
7
|
|
4
8
|
import pyarrow
|
5
9
|
from deltacat.constants import DELTACAT_ROOT
|
@@ -8,18 +12,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
|
|
8
12
|
|
9
13
|
|
10
14
|
def get_catalog_properties(
|
11
|
-
|
15
|
+
*,
|
12
16
|
catalog: Optional[CatalogProperties] = None,
|
13
17
|
inner: Optional[CatalogProperties] = None,
|
14
18
|
**kwargs,
|
15
19
|
) -> CatalogProperties:
|
16
20
|
"""
|
17
|
-
Helper function to fetch CatalogProperties instance.
|
18
|
-
kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
|
21
|
+
Helper function to fetch CatalogProperties instance.
|
19
22
|
|
20
|
-
This will look
|
21
|
-
|
22
|
-
constructor.
|
23
|
+
This will first look for CatalogProperties in either "catalog"
|
24
|
+
or "inner" and otherwise passes all keyword arguments to the
|
25
|
+
CatalogProperties constructor.
|
23
26
|
"""
|
24
27
|
properties = catalog if catalog is not None else inner
|
25
28
|
if properties is not None and isinstance(properties, CatalogProperties):
|
@@ -39,21 +42,22 @@ class CatalogProperties:
|
|
39
42
|
DeltaCAT catalog instance. Properties are set from system environment
|
40
43
|
variables unless explicit overrides are provided during initialization.
|
41
44
|
|
42
|
-
Catalog and storage APIs rely on the property catalog to retrieve durable
|
43
|
-
working against.
|
45
|
+
Catalog and storage APIs rely on the property catalog to retrieve durable
|
46
|
+
state about the catalog they're working against.
|
44
47
|
|
45
48
|
Attributes:
|
46
|
-
root
|
47
|
-
|
48
|
-
1.
|
49
|
-
2.
|
50
|
-
3. default to
|
49
|
+
root: The root path for catalog metadata and data storage. Resolved by
|
50
|
+
searching for the root path in the following order:
|
51
|
+
1. "root" constructor input argument
|
52
|
+
2. "DELTACAT_ROOT" system environment variable
|
53
|
+
3. default to "./.deltacat/"
|
51
54
|
|
52
55
|
filesystem: The filesystem implementation that should be used for
|
53
56
|
reading/writing files. If None, a filesystem will be inferred from
|
54
57
|
the catalog root path.
|
55
58
|
|
56
|
-
storage: Storage class implementation (overrides default filesystem
|
59
|
+
storage: Storage class implementation (overrides default filesystem
|
60
|
+
storage impl)
|
57
61
|
"""
|
58
62
|
|
59
63
|
def __init__(
|
@@ -66,21 +70,26 @@ class CatalogProperties:
|
|
66
70
|
Initialize a CatalogProperties instance.
|
67
71
|
|
68
72
|
Args:
|
69
|
-
root:
|
73
|
+
root: Catalog root directory path. Uses the "DELTACAT_ROOT"
|
74
|
+
system environment variable if not set, and defaults to
|
75
|
+
"./.deltacat/" if this environment variable is not set.
|
70
76
|
filesystem: The filesystem implementation that should be used for
|
71
77
|
reading these files. If None, a filesystem will be inferred.
|
72
|
-
If
|
73
|
-
|
78
|
+
If provided, this will be validated for compatibility with the
|
79
|
+
catalog root path.
|
80
|
+
storage: DeltaCAT storage implementation override.
|
74
81
|
"""
|
75
82
|
# set root, using precedence rules described in pydoc
|
76
83
|
if root is None:
|
77
84
|
# Check environment variables
|
78
|
-
# This is set or defaulted in constants.py
|
79
85
|
root = DELTACAT_ROOT
|
80
|
-
if root
|
81
|
-
|
82
|
-
|
83
|
-
|
86
|
+
if not root:
|
87
|
+
# Default to "./.deltacat/"
|
88
|
+
root = os.path.join(os.getcwd(), ".deltacat")
|
89
|
+
|
90
|
+
# Store the original root with its scheme for reconstruction later
|
91
|
+
self._original_root = root
|
92
|
+
self._original_scheme = urllib.parse.urlparse(root).scheme
|
84
93
|
|
85
94
|
resolved_root, resolved_filesystem = resolve_path_and_filesystem(
|
86
95
|
path=root,
|
@@ -105,6 +114,38 @@ class CatalogProperties:
|
|
105
114
|
"""
|
106
115
|
return self._storage
|
107
116
|
|
117
|
+
def reconstruct_full_path(self, path: str) -> str:
|
118
|
+
"""
|
119
|
+
Reconstruct a full path with the original scheme for external readers.
|
120
|
+
|
121
|
+
This addresses GitHub issue #567 by ensuring that cloud storage URIs
|
122
|
+
include the relevant scheme prefix (e.g., s3://) that some file readers
|
123
|
+
require regardless of the filesystem being used to read the file
|
124
|
+
(e.g., Daft).
|
125
|
+
|
126
|
+
Args:
|
127
|
+
path: A path relative to the catalog root or absolute path
|
128
|
+
|
129
|
+
Returns:
|
130
|
+
Full path with appropriate scheme prefix for external readers
|
131
|
+
"""
|
132
|
+
# If the path already has a scheme, return it as-is
|
133
|
+
if urllib.parse.urlparse(path).scheme:
|
134
|
+
return path
|
135
|
+
|
136
|
+
# If we don't have an original scheme (local filesystem), return as-is
|
137
|
+
if not self._original_scheme:
|
138
|
+
return path
|
139
|
+
|
140
|
+
# Reconstruct the full path with the original scheme
|
141
|
+
# Handle both absolute and relative paths
|
142
|
+
if path.startswith("/"):
|
143
|
+
# Absolute path - this shouldn't happen normally but handle it
|
144
|
+
return f"{self._original_scheme}:/{path}"
|
145
|
+
else:
|
146
|
+
# Relative path - prepend the original scheme (e.g., s3://)
|
147
|
+
return f"{self._original_scheme}://{path}"
|
148
|
+
|
108
149
|
def __str__(self):
|
109
150
|
return (
|
110
151
|
f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
|