deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +19 -15
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +18 -8
- deltacat/catalog/model/catalog.py +111 -73
- deltacat/catalog/model/properties.py +25 -22
- deltacat/compute/jobs/client.py +7 -5
- deltacat/constants.py +1 -2
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/model/shard.py +6 -2
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/types/media.py +3 -3
- deltacat/utils/daft.py +530 -4
- deltacat/utils/export.py +3 -1
- deltacat/utils/url.py +1 -1
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → examples/experimental}/__init__.py +0 -0
- /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
- /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -8,7 +8,7 @@ from deltacat.api import (
|
|
8
8
|
list,
|
9
9
|
put,
|
10
10
|
)
|
11
|
-
from deltacat.catalog
|
11
|
+
from deltacat.catalog import ( # noqa: F401
|
12
12
|
alter_namespace,
|
13
13
|
alter_table,
|
14
14
|
create_namespace,
|
@@ -27,17 +27,18 @@ from deltacat.catalog.delegate import (
|
|
27
27
|
table_exists,
|
28
28
|
truncate_table,
|
29
29
|
write_to_table,
|
30
|
-
)
|
31
|
-
from deltacat.catalog.model.catalog import ( # noqa: F401
|
32
|
-
Catalog,
|
33
|
-
Catalogs,
|
34
|
-
raise_if_not_initialized,
|
35
|
-
is_initialized,
|
36
30
|
init,
|
31
|
+
is_initialized,
|
32
|
+
clear_catalogs,
|
37
33
|
get_catalog,
|
34
|
+
get_catalog_properties,
|
35
|
+
pop_catalog,
|
38
36
|
put_catalog,
|
37
|
+
raise_if_not_initialized,
|
38
|
+
Catalog,
|
39
|
+
CatalogProperties,
|
40
|
+
TableDefinition,
|
39
41
|
)
|
40
|
-
from deltacat.catalog.model.table_definition import TableDefinition
|
41
42
|
from deltacat.compute import (
|
42
43
|
job_client,
|
43
44
|
local_job_client,
|
@@ -60,7 +61,6 @@ from deltacat.storage import (
|
|
60
61
|
SortScheme,
|
61
62
|
NullOrder,
|
62
63
|
)
|
63
|
-
from deltacat.storage.rivulet import Dataset as RivDataset, Datatype as RivDatatype
|
64
64
|
from deltacat.types.media import (
|
65
65
|
ContentEncoding,
|
66
66
|
ContentType,
|
@@ -73,7 +73,9 @@ from deltacat.utils.url import DeltaCatUrl
|
|
73
73
|
|
74
74
|
__iceberg__ = []
|
75
75
|
if importlib.util.find_spec("pyiceberg") is not None:
|
76
|
-
from deltacat.catalog.iceberg import
|
76
|
+
from deltacat.experimental.catalog.iceberg import ( # noqa: F401
|
77
|
+
impl as IcebergCatalog,
|
78
|
+
)
|
77
79
|
|
78
80
|
__iceberg__ = [
|
79
81
|
"IcebergCatalog",
|
@@ -81,7 +83,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
|
|
81
83
|
|
82
84
|
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
|
83
85
|
|
84
|
-
__version__ = "2.0.
|
86
|
+
__version__ = "2.0.0b11"
|
85
87
|
|
86
88
|
|
87
89
|
__all__ = [
|
@@ -110,12 +112,16 @@ __all__ = [
|
|
110
112
|
"default_namespace",
|
111
113
|
"write_to_table",
|
112
114
|
"read_table",
|
115
|
+
"init",
|
116
|
+
"is_initialized",
|
117
|
+
"clear_catalogs",
|
113
118
|
"get_catalog",
|
119
|
+
"get_catalog_properties",
|
120
|
+
"pop_catalog",
|
114
121
|
"put_catalog",
|
115
122
|
"raise_if_not_initialized",
|
116
|
-
"is_initialized",
|
117
|
-
"init",
|
118
123
|
"Catalog",
|
124
|
+
"CatalogProperties",
|
119
125
|
"ContentType",
|
120
126
|
"ContentEncoding",
|
121
127
|
"Dataset",
|
@@ -123,8 +129,6 @@ __all__ = [
|
|
123
129
|
"DatastoreType",
|
124
130
|
"DeltaCatUrl",
|
125
131
|
"DistributedDataset",
|
126
|
-
"RivDataset",
|
127
|
-
"RivDatatype",
|
128
132
|
"Field",
|
129
133
|
"LifecycleState",
|
130
134
|
"ListResult",
|
@@ -4,8 +4,10 @@ from contextlib import contextmanager
|
|
4
4
|
from typing import Generator, Tuple
|
5
5
|
|
6
6
|
from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
|
7
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
7
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
|
10
12
|
|
11
13
|
@contextmanager
|
@@ -1,10 +1,12 @@
|
|
1
1
|
import math
|
2
2
|
from random import shuffle
|
3
3
|
import pytest
|
4
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
5
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
6
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
7
|
-
|
4
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
5
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
6
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
7
|
+
QueryExpression,
|
8
|
+
)
|
9
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
8
10
|
from deltacat.benchmarking.benchmark_engine import BenchmarkEngine
|
9
11
|
from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkReport
|
10
12
|
from deltacat.benchmarking.benchmark_suite import BenchmarkSuite
|
deltacat/catalog/__init__.py
CHANGED
@@ -1,14 +1,71 @@
|
|
1
|
+
from deltacat.catalog.delegate import (
|
2
|
+
alter_namespace,
|
3
|
+
alter_table,
|
4
|
+
create_namespace,
|
5
|
+
create_table,
|
6
|
+
default_namespace,
|
7
|
+
drop_namespace,
|
8
|
+
drop_table,
|
9
|
+
get_namespace,
|
10
|
+
get_table,
|
11
|
+
list_namespaces,
|
12
|
+
list_tables,
|
13
|
+
namespace_exists,
|
14
|
+
read_table,
|
15
|
+
refresh_table,
|
16
|
+
rename_table,
|
17
|
+
table_exists,
|
18
|
+
truncate_table,
|
19
|
+
write_to_table,
|
20
|
+
)
|
21
|
+
from deltacat.catalog.model.catalog import ( # noqa: F401
|
22
|
+
all_catalogs,
|
23
|
+
init,
|
24
|
+
is_initialized,
|
25
|
+
clear_catalogs,
|
26
|
+
get_catalog,
|
27
|
+
pop_catalog,
|
28
|
+
put_catalog,
|
29
|
+
raise_if_not_initialized,
|
30
|
+
Catalog,
|
31
|
+
)
|
1
32
|
from deltacat.catalog.model.properties import ( # noqa: F401
|
2
33
|
CatalogProperties,
|
3
34
|
get_catalog_properties,
|
4
35
|
)
|
5
|
-
from deltacat.catalog.model.
|
6
|
-
from deltacat.catalog.main import impl as
|
36
|
+
from deltacat.catalog.model.table_definition import TableDefinition
|
37
|
+
from deltacat.catalog.main import impl as dcat
|
7
38
|
|
8
39
|
__all__ = [
|
9
|
-
"
|
40
|
+
"alter_namespace",
|
41
|
+
"alter_table",
|
42
|
+
"create_namespace",
|
43
|
+
"create_table",
|
44
|
+
"default_namespace",
|
45
|
+
"drop_namespace",
|
46
|
+
"drop_table",
|
47
|
+
"get_namespace",
|
48
|
+
"get_table",
|
49
|
+
"list_namespaces",
|
50
|
+
"list_tables",
|
51
|
+
"namespace_exists",
|
52
|
+
"read_table",
|
53
|
+
"refresh_table",
|
54
|
+
"rename_table",
|
55
|
+
"table_exists",
|
56
|
+
"truncate_table",
|
57
|
+
"write_to_table",
|
58
|
+
"all_catalogs",
|
59
|
+
"init",
|
60
|
+
"is_initialized",
|
61
|
+
"clear_catalogs",
|
62
|
+
"get_catalog",
|
10
63
|
"get_catalog_properties",
|
64
|
+
"pop_catalog",
|
65
|
+
"put_catalog",
|
66
|
+
"raise_if_not_initialized",
|
67
|
+
"dcat",
|
11
68
|
"Catalog",
|
12
|
-
"
|
13
|
-
"
|
69
|
+
"CatalogProperties",
|
70
|
+
"TableDefinition",
|
14
71
|
]
|
deltacat/catalog/main/impl.py
CHANGED
@@ -3,7 +3,7 @@ import logging
|
|
3
3
|
|
4
4
|
import deltacat as dc
|
5
5
|
|
6
|
-
from deltacat.catalog import CatalogProperties
|
6
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
7
7
|
from deltacat.exceptions import (
|
8
8
|
NamespaceAlreadyExistsError,
|
9
9
|
StreamNotFoundError,
|
@@ -42,20 +42,30 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
42
42
|
"""
|
43
43
|
Default Catalog interface implementation using DeltaCAT native storage.
|
44
44
|
|
45
|
-
|
46
|
-
|
45
|
+
The functions here should not be invoked directly, but should instead be
|
46
|
+
invoked through `delegate.py` (e.g., to support passing catalog's by name, and
|
47
|
+
to ensure that each initialized `Catalog` implementation has its `inner`
|
48
|
+
property set to the `CatalogProperties` returned from `initialize()`).
|
47
49
|
|
48
|
-
`CatalogProperties`
|
49
|
-
|
50
|
+
The `CatalogProperties` instance returned by `initialize()` contains all
|
51
|
+
durable state required to deterministically reconstruct the associated DeltaCAT
|
52
|
+
native `Catalog` implementation (e.g., the root URI for the catalog metastore).
|
50
53
|
"""
|
51
54
|
|
52
55
|
|
53
56
|
# catalog functions
|
54
|
-
def initialize(
|
57
|
+
def initialize(
|
58
|
+
config: Optional[CatalogProperties] = None,
|
59
|
+
*args,
|
60
|
+
**kwargs,
|
61
|
+
) -> CatalogProperties:
|
55
62
|
"""
|
56
|
-
|
63
|
+
Performs any required one-time initialization and validation of this
|
64
|
+
catalog implementation based on the input configuration. If no config
|
65
|
+
instance is given, a new `CatalogProperties` instance is constructed
|
66
|
+
using the given keyword arguments.
|
57
67
|
|
58
|
-
|
68
|
+
Returns the input config if given, and the newly created config otherwise.
|
59
69
|
"""
|
60
70
|
if config is not None:
|
61
71
|
if not isinstance(config, CatalogProperties):
|
@@ -9,11 +9,8 @@ from functools import partial
|
|
9
9
|
import ray
|
10
10
|
|
11
11
|
from deltacat import logs
|
12
|
-
from deltacat.
|
13
|
-
from deltacat.catalog.
|
14
|
-
from deltacat.catalog.iceberg import impl as IcebergCatalog
|
15
|
-
from deltacat.catalog import CatalogProperties
|
16
|
-
from deltacat.catalog.iceberg import IcebergCatalogConfig
|
12
|
+
from deltacat.catalog.main import impl as dcat
|
13
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
17
14
|
from deltacat.constants import DEFAULT_CATALOG
|
18
15
|
|
19
16
|
all_catalogs: Optional[ray.actor.ActorHandle] = None
|
@@ -22,14 +19,20 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
22
19
|
|
23
20
|
|
24
21
|
class Catalog:
|
25
|
-
def __init__(
|
22
|
+
def __init__(
|
23
|
+
self,
|
24
|
+
config: Optional[Union[CatalogProperties, Any]] = None,
|
25
|
+
impl: ModuleType = dcat,
|
26
|
+
*args,
|
27
|
+
**kwargs,
|
28
|
+
):
|
26
29
|
"""
|
27
30
|
Constructor for a Catalog.
|
28
31
|
|
29
|
-
Invokes `impl.initialize(*args, **kwargs)` and stores its
|
30
|
-
in the `inner` property
|
31
|
-
deterministically reconstruct this Catalog instance on any node
|
32
|
-
must
|
32
|
+
Invokes `impl.initialize(config, *args, **kwargs)` and stores its
|
33
|
+
return value in the `inner` property. This captures all state required
|
34
|
+
to deterministically reconstruct this Catalog instance on any node, and
|
35
|
+
must be pickleable by Ray cloudpickle.
|
33
36
|
"""
|
34
37
|
if not isinstance(self, Catalog):
|
35
38
|
# self may contain the tuple returned from __reduce__ (ray pickle bug?)
|
@@ -40,32 +43,15 @@ class Catalog:
|
|
40
43
|
err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
|
41
44
|
raise RuntimeError(err_msg)
|
42
45
|
|
46
|
+
self._config = config
|
43
47
|
self._impl = impl
|
44
|
-
self._inner = self._impl.initialize(*args, **kwargs)
|
48
|
+
self._inner = self._impl.initialize(config=config, *args, **kwargs)
|
45
49
|
self._args = args
|
46
50
|
self._kwargs = kwargs
|
47
51
|
|
48
|
-
@
|
49
|
-
|
50
|
-
|
51
|
-
"""
|
52
|
-
!!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
|
53
|
-
|
54
|
-
Factory method to construct a catalog from Iceberg catalog params
|
55
|
-
|
56
|
-
This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
|
57
|
-
plumbing __params__ through as kwargs
|
58
|
-
"""
|
59
|
-
return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
|
60
|
-
|
61
|
-
@classmethod
|
62
|
-
def default(cls, config: CatalogProperties, *args, **kwargs):
|
63
|
-
"""
|
64
|
-
Factory method to construct a catalog with the default implementation
|
65
|
-
|
66
|
-
Uses CatalogProperties as configuration
|
67
|
-
"""
|
68
|
-
return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})
|
52
|
+
@property
|
53
|
+
def config(self):
|
54
|
+
return self._config
|
69
55
|
|
70
56
|
@property
|
71
57
|
def impl(self):
|
@@ -79,7 +65,11 @@ class Catalog:
|
|
79
65
|
def __reduce__(self):
|
80
66
|
# instantiated catalogs may fail to pickle, so exclude _inner
|
81
67
|
# (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
|
82
|
-
return partial(self.__class__, **self._kwargs), (
|
68
|
+
return partial(self.__class__, **self._kwargs), (
|
69
|
+
self._config,
|
70
|
+
self._impl,
|
71
|
+
*self._args,
|
72
|
+
)
|
83
73
|
|
84
74
|
def __str__(self):
|
85
75
|
string_rep = f"{self.__class__.__name__}("
|
@@ -102,38 +92,62 @@ class Catalogs:
|
|
102
92
|
catalogs: Union[Catalog, Dict[str, Catalog]],
|
103
93
|
default: Optional[str] = None,
|
104
94
|
):
|
95
|
+
self._catalogs = {}
|
96
|
+
self._default_catalog_name = None
|
97
|
+
self._default_catalog = None
|
98
|
+
self.update(catalogs, default)
|
99
|
+
|
100
|
+
def all(self) -> Dict[str, Catalog]:
|
101
|
+
return self._catalogs
|
102
|
+
|
103
|
+
def update(
|
104
|
+
self,
|
105
|
+
catalogs: Union[Catalog, Dict[str, Catalog]],
|
106
|
+
default: Optional[str] = None,
|
107
|
+
) -> None:
|
105
108
|
if isinstance(catalogs, Catalog):
|
106
109
|
catalogs = {DEFAULT_CATALOG: catalogs}
|
107
110
|
elif not isinstance(catalogs, dict):
|
108
111
|
raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
|
109
|
-
self.catalogs
|
112
|
+
self._catalogs.update(catalogs)
|
110
113
|
if default:
|
111
114
|
if default not in catalogs:
|
112
115
|
raise ValueError(
|
113
116
|
f"Default catalog `{default}` not found in: {catalogs}"
|
114
117
|
)
|
115
|
-
self.
|
118
|
+
self._default_catalog = self._catalogs[default]
|
119
|
+
self._default_catalog_name = default
|
116
120
|
elif len(catalogs) == 1:
|
117
|
-
self.
|
121
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
118
122
|
else:
|
119
|
-
self.
|
120
|
-
|
121
|
-
def all(self) -> Dict[str, Catalog]:
|
122
|
-
return self.catalogs
|
123
|
+
self._default_catalog = None
|
123
124
|
|
124
125
|
def names(self) -> List[str]:
|
125
|
-
return list(self.
|
126
|
+
return list(self._catalogs.keys())
|
126
127
|
|
127
128
|
def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
|
128
|
-
self.
|
129
|
+
self._catalogs[name] = catalog
|
129
130
|
if set_default:
|
130
|
-
self.
|
131
|
+
self._default_catalog = catalog
|
131
132
|
|
132
133
|
def get(self, name) -> Optional[Catalog]:
|
133
|
-
return self.
|
134
|
+
return self._catalogs.get(name)
|
135
|
+
|
136
|
+
def pop(self, name) -> Optional[Catalog]:
|
137
|
+
catalog = self._catalogs.pop(name, None)
|
138
|
+
if catalog and self._default_catalog_name == name:
|
139
|
+
if len(self._catalogs) == 1:
|
140
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
141
|
+
else:
|
142
|
+
self._default_catalog = None
|
143
|
+
return catalog
|
144
|
+
|
145
|
+
def clear(self) -> None:
|
146
|
+
self._catalogs.clear()
|
147
|
+
self._default_catalog = None
|
134
148
|
|
135
149
|
def default(self) -> Optional[Catalog]:
|
136
|
-
return self.
|
150
|
+
return self._default_catalog
|
137
151
|
|
138
152
|
|
139
153
|
def is_initialized(*args, **kwargs) -> bool:
|
@@ -142,12 +156,9 @@ def is_initialized(*args, **kwargs) -> bool:
|
|
142
156
|
"""
|
143
157
|
global all_catalogs
|
144
158
|
|
145
|
-
# If ray is not initialized, then Catalogs cannot be initialized
|
146
159
|
if not ray.is_initialized():
|
147
|
-
# Any existing actor reference
|
160
|
+
# Any existing Catalogs actor reference must be stale - reset it
|
148
161
|
all_catalogs = None
|
149
|
-
return False
|
150
|
-
|
151
162
|
return all_catalogs is not None
|
152
163
|
|
153
164
|
|
@@ -168,9 +179,9 @@ def raise_if_not_initialized(
|
|
168
179
|
def init(
|
169
180
|
catalogs: Union[Dict[str, Catalog], Catalog] = {},
|
170
181
|
default: Optional[str] = None,
|
171
|
-
ray_init_args: Dict[str, Any] =
|
182
|
+
ray_init_args: Dict[str, Any] = {},
|
172
183
|
*,
|
173
|
-
|
184
|
+
force=False,
|
174
185
|
) -> None:
|
175
186
|
"""
|
176
187
|
Initialize DeltaCAT catalogs.
|
@@ -180,18 +191,19 @@ def init(
|
|
180
191
|
:param default: The name of the default Catalog. If only one Catalog is
|
181
192
|
provided, it will always be the default.
|
182
193
|
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
183
|
-
:param
|
194
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
195
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
196
|
+
catalogs.
|
184
197
|
"""
|
185
198
|
global all_catalogs
|
186
199
|
|
187
|
-
if is_initialized() and not
|
200
|
+
if is_initialized() and not force:
|
188
201
|
logger.warning("DeltaCAT already initialized.")
|
189
202
|
return
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
ray.init()
|
203
|
+
|
204
|
+
# initialize ray (and ignore reinitialization errors)
|
205
|
+
ray_init_args["ignore_reinit_error"] = True
|
206
|
+
ray.init(**ray_init_args)
|
195
207
|
|
196
208
|
# register custom serializer for catalogs since these may contain
|
197
209
|
# unserializable objects like boto3 clients with SSLContext
|
@@ -203,7 +215,7 @@ def init(
|
|
203
215
|
all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
|
204
216
|
|
205
217
|
|
206
|
-
def get_catalog(name: Optional[str] = None
|
218
|
+
def get_catalog(name: Optional[str] = None) -> Catalog:
|
207
219
|
"""
|
208
220
|
Get a catalog by name, or the default catalog if no name is provided.
|
209
221
|
|
@@ -241,17 +253,44 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
|
|
241
253
|
return catalog
|
242
254
|
|
243
255
|
|
256
|
+
def clear_catalogs() -> None:
|
257
|
+
"""
|
258
|
+
Clear all catalogs from the global map of named catalogs.
|
259
|
+
"""
|
260
|
+
if all_catalogs:
|
261
|
+
ray.get(all_catalogs.clear.remote())
|
262
|
+
|
263
|
+
|
264
|
+
def pop_catalog(name: str) -> Optional[Catalog]:
|
265
|
+
"""
|
266
|
+
Remove a named catalog from the global map of named catalogs.
|
267
|
+
|
268
|
+
Args:
|
269
|
+
name: Name of the catalog to remove.
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
The removed catalog, or None if not found.
|
273
|
+
"""
|
274
|
+
global all_catalogs
|
275
|
+
|
276
|
+
if not all_catalogs:
|
277
|
+
return None
|
278
|
+
catalog = ray.get(all_catalogs.pop.remote(name))
|
279
|
+
return catalog
|
280
|
+
|
281
|
+
|
244
282
|
def put_catalog(
|
245
283
|
name: str,
|
246
284
|
catalog: Catalog = None,
|
247
285
|
*,
|
248
286
|
default: bool = False,
|
249
|
-
ray_init_args: Dict[str, Any] =
|
287
|
+
ray_init_args: Dict[str, Any] = {},
|
250
288
|
fail_if_exists: bool = False,
|
251
289
|
**kwargs,
|
252
290
|
) -> Catalog:
|
253
291
|
"""
|
254
|
-
Add a named catalog to the global map of named catalogs. Initializes
|
292
|
+
Add a named catalog to the global map of named catalogs. Initializes
|
293
|
+
DeltaCAT if not already initialized.
|
255
294
|
|
256
295
|
Args:
|
257
296
|
name: Name of the catalog.
|
@@ -261,8 +300,8 @@ def put_catalog(
|
|
261
300
|
default: Make this the default catalog if multiple catalogs are
|
262
301
|
available. If only one catalog is available, it will always be the
|
263
302
|
default.
|
264
|
-
ray_init_args: Ray initialization args (used only if ray not already
|
265
|
-
initialized)
|
303
|
+
ray_init_args: Ray initialization args (used only if ray is not already
|
304
|
+
initialized).
|
266
305
|
fail_if_exists: if True, raises an error if a catalog with the given
|
267
306
|
name already exists. If False, inserts or replaces the given
|
268
307
|
catalog name.
|
@@ -276,6 +315,8 @@ def put_catalog(
|
|
276
315
|
|
277
316
|
if not catalog:
|
278
317
|
catalog = Catalog(**kwargs)
|
318
|
+
if name is None:
|
319
|
+
raise ValueError("Catalog name cannot be None")
|
279
320
|
|
280
321
|
# Initialize, if necessary
|
281
322
|
if not is_initialized():
|
@@ -283,25 +324,22 @@ def put_catalog(
|
|
283
324
|
if not default:
|
284
325
|
logger.info(
|
285
326
|
f"Calling put_catalog with set_as_default=False, "
|
286
|
-
f"but still setting Catalog {catalog} as default since it is
|
327
|
+
f"but still setting Catalog {catalog} as default since it is "
|
328
|
+
f"the only catalog."
|
287
329
|
)
|
288
330
|
init({name: catalog}, ray_init_args=ray_init_args)
|
289
|
-
return
|
331
|
+
return catalog
|
290
332
|
|
291
333
|
# Fail if fail_if_exists and catalog already exists
|
292
334
|
if fail_if_exists:
|
293
|
-
catalog_already_exists = False
|
294
335
|
try:
|
295
336
|
get_catalog(name)
|
296
|
-
# Note - need to set state catalog_already_exists and throw ValueError later, or else it will be
|
297
|
-
# caught in the except block which is meant to catch the ValueError from get_catalog
|
298
|
-
catalog_already_exists = True
|
299
337
|
except ValueError:
|
300
338
|
pass
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
339
|
+
raise ValueError(
|
340
|
+
f"Failed to put catalog {name} because it already exists and "
|
341
|
+
f"fail_if_exists={fail_if_exists}"
|
342
|
+
)
|
305
343
|
|
306
344
|
# Add the catalog (which may overwrite existing if fail_if_exists=False)
|
307
345
|
ray.get(all_catalogs.put.remote(name, catalog, default))
|
@@ -1,6 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
|
2
3
|
from typing import Optional, Any
|
3
4
|
|
5
|
+
import os
|
6
|
+
|
4
7
|
import pyarrow
|
5
8
|
from deltacat.constants import DELTACAT_ROOT
|
6
9
|
|
@@ -8,18 +11,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
|
|
8
11
|
|
9
12
|
|
10
13
|
def get_catalog_properties(
|
11
|
-
|
14
|
+
*,
|
12
15
|
catalog: Optional[CatalogProperties] = None,
|
13
16
|
inner: Optional[CatalogProperties] = None,
|
14
17
|
**kwargs,
|
15
18
|
) -> CatalogProperties:
|
16
19
|
"""
|
17
|
-
Helper function to fetch CatalogProperties instance.
|
18
|
-
kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
|
20
|
+
Helper function to fetch CatalogProperties instance.
|
19
21
|
|
20
|
-
This will look
|
21
|
-
|
22
|
-
constructor.
|
22
|
+
This will look first look for CatalogProperties in either "catalog"
|
23
|
+
or "inner" and otherwise passes all keyword arguments to the
|
24
|
+
CatalogProperties constructor.
|
23
25
|
"""
|
24
26
|
properties = catalog if catalog is not None else inner
|
25
27
|
if properties is not None and isinstance(properties, CatalogProperties):
|
@@ -39,21 +41,22 @@ class CatalogProperties:
|
|
39
41
|
DeltaCAT catalog instance. Properties are set from system environment
|
40
42
|
variables unless explicit overrides are provided during initialization.
|
41
43
|
|
42
|
-
Catalog and storage APIs rely on the property catalog to retrieve durable
|
43
|
-
working against.
|
44
|
+
Catalog and storage APIs rely on the property catalog to retrieve durable
|
45
|
+
state about the catalog they're working against.
|
44
46
|
|
45
47
|
Attributes:
|
46
|
-
root
|
47
|
-
|
48
|
-
1.
|
49
|
-
2.
|
50
|
-
3. default to
|
48
|
+
root: The root path for catalog metadata and data storage. Resolved by
|
49
|
+
searching for the root path in the following order:
|
50
|
+
1. "root" constructor input argument
|
51
|
+
2. "DELTACAT_ROOT" system environment variable
|
52
|
+
3. default to "./.deltacat/"
|
51
53
|
|
52
54
|
filesystem: The filesystem implementation that should be used for
|
53
55
|
reading/writing files. If None, a filesystem will be inferred from
|
54
56
|
the catalog root path.
|
55
57
|
|
56
|
-
storage: Storage class implementation (overrides default filesystem
|
58
|
+
storage: Storage class implementation (overrides default filesystem
|
59
|
+
storage impl)
|
57
60
|
"""
|
58
61
|
|
59
62
|
def __init__(
|
@@ -66,21 +69,21 @@ class CatalogProperties:
|
|
66
69
|
Initialize a CatalogProperties instance.
|
67
70
|
|
68
71
|
Args:
|
69
|
-
root:
|
72
|
+
root: Catalog root directory path. Uses the "DELTACAT_ROOT"
|
73
|
+
system environment variable if not set, and defaults to
|
74
|
+
"./.deltacat/" if this environment variable is not set.
|
70
75
|
filesystem: The filesystem implementation that should be used for
|
71
76
|
reading these files. If None, a filesystem will be inferred.
|
72
|
-
If
|
73
|
-
|
77
|
+
If provided, this will be validated for compatibility with the
|
78
|
+
catalog root path.
|
74
79
|
"""
|
75
80
|
# set root, using precedence rules described in pydoc
|
76
81
|
if root is None:
|
77
82
|
# Check environment variables
|
78
|
-
# This is set or defaulted in constants.py
|
79
83
|
root = DELTACAT_ROOT
|
80
|
-
if root
|
81
|
-
|
82
|
-
|
83
|
-
)
|
84
|
+
if not root:
|
85
|
+
# Default to "./.deltacat/"
|
86
|
+
root = os.path.join(os.getcwd(), ".deltacat")
|
84
87
|
|
85
88
|
resolved_root, resolved_filesystem = resolve_path_and_filesystem(
|
86
89
|
path=root,
|