deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/model/catalog.py

```diff
@@ -1,47 +1,129 @@
 # Allow self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-
+import logging
+from types import ModuleType
 
+from typing import Any, Dict, List, Optional, Union
+from functools import partial
 import ray
 
-from deltacat
+from deltacat import logs
+from deltacat.annotations import ExperimentalAPI
+from deltacat.catalog.main import impl as DeltacatCatalog
+from deltacat.catalog.iceberg import impl as IcebergCatalog
+from deltacat.catalog import CatalogProperties
+from deltacat.catalog.iceberg import IcebergCatalogConfig
+from deltacat.constants import DEFAULT_CATALOG
 
-all_catalogs: Optional[
+all_catalogs: Optional[ray.actor.ActorHandle] = None
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 class Catalog:
-    def __init__(self, impl=
+    def __init__(self, impl: ModuleType = DeltacatCatalog, *args, **kwargs):
+        """
+        Constructor for a Catalog.
+
+        The args and kwargs here will be plumbed through to the catalog initialize function, and the results
+        are stored in Catalog.inner. Any state which is required (like: metastore root URI, pyiceberg native catalog)
+        MUST be returned by initialize.
+
+        Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
+        Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
+        See `ray.util.register_serializer` in Catalogs class.
+        """
+        if not isinstance(self, Catalog):
+            # self may contain the tuple returned from __reduce__ (ray pickle bug?)
+            if callable(self[0]) and isinstance(self[1], tuple):
+                logger.info(f"Invoking {self[0]} with positional args: {self[1]}")
+                return self[0](*self[1])
+            else:
+                err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
+                raise RuntimeError(err_msg)
+
         self._impl = impl
-        self._impl.initialize(*args, **kwargs)
+        self._inner = self._impl.initialize(*args, **kwargs)
+        self._args = args
+        self._kwargs = kwargs
+
+    @classmethod
+    @ExperimentalAPI
+    def iceberg(cls, config: IcebergCatalogConfig, *args, **kwargs):
+        """
+        !!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
+
+        Factory method to construct a catalog from Iceberg catalog params
+
+        This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
+        plumbing __params__ through as kwargs
+        """
+        return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
+
+    @classmethod
+    def default(cls, config: CatalogProperties, *args, **kwargs):
+        """
+        Factory method to construct a catalog with the default implementation
+
+        Uses CatalogProperties as configuration
+        """
+        return cls(impl=DeltacatCatalog, *args, **{"config": config, **kwargs})
 
     @property
     def impl(self):
         return self._impl
 
+    @property
+    def inner(self) -> Optional[Any]:
+        return self._inner
+
+    # support pickle, copy, deepcopy, etc.
+    def __reduce__(self):
+        # instantiated catalogs may fail to pickle, so exclude _inner
+        # (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
+        return partial(self.__class__, **self._kwargs), (self._impl, *self._args)
+
+    def __str__(self):
+        string_rep = f"{self.__class__.__name__}("
+        if self._args:
+            string_rep += f"args={self._args}, "
+        if self._kwargs:
+            string_rep += f"kwargs={self._kwargs}, "
+        if self._inner:
+            string_rep += f"inner={self._inner})"
+        return string_rep
+
+    def __repr__(self):
+        return self.__str__()
+
 
 @ray.remote
 class Catalogs:
     def __init__(
         self,
-        catalogs: Dict[str, Catalog],
-
+        catalogs: Union[Catalog, Dict[str, Catalog]],
+        default: Optional[str] = None,
         *args,
         **kwargs,
     ):
-        if
+        if default and default not in catalogs:
             raise ValueError(
-                f"Catalog {
-                f"in catalogs to register: {catalogs}"
+                f"Catalog {default} not found " f"in catalogs to register: {catalogs}"
             )
         if not catalogs:
             raise ValueError(
                 f"No catalogs given to register. "
                 f"Please specify one or more catalogs."
             )
+
+        # if user only provides single Catalog, override it to be a map with default key
+        if isinstance(catalogs, Catalog):
+            catalogs = {DEFAULT_CATALOG: catalogs}
+
         self.catalogs: Dict[str, Catalog] = catalogs
-        if
-            self.default_catalog = self.catalogs[
+        if default:
+            self.default_catalog = self.catalogs[default]
         elif len(catalogs) == 1:
             self.default_catalog = list(self.catalogs.values())[0]
         else:
@@ -53,8 +135,10 @@ class Catalogs:
     def names(self) -> List[str]:
         return list(self.catalogs.keys())
 
-    def put(self, name: str, catalog: Catalog) -> None:
+    def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
         self.catalogs[name] = catalog
+        if set_default:
+            self.default_catalog = catalog
 
     def get(self, name) -> Catalog:
         return self.catalogs.get(name)
@@ -63,21 +147,144 @@ class Catalogs:
         return self.default_catalog
 
 
+def is_initialized(*args, **kwargs) -> bool:
+    """
+    Check if DeltaCAT is initialized
+    """
+    global all_catalogs
+
+    # If ray is not initialized, then Catalogs cannot be initialized
+    if not ray.is_initialized():
+        # Any existing actor reference stored in catalog_module must be stale - reset it
+        all_catalogs = None
+        return False
+
+    return all_catalogs is not None
+
+
 def init(
-    catalogs: Dict[str, Catalog],
-
+    catalogs: Union[Dict[str, Catalog], Catalog],
+    default: Optional[str] = None,
     ray_init_args: Dict[str, Any] = None,
     *args,
+    force_reinitialize=False,
     **kwargs,
 ) -> None:
+    """
+    Initialize DeltaCAT catalogs.
 
-
+    :param catalogs: Either a single Catalog instance or a map of string to Catalog instance
+    :param default: The Catalog to use by default. If only one Catalog is provided, it will
+        be set as the default
+    :param ray_init_args: kwargs to pass to ray initialization
+    :param force_reinitialize: if True, force the reinitialization of Ray. If false, will do nothing if ray already initialized
+    """
+    global all_catalogs
+
+    if is_initialized() and not force_reinitialize:
+        logger.warning("DeltaCAT already initialized.")
+        return
+    else:
         if ray_init_args:
             ray.init(**ray_init_args)
         else:
-            ray.init(
+            ray.init()
 
-
-
-
+    # register custom serializer for catalogs since these may contain
+    # unserializable objects like boto3 clients with SSLContext
+    ray.util.register_serializer(
+        Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
     )
+    all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
+
+
+def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
+    """
+    Get a catalog by name, or the default catalog if no name is provided.
+
+    Args:
+        name: Name of catalog to retrieve (optional, uses default if not provided)
+
+    Returns:
+        The requested Catalog, or ValueError if it does not exist
+    """
+    global all_catalogs
+
+    if not all_catalogs:
+        raise ValueError(
+            "No catalogs available! Call "
+            "`deltacat.init(catalogs={...})` to register one or more "
+            "catalogs then retry."
+        )
+
+    if name is not None:
+        catalog = ray.get(all_catalogs.get.remote(name))
+        if not catalog:
+            available_catalogs = ray.get(all_catalogs.all.remote()).values()
+            raise ValueError(
+                f"Catalog '{name}' not found. Available catalogs: "
+                f"{available_catalogs}."
+            )
+        return catalog
+
+    else:
+        catalog = ray.get(all_catalogs.default.remote())
+        if not catalog:
+            available_catalogs = ray.get(all_catalogs.all.remote()).values()
+            raise ValueError(
+                f"Call to get_catalog without name set failed because there is no default Catalog set. Available catalogs: "
+                f"{available_catalogs}."
+            )
+        return catalog
+
+
+def put_catalog(
+    name: str,
+    catalog: Catalog = None,
+    *,
+    default: bool = False,
+    ray_init_args: Dict[str, Any] = None,
+    fail_if_exists: bool = False,
+    **kwargs,
+) -> None:
+    """
+    Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.
+
+    Args:
+        name: name of catalog
+        catalog: catalog instance to use, if provided
+        default: Make this the default catalog if multiple catalogs are available.
+            ignored if this is the only catalog available, since it will always be the default catalog.
+        ray_init_args: ray initialization args (used only if ray not already initialized)
+        fail_if_exists: if True, raises KeyError if the catalog name already exists. Otherwise, overwrite catalog
+    """
+    global all_catalogs
+
+    # Initialize, if necessary
+    if not is_initialized():
+        # NOTE - since we are initializing with a single catalog, it will be set to the default
+        if not default:
+            logger.info(
+                f"Calling put_catalog with set_as_default=False, "
+                f"but still setting Catalog {catalog} as default since it is the only catalog."
+            )
+        init({name: catalog}, ray_init_args=ray_init_args)
+        return
+
+    # Fail if fail_if_exists and catalog already exists
+    if fail_if_exists:
+        catalog_already_exists = False
+        try:
+            get_catalog(name)
+            # Note - need to set state catalog_already_exists and throw ValueError later, or else it will be
+            # caught in the except block which is meant to catch the ValueError from get_catalog
+            catalog_already_exists = True
+        except ValueError:
+            pass
+        if catalog_already_exists:
+            raise ValueError(
+                f"Failed to put catalog {name} because it already exists and fail_if_exists={fail_if_exists}"
+            )
+
+    # Add the catalog (which may overwrite existing if fail_if_exists=False)
+    ray.get(all_catalogs.put.remote(name, catalog, default))
```
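The hunk above replaces the 1.x single-catalog initializer with a Ray-actor-backed registry of named catalogs. The sketch below shows one way that API could be used; it is illustrative only, and the import paths (inferred from the changed-file list above) and local root directories are assumptions, not part of this diff.

```python
# Minimal usage sketch of the 2.0 catalog registry shown in the hunk above.
# Assumptions: these symbols are importable from the module paths in the file
# summary, and a local directory is an acceptable catalog root.
from deltacat.catalog.model.catalog import Catalog, init, get_catalog, put_catalog
from deltacat.catalog.model.properties import CatalogProperties

# Register a single default catalog; a lone Catalog is wrapped into a
# {DEFAULT_CATALOG: catalog} map by the Catalogs constructor.
init(catalogs=Catalog.default(CatalogProperties(root=".deltacat")))

# Retrieve the default catalog back from the Ray actor holding the registry.
catalog = get_catalog()

# Register a second, named catalog; overwrites silently unless fail_if_exists=True.
put_catalog("analytics", Catalog.default(CatalogProperties(root="/tmp/analytics")))
```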
deltacat/catalog/model/properties.py

```diff
@@ -0,0 +1,116 @@
+from __future__ import annotations
+from typing import Optional, Any
+
+import pyarrow
+from deltacat.constants import DELTACAT_ROOT
+
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+
+
+def get_catalog_properties(
+    *args,
+    catalog: Optional[CatalogProperties] = None,
+    inner: Optional[CatalogProperties] = None,
+    **kwargs,
+) -> CatalogProperties:
+    """
+    Helper function to fetch CatalogProperties instance. You are meant to call this by providing your functions
+    kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
+
+    This will look for a CatalogProperty value in the kwargs "catalog" or "inner". If these are found, it returns
+    the CatalogProperty value under that kwarg. Otherwise, it will pass through kwargs to the CatalogProperties
+    constructor.
+    """
+    properties = catalog if catalog is not None else inner
+    if properties is not None and isinstance(properties, CatalogProperties):
+        return properties
+    elif properties is not None and not isinstance(properties, CatalogProperties):
+        raise ValueError(
+            f"Expected catalog properties of type {CatalogProperties.__name__} "
+            f"but found {type(properties)}."
+        )
+    else:
+        return CatalogProperties(**kwargs)
+
+
+class CatalogProperties:
+    """
+    DeltaCAT catalog properties used to deterministically resolve a durable
+    DeltaCAT catalog instance. Properties are set from system environment
+    variables unless explicit overrides are provided during initialization.
+
+    Catalog and storage APIs rely on the property catalog to retrieve durable state about the catalog they're
+    working against.
+
+    Attributes:
+        root (str): URI string The root path where catalog metadata and data
+            files are stored. Root is determined (in prededence order) by:
+            1. check kwargs for "root"
+            2. check env variable "DELTACAT_ROOT"
+            3. default to ${cwd}/.deltacat
+
+        filesystem: The filesystem implementation that should be used for
+            reading/writing files. If None, a filesystem will be inferred from
+            the catalog root path.
+
+        storage: Storage class implementation (overrides default filesystem storage impl)
+    """
+
+    def __init__(
+        self,
+        root: Optional[str] = None,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        storage=None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialize a CatalogProperties instance.
+
+        Args:
+            root: A single directory path that serves as the catalog root dir.
+            filesystem: The filesystem implementation that should be used for
+                reading these files. If None, a filesystem will be inferred.
+                If not None, the provided filesystem will still be validated
+                against the provided path to ensure compatibility.
+        """
+        # set root, using precedence rules described in pydoc
+        if root is None:
+            # Check environment variables
+            # This is set or defaulted in constants.py
+            root = DELTACAT_ROOT
+            if root is None:
+                raise ValueError(
+                    "Expected environment variable DELTACAT_ROOT to be set or defaulted"
+                )
+
+        resolved_root, resolved_filesystem = resolve_path_and_filesystem(
+            path=root,
+            filesystem=filesystem,
+        )
+        self._root = resolved_root
+        self._filesystem = resolved_filesystem
+        self._storage = storage
+
+    @property
+    def root(self) -> str:
+        return self._root
+
+    @property
+    def filesystem(self) -> Optional[pyarrow.fs.FileSystem]:
+        return self._filesystem
+
+    @property
+    def storage(self) -> Optional[Any]:
+        """
+        Return overridden storage impl, if any
+        """
+        return self._storage
+
+    def __str__(self):
+        return (
+            f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
+        )
+
+    def __repr__(self):
+        return self.__str__()
```
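For reference, here is a short sketch of how the new CatalogProperties class above resolves its root and filesystem. The paths are illustrative, and the import path assumes the module location given in the changed-file list.

```python
# Illustrative use of the new CatalogProperties module shown above.
from deltacat.catalog.model.properties import CatalogProperties, get_catalog_properties

# Explicit root: the path and a matching pyarrow filesystem are resolved eagerly.
props = CatalogProperties(root="/tmp/my_catalog")
print(props.root, props.filesystem)

# No root: falls back to DELTACAT_ROOT (env var, else ${cwd}/.deltacat per the docstring).
default_props = CatalogProperties()

# Helper used by catalog/storage APIs: returns an existing CatalogProperties passed
# via `catalog`/`inner`, otherwise builds one from the remaining kwargs.
assert get_catalog_properties(catalog=props) is props
resolved_from_kwargs = get_catalog_properties(root="/tmp/other_catalog")
```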
deltacat/catalog/model/table_definition.py

```diff
@@ -1,19 +1,30 @@
 # Allow self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Optional, Any
+
 from deltacat.storage import Stream, Table, TableVersion
+from deltacat.storage.model.scan.push_down import Pushdown
+from deltacat.storage.model.scan.scan_plan import ScanPlan
+from deltacat.storage.util.scan_planner import ScanPlanner
 
 
 class TableDefinition(dict):
     @staticmethod
     def of(
-        table: Table,
+        table: Table,
+        table_version: TableVersion,
+        stream: Stream,
+        native_object: Optional[Any] = None,
+        scan_planner: Optional[ScanPlanner] = None,
     ) -> TableDefinition:
         return TableDefinition(
             {
                 "table": table,
                 "tableVersion": table_version,
                 "stream": stream,
+                "nativeObject": native_object,
+                "scan_planner": scan_planner,
             }
         )
 
@@ -28,3 +39,23 @@ class TableDefinition(dict):
     @property
     def stream(self) -> Stream:
         return self["stream"]
+
+    @property
+    def native_object(self) -> Optional[Any]:
+        return self.get("nativeObject")
+
+    @property
+    def scan_planner(self) -> Optional[ScanPlanner]:
+        return self.get("scan_planner")
+
+    def create_scan_plan(self, pushdown: Optional[Pushdown] = None) -> ScanPlan:
+        if not self.scan_planner:
+            raise RuntimeError(
+                f"ScanPlanner is not initialized for table '{self.table.table_name}' "
+                f"of namespace '{self.table.namespace}'"
+            )
+        return self.scan_planner.create_scan_plan(
+            table_name=self.table.table_name,
+            namespace=self.table.namespace,
+            pushdown=pushdown,
+        )
```
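A sketch of the new scan-plan delegation added to TableDefinition above. Real Table, TableVersion, Stream, and ScanPlanner instances come from a live catalog, so the stand-in objects below (StubTable, StubScanPlanner) are hypothetical and exist only to trace the call path.

```python
# Hypothetical stand-ins illustrating TableDefinition.create_scan_plan above.
from deltacat.catalog.model.table_definition import TableDefinition


class StubTable:
    """Stand-in exposing the two attributes that create_scan_plan reads."""
    table_name = "events"
    namespace = "demo"


class StubScanPlanner:
    """Stand-in matching the keyword call made by create_scan_plan."""
    def create_scan_plan(self, *, table_name, namespace, pushdown=None):
        print(f"planning scan of {namespace}.{table_name} (pushdown={pushdown})")
        return None  # a real planner returns a ScanPlan


table_def = TableDefinition.of(
    table=StubTable(),
    table_version=None,  # placeholders; real values come from the catalog
    stream=None,
    scan_planner=StubScanPlanner(),
)
plan = table_def.create_scan_plan()  # delegates to the registered planner
```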
deltacat/compute/compactor/model/compaction_session_audit_info.py

```diff
@@ -8,6 +8,7 @@ from typing import List, Union
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.resources import ClusterUtilization, get_size_of_object_in_bytes
 from deltacat.compute.compactor import PyArrowWriteResult
@@ -670,13 +671,13 @@ class CompactionSessionAuditInfo(dict):
         self, output_size_bytes: float
     ) -> CompactionSessionAuditInfo:
         self["outputSizeBytes"] = output_size_bytes
-        return
+        return self
 
     def set_output_size_pyarrow_bytes(
         self, output_size_pyarrow_bytes: float
     ) -> CompactionSessionAuditInfo:
         self["outputSizePyarrowBytes"] = output_size_pyarrow_bytes
-        return
+        return self
 
     def set_total_cluster_memory_bytes(
         self, total_cluster_memory_bytes: float
@@ -787,7 +788,10 @@ class CompactionSessionAuditInfo(dict):
         self,
         step_name: str,
         task_results: Union[
-            List[HashBucketResult],
+            List[HashBucketResult],
+            List[DedupeResult],
+            List[MaterializeResult],
+            List[MergeResult],
         ],
         task_results_retrieved_at: float,
         invoke_time_in_seconds: float,
```
deltacat/compute/compactor/model/delta_annotated.py

```diff
@@ -11,10 +11,10 @@ from deltacat import logs
 from deltacat.storage import (
     Delta,
     DeltaType,
-    Manifest,
     ManifestEntry,
     ManifestEntryList,
 )
+from deltacat.storage.model.manifest import Manifest
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -107,7 +107,7 @@ class DeltaAnnotated(Delta):
         assert len(src_da_annotations) == len(src_da_entries), (
             f"Unexpected Error: Length of delta annotations "
             f"({len(src_da_annotations)}) doesn't mach the length of "
-            f"delta manifest entries ({len(src_da_entries)})."
+            f"delta manifest entries ({len(src_da_entries)})."
         )
         for i, src_entry in enumerate(src_da_entries):
             # create a new da group if src and dest has different delta locator
@@ -161,7 +161,7 @@ class DeltaAnnotated(Delta):
         assert len(src_da_annotations) == len(src_da_entries), (
             f"Unexpected Error: Length of delta annotations "
             f"({len(src_da_annotations)}) doesn't mach the length of "
-            f"delta manifest entries ({len(src_da_entries)})."
+            f"delta manifest entries ({len(src_da_entries)})."
         )
         src_da_entries_length = len(src_da_entries)
         equal_length = src_da_entries_length // pieces
```
deltacat/compute/compactor/model/delta_file_envelope.py

```diff
@@ -37,7 +37,9 @@ class DeltaFileEnvelope(dict):
             pointing to a file from the uncompacted source table, False if
             this Locator is pointing to a file in the compacted destination
             table.
-
+        file_record_count: Record count in the delta file table.
+        table_storage_strategy: The way the table object is stored in the
+            delta file envelope. If None just stores the table normally
     Returns:
         A delta file envelope.
 
```
deltacat/compute/compactor/model/delta_file_locator.py

```diff
@@ -31,9 +31,11 @@ class DeltaFileLocator(Locator, tuple):
 
         file_index: Index of the file in the Delta Manifest.
 
+        file_record_count: Count of records in the Delta File.
+
     Returns:
         delta_file_locator: The Delta File Locator Tuple as
-            (
+            (is_src_delta, stream_position, file_index, file_record_count).
     """
     return DeltaFileLocator(
         (is_src_delta, stream_position, file_index, file_record_count)
```
deltacat/compute/compactor/model/round_completion_info.py

```diff
@@ -1,7 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from typing import Tuple
+from typing import List, Tuple, Union
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from typing import Any, Dict, Optional
@@ -34,7 +34,7 @@ class RoundCompletionInfo(dict):
 
     @staticmethod
     def of(
-        high_watermark: HighWatermark,
+        high_watermark: Union[HighWatermark, int],
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
@@ -66,7 +66,7 @@ class RoundCompletionInfo(dict):
         return rci
 
     @property
-    def high_watermark(self) -> HighWatermark:
+    def high_watermark(self) -> Union[HighWatermark, int]:
         val: Dict[str, Any] = self.get("highWatermark")
         if (
             val is not None
@@ -111,7 +111,7 @@ class RoundCompletionInfo(dict):
         return self["hashBucketCount"]
 
     @property
-    def hb_index_to_entry_range(self) -> Optional[Dict[
+    def hb_index_to_entry_range(self) -> Optional[Dict[str, Tuple[int, int]]]:
         """
         The start index is inclusive and end index is exclusive by default.
         """
@@ -130,5 +130,5 @@ class RoundCompletionInfo(dict):
         return self.get("inputAverageRecordSizeBytes")
 
     @staticmethod
-    def get_audit_bucket_name_and_key(compaction_audit_url: str) ->
+    def get_audit_bucket_name_and_key(compaction_audit_url: str) -> List[str]:
         return compaction_audit_url.replace("s3://", "").split("/", 1)
```
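The final hunk above only tightens the return annotation of get_audit_bucket_name_and_key to List[str]; the underlying split behaviour is unchanged and easy to see in isolation (the URL below is made up):

```python
# Stand-alone illustration of the bucket/key split performed by
# get_audit_bucket_name_and_key.
url = "s3://my-bucket/compaction/audits/run-1.json"
bucket, key = url.replace("s3://", "").split("/", 1)
assert (bucket, key) == ("my-bucket", "compaction/audits/run-1.json")
```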
deltacat/compute/compactor/model/table_object_store.py

```diff
@@ -4,7 +4,7 @@ from ray.types import ObjectRef
 
 from typing import Any, Union
 
-from abc import ABC, abstractmethod
+from abc import ABC, abstractmethod
 from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
 from deltacat.storage import (
     LocalTable,
@@ -15,7 +15,8 @@ LocalTableReference = Union[ObjectRef, LocalTable]
 
 
 class LocalTableStorageStrategy(ABC):
-    @
+    @property
+    @abstractmethod
     def object_store(cls) -> IObjectStore:
         pass
 
```
deltacat/compute/compactor/repartition_session.py

```diff
@@ -138,7 +138,7 @@ def repartition(
     )
     logger.info(f"Getting {len(repar_tasks_pending)} task results...")
     repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
-    repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
+    repar_results: List[List[Delta]] = [rp.range_deltas for rp in repar_results]
     transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
     ordered_deltas: List[Delta] = [
         i for sublist in transposed for i in sublist if i is not None
```
@@ -15,7 +15,8 @@ from deltacat.compute.compactor import (
|
|
15
15
|
DeltaFileEnvelope,
|
16
16
|
DeltaFileLocator,
|
17
17
|
)
|
18
|
-
from deltacat.storage.model.sort_key import SortKey
|
18
|
+
from deltacat.storage.model.sort_key import SortKey
|
19
|
+
from deltacat.storage import SortOrder
|
19
20
|
from deltacat.compute.compactor.model.dedupe_result import DedupeResult
|
20
21
|
from deltacat.compute.compactor.utils import system_columns as sc
|
21
22
|
from deltacat.utils.ray_utils.runtime import (
|
@@ -155,15 +156,21 @@ def _timed_dedupe(
|
|
155
156
|
sort_keys.extend(
|
156
157
|
[
|
157
158
|
SortKey.of(
|
158
|
-
sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
|
159
|
+
[sc._PARTITION_STREAM_POSITION_COLUMN_NAME],
|
159
160
|
SortOrder.ASCENDING,
|
160
161
|
),
|
161
162
|
SortKey.of(
|
162
|
-
sc._ORDERED_FILE_IDX_COLUMN_NAME,
|
163
|
+
[sc._ORDERED_FILE_IDX_COLUMN_NAME],
|
164
|
+
SortOrder.ASCENDING,
|
163
165
|
),
|
164
166
|
]
|
165
167
|
)
|
166
|
-
table = table.take(
|
168
|
+
table = table.take(
|
169
|
+
pc.sort_indices(
|
170
|
+
table,
|
171
|
+
sort_keys=[pa_key for key in sort_keys for pa_key in key.arrow],
|
172
|
+
)
|
173
|
+
)
|
167
174
|
|
168
175
|
# drop duplicates by primary key hash column
|
169
176
|
logger.info(
|