deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/catalog/interface.py
CHANGED
@@ -1,19 +1,32 @@
|
|
1
|
-
import pyarrow as pa
|
2
|
-
from deltacat import SortKey, TableWriteMode, ContentType, ListResult, \
|
3
|
-
Namespace, LifecycleState, SchemaConsistencyType, LocalTable, \
|
4
|
-
LocalDataset, DistributedDataset, TableDefinition
|
5
1
|
from typing import Any, Dict, List, Optional, Set, Union
|
6
2
|
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from deltacat.catalog.model.table_definition import TableDefinition
|
6
|
+
from deltacat.compute.compactor.model.sort_key import SortKey
|
7
|
+
from deltacat.storage.model.list_result import ListResult
|
8
|
+
from deltacat.storage.model.namespace import Namespace
|
9
|
+
from deltacat.storage.model.types import (
|
10
|
+
DistributedDataset,
|
11
|
+
LifecycleState,
|
12
|
+
LocalDataset,
|
13
|
+
LocalTable,
|
14
|
+
SchemaConsistencyType,
|
15
|
+
)
|
16
|
+
from deltacat.types.media import ContentType
|
17
|
+
from deltacat.types.tables import TableWriteMode
|
18
|
+
|
7
19
|
|
8
20
|
# table functions
|
9
21
|
def write_to_table(
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
22
|
+
data: Union[LocalTable, LocalDataset, DistributedDataset],
|
23
|
+
table: str,
|
24
|
+
namespace: Optional[str] = None,
|
25
|
+
mode: TableWriteMode = TableWriteMode.AUTO,
|
26
|
+
content_type: ContentType = ContentType.PARQUET,
|
27
|
+
*args,
|
28
|
+
**kwargs
|
29
|
+
) -> None:
|
17
30
|
"""Write local or distributed data to a table. Raises an error if the
|
18
31
|
table does not exist and the table write mode is not CREATE or AUTO.
|
19
32
|
|
@@ -25,168 +38,136 @@ def write_to_table(
|
|
25
38
|
|
26
39
|
|
27
40
|
def read_table(
|
28
|
-
|
29
|
-
|
30
|
-
*args,
|
31
|
-
**kwargs) -> DistributedDataset:
|
41
|
+
table: str, namespace: Optional[str] = None, *args, **kwargs
|
42
|
+
) -> DistributedDataset:
|
32
43
|
"""Read a table into a distributed dataset."""
|
33
44
|
raise NotImplementedError("read_table not implemented")
|
34
45
|
|
35
46
|
|
36
47
|
def alter_table(
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
+
table: str,
|
49
|
+
namespace: Optional[str] = None,
|
50
|
+
lifecycle_state: Optional[LifecycleState] = None,
|
51
|
+
schema_updates: Optional[Dict[str, Any]] = None,
|
52
|
+
partition_updates: Optional[Dict[str, Any]] = None,
|
53
|
+
primary_keys: Optional[Set[str]] = None,
|
54
|
+
sort_keys: Optional[List[SortKey]] = None,
|
55
|
+
description: Optional[str] = None,
|
56
|
+
properties: Optional[Dict[str, str]] = None,
|
57
|
+
*args,
|
58
|
+
**kwargs
|
59
|
+
) -> None:
|
48
60
|
"""Alter table definition."""
|
49
61
|
raise NotImplementedError("alter_table not implemented")
|
50
62
|
|
51
63
|
|
52
64
|
def create_table(
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
65
|
+
table: str,
|
66
|
+
namespace: Optional[str] = None,
|
67
|
+
lifecycle_state: Optional[LifecycleState] = None,
|
68
|
+
schema: Optional[Union[pa.Schema, str, bytes]] = None,
|
69
|
+
schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
|
70
|
+
partition_keys: Optional[List[Dict[str, Any]]] = None,
|
71
|
+
primary_keys: Optional[Set[str]] = None,
|
72
|
+
sort_keys: Optional[List[SortKey]] = None,
|
73
|
+
description: Optional[str] = None,
|
74
|
+
properties: Optional[Dict[str, str]] = None,
|
75
|
+
permissions: Optional[Dict[str, Any]] = None,
|
76
|
+
content_types: Optional[List[ContentType]] = None,
|
77
|
+
replace_existing_table: bool = False,
|
78
|
+
*args,
|
79
|
+
**kwargs
|
80
|
+
) -> TableDefinition:
|
68
81
|
"""Create an empty table. Raises an error if the table already exists and
|
69
82
|
`replace_existing_table` is False."""
|
70
83
|
raise NotImplementedError("create_table not implemented")
|
71
84
|
|
72
85
|
|
73
86
|
def drop_table(
|
74
|
-
|
75
|
-
|
76
|
-
purge: bool = False,
|
77
|
-
*args,
|
78
|
-
**kwargs) -> None:
|
87
|
+
table: str, namespace: Optional[str] = None, purge: bool = False, *args, **kwargs
|
88
|
+
) -> None:
|
79
89
|
"""Drop a table from the catalog and optionally purge it. Raises an error
|
80
90
|
if the table does not exist."""
|
81
91
|
raise NotImplementedError("drop_table not implemented")
|
82
92
|
|
83
93
|
|
84
|
-
def refresh_table(
|
85
|
-
table: str,
|
86
|
-
namespace: Optional[str] = None,
|
87
|
-
*args,
|
88
|
-
**kwargs) -> None:
|
94
|
+
def refresh_table(table: str, namespace: Optional[str] = None, *args, **kwargs) -> None:
|
89
95
|
"""Refresh metadata cached on the Ray cluster for the given table."""
|
90
96
|
raise NotImplementedError("refresh_table not implemented")
|
91
97
|
|
92
98
|
|
93
99
|
def list_tables(
|
94
|
-
|
95
|
-
|
96
|
-
**kwargs) -> ListResult[TableDefinition]:
|
100
|
+
namespace: Optional[str] = None, *args, **kwargs
|
101
|
+
) -> ListResult[TableDefinition]:
|
97
102
|
"""List a page of table definitions. Raises an error if the given namespace
|
98
103
|
does not exist."""
|
99
104
|
raise NotImplementedError("list_tables not implemented")
|
100
105
|
|
101
106
|
|
102
107
|
def get_table(
|
103
|
-
|
104
|
-
|
105
|
-
*args,
|
106
|
-
**kwargs) -> Optional[TableDefinition]:
|
108
|
+
table: str, namespace: Optional[str] = None, *args, **kwargs
|
109
|
+
) -> Optional[TableDefinition]:
|
107
110
|
"""Get table definition metadata. Returns None if the given table does not
|
108
111
|
exist."""
|
109
112
|
raise NotImplementedError("get_table not implemented")
|
110
113
|
|
111
114
|
|
112
115
|
def truncate_table(
|
113
|
-
|
114
|
-
|
115
|
-
*args,
|
116
|
-
**kwargs) -> None:
|
116
|
+
table: str, namespace: Optional[str] = None, *args, **kwargs
|
117
|
+
) -> None:
|
117
118
|
"""Truncate table data. Raises an error if the table does not exist."""
|
118
119
|
raise NotImplementedError("truncate_table not implemented")
|
119
120
|
|
120
121
|
|
121
122
|
def rename_table(
|
122
|
-
|
123
|
-
|
124
|
-
namespace: Optional[str] = None,
|
125
|
-
*args,
|
126
|
-
**kwargs) -> None:
|
123
|
+
table: str, new_name: str, namespace: Optional[str] = None, *args, **kwargs
|
124
|
+
) -> None:
|
127
125
|
"""Rename a table."""
|
128
126
|
raise NotImplementedError("rename_table not implemented")
|
129
127
|
|
130
128
|
|
131
|
-
def table_exists(
|
132
|
-
table: str,
|
133
|
-
namespace: Optional[str] = None,
|
134
|
-
*args,
|
135
|
-
**kwargs) -> bool:
|
129
|
+
def table_exists(table: str, namespace: Optional[str] = None, *args, **kwargs) -> bool:
|
136
130
|
"""Returns True if the given table exists, False if not."""
|
137
131
|
raise NotImplementedError("table_exists not implemented")
|
138
132
|
|
139
133
|
|
140
134
|
# namespace functions
|
141
|
-
def list_namespaces(
|
142
|
-
*args,
|
143
|
-
**kwargs) -> ListResult[Namespace]:
|
135
|
+
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
|
144
136
|
"""List a page of table namespaces."""
|
145
137
|
raise NotImplementedError("list_namespaces not implemented")
|
146
138
|
|
147
139
|
|
148
|
-
def get_namespace(
|
149
|
-
namespace: str,
|
150
|
-
*args,
|
151
|
-
**kwargs) -> Optional[Namespace]:
|
140
|
+
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
|
152
141
|
"""Gets table namespace metadata for the specified table namespace. Returns
|
153
142
|
None if the given namespace does not exist."""
|
154
143
|
raise NotImplementedError("get_namespace not implemented")
|
155
144
|
|
156
145
|
|
157
|
-
def namespace_exists(
|
158
|
-
namespace: str,
|
159
|
-
*args,
|
160
|
-
**kwargs) -> bool:
|
146
|
+
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
|
161
147
|
"""Returns True if the given table namespace exists, False if not."""
|
162
148
|
raise NotImplementedError("namespace_exists not implemented")
|
163
149
|
|
164
150
|
|
165
151
|
def create_namespace(
|
166
|
-
|
167
|
-
|
168
|
-
*args,
|
169
|
-
**kwargs) -> Namespace:
|
152
|
+
namespace: str, permissions: Dict[str, Any], *args, **kwargs
|
153
|
+
) -> Namespace:
|
170
154
|
"""Creates a table namespace with the given name and permissions. Returns
|
171
155
|
the created namespace. Raises an error if the namespace already exists."""
|
172
156
|
raise NotImplementedError("create_namespace not implemented")
|
173
157
|
|
174
158
|
|
175
159
|
def alter_namespace(
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
160
|
+
namespace: str,
|
161
|
+
permissions: Optional[Dict[str, Any]] = None,
|
162
|
+
new_namespace: Optional[str] = None,
|
163
|
+
*args,
|
164
|
+
**kwargs
|
165
|
+
) -> None:
|
181
166
|
"""Alter table namespace definition."""
|
182
167
|
raise NotImplementedError("alter_namespace not implemented")
|
183
168
|
|
184
169
|
|
185
|
-
def drop_namespace(
|
186
|
-
namespace: str,
|
187
|
-
purge: bool = False,
|
188
|
-
*args,
|
189
|
-
**kwargs) -> None:
|
170
|
+
def drop_namespace(namespace: str, purge: bool = False, *args, **kwargs) -> None:
|
190
171
|
"""Drop the given namespace and all of its tables from the catalog,
|
191
172
|
optionally purging them."""
|
192
173
|
raise NotImplementedError("drop_namespace not implemented")
|
deltacat/catalog/model/catalog.py
CHANGED
@@ -1,21 +1,17 @@
|
|
1
1
|
# Allow self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
|
+
|
4
6
|
import ray
|
5
7
|
|
6
8
|
from deltacat.catalog import interface as catalog_interface
|
7
|
-
from typing import Any, Dict, List, Optional
|
8
|
-
|
9
9
|
|
10
10
|
all_catalogs: Optional[Catalogs] = None
|
11
11
|
|
12
12
|
|
13
13
|
class Catalog:
|
14
|
-
def __init__(
|
15
|
-
self,
|
16
|
-
impl=catalog_interface,
|
17
|
-
*args,
|
18
|
-
**kwargs):
|
14
|
+
def __init__(self, impl=catalog_interface, *args, **kwargs):
|
19
15
|
self._impl = impl
|
20
16
|
self._impl.initialize(*args, **kwargs)
|
21
17
|
|
@@ -27,19 +23,22 @@ class Catalog:
|
|
27
23
|
@ray.remote
|
28
24
|
class Catalogs:
|
29
25
|
def __init__(
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
26
|
+
self,
|
27
|
+
catalogs: Dict[str, Catalog],
|
28
|
+
default_catalog_name: str = None,
|
29
|
+
*args,
|
30
|
+
**kwargs,
|
31
|
+
):
|
35
32
|
if default_catalog_name and default_catalog_name not in catalogs:
|
36
33
|
raise ValueError(
|
37
34
|
f"Catalog {default_catalog_name} not found "
|
38
|
-
f"in catalogs to register: {catalogs}"
|
35
|
+
f"in catalogs to register: {catalogs}"
|
36
|
+
)
|
39
37
|
if not catalogs:
|
40
38
|
raise ValueError(
|
41
39
|
f"No catalogs given to register. "
|
42
|
-
f"Please specify one or more catalogs."
|
40
|
+
f"Please specify one or more catalogs."
|
41
|
+
)
|
43
42
|
self.catalogs: Dict[str, Catalog] = catalogs
|
44
43
|
if default_catalog_name:
|
45
44
|
self.default_catalog = self.catalogs[default_catalog_name]
|
@@ -65,11 +64,12 @@ class Catalogs:
|
|
65
64
|
|
66
65
|
|
67
66
|
def init(
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
67
|
+
catalogs: Dict[str, Catalog],
|
68
|
+
default_catalog_name: str = None,
|
69
|
+
ray_init_args: Dict[str, Any] = None,
|
70
|
+
*args,
|
71
|
+
**kwargs,
|
72
|
+
) -> None:
|
73
73
|
|
74
74
|
if not ray.is_initialized():
|
75
75
|
if ray_init_args:
|
@@ -79,5 +79,5 @@ def init(
|
|
79
79
|
|
80
80
|
global all_catalogs
|
81
81
|
all_catalogs = Catalogs.remote(
|
82
|
-
catalogs=catalogs,
|
83
|
-
|
82
|
+
catalogs=catalogs, default_catalog_name=default_catalog_name
|
83
|
+
)
|
deltacat/catalog/model/table_definition.py
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
# Allow self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from deltacat.storage import Table, TableVersion
|
4
|
+
from deltacat.storage import Stream, Table, TableVersion
|
5
5
|
|
6
6
|
|
7
7
|
class TableDefinition(dict):
|
8
8
|
@staticmethod
|
9
|
-
def of(
|
10
|
-
|
11
|
-
|
12
|
-
return TableDefinition(
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
9
|
+
def of(
|
10
|
+
table: Table, table_version: TableVersion, stream: Stream
|
11
|
+
) -> TableDefinition:
|
12
|
+
return TableDefinition(
|
13
|
+
{
|
14
|
+
"table": table,
|
15
|
+
"tableVersion": table_version,
|
16
|
+
"stream": stream,
|
17
|
+
}
|
18
|
+
)
|
17
19
|
|
18
20
|
@property
|
19
21
|
def table(self) -> Table:
|
deltacat/compute/compactor/__init__.py
CHANGED
@@ -1,20 +1,16 @@
|
|
1
1
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
2
|
-
from deltacat.compute.compactor.model.delta_file_envelope import
|
3
|
-
|
4
|
-
from deltacat.compute.compactor.model.
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
from deltacat.compute.compactor.model.pyarrow_write_result import
|
12
|
-
|
13
|
-
from deltacat.compute.compactor.model.
|
14
|
-
RoundCompletionInfo
|
15
|
-
from deltacat.compute.compactor.model.sort_key import \
|
16
|
-
SortKey, SortOrder
|
17
|
-
|
2
|
+
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
|
3
|
+
from deltacat.compute.compactor.model.delta_file_locator import DeltaFileLocator
|
4
|
+
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
|
5
|
+
from deltacat.compute.compactor.model.primary_key_index import (
|
6
|
+
PrimaryKeyIndexLocator,
|
7
|
+
PrimaryKeyIndexMeta,
|
8
|
+
PrimaryKeyIndexVersionLocator,
|
9
|
+
PrimaryKeyIndexVersionMeta,
|
10
|
+
)
|
11
|
+
from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
|
12
|
+
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
|
13
|
+
from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder
|
18
14
|
|
19
15
|
__all__ = [
|
20
16
|
"DeltaAnnotated",
|