deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -1,23 +1,49 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
import deltacat.logs
|
4
|
-
from deltacat.
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
3
|
+
import deltacat.logs # noqa: F401
|
4
|
+
from deltacat.catalog.delegate import (
|
5
|
+
alter_namespace,
|
6
|
+
alter_table,
|
7
|
+
create_namespace,
|
8
|
+
create_table,
|
9
|
+
default_namespace,
|
10
|
+
drop_namespace,
|
11
|
+
drop_table,
|
12
|
+
get_namespace,
|
13
|
+
get_table,
|
14
|
+
list_namespaces,
|
15
|
+
list_tables,
|
16
|
+
namespace_exists,
|
17
|
+
read_table,
|
18
|
+
refresh_table,
|
19
|
+
rename_table,
|
20
|
+
table_exists,
|
21
|
+
truncate_table,
|
22
|
+
write_to_table,
|
23
|
+
)
|
24
|
+
from deltacat.catalog.model.catalog import ( # noqa: F401
|
25
|
+
Catalog,
|
26
|
+
Catalogs,
|
27
|
+
all_catalogs,
|
28
|
+
init,
|
29
|
+
)
|
9
30
|
from deltacat.catalog.model.table_definition import TableDefinition
|
10
|
-
from deltacat.
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
31
|
+
from deltacat.compute.compactor import SortKey, SortOrder
|
32
|
+
from deltacat.storage import (
|
33
|
+
DistributedDataset,
|
34
|
+
LifecycleState,
|
35
|
+
ListResult,
|
36
|
+
LocalDataset,
|
37
|
+
LocalTable,
|
38
|
+
Namespace,
|
39
|
+
SchemaConsistencyType,
|
40
|
+
)
|
41
|
+
from deltacat.types.media import ContentEncoding, ContentType, TableType
|
42
|
+
from deltacat.types.tables import TableWriteMode
|
17
43
|
|
18
|
-
logs.configure_deltacat_logger(logging.getLogger(__name__))
|
44
|
+
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
|
19
45
|
|
20
|
-
__version__ = "0.1.
|
46
|
+
__version__ = "0.1.12"
|
21
47
|
|
22
48
|
|
23
49
|
__all__ = [
|
deltacat/aws/clients.py
CHANGED
@@ -1,28 +1,21 @@
|
|
1
|
-
import boto3
|
2
1
|
import logging
|
2
|
+
from functools import lru_cache
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
import boto3
|
6
|
+
from boto3.exceptions import ResourceNotExistsError
|
3
7
|
from boto3.resources.base import ServiceResource
|
4
8
|
from botocore.client import BaseClient
|
5
9
|
from botocore.config import Config
|
6
|
-
|
7
|
-
from functools import lru_cache
|
10
|
+
|
8
11
|
from deltacat import logs
|
9
12
|
from deltacat.aws.constants import BOTO_MAX_RETRIES
|
10
13
|
|
11
|
-
from typing import Optional
|
12
|
-
|
13
14
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
14
15
|
|
15
16
|
|
16
|
-
def _resource(
|
17
|
-
|
18
|
-
region: Optional[str],
|
19
|
-
**kwargs) -> ServiceResource:
|
20
|
-
boto_config = Config(
|
21
|
-
retries={
|
22
|
-
"max_attempts": BOTO_MAX_RETRIES,
|
23
|
-
"mode": 'standard'
|
24
|
-
}
|
25
|
-
)
|
17
|
+
def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
|
18
|
+
boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"})
|
26
19
|
return boto3.resource(
|
27
20
|
name,
|
28
21
|
region,
|
@@ -31,20 +24,14 @@ def _resource(
|
|
31
24
|
)
|
32
25
|
|
33
26
|
|
34
|
-
def _client(
|
35
|
-
name: str,
|
36
|
-
region: Optional[str],
|
37
|
-
**kwargs) -> BaseClient:
|
27
|
+
def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
|
38
28
|
try:
|
39
29
|
# try to re-use a client from the resource cache first
|
40
30
|
return resource_cache(name, region, **kwargs).meta.client
|
41
31
|
except ResourceNotExistsError:
|
42
32
|
# fall back for clients without an associated resource
|
43
33
|
boto_config = Config(
|
44
|
-
retries={
|
45
|
-
"max_attempts": BOTO_MAX_RETRIES,
|
46
|
-
"mode": 'standard'
|
47
|
-
}
|
34
|
+
retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"}
|
48
35
|
)
|
49
36
|
return boto3.client(
|
50
37
|
name,
|
@@ -54,19 +41,13 @@ def _client(
|
|
54
41
|
)
|
55
42
|
|
56
43
|
|
57
|
-
def resource_cache(
|
58
|
-
name: str,
|
59
|
-
region: Optional[str],
|
60
|
-
**kwargs) -> ServiceResource:
|
44
|
+
def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
|
61
45
|
# we don't use the @lru_cache decorator because Ray can't pickle it
|
62
46
|
cached_function = lru_cache()(_resource)
|
63
47
|
return cached_function(name, region, **kwargs)
|
64
48
|
|
65
49
|
|
66
|
-
def client_cache(
|
67
|
-
name: str,
|
68
|
-
region: Optional[str],
|
69
|
-
**kwargs) -> BaseClient:
|
50
|
+
def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
|
70
51
|
# we don't use the @lru_cache decorator because Ray can't pickle it
|
71
52
|
cached_function = lru_cache()(_client)
|
72
53
|
return cached_function(name, region, **kwargs)
|
deltacat/aws/constants.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
from deltacat.aws.redshift.model.manifest import
|
2
|
-
|
1
|
+
from deltacat.aws.redshift.model.manifest import (
|
2
|
+
Manifest,
|
3
|
+
ManifestAuthor,
|
4
|
+
ManifestEntry,
|
5
|
+
ManifestEntryList,
|
6
|
+
ManifestMeta,
|
7
|
+
)
|
3
8
|
|
4
9
|
__all__ = [
|
5
10
|
"Manifest",
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
import logging
|
5
4
|
import itertools
|
5
|
+
import logging
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
from uuid import uuid4
|
6
8
|
|
7
9
|
from deltacat import logs
|
8
|
-
from uuid import uuid4
|
9
|
-
from typing import Any, Dict, List, Optional
|
10
10
|
|
11
11
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
12
|
|
@@ -14,10 +14,11 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
14
14
|
class Manifest(dict):
|
15
15
|
@staticmethod
|
16
16
|
def _build_manifest(
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
meta: Optional[ManifestMeta],
|
18
|
+
entries: Optional[ManifestEntryList],
|
19
|
+
author: Optional[ManifestAuthor] = None,
|
20
|
+
uuid: str = None,
|
21
|
+
) -> Manifest:
|
21
22
|
if not uuid:
|
22
23
|
uuid = str(uuid4())
|
23
24
|
manifest = Manifest()
|
@@ -31,9 +32,11 @@ class Manifest(dict):
|
|
31
32
|
return manifest
|
32
33
|
|
33
34
|
@staticmethod
|
34
|
-
def of(
|
35
|
-
|
36
|
-
|
35
|
+
def of(
|
36
|
+
entries: ManifestEntryList,
|
37
|
+
author: Optional[ManifestAuthor] = None,
|
38
|
+
uuid: str = None,
|
39
|
+
) -> Manifest:
|
37
40
|
if not uuid:
|
38
41
|
uuid = str(uuid4())
|
39
42
|
total_record_count = 0
|
@@ -52,15 +55,19 @@ class Manifest(dict):
|
|
52
55
|
content_encoding = None
|
53
56
|
entry_content_type = meta.content_type
|
54
57
|
if entry_content_type != content_type:
|
55
|
-
msg =
|
56
|
-
|
57
|
-
|
58
|
+
msg = (
|
59
|
+
f"Expected all manifest entries to have content "
|
60
|
+
f"type '{content_type}' but found "
|
61
|
+
f"'{entry_content_type}'"
|
62
|
+
)
|
58
63
|
raise ValueError(msg)
|
59
64
|
entry_content_encoding = meta["content_encoding"]
|
60
65
|
if entry_content_encoding != content_encoding:
|
61
|
-
msg =
|
62
|
-
|
63
|
-
|
66
|
+
msg = (
|
67
|
+
f"Expected all manifest entries to have content "
|
68
|
+
f"encoding '{content_encoding}' but found "
|
69
|
+
f"'{entry_content_encoding}'"
|
70
|
+
)
|
64
71
|
raise ValueError(msg)
|
65
72
|
total_record_count += meta.record_count or 0
|
66
73
|
total_content_length += meta.content_length or 0
|
@@ -70,25 +77,19 @@ class Manifest(dict):
|
|
70
77
|
total_content_length,
|
71
78
|
content_type,
|
72
79
|
content_encoding,
|
73
|
-
total_source_content_length
|
74
|
-
)
|
75
|
-
manifest = Manifest._build_manifest(
|
76
|
-
meta,
|
77
|
-
entries,
|
78
|
-
author,
|
79
|
-
uuid
|
80
|
+
total_source_content_length,
|
80
81
|
)
|
82
|
+
manifest = Manifest._build_manifest(meta, entries, author, uuid)
|
81
83
|
return manifest
|
82
84
|
|
83
85
|
@staticmethod
|
84
86
|
def merge_manifests(
|
85
|
-
|
86
|
-
|
87
|
+
manifests: List[Manifest], author: Optional[ManifestAuthor] = None
|
88
|
+
) -> Manifest:
|
87
89
|
all_entries = ManifestEntryList(
|
88
|
-
itertools.chain(*[m.entries for m in manifests])
|
89
|
-
|
90
|
-
|
91
|
-
author)
|
90
|
+
itertools.chain(*[m.entries for m in manifests])
|
91
|
+
)
|
92
|
+
merged_manifest = Manifest.of(all_entries, author)
|
92
93
|
return merged_manifest
|
93
94
|
|
94
95
|
@property
|
@@ -119,14 +120,15 @@ class Manifest(dict):
|
|
119
120
|
|
120
121
|
class ManifestMeta(dict):
|
121
122
|
@staticmethod
|
122
|
-
def of(
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
123
|
+
def of(
|
124
|
+
record_count: Optional[int],
|
125
|
+
content_length: Optional[int],
|
126
|
+
content_type: Optional[str],
|
127
|
+
content_encoding: Optional[str],
|
128
|
+
source_content_length: Optional[int] = None,
|
129
|
+
credentials: Optional[Dict[str, str]] = None,
|
130
|
+
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
131
|
+
) -> ManifestMeta:
|
130
132
|
manifest_meta = ManifestMeta()
|
131
133
|
if record_count is not None:
|
132
134
|
manifest_meta["record_count"] = record_count
|
@@ -175,8 +177,7 @@ class ManifestMeta(dict):
|
|
175
177
|
|
176
178
|
class ManifestAuthor(dict):
|
177
179
|
@staticmethod
|
178
|
-
def of(name: Optional[str],
|
179
|
-
version: Optional[str]) -> ManifestAuthor:
|
180
|
+
def of(name: Optional[str], version: Optional[str]) -> ManifestAuthor:
|
180
181
|
manifest_author = ManifestAuthor()
|
181
182
|
if name is not None:
|
182
183
|
manifest_author["name"] = name
|
@@ -195,15 +196,16 @@ class ManifestAuthor(dict):
|
|
195
196
|
|
196
197
|
class ManifestEntry(dict):
|
197
198
|
@staticmethod
|
198
|
-
def of(
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
199
|
+
def of(
|
200
|
+
url: Optional[str],
|
201
|
+
meta: Optional[ManifestMeta],
|
202
|
+
mandatory: bool = True,
|
203
|
+
uri: Optional[str] = None,
|
204
|
+
uuid: Optional[str] = None,
|
205
|
+
) -> ManifestEntry:
|
203
206
|
manifest_entry = ManifestEntry()
|
204
207
|
if not (uri or url):
|
205
|
-
raise ValueError(
|
206
|
-
"No URI or URL specified for manifest entry contents.")
|
208
|
+
raise ValueError("No URI or URL specified for manifest entry contents.")
|
207
209
|
if (uri and url) and (uri != url):
|
208
210
|
raise ValueError(f"Manifest entry URI ({uri}) != URL ({url})")
|
209
211
|
if url:
|
@@ -220,11 +222,13 @@ class ManifestEntry(dict):
|
|
220
222
|
|
221
223
|
@staticmethod
|
222
224
|
def from_s3_obj_url(
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
225
|
+
url: str,
|
226
|
+
record_count: int,
|
227
|
+
source_content_length: Optional[int] = None,
|
228
|
+
**s3_client_kwargs,
|
229
|
+
) -> ManifestEntry:
|
227
230
|
from deltacat.aws import s3u as s3_utils
|
231
|
+
|
228
232
|
s3_obj = s3_utils.get_object_at_url(url, **s3_client_kwargs)
|
229
233
|
logger.debug(f"Building manifest entry from {url}: {s3_obj}")
|
230
234
|
manifest_entry_meta = ManifestMeta.of(
|