deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/logs.py
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
-
import os
|
2
1
|
import logging
|
2
|
+
import os
|
3
3
|
import pathlib
|
4
|
-
from logging import
|
5
|
-
from
|
4
|
+
from logging import FileHandler, Handler, Logger, LoggerAdapter, handlers
|
5
|
+
from typing import Union
|
6
|
+
|
7
|
+
import ray
|
8
|
+
from ray.runtime_context import RuntimeContext
|
9
|
+
|
10
|
+
from deltacat.constants import APPLICATION_LOG_LEVEL, DELTACAT_LOG_LEVEL
|
6
11
|
|
7
12
|
DEFAULT_LOG_LEVEL = "INFO"
|
8
|
-
DEFAULT_LOG_FORMAT =
|
9
|
-
"%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
|
13
|
+
DEFAULT_LOG_FORMAT = (
|
14
|
+
"%(asctime)s\t%(levelname)s pid=%(process)d %(filename)s:%(lineno)s -- %(message)s"
|
15
|
+
)
|
10
16
|
DEFAULT_MAX_BYTES_PER_LOG = 2 ^ 20 * 256 # 256 MiB
|
11
17
|
DEFAULT_BACKUP_COUNT = 0
|
12
18
|
|
@@ -19,9 +25,55 @@ DEFAULT_DELTACAT_LOG_BASE_FILE_NAME = "deltacat-python.info.log"
|
|
19
25
|
DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME = "deltacat-python.debug.log"
|
20
26
|
|
21
27
|
|
22
|
-
|
23
|
-
|
24
|
-
|
28
|
+
class RayRuntimeContextLoggerAdapter(logging.LoggerAdapter):
|
29
|
+
"""
|
30
|
+
Logger Adapter for injecting Ray Runtime Context into logging messages.
|
31
|
+
"""
|
32
|
+
|
33
|
+
def __init__(self, logger: Logger, runtime_context: RuntimeContext):
|
34
|
+
super().__init__(logger, {})
|
35
|
+
self.runtime_context = runtime_context
|
36
|
+
|
37
|
+
def process(self, msg, kwargs):
|
38
|
+
"""
|
39
|
+
Injects Ray Runtime Context details into each log message.
|
40
|
+
|
41
|
+
This may include information such as the raylet node ID, task/actor ID, job ID,
|
42
|
+
placement group ID of the worker, and assigned resources to the task/actor.
|
43
|
+
|
44
|
+
Args:
|
45
|
+
msg: The original log message
|
46
|
+
kwargs: Keyword arguments for the log message
|
47
|
+
|
48
|
+
Returns: A log message with Ray Runtime Context details
|
49
|
+
|
50
|
+
"""
|
51
|
+
runtime_context_dict = self.runtime_context.get()
|
52
|
+
runtime_context_dict[
|
53
|
+
"worker_id"
|
54
|
+
] = self.runtime_context.worker.core_worker.get_worker_id()
|
55
|
+
if self.runtime_context.get_task_id() or self.runtime_context.get_actor_id():
|
56
|
+
runtime_context_dict[
|
57
|
+
"pg_id"
|
58
|
+
] = self.runtime_context.get_placement_group_id()
|
59
|
+
runtime_context_dict[
|
60
|
+
"assigned_resources"
|
61
|
+
] = self.runtime_context.get_assigned_resources()
|
62
|
+
|
63
|
+
return "(ray_runtime_context=%s) -- %s" % (runtime_context_dict, msg), kwargs
|
64
|
+
|
65
|
+
def __reduce__(self):
|
66
|
+
"""
|
67
|
+
Used to unpickle the class during Ray object store transfer.
|
68
|
+
"""
|
69
|
+
|
70
|
+
def deserializer(*args):
|
71
|
+
return RayRuntimeContextLoggerAdapter(args[0], ray.get_runtime_context())
|
72
|
+
|
73
|
+
return deserializer, (self.logger,)
|
74
|
+
|
75
|
+
|
76
|
+
def _add_logger_handler(logger: Logger, handler: Handler) -> Logger:
|
25
77
|
|
26
78
|
logger.setLevel(logging.getLevelName("DEBUG"))
|
27
79
|
logger.addHandler(handler)
|
@@ -29,89 +81,89 @@ def _add_logger_handler(
|
|
29
81
|
|
30
82
|
|
31
83
|
def _create_rotating_file_handler(
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
84
|
+
log_directory: str,
|
85
|
+
log_base_file_name: str,
|
86
|
+
logging_level: str = DEFAULT_LOG_LEVEL,
|
87
|
+
max_bytes_per_log_file: int = DEFAULT_MAX_BYTES_PER_LOG,
|
88
|
+
backup_count: int = DEFAULT_BACKUP_COUNT,
|
89
|
+
logging_format: str = DEFAULT_LOG_FORMAT,
|
90
|
+
) -> FileHandler:
|
38
91
|
|
39
92
|
if type(logging_level) is str:
|
40
93
|
logging_level = logging.getLevelName(logging_level.upper())
|
41
94
|
assert log_base_file_name, "log file name is required"
|
42
95
|
assert log_directory, "log directory is required"
|
43
|
-
|
44
96
|
log_dir_path = pathlib.Path(log_directory)
|
45
97
|
log_dir_path.mkdir(parents=True, exist_ok=True)
|
46
|
-
handler =
|
98
|
+
handler = handlers.RotatingFileHandler(
|
47
99
|
os.path.join(log_directory, log_base_file_name),
|
48
100
|
maxBytes=max_bytes_per_log_file,
|
49
|
-
backupCount=backup_count
|
101
|
+
backupCount=backup_count,
|
102
|
+
)
|
50
103
|
handler.setFormatter(logging.Formatter(logging_format))
|
51
104
|
handler.setLevel(logging_level)
|
52
105
|
return handler
|
53
106
|
|
54
107
|
|
55
|
-
def _file_handler_exists(
|
56
|
-
logger: Logger,
|
57
|
-
log_dir: str,
|
58
|
-
log_base_file_name: str) -> bool:
|
108
|
+
def _file_handler_exists(logger: Logger, log_dir: str, log_base_file_name: str) -> bool:
|
59
109
|
|
60
110
|
handler_exists = False
|
61
111
|
base_file_path = os.path.join(log_dir, log_base_file_name)
|
62
112
|
if len(logger.handlers) > 0:
|
63
113
|
norm_base_file_path = os.path.normpath(base_file_path)
|
64
|
-
handler_exists = any(
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
114
|
+
handler_exists = any(
|
115
|
+
[
|
116
|
+
isinstance(handler, logging.FileHandler)
|
117
|
+
and os.path.normpath(handler.baseFilename) == norm_base_file_path
|
118
|
+
for handler in logger.handlers
|
119
|
+
]
|
120
|
+
)
|
69
121
|
return handler_exists
|
70
122
|
|
71
123
|
|
72
124
|
def _configure_logger(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
125
|
+
logger: Logger,
|
126
|
+
log_level: str,
|
127
|
+
log_dir: str,
|
128
|
+
log_base_file_name: str,
|
129
|
+
debug_log_base_file_name: str,
|
130
|
+
) -> Union[Logger, LoggerAdapter]:
|
79
131
|
primary_log_level = log_level
|
80
132
|
logger.propagate = False
|
81
133
|
if log_level.upper() == "DEBUG":
|
82
134
|
if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
|
83
135
|
handler = _create_rotating_file_handler(
|
84
|
-
log_dir,
|
85
|
-
debug_log_base_file_name,
|
86
|
-
"DEBUG",
|
136
|
+
log_dir, debug_log_base_file_name, "DEBUG"
|
87
137
|
)
|
88
138
|
_add_logger_handler(logger, handler)
|
89
139
|
primary_log_level = "INFO"
|
90
140
|
if not _file_handler_exists(logger, log_dir, log_base_file_name):
|
91
141
|
handler = _create_rotating_file_handler(
|
92
|
-
log_dir,
|
93
|
-
log_base_file_name,
|
94
|
-
primary_log_level,
|
142
|
+
log_dir, log_base_file_name, primary_log_level
|
95
143
|
)
|
96
144
|
_add_logger_handler(logger, handler)
|
145
|
+
ray_runtime_ctx = ray.get_runtime_context()
|
146
|
+
if ray_runtime_ctx.worker.connected:
|
147
|
+
logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
|
148
|
+
|
97
149
|
return logger
|
98
150
|
|
99
151
|
|
100
|
-
def configure_deltacat_logger(logger: Logger) -> Logger:
|
152
|
+
def configure_deltacat_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
|
101
153
|
return _configure_logger(
|
102
154
|
logger,
|
103
155
|
DELTACAT_LOG_LEVEL,
|
104
156
|
DEFAULT_DELTACAT_LOG_DIR,
|
105
157
|
DEFAULT_DELTACAT_LOG_BASE_FILE_NAME,
|
106
|
-
DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME
|
158
|
+
DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME,
|
107
159
|
)
|
108
160
|
|
109
161
|
|
110
|
-
def configure_application_logger(logger: Logger) -> Logger:
|
162
|
+
def configure_application_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
|
111
163
|
return _configure_logger(
|
112
164
|
logger,
|
113
165
|
APPLICATION_LOG_LEVEL,
|
114
166
|
DEFAULT_APPLICATION_LOG_DIR,
|
115
167
|
DEFAULT_APPLICATION_LOG_BASE_FILE_NAME,
|
116
|
-
DEFAULT_DEBUG_APPLICATION_LOG_BASE_FILE_NAME
|
168
|
+
DEFAULT_DEBUG_APPLICATION_LOG_BASE_FILE_NAME,
|
117
169
|
)
|
deltacat/storage/__init__.py
CHANGED
@@ -1,17 +1,27 @@
|
|
1
|
-
from deltacat.aws.redshift import
|
2
|
-
|
1
|
+
from deltacat.aws.redshift import (
|
2
|
+
Manifest,
|
3
|
+
ManifestAuthor,
|
4
|
+
ManifestEntry,
|
5
|
+
ManifestEntryList,
|
6
|
+
ManifestMeta,
|
7
|
+
)
|
3
8
|
from deltacat.storage.model.delta import Delta, DeltaLocator
|
4
|
-
from deltacat.storage.model.partition import Partition, PartitionLocator
|
5
9
|
from deltacat.storage.model.list_result import ListResult
|
6
10
|
from deltacat.storage.model.locator import Locator
|
7
11
|
from deltacat.storage.model.namespace import Namespace, NamespaceLocator
|
12
|
+
from deltacat.storage.model.partition import Partition, PartitionLocator
|
8
13
|
from deltacat.storage.model.stream import Stream, StreamLocator
|
9
14
|
from deltacat.storage.model.table import Table, TableLocator
|
10
|
-
from deltacat.storage.model.table_version import TableVersion,
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
DistributedDataset
|
15
|
+
from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
|
16
|
+
from deltacat.storage.model.types import (
|
17
|
+
CommitState,
|
18
|
+
DeltaType,
|
19
|
+
DistributedDataset,
|
20
|
+
LifecycleState,
|
21
|
+
LocalDataset,
|
22
|
+
LocalTable,
|
23
|
+
SchemaConsistencyType,
|
24
|
+
)
|
15
25
|
|
16
26
|
__all__ = [
|
17
27
|
"CommitState",
|