deltacat 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode

 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "1.1.3"
+ __version__ = "1.1.5"


 __all__ = [
deltacat/aws/constants.py CHANGED
@@ -6,7 +6,3 @@ DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
-
- # Metric Names
- DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX = "download_manifest_entry"
- UPLOAD_SLICED_TABLE_METRIC_PREFIX = "upload_sliced_table"
deltacat/aws/s3u.py CHANGED
@@ -27,8 +27,6 @@ import deltacat.aws.clients as aws_utils
 from deltacat import logs
 from deltacat.aws.constants import (
     TIMEOUT_ERROR_CODES,
-     DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX,
-     UPLOAD_SLICED_TABLE_METRIC_PREFIX,
 )
 from deltacat.exceptions import NonRetryableError, RetryableError
 from deltacat.storage import (
@@ -54,7 +52,6 @@ from deltacat.types.tables import (
 )
 from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
- from deltacat.utils.metrics import metrics

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -121,6 +118,32 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
         self.block_refs.append(block)
         return write_path

+     def __call__(
+         self,
+         base_path: str,
+         *,
+         filesystem: Optional[pa.filesystem.FileSystem] = None,
+         dataset_uuid: Optional[str] = None,
+         block: Optional[ObjectRef[Block]] = None,
+         block_index: Optional[int] = None,
+         file_format: Optional[str] = None,
+     ) -> str:
+         """
+         TODO: BlockWritePathProvider is deprecated as of Ray version 2.20.0. Please use FilenameProvider.
+         See: https://docs.ray.io/en/master/data/api/doc/ray.data.datasource.FilenameProvider.html
+         Also See: https://github.com/ray-project/deltacat/issues/299
+
+         Hence, this class only works with Ray version 2.20.0 or lower when used in Ray Dataset.
+         """
+         return self._get_write_path_for_block(
+             base_path,
+             filesystem=filesystem,
+             dataset_uuid=dataset_uuid,
+             block=block,
+             block_index=block_index,
+             file_format=file_format,
+         )
+

 class S3Url:
     def __init__(self, url: str):
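Note: the new `__call__` simply forwards to the legacy `_get_write_path_for_block` hook, so the provider can also be invoked directly as a callable. A minimal direct-invocation sketch, using only the constructor and call shape confirmed by the new test at the bottom of this diff (the S3 path is a placeholder):

    from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider

    # Capture object required by the provider's constructor (see the new unit test below).
    capture = CapturedBlockWritePaths()
    provider = UuidBlockWritePathProvider(capture_object=capture)

    # All keyword arguments are optional; the provider appends a UUID to the base path.
    write_path = provider(
        "s3://example-bucket/example-prefix",  # placeholder base path
        block_index=0,
        file_format="parquet",
    )
    print(write_path)  # e.g. s3://example-bucket/example-prefix/<uuid>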
@@ -243,7 +266,6 @@ def read_file(
         raise e


- @metrics(prefix=UPLOAD_SLICED_TABLE_METRIC_PREFIX)
 def upload_sliced_table(
     table: Union[LocalTable, DistributedDataset],
     s3_url_prefix: str,
@@ -352,7 +374,6 @@ def upload_table(
     return manifest_entries


- @metrics(prefix=DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX)
 def download_manifest_entry(
     manifest_entry: ManifestEntry,
     token_holder: Optional[Dict[str, Any]] = None,
@@ -6,6 +6,7 @@ from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
 from deltacat.aws import s3u as s3_utils
 from typing import Optional
+ from deltacat.utils.metrics import metrics

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -18,6 +19,7 @@ def get_round_completion_file_s3_url(
     return f"{base_url}.json"


+ @metrics
 def read_round_completion_file(
     bucket: str,
     source_partition_locator: PartitionLocator,
@@ -38,6 +40,7 @@ def read_round_completion_file(
     return round_completion_info


+ @metrics
 def write_round_completion_file(
     bucket: Optional[str],
     source_partition_locator: Optional[PartitionLocator],
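Note: this release moves from the parameterized `@metrics(prefix=...)` form (removed from s3u.py and merge.py above) to the bare `@metrics` form, under which the decorated function's name drives the metric name. The actual decorator lives in deltacat.utils.metrics; the sketch below is only a hypothetical illustration of a decorator that supports both call styles, not DeltaCAT's implementation:

    import functools
    import time
    from typing import Callable, Optional


    def metrics(fn: Optional[Callable] = None, *, prefix: Optional[str] = None):
        """Hypothetical sketch: usable as @metrics or @metrics(prefix="...")."""

        def decorate(func: Callable):
            name = prefix or func.__name__

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                start = time.monotonic()
                try:
                    return func(*args, **kwargs)
                finally:
                    # Emit a latency metric under the chosen name (emission backend omitted).
                    print(f"{name}_time: {time.monotonic() - start:.3f}s")

            return wrapper

        # Bare @metrics passes the function directly; @metrics(prefix=...) returns decorate.
        return decorate(fn) if callable(fn) else decorate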
@@ -17,7 +17,11 @@ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
 from deltacat.aws import s3u as s3_utils
 import deltacat
 from deltacat import logs
- from deltacat.compute.compactor import PyArrowWriteResult, RoundCompletionInfo
+ from deltacat.compute.compactor import (
+     HighWatermark,
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+ )
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -37,6 +41,7 @@ from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
 from deltacat.storage import (
     Delta,
     DeltaLocator,
+     Manifest,
     Partition,
 )
 from deltacat.compute.compactor.model.compact_partition_params import (
@@ -50,6 +55,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
 from deltacat.compute.compactor_v2.steps import hash_bucket as hb
 from deltacat.compute.compactor_v2.utils import io
 from deltacat.compute.compactor.utils import round_completion_file as rcf
+ from deltacat.utils.metrics import metrics

 from typing import List, Optional, Tuple
 from collections import defaultdict
@@ -73,6 +79,7 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+ @metrics
 def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:

     assert (
@@ -94,7 +101,7 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
     round_completion_file_s3_url = None
     if new_partition:
         logger.info(f"Committing compacted partition to: {new_partition.locator}")
-         partition = params.deltacat_storage.commit_partition(
+         partition: Partition = params.deltacat_storage.commit_partition(
             new_partition, **params.deltacat_storage_kwargs
         )
         logger.info(f"Committed compacted partition: {partition}")
@@ -148,9 +155,9 @@ def _execute_compaction(
     compaction_audit.set_total_cluster_memory_bytes(cluster_memory)

     # read the results from any previously completed compaction round
-     round_completion_info = None
-     high_watermark = None
-     previous_compacted_delta_manifest = None
+     round_completion_info: Optional[RoundCompletionInfo] = None
+     high_watermark: Optional[HighWatermark] = None
+     previous_compacted_delta_manifest: Optional[Manifest] = None

     if not params.rebase_source_partition_locator:
         round_completion_info = rcf.read_round_completion_file(
@@ -269,6 +276,7 @@ def _execute_compaction(
     total_hb_record_count = np.int64(0)
     telemetry_time_hb = 0
     if params.hash_bucket_count == 1:
+         logger.info("Hash bucket count set to 1. Running local merge")
         merge_start = time.monotonic()
         local_merge_input = generate_local_merge_input(
             params,
@@ -632,6 +640,19 @@ def _execute_compaction(
         f"partition-{params.source_partition_locator.partition_values},"
         f"compacted at: {params.last_stream_position_to_compact},"
     )
+     is_inplace_compacted: bool = (
+         params.source_partition_locator.partition_values
+         == params.destination_partition_locator.partition_values
+         and params.source_partition_locator.stream_id
+         == params.destination_partition_locator.stream_id
+     )
+     if is_inplace_compacted:
+         logger.info(
+             "Overriding round completion file source partition locator as in-place compacted. "
+             + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
+             f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
+         )
+         rcf_source_partition_locator = compacted_partition.locator
     return (
         compacted_partition,
         new_round_completion_info,
@@ -65,6 +65,3 @@ DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"

 # Metric prefix for prepare deletes
 PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
-
- # Metric prefix for materialize
- MATERIALIZE_METRIC_PREFIX = "delta_materialize"
@@ -31,14 +31,11 @@ from deltacat.compute.compactor_v2.deletes.delete_strategy import (
 from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
     DeleteFileEnvelope,
 )
- from deltacat.utils.metrics import metrics
- from deltacat.compute.compactor_v2.constants import MATERIALIZE_METRIC_PREFIX


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- @metrics(prefix=MATERIALIZE_METRIC_PREFIX)
 def materialize(
     input: MergeInput,
     task_index: int,
@@ -21,6 +21,9 @@ from deltacat.compute.compactor_v2.constants import (
     PARQUET_TO_PYARROW_INFLATION,
 )

+ from daft.exceptions import DaftTransientError
+
+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


@@ -76,6 +79,7 @@ def get_task_options(
         botocore.exceptions.HTTPClientError,
         ConnectionError,
         TimeoutError,
+         DaftTransientError,
     ]

     return task_opts
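Note: `DaftTransientError` is appended to the same exception list as the botocore, connection, and timeout errors above. Assuming that list feeds Ray's `retry_exceptions` task option (an assumption about how get_task_options wires it through), the effect is roughly the following sketch; the task name and retry count are illustrative, not DeltaCAT's exact options:

    import ray
    from daft.exceptions import DaftTransientError

    # Tasks raising any listed exception are retried instead of failing the job.
    @ray.remote(
        max_retries=3,
        retry_exceptions=[ConnectionError, TimeoutError, DaftTransientError],
    )
    def merge_task(batch):
        ...  # transient Daft/S3 read errors now trigger a task retry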
deltacat/logs.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import os
+ import json
 import pathlib
 from logging import FileHandler, Handler, Logger, LoggerAdapter, handlers
 from typing import Any, Dict, Optional, Union
@@ -19,13 +20,106 @@ from deltacat.constants import (
 )

 DEFAULT_LOG_LEVEL = "INFO"
- DEFAULT_LOG_FORMAT = (
-     "%(asctime)s\t%(levelname)s pid=%(process)d %(filename)s:%(lineno)s -- %(message)s"
- )
+ DEFAULT_LOG_FORMAT = {
+     "level": "levelname",
+     "message": "message",
+     "loggerName": "name",
+     "processName": "processName",
+     "processID": "process",
+     "threadName": "threadName",
+     "timestamp": "asctime",
+     "filename": "filename",
+     "lineno": "lineno",
+ }
 DEFAULT_MAX_BYTES_PER_LOG = 2 ^ 20 * 256 # 256 MiB
 DEFAULT_BACKUP_COUNT = 0


+ class JsonFormatter(logging.Formatter):
+     """
+     Formatter that outputs JSON strings after parsing the LogRecord.
+
+     @param dict fmt_dict: Key: logging format attribute pairs. Defaults to {"message": "message"}.
+     @param str time_format: time.strftime() format string. Default: "%Y-%m-%dT%H:%M:%S"
+     @param str msec_format: Microsecond formatting. Appended at the end. Default: "%s.%03dZ"
+     """
+
+     def __init__(
+         self,
+         fmt_dict: dict = None,
+         time_format: str = "%Y-%m-%dT%H:%M:%S",
+         msec_format: str = "%s.%03dZ",
+     ):
+         self.fmt_dict = fmt_dict if fmt_dict is not None else {"message": "message"}
+         self.default_time_format = time_format
+         self.default_msec_format = msec_format
+         self.datefmt = None
+         if ray.is_initialized():
+             self.ray_runtime_ctx: RuntimeContext = ray.get_runtime_context()
+             self.context = {}
+             self.context["worker_id"] = self.ray_runtime_ctx.get_worker_id()
+             self.context["node_id"] = self.ray_runtime_ctx.get_node_id()
+             self.context["job_id"] = self.ray_runtime_ctx.get_job_id()
+         else:
+             self.ray_runtime_ctx = None
+             self.context = {}
+
+     def usesTime(self) -> bool:
+         """
+         Overwritten to look for the attribute in the format dict values instead of the fmt string.
+         """
+         return "asctime" in self.fmt_dict.values()
+
+     def formatMessage(self, record) -> dict:
+         """
+         Overwritten to return a dictionary of the relevant LogRecord attributes instead of a string.
+         KeyError is raised if an unknown attribute is provided in the fmt_dict.
+         """
+         return {
+             fmt_key: record.__dict__[fmt_val]
+             for fmt_key, fmt_val in self.fmt_dict.items()
+         }
+
+     def format(self, record) -> str:
+         """
+         Mostly the same as the parent's class method, the difference being that a dict is manipulated and dumped as JSON
+         instead of a string.
+         """
+         record.message = record.getMessage()
+
+         if self.usesTime():
+             record.asctime = self.formatTime(record, self.datefmt)
+
+         message_dict = self.formatMessage(record)
+
+         if record.exc_info:
+             # Cache the traceback text to avoid converting it multiple times
+             # (it's constant anyway)
+             if not record.exc_text:
+                 record.exc_text = self.formatException(record.exc_info)
+
+         if record.exc_text:
+             message_dict["exc_info"] = record.exc_text
+
+         if record.stack_info:
+             message_dict["stack_info"] = self.formatStack(record.stack_info)
+
+         if self.ray_runtime_ctx:
+             # only workers will have task ID
+             if (
+                 self.ray_runtime_ctx.worker
+                 and self.ray_runtime_ctx.worker.mode == ray._private.worker.WORKER_MODE
+             ):
+                 self.context["task_id"] = self.ray_runtime_ctx.get_task_id()
+                 self.context[
+                     "assigned_resources"
+                 ] = self.ray_runtime_ctx.get_assigned_resources()
+
+             message_dict["ray_runtime_context"] = self.context
+
+         return json.dumps(message_dict, default=str)
+
+
 class DeltaCATLoggerAdapter(logging.LoggerAdapter):
     """
     Logger Adapter class with additional functionality
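Note: with this change, DeltaCAT's rotating file handlers emit structured JSON whenever DEFAULT_LOG_FORMAT is a dict, replacing the old tab-separated format string. A minimal standalone usage sketch of the new formatter, with a stream handler and field mapping chosen purely for illustration:

    import logging
    import sys

    from deltacat.logs import JsonFormatter

    handler = logging.StreamHandler(sys.stdout)
    # Keys are output field names; values are LogRecord attribute names.
    handler.setFormatter(
        JsonFormatter({"level": "levelname", "timestamp": "asctime", "message": "message"})
    )

    logger = logging.getLogger("example")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.info("hello")
    # -> {"level": "INFO", "timestamp": "2024-...T...Z", "message": "hello"}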
@@ -51,54 +145,6 @@ class DeltaCATLoggerAdapter(logging.LoggerAdapter):
         self.error(msg, *args, **kwargs)


- class RayRuntimeContextLoggerAdapter(DeltaCATLoggerAdapter):
-     """
-     Logger Adapter for injecting Ray Runtime Context into logging messages.
-     """
-
-     def __init__(self, logger: Logger, runtime_context: RuntimeContext):
-         super().__init__(logger, {})
-         self.runtime_context = runtime_context
-
-     def process(self, msg, kwargs):
-         """
-         Injects Ray Runtime Context details into each log message.
-
-         This may include information such as the raylet node ID, task/actor ID, job ID,
-         placement group ID of the worker, and assigned resources to the task/actor.
-
-         Args:
-             msg: The original log message
-             kwargs: Keyword arguments for the log message
-
-         Returns: A log message with Ray Runtime Context details
-
-         """
-         runtime_context_dict = self.runtime_context.get()
-         runtime_context_dict[
-             "worker_id"
-         ] = self.runtime_context.worker.core_worker.get_worker_id()
-         if self.runtime_context.get_task_id() or self.runtime_context.get_actor_id():
-             runtime_context_dict[
-                 "pg_id"
-             ] = self.runtime_context.get_placement_group_id()
-             runtime_context_dict[
-                 "assigned_resources"
-             ] = self.runtime_context.get_assigned_resources()
-
-         return "(ray_runtime_context=%s) -- %s" % (runtime_context_dict, msg), kwargs
-
-     def __reduce__(self):
-         """
-         Used to unpickle the class during Ray object store transfer.
-         """
-
-         def deserializer(*args):
-             return RayRuntimeContextLoggerAdapter(args[0], ray.get_runtime_context())
-
-         return deserializer, (self.logger,)
-
-
 def _add_logger_handler(logger: Logger, handler: Handler) -> Logger:

     logger.setLevel(logging.getLevelName("DEBUG"))
@@ -109,10 +155,10 @@ def _add_logger_handler(logger: Logger, handler: Handler) -> Logger:
 def _create_rotating_file_handler(
     log_directory: str,
     log_base_file_name: str,
-     logging_level: str = DEFAULT_LOG_LEVEL,
+     logging_level: Union[str, int] = DEFAULT_LOG_LEVEL,
     max_bytes_per_log_file: int = DEFAULT_MAX_BYTES_PER_LOG,
     backup_count: int = DEFAULT_BACKUP_COUNT,
-     logging_format: str = DEFAULT_LOG_FORMAT,
+     logging_format: Union[str, dict] = DEFAULT_LOG_FORMAT,
 ) -> FileHandler:

     if type(logging_level) is str:
@@ -126,7 +172,12 @@ def _create_rotating_file_handler(
         maxBytes=max_bytes_per_log_file,
         backupCount=backup_count,
     )
-     handler.setFormatter(logging.Formatter(logging_format))
+
+     if type(logging_format) is str:
+         handler.setFormatter(logging.Formatter(logging_format))
+     else:
+         handler.setFormatter(JsonFormatter(logging_format))
+
     handler.setLevel(logging_level)
     return handler

@@ -135,7 +186,8 @@ def _file_handler_exists(logger: Logger, log_dir: str, log_base_file_name: str)

     handler_exists = False
     base_file_path = os.path.join(log_dir, log_base_file_name)
-     if len(logger.handlers) > 0:
+
+     if logger.handlers:
         norm_base_file_path = os.path.normpath(base_file_path)
         handler_exists = any(
             [
@@ -149,49 +201,54 @@ def _file_handler_exists(logger: Logger, log_dir: str, log_base_file_name: str)

 def _configure_logger(
     logger: Logger,
-     log_level: str,
+     log_level: int,
     log_dir: str,
     log_base_file_name: str,
     debug_log_base_file_name: str,
 ) -> Union[Logger, LoggerAdapter]:
+     # This maintains log level of rotating file handlers
     primary_log_level = log_level
     logger.propagate = False
-     if log_level.upper() == "DEBUG":
+     if log_level <= logging.getLevelName("DEBUG"):
         if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
             handler = _create_rotating_file_handler(
                 log_dir, debug_log_base_file_name, "DEBUG"
             )
             _add_logger_handler(logger, handler)
-         primary_log_level = "INFO"
+         primary_log_level = logging.getLevelName("INFO")
     if not _file_handler_exists(logger, log_dir, log_base_file_name):
         handler = _create_rotating_file_handler(
             log_dir, log_base_file_name, primary_log_level
         )
         _add_logger_handler(logger, handler)
-     if ray.is_initialized():
-         ray_runtime_ctx = ray.get_runtime_context()
-         if ray_runtime_ctx.worker.connected:
-             logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
-     else:
-         logger = DeltaCATLoggerAdapter(logger)

-     return logger
+     return DeltaCATLoggerAdapter(logger)
+

+ def configure_deltacat_logger(
+     logger: Logger, level: int = None
+ ) -> Union[Logger, LoggerAdapter]:
+     if level is None:
+         level = logging.getLevelName(DELTACAT_SYS_LOG_LEVEL)

- def configure_deltacat_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
     return _configure_logger(
         logger,
-         DELTACAT_SYS_LOG_LEVEL,
+         level,
         DELTACAT_SYS_LOG_DIR,
         DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
         DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
     )


- def configure_application_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
+ def configure_application_logger(
+     logger: Logger, level: int = None
+ ) -> Union[Logger, LoggerAdapter]:
+     if level is None:
+         level = logging.getLevelName(DELTACAT_APP_LOG_LEVEL)
+
     return _configure_logger(
         logger,
-         DELTACAT_APP_LOG_LEVEL,
+         level,
         DELTACAT_APP_LOG_DIR,
         DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
         DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
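Note: both configure_* helpers now accept an optional integer level and always wrap the logger in DeltaCATLoggerAdapter; the Ray-specific adapter is gone, and Ray runtime context is attached by JsonFormatter instead. A short usage sketch of the new signatures (logger names are placeholders):

    import logging

    from deltacat import logs

    # Default: level resolved from DELTACAT_APP_LOG_LEVEL.
    app_logger = logs.configure_application_logger(logging.getLogger(__name__))

    # Explicit integer level, matching the new signature.
    debug_logger = logs.configure_deltacat_logger(
        logging.getLogger("my.module"), logging.DEBUG
    )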
@@ -0,0 +1,12 @@
+ import unittest
+ from deltacat.aws.s3u import UuidBlockWritePathProvider, CapturedBlockWritePaths
+
+
+ class TestUuidBlockWritePathProvider(unittest.TestCase):
+     def test_uuid_block_write_provider_sanity(self):
+         capture_object = CapturedBlockWritePaths()
+         provider = UuidBlockWritePathProvider(capture_object=capture_object)
+
+         result = provider("base_path")
+
+         self.assertRegex(result, r"^base_path/[\w-]{36}$")