deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/logs.py CHANGED
@@ -1,12 +1,18 @@
1
- import os
2
1
  import logging
2
+ import os
3
3
  import pathlib
4
- from logging import handlers, Logger, Handler, FileHandler
5
- from deltacat.constants import DELTACAT_LOG_LEVEL, APPLICATION_LOG_LEVEL
4
+ from logging import FileHandler, Handler, Logger, LoggerAdapter, handlers
5
+ from typing import Union
6
+
7
+ import ray
8
+ from ray.runtime_context import RuntimeContext
9
+
10
+ from deltacat.constants import APPLICATION_LOG_LEVEL, DELTACAT_LOG_LEVEL
6
11
 
7
12
  DEFAULT_LOG_LEVEL = "INFO"
8
- DEFAULT_LOG_FORMAT = \
9
- "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
13
+ DEFAULT_LOG_FORMAT = (
14
+ "%(asctime)s\t%(levelname)s pid=%(process)d %(filename)s:%(lineno)s -- %(message)s"
15
+ )
10
16
  DEFAULT_MAX_BYTES_PER_LOG = 2 ^ 20 * 256 # 256 MiB
11
17
  DEFAULT_BACKUP_COUNT = 0
12
18
 
@@ -19,9 +25,55 @@ DEFAULT_DELTACAT_LOG_BASE_FILE_NAME = "deltacat-python.info.log"
19
25
  DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME = "deltacat-python.debug.log"
20
26
 
21
27
 
22
- def _add_logger_handler(
23
- logger: Logger,
24
- handler: Handler) -> Logger:
28
+ class RayRuntimeContextLoggerAdapter(logging.LoggerAdapter):
29
+ """
30
+ Logger Adapter for injecting Ray Runtime Context into logging messages.
31
+ """
32
+
33
+ def __init__(self, logger: Logger, runtime_context: RuntimeContext):
34
+ super().__init__(logger, {})
35
+ self.runtime_context = runtime_context
36
+
37
+ def process(self, msg, kwargs):
38
+ """
39
+ Injects Ray Runtime Context details into each log message.
40
+
41
+ This may include information such as the raylet node ID, task/actor ID, job ID,
42
+ placement group ID of the worker, and assigned resources to the task/actor.
43
+
44
+ Args:
45
+ msg: The original log message
46
+ kwargs: Keyword arguments for the log message
47
+
48
+ Returns: A log message with Ray Runtime Context details
49
+
50
+ """
51
+ runtime_context_dict = self.runtime_context.get()
52
+ runtime_context_dict[
53
+ "worker_id"
54
+ ] = self.runtime_context.worker.core_worker.get_worker_id()
55
+ if self.runtime_context.get_task_id() or self.runtime_context.get_actor_id():
56
+ runtime_context_dict[
57
+ "pg_id"
58
+ ] = self.runtime_context.get_placement_group_id()
59
+ runtime_context_dict[
60
+ "assigned_resources"
61
+ ] = self.runtime_context.get_assigned_resources()
62
+
63
+ return "(ray_runtime_context=%s) -- %s" % (runtime_context_dict, msg), kwargs
64
+
65
+ def __reduce__(self):
66
+ """
67
+ Used to unpickle the class during Ray object store transfer.
68
+ """
69
+
70
+ def deserializer(*args):
71
+ return RayRuntimeContextLoggerAdapter(args[0], ray.get_runtime_context())
72
+
73
+ return deserializer, (self.logger,)
74
+
75
+
76
+ def _add_logger_handler(logger: Logger, handler: Handler) -> Logger:
25
77
 
26
78
  logger.setLevel(logging.getLevelName("DEBUG"))
27
79
  logger.addHandler(handler)
@@ -29,89 +81,89 @@ def _add_logger_handler(
29
81
 
30
82
 
31
83
  def _create_rotating_file_handler(
32
- log_directory: str,
33
- log_base_file_name: str,
34
- logging_level: str = DEFAULT_LOG_LEVEL,
35
- max_bytes_per_log_file: int = DEFAULT_MAX_BYTES_PER_LOG,
36
- backup_count: int = DEFAULT_BACKUP_COUNT,
37
- logging_format: str = DEFAULT_LOG_FORMAT) -> FileHandler:
84
+ log_directory: str,
85
+ log_base_file_name: str,
86
+ logging_level: str = DEFAULT_LOG_LEVEL,
87
+ max_bytes_per_log_file: int = DEFAULT_MAX_BYTES_PER_LOG,
88
+ backup_count: int = DEFAULT_BACKUP_COUNT,
89
+ logging_format: str = DEFAULT_LOG_FORMAT,
90
+ ) -> FileHandler:
38
91
 
39
92
  if type(logging_level) is str:
40
93
  logging_level = logging.getLevelName(logging_level.upper())
41
94
  assert log_base_file_name, "log file name is required"
42
95
  assert log_directory, "log directory is required"
43
-
44
96
  log_dir_path = pathlib.Path(log_directory)
45
97
  log_dir_path.mkdir(parents=True, exist_ok=True)
46
- handler = logging.handlers.RotatingFileHandler(
98
+ handler = handlers.RotatingFileHandler(
47
99
  os.path.join(log_directory, log_base_file_name),
48
100
  maxBytes=max_bytes_per_log_file,
49
- backupCount=backup_count)
101
+ backupCount=backup_count,
102
+ )
50
103
  handler.setFormatter(logging.Formatter(logging_format))
51
104
  handler.setLevel(logging_level)
52
105
  return handler
53
106
 
54
107
 
55
- def _file_handler_exists(
56
- logger: Logger,
57
- log_dir: str,
58
- log_base_file_name: str) -> bool:
108
+ def _file_handler_exists(logger: Logger, log_dir: str, log_base_file_name: str) -> bool:
59
109
 
60
110
  handler_exists = False
61
111
  base_file_path = os.path.join(log_dir, log_base_file_name)
62
112
  if len(logger.handlers) > 0:
63
113
  norm_base_file_path = os.path.normpath(base_file_path)
64
- handler_exists = any([
65
- isinstance(handler, logging.FileHandler)
66
- and os.path.normpath(handler.baseFilename) == norm_base_file_path
67
- for handler in logger.handlers
68
- ])
114
+ handler_exists = any(
115
+ [
116
+ isinstance(handler, logging.FileHandler)
117
+ and os.path.normpath(handler.baseFilename) == norm_base_file_path
118
+ for handler in logger.handlers
119
+ ]
120
+ )
69
121
  return handler_exists
70
122
 
71
123
 
72
124
  def _configure_logger(
73
- logger: Logger,
74
- log_level: str,
75
- log_dir: str,
76
- log_base_file_name: str,
77
- debug_log_base_file_name: str) -> Logger:
78
-
125
+ logger: Logger,
126
+ log_level: str,
127
+ log_dir: str,
128
+ log_base_file_name: str,
129
+ debug_log_base_file_name: str,
130
+ ) -> Union[Logger, LoggerAdapter]:
79
131
  primary_log_level = log_level
80
132
  logger.propagate = False
81
133
  if log_level.upper() == "DEBUG":
82
134
  if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
83
135
  handler = _create_rotating_file_handler(
84
- log_dir,
85
- debug_log_base_file_name,
86
- "DEBUG",
136
+ log_dir, debug_log_base_file_name, "DEBUG"
87
137
  )
88
138
  _add_logger_handler(logger, handler)
89
139
  primary_log_level = "INFO"
90
140
  if not _file_handler_exists(logger, log_dir, log_base_file_name):
91
141
  handler = _create_rotating_file_handler(
92
- log_dir,
93
- log_base_file_name,
94
- primary_log_level,
142
+ log_dir, log_base_file_name, primary_log_level
95
143
  )
96
144
  _add_logger_handler(logger, handler)
145
+ ray_runtime_ctx = ray.get_runtime_context()
146
+ if ray_runtime_ctx.worker.connected:
147
+ logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
148
+
97
149
  return logger
98
150
 
99
151
 
100
- def configure_deltacat_logger(logger: Logger) -> Logger:
152
+ def configure_deltacat_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
101
153
  return _configure_logger(
102
154
  logger,
103
155
  DELTACAT_LOG_LEVEL,
104
156
  DEFAULT_DELTACAT_LOG_DIR,
105
157
  DEFAULT_DELTACAT_LOG_BASE_FILE_NAME,
106
- DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME
158
+ DEFAULT_DEBUG_DELTACAT_LOG_BASE_FILE_NAME,
107
159
  )
108
160
 
109
161
 
110
- def configure_application_logger(logger: Logger) -> Logger:
162
+ def configure_application_logger(logger: Logger) -> Union[Logger, LoggerAdapter]:
111
163
  return _configure_logger(
112
164
  logger,
113
165
  APPLICATION_LOG_LEVEL,
114
166
  DEFAULT_APPLICATION_LOG_DIR,
115
167
  DEFAULT_APPLICATION_LOG_BASE_FILE_NAME,
116
- DEFAULT_DEBUG_APPLICATION_LOG_BASE_FILE_NAME
168
+ DEFAULT_DEBUG_APPLICATION_LOG_BASE_FILE_NAME,
117
169
  )
deltacat/storage/__init__.py CHANGED
@@ -1,17 +1,27 @@
1
- from deltacat.aws.redshift import Manifest, ManifestMeta, ManifestEntry, \
2
- ManifestAuthor, ManifestEntryList
1
+ from deltacat.aws.redshift import (
2
+ Manifest,
3
+ ManifestAuthor,
4
+ ManifestEntry,
5
+ ManifestEntryList,
6
+ ManifestMeta,
7
+ )
3
8
  from deltacat.storage.model.delta import Delta, DeltaLocator
4
- from deltacat.storage.model.partition import Partition, PartitionLocator
5
9
  from deltacat.storage.model.list_result import ListResult
6
10
  from deltacat.storage.model.locator import Locator
7
11
  from deltacat.storage.model.namespace import Namespace, NamespaceLocator
12
+ from deltacat.storage.model.partition import Partition, PartitionLocator
8
13
  from deltacat.storage.model.stream import Stream, StreamLocator
9
14
  from deltacat.storage.model.table import Table, TableLocator
10
- from deltacat.storage.model.table_version import TableVersion, \
11
- TableVersionLocator
12
- from deltacat.storage.model.types import CommitState, DeltaType, \
13
- LifecycleState, SchemaConsistencyType, LocalTable, LocalDataset, \
14
- DistributedDataset
15
+ from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
16
+ from deltacat.storage.model.types import (
17
+ CommitState,
18
+ DeltaType,
19
+ DistributedDataset,
20
+ LifecycleState,
21
+ LocalDataset,
22
+ LocalTable,
23
+ SchemaConsistencyType,
24
+ )
15
25
 
16
26
  __all__ = [
17
27
  "CommitState",