deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/autoscaler/events/compaction/utils.py
@@ -1,216 +0,0 @@
- import argparse
- import base64
- import gzip
- import json
- import logging
- import pathlib
- from io import BytesIO
-
- import math
- import yaml
-
- from typing import Dict, Any, List, Tuple, Optional, Set, Union, TextIO
-
- from deltacat.autoscaler.events.compaction.cluster import ClusterSizeSuggester
- from deltacat.autoscaler.events.compaction.collections.partition_key_value import PartitionKeyValues, PartitionKeyValue
- from deltacat.autoscaler.events.compaction.input import CompactionInput
- from deltacat.autoscaler.events.session_manager import SessionManager, SESSION_ID_KEY
- from deltacat.compute.compactor.utils import round_completion_file as rcf
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.storage import interface as dcs
- from deltacat import ContentType, logs, SortKey
- from deltacat.compute.compactor import RoundCompletionInfo, compaction_session, PrimaryKeyIndexMeta, \
-     PrimaryKeyIndexLocator
- from deltacat.storage import PartitionLocator
-
- _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"
-
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- def read_latest_round_completion_file(source_partition_locator,
-                                       compacted_partition_locator,
-                                       compaction_artifact_s3_bucket,
-                                       primary_keys,
-                                       sort_keys: List[SortKey] = None):
-     if sort_keys is None:
-         sort_keys = []
-     # get the root path of a compatible primary key index for this round
-     compatible_primary_key_index_meta = PrimaryKeyIndexMeta.of(
-         compacted_partition_locator,
-         primary_keys,
-         sort_keys,
-         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
-     )
-     compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         compatible_primary_key_index_meta)
-     compatible_primary_key_index_root_path = \
-         compatible_primary_key_index_locator.primary_key_index_root_path
-
-     # read the results from any previously completed compaction round that used
-     # a compatible primary key index
-     round_completion_info = rcf.read_round_completion_file(
-         compaction_artifact_s3_bucket,
-         source_partition_locator,
-         compatible_primary_key_index_root_path,
-     )
-     return round_completion_info
-
-
- def build_partition_locator(partition_data: Dict[str, Any]):
-     return PartitionLocator.at(
-         partition_data["owner"],
-         partition_data["name"],
-         partition_data.get("tableVersion"),
-         partition_data["streamUUID"],
-         None, # storage type
-         partition_data["partition_values"], # partition values
-         None # partition ID
-     )
-
-
- def compact(compaction_input: CompactionInput,
-             hash_bucket_count: Optional[int] = None,
-             deltacat_storage=dcs,
-             **kwargs):
-     compaction_session.compact_partition(
-         compaction_input.source_partition_locator,
-         compaction_input.compacted_partition_locator,
-         set(compaction_input.primary_keys),
-         compaction_input.compaction_artifact_s3_bucket,
-         compaction_input.last_stream_position_to_compact,
-         schema_on_read=compaction_input.schema_on_read,
-         input_deltas_stats=compaction_input.input_deltas_stats,
-         hash_bucket_count=hash_bucket_count if hash_bucket_count else compaction_input.hash_bucket_count,
-         deltacat_storage=deltacat_storage,
-         **kwargs
-     )
-
-
- def calc_new_hash_bucket_count(cluster_memory_bytes: int,
-                                max_memory_per_vcpu: int,
-                                vcpu_per_node: int):
-     new_hash_bucket_count = max(
-         math.ceil(cluster_memory_bytes / max_memory_per_vcpu),
-         min(vcpu_per_node, 256) # Do not exceed 256 CPUs as a safety measure
-     )
-
-     return new_hash_bucket_count
-
-
- def get_round_completion_file(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str
- ):
-     return read_latest_round_completion_file(source_partition_locator,
-                                              compacted_partition_locator,
-                                              compaction_artifact_s3_bucket,
-                                              sorted(primary_keys))
-
-
- def calc_compaction_cluster_memory_bytes(compaction_input: CompactionInput,
-                                          new_uncompacted_deltas_bytes: int = 0) -> int:
-     round_completion_file = get_round_completion_file(compaction_input.source_partition_locator,
-                                                       compaction_input.compacted_partition_locator,
-                                                       compaction_input.primary_keys,
-                                                       compaction_input.compaction_artifact_s3_bucket)
-     if round_completion_file is None:
-         # if no previous compaction rounds exist, use the incoming delta size as a place to start for calculations
-         est_incoming_delta_size = new_uncompacted_deltas_bytes * 1.3
-         logger.warning(f"No previous round completion file found for {compaction_input}."
-                        f"Using estimates: {est_incoming_delta_size}")
-         return int(est_incoming_delta_size)
-
-     old_num_records = round_completion_file.compacted_pyarrow_write_result.records
-     sort_keys_bit_width = round_completion_file.sort_keys_bit_width
-     pk_index_row_size = 32 + math.ceil(sort_keys_bit_width / 8)
-     cluster_memory_bytes = max(
-         old_num_records * pk_index_row_size * 1.3, # object store memory (hash bucketing)
-         round_completion_file.compacted_pyarrow_write_result.pyarrow_bytes + new_uncompacted_deltas_bytes # dedupe
-     )
-     return int(cluster_memory_bytes)
-
-
- def get_compaction_size_inputs(config: Dict[str, Any],
-                                partition_key_values: PartitionKeyValues,
-                                cluster_memory_bytes: int,
-                                stats_metadata: Dict[int, DeltaStats] = None,
-                                parent_session_id: str = None,
-                                session_id: str = None) -> Tuple[int, TextIO]:
-     suggester = ClusterSizeSuggester(cluster_memory_bytes=cluster_memory_bytes)
-     new_hash_bucket_count = calc_new_hash_bucket_count(cluster_memory_bytes,
-                                                        suggester.get_max_memory_per_vcpu(),
-                                                        suggester.get_num_vcpu_per_node())
-     cluster_cpus = max(
-         new_hash_bucket_count,
-         suggester.get_suggested_vcpu_count()
-     )
-     cluster_nodes = int(math.ceil(cluster_cpus / suggester.get_num_vcpu_per_node()))
-     yaml_file = generate_compaction_session_yaml(config,
-                                                  partition_key_values,
-                                                  worker_node_count=cluster_nodes,
-                                                  instance_type=suggester.instance_type,
-                                                  stats_metadata=stats_metadata,
-                                                  parent_session_id=parent_session_id,
-                                                  session_id=session_id)
-     return new_hash_bucket_count, yaml_file
-
-
- def generate_compaction_session_yaml(config: Dict[str, Any],
-                                      partition_key_values: PartitionKeyValues,
-                                      head_node_count: int = 0,
-                                      worker_node_count: int = 0,
-                                      stats_metadata: Dict[int, DeltaStats] = None,
-                                      instance_type: str = None,
-                                      parent_session_id: str = None,
-                                      session_id: str = None) -> TextIO:
-     # TODO: Remove this workaround when custom AMIs are built with baked-in build files (i.e. wheels, jars)
-     new_config = {**config}
-     for local_path, _ in new_config["file_mounts"].items():
-         new_config["file_mounts"][local_path] = local_path
-     pkv_id = partition_key_values.id
-     new_filename = f"compact.{pkv_id}.yaml"
-     new_config["cluster_name"] = f"compaction-session-{pkv_id}"
-     # Allow child clusters to re-use the same SSH key provided from the parent cluster
-     new_config["auth"]["ssh_private_key"] = f"~/ray_bootstrap_key.pem"
-     new_config["file_mounts"] = {
-         **config["file_mounts"],
-         f"~/{new_filename}": f"~/{new_filename}"
-     }
-     new_config["provider"]["use_internal_ips"] = True
-     new_config["max_workers"] = worker_node_count
-     # TODO: Determine optimal object store memory / worker heap memory allocation ratios?
-     new_config["available_node_types"]["ray.worker.default"]["min_workers"] = \
-         new_config["available_node_types"]["ray.worker.default"]["max_workers"] = worker_node_count
-     new_config["available_node_types"]["ray.worker.default"]["node_config"]["InstanceType"] = instance_type
-     new_config["available_node_types"]["ray.head.default"]["node_config"]["InstanceType"] = instance_type
-
-     # TODO: Formalize supported parameter key/values after initial shadow compaction
-     new_events = {
-         **config["events"],
-         "parameters": {
-             **config["events"]["parameters"],
-             SESSION_ID_KEY: session_id,
-         },
-         "metadata": {
-             "partitionKeyValues": compress(partition_key_values).decode('utf-8')
-         }
-     }
-     if stats_metadata:
-         new_events["metadata"]["statsMetadata"] = compress(stats_metadata).decode('utf-8')
-     new_config["events"] = new_events
-
-     with open(new_filename, "w") as yaml_file:
-         yaml.dump(new_config, yaml_file, default_flow_style=False)
-     return yaml_file
-
- def compress(serializable_obj: Union[Dict, Tuple, List]) -> bytes:
-     json_dict = json.dumps(serializable_obj)
-     out = BytesIO()
-     with gzip.open(out, "wt", encoding="utf-8") as zipfile:
-         zipfile.write(json_dict)
-
-     return base64.b64encode(out.getvalue())
deltacat/autoscaler/events/compaction/workflow.py
@@ -1,303 +0,0 @@
- import json
- import logging
- import uuid
- from typing import List, Union, Dict, Set, Any, Optional
-
- from deltacat import logs
- from deltacat.autoscaler.events.compaction.input import CompactionInput
- from deltacat.autoscaler.events.compaction.process import CompactionProcess
- from deltacat.autoscaler.events.compaction.utils import calc_compaction_cluster_memory_bytes, get_compaction_size_inputs
- from deltacat.autoscaler.events.event_store import EventStoreClient
- from deltacat.autoscaler.events.exceptions import WorkflowException
- from deltacat.autoscaler.events.session_manager import PARENT_SESSION_ID_KEY, SESSION_ID_KEY
- from deltacat.autoscaler.events.workflow import EventWorkflow, StateTransitionMap
- from deltacat.autoscaler.events.compaction.dispatcher import CompactionEventDispatcher
- from deltacat.autoscaler.events.states import ScriptStartedEvent, ScriptInProgressEvent, \
-     ScriptCompletedEvent, States, ScriptInProgressCustomEvent, RayJobRequestEvent
- from deltacat.storage import PartitionLocator
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- NEW_REQUEST = "NEW"
- STATS_METADATA_COLLECTION_STARTED = "STATS_METADATA_COLLECTION_STARTED"
- STATS_METADATA_COLLECTION_COMPLETED = "STATS_METADATA_COLLECTION_COMPLETED"
- COMPACTION_SESSION_STARTED = "COMPACTION_SESSION_STARTED"
- COMPACTION_SESSION_PARTITIONS_COMPACTED = "COMPACTION_SESSION_PARTITIONS_COMPACTED"
- COMPACTION_SESSION_PARTITIONS_FAILURE = "COMPACTION_SESSION_PARTITIONS_FAILURE"
- COMPACTION_SESSION_PROGRESS_UPDATE = "COMPACTION_SESSION_PROGRESS_UPDATE"
- COMPACTION_SESSION_COMPLETED = "COMPACTION_SESSION_COMPLETED"
- COMPACTION_METADATA_SESSION_WORKFLOW_FAILURE = "COMPACTION_METADATA_SESSION_WORKFLOW_FAILURE"
-
-
- # TODO: Might be worth renaming this later to avoid confusion with Ray Workflows
- class CompactionWorkflow(EventWorkflow):
-
-     def __init__(self,
-                  config: Dict[str, Any],
-                  event_dispatcher: CompactionEventDispatcher,
-                  event_store: EventStoreClient = None,
-                  compaction_inputs: List[CompactionInput] = None):
-         self.config = config
-         self.event_dispatcher = event_dispatcher
-         assert self.event_dispatcher is not None, f"Event dispatcher must be provided to build and transition " \
-                                                   f"to different job states."
-         self.event_store = event_store
-         self._compaction_inputs = compaction_inputs
-
-         # Initialization
-         self._add_state_handlers()
-         self.state_transitions = self._build_state_transitions()
-         self._metastats = {}
-
-     @property
-     def state_transition_map(self) -> StateTransitionMap:
-         return self.state_transitions
-
-     def _add_state_handlers(self):
-         in_progress_custom_events = [
-             ScriptInProgressCustomEvent(COMPACTION_METADATA_SESSION_WORKFLOW_FAILURE, 0),
-             ScriptInProgressCustomEvent(STATS_METADATA_COLLECTION_STARTED, 1),
-             ScriptInProgressCustomEvent(STATS_METADATA_COLLECTION_COMPLETED, 2),
-             ScriptInProgressCustomEvent(COMPACTION_SESSION_STARTED, 3),
-             ScriptInProgressCustomEvent(COMPACTION_SESSION_PARTITIONS_COMPACTED, 4),
-             ScriptInProgressCustomEvent(COMPACTION_SESSION_PROGRESS_UPDATE, 5),
-             ScriptInProgressCustomEvent(COMPACTION_SESSION_COMPLETED, 6),
-             ScriptInProgressCustomEvent(COMPACTION_SESSION_PARTITIONS_FAILURE, 7),
-         ]
-
-         self._event_map = event_map = {event.name: event for event in in_progress_custom_events}
-         self.event_dispatcher.add_event_handlers([val for _, val in event_map.items()])
-
-         # This callback is added for tracking new child jobs to be launched
-         self.event_dispatcher.add_event_handlers([RayJobRequestEvent.new_request_delivered])
-
-     def _build_state_transitions(self) -> StateTransitionMap:
-         """Builds a mapping of event states to state transitioning callbacks, or
-         a dictionary of state transitioning callbacks.
-
-         If an event has state sequences, a dictionary of callbacks is provided
-         with sequences as keys and callback functions as values.
-
-         Returns: a map of event states to callbacks or a dictionary of callbacks
-         """
-         init_sequence = 0
-         in_progress_sequence = {name: event.state_sequence for name, event in self._event_map.items()}
-         return {
-             States.IN_PROGRESS.name: {
-                 init_sequence: self.begin_stats_metadata_collection,
-                 in_progress_sequence[STATS_METADATA_COLLECTION_COMPLETED]: self.begin_compaction,
-                 in_progress_sequence[COMPACTION_SESSION_STARTED]: self.wait_or_mark_compaction_complete,
-                 in_progress_sequence[COMPACTION_SESSION_PARTITIONS_COMPACTED]: self.wait_or_mark_compaction_complete,
-                 in_progress_sequence[COMPACTION_SESSION_PROGRESS_UPDATE]: self.wait_or_mark_compaction_complete,
-                 in_progress_sequence[COMPACTION_SESSION_PARTITIONS_FAILURE]: self.wait_or_mark_compaction_complete,
-                 in_progress_sequence[COMPACTION_SESSION_COMPLETED]: self.complete_job
-             },
-         }
-
-     def register_compaction_inputs(self, compaction_inputs: List[CompactionInput]):
-         """Extracts and registers a set of partition IDs that need compaction from the compaction inputs.
-         """
-         self._compaction_inputs = compaction_inputs
-         compaction_source_partition_locators = [task.source_partition_locator for task in self._compaction_inputs]
-         compaction_source_partition_ids = [loc.partition_id for loc in compaction_source_partition_locators]
-         self._partition_ids_to_compact = set(compaction_source_partition_ids)
-
-     def start_workflow(self):
-         """Publish a job state event that indicates that a request to start a job run has been successfully received,
-         but the job run has not yet finished prerequisite initialization steps.
-         """
-         self.event_dispatcher.dispatch_event(ScriptInProgressEvent.in_progress)
-
-     def begin_stats_metadata_collection(self):
-         """Publish a job state event that indicates that stats metadata collection has started.
-         """
-         event = self._event_map[STATS_METADATA_COLLECTION_STARTED]
-         self.event_dispatcher.dispatch_event(event,
-                                              event_data={
-                                                  "eventName": event.name,
-                                                  "stateDetailDescription": "Running stats metadata session",
-                                              })
-         if self.session_manager:
-             self._metastats = self.session_manager.launch_stats_metadata_collection(
-                 [compact.source_partition_locator for compact in self._compaction_inputs]
-             )
-         self.stats_metadata_collection_completed()
-
-     def stats_metadata_collection_completed(self):
-         """Publish a job state event that indicates that stats metadata collection is complete.
-         """
-         event = self._event_map[STATS_METADATA_COLLECTION_COMPLETED]
-         self.event_dispatcher.dispatch_event(event,
-                                              event_data={
-                                                  "eventName": event.name,
-                                                  "stateDetailDescription": "Finished collecting stats metadata",
-                                              })
-
-     def begin_compaction(self):
-         """Publish a job state event that indicates that the compaction run has started.
-         """
-         event = self._event_map[COMPACTION_SESSION_STARTED]
-         self.event_dispatcher.dispatch_event(event,
-                                              event_data={
-                                                  "eventName": event.name,
-                                                  "stateDetailDescription": "Running compaction session",
-                                              })
-         if self.session_manager:
-             processes = self.build_compaction_processes()
-             self.session_manager.launch_compaction(processes)
-
-     def build_compaction_processes(self) -> List[CompactionProcess]:
-         processes = []
-         partition_stats_metadata = self._metastats
-         for compaction_input in self._compaction_inputs:
-             stats_metadata = partition_stats_metadata.get(compaction_input.source_partition_locator.partition_id, {})
-             stats_metadata = {stream_pos: delta_stats for stream_pos, delta_stats in stats_metadata.items()
-                               if stream_pos <= compaction_input.last_stream_position_to_compact}
-             total_pyarrow_table_bytes = sum([stats_result.stats.pyarrow_table_bytes
-                                              for stream_pos, stats_result in stats_metadata.items()
-                                              if stats_result.stats is not None])
-             cluster_memory_bytes = calc_compaction_cluster_memory_bytes(compaction_input, total_pyarrow_table_bytes)
-             new_session_id = str(uuid.uuid4())
-             self.event_dispatcher.dispatch_event(RayJobRequestEvent.new_request_delivered,
-                                                  event_data={
-                                                      PARENT_SESSION_ID_KEY: self.session_manager.session_id,
-                                                      SESSION_ID_KEY: new_session_id
-                                                  })
-             new_hash_bucket_count, yaml_file = get_compaction_size_inputs(self.config,
-                                                                           compaction_input.partition_key_values,
-                                                                           cluster_memory_bytes,
-                                                                           stats_metadata=stats_metadata,
-                                                                           parent_session_id=self.session_manager.session_id,
-                                                                           session_id=new_session_id)
-             compaction_process = CompactionProcess(compaction_input.source_partition_locator,
-                                                    yaml_file.name,
-                                                    new_hash_bucket_count,
-                                                    compaction_input.last_stream_position_to_compact,
-                                                    compaction_input.partition_key_values,
-                                                    cluster_memory_bytes=cluster_memory_bytes,
-                                                    input_delta_total_bytes=total_pyarrow_table_bytes)
-
-             # TODO: Increase file descriptor limit on host (up to ~60k)
-             # TODO: Emit metrics for compaction jobs with very high number of partitions
-             processes.append(compaction_process)
-         return processes
-
-     def partitions_compacted(self,
-                              partition_locators: List[PartitionLocator]):
-         """Publish a job state event that indicates that a single partition has finished compaction.
-         A compaction session can have 1...N partitions to compact.
-         """
-         partition_completed_event = self._event_map[COMPACTION_SESSION_PARTITIONS_COMPACTED]
-         self.event_dispatcher.dispatch_event(partition_completed_event,
-                                              event_data={
-                                                  "eventName": partition_completed_event.name,
-                                                  "stateDetailDescription":
-                                                      f"Finished compaction on partitions: "
-                                                      f"{[pl.partition_id for pl in partition_locators]}",
-                                                  "stateDetailMetadata": {
-                                                      **{pl.partition_id: json.dumps(pl.partition_values)
-                                                         for pl in partition_locators}
-                                                  }
-                                              })
-
-     def partitions_compaction_failure(self,
-                                       partition_locators: List[PartitionLocator],
-                                       error_trace: Optional[str] = None):
-         """Publish a job state event that indicates failure to compact a list of partitions.
-         """
-         failed_event = self._event_map[COMPACTION_SESSION_PARTITIONS_FAILURE]
-         self.event_dispatcher.dispatch_event(failed_event,
-                                              event_data={
-                                                  "eventName": failed_event.name,
-                                                  "errorMessage":
-                                                      f"Failure to compact partitions: "
-                                                      f"{[pl.partition_id for pl in partition_locators]}",
-                                                  "errorStackTrace": error_trace,
-                                                  "stateDetailDescription":
-                                                      f"Failure to compact partitions: "
-                                                      f"{[pl.partition_id for pl in partition_locators]}",
-                                                  "stateDetailMetadata": {
-                                                      **{pl.partition_id: json.dumps(pl.partition_values)
-                                                         for pl in partition_locators}
-                                                  }
-                                              })
-
-     def update_compaction_job_progress(self,
-                                        partition_locator: PartitionLocator,
-                                        session_id: str):
-         """Dispatch a compaction job update event for a given session ID.
-
-         :param partition_locator: Locator for a partition
-         :param session_id: Session ID to dispatch the event for
-         :return:
-         """
-         progress_event = self._event_map[COMPACTION_SESSION_PROGRESS_UPDATE]
-         partition_id = partition_locator.partition_id
-         self.event_dispatcher.dispatch_event(progress_event,
-                                              event_data={
-                                                  PARENT_SESSION_ID_KEY: session_id,
-                                                  SESSION_ID_KEY: session_id,
-                                                  "eventName": progress_event.name,
-                                                  "stateDetailDescription": f"Compaction Update",
-                                                  "stateDetailMetadata": {
-                                                      partition_id: str(partition_locator.partition_values)
-                                                  }
-                                              })
-
-     def wait_or_mark_compaction_complete(self):
-         """Publish a job state event that indicates that the compaction run is complete.
-         """
-         if self.event_store is None or self._partition_ids_to_compact is None:
-             # TODO: Separate this workflow out into multiple workflows, for different applications
-             raise WorkflowException(f"Event store and partition IDs must be defined in a workflow."
-                                     f"Event store: {self.event_store}"
-                                     f"Partition IDs to compact: {self._partition_ids_to_compact}")
-
-         partition_ids_failed = set(self.event_store.get_failed_partition_ids(self.trace_id))
-         if len(partition_ids_failed) > 0:
-             raise WorkflowException(f"Compaction workflow failed due to partition errors: {partition_ids_failed}")
-
-
-         partition_ids_completed = set(self.event_store.get_compacted_partition_ids(self.trace_id))
-         if partition_ids_completed == self._partition_ids_to_compact:
-             logger.info(f"Compaction run complete.")
-             event = self._event_map[COMPACTION_SESSION_COMPLETED]
-             self.event_dispatcher.dispatch_event(event,
-                                                  event_data={
-                                                      "eventName": event.name,
-                                                      "stateDetailDescription": "Finished compaction run",
-                                                  })
-         else:
-             logger.info(f"Compaction is in progress: {len(partition_ids_completed)} "
-                         f"out of {len(self._partition_ids_to_compact)} partitions completed...")
-
-     def complete_job(self):
-         """Publish a job state event that indicates that the job run has completed.
-         """
-         self.event_dispatcher.dispatch_event(ScriptCompletedEvent.completed)
-
-     def workflow_failure(
-             self,
-             error_message: Optional[str] = None,
-             error_trace: Optional[str] = None):
-         """Publish a job state event that indicates failure to compact a list of partitions.
-         """
-         failed_workflow_event = self._event_map[COMPACTION_METADATA_SESSION_WORKFLOW_FAILURE]
-         self.event_dispatcher.dispatch_event(failed_workflow_event,
-                                              event_data={
-                                                  "eventName": failed_workflow_event.name,
-                                                  "errorMessage": error_message,
-                                                  "errorStackTrace": error_trace,
-                                                  "stateDetailDescription":
-                                                      f"Workflow encountered a failure.",
-                                                  "stateDetailStatus": "FAILED",
-                                              })
-
-     @property
-     def session_manager(self):
-         return self.event_dispatcher.session_manager
-
-     @property
-     def trace_id(self):
-         return self.event_dispatcher.events_publisher.event_base_params.get("traceId", "UNKNOWN_TRACE_ID")
deltacat/autoscaler/events/dispatcher.py
@@ -1,95 +0,0 @@
- import logging
- from typing import Dict, Any, Optional, Callable, List
-
- from deltacat.autoscaler.events.session_manager import SessionManager, SESSION_ID_KEY, PARENT_SESSION_ID_KEY
- from deltacat.autoscaler.events.states import event_enum_values
- from ray.autoscaler._private.event_system import RayEvent, EventPublisher
-
- from deltacat import logs
- from deltacat.storage import interface as unimplemented_deltacat_storage
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- class EventDispatcher:
-     def __init__(self,
-                  events_publisher: EventPublisher,
-                  deltacat_storage: unimplemented_deltacat_storage,
-                  session_manager: SessionManager = None):
-         """Constructor for the event dispatcher.
-
-         Intended for usage by Ray parent and child clusters running managed jobs.
-
-         Args:
-             events_publisher: Events manager for publishing events through a cloud provider
-             session_manager: Manager for tracking and launching Ray sessions
-             deltacat_storage: Storage interface for deltacat
-         """
-         self.events_publisher = events_publisher
-         self.deltacat_storage = deltacat_storage
-         self.session_manager = session_manager
-
-         # Setup event callbacks in the constructor
-         self._add_base_event_handlers()
-
-     def dispatch_event(self,
-                        event: RayEvent,
-                        event_data: Optional[Dict[str, Any]] = None):
-         """Generic helper method to dispatch Ray job events
-
-         Args:
-             event: Ray job event to dispatch
-             event_data: Additional metadata for the given event. Optional.
-
-         Returns:
-
-         """
-         if event_data is None:
-             event_data = {}
-
-         event_data["event_name"] = event
-         if self.session_manager:
-             event_data.setdefault(PARENT_SESSION_ID_KEY, self.session_manager.session_id)
-             event_data.setdefault(SESSION_ID_KEY, self.session_manager.session_id)
-
-         logger.info(f"Dispatching event {event.name} "
-                     f"with parent Ray session ID = {event_data[PARENT_SESSION_ID_KEY]} "
-                     f"and current Ray session ID = {event_data[SESSION_ID_KEY]}")
-
-         event_payload = {
-             **self.events_publisher.config["parameters"],
-             **event_data
-         }
-
-         # Trim un-required, space intensive data from payload
-         if "statsMetadata" in event_payload:
-             event_payload.pop("statsMetadata")
-         if "partitionsToCompact" in event_payload:
-             event_payload.pop("partitionsToCompact")
-
-         self._publish_event(event_payload)
-
-     def _add_base_event_handlers(self):
-         """Add callback handlers for base job events
-         """
-         publisher = self.events_publisher
-         if publisher:
-             for event in event_enum_values:
-                 logger.info(f"[{publisher.__class__.__name__}]: Adding callback for event {event.name}")
-                 publisher.add_callback(event)
-
-     def add_event_handlers(self, custom_events: List[RayEvent]):
-         """Add callback handlers for custom job events
-         """
-         publisher = self.events_publisher
-         if publisher:
-             for event in custom_events:
-                 logger.info(f"[{publisher.__class__.__name__}]: Adding callback for event {event.name}")
-                 publisher.add_callback(event)
-
-     def _publish_event(self, event_data: Dict[str, Any]):
-         publisher = self.events_publisher
-         if publisher and event_data and event_data.get("event_name"):
-             event: RayEvent = event_data["event_name"]
-             logger.info(f"[{publisher.__class__.__name__}]: Publishing event {event.name}")
-             publisher.publish(event, event_data)