matrice-compute 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,490 @@
1
+ """
2
+ Compute Operations Handler - Kafka Event-Driven Operations Manager
3
+
4
+ This module handles compute instance operations (start/stop/restart) triggered from
5
+ the frontend dashboard via Kafka events. It consumes events from the 'compute_operations'
6
+ topic and performs the actual operations on compute instances and their actions.
7
+
8
+ Uses EventListener from matrice_common for simplified Kafka consumption.
9
+
10
+ Event Structure:
11
+ {
12
+ "instance_id": "string",
13
+ "action_record_id": "string", # Can be ObjectID("000000000000000000000000") or all zeros for instance-level operations
14
+ "operation": "start|stop|restart",
15
+ "account_number": 12345,
16
+ "requested_by": "user@example.com",
17
+ "request_id": "uuid-string",
18
+ "timestamp": "2025-11-21T10:30:00.123Z"
19
+ }
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ import time
25
+ from typing import Dict, Any, Optional
26
+ import sys
27
+ import traceback
28
+ import os
29
+ import subprocess
30
+
31
+ from matrice_common.stream.event_listener import EventListener
32
+
33
# Module-level logger used by every method of the handler below.
logger = logging.getLogger(__name__)


class ComputeOperationsHandler:
    """
    Handle Kafka-delivered compute operations for this instance and its actions.

    An EventListener (from matrice_common) is subscribed to the
    'compute_operations' topic with a filter on this instance's ID. Each
    event is validated and then dispatched either to an instance-level
    handler (stop/restart of the current Python process) or to an
    action-level handler that delegates to the ActionsManager. Action-level
    results are reported through ``scaling.update_action_status``.
    """

    KAFKA_TOPIC = "compute_operations"

    # Values accepted in the "operation" field of an event.
    VALID_OPERATIONS = ("start", "stop", "restart")

    # Keys every event must carry to be processed.
    REQUIRED_FIELDS = (
        "instance_id",
        "action_record_id",
        "operation",
        "account_number",
        "requested_by",
        "request_id",
        "timestamp",
    )

    # Matches ObjectID("...") / ObjectID('...') wrappers. Case-insensitive so
    # that the "ObjectId(...)" spelling is unwrapped as well.
    _OBJECT_ID_PATTERN = re.compile(
        r'ObjectID\(["\']([^"\']+)["\']\)', re.IGNORECASE
    )

    def __init__(self, actions_manager, session, scaling, instance_id: str):
        """
        Initialize the Compute Operations Handler.

        Args:
            actions_manager: Reference to the ActionsManager instance
            session: Session object handed to the EventListener
            scaling: Scaling service instance used for API status updates
            instance_id: This compute instance's ID, used to filter events
        """
        self.actions_manager = actions_manager
        self.session = session
        self.scaling = scaling
        self.instance_id = instance_id
        # Created lazily in start(); None until then.
        self.event_listener: Optional["EventListener"] = None
        self.running = False

        logger.info(f"Initializing ComputeOperationsHandler for instance ID: {instance_id}")

    def start(self) -> bool:
        """
        Start the operations handler using EventListener.

        Returns:
            bool: True if the listener reported a successful start. False if
            the handler was already running, the listener failed to start,
            or an exception was raised while creating/starting it.
        """
        if self.running:
            logger.warning("ComputeOperationsHandler is already running")
            return False

        try:
            self.event_listener = EventListener(
                session=self.session,
                topics=[self.KAFKA_TOPIC],
                event_handler=self._handle_operation_event,
                filter_field='instance_id',
                filter_value=self.instance_id,
                consumer_group_id=f"compute_ops_{self.instance_id}"
            )
            # Coerce so that `running` and the return value are real booleans
            # even if the listener returns a truthy/falsy non-bool.
            self.running = bool(self.event_listener.start())
        except Exception as e:
            logger.exception(f"Failed to start ComputeOperationsHandler: {e}")
            return False

        if self.running:
            logger.info("ComputeOperationsHandler started successfully")
        else:
            logger.error("ComputeOperationsHandler failed to start")

        return self.running

    def stop(self):
        """
        Stop the operations handler: clear the running flag and stop the
        EventListener if one was created.
        """
        logger.info("Stopping ComputeOperationsHandler...")
        self.running = False

        if self.event_listener:
            self.event_listener.stop()

        logger.info("ComputeOperationsHandler stopped")

    def _handle_operation_event(self, event: Dict[str, Any]):
        """
        Handle an incoming operation event from Kafka.

        This is the callback passed to EventListener. Invalid events are
        logged and dropped; valid ones are processed synchronously.

        Args:
            event: The operation event dictionary
        """
        logger.info(f"Received operation event: {event}")

        if not self._validate_event(event):
            logger.error(f"Invalid event structure: {event}")
            return

        self._process_operation(event)

    def _is_instance_level_operation(self, action_record_id: str) -> bool:
        """
        Check if action_record_id represents an instance-level operation.

        Instance-level operations are identified by an ID made up solely of
        zeros (any length), either bare or wrapped in ObjectID("...") /
        ObjectID('...').

        Args:
            action_record_id: The action record ID to check

        Returns:
            True if this is an instance-level operation, False otherwise
            (including for empty or non-string values)
        """
        if not action_record_id or not isinstance(action_record_id, str):
            return False

        clean_id = self._extract_action_record_id(action_record_id)

        # Stripping every '0' leaves nothing only when the ID is all zeros.
        return clean_id.strip("0") == ""

    def _extract_action_record_id(self, action_record_id: str) -> str:
        """
        Extract the actual action record ID from its possible formats.

        Args:
            action_record_id: The raw action record ID (may be wrapped in ObjectID)

        Returns:
            The ID found inside an ObjectID("...") wrapper when present,
            otherwise the input unchanged (empty/non-string values are
            returned as-is).
        """
        if not action_record_id or not isinstance(action_record_id, str):
            return action_record_id

        match = self._OBJECT_ID_PATTERN.search(action_record_id)
        return match.group(1) if match else action_record_id

    def _validate_event(self, event: Dict[str, Any]) -> bool:
        """
        Validate that the event is a dictionary with all required fields,
        a supported operation and a usable action_record_id.

        Args:
            event: The event dictionary to validate

        Returns:
            True if event is valid, False otherwise
        """
        if not isinstance(event, dict):
            logger.error(f"Event must be a dictionary, got {type(event).__name__}")
            return False

        for field in self.REQUIRED_FIELDS:
            if field not in event:
                logger.error(f"Missing required field: {field}")
                return False

        valid_operations = list(self.VALID_OPERATIONS)
        if event["operation"] not in valid_operations:
            logger.error(f"Invalid operation: {event['operation']}. Must be one of {valid_operations}")
            return False

        # The ID is later parsed with string operations and used as a lookup
        # key, so anything but a non-empty string cannot be processed.
        action_record_id = event["action_record_id"]
        if not isinstance(action_record_id, str) or not action_record_id:
            logger.error(f"Invalid action_record_id: {action_record_id!r}. Must be a non-empty string")
            return False

        return True

    def _process_operation(self, event: Dict[str, Any]):
        """
        Process a validated compute operation event and report its outcome.

        Args:
            event: The operation event dictionary
        """
        operation = event["operation"]
        raw_action_record_id = event["action_record_id"]
        action_record_id = self._extract_action_record_id(raw_action_record_id)
        request_id = event["request_id"]
        requested_by = event["requested_by"]

        logger.info(f"Processing {operation} operation for action {action_record_id} "
                    f"(request: {request_id}, user: {requested_by})")

        try:
            # An all-zero action_record_id targets the instance itself.
            if self._is_instance_level_operation(raw_action_record_id):
                result = self._handle_instance_operation(operation, event)
            else:
                result = self._handle_action_operation(operation, action_record_id, event)
        except Exception as e:
            error_msg = f"Operation failed: {str(e)}"
            logger.exception(error_msg)
            self._update_operation_status(event, action_record_id, "failed", {"error": error_msg})
        else:
            # Reported outside the try so a problem while reporting cannot
            # relabel a successful operation as failed.
            self._update_operation_status(event, action_record_id, "completed", result)

    def _handle_action_operation(self, operation: str, action_record_id: str,
                                 event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Dispatch an operation on a specific action.

        Args:
            operation: The operation type (start/stop/restart)
            action_record_id: The action record ID to operate on
            event: The full event dictionary

        Returns:
            Result dictionary with operation details

        Raises:
            ValueError: If the operation is not start, stop or restart
        """
        if operation == "start":
            return self._start_action(action_record_id, event)
        if operation == "stop":
            return self._stop_action(action_record_id, event)
        if operation == "restart":
            return self._restart_action(action_record_id, event)
        raise ValueError(f"Unknown operation: {operation}")

    def _handle_instance_operation(self, operation: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle operations on the entire instance (the Python application itself).

        'stop' terminates the current process with os._exit(0); 'restart'
        re-executes it via _restart_application(); 'start' is reported as
        not supported.

        NOTE(review): stop/restart end the process from inside the event
        callback without stopping the EventListener. Whether the triggering
        event's offset is committed before that (and so not redelivered
        after a restart) depends on EventListener - confirm.

        Args:
            operation: The operation type (start/stop/restart)
            event: The full event dictionary

        Returns:
            Result dictionary with operation details (does not return for
            stop/restart, since the process is killed or replaced)
        """
        logger.info(f"Executing instance-level {operation} operation on Python application")

        if operation == "stop":
            logger.critical("Instance-level STOP: Killing Python application process")
            try:
                logger.warning(
                    f"Operation {operation} on instance {self.instance_id}: "
                    f"completed - killing_application (PID: {os.getpid()})"
                )
                # Give a moment for logs to be written
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to log status before kill: {e}")

            logger.critical(f"Terminating Python application (PID: {os.getpid()})")
            # os._exit() skips cleanup handlers and does not flush buffers,
            # so push pending output out explicitly first.
            self._flush_output()
            os._exit(0)

        elif operation == "restart":
            logger.critical("Instance-level RESTART: Restarting Python application process")
            try:
                logger.warning(
                    f"Operation {operation} on instance {self.instance_id}: "
                    f"completed - restarting_application (PID: {os.getpid()})"
                )
                # Give a moment for logs to be written
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to log status before restart: {e}")

            logger.critical(f"Restarting Python application (PID: {os.getpid()})")
            self._restart_application()

        elif operation == "start":
            # The process handling this event is necessarily already running.
            logger.warning("Start operation not supported at instance level")
            return {
                "operation": operation,
                "instance_level": True,
                "status": "not_supported",
                "message": "Start operation is not supported at instance level"
            }

        # This should not be reached for stop/restart operations
        return {
            "operation": operation,
            "instance_level": True,
            "status": "completed"
        }

    @staticmethod
    def _flush_output() -> None:
        """
        Best-effort flush of logging handlers and stdio.

        Called right before os.execv()/os._exit(), neither of which flushes
        buffered output. Errors are deliberately ignored: failing to flush
        must never prevent the requested stop/restart.
        """
        current = logger
        while current is not None:
            for handler in current.handlers:
                try:
                    handler.flush()
                except Exception:
                    pass
            # Follow the same chain a log record would propagate along.
            current = current.parent if current.propagate else None

        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except Exception:
                pass

    def _restart_application(self):
        """
        Restart the Python application by replacing the current process.

        Uses os.execv() with sys.executable and sys.argv. If that raises,
        falls back to spawning a new process with subprocess.Popen and
        exiting the current one (status 0); if the fallback also fails the
        process exits with status 1. This method does not return.
        """
        python_executable = sys.executable
        script_args = sys.argv

        try:
            logger.info(f"Restarting with: {python_executable} {' '.join(script_args)}")

            # execv replaces the process image immediately without flushing
            # open file objects, so flush pending output first.
            self._flush_output()
            os.execv(python_executable, [python_executable] + script_args)

        except Exception as e:
            logger.exception(f"Failed to restart application: {e}")
            # Fallback: start a new process and exit this one
            try:
                logger.info("Attempting fallback restart method")

                subprocess.Popen([python_executable] + script_args)
                logger.critical("New process started, exiting current process")
                self._flush_output()
                os._exit(0)
            except Exception as fallback_error:
                logger.exception(f"Fallback restart also failed: {fallback_error}")
                # Last resort: just exit
                self._flush_output()
                os._exit(1)

    def _start_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Start a specific action.

        If the action is already tracked by the ActionsManager and running,
        nothing is started. Otherwise a fetch is triggered on the
        ActionsManager.

        NOTE(review): "started" is returned as soon as fetch_actions() has
        been called; this method does not verify that the action actually
        began running - presumably the ActionsManager's normal flow picks
        it up; confirm.

        Args:
            action_record_id: The action record ID to start
            event: The full event dictionary (unused here)

        Returns:
            Result dictionary with "status" ("already_running" or
            "started") and "action_id"
        """
        logger.info(f"Starting action: {action_record_id}")

        current_actions = self.actions_manager.get_current_actions()
        if action_record_id in current_actions:
            action_instance = current_actions[action_record_id]
            if action_instance.is_running():
                logger.warning(f"Action {action_record_id} is already running")
                return {
                    "status": "already_running",
                    "action_id": action_record_id
                }

        # Force a fetch so the ActionsManager can pick up this action.
        self.actions_manager.fetch_actions()

        return {
            "status": "started",
            "action_id": action_record_id
        }

    def _stop_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Stop a specific action via ActionsManager.stop_action().

        Args:
            action_record_id: The action record ID to stop
            event: The full event dictionary (unused here)

        Returns:
            Result dictionary with "status", "action_id" and the
            ActionsManager's return value under "details"
        """
        logger.info(f"Stopping action: {action_record_id}")

        result = self.actions_manager.stop_action(action_record_id)

        return {
            "status": "stopped",
            "action_id": action_record_id,
            "details": result
        }

    def _restart_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Restart a specific action via ActionsManager.restart_action().

        Args:
            action_record_id: The action record ID to restart
            event: The full event dictionary (unused here)

        Returns:
            Result dictionary with "status", "action_id" and the
            ActionsManager's return value under "details"
        """
        logger.info(f"Restarting action: {action_record_id}")

        result = self.actions_manager.restart_action(action_record_id)

        return {
            "status": "restarted",
            "action_id": action_record_id,
            "details": result
        }

    def _update_operation_status(self, event: Dict[str, Any], action_record_id: str,
                                 status: str, result: Dict[str, Any]):
        """
        Log the operation outcome and, for action-level operations, report
        it through ``scaling.update_action_status``.

        API failures are logged and swallowed so that status reporting never
        raises into the event callback.

        Args:
            event: The original event
            action_record_id: The extracted action record ID
            status: Operation status (completed/failed)
            result: Result details
        """
        operation = event["operation"]
        request_id = event["request_id"]

        # Logged at warning level for visibility
        logger.warning(
            f"Operation {operation} on {action_record_id}: {status} - "
            f"request_id={request_id}, result={result}"
        )

        # Instance-level operations have no action record to update.
        if self._is_instance_level_operation(event["action_record_id"]):
            return

        # The action is reported as running only after a successful start or
        # restart; a stop, or any failed operation, reports False.
        is_running = status == "completed" and operation in ("start", "restart")

        try:
            self.scaling.update_action_status(
                service_provider=os.environ.get("SERVICE_PROVIDER", ""),
                action_record_id=action_record_id,
                status=status,
                isRunning=is_running,
            )
            logger.info(f"API status updated for action {action_record_id}: {status}")
        except Exception as e:
            logger.error(f"Failed to update API status for action {action_record_id}: {e}")