matrice-compute 0.1.25__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ class ActionsManager:
27
27
  scaling (Scaling): Scaling service instance
28
28
  """
29
29
  self.current_actions: dict[str, ActionInstance] = {}
30
+ self.stopped_actions: dict[str, ActionInstance] = {} # Track stopped actions separately
30
31
  self.scaling = scaling
31
32
  self.memory_threshold = 0.9
32
33
  self.poll_interval = 10
@@ -111,75 +112,110 @@ class ActionsManager:
111
112
def process_actions(self) -> None:
    """Start every fetched action that is not already tracked as running.

    For each action returned by ``fetch_actions()``:

    * if its ``_id`` is already a key of ``current_actions`` it is skipped;
    * if its ``_id`` is a key of ``stopped_actions`` that stale entry is
      dropped before the action is started again;
    * the action is handed to ``process_action()`` and, when that returns a
      truthy instance, the instance is stored in ``current_actions``.
    """
    for fetched in self.fetch_actions():
        action_id = fetched["_id"]

        # Guard: never launch an action that is still tracked as running.
        if action_id in self.current_actions:
            logging.info("Action %s already in current_actions, skipping", action_id)
            continue

        # A previously stopped action is being started fresh: forget the old entry.
        if action_id in self.stopped_actions:
            logging.info("Action %s found in stopped_actions, removing before restart", action_id)
            self.stopped_actions.pop(action_id)

        started_instance = self.process_action(fetched)
        if not started_instance:
            continue

        # Defensive: make sure the id cannot live in both dicts at once.
        self.stopped_actions.pop(action_id, None)
        self.current_actions[action_id] = started_instance
117
134
 
118
135
  @log_errors(raise_exception=False)
119
def update_actions_status(self) -> None:
    """Move every non-running action from ``current_actions`` to ``stopped_actions``.

    Each tracked instance is asked ``is_running()`` (when it has such an
    attribute). An instance is treated as not running when the call returns
    a falsy value, raises, or the attribute is missing. Such instances are
    removed from ``current_actions`` and stored in ``stopped_actions`` (an
    existing ``stopped_actions`` entry with the same id is kept as is).
    Nothing is stopped or deleted here; this is bookkeeping only.

    After the pass, a summary line is logged if either dict is non-empty.
    """
    # Iterate over a snapshot because entries are removed during the loop.
    for action_id, instance in list(self.current_actions.items()):
        is_running = False
        status_reason = ""

        if hasattr(instance, 'is_running'):
            try:
                is_running = instance.is_running()
            except Exception as e:
                # A failing status check is treated as "not running".
                logging.error("Error checking is_running for action %s: %s", action_id, str(e))
                is_running = False
                status_reason = f"error checking status: {str(e)}"

        if is_running:
            continue

        # Only derive a reason when the status check itself did not fail.
        if not status_reason:
            if getattr(instance, 'process', None) is None:
                status_reason = "no process object"
            else:
                status_reason = "process not running"

        logging.info(
            "Action %s moved to stopped_actions: %s",
            action_id,
            status_reason
        )
        self.current_actions.pop(action_id, None)
        # setdefault keeps an already recorded stopped instance untouched.
        self.stopped_actions.setdefault(action_id, instance)

    if self.current_actions or self.stopped_actions:
        logging.info(
            "Actions status: %d running %s, %d stopped %s",
            len(self.current_actions),
            list(self.current_actions.keys()),
            len(self.stopped_actions),
            list(self.stopped_actions.keys())
        )
168
193
 
194
+ @log_errors(raise_exception=False)
195
def purge_unwanted(self) -> None:
    """Refresh the running/stopped bookkeeping.

    Kept under its historical name for existing callers. It no longer
    deletes anything itself: it only delegates to
    ``update_actions_status()``, which moves non-running actions into
    ``stopped_actions``.
    """
    # Pure delegation; all logic lives in update_actions_status().
    self.update_actions_status()
203
+
169
204
  @log_errors(default_return={}, raise_exception=False)
170
205
  def get_current_actions(self) -> dict:
171
- """Get the current actions.
206
+ """Get the current running actions.
172
207
 
173
208
  This method:
174
- 1. Purges any completed actions using purge_unwanted()
175
- 2. Double-checks remaining actions to ensure they are truly running
209
+ 1. Updates action status tracking via update_actions_status()
210
+ 2. Returns only the running actions (current_actions dict)
176
211
  3. Provides detailed logging about current actions state
177
212
 
178
213
  Returns:
179
- dict: Current active actions
214
+ dict: Current running actions only
180
215
  """
181
- # Always purge unwanted actions first
182
- self.purge_unwanted()
216
+ # Update status tracking (moves stopped to stopped_actions)
217
+ self.update_actions_status()
218
+
183
219
  if self.current_actions:
184
220
  action_ids = list(self.current_actions.keys())
185
221
  logging.info(
@@ -189,9 +225,213 @@ class ActionsManager:
189
225
  )
190
226
  else:
191
227
  logging.debug("No actions currently running")
192
- return {}
228
+
193
229
  return self.current_actions
194
230
 
231
+ @log_errors(default_return={}, raise_exception=False)
232
def get_all_actions(self) -> dict:
    """Return every tracked action, tagged with its bookkeeping status.

    Returns:
        dict: Maps action id to ``{"instance": <instance>, "status": <str>}``
        where status is ``"running"`` for entries of ``current_actions`` and
        ``"stopped"`` for entries of ``stopped_actions``. If an id is present
        in both dicts, the ``"stopped"`` entry wins.
    """
    tagged = {
        running_id: {"instance": running_instance, "status": "running"}
        for running_id, running_instance in self.current_actions.items()
    }
    # Applied second so that stopped entries override running ones.
    tagged.update(
        (stopped_id, {"instance": stopped_instance, "status": "stopped"})
        for stopped_id, stopped_instance in self.stopped_actions.items()
    )
    return tagged
244
+
245
+ @log_errors(default_return={}, raise_exception=False)
246
def get_stopped_actions(self) -> dict:
    """Return the dict of actions currently tracked as stopped.

    Returns:
        dict: The live ``stopped_actions`` mapping itself (not a copy), so
        mutations by the caller affect the manager's bookkeeping.
    """
    stopped = self.stopped_actions
    return stopped
253
+
254
+ @log_errors(default_return={}, raise_exception=False)
255
+ def stop_action(self, action_record_id: str) -> dict:
256
+ """Stop a specific action by its record ID.
257
+
258
+ Args:
259
+ action_record_id (str): The action record ID to stop
260
+
261
+ Returns:
262
+ dict: Result dictionary with status information
263
+ """
264
+ logging.info("Attempting to stop action: %s", action_record_id)
265
+
266
+ # Check if action exists in current (running) actions
267
+ action_instance = None
268
+ action_source = None
269
+
270
+ if action_record_id in self.current_actions:
271
+ action_instance = self.current_actions[action_record_id]
272
+ action_source = "current_actions"
273
+ elif action_record_id in self.stopped_actions:
274
+ # Action already in stopped_actions
275
+ logging.info("Action %s already in stopped_actions", action_record_id)
276
+ return {
277
+ "success": True,
278
+ "reason": "already_stopped",
279
+ "action_id": action_record_id
280
+ }
281
+ else:
282
+ logging.warning("Action %s not found in current or stopped actions", action_record_id)
283
+ return {
284
+ "success": False,
285
+ "reason": "action_not_found",
286
+ "action_id": action_record_id
287
+ }
288
+
289
+ # Check if action is actually running
290
+ if not action_instance.is_running():
291
+ logging.info("Action %s is not running, moving to stopped_actions", action_record_id)
292
+ # Move to stopped_actions instead of deleting
293
+ # Ensure action is removed from current_actions first
294
+ if action_record_id in self.current_actions:
295
+ del self.current_actions[action_record_id]
296
+ # Ensure action is not duplicated in stopped_actions
297
+ if action_record_id not in self.stopped_actions:
298
+ self.stopped_actions[action_record_id] = action_instance
299
+ return {
300
+ "success": True,
301
+ "reason": "already_stopped",
302
+ "action_id": action_record_id
303
+ }
304
+
305
+ # Stop the action
306
+ try:
307
+ logging.info("Stopping action %s", action_record_id)
308
+ action_instance.stop()
309
+
310
+ # Update action status to stopped
311
+ self.scaling.update_action_status(
312
+ service_provider=os.environ["SERVICE_PROVIDER"],
313
+ action_record_id=action_record_id,
314
+ status="stopped",
315
+ isRunning=False,
316
+ action_duration=0,
317
+ )
318
+
319
+ # Move to stopped_actions instead of deleting
320
+ # Ensure action is removed from current_actions first
321
+ if action_record_id in self.current_actions:
322
+ del self.current_actions[action_record_id]
323
+ # Ensure action is not duplicated in stopped_actions
324
+ if action_record_id not in self.stopped_actions:
325
+ self.stopped_actions[action_record_id] = action_instance
326
+
327
+ logging.info("Successfully stopped action: %s", action_record_id)
328
+ return {
329
+ "success": True,
330
+ "action_id": action_record_id,
331
+ "stopped_at": time.time()
332
+ }
333
+
334
+ except Exception as e:
335
+ logging.error("Error stopping action %s: %s", action_record_id, str(e))
336
+ return {
337
+ "success": False,
338
+ "reason": "stop_failed",
339
+ "error": str(e),
340
+ "action_id": action_record_id
341
+ }
342
+
343
+ @log_errors(default_return={}, raise_exception=False)
344
def restart_action(self, action_record_id: str) -> dict:
    """Restart a specific action by its record ID.

    Steps:
      1. If the action is in ``current_actions`` it is stopped through
         ``stop_action()`` and the method then sleeps for two seconds. A
         failed stop aborts the restart.
      2. Action details are fetched with ``scaling.get_action_details``.
      3. The details are passed to ``process_action()``. A truthy result is
         stored in ``current_actions`` and any ``stopped_actions`` entry for
         the id is dropped.

    Args:
        action_record_id (str): The action record ID to restart.

    Returns:
        dict: ``"success"`` and ``"action_id"`` are always present. Failures
        carry a ``"reason"`` of ``"stop_failed_before_restart"``,
        ``"fetch_failed"``, ``"start_failed_after_restart"`` or
        ``"restart_failed"``. Success carries ``"restarted_at"`` and
        ``"stop_result"``.
    """
    logging.info("Attempting to restart action: %s", action_record_id)

    # Step 1: bring the action down if it is tracked as running.
    stop_result = {"success": True, "reason": "not_running"}
    if action_record_id in self.current_actions:
        logging.info("Stopping existing action %s before restart", action_record_id)
        stop_result = self.stop_action(action_record_id)

        if not stop_result.get("success"):
            logging.error("Failed to stop action %s for restart", action_record_id)
            return {
                "success": False,
                "reason": "stop_failed_before_restart",
                "stop_result": stop_result,
                "action_id": action_record_id
            }

        # Short pause so the stopped action can finish cleaning up.
        time.sleep(2)
    elif action_record_id in self.stopped_actions:
        logging.info("Action %s found in stopped_actions, will restart", action_record_id)
        stop_result = {"success": True, "reason": "was_stopped"}

    try:
        # Step 2: fetch fresh details from the backend.
        logging.info("Fetching action details for restart: %s", action_record_id)
        action_details, error, _ = self.scaling.get_action_details(action_record_id)

        if error or not action_details:
            logging.error("Failed to fetch action details for %s: %s",
                          action_record_id, error)
            return {
                "success": False,
                "reason": "fetch_failed",
                "error": error,
                "action_id": action_record_id
            }

        # Step 3: start the action again.
        logging.info("Starting action %s after restart", action_record_id)
        new_instance = self.process_action(action_details)

        if not new_instance:
            logging.error("Failed to start action %s after restart", action_record_id)
            return {
                "success": False,
                "reason": "start_failed_after_restart",
                "action_id": action_record_id
            }

        # The id must end up in current_actions only.
        self.stopped_actions.pop(action_record_id, None)
        if action_record_id in self.current_actions:
            logging.warning("Action %s already in current_actions during restart, replacing", action_record_id)
            del self.current_actions[action_record_id]
        self.current_actions[action_record_id] = new_instance

        logging.info("Successfully restarted action: %s", action_record_id)
        return {
            "success": True,
            "action_id": action_record_id,
            "restarted_at": time.time(),
            "stop_result": stop_result
        }

    except Exception as e:
        logging.error("Error restarting action %s: %s", action_record_id, str(e))
        return {
            "success": False,
            "reason": "restart_failed",
            "error": str(e),
            "action_id": action_record_id
        }
434
+
195
435
  @log_errors(raise_exception=True)
196
436
  def start_actions_manager(self) -> None:
197
437
  """Start the actions manager main loop."""
@@ -0,0 +1,490 @@
1
+ """
2
+ Compute Operations Handler - Kafka Event-Driven Operations Manager
3
+
4
+ This module handles compute instance operations (start/stop/restart) triggered from
5
+ the frontend dashboard via Kafka events. It consumes events from the 'compute_operations'
6
+ topic and performs the actual operations on compute instances and their actions.
7
+
8
+ Uses EventListener from matrice_common for simplified Kafka consumption.
9
+
10
+ Event Structure:
11
+ {
12
+ "instance_id": "string",
13
+ "action_record_id": "string", # Can be ObjectID("000000000000000000000000") or all zeros for instance-level operations
14
+ "operation": "start|stop|restart",
15
+ "account_number": 12345,
16
+ "requested_by": "user@example.com",
17
+ "request_id": "uuid-string",
18
+ "timestamp": "2025-11-21T10:30:00.123Z"
19
+ }
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ import time
25
+ from typing import Dict, Any, Optional
26
+ import sys
27
+ import traceback
28
+ import os
29
+ import subprocess
30
+
31
+ from matrice_common.stream.event_listener import EventListener
32
+
33
+ # Configure logging
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
class ComputeOperationsHandler:
    """
    Handles Kafka-based compute operations for instance and action management.

    An EventListener subscribed to the 'compute_operations' topic delivers
    events to this class. Each event is validated and then either applied to
    a single action (through the ActionsManager) or to the whole Python
    application (instance-level stop/restart). For action-level operations
    the outcome is reported through ``scaling.update_action_status``.
    """

    KAFKA_TOPIC = "compute_operations"

    # Fields that must be present in every event.
    REQUIRED_FIELDS = [
        "instance_id",
        "action_record_id",
        "operation",
        "account_number",
        "requested_by",
        "request_id",
        "timestamp"
    ]

    # Accepted values of the event's "operation" field.
    VALID_OPERATIONS = ["start", "stop", "restart"]

    # Matches ObjectID("...") / ObjectID('...') and captures the inner id.
    _OBJECT_ID_PATTERN = re.compile(r'ObjectID\(["\']([^"\']+)["\']\)')

    def __init__(self, actions_manager, session, scaling, instance_id: str):
        """
        Initialize the Compute Operations Handler.

        Args:
            actions_manager: Reference to the ActionsManager instance
            session: Session object handed to the EventListener
            scaling: Scaling service instance for API status updates
            instance_id: This compute instance's ID for filtering events
        """
        self.actions_manager = actions_manager
        self.session = session
        self.scaling = scaling
        self.instance_id = instance_id
        self.event_listener: Optional[EventListener] = None
        self.running = False

        logger.info(f"Initializing ComputeOperationsHandler for instance ID: {instance_id}")

    def start(self) -> bool:
        """
        Start the operations handler using EventListener.

        Returns:
            bool: The value returned by ``EventListener.start()``; False if
            the handler is already running or the listener could not be
            created or started.
        """
        if self.running:
            logger.warning("ComputeOperationsHandler is already running")
            return False

        try:
            self.event_listener = EventListener(
                session=self.session,
                topics=[self.KAFKA_TOPIC],
                event_handler=self._handle_operation_event,
                filter_field='instance_id',
                filter_value=self.instance_id,
                consumer_group_id=f"compute_ops_{self.instance_id}"
            )
            self.running = self.event_listener.start()

            if self.running:
                logger.info("ComputeOperationsHandler started successfully")
            else:
                logger.error("ComputeOperationsHandler failed to start")

            return self.running

        except Exception as e:
            logger.error(f"Failed to start ComputeOperationsHandler: {e}")
            logger.error(traceback.format_exc())
            return False

    def stop(self):
        """
        Stop the operations handler and its EventListener, if one exists.
        """
        logger.info("Stopping ComputeOperationsHandler...")
        self.running = False

        if self.event_listener:
            self.event_listener.stop()

        logger.info("ComputeOperationsHandler stopped")

    def _handle_operation_event(self, event: Dict[str, Any]):
        """
        Handle incoming operation event from Kafka.

        This is the callback function passed to EventListener. Invalid
        events are logged and dropped.

        Args:
            event: The operation event dictionary
        """
        logger.info(f"Received operation event: {event}")

        if not self._validate_event(event):
            logger.error(f"Invalid event structure: {event}")
            return

        self._process_operation(event)

    def _is_instance_level_operation(self, action_record_id: str) -> bool:
        """
        Check if action_record_id represents an instance-level operation.

        Instance-level operations are identified by an id consisting only of
        zeros (any length), optionally wrapped as ``ObjectID("...")`` or
        ``ObjectID('...')``.

        Args:
            action_record_id: The action record ID to check

        Returns:
            True if this is an instance-level operation, False otherwise
            (including for empty or non-string values)
        """
        if not action_record_id or not isinstance(action_record_id, str):
            return False

        clean_id = self._extract_action_record_id(action_record_id)
        # Stripping zeros leaves an empty string only for an all-zero id.
        return clean_id.strip('0') == ''

    def _extract_action_record_id(self, action_record_id: str) -> str:
        """
        Extract the actual action record ID from various formats.

        Args:
            action_record_id: The raw action record ID (may be wrapped in ObjectID)

        Returns:
            The id inside an ``ObjectID(...)`` wrapper when one is found,
            otherwise the value unchanged
        """
        if not action_record_id or not isinstance(action_record_id, str):
            return action_record_id

        if 'ObjectID' in action_record_id:
            match = self._OBJECT_ID_PATTERN.search(action_record_id)
            if match:
                return match.group(1)

        return action_record_id

    def _validate_event(self, event: Dict[str, Any]) -> bool:
        """
        Validate that the event has all required fields and a known operation.

        Args:
            event: The event dictionary to validate

        Returns:
            True if event is valid, False otherwise
        """
        if not isinstance(event, dict):
            logger.error(f"Event is not a dictionary: {type(event).__name__}")
            return False

        for field in self.REQUIRED_FIELDS:
            if field not in event:
                logger.error(f"Missing required field: {field}")
                return False

        if event["operation"] not in self.VALID_OPERATIONS:
            logger.error(f"Invalid operation: {event['operation']}. Must be one of {self.VALID_OPERATIONS}")
            return False

        return True

    def _process_operation(self, event: Dict[str, Any]):
        """
        Process a compute operation event and report its outcome.

        Any exception raised while handling the operation is logged and the
        operation is reported with status "failed".

        Args:
            event: The operation event dictionary
        """
        operation = event["operation"]
        raw_action_record_id = event["action_record_id"]
        action_record_id = self._extract_action_record_id(raw_action_record_id)
        request_id = event["request_id"]
        requested_by = event["requested_by"]

        logger.info(f"Processing {operation} operation for action {action_record_id} "
                    f"(request: {request_id}, user: {requested_by})")

        try:
            if self._is_instance_level_operation(raw_action_record_id):
                result = self._handle_instance_operation(operation, event)
            else:
                result = self._handle_action_operation(operation, action_record_id, event)

            self._update_operation_status(event, action_record_id, "completed", result)

        except Exception as e:
            error_msg = f"Operation failed: {str(e)}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())

            self._update_operation_status(event, action_record_id, "failed", {"error": error_msg})

    def _handle_action_operation(self, operation: str, action_record_id: str,
                                 event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle operations on a specific action.

        Args:
            operation: The operation type (start/stop/restart)
            action_record_id: The action record ID to operate on
            event: The full event dictionary

        Returns:
            Result dictionary with operation details

        Raises:
            ValueError: If the operation is not start, stop or restart
        """
        handlers = {
            "start": self._start_action,
            "stop": self._stop_action,
            "restart": self._restart_action,
        }
        handler = handlers.get(operation)
        if handler is None:
            raise ValueError(f"Unknown operation: {operation}")
        return handler(action_record_id, event)

    def _handle_instance_operation(self, operation: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle operations on the entire instance (the Python application itself).

        "stop" terminates the process with ``os._exit(0)`` and "restart"
        re-executes it, so neither normally returns.

        Args:
            operation: The operation type (start/stop/restart)
            event: The full event dictionary

        Returns:
            Result dictionary with operation details (may not return if app is killed/restarted)
        """
        logger.info(f"Executing instance-level {operation} operation on Python application")

        if operation == "stop":
            logger.critical("Instance-level STOP: Killing Python application process")
            try:
                logger.warning(
                    f"Operation {operation} on instance {self.instance_id}: "
                    f"completed - killing_application (PID: {os.getpid()})"
                )
                # Give a moment for logs to be written
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to log status before kill: {e}")

            logger.critical(f"Terminating Python application (PID: {os.getpid()})")
            os._exit(0)  # Forceful exit, doesn't call cleanup handlers

        elif operation == "restart":
            logger.critical("Instance-level RESTART: Restarting Python application process")
            try:
                logger.warning(
                    f"Operation {operation} on instance {self.instance_id}: "
                    f"completed - restarting_application (PID: {os.getpid()})"
                )
                # Give a moment for logs to be written
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to log status before restart: {e}")

            logger.critical(f"Restarting Python application (PID: {os.getpid()})")
            self._restart_application()

        elif operation == "start":
            # The application handling this event is already running.
            logger.warning("Start operation not supported at instance level")
            return {
                "operation": operation,
                "instance_level": True,
                "status": "not_supported",
                "message": "Start operation is not supported at instance level"
            }

        # This should not be reached for stop/restart operations
        return {
            "operation": operation,
            "instance_level": True,
            "status": "completed"
        }

    def _restart_application(self):
        """
        Restart the Python application by replacing the current process.

        ``os.execv()`` is tried first with the current interpreter and
        ``sys.argv``. If that raises, a new process is spawned with
        ``subprocess.Popen`` and this one exits with code 0. If that fails
        too, the process exits with code 1.
        """
        try:
            python_executable = sys.executable
            script_args = sys.argv

            logger.info(f"Restarting with: {python_executable} {' '.join(script_args)}")

            os.execv(python_executable, [python_executable] + script_args)

        except Exception as e:
            logger.error(f"Failed to restart application: {e}")
            logger.error(traceback.format_exc())
            try:
                logger.info("Attempting fallback restart method")
                python_executable = sys.executable
                script_args = sys.argv

                subprocess.Popen([python_executable] + script_args)
                logger.critical("New process started, exiting current process")
                os._exit(0)
            except Exception as fallback_error:
                logger.error(f"Fallback restart also failed: {fallback_error}")
                logger.error(traceback.format_exc())
                # Last resort: just exit
                os._exit(1)

    def _start_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Start a specific action.

        If the action is not already running, the ActionsManager is asked to
        run ``process_actions()``, which fetches pending actions and starts
        and tracks them. The action counts as started only if it is tracked
        as a current action afterwards.

        Args:
            action_record_id: The action record ID to start
            event: The full event dictionary

        Returns:
            Result dictionary with status "already_running" or "started"

        Raises:
            RuntimeError: If the action is not tracked as running after the
                ActionsManager processed its pending actions
        """
        logger.info(f"Starting action: {action_record_id}")

        current_actions = self.actions_manager.get_current_actions()
        if action_record_id in current_actions:
            action_instance = current_actions[action_record_id]
            if action_instance.is_running():
                logger.warning(f"Action {action_record_id} is already running")
                return {
                    "status": "already_running",
                    "action_id": action_record_id
                }

        # fetch_actions() alone only returns the pending actions; they must
        # also be processed, otherwise nothing is launched.
        self.actions_manager.process_actions()

        if action_record_id not in self.actions_manager.get_current_actions():
            raise RuntimeError(
                f"Action {action_record_id} was not started by the actions manager"
            )

        return {
            "status": "started",
            "action_id": action_record_id
        }

    def _stop_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Stop a specific action.

        Args:
            action_record_id: The action record ID to stop
            event: The full event dictionary

        Returns:
            Result dictionary with the manager's result under "details"

        Raises:
            RuntimeError: If the ActionsManager does not report success
        """
        logger.info(f"Stopping action: {action_record_id}")

        result = self.actions_manager.stop_action(action_record_id)
        # An empty or unsuccessful result must not be reported as "stopped".
        if not result or not result.get("success"):
            raise RuntimeError(f"Stop of action {action_record_id} failed: {result}")

        return {
            "status": "stopped",
            "action_id": action_record_id,
            "details": result
        }

    def _restart_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
        """
        Restart a specific action.

        Args:
            action_record_id: The action record ID to restart
            event: The full event dictionary

        Returns:
            Result dictionary with the manager's result under "details"

        Raises:
            RuntimeError: If the ActionsManager does not report success
        """
        logger.info(f"Restarting action: {action_record_id}")

        result = self.actions_manager.restart_action(action_record_id)
        # An empty or unsuccessful result must not be reported as "restarted".
        if not result or not result.get("success"):
            raise RuntimeError(f"Restart of action {action_record_id} failed: {result}")

        return {
            "status": "restarted",
            "action_id": action_record_id,
            "details": result
        }

    def _update_operation_status(self, event: Dict[str, Any], action_record_id: str,
                                 status: str, result: Dict[str, Any]):
        """
        Update operation status via API and logging.

        The API is only called for action-level operations. Failures of the
        API call are logged and not propagated.

        Args:
            event: The original event
            action_record_id: The extracted action record ID
            status: Operation status (completed/failed)
            result: Result details
        """
        operation = event["operation"]
        request_id = event["request_id"]

        # Log status as warning for visibility
        logger.warning(
            f"Operation {operation} on {action_record_id}: {status} - "
            f"request_id={request_id}, result={result}"
        )

        if self._is_instance_level_operation(event["action_record_id"]):
            return

        try:
            # Only a completed start or restart leaves the action running.
            is_running = status == "completed" and operation in ("start", "restart")

            self.scaling.update_action_status(
                service_provider=os.environ.get("SERVICE_PROVIDER", ""),
                action_record_id=action_record_id,
                status=status,
                isRunning=is_running,
            )
            logger.info(f"API status updated for action {action_record_id}: {status}")
        except Exception as e:
            logger.error(f"Failed to update API status for action {action_record_id}: {e}")
@@ -7,6 +7,7 @@ import threading
7
7
  import time
8
8
  from matrice_compute.actions_manager import ActionsManager
9
9
  from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
10
+ from matrice_compute.compute_operations_handler import ComputeOperationsHandler
10
11
  from matrice_compute.instance_utils import (
11
12
  get_instance_info,
12
13
  get_decrypted_access_key_pair,
@@ -90,6 +91,22 @@ class InstanceManager:
90
91
  logging.info("InstanceManager initialized with machine resources tracker")
91
92
  self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
92
93
  logging.info("InstanceManager initialized with actions resources tracker")
94
+
95
+ # Initialize Compute Operations Handler for event-driven operations
96
+ # Uses EventListener from matrice_common for simplified Kafka consumption
97
+ try:
98
+ instance_id = os.environ.get("INSTANCE_ID")
99
+ self.compute_operations_handler = ComputeOperationsHandler(
100
+ actions_manager=self.actions_manager,
101
+ session=self.session,
102
+ scaling=self.scaling,
103
+ instance_id=instance_id
104
+ )
105
+ logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
106
+ except Exception as e:
107
+ logging.warning("Failed to initialize Compute Operations Handler: %s", e)
108
+ self.compute_operations_handler = None
109
+
93
110
  self.poll_interval = 10
94
111
  # Note: encryption_key is set in _setup_env_credentials
95
112
  logging.info("InstanceManager initialized.")
@@ -252,6 +269,14 @@ class InstanceManager:
252
269
  Returns:
253
270
  tuple: (instance_manager_thread, actions_manager_thread)
254
271
  """
272
+ # Start Compute Operations Handler in background thread
273
+ if self.compute_operations_handler:
274
+ try:
275
+ self.compute_operations_handler.start()
276
+ logging.info("Started Compute Operations Handler")
277
+ except Exception as exc:
278
+ logging.error("Failed to start Compute Operations Handler: %s", str(exc))
279
+
255
280
  # Create and start threads
256
281
  instance_manager_thread = threading.Thread(
257
282
  target=self.start_instance_manager,
@@ -402,8 +402,13 @@ class ActionsResourcesTracker:
402
402
  new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
403
403
  return new_args
404
404
 
405
- args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
406
- action_record_id = args_24[-1] if args_24 else None
405
def is_valid_objectid(s: str) -> bool:
    """Return True if *s*, ignoring surrounding whitespace, is 24 hex characters (MongoDB ObjectId form)."""
    candidate = s.strip()
    if len(candidate) != 24:
        return False
    hex_chars = frozenset('0123456789abcdefABCDEF')
    for ch in candidate:
        if ch not in hex_chars:
            return False
    return True
409
+
410
+ valid_objectids = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if is_valid_objectid(arg)]
411
+ action_record_id = valid_objectids[-1] if valid_objectids else None
407
412
  if not action_record_id:
408
413
  logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
409
414
  duration = calculate_time_difference(start_time, finish_time)
@@ -2,6 +2,7 @@
2
2
 
3
3
  import os
4
4
  import logging
5
+ import base64
5
6
  from matrice_common.utils import log_errors
6
7
 
7
8
  class Scaling:
@@ -33,6 +34,28 @@ class Scaling:
33
34
  "Initialized Scaling with instance_id: %s (REST API only)",
34
35
  instance_id
35
36
  )
37
+
38
+ @log_errors(default_return=None, log_error=True)
39
def get_kafka_bootstrap_servers(self):
    """Get Kafka bootstrap servers from API and decode base64 fields.

    Requests ``/v1/actions/get_kafka_info`` through ``self.rpc`` and decodes
    the base64-encoded ``ip`` and ``port`` entries of the response's ``data``
    mapping.

    Returns:
        str: Kafka bootstrap servers in format "ip:port"

    Raises:
        ValueError: If the API returns nothing, reports failure, or the
            response lacks the ``ip``/``port`` fields.

    NOTE(review): the method is decorated with
    ``log_errors(default_return=None, ...)``; presumably callers receive
    ``None`` instead of these exceptions -- confirm against matrice_common.
    """
    path = "/v1/actions/get_kafka_info"
    response = self.rpc.get(path=path)
    if not response or not response.get("success"):
        # ``response`` may be None here, so it must not be dereferenced
        # unconditionally while building the error message.
        message = response.get("message", "No response") if response else "No response"
        raise ValueError(f"Failed to fetch Kafka config: {message}")

    data = response.get("data") or {}
    try:
        encoded_ip = data["ip"]
        encoded_port = data["port"]
    except KeyError as exc:
        # Surface a malformed payload as the documented ValueError.
        raise ValueError(f"Kafka config response is missing field: {exc}") from exc

    ip = base64.b64decode(encoded_ip).decode("utf-8")
    port = base64.b64decode(encoded_port).decode("utf-8")
    return f"{ip}:{port}"
36
59
 
37
60
  @log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
38
61
  def handle_response(self, resp, success_message, error_message):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.25
3
+ Version: 0.1.26
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,17 +1,18 @@
1
1
  matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
2
  matrice_compute/action_instance.py,sha256=SYUZrfj6dtcgEjeEgCyKlrc2p2o08jlW84Y__V4Aqew,69552
3
- matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
3
+ matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
4
4
  matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
- matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
5
+ matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
6
+ matrice_compute/instance_manager.py,sha256=sUkDsy_XrPp7CKQxlujQRz3E_8rVbVZOy7byJOgMlEs,11376
6
7
  matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
7
8
  matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
8
9
  matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- matrice_compute/resources_tracker.py,sha256=pkdt0aVKx_TpY_Sq---73w9INkDffZZe3mZGlp1EftE,22573
10
- matrice_compute/scaling.py,sha256=CeT_lxJNkjJamRETG1lWaOtdSr5ySmcaMcqt7-lFRbo,23731
10
+ matrice_compute/resources_tracker.py,sha256=wy1huqB3Tw_kYC2wfnLa9iSyhDmgI7WQ5I9Kyr-1RSs,22829
11
+ matrice_compute/scaling.py,sha256=JNOgSpAPqbTlZ4qJokkdS9PehqyFwfPh4q98qrfNVCQ,24708
11
12
  matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
12
13
  matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
13
- matrice_compute-0.1.25.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
14
- matrice_compute-0.1.25.dist-info/METADATA,sha256=YxPD7gjTuET4wsbq0ywgIw8AmR8U7-EdAuZlIVIramg,1038
15
- matrice_compute-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- matrice_compute-0.1.25.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
17
- matrice_compute-0.1.25.dist-info/RECORD,,
14
+ matrice_compute-0.1.26.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.26.dist-info/METADATA,sha256=t7TsI5DcNElRmlKsa8CArXCcA4iBO-9QwZ6j9UQOdg0,1038
16
+ matrice_compute-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.26.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.26.dist-info/RECORD,,