matrice-compute 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/SOURCES.txt +1 -0
  4. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/action_instance.py +105 -8
  5. matrice_compute-0.1.26/src/matrice_compute/actions_manager.py +467 -0
  6. matrice_compute-0.1.26/src/matrice_compute/compute_operations_handler.py +490 -0
  7. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/instance_manager.py +25 -0
  8. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/instance_utils.py +114 -0
  9. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/resources_tracker.py +7 -2
  10. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/scaling.py +23 -0
  11. matrice_compute-0.1.24/src/matrice_compute/actions_manager.py +0 -227
  12. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/LICENSE.txt +0 -0
  13. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/README.md +0 -0
  14. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/dependency_links.txt +0 -0
  15. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/not-zip-safe +0 -0
  16. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/top_level.txt +0 -0
  17. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/pyproject.toml +0 -0
  18. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/setup.cfg +0 -0
  19. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/setup.py +0 -0
  20. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/__init__.py +0 -0
  21. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  22. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/prechecks.py +0 -0
  23. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/py.typed +0 -0
  24. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/shutdown_manager.py +0 -0
  25. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/task_utils.py +0 -0
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.24
+ Version: 0.1.26
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.24
+ Version: 0.1.26
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/SOURCES.txt
@@ -11,6 +11,7 @@ src/matrice_compute/__init__.py
  src/matrice_compute/action_instance.py
  src/matrice_compute/actions_manager.py
  src/matrice_compute/actions_scaledown_manager.py
+ src/matrice_compute/compute_operations_handler.py
  src/matrice_compute/instance_manager.py
  src/matrice_compute/instance_utils.py
  src/matrice_compute/prechecks.py
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/action_instance.py
@@ -10,6 +10,7 @@ import signal
  import urllib.request
  from matrice_compute.instance_utils import (
      get_gpu_with_sufficient_memory_for_action,
+     get_gpu_config_for_deployment,
      get_decrypted_access_key_pair,
      get_max_file_system,
      get_best_service_ip_and_network,
@@ -26,6 +27,10 @@ from matrice_common.utils import log_errors
  class ActionInstance:
      """Base class for tasks that run in Action containers."""

+     # Class-level dictionary to track deployed services and their ports
+     # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
+     _deployed_services = {}
+
      def __init__(self, scaling: Scaling, action_info: dict):
          """Initialize an action instance.

@@ -84,6 +89,67 @@ class ActionInstance:
              raise ValueError(f"Unknown action type: {self.action_type}")
          self.task = self.actions_map[self.action_type]

+     @classmethod
+     def is_first_deployment_for_service(cls, service_id):
+         """Check if this is the first deployment for a given service.
+
+         Args:
+             service_id (str): Service ID (_idService)
+
+         Returns:
+             bool: True if this is the first deployment, False otherwise
+         """
+         if not service_id:
+             return False
+         return service_id not in cls._deployed_services
+
+     @classmethod
+     def get_or_create_triton_ports(cls, service_id, scaling_instance):
+         """Get existing TRITON_PORTS for a service or create new ones.
+
+         Args:
+             service_id (str): Service ID (_idService)
+             scaling_instance: Scaling instance to get open ports
+
+         Returns:
+             str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
+         """
+         if not service_id:
+             # No service_id, generate new ports
+             port1 = scaling_instance.get_open_port()
+             port2 = scaling_instance.get_open_port()
+             port3 = scaling_instance.get_open_port()
+             return f"{port1},{port2},{port3}"
+
+         # Check if ports already exist for this service
+         if service_id in cls._deployed_services:
+             triton_ports = cls._deployed_services[service_id]["triton_ports"]
+             logging.info(
+                 "Reusing TRITON_PORTS for service %s: %s",
+                 service_id,
+                 triton_ports
+             )
+             return triton_ports
+
+         # First deployment: generate new ports and store them
+         port1 = scaling_instance.get_open_port()
+         port2 = scaling_instance.get_open_port()
+         port3 = scaling_instance.get_open_port()
+         triton_ports = f"{port1},{port2},{port3}"
+
+         # Store for future use
+         cls._deployed_services[service_id] = {
+             "triton_ports": triton_ports,
+             "is_first": False
+         }
+
+         logging.info(
+             "First deployment for service %s - generated TRITON_PORTS: %s",
+             service_id,
+             triton_ports
+         )
+         return triton_ports
+
      @log_errors(default_return={}, raise_exception=True, log_error=False)
      def _init_credentials(self):
          """Initialize Matrice credentials.
@@ -1387,10 +1453,27 @@ def redis_setup_execute(self: ActionInstance):
          f"docker run -d --net=host "
          f"--name redis_container_{int(time.time())} "
          f"--restart unless-stopped "
+         f"--memory=32g "
+         f"--cpus=8 "
          f"{redis_image} "
-         f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
+         f"redis-server --bind 0.0.0.0 "
+         f"--appendonly no "
+         f'--save "" '
+         f"--maxmemory 30gb "
+         f"--maxmemory-policy allkeys-lru "
+         f"--io-threads 4 "
+         f"--io-threads-do-reads yes "
+         f"--stream-node-max-bytes 8192 "
+         f"--stream-node-max-entries 1000 "
+         f"--hz 100 "
+         f"--tcp-backlog 2048 "
+         f"--timeout 0 "
+         f"--lazyfree-lazy-eviction yes "
+         f"--lazyfree-lazy-expire yes "
+         f"--lazyfree-lazy-server-del yes "
+         f"--activedefrag yes "
+         f"--requirepass {redis_password}"
      )
-
      logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

      # Start Redis container first
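The rewritten command disables persistence (--appendonly no, --save ""), caps Redis memory at 30 GB with an allkeys-lru eviction policy, enables I/O threads, lazy freeing and active defragmentation, and limits the container itself to 32 GB of RAM and 8 CPUs. A quick way to confirm the settings took effect, sketched with the redis-py client (redis-py is not a dependency of this package; host and password are placeholders):

    import redis  # illustrative only; assumes `pip install redis`

    r = redis.Redis(host="<redis_host>", port=6379, password="<redis_password>", decode_responses=True)
    print(r.config_get("maxmemory-policy"))         # expect {'maxmemory-policy': 'allkeys-lru'}
    print(r.config_get("appendonly"))               # expect {'appendonly': 'no'}
    print(r.info("memory").get("maxmemory_human"))  # expect roughly "30.00G"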
@@ -1455,6 +1538,10 @@ def model_deploy_execute(self: ActionInstance):
          return
      action_id = action_details["_id"]
      model_family = action_details["actionDetails"]["modelFamily"]
+
+     # Get the service ID to track deployments
+     service_id = action_details.get("_idService")
+
      self.setup_action_requirements(
          action_details,
          work_fs,
@@ -1462,17 +1549,27 @@
          action_id=action_id,
      )

-     # Get GPU configuration based on requirements and availability
-     # This selects the GPU(s) with the most free memory to balance load
-     use_gpu = self.get_gpu_config(action_details)
+     # Check if this is the first deployment for this service
+     is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
+
+     # Get GPU configuration (uses utility function with fail-safe fallback)
+     use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)

      logging.info(
-         "Action %s: Model deployment GPU config: %s",
+         "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
          action_id,
-         use_gpu if use_gpu else "CPU-only"
+         use_gpu if use_gpu else "CPU-only",
+         is_first_deployment
      )

-     extra_env_vars = {"INTERNAL_PORT": internal_port}
+     # Get or create TRITON_PORTS (uses utility method)
+     triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+
+     extra_env_vars = {
+         "INTERNAL_PORT": internal_port,
+         "TRITON_PORTS": triton_ports
+     }
+
      cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
      logging.info("cmd is: %s", cmd)
      self.start(cmd, "deploy_log")
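Stripped of diff markers, the new wiring in model_deploy_execute reads roughly as follows (a condensed sketch, not a verbatim excerpt; action_details, internal_port and self come from the surrounding function):

    service_id = action_details.get("_idService")
    is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
    extra_env_vars = {
        "INTERNAL_PORT": internal_port,
        # Every replica of the same service receives the same three Triton ports.
        "TRITON_PORTS": ActionInstance.get_or_create_triton_ports(service_id, self.scaling),
    }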
matrice_compute-0.1.26/src/matrice_compute/actions_manager.py (new file)
@@ -0,0 +1,467 @@
+ """Module providing actions_manager functionality."""
+
+ import logging
+ import os
+ import time
+ from matrice_compute.action_instance import (
+     ActionInstance,
+ )
+ from matrice_compute.instance_utils import (
+     has_gpu,
+     get_mem_usage,
+     cleanup_docker_storage,
+ )
+ from matrice_compute.scaling import (
+     Scaling,
+ )
+ from matrice_common.utils import log_errors
+
+
+ class ActionsManager:
+     """Class for managing actions."""
+
+     def __init__(self, scaling: Scaling):
+         """Initialize an action manager.
+
+         Args:
+             scaling (Scaling): Scaling service instance
+         """
+         self.current_actions: dict[str, ActionInstance] = {}
+         self.stopped_actions: dict[str, ActionInstance] = {}  # Track stopped actions separately
+         self.scaling = scaling
+         self.memory_threshold = 0.9
+         self.poll_interval = 10
+         self.last_actions_check = 0
+         logging.info("ActionsManager initialized")
+
+     @log_errors(default_return=[], raise_exception=False)
+     def fetch_actions(self) -> list:
+         """Poll for actions and process them if memory threshold is not exceeded.
+
+         Returns:
+             list: List of fetched actions
+         """
+         actions = []
+         logging.info("Polling backend for new jobs")
+         fetched_actions, error, _ = self.scaling.assign_jobs(has_gpu())
+         if error:
+             logging.error("Error assigning jobs: %s", error)
+             return actions
+         if not isinstance(fetched_actions, list):
+             fetched_actions = [fetched_actions]
+         for action in fetched_actions:
+             if not action:
+                 continue
+             if action["_id"] != "000000000000000000000000":
+                 actions.append(action)
+         logging.info(
+             "Fetched action details: %s",
+             actions,
+         )
+         return actions
+
+     @log_errors(default_return=None, raise_exception=False)
+     def process_action(self, action: dict) -> ActionInstance:
+         """Process the given action.
+
+         Args:
+             action (dict): Action details to process
+
+         Returns:
+             ActionInstance: Processed action instance or None if failed
+         """
+         logging.info(
+             "Processing action: %s",
+             action["_id"],
+         )
+         action_instance = ActionInstance(self.scaling, action)
+         self.scaling.update_action_status(
+             service_provider=os.environ["SERVICE_PROVIDER"],
+             action_record_id=action["_id"],
+             status="starting",
+             action_duration=0,
+         )
+         logging.info("locking action")
+         self.scaling.update_action_status(
+             service_provider=os.environ["SERVICE_PROVIDER"],
+             status="started",
+             action_record_id=action["_id"],
+             isRunning=True,
+             action_duration=0,
+             cpuUtilisation=0.0,
+             gpuUtilisation=0.0,
+             memoryUtilisation=0.0,
+             gpuMemoryUsed=0,
+         )
+         self.scaling.update_status(
+             action["_id"],
+             action["action"],
+             "bg-job-scheduler",
+             "JBSS_LCK",
+             "OK",
+             "Job is locked for processing",
+         )
+         action_instance.execute()
+         logging.info(
+             "action %s started.",
+             action_instance.action_record_id,
+         )
+         return action_instance
+
+     @log_errors(raise_exception=False)
+     def process_actions(self) -> None:
+         """Process fetched actions."""
+         for action in self.fetch_actions():
+             action_id = action["_id"]
+
+             # Skip if action is already running in current_actions
+             if action_id in self.current_actions:
+                 logging.info("Action %s already in current_actions, skipping", action_id)
+                 continue
+
+             # If action exists in stopped_actions, remove it before starting fresh
+             if action_id in self.stopped_actions:
+                 logging.info("Action %s found in stopped_actions, removing before restart", action_id)
+                 del self.stopped_actions[action_id]
+
+             # Process and add to current_actions
+             action_instance = self.process_action(action)
+             if action_instance:
+                 # Ensure action is not in stopped_actions (defensive check)
+                 if action_id in self.stopped_actions:
+                     del self.stopped_actions[action_id]
+                 self.current_actions[action_id] = action_instance
+
+     @log_errors(raise_exception=False)
+     def update_actions_status(self) -> None:
+         """Update tracking of running vs stopped actions.
+
+         This method checks all actions and moves stopped ones to stopped_actions dict
+         without deleting them. This prevents interference with compute operations
+         handler while maintaining accurate status reporting.
+         """
+         moved_to_stopped = 0
+
+         # Check each action and update its status
+         for action_id, instance in list(self.current_actions.items()):
+             is_running = False
+             status_reason = ""
+
+             # Check if process is running
+             if hasattr(instance, 'is_running'):
+                 try:
+                     is_running = instance.is_running()
+                 except Exception as e:
+                     logging.error("Error checking is_running for action %s: %s", action_id, str(e))
+                     is_running = False
+                     status_reason = f"error checking status: {str(e)}"
+
+             # Check for process object validity
+             if not is_running and not status_reason:
+                 if not hasattr(instance, 'process') or instance.process is None:
+                     status_reason = "no process object"
+                 else:
+                     status_reason = "process not running"
+
+             # Move to stopped_actions if not running (but don't delete)
+             if not is_running:
+                 logging.info(
+                     "Action %s moved to stopped_actions: %s",
+                     action_id,
+                     status_reason
+                 )
+                 # Ensure action is removed from current_actions before adding to stopped_actions
+                 if action_id in self.current_actions:
+                     del self.current_actions[action_id]
+                 # Ensure action is not duplicated in stopped_actions
+                 if action_id not in self.stopped_actions:
+                     self.stopped_actions[action_id] = instance
+                 moved_to_stopped += 1
+
+         # Log current state
+         running_ids = list(self.current_actions.keys())
+         stopped_ids = list(self.stopped_actions.keys())
+
+         if self.current_actions or self.stopped_actions:
+             logging.info(
+                 "Actions status: %d running %s, %d stopped %s",
+                 len(self.current_actions),
+                 running_ids if running_ids else "[]",
+                 len(self.stopped_actions),
+                 stopped_ids if stopped_ids else "[]"
+             )
+
+     @log_errors(raise_exception=False)
+     def purge_unwanted(self) -> None:
+         """Purge completed or failed actions.
+
+         NOTE: This now calls update_actions_status() which moves stopped actions
+         to a separate dict instead of deleting them. This prevents interference
+         with compute operations handler while maintaining accurate status.
+         """
+         self.update_actions_status()
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_current_actions(self) -> dict:
+         """Get the current running actions.
+
+         This method:
+         1. Updates action status tracking via update_actions_status()
+         2. Returns only the running actions (current_actions dict)
+         3. Provides detailed logging about current actions state
+
+         Returns:
+             dict: Current running actions only
+         """
+         # Update status tracking (moves stopped to stopped_actions)
+         self.update_actions_status()
+
+         if self.current_actions:
+             action_ids = list(self.current_actions.keys())
+             logging.info(
+                 "Currently running %d actions: %s",
+                 len(self.current_actions),
+                 action_ids
+             )
+         else:
+             logging.debug("No actions currently running")
+
+         return self.current_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_all_actions(self) -> dict:
+         """Get all tracked actions (both running and stopped).
+
+         Returns:
+             dict: All tracked actions with their status
+         """
+         all_actions = {}
+         for action_id, instance in self.current_actions.items():
+             all_actions[action_id] = {"instance": instance, "status": "running"}
+         for action_id, instance in self.stopped_actions.items():
+             all_actions[action_id] = {"instance": instance, "status": "stopped"}
+         return all_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_stopped_actions(self) -> dict:
+         """Get stopped actions.
+
+         Returns:
+             dict: Stopped actions
+         """
+         return self.stopped_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def stop_action(self, action_record_id: str) -> dict:
+         """Stop a specific action by its record ID.
+
+         Args:
+             action_record_id (str): The action record ID to stop
+
+         Returns:
+             dict: Result dictionary with status information
+         """
+         logging.info("Attempting to stop action: %s", action_record_id)
+
+         # Check if action exists in current (running) actions
+         action_instance = None
+         action_source = None
+
+         if action_record_id in self.current_actions:
+             action_instance = self.current_actions[action_record_id]
+             action_source = "current_actions"
+         elif action_record_id in self.stopped_actions:
+             # Action already in stopped_actions
+             logging.info("Action %s already in stopped_actions", action_record_id)
+             return {
+                 "success": True,
+                 "reason": "already_stopped",
+                 "action_id": action_record_id
+             }
+         else:
+             logging.warning("Action %s not found in current or stopped actions", action_record_id)
+             return {
+                 "success": False,
+                 "reason": "action_not_found",
+                 "action_id": action_record_id
+             }
+
+         # Check if action is actually running
+         if not action_instance.is_running():
+             logging.info("Action %s is not running, moving to stopped_actions", action_record_id)
+             # Move to stopped_actions instead of deleting
+             # Ensure action is removed from current_actions first
+             if action_record_id in self.current_actions:
+                 del self.current_actions[action_record_id]
+             # Ensure action is not duplicated in stopped_actions
+             if action_record_id not in self.stopped_actions:
+                 self.stopped_actions[action_record_id] = action_instance
+             return {
+                 "success": True,
+                 "reason": "already_stopped",
+                 "action_id": action_record_id
+             }
+
+         # Stop the action
+         try:
+             logging.info("Stopping action %s", action_record_id)
+             action_instance.stop()
+
+             # Update action status to stopped
+             self.scaling.update_action_status(
+                 service_provider=os.environ["SERVICE_PROVIDER"],
+                 action_record_id=action_record_id,
+                 status="stopped",
+                 isRunning=False,
+                 action_duration=0,
+             )
+
+             # Move to stopped_actions instead of deleting
+             # Ensure action is removed from current_actions first
+             if action_record_id in self.current_actions:
+                 del self.current_actions[action_record_id]
+             # Ensure action is not duplicated in stopped_actions
+             if action_record_id not in self.stopped_actions:
+                 self.stopped_actions[action_record_id] = action_instance
+
+             logging.info("Successfully stopped action: %s", action_record_id)
+             return {
+                 "success": True,
+                 "action_id": action_record_id,
+                 "stopped_at": time.time()
+             }
+
+         except Exception as e:
+             logging.error("Error stopping action %s: %s", action_record_id, str(e))
+             return {
+                 "success": False,
+                 "reason": "stop_failed",
+                 "error": str(e),
+                 "action_id": action_record_id
+             }
+
+     @log_errors(default_return={}, raise_exception=False)
+     def restart_action(self, action_record_id: str) -> dict:
+         """Restart a specific action by its record ID.
+
+         This method stops the action if it's running, then fetches fresh action
+         details from the backend and starts it again.
+
+         Args:
+             action_record_id (str): The action record ID to restart
+
+         Returns:
+             dict: Result dictionary with status information
+         """
+         logging.info("Attempting to restart action: %s", action_record_id)
+
+         # Step 1: Stop the action if it exists in current_actions or stopped_actions
+         stop_result = {"success": True, "reason": "not_running"}
+         if action_record_id in self.current_actions:
+             logging.info("Stopping existing action %s before restart", action_record_id)
+             stop_result = self.stop_action(action_record_id)
+
+             if not stop_result.get("success"):
+                 logging.error("Failed to stop action %s for restart", action_record_id)
+                 return {
+                     "success": False,
+                     "reason": "stop_failed_before_restart",
+                     "stop_result": stop_result,
+                     "action_id": action_record_id
+                 }
+
+             # Wait a moment for cleanup
+             time.sleep(2)
+         elif action_record_id in self.stopped_actions:
+             logging.info("Action %s found in stopped_actions, will restart", action_record_id)
+             stop_result = {"success": True, "reason": "was_stopped"}
+
+         # Step 2: Fetch fresh action details from backend
+         try:
+             logging.info("Fetching action details for restart: %s", action_record_id)
+
+             # Get action details via API
+             action_details, error, _ = self.scaling.get_action_details(action_record_id)
+
+             if error or not action_details:
+                 logging.error("Failed to fetch action details for %s: %s",
+                               action_record_id, error)
+                 return {
+                     "success": False,
+                     "reason": "fetch_failed",
+                     "error": error,
+                     "action_id": action_record_id
+                 }
+
+             # Step 3: Process (start) the action
+             logging.info("Starting action %s after restart", action_record_id)
+             action_instance = self.process_action(action_details)
+
+             if action_instance:
+                 # Ensure action is removed from stopped_actions if present
+                 if action_record_id in self.stopped_actions:
+                     del self.stopped_actions[action_record_id]
+                 # Ensure action is removed from current_actions if present (defensive check)
+                 if action_record_id in self.current_actions:
+                     logging.warning("Action %s already in current_actions during restart, replacing", action_record_id)
+                     del self.current_actions[action_record_id]
+                 # Add to current_actions
+                 self.current_actions[action_record_id] = action_instance
+
+                 logging.info("Successfully restarted action: %s", action_record_id)
+                 return {
+                     "success": True,
+                     "action_id": action_record_id,
+                     "restarted_at": time.time(),
+                     "stop_result": stop_result
+                 }
+             else:
+                 logging.error("Failed to start action %s after restart", action_record_id)
+                 return {
+                     "success": False,
+                     "reason": "start_failed_after_restart",
+                     "action_id": action_record_id
+                 }
+
+         except Exception as e:
+             logging.error("Error restarting action %s: %s", action_record_id, str(e))
+             return {
+                 "success": False,
+                 "reason": "restart_failed",
+                 "error": str(e),
+                 "action_id": action_record_id
+             }
+
+     @log_errors(raise_exception=True)
+     def start_actions_manager(self) -> None:
+         """Start the actions manager main loop."""
+         while True:
+             waiting_time = self.poll_interval  # Default wait time
+             try:
+                 mem_usage = get_mem_usage()
+                 logging.info("Memory usage: %d", mem_usage)
+                 waiting_time = int(
+                     min(
+                         self.poll_interval
+                         / max(
+                             0.001,
+                             self.memory_threshold - mem_usage,
+                         ),
+                         120,
+                     )
+                 )
+                 if mem_usage < self.memory_threshold:
+                     self.process_actions()
+                     logging.info(
+                         "Waiting for %d seconds before next poll",
+                         waiting_time,
+                     )
+                 else:
+                     logging.info(
+                         "Memory threshold exceeded, waiting for %d seconds",
+                         waiting_time,
+                     )
+                     cleanup_docker_storage()
+             except Exception as e:
+                 logging.error("Error in actions manager: %s", e)
+             time.sleep(waiting_time)
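For orientation, a minimal sketch of how the rewritten manager is driven (the Scaling constructor arguments and the SERVICE_PROVIDER value are assumptions, not taken from this diff):

    import os
    from matrice_compute.actions_manager import ActionsManager
    from matrice_compute.scaling import Scaling

    os.environ.setdefault("SERVICE_PROVIDER", "<provider>")  # read by process_action/stop_action

    manager = ActionsManager(Scaling())  # real Scaling() arguments may differ
    # Blocking loop: poll for jobs, start them, and back off as memory pressure rises.
    manager.start_actions_manager()

    # Elsewhere (e.g. the new compute_operations_handler) the manager can be queried or controlled:
    #   manager.get_all_actions()              # {"<id>": {"instance": ..., "status": "running" | "stopped"}}
    #   manager.stop_action("<action_id>")     # returns {"success": ..., ...}
    #   manager.restart_action("<action_id>")  # stop (if needed), refetch details, start again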