matrice-compute 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import signal
10
10
  import urllib.request
11
11
  from matrice_compute.instance_utils import (
12
12
  get_gpu_with_sufficient_memory_for_action,
13
+ get_gpu_config_for_deployment,
13
14
  get_decrypted_access_key_pair,
14
15
  get_max_file_system,
15
16
  get_best_service_ip_and_network,
@@ -26,6 +27,10 @@ from matrice_common.utils import log_errors
26
27
  class ActionInstance:
27
28
  """Base class for tasks that run in Action containers."""
28
29
 
30
+ # Class-level dictionary to track deployed services and their ports
31
+ # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
32
+ _deployed_services = {}
33
+
29
34
  def __init__(self, scaling: Scaling, action_info: dict):
30
35
  """Initialize an action instance.
31
36
 
@@ -84,6 +89,67 @@ class ActionInstance:
84
89
  raise ValueError(f"Unknown action type: {self.action_type}")
85
90
  self.task = self.actions_map[self.action_type]
86
91
 
92
@classmethod
def is_first_deployment_for_service(cls, service_id):
    """Report whether no deployment has yet been recorded for a service.

    Args:
        service_id (str): Service ID (_idService)

    Returns:
        bool: True if this is the first deployment, False otherwise
        (a falsy/missing service_id is never treated as "first")
    """
    # Membership in the class-level registry marks an earlier deployment.
    return bool(service_id) and service_id not in cls._deployed_services
105
+
106
@classmethod
def get_or_create_triton_ports(cls, service_id, scaling_instance):
    """Get existing TRITON_PORTS for a service or create new ones.

    Args:
        service_id (str): Service ID (_idService)
        scaling_instance: Scaling instance to get open ports

    Returns:
        str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
    """
    def _fresh_ports():
        # Three open ports, formatted the way the deploy container expects.
        return ",".join(str(scaling_instance.get_open_port()) for _ in range(3))

    if not service_id:
        # Nothing to cache against — hand out fresh ports every time.
        return _fresh_ports()

    cached = cls._deployed_services.get(service_id)
    if cached is not None:
        ports = cached["triton_ports"]
        logging.info(
            "Reusing TRITON_PORTS for service %s: %s",
            service_id,
            ports
        )
        return ports

    # First deployment: allocate and remember the ports for later replicas.
    ports = _fresh_ports()
    cls._deployed_services[service_id] = {
        "triton_ports": ports,
        "is_first": False
    }
    logging.info(
        "First deployment for service %s - generated TRITON_PORTS: %s",
        service_id,
        ports
    )
    return ports
152
+
87
153
  @log_errors(default_return={}, raise_exception=True, log_error=False)
88
154
  def _init_credentials(self):
89
155
  """Initialize Matrice credentials.
@@ -1387,10 +1453,27 @@ def redis_setup_execute(self: ActionInstance):
1387
1453
  f"docker run -d --net=host "
1388
1454
  f"--name redis_container_{int(time.time())} "
1389
1455
  f"--restart unless-stopped "
1456
+ f"--memory=32g "
1457
+ f"--cpus=8 "
1390
1458
  f"{redis_image} "
1391
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1459
+ f"redis-server --bind 0.0.0.0 "
1460
+ f"--appendonly no "
1461
+ f'--save "" '
1462
+ f"--maxmemory 30gb "
1463
+ f"--maxmemory-policy allkeys-lru "
1464
+ f"--io-threads 4 "
1465
+ f"--io-threads-do-reads yes "
1466
+ f"--stream-node-max-bytes 8192 "
1467
+ f"--stream-node-max-entries 1000 "
1468
+ f"--hz 100 "
1469
+ f"--tcp-backlog 2048 "
1470
+ f"--timeout 0 "
1471
+ f"--lazyfree-lazy-eviction yes "
1472
+ f"--lazyfree-lazy-expire yes "
1473
+ f"--lazyfree-lazy-server-del yes "
1474
+ f"--activedefrag yes "
1475
+ f"--requirepass {redis_password}"
1392
1476
  )
1393
-
1394
1477
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1395
1478
 
1396
1479
  # Start Redis container first
@@ -1455,6 +1538,10 @@ def model_deploy_execute(self: ActionInstance):
1455
1538
  return
1456
1539
  action_id = action_details["_id"]
1457
1540
  model_family = action_details["actionDetails"]["modelFamily"]
1541
+
1542
+ # Get the service ID to track deployments
1543
+ service_id = action_details.get("_idService")
1544
+
1458
1545
  self.setup_action_requirements(
1459
1546
  action_details,
1460
1547
  work_fs,
@@ -1462,17 +1549,27 @@ def model_deploy_execute(self: ActionInstance):
1462
1549
  action_id=action_id,
1463
1550
  )
1464
1551
 
1465
- # Get GPU configuration based on requirements and availability
1466
- # This selects the GPU(s) with the most free memory to balance load
1467
- use_gpu = self.get_gpu_config(action_details)
1552
+ # Check if this is the first deployment for this service
1553
+ is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
1554
+
1555
+ # Get GPU configuration (uses utility function with fail-safe fallback)
1556
+ use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
1468
1557
 
1469
1558
  logging.info(
1470
- "Action %s: Model deployment GPU config: %s",
1559
+ "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
1471
1560
  action_id,
1472
- use_gpu if use_gpu else "CPU-only"
1561
+ use_gpu if use_gpu else "CPU-only",
1562
+ is_first_deployment
1473
1563
  )
1474
1564
 
1475
- extra_env_vars = {"INTERNAL_PORT": internal_port}
1565
+ # Get or create TRITON_PORTS (uses utility method)
1566
+ triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1567
+
1568
+ extra_env_vars = {
1569
+ "INTERNAL_PORT": internal_port,
1570
+ "TRITON_PORTS": triton_ports
1571
+ }
1572
+
1476
1573
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1477
1574
  logging.info("cmd is: %s", cmd)
1478
1575
  self.start(cmd, "deploy_log")
@@ -27,6 +27,7 @@ class ActionsManager:
27
27
  scaling (Scaling): Scaling service instance
28
28
  """
29
29
  self.current_actions: dict[str, ActionInstance] = {}
30
+ self.stopped_actions: dict[str, ActionInstance] = {} # Track stopped actions separately
30
31
  self.scaling = scaling
31
32
  self.memory_threshold = 0.9
32
33
  self.poll_interval = 10
@@ -111,75 +112,110 @@ class ActionsManager:
111
112
def process_actions(self) -> None:
    """Process fetched actions."""
    for action in self.fetch_actions():
        action_id = action["_id"]

        # Already running — do not start a duplicate.
        if action_id in self.current_actions:
            logging.info("Action %s already in current_actions, skipping", action_id)
            continue

        # A stale stopped entry would shadow the restart; clear it first.
        if action_id in self.stopped_actions:
            logging.info("Action %s found in stopped_actions, removing before restart", action_id)
            del self.stopped_actions[action_id]

        action_instance = self.process_action(action)
        if not action_instance:
            continue

        # Defensive: make sure the id is not left in stopped_actions before
        # registering it as running.
        self.stopped_actions.pop(action_id, None)
        self.current_actions[action_id] = action_instance
117
134
 
118
135
@log_errors(raise_exception=False)
def update_actions_status(self) -> None:
    """Update tracking of running vs stopped actions.

    Checks every tracked action and moves stopped ones into the
    stopped_actions dict without deleting them, so the compute operations
    handler keeps visibility while status reporting stays accurate.
    """
    moved_to_stopped = 0

    # Snapshot the items — the dict is mutated while we walk it.
    for action_id, instance in list(self.current_actions.items()):
        is_running = False
        status_reason = ""

        if hasattr(instance, 'is_running'):
            try:
                is_running = instance.is_running()
            except Exception as e:
                logging.error("Error checking is_running for action %s: %s", action_id, str(e))
                is_running = False
                status_reason = f"error checking status: {str(e)}"

        # Derive a reason only when the probe itself did not already set one.
        if not is_running and not status_reason:
            if not hasattr(instance, 'process') or instance.process is None:
                status_reason = "no process object"
            else:
                status_reason = "process not running"

        if not is_running:
            logging.info(
                "Action %s moved to stopped_actions: %s",
                action_id,
                status_reason
            )
            # Relocate instead of deleting; guard against duplicates.
            self.current_actions.pop(action_id, None)
            if action_id not in self.stopped_actions:
                self.stopped_actions[action_id] = instance
                moved_to_stopped += 1

    running_ids = list(self.current_actions.keys())
    stopped_ids = list(self.stopped_actions.keys())

    if self.current_actions or self.stopped_actions:
        logging.info(
            "Actions status: %d running %s, %d stopped %s",
            len(self.current_actions),
            running_ids if running_ids else "[]",
            len(self.stopped_actions),
            stopped_ids if stopped_ids else "[]"
        )
168
193
 
194
@log_errors(raise_exception=False)
def purge_unwanted(self) -> None:
    """Purge completed or failed actions.

    NOTE: This now delegates to update_actions_status(), which moves stopped
    actions into a separate dict instead of deleting them. That prevents
    interference with the compute operations handler while keeping the
    reported status accurate.
    """
    self.update_actions_status()
203
+
169
204
  @log_errors(default_return={}, raise_exception=False)
170
205
  def get_current_actions(self) -> dict:
171
- """Get the current actions.
206
+ """Get the current running actions.
172
207
 
173
208
  This method:
174
- 1. Purges any completed actions using purge_unwanted()
175
- 2. Double-checks remaining actions to ensure they are truly running
209
+ 1. Updates action status tracking via update_actions_status()
210
+ 2. Returns only the running actions (current_actions dict)
176
211
  3. Provides detailed logging about current actions state
177
212
 
178
213
  Returns:
179
- dict: Current active actions
214
+ dict: Current running actions only
180
215
  """
181
- # Always purge unwanted actions first
182
- self.purge_unwanted()
216
+ # Update status tracking (moves stopped to stopped_actions)
217
+ self.update_actions_status()
218
+
183
219
  if self.current_actions:
184
220
  action_ids = list(self.current_actions.keys())
185
221
  logging.info(
@@ -189,9 +225,213 @@ class ActionsManager:
189
225
  )
190
226
  else:
191
227
  logging.debug("No actions currently running")
192
- return {}
228
+
193
229
  return self.current_actions
194
230
 
231
@log_errors(default_return={}, raise_exception=False)
def get_all_actions(self) -> dict:
    """Get all tracked actions (both running and stopped).

    Returns:
        dict: All tracked actions with their status
    """
    # Running entries first; stopped entries for the same id (not expected
    # in practice) would overwrite, matching the original insertion order.
    combined = {
        action_id: {"instance": instance, "status": "running"}
        for action_id, instance in self.current_actions.items()
    }
    combined.update(
        (action_id, {"instance": instance, "status": "stopped"})
        for action_id, instance in self.stopped_actions.items()
    )
    return combined
245
@log_errors(default_return={}, raise_exception=False)
def get_stopped_actions(self) -> dict:
    """Return the dict of actions that have been observed as stopped.

    Returns:
        dict: Stopped actions keyed by action record id
    """
    return self.stopped_actions
254
@log_errors(default_return={}, raise_exception=False)
def stop_action(self, action_record_id: str) -> dict:
    """Stop a specific action by its record ID.

    Args:
        action_record_id (str): The action record ID to stop

    Returns:
        dict: Result dictionary with status information
    """
    logging.info("Attempting to stop action: %s", action_record_id)

    # Locate the action; running takes precedence over stopped.
    if action_record_id in self.current_actions:
        action_instance = self.current_actions[action_record_id]
    elif action_record_id in self.stopped_actions:
        # Already tracked as stopped — nothing to do.
        logging.info("Action %s already in stopped_actions", action_record_id)
        return {
            "success": True,
            "reason": "already_stopped",
            "action_id": action_record_id
        }
    else:
        logging.warning("Action %s not found in current or stopped actions", action_record_id)
        return {
            "success": False,
            "reason": "action_not_found",
            "action_id": action_record_id
        }

    # Process already dead: just move the bookkeeping entry over.
    if not action_instance.is_running():
        logging.info("Action %s is not running, moving to stopped_actions", action_record_id)
        self._move_to_stopped(action_record_id, action_instance)
        return {
            "success": True,
            "reason": "already_stopped",
            "action_id": action_record_id
        }

    try:
        logging.info("Stopping action %s", action_record_id)
        action_instance.stop()

        # Report the stop to the backend before local bookkeeping.
        self.scaling.update_action_status(
            service_provider=os.environ["SERVICE_PROVIDER"],
            action_record_id=action_record_id,
            status="stopped",
            isRunning=False,
            action_duration=0,
        )

        self._move_to_stopped(action_record_id, action_instance)

        logging.info("Successfully stopped action: %s", action_record_id)
        return {
            "success": True,
            "action_id": action_record_id,
            "stopped_at": time.time()
        }

    except Exception as e:
        logging.error("Error stopping action %s: %s", action_record_id, str(e))
        return {
            "success": False,
            "reason": "stop_failed",
            "error": str(e),
            "action_id": action_record_id
        }

def _move_to_stopped(self, action_record_id: str, action_instance) -> None:
    """Move an action from current_actions to stopped_actions without duplicates."""
    self.current_actions.pop(action_record_id, None)
    self.stopped_actions.setdefault(action_record_id, action_instance)
342
+
343
@log_errors(default_return={}, raise_exception=False)
def restart_action(self, action_record_id: str) -> dict:
    """Restart a specific action by its record ID.

    Stops the action if it is running, then fetches fresh action details
    from the backend and starts it again.

    Args:
        action_record_id (str): The action record ID to restart

    Returns:
        dict: Result dictionary with status information
    """
    logging.info("Attempting to restart action: %s", action_record_id)

    # Step 1: stop a running instance (or note a previously stopped one).
    stop_result = {"success": True, "reason": "not_running"}
    if action_record_id in self.current_actions:
        logging.info("Stopping existing action %s before restart", action_record_id)
        stop_result = self.stop_action(action_record_id)
        if not stop_result.get("success"):
            logging.error("Failed to stop action %s for restart", action_record_id)
            return {
                "success": False,
                "reason": "stop_failed_before_restart",
                "stop_result": stop_result,
                "action_id": action_record_id
            }
        # Wait a moment for cleanup
        time.sleep(2)
    elif action_record_id in self.stopped_actions:
        logging.info("Action %s found in stopped_actions, will restart", action_record_id)
        stop_result = {"success": True, "reason": "was_stopped"}

    try:
        # Step 2: fetch fresh action details from the backend.
        logging.info("Fetching action details for restart: %s", action_record_id)
        action_details, error, _ = self.scaling.get_action_details(action_record_id)
        if error or not action_details:
            logging.error("Failed to fetch action details for %s: %s",
                          action_record_id, error)
            return {
                "success": False,
                "reason": "fetch_failed",
                "error": error,
                "action_id": action_record_id
            }

        # Step 3: process (start) the action.
        logging.info("Starting action %s after restart", action_record_id)
        action_instance = self.process_action(action_details)
        if not action_instance:
            logging.error("Failed to start action %s after restart", action_record_id)
            return {
                "success": False,
                "reason": "start_failed_after_restart",
                "action_id": action_record_id
            }

        # Re-register under current_actions, clearing stale bookkeeping.
        self.stopped_actions.pop(action_record_id, None)
        if action_record_id in self.current_actions:
            logging.warning("Action %s already in current_actions during restart, replacing", action_record_id)
            del self.current_actions[action_record_id]
        self.current_actions[action_record_id] = action_instance

        logging.info("Successfully restarted action: %s", action_record_id)
        return {
            "success": True,
            "action_id": action_record_id,
            "restarted_at": time.time(),
            "stop_result": stop_result
        }

    except Exception as e:
        logging.error("Error restarting action %s: %s", action_record_id, str(e))
        return {
            "success": False,
            "reason": "restart_failed",
            "error": str(e),
            "action_id": action_record_id
        }
434
+
195
435
  @log_errors(raise_exception=True)
196
436
  def start_actions_manager(self) -> None:
197
437
  """Start the actions manager main loop."""
@@ -0,0 +1,490 @@
1
+ """
2
+ Compute Operations Handler - Kafka Event-Driven Operations Manager
3
+
4
+ This module handles compute instance operations (start/stop/restart) triggered from
5
+ the frontend dashboard via Kafka events. It consumes events from the 'compute_operations'
6
+ topic and performs the actual operations on compute instances and their actions.
7
+
8
+ Uses EventListener from matrice_common for simplified Kafka consumption.
9
+
10
+ Event Structure:
11
+ {
12
+ "instance_id": "string",
13
+ "action_record_id": "string", # Can be ObjectID("000000000000000000000000") or all zeros for instance-level operations
14
+ "operation": "start|stop|restart",
15
+ "account_number": 12345,
16
+ "requested_by": "user@example.com",
17
+ "request_id": "uuid-string",
18
+ "timestamp": "2025-11-21T10:30:00.123Z"
19
+ }
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ import time
25
+ from typing import Dict, Any, Optional
26
+ import sys
27
+ import traceback
28
+ import os
29
+ import subprocess
30
+
31
+ from matrice_common.stream.event_listener import EventListener
32
+
33
+ # Configure logging
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class ComputeOperationsHandler:
38
+ """
39
+ Handles Kafka-based compute operations for instance and action management.
40
+
41
+ This class uses EventListener from matrice_common to listen for operation
42
+ events from the 'compute_operations' Kafka topic. It delegates operations
43
+ to the ActionsManager for execution and updates status via API calls.
44
+ """
45
+
46
+ KAFKA_TOPIC = "compute_operations"
47
+
48
def __init__(self, actions_manager, session, scaling, instance_id: str):
    """
    Initialize the Compute Operations Handler.

    Args:
        actions_manager: Reference to the ActionsManager instance
        session: Session object for authentication and Kafka configuration
        scaling: Scaling service instance for API status updates
        instance_id: This compute instance's ID for filtering events
    """
    self.instance_id = instance_id
    self.actions_manager = actions_manager
    self.scaling = scaling
    self.session = session
    # The listener is created lazily in start(); None until then.
    self.event_listener: Optional[EventListener] = None
    self.running = False

    logger.info(f"Initializing ComputeOperationsHandler for instance ID: {instance_id}")
66
+
67
def start(self) -> bool:
    """
    Start the operations handler using EventListener.

    Returns:
        bool: True if started successfully, False otherwise
    """
    # Idempotence guard: a second start() is a no-op that reports failure.
    if self.running:
        logger.warning("ComputeOperationsHandler is already running")
        return False

    try:
        listener = EventListener(
            session=self.session,
            topics=[self.KAFKA_TOPIC],
            event_handler=self._handle_operation_event,
            filter_field='instance_id',
            filter_value=self.instance_id,
            consumer_group_id=f"compute_ops_{self.instance_id}"
        )
        self.event_listener = listener
        self.running = listener.start()

        if self.running:
            logger.info("ComputeOperationsHandler started successfully")
        else:
            logger.error("ComputeOperationsHandler failed to start")
        return self.running

    except Exception as e:
        logger.error(f"Failed to start ComputeOperationsHandler: {e}")
        logger.error(traceback.format_exc())
        return False
100
+
101
def stop(self):
    """
    Stop the operations handler gracefully.
    """
    logger.info("Stopping ComputeOperationsHandler...")
    # Flip the flag first so the listener callback loop winds down.
    self.running = False

    listener = self.event_listener
    if listener:
        listener.stop()

    logger.info("ComputeOperationsHandler stopped")
112
+
113
def _handle_operation_event(self, event: Dict[str, Any]):
    """
    Handle incoming operation event from Kafka.

    This is the callback function passed to EventListener.

    Args:
        event: The operation event dictionary
    """
    logger.info(f"Received operation event: {event}")

    # Malformed events are logged and dropped; only valid ones proceed.
    if self._validate_event(event):
        self._process_operation(event)
    else:
        logger.error(f"Invalid event structure: {event}")
131
+
132
def _is_instance_level_operation(self, action_record_id: str) -> bool:
    """
    Check if action_record_id represents an instance-level operation.

    Instance-level operations are identified by an action_record_id made up
    of only zeros, which arrives in various formats:
    - "000000000000000000000000"
    - "ObjectID(\"000000000000000000000000\")"
    - "ObjectID('000000000000000000000000')"

    Args:
        action_record_id: The action record ID to check

    Returns:
        True if this is an instance-level operation, False otherwise
    """
    if not action_record_id:
        return False

    # Reuse the shared unwrapping helper rather than duplicating the
    # ObjectID(...) regex — keeps the two parsers consistent.
    clean_id = self._extract_action_record_id(action_record_id)

    # All-zeros (any length) marks an instance-level operation.
    return clean_id.replace('0', '') == ''
159
+
160
+ def _extract_action_record_id(self, action_record_id: str) -> str:
161
+ """
162
+ Extract the actual action record ID from various formats.
163
+
164
+ Args:
165
+ action_record_id: The raw action record ID (may be wrapped in ObjectID)
166
+
167
+ Returns:
168
+ The extracted action record ID string
169
+ """
170
+ if not action_record_id:
171
+ return action_record_id
172
+
173
+ # Handle ObjectID("...") or ObjectID('...') format
174
+ if 'ObjectID' in action_record_id:
175
+ match = re.search(r'ObjectID\(["\']([^"\']+)["\']\)', action_record_id)
176
+ if match:
177
+ return match.group(1)
178
+
179
+ return action_record_id
180
+
181
+ def _validate_event(self, event: Dict[str, Any]) -> bool:
182
+ """
183
+ Validate that the event has all required fields.
184
+
185
+ Args:
186
+ event: The event dictionary to validate
187
+
188
+ Returns:
189
+ True if event is valid, False otherwise
190
+ """
191
+ required_fields = [
192
+ "instance_id",
193
+ "action_record_id",
194
+ "operation",
195
+ "account_number",
196
+ "requested_by",
197
+ "request_id",
198
+ "timestamp"
199
+ ]
200
+
201
+ for field in required_fields:
202
+ if field not in event:
203
+ logger.error(f"Missing required field: {field}")
204
+ return False
205
+
206
+ # Validate operation type
207
+ valid_operations = ["start", "stop", "restart"]
208
+ if event["operation"] not in valid_operations:
209
+ logger.error(f"Invalid operation: {event['operation']}. Must be one of {valid_operations}")
210
+ return False
211
+
212
+ return True
213
+
214
def _process_operation(self, event: Dict[str, Any]):
    """
    Process a compute operation event.

    Args:
        event: The operation event dictionary
    """
    operation = event["operation"]
    raw_id = event["action_record_id"]
    action_record_id = self._extract_action_record_id(raw_id)

    logger.info(f"Processing {operation} operation for action {action_record_id} "
                f"(request: {event['request_id']}, user: {event['requested_by']})")

    try:
        # All-zero ids address the whole instance rather than one action.
        if self._is_instance_level_operation(raw_id):
            outcome = self._handle_instance_operation(operation, event)
        else:
            outcome = self._handle_action_operation(operation, action_record_id, event)

        # Report success via API and logging.
        self._update_operation_status(event, action_record_id, "completed", outcome)

    except Exception as e:
        error_msg = f"Operation failed: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())

        # Report the failure with the captured error message.
        self._update_operation_status(event, action_record_id, "failed", {"error": error_msg})
249
+
250
+ def _handle_action_operation(self, operation: str, action_record_id: str,
251
+ event: Dict[str, Any]) -> Dict[str, Any]:
252
+ """
253
+ Handle operations on a specific action.
254
+
255
+ Args:
256
+ operation: The operation type (start/stop/restart)
257
+ action_record_id: The action record ID to operate on
258
+ event: The full event dictionary
259
+
260
+ Returns:
261
+ Result dictionary with operation details
262
+ """
263
+ if operation == "start":
264
+ return self._start_action(action_record_id, event)
265
+ elif operation == "stop":
266
+ return self._stop_action(action_record_id, event)
267
+ elif operation == "restart":
268
+ return self._restart_action(action_record_id, event)
269
+ else:
270
+ raise ValueError(f"Unknown operation: {operation}")
271
+
272
def _handle_instance_operation(self, operation: str, event: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle operations on the entire instance (the Python application itself).

    Args:
        operation: The operation type (start/stop/restart)
        event: The full event dictionary

    Returns:
        Result dictionary with operation details (may not return if app is killed/restarted)
    """
    logger.info(f"Executing instance-level {operation} operation on Python application")

    if operation == "start":
        # "start" has no meaning for an already-running application.
        logger.warning("Start operation not supported at instance level")
        return {
            "operation": operation,
            "instance_level": True,
            "status": "not_supported",
            "message": "Start operation is not supported at instance level"
        }

    if operation == "stop":
        logger.critical("Instance-level STOP: Killing Python application process")
        try:
            # Record the outcome before the process disappears.
            logger.warning(
                f"Operation {operation} on instance {self.instance_id}: "
                f"completed - killing_application (PID: {os.getpid()})"
            )
            time.sleep(0.5)  # give log handlers a moment to flush
        except Exception as e:
            logger.error(f"Failed to log status before kill: {e}")

        logger.critical(f"Terminating Python application (PID: {os.getpid()})")
        os._exit(0)  # hard exit: deliberately skips cleanup handlers

    elif operation == "restart":
        logger.critical("Instance-level RESTART: Restarting Python application process")
        try:
            # Record the outcome before the process is replaced.
            logger.warning(
                f"Operation {operation} on instance {self.instance_id}: "
                f"completed - restarting_application (PID: {os.getpid()})"
            )
            time.sleep(0.5)  # give log handlers a moment to flush
        except Exception as e:
            logger.error(f"Failed to log status before restart: {e}")

        logger.critical(f"Restarting Python application (PID: {os.getpid()})")
        self._restart_application()

    # Unreachable after a successful stop; restart can fall through only if
    # the restart helper itself returned instead of replacing the process.
    return {
        "operation": operation,
        "instance_level": True,
        "status": "completed"
    }
337
+
338
+ def _restart_application(self):
339
+ """
340
+ Restart the Python application by replacing the current process.
341
+ This uses os.execv() to replace the current process with a new one.
342
+ """
343
+ try:
344
+ python_executable = sys.executable
345
+ script_args = sys.argv
346
+
347
+ logger.info(f"Restarting with: {python_executable} {' '.join(script_args)}")
348
+
349
+ # Use os.execv() to replace the current process
350
+ # This will restart the application with the same arguments
351
+ os.execv(python_executable, [python_executable] + script_args)
352
+
353
+ except Exception as e:
354
+ logger.error(f"Failed to restart application: {e}")
355
+ logger.error(traceback.format_exc())
356
+ # Fallback: try using subprocess to start a new process and exit
357
+ try:
358
+ logger.info("Attempting fallback restart method")
359
+ python_executable = sys.executable
360
+ script_args = sys.argv
361
+
362
+ # Start new process
363
+ subprocess.Popen([python_executable] + script_args)
364
+ # Exit current process
365
+ logger.critical("New process started, exiting current process")
366
+ os._exit(0)
367
+ except Exception as fallback_error:
368
+ logger.error(f"Fallback restart also failed: {fallback_error}")
369
+ logger.error(traceback.format_exc())
370
+ # Last resort: just exit
371
+ os._exit(1)
372
+
373
+ def _start_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
374
+ """
375
+ Start a specific action.
376
+
377
+ Args:
378
+ action_record_id: The action record ID to start
379
+ event: The full event dictionary
380
+
381
+ Returns:
382
+ Result dictionary
383
+ """
384
+ logger.info(f"Starting action: {action_record_id}")
385
+
386
+ # Check if action is already running
387
+ current_actions = self.actions_manager.get_current_actions()
388
+ if action_record_id in current_actions:
389
+ action_instance = current_actions[action_record_id]
390
+ if action_instance.is_running():
391
+ logger.warning(f"Action {action_record_id} is already running")
392
+ return {
393
+ "status": "already_running",
394
+ "action_id": action_record_id
395
+ }
396
+
397
+ # Fetch action details from backend and start it
398
+ # This will be handled by the ActionsManager's normal flow
399
+ # Force a fetch to pick up this specific action
400
+ self.actions_manager.fetch_actions()
401
+
402
+ return {
403
+ "status": "started",
404
+ "action_id": action_record_id
405
+ }
406
+
407
+ def _stop_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
408
+ """
409
+ Stop a specific action.
410
+
411
+ Args:
412
+ action_record_id: The action record ID to stop
413
+ event: The full event dictionary
414
+
415
+ Returns:
416
+ Result dictionary
417
+ """
418
+ logger.info(f"Stopping action: {action_record_id}")
419
+
420
+ result = self.actions_manager.stop_action(action_record_id)
421
+
422
+ return {
423
+ "status": "stopped",
424
+ "action_id": action_record_id,
425
+ "details": result
426
+ }
427
+
428
+ def _restart_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
429
+ """
430
+ Restart a specific action.
431
+
432
+ Args:
433
+ action_record_id: The action record ID to restart
434
+ event: The full event dictionary
435
+
436
+ Returns:
437
+ Result dictionary
438
+ """
439
+ logger.info(f"Restarting action: {action_record_id}")
440
+
441
+ result = self.actions_manager.restart_action(action_record_id)
442
+
443
+ return {
444
+ "status": "restarted",
445
+ "action_id": action_record_id,
446
+ "details": result
447
+ }
448
+
449
+ def _update_operation_status(self, event: Dict[str, Any], action_record_id: str,
450
+ status: str, result: Dict[str, Any]):
451
+ """
452
+ Update operation status via API and logging.
453
+
454
+ Args:
455
+ event: The original event
456
+ action_record_id: The extracted action record ID
457
+ status: Operation status (completed/failed)
458
+ result: Result details
459
+ """
460
+ operation = event["operation"]
461
+ request_id = event["request_id"]
462
+
463
+ # Log status as warning for visibility
464
+ logger.warning(
465
+ f"Operation {operation} on {action_record_id}: {status} - "
466
+ f"request_id={request_id}, result={result}"
467
+ )
468
+
469
+ # Update via API (for action-level operations only)
470
+ if not self._is_instance_level_operation(event["action_record_id"]):
471
+ try:
472
+ # Determine isRunning based on operation and status
473
+ is_running = False
474
+ if status == "completed":
475
+ if operation == "start":
476
+ is_running = True
477
+ elif operation == "restart":
478
+ is_running = True
479
+ elif operation == "stop":
480
+ is_running = False
481
+
482
+ self.scaling.update_action_status(
483
+ service_provider=os.environ.get("SERVICE_PROVIDER", ""),
484
+ action_record_id=action_record_id,
485
+ status=status,
486
+ isRunning=is_running,
487
+ )
488
+ logger.info(f"API status updated for action {action_record_id}: {status}")
489
+ except Exception as e:
490
+ logger.error(f"Failed to update API status for action {action_record_id}: {e}")
@@ -7,6 +7,7 @@ import threading
7
7
  import time
8
8
  from matrice_compute.actions_manager import ActionsManager
9
9
  from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
10
+ from matrice_compute.compute_operations_handler import ComputeOperationsHandler
10
11
  from matrice_compute.instance_utils import (
11
12
  get_instance_info,
12
13
  get_decrypted_access_key_pair,
@@ -90,6 +91,22 @@ class InstanceManager:
90
91
  logging.info("InstanceManager initialized with machine resources tracker")
91
92
  self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
92
93
  logging.info("InstanceManager initialized with actions resources tracker")
94
+
95
+ # Initialize Compute Operations Handler for event-driven operations
96
+ # Uses EventListener from matrice_common for simplified Kafka consumption
97
+ try:
98
+ instance_id = os.environ.get("INSTANCE_ID")
99
+ self.compute_operations_handler = ComputeOperationsHandler(
100
+ actions_manager=self.actions_manager,
101
+ session=self.session,
102
+ scaling=self.scaling,
103
+ instance_id=instance_id
104
+ )
105
+ logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
106
+ except Exception as e:
107
+ logging.warning("Failed to initialize Compute Operations Handler: %s", e)
108
+ self.compute_operations_handler = None
109
+
93
110
  self.poll_interval = 10
94
111
  # Note: encryption_key is set in _setup_env_credentials
95
112
  logging.info("InstanceManager initialized.")
@@ -252,6 +269,14 @@ class InstanceManager:
252
269
  Returns:
253
270
  tuple: (instance_manager_thread, actions_manager_thread)
254
271
  """
272
+ # Start Compute Operations Handler in background thread
273
+ if self.compute_operations_handler:
274
+ try:
275
+ self.compute_operations_handler.start()
276
+ logging.info("Started Compute Operations Handler")
277
+ except Exception as exc:
278
+ logging.error("Failed to start Compute Operations Handler: %s", str(exc))
279
+
255
280
  # Create and start threads
256
281
  instance_manager_thread = threading.Thread(
257
282
  target=self.start_instance_manager,
@@ -941,6 +941,120 @@ def get_single_gpu_with_sufficient_memory_for_action(
941
941
  raise ValueError(error_msg)
942
942
 
943
943
 
944
@log_errors(default_return="", raise_exception=False)
def get_gpu_config_for_deployment(action_details, is_first_deployment=False):
    """Get GPU configuration for deployment actions.

    The first deployment of a service tries to claim every GPU
    ('--gpus all'); subsequent deployments — and any fallback when no GPU is
    detected — use standard selection via
    get_gpu_with_sufficient_memory_for_action (most free memory).

    Args:
        action_details (dict): Action details containing GPU requirements
        is_first_deployment (bool): Whether this is the first deployment for this service

    Returns:
        str: GPU configuration string ('--gpus all' or '--gpus "device=X"' or '')
    """
    action_id = action_details.get("_id", "unknown")
    details = action_details.get("actionDetails", {})

    # CPU-only actions need no GPU flags at all.
    if not details.get("gpuRequired", False):
        logging.info(
            "Action %s does not require GPU - will run on CPU",
            action_id
        )
        return ""

    if is_first_deployment:
        logging.info(
            "Action %s: First deployment - attempting to use all GPUs",
            action_id
        )
        try:
            # Probe for GPUs; a zero exit code with non-empty output means
            # nvidia-smi found at least one device.
            probe = subprocess.run(
                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=5,
                check=False,
            )
            if probe.returncode == 0 and probe.stdout.strip():
                logging.info(
                    "Action %s: Using all GPUs for first deployment",
                    action_id
                )
                return '--gpus all'
            logging.warning(
                "Action %s: No GPUs detected via nvidia-smi for first deployment, falling back to standard GPU selection",
                action_id
            )
        except Exception as e:
            logging.warning(
                "Action %s: Error checking GPU availability (%s), falling back to standard GPU selection",
                action_id,
                str(e)
            )

    # Standard path (also the first-deployment fallback): pick the GPU(s)
    # with the most free memory that satisfy the action's requirement.
    logging.info(
        "Action %s: Using standard GPU allocation (most free memory)",
        action_id
    )

    required_memory = details.get("expectedResources", {}).get("gpuMemory", 0)

    try:
        gpu_indices = get_gpu_with_sufficient_memory_for_action(
            action_details=action_details
        )

        if not gpu_indices:
            logging.warning(
                "Action %s: No GPUs with sufficient memory found (required: %d MB)",
                action_id,
                required_memory
            )
            return ""

        gpu_str = ",".join(map(str, gpu_indices))
        logging.info(
            "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
            action_id,
            gpu_str,
            required_memory
        )
        # Docker GPU configuration pinned to the selected device(s).
        return f'--gpus "device={gpu_str}"'

    except ValueError as e:
        logging.error(
            "Action %s: Error selecting GPU - %s",
            action_id,
            str(e)
        )
        return ""
    except Exception as e:
        logging.error(
            "Action %s: Unexpected error in GPU selection - %s",
            action_id,
            str(e)
        )
        return ""
1056
+
1057
+
944
1058
  @log_errors(default_return=(None, None), raise_exception=False)
945
1059
  def get_decrypted_access_key_pair(
946
1060
  enc_access_key: str,
@@ -402,8 +402,13 @@ class ActionsResourcesTracker:
402
402
  new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
403
403
  return new_args
404
404
 
405
- args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
406
- action_record_id = args_24[-1] if args_24 else None
405
def is_valid_objectid(s: str) -> bool:
    """Check if string is a valid MongoDB ObjectId (24 hex characters)"""
    candidate = s.strip()
    if len(candidate) != 24:
        return False
    # Every character must be a hex digit (either case).
    return not (set(candidate) - set("0123456789abcdefABCDEF"))
409
+
410
+ valid_objectids = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if is_valid_objectid(arg)]
411
+ action_record_id = valid_objectids[-1] if valid_objectids else None
407
412
  if not action_record_id:
408
413
  logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
409
414
  duration = calculate_time_difference(start_time, finish_time)
@@ -2,6 +2,7 @@
2
2
 
3
3
  import os
4
4
  import logging
5
+ import base64
5
6
  from matrice_common.utils import log_errors
6
7
 
7
8
  class Scaling:
@@ -33,6 +34,28 @@ class Scaling:
33
34
  "Initialized Scaling with instance_id: %s (REST API only)",
34
35
  instance_id
35
36
  )
37
+
38
+ @log_errors(default_return=None, log_error=True)
39
+ def get_kafka_bootstrap_servers(self):
40
+ """Get Kafka bootstrap servers from API and decode base64 fields.
41
+
42
+ Returns:
43
+ str: Kafka bootstrap servers in format "ip:port"
44
+
45
+ Raises:
46
+ ValueError: If unable to fetch Kafka configuration
47
+ """
48
+ path = "/v1/actions/get_kafka_info"
49
+ response = self.rpc.get(path=path)
50
+ if not response or not response.get("success"):
51
+ raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
52
+ encoded_ip = response["data"]["ip"]
53
+ encoded_port = response["data"]["port"]
54
+ ip = base64.b64decode(encoded_ip).decode("utf-8")
55
+ port = base64.b64decode(encoded_port).decode("utf-8")
56
+ bootstrap_servers = f"{ip}:{port}"
57
+ # logging.info(f"Retrieved Kafka bootstrap servers: {bootstrap_servers}")
58
+ return bootstrap_servers
36
59
 
37
60
  @log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
38
61
  def handle_response(self, resp, success_message, error_message):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.24
3
+ Version: 0.1.26
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,18 @@
1
+ matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
+ matrice_compute/action_instance.py,sha256=SYUZrfj6dtcgEjeEgCyKlrc2p2o08jlW84Y__V4Aqew,69552
3
+ matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
4
+ matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
+ matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
6
+ matrice_compute/instance_manager.py,sha256=sUkDsy_XrPp7CKQxlujQRz3E_8rVbVZOy7byJOgMlEs,11376
7
+ matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
8
+ matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
9
+ matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ matrice_compute/resources_tracker.py,sha256=wy1huqB3Tw_kYC2wfnLa9iSyhDmgI7WQ5I9Kyr-1RSs,22829
11
+ matrice_compute/scaling.py,sha256=JNOgSpAPqbTlZ4qJokkdS9PehqyFwfPh4q98qrfNVCQ,24708
12
+ matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
13
+ matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
+ matrice_compute-0.1.26.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.26.dist-info/METADATA,sha256=t7TsI5DcNElRmlKsa8CArXCcA4iBO-9QwZ6j9UQOdg0,1038
16
+ matrice_compute-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.26.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.26.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
- matrice_compute/action_instance.py,sha256=NK_ZWvNDrLUeOzWwXjxrX7XP-lDHbx5-A0K8ByFpnUg,66241
3
- matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
4
- matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
- matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
6
- matrice_compute/instance_utils.py,sha256=xDOLo21G7unvlGTpnYQkEWSkyuAsVAcs4scOHy5Oxi4,38204
7
- matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
8
- matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- matrice_compute/resources_tracker.py,sha256=pkdt0aVKx_TpY_Sq---73w9INkDffZZe3mZGlp1EftE,22573
10
- matrice_compute/scaling.py,sha256=CeT_lxJNkjJamRETG1lWaOtdSr5ySmcaMcqt7-lFRbo,23731
11
- matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
12
- matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
13
- matrice_compute-0.1.24.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
14
- matrice_compute-0.1.24.dist-info/METADATA,sha256=5fsmPC37r0KPPd6h0qQXnvm0dFqLqboVInQdv7KCr5Y,1038
15
- matrice_compute-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- matrice_compute-0.1.24.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
17
- matrice_compute-0.1.24.dist-info/RECORD,,