matrice-compute 0.1.44__py3-none-any.whl → 0.1.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +21 -10
- matrice_compute/__init__.pyi +2056 -0
- matrice_compute/action_instance.py +21 -6
- matrice_compute/actions_manager.py +2 -1
- matrice_compute/actions_scaledown_manager.py +5 -0
- matrice_compute/instance_manager.py +26 -6
- matrice_compute/instance_utils.py +8 -8
- matrice_compute/k8s_scheduler.py +749 -0
- matrice_compute/prechecks.py +5 -6
- matrice_compute/resources_tracker.py +68 -53
- matrice_compute/scaling.py +31 -2
- matrice_compute/task_utils.py +51 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/METADATA +4 -4
- matrice_compute-0.1.46.dist-info/RECORD +20 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/WHEEL +1 -1
- matrice_compute-0.1.44.dist-info/RECORD +0 -18
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,2056 @@
|
|
|
1
|
+
"""Auto-generated stubs for package: matrice_compute."""
|
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
3
|
+
|
|
4
|
+
from cryptography.hazmat.backends import default_backend
|
|
5
|
+
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from docker.client import DockerClient
|
|
9
|
+
from docker.models.containers import Container
|
|
10
|
+
from kafka import KafkaProducer
|
|
11
|
+
from kafka import KafkaProducer, KafkaConsumer
|
|
12
|
+
from kubernetes import client, config
|
|
13
|
+
from kubernetes.client.rest import ApiException
|
|
14
|
+
from matrice.docker_utils import check_docker
|
|
15
|
+
from matrice_common.session import Session
|
|
16
|
+
from matrice_common.stream.event_listener import EventListener
|
|
17
|
+
from matrice_common.utils import log_errors
|
|
18
|
+
from matrice_compute.action_instance import ActionInstance
|
|
19
|
+
from matrice_compute.actions_manager import ActionsManager
|
|
20
|
+
from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
|
|
21
|
+
from matrice_compute.compute_operations_handler import ComputeOperationsHandler
|
|
22
|
+
from matrice_compute.instance_utils import get_docker_disk_space_usage
|
|
23
|
+
from matrice_compute.instance_utils import get_gpu_with_sufficient_memory_for_action, get_decrypted_access_key_pair, get_max_file_system, get_best_service_ip_and_network
|
|
24
|
+
from matrice_compute.instance_utils import get_instance_info, cleanup_docker_storage, get_cpu_memory_usage, get_gpu_memory_usage, get_mem_usage, get_gpu_with_sufficient_memory_for_action, get_max_file_system, has_gpu
|
|
25
|
+
from matrice_compute.instance_utils import get_instance_info, get_decrypted_access_key_pair
|
|
26
|
+
from matrice_compute.instance_utils import has_gpu, get_gpu_info, calculate_time_difference
|
|
27
|
+
from matrice_compute.instance_utils import has_gpu, get_mem_usage, cleanup_docker_storage
|
|
28
|
+
from matrice_compute.resources_tracker import MachineResourcesTracker, ActionsResourcesTracker, KafkaResourceMonitor, ContainerResourceMonitor
|
|
29
|
+
from matrice_compute.resources_tracker import ResourcesTracker, MachineResourcesTracker, ActionsResourcesTracker
|
|
30
|
+
from matrice_compute.scaling import Scaling
|
|
31
|
+
from matrice_compute.shutdown_manager import ShutdownManager
|
|
32
|
+
from matrice_compute.task_utils import setup_workspace_and_run_task
|
|
33
|
+
import base64
|
|
34
|
+
import docker
|
|
35
|
+
import json
|
|
36
|
+
import logging
|
|
37
|
+
import os
|
|
38
|
+
import platform
|
|
39
|
+
import psutil
|
|
40
|
+
import re
|
|
41
|
+
import shlex
|
|
42
|
+
import shutil
|
|
43
|
+
import signal
|
|
44
|
+
import socket
|
|
45
|
+
import subprocess
|
|
46
|
+
import sys
|
|
47
|
+
import threading
|
|
48
|
+
import time
|
|
49
|
+
import time as time_module
|
|
50
|
+
import torch
|
|
51
|
+
import traceback
|
|
52
|
+
import urllib.parse
|
|
53
|
+
import urllib.request
|
|
54
|
+
import uuid
|
|
55
|
+
import zipfile
|
|
56
|
+
|
|
57
|
+
# Constants
|
|
58
|
+
logger: Any = ... # From compute_operations_handler
|
|
59
|
+
logger: Any = ... # From k8s_scheduler
|
|
60
|
+
|
|
61
|
+
# Functions
|
|
62
|
+
# From action_instance
|
|
63
|
+
def augmentation_server_creation_execute(self) -> Any:
|
|
64
|
+
"""
|
|
65
|
+
Create Augmentation Server
|
|
66
|
+
"""
|
|
67
|
+
...
|
|
68
|
+
|
|
69
|
+
# From action_instance
|
|
70
|
+
def data_preparation_execute(self) -> Any:
|
|
71
|
+
"""
|
|
72
|
+
Execute data preparation task.
|
|
73
|
+
"""
|
|
74
|
+
...
|
|
75
|
+
|
|
76
|
+
# From action_instance
|
|
77
|
+
def data_processing_execute(self) -> Any:
|
|
78
|
+
"""
|
|
79
|
+
Execute data processing task.
|
|
80
|
+
"""
|
|
81
|
+
...
|
|
82
|
+
|
|
83
|
+
# From action_instance
|
|
84
|
+
def data_split_execute(self) -> Any:
|
|
85
|
+
"""
|
|
86
|
+
Execute data split task.
|
|
87
|
+
"""
|
|
88
|
+
...
|
|
89
|
+
|
|
90
|
+
# From action_instance
|
|
91
|
+
def database_setup_execute(self) -> Any:
|
|
92
|
+
"""
|
|
93
|
+
Creates and sets up the database for the facial recognition server.
|
|
94
|
+
MongoDB runs on port 27020:27017 (localhost only with --net=host).
|
|
95
|
+
Qdrant runs on port 6334 (localhost only with --net=host).
|
|
96
|
+
"""
|
|
97
|
+
...
|
|
98
|
+
|
|
99
|
+
# From action_instance
|
|
100
|
+
def dataset_annotation_execute(self) -> Any:
|
|
101
|
+
"""
|
|
102
|
+
Execute dataset annotation task.
|
|
103
|
+
"""
|
|
104
|
+
...
|
|
105
|
+
|
|
106
|
+
# From action_instance
|
|
107
|
+
def dataset_augmentation_execute(self) -> Any:
|
|
108
|
+
"""
|
|
109
|
+
Execute dataset augmentation task.
|
|
110
|
+
"""
|
|
111
|
+
...
|
|
112
|
+
|
|
113
|
+
# From action_instance
|
|
114
|
+
def deploy_aggregator_execute(self) -> Any:
|
|
115
|
+
"""
|
|
116
|
+
Execute deploy aggregator task.
|
|
117
|
+
"""
|
|
118
|
+
...
|
|
119
|
+
|
|
120
|
+
# From action_instance
|
|
121
|
+
def facial_recognition_setup_execute(self) -> Any:
|
|
122
|
+
"""
|
|
123
|
+
Creates and sets up the facial recognition worker server.
|
|
124
|
+
Facial recognition worker runs on port 8081 (localhost only with --net=host).
|
|
125
|
+
"""
|
|
126
|
+
...
|
|
127
|
+
|
|
128
|
+
# From action_instance
|
|
129
|
+
def fe_analytics_service_execute(self) -> Any:
|
|
130
|
+
"""
|
|
131
|
+
Creates and sets up the frontend analytics service.
|
|
132
|
+
Frontend analytics service runs on port 3001 (localhost only with --net=host).
|
|
133
|
+
"""
|
|
134
|
+
...
|
|
135
|
+
|
|
136
|
+
# From action_instance
|
|
137
|
+
def fe_fs_streaming_execute(self) -> Any:
|
|
138
|
+
"""
|
|
139
|
+
Creates and sets up the frontend for fs streaming.
|
|
140
|
+
Frontend streaming runs on port 3000 (localhost only with --net=host).
|
|
141
|
+
"""
|
|
142
|
+
...
|
|
143
|
+
|
|
144
|
+
# From action_instance
|
|
145
|
+
def image_build_execute(self) -> Any:
|
|
146
|
+
"""
|
|
147
|
+
Execute image building task.
|
|
148
|
+
"""
|
|
149
|
+
...
|
|
150
|
+
|
|
151
|
+
# From action_instance
|
|
152
|
+
def inference_tracker_setup_execute(self) -> Any:
|
|
153
|
+
"""
|
|
154
|
+
Creates and starts the inference tracker.
|
|
155
|
+
Inference tracker runs on port 8110 (localhost only with --net=host).
|
|
156
|
+
"""
|
|
157
|
+
...
|
|
158
|
+
|
|
159
|
+
# From action_instance
|
|
160
|
+
def inference_ws_server_execute(self) -> Any:
|
|
161
|
+
"""
|
|
162
|
+
Creates and starts the inference pipeline.
|
|
163
|
+
Inference WebSocket server runs on port 8102 (localhost only with --net=host).
|
|
164
|
+
"""
|
|
165
|
+
...
|
|
166
|
+
|
|
167
|
+
# From action_instance
|
|
168
|
+
def kafka_setup_execute(self) -> Any:
|
|
169
|
+
"""
|
|
170
|
+
Execute kafka server task.
|
|
171
|
+
Kafka runs on port 9092 (SASL_PLAINTEXT) and 9093 (CONTROLLER) - localhost only with --net=host.
|
|
172
|
+
"""
|
|
173
|
+
...
|
|
174
|
+
|
|
175
|
+
# From action_instance
|
|
176
|
+
def lpr_setup_execute(self) -> Any:
|
|
177
|
+
"""
|
|
178
|
+
Creates and sets up the license plate recognition server.
|
|
179
|
+
LPR worker runs on port 8082 (localhost only with --net=host).
|
|
180
|
+
"""
|
|
181
|
+
...
|
|
182
|
+
|
|
183
|
+
# From action_instance
|
|
184
|
+
def model_deploy_execute(self) -> Any:
|
|
185
|
+
"""
|
|
186
|
+
Execute model deployment task.
|
|
187
|
+
"""
|
|
188
|
+
...
|
|
189
|
+
|
|
190
|
+
# From action_instance
|
|
191
|
+
def model_eval_execute(self) -> Any:
|
|
192
|
+
"""
|
|
193
|
+
Execute model evaluation task.
|
|
194
|
+
"""
|
|
195
|
+
...
|
|
196
|
+
|
|
197
|
+
# From action_instance
|
|
198
|
+
def model_export_execute(self) -> Any:
|
|
199
|
+
"""
|
|
200
|
+
Execute model export task.
|
|
201
|
+
"""
|
|
202
|
+
...
|
|
203
|
+
|
|
204
|
+
# From action_instance
|
|
205
|
+
def model_train_execute(self) -> Any:
|
|
206
|
+
"""
|
|
207
|
+
Execute model training task.
|
|
208
|
+
"""
|
|
209
|
+
...
|
|
210
|
+
|
|
211
|
+
# From action_instance
|
|
212
|
+
def redis_setup_execute(self) -> Any:
|
|
213
|
+
"""
|
|
214
|
+
Creates and starts a Redis container using Docker.
|
|
215
|
+
Redis runs on port 6379 (localhost only with --net=host).
|
|
216
|
+
"""
|
|
217
|
+
...
|
|
218
|
+
|
|
219
|
+
# From action_instance
|
|
220
|
+
def resource_clone_execute(self) -> Any:
|
|
221
|
+
"""
|
|
222
|
+
Execute resource clone task.
|
|
223
|
+
"""
|
|
224
|
+
...
|
|
225
|
+
|
|
226
|
+
# From action_instance
|
|
227
|
+
def streaming_gateway_execute(self) -> Any:
|
|
228
|
+
"""
|
|
229
|
+
Execute streaming gateway task.
|
|
230
|
+
"""
|
|
231
|
+
...
|
|
232
|
+
|
|
233
|
+
# From action_instance
|
|
234
|
+
def synthetic_data_setup_execute(self) -> Any:
|
|
235
|
+
"""
|
|
236
|
+
Execute synthetic data setup task.
|
|
237
|
+
"""
|
|
238
|
+
...
|
|
239
|
+
|
|
240
|
+
# From action_instance
|
|
241
|
+
def synthetic_dataset_generation_execute(self) -> Any:
|
|
242
|
+
"""
|
|
243
|
+
Execute synthetic dataset generation task.
|
|
244
|
+
"""
|
|
245
|
+
...
|
|
246
|
+
|
|
247
|
+
# From action_instance
|
|
248
|
+
def video_storage_setup_execute(self) -> Any:
|
|
249
|
+
"""
|
|
250
|
+
Creates and starts Video Storage
|
|
251
|
+
Video Storage runs on port 8106 (localhost only with --net=host).
|
|
252
|
+
"""
|
|
253
|
+
...
|
|
254
|
+
|
|
255
|
+
# From instance_utils
|
|
256
|
+
def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
|
|
257
|
+
"""
|
|
258
|
+
Calculate time difference between start and finish times.
|
|
259
|
+
|
|
260
|
+
Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
|
|
261
|
+
and different precision levels (nanoseconds, microseconds, milliseconds).
|
|
262
|
+
|
|
263
|
+
Args:
|
|
264
|
+
start_time_str (str): Start time string in ISO format
|
|
265
|
+
finish_time_str (str): Finish time string in ISO format
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
int: Time difference in seconds
|
|
269
|
+
"""
|
|
270
|
+
...
|
|
271
|
+
|
|
272
|
+
# From instance_utils
|
|
273
|
+
def cleanup_docker_storage() -> None:
|
|
274
|
+
"""
|
|
275
|
+
Clean up Docker storage if space is low.
|
|
276
|
+
"""
|
|
277
|
+
...
|
|
278
|
+
|
|
279
|
+
# From instance_utils
|
|
280
|
+
def get_best_service_ip_and_network(port: int) -> tuple:
|
|
281
|
+
"""
|
|
282
|
+
Determine the best IP address and network configuration for a service.
|
|
283
|
+
|
|
284
|
+
This function intelligently selects the best IP to bind a service to:
|
|
285
|
+
|
|
286
|
+
Priority:
|
|
287
|
+
1. Public IP if it's actually on a local interface (cloud servers)
|
|
288
|
+
2. Private/LAN IP (NAT, local network, Docker)
|
|
289
|
+
3. localhost with --net=host (fallback)
|
|
290
|
+
|
|
291
|
+
Args:
|
|
292
|
+
port (int): Port number for the service
|
|
293
|
+
|
|
294
|
+
Returns:
|
|
295
|
+
tuple: (ip_address, use_host_network) where:
|
|
296
|
+
- ip_address: The IP address to use (public, private, or localhost)
|
|
297
|
+
- use_host_network: True if should use --net=host, False if should use port mapping
|
|
298
|
+
"""
|
|
299
|
+
...
|
|
300
|
+
|
|
301
|
+
# From instance_utils
|
|
302
|
+
def get_cpu_memory_usage() -> float:
|
|
303
|
+
"""
|
|
304
|
+
Get CPU memory usage.
|
|
305
|
+
|
|
306
|
+
Returns:
|
|
307
|
+
float: Memory usage between 0 and 1
|
|
308
|
+
"""
|
|
309
|
+
...
|
|
310
|
+
|
|
311
|
+
# From instance_utils
|
|
312
|
+
def get_decrypted_access_key_pair(enc_access_key: str, enc_secret_key: str, encryption_key: str = '') -> Tuple[Optional[str], Optional[str]]:
|
|
313
|
+
"""
|
|
314
|
+
Get decrypted access key pair.
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
enc_access_key (str): Encrypted access key
|
|
318
|
+
enc_secret_key (str): Encrypted secret key
|
|
319
|
+
encryption_key (str): Encryption key
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
tuple: (access_key, secret_key) strings
|
|
323
|
+
"""
|
|
324
|
+
...
|
|
325
|
+
|
|
326
|
+
# From instance_utils
|
|
327
|
+
def get_disk_space_usage() -> list:
|
|
328
|
+
"""
|
|
329
|
+
Get disk space usage for all filesystems.
|
|
330
|
+
|
|
331
|
+
Returns:
|
|
332
|
+
list: List of disk usage information dictionaries
|
|
333
|
+
"""
|
|
334
|
+
...
|
|
335
|
+
|
|
336
|
+
# From instance_utils
|
|
337
|
+
def get_docker_disk_space_usage() -> dict:
|
|
338
|
+
"""
|
|
339
|
+
Get disk space usage for Docker storage.
|
|
340
|
+
|
|
341
|
+
Returns:
|
|
342
|
+
dict: Docker disk usage information
|
|
343
|
+
"""
|
|
344
|
+
...
|
|
345
|
+
|
|
346
|
+
# From instance_utils
|
|
347
|
+
def get_encrypted_access_key_pair(access_key: str, secret_key: str, encryption_key: str = '') -> Tuple[Optional[str], Optional[str]]:
|
|
348
|
+
"""
|
|
349
|
+
Get encrypted access key pair.
|
|
350
|
+
|
|
351
|
+
Args:
|
|
352
|
+
access_key (str): access key
|
|
353
|
+
secret_key (str): secret key
|
|
354
|
+
encryption_key (str): Encryption key
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
tuple: (encrypted_access_key, encrypted_secret_key) strings
|
|
358
|
+
"""
|
|
359
|
+
...
|
|
360
|
+
|
|
361
|
+
# From instance_utils
|
|
362
|
+
def get_gpu_config_for_deployment(action_details, is_first_deployment = False) -> Any:
|
|
363
|
+
"""
|
|
364
|
+
Get GPU configuration for deployment actions.
|
|
365
|
+
|
|
366
|
+
For first deployment of a service, attempts to use all GPUs.
|
|
367
|
+
For subsequent deployments, uses standard GPU selection (most free memory).
|
|
368
|
+
Falls back gracefully to standard GPU selection if '--gpus all' is not available.
|
|
369
|
+
|
|
370
|
+
Args:
|
|
371
|
+
action_details (dict): Action details containing GPU requirements
|
|
372
|
+
is_first_deployment (bool): Whether this is the first deployment for this service
|
|
373
|
+
|
|
374
|
+
Returns:
|
|
375
|
+
str: GPU configuration string ('--gpus all' or '--gpus "device=X"' or '')
|
|
376
|
+
"""
|
|
377
|
+
...
|
|
378
|
+
|
|
379
|
+
# From instance_utils
|
|
380
|
+
def get_gpu_info() -> list:
|
|
381
|
+
"""
|
|
382
|
+
Get GPU information.
|
|
383
|
+
|
|
384
|
+
Returns:
|
|
385
|
+
list: GPU information strings
|
|
386
|
+
"""
|
|
387
|
+
...
|
|
388
|
+
|
|
389
|
+
# From instance_utils
|
|
390
|
+
def get_gpu_memory_usage() -> float:
|
|
391
|
+
"""
|
|
392
|
+
Get GPU memory usage percentage.
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
float: Memory usage between 0 and 1
|
|
396
|
+
"""
|
|
397
|
+
...
|
|
398
|
+
|
|
399
|
+
# From instance_utils
|
|
400
|
+
def get_gpu_with_sufficient_memory_for_action(action_details: dict) -> list:
|
|
401
|
+
"""
|
|
402
|
+
Get GPUs with sufficient memory for action.
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
action_details (dict): Action details
|
|
406
|
+
|
|
407
|
+
Returns:
|
|
408
|
+
list: List of GPU indices
|
|
409
|
+
|
|
410
|
+
Raises:
|
|
411
|
+
ValueError: If insufficient GPU memory
|
|
412
|
+
"""
|
|
413
|
+
...
|
|
414
|
+
|
|
415
|
+
# From instance_utils
|
|
416
|
+
def get_instance_id() -> str:
|
|
417
|
+
"""
|
|
418
|
+
Get instance ID.
|
|
419
|
+
|
|
420
|
+
Returns:
|
|
421
|
+
str: Instance ID or empty string
|
|
422
|
+
"""
|
|
423
|
+
...
|
|
424
|
+
|
|
425
|
+
# From instance_utils
|
|
426
|
+
def get_instance_info(service_provider: Optional[str] = None, instance_id: Optional[str] = None) -> tuple:
|
|
427
|
+
"""
|
|
428
|
+
Get instance provider and ID information.
|
|
429
|
+
|
|
430
|
+
Returns:
|
|
431
|
+
tuple: (service_provider, instance_id) strings
|
|
432
|
+
"""
|
|
433
|
+
...
|
|
434
|
+
|
|
435
|
+
# From instance_utils
|
|
436
|
+
def get_max_file_system() -> Optional[str]:
|
|
437
|
+
"""
|
|
438
|
+
Get filesystem with maximum available space.
|
|
439
|
+
|
|
440
|
+
Returns:
|
|
441
|
+
str: Path to filesystem with most space or None
|
|
442
|
+
"""
|
|
443
|
+
...
|
|
444
|
+
|
|
445
|
+
# From instance_utils
|
|
446
|
+
def get_mem_usage() -> float:
|
|
447
|
+
"""
|
|
448
|
+
Get memory usage for either GPU or CPU.
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
float: Memory usage between 0 and 1
|
|
452
|
+
"""
|
|
453
|
+
...
|
|
454
|
+
|
|
455
|
+
# From instance_utils
|
|
456
|
+
def get_required_gpu_memory(action_details: dict) -> int:
|
|
457
|
+
"""
|
|
458
|
+
Get required GPU memory from action details.
|
|
459
|
+
|
|
460
|
+
Args:
|
|
461
|
+
action_details (dict): Action details
|
|
462
|
+
|
|
463
|
+
Returns:
|
|
464
|
+
int: Required GPU memory
|
|
465
|
+
"""
|
|
466
|
+
...
|
|
467
|
+
|
|
468
|
+
# From instance_utils
|
|
469
|
+
def get_single_gpu_with_sufficient_memory_for_action(action_details: dict) -> list:
|
|
470
|
+
"""
|
|
471
|
+
Get single GPU with sufficient memory using most-free algorithm.
|
|
472
|
+
|
|
473
|
+
Selects the GPU with the MOST free memory that meets the requirements,
|
|
474
|
+
to balance load across GPUs and prevent any single GPU from being overused.
|
|
475
|
+
|
|
476
|
+
Args:
|
|
477
|
+
action_details (dict): Action details
|
|
478
|
+
|
|
479
|
+
Returns:
|
|
480
|
+
list: List with single GPU index
|
|
481
|
+
|
|
482
|
+
Raises:
|
|
483
|
+
ValueError: If no GPU has sufficient memory
|
|
484
|
+
"""
|
|
485
|
+
...
|
|
486
|
+
|
|
487
|
+
# From instance_utils
|
|
488
|
+
def has_gpu() -> bool:
|
|
489
|
+
"""
|
|
490
|
+
Check if the system has a GPU.
|
|
491
|
+
|
|
492
|
+
Returns:
|
|
493
|
+
bool: True if GPU is present, False otherwise
|
|
494
|
+
"""
|
|
495
|
+
...
|
|
496
|
+
|
|
497
|
+
# From instance_utils
|
|
498
|
+
def is_allowed_gpu_device(gpu_index: int) -> bool:
|
|
499
|
+
"""
|
|
500
|
+
Check if GPU device is allowed based on GPUS environment variable.
|
|
501
|
+
|
|
502
|
+
The GPUS environment variable can be used to restrict which GPU devices
|
|
503
|
+
are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).
|
|
504
|
+
|
|
505
|
+
Args:
|
|
506
|
+
gpu_index (int): GPU device index
|
|
507
|
+
|
|
508
|
+
Returns:
|
|
509
|
+
bool: True if GPU is allowed (or no filter is set), False otherwise
|
|
510
|
+
"""
|
|
511
|
+
...
|
|
512
|
+
|
|
513
|
+
# From instance_utils
|
|
514
|
+
def is_docker_running() -> bool:
|
|
515
|
+
"""
|
|
516
|
+
Check if Docker is running.
|
|
517
|
+
|
|
518
|
+
Returns:
|
|
519
|
+
bool: True if Docker containers are running
|
|
520
|
+
"""
|
|
521
|
+
...
|
|
522
|
+
|
|
523
|
+
# From instance_utils
|
|
524
|
+
def prune_docker_images() -> None:
|
|
525
|
+
"""
|
|
526
|
+
Prune Docker images.
|
|
527
|
+
"""
|
|
528
|
+
...
|
|
529
|
+
|
|
530
|
+
# From task_utils
|
|
531
|
+
def refresh_url_if_needed(url: Optional[str], scaling: Optional[Scaling] = None) -> Optional[str]:
|
|
532
|
+
"""
|
|
533
|
+
Refresh a presigned URL if it appears to be expired or about to expire.
|
|
534
|
+
|
|
535
|
+
This function attempts to refresh presigned URLs for model codebase and requirements
|
|
536
|
+
to ensure they are valid before downloading.
|
|
537
|
+
|
|
538
|
+
Args:
|
|
539
|
+
url: The URL to potentially refresh. If None or empty, returns None.
|
|
540
|
+
scaling: The Scaling instance to use for API calls. If None, returns original URL.
|
|
541
|
+
|
|
542
|
+
Returns:
|
|
543
|
+
The refreshed URL if successful, or the original URL if refresh fails or is not needed.
|
|
544
|
+
"""
|
|
545
|
+
...
|
|
546
|
+
|
|
547
|
+
# From task_utils
|
|
548
|
+
def setup_workspace_and_run_task(work_fs: str, action_id: str, model_codebase_url: str, model_codebase_requirements_url: Optional[str] = None, scaling: Optional[Scaling] = None) -> None:
|
|
549
|
+
"""
|
|
550
|
+
Set up workspace and run task with provided parameters.
|
|
551
|
+
|
|
552
|
+
Args:
|
|
553
|
+
work_fs (str): Working filesystem path.
|
|
554
|
+
action_id (str): Unique identifier for the action.
|
|
555
|
+
model_codebase_url (str): URL to download model codebase from.
|
|
556
|
+
model_codebase_requirements_url (Optional[str]): URL to download requirements from. Defaults to None.
|
|
557
|
+
scaling (Optional[Scaling]): Scaling instance for refreshing presigned URLs. Defaults to None.
|
|
558
|
+
|
|
559
|
+
Returns:
|
|
560
|
+
None
|
|
561
|
+
"""
|
|
562
|
+
...
|
|
563
|
+
|
|
564
|
+
# Classes
|
|
565
|
+
# From action_instance
|
|
566
|
+
class ActionInstance:
|
|
567
|
+
"""
|
|
568
|
+
Base class for tasks that run in Action containers.
|
|
569
|
+
"""
|
|
570
|
+
|
|
571
|
+
def __init__(self, scaling, action_info: dict) -> None:
|
|
572
|
+
"""
|
|
573
|
+
Initialize an action instance.
|
|
574
|
+
|
|
575
|
+
Args:
|
|
576
|
+
scaling (Scaling): Scaling service instance
|
|
577
|
+
action_info (dict): Action information dictionary
|
|
578
|
+
"""
|
|
579
|
+
...
|
|
580
|
+
|
|
581
|
+
def execute(self) -> Any:
|
|
582
|
+
"""
|
|
583
|
+
Execute the task.
|
|
584
|
+
"""
|
|
585
|
+
...
|
|
586
|
+
|
|
587
|
+
def get_action_details(self) -> Any:
|
|
588
|
+
"""
|
|
589
|
+
Get action details from scaling service.
|
|
590
|
+
|
|
591
|
+
Returns:
|
|
592
|
+
dict: Action details if successful, None otherwise
|
|
593
|
+
"""
|
|
594
|
+
...
|
|
595
|
+
|
|
596
|
+
def get_base_docker_cmd(self, work_fs: str = '', use_gpu: str = '', mount_docker_sock: bool = False, action_id: str = '', model_key: str = '', extra_env_vars: dict = {}, port_mapping: dict = {}, network_config: str = '', destination_workspace_path: str = '/usr/src/workspace', docker_workdir: str = '', extra_pkgs: list = []) -> Any:
|
|
597
|
+
"""
|
|
598
|
+
Build base Docker command with common options.
|
|
599
|
+
|
|
600
|
+
Args:
|
|
601
|
+
work_fs (str): Work filesystem path
|
|
602
|
+
use_gpu (str): GPU configuration string
|
|
603
|
+
mount_docker_sock (bool): Whether to mount Docker socket
|
|
604
|
+
action_id (str): Action ID
|
|
605
|
+
model_key (str): Model key
|
|
606
|
+
extra_env_vars (dict): Additional environment variables
|
|
607
|
+
port_mapping (dict): Port mappings {host_port: container_port}
|
|
608
|
+
destination_workspace_path (str): Container workspace path
|
|
609
|
+
docker_workdir (str): Docker working directory
|
|
610
|
+
extra_pkgs (list): List of extra packages to install
|
|
611
|
+
Returns:
|
|
612
|
+
str: Base Docker command
|
|
613
|
+
"""
|
|
614
|
+
...
|
|
615
|
+
|
|
616
|
+
def get_gpu_config(self, action_details) -> Any:
|
|
617
|
+
"""
|
|
618
|
+
Get GPU configuration string based on available GPUs.
|
|
619
|
+
|
|
620
|
+
Args:
|
|
621
|
+
action_details (dict): Action details containing GPU requirements
|
|
622
|
+
|
|
623
|
+
Returns:
|
|
624
|
+
str: GPU configuration string
|
|
625
|
+
"""
|
|
626
|
+
...
|
|
627
|
+
|
|
628
|
+
def get_hugging_face_token(self, model_key) -> Any:
|
|
629
|
+
"""
|
|
630
|
+
Get Hugging Face token for specific model keys.
|
|
631
|
+
|
|
632
|
+
Args:
|
|
633
|
+
model_key (str): Model key to check
|
|
634
|
+
|
|
635
|
+
Returns:
|
|
636
|
+
str: Hugging Face token if available, empty string otherwise
|
|
637
|
+
"""
|
|
638
|
+
...
|
|
639
|
+
|
|
640
|
+
def get_hugging_face_token_for_data_generation(self) -> Any: ...
|
|
641
|
+
|
|
642
|
+
def get_internal_api_key(self, action_id) -> Any:
|
|
643
|
+
"""
|
|
644
|
+
Get internal API key for action.
|
|
645
|
+
|
|
646
|
+
Args:
|
|
647
|
+
action_id (str): Action ID
|
|
648
|
+
|
|
649
|
+
Returns:
|
|
650
|
+
str: Internal API key if available, empty string otherwise
|
|
651
|
+
"""
|
|
652
|
+
...
|
|
653
|
+
|
|
654
|
+
def get_log_path(self) -> Any:
|
|
655
|
+
"""
|
|
656
|
+
Get log directory path, creating if needed.
|
|
657
|
+
|
|
658
|
+
Returns:
|
|
659
|
+
str: Path to log directory
|
|
660
|
+
"""
|
|
661
|
+
...
|
|
662
|
+
|
|
663
|
+
def is_running(self) -> bool:
|
|
664
|
+
"""
|
|
665
|
+
Check if task process is running.
|
|
666
|
+
|
|
667
|
+
This method performs a thorough check to determine if the process is still running:
|
|
668
|
+
1. Verifies that the process attribute exists and is not None
|
|
669
|
+
2. Checks if the process has terminated using poll() method
|
|
670
|
+
3. Additional safeguards against zombie processes
|
|
671
|
+
4. Coordinates with log monitoring to ensure all logs are sent before cleanup
|
|
672
|
+
|
|
673
|
+
Returns:
|
|
674
|
+
bool: True if process exists and is still running, False if process
|
|
675
|
+
does not exist or has terminated
|
|
676
|
+
"""
|
|
677
|
+
...
|
|
678
|
+
|
|
679
|
+
def send_logs_continuously(self) -> Any:
|
|
680
|
+
"""
|
|
681
|
+
Continuously read and send logs from the log file to the scaling service.
|
|
682
|
+
|
|
683
|
+
Enhanced version that tracks log position and handles graceful shutdown.
|
|
684
|
+
"""
|
|
685
|
+
...
|
|
686
|
+
|
|
687
|
+
def setup_action_requirements(self, action_details, work_fs = '', model_family = '', action_id = '') -> Any:
|
|
688
|
+
"""
|
|
689
|
+
Setup action requirements.
|
|
690
|
+
|
|
691
|
+
Args:
|
|
692
|
+
action_details (dict): Action details
|
|
693
|
+
work_fs (str): Work filesystem path
|
|
694
|
+
model_family (str): Model family name
|
|
695
|
+
action_id (str): Action ID
|
|
696
|
+
|
|
697
|
+
Raises:
|
|
698
|
+
Exception: If setup fails
|
|
699
|
+
"""
|
|
700
|
+
...
|
|
701
|
+
|
|
702
|
+
def start(self, cmd: str = '', log_name: str = '') -> Any:
|
|
703
|
+
"""
|
|
704
|
+
Start the process and log monitoring thread.
|
|
705
|
+
|
|
706
|
+
Args:
|
|
707
|
+
cmd (str): Command to execute
|
|
708
|
+
log_name (str): Name for log file
|
|
709
|
+
"""
|
|
710
|
+
...
|
|
711
|
+
|
|
712
|
+
def start_logger(self) -> Any:
|
|
713
|
+
"""
|
|
714
|
+
Start the log monitoring thread.
|
|
715
|
+
"""
|
|
716
|
+
...
|
|
717
|
+
|
|
718
|
+
def start_process(self, cmd, log_name) -> Any:
|
|
719
|
+
"""
|
|
720
|
+
Start the process and initialize logging.
|
|
721
|
+
|
|
722
|
+
Args:
|
|
723
|
+
cmd (str): Command to execute
|
|
724
|
+
log_name (str): Name for log file
|
|
725
|
+
|
|
726
|
+
Raises:
|
|
727
|
+
Exception: If process fails to start
|
|
728
|
+
"""
|
|
729
|
+
...
|
|
730
|
+
|
|
731
|
+
def stop(self) -> Any:
|
|
732
|
+
"""
|
|
733
|
+
Stop the process and log monitoring thread.
|
|
734
|
+
|
|
735
|
+
Enhanced version that ensures proper cleanup sequencing and log completion.
|
|
736
|
+
"""
|
|
737
|
+
...
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
# From actions_manager
|
|
741
|
+
class ActionsManager:
|
|
742
|
+
"""
|
|
743
|
+
Class for managing actions.
|
|
744
|
+
"""
|
|
745
|
+
|
|
746
|
+
def __init__(self, scaling) -> None:
|
|
747
|
+
"""
|
|
748
|
+
Initialize an action manager.
|
|
749
|
+
|
|
750
|
+
Args:
|
|
751
|
+
scaling (Scaling): Scaling service instance
|
|
752
|
+
"""
|
|
753
|
+
...
|
|
754
|
+
|
|
755
|
+
def fetch_actions(self) -> list:
|
|
756
|
+
"""
|
|
757
|
+
Poll for actions and process them if memory threshold is not exceeded.
|
|
758
|
+
|
|
759
|
+
Returns:
|
|
760
|
+
list: List of fetched actions
|
|
761
|
+
"""
|
|
762
|
+
...
|
|
763
|
+
|
|
764
|
+
def get_all_actions(self) -> dict:
|
|
765
|
+
"""
|
|
766
|
+
Get all tracked actions (both running and stopped).
|
|
767
|
+
|
|
768
|
+
Returns:
|
|
769
|
+
dict: All tracked actions with their status
|
|
770
|
+
"""
|
|
771
|
+
...
|
|
772
|
+
|
|
773
|
+
def get_current_actions(self) -> dict:
|
|
774
|
+
"""
|
|
775
|
+
Get the current running actions.
|
|
776
|
+
|
|
777
|
+
This method:
|
|
778
|
+
1. Updates action status tracking via update_actions_status()
|
|
779
|
+
2. Returns only the running actions (current_actions dict)
|
|
780
|
+
3. Provides detailed logging about current actions state
|
|
781
|
+
|
|
782
|
+
Returns:
|
|
783
|
+
dict: Current running actions only
|
|
784
|
+
"""
|
|
785
|
+
...
|
|
786
|
+
|
|
787
|
+
def get_stopped_actions(self) -> dict:
|
|
788
|
+
"""
|
|
789
|
+
Get stopped actions.
|
|
790
|
+
|
|
791
|
+
Returns:
|
|
792
|
+
dict: Stopped actions
|
|
793
|
+
"""
|
|
794
|
+
...
|
|
795
|
+
|
|
796
|
+
def process_action(self, action: dict) -> Any:
|
|
797
|
+
"""
|
|
798
|
+
Process the given action.
|
|
799
|
+
|
|
800
|
+
Args:
|
|
801
|
+
action (dict): Action details to process
|
|
802
|
+
|
|
803
|
+
Returns:
|
|
804
|
+
ActionInstance: Processed action instance or None if failed
|
|
805
|
+
"""
|
|
806
|
+
...
|
|
807
|
+
|
|
808
|
+
def process_actions(self) -> None:
|
|
809
|
+
"""
|
|
810
|
+
Process fetched actions.
|
|
811
|
+
"""
|
|
812
|
+
...
|
|
813
|
+
|
|
814
|
+
def purge_unwanted(self) -> None:
|
|
815
|
+
"""
|
|
816
|
+
Purge completed or failed actions.
|
|
817
|
+
|
|
818
|
+
NOTE: This now calls update_actions_status() which moves stopped actions
|
|
819
|
+
to a separate dict instead of deleting them. This prevents interference
|
|
820
|
+
with compute operations handler while maintaining accurate status.
|
|
821
|
+
"""
|
|
822
|
+
...
|
|
823
|
+
|
|
824
|
+
def restart_action(self, action_record_id: str) -> dict:
|
|
825
|
+
"""
|
|
826
|
+
Restart a specific action by its record ID.
|
|
827
|
+
|
|
828
|
+
This method stops the action if it's running, then fetches fresh action
|
|
829
|
+
details from the backend and starts it again.
|
|
830
|
+
|
|
831
|
+
Args:
|
|
832
|
+
action_record_id (str): The action record ID to restart
|
|
833
|
+
|
|
834
|
+
Returns:
|
|
835
|
+
dict: Result dictionary with status information
|
|
836
|
+
"""
|
|
837
|
+
...
|
|
838
|
+
|
|
839
|
+
def start_actions_manager(self) -> None:
|
|
840
|
+
"""
|
|
841
|
+
Start the actions manager main loop.
|
|
842
|
+
"""
|
|
843
|
+
...
|
|
844
|
+
|
|
845
|
+
def stop_action(self, action_record_id: str) -> dict:
|
|
846
|
+
"""
|
|
847
|
+
Stop a specific action by its record ID.
|
|
848
|
+
|
|
849
|
+
Args:
|
|
850
|
+
action_record_id (str): The action record ID to stop
|
|
851
|
+
|
|
852
|
+
Returns:
|
|
853
|
+
dict: Result dictionary with status information
|
|
854
|
+
"""
|
|
855
|
+
...
|
|
856
|
+
|
|
857
|
+
def update_actions_status(self) -> None:
|
|
858
|
+
"""
|
|
859
|
+
Update tracking of running vs stopped actions.
|
|
860
|
+
|
|
861
|
+
This method checks all actions and moves stopped ones to stopped_actions dict
|
|
862
|
+
without deleting them. This prevents interference with compute operations
|
|
863
|
+
handler while maintaining accurate status reporting.
|
|
864
|
+
"""
|
|
865
|
+
...
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
# From actions_scaledown_manager
|
|
869
|
+
class ActionsScaleDownManager:
    """Manages scale-down of action containers."""

    def __init__(self, scaling) -> None:
        """
        Create the scale-down manager.

        Args:
            scaling (Scaling): Scaling service used to query scale-down state.
        """
        ...

    def auto_scaledown_actions(self) -> None:
        """Poll for containers marked for scale-down and stop them."""
        ...
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
# From compute_operations_handler
|
|
891
|
+
class ComputeOperationsHandler:
    """
    Kafka-driven handler for compute instance and action operations.

    Listens for operation events on the 'compute_operations' Kafka topic via
    matrice_common's EventListener, hands each operation to the
    ActionsManager for execution, and reports status back through API calls.
    """

    def __init__(self, actions_manager, session, scaling, instance_id: str) -> None:
        """
        Build the handler.

        Args:
            actions_manager: ActionsManager that executes the operations.
            session: Session providing authentication and Kafka configuration.
            scaling: Scaling service used for API status updates.
            instance_id: ID of this compute instance, used to filter events.
        """
        ...

    KAFKA_TOPIC: Any

    def start(self) -> bool:
        """
        Begin listening for operation events via EventListener.

        Returns:
            bool: True on successful start, False otherwise.
        """
        ...

    def stop(self) -> Any:
        """Shut the handler down gracefully."""
        ...
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
# From instance_manager
|
|
931
|
+
class InstanceManager:
    """
    Manages a compute instance and the actions that run on it.

    Includes auto streaming capabilities for specified deployment IDs.
    """

    def __init__(self, matrice_access_key_id: str = '', matrice_secret_access_key: str = '', encryption_key: str = '', instance_id: str = '', service_provider: str = '', env: str = '', gpus: str = '', workspace_dir: str = 'matrice_workspace', enable_kafka: bool = False) -> None:
        """
        Construct an instance manager.

        Args:
            matrice_access_key_id (str): Matrice access key ID. Defaults to ''.
            matrice_secret_access_key (str): Matrice secret access key.
                Defaults to ''.
            encryption_key (str): Key for encrypting sensitive data.
                Defaults to ''.
            instance_id (str): Unique ID of this compute instance.
                Defaults to ''.
            service_provider (str): Cloud provider in use. Defaults to ''.
            env (str): Environment name (e.g. dev, prod). Defaults to ''.
            gpus (str): GPU configuration string (e.g. "0,1"). Defaults to ''.
            workspace_dir (str): Workspace files directory. Defaults to
                "matrice_workspace".
            enable_kafka (bool): Whether to enable Kafka communication.
                Defaults to False.
        """
        ...

    def start(self) -> tuple:
        """
        Launch the instance manager threads.

        Returns:
            tuple: (instance_manager_thread, actions_manager_thread)
        """
        ...

    def start_container_status_monitor(self) -> Any:
        """Begin background monitoring of container status."""
        ...

    def start_instance_manager(self) -> None:
        """Run the instance manager loop."""
        ...

    def stop(self) -> Any:
        """Stop every background thread and release resources."""
        ...

    def stop_container_status_monitor(self) -> Any:
        """End background monitoring of container status."""
        ...
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
# From k8s_scheduler
|
|
998
|
+
class K8sScheduler:
    """
    In-cluster Kubernetes scheduler: polls for assigned actions and runs
    each one as a K8s Job, authenticating with in-cluster credentials.
    """

    def __init__(self) -> None: ...

    def check_job_status(self, action_id: str, job_name: str, namespace: str) -> Optional[str]:
        """Return the job's status once completed; also monitors resource usage."""
        ...

    def create_k8s_job(self, action: Dict[str, Any]) -> Optional[str]:
        """Create a Kubernetes Job for the given action."""
        ...

    def monitor_running_jobs(self) -> Any:
        """Watch running jobs and push action status updates."""
        ...

    def poll_pending_actions(self) -> List[Dict[str, Any]]:
        """
        Fetch actions assigned to this Kubernetes cluster.

        Uses the K8s-specific endpoint flow:
        - processClusterName in be-action detects K8s clusters and sets kubernetesClusterId
        - the scheduler calls /v1/actions/assign_jobs_kubernetes/{cluster_id} to fetch assigned actions
        """
        ...

    def send_heartbeat(self) -> Any:
        """Report cluster health to the Matrice API via a heartbeat."""
        ...

    def start(self) -> Any:
        """Run the main scheduler loop (mirrors InstanceManager.start())."""
        ...

    def update_action_status(self, action_id: str, step_code: str, status: str, description: str, extra_details: Optional[Dict] = None) -> Any:
        """
        Update an action record through the standard action update endpoint.

        The endpoint accepts:
        - stepCode: the step code for the action
        - status: status value (OK, ERROR, etc.)
        - statusDescription: human-readable description

        Any extra_details are merged into the record's actionDetails.
        """
        ...
|
|
1058
|
+
|
|
1059
|
+
|
|
1060
|
+
# From prechecks
|
|
1061
|
+
class Prechecks:
    """Runs validation checks before compute operations begin."""

    def __init__(self, session, instance_id: Optional[str] = None) -> None:
        """
        Set up the precheck runner.

        Args:
            session: Session object used for RPC calls.
            instance_id: Optional compute instance ID.
        """
        ...

    def check_credentials(self, access_key: Optional[str] = None, secret_key: Optional[str] = None) -> bool:
        """
        Validate the access/secret key pair.

        Args:
            access_key: Optional access key to check.
            secret_key: Optional secret key to check.

        Returns:
            bool: True when the credentials are valid.
        """
        ...

    def check_docker(self) -> bool:
        """
        Verify that docker is installed and operational.

        Returns:
            bool: True when docker works.
        """
        ...

    def check_fetch_actions(self) -> bool:
        """
        Exercise action fetching and validation.

        Returns:
            bool: True when action fetching succeeds.
        """
        ...

    def check_filesystem_space(self) -> bool:
        """
        Inspect filesystem free space and usage.

        Returns:
            bool: True when available space is sufficient.
        """
        ...

    def check_get_gpu_indices(self) -> bool:
        """
        Confirm that get_gpu_indices yields valid indices.

        Returns:
            bool: True when the GPU indices are valid.
        """
        ...

    def check_gpu(self) -> bool:
        """
        Confirm the machine has a functioning GPU.

        Returns:
            bool: True when the GPU check passes.
        """
        ...

    def check_instance_id(self, instance_id: Optional[str] = None) -> bool:
        """
        Validate the instance ID from the argument or environment.

        Args:
            instance_id: Optional instance ID to check.

        Returns:
            bool: True when the instance ID is valid.
        """
        ...

    def check_resources(self) -> bool:
        """
        Validate system resource limits and availability.

        Returns:
            bool: True when resource checks pass.
        """
        ...

    def check_resources_tracking(self) -> bool:
        """
        Exercise resource-tracking updates and monitoring.

        Returns:
            bool: True when resource tracking works.
        """
        ...

    def check_scaling_status(self) -> bool:
        """
        Probe the scaling service status.

        Returns:
            bool: True when the scaling status is OK.
        """
        ...

    def cleanup_docker_storage(self) -> bool:
        """
        Reclaim docker storage and confirm space was freed.

        Returns:
            bool: True when cleanup succeeds.
        """
        ...

    def create_docker_volume(self) -> bool:
        """
        Create the docker volume.

        Returns:
            bool: True when the volume is created.
        """
        ...

    def get_available_resources(self) -> bool:
        """
        Confirm available system resources fall within valid ranges.

        Returns:
            bool: True when resources are within range.
        """
        ...

    def get_shutdown_details(self) -> bool:
        """
        Fetch and validate shutdown details from the response.

        Returns:
            bool: True when the shutdown details are valid.
        """
        ...

    def run_all_checks(self, instance_id: Optional[str] = None, access_key: Optional[str] = None, secret_key: Optional[str] = None) -> bool:
        """
        Execute every precheck in order.

        Args:
            instance_id: Optional instance ID to validate.
            access_key: Optional access key to validate.
            secret_key: Optional secret key to validate.

        Returns:
            bool: True when every check passes.
        """
        ...

    def setup_docker(self) -> bool:
        """
        Perform docker setup.

        Returns:
            bool: True when setup succeeds.
        """
        ...

    def test_actions_scale_down(self) -> bool:
        """
        Exercise the actions scale-down path.

        Returns:
            bool: True when the scale-down test passes.
        """
        ...

    def test_gpu(self) -> bool:
        """
        Verify the GPU works and has enough memory.

        Returns:
            bool: True when the GPU test passes.
        """
        ...
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
# From resources_tracker
|
|
1252
|
+
class ActionsResourcesTracker:
    """Tracks resource usage of Docker container actions."""

    def __init__(self, scaling) -> None:
        """Set up the tracker with the given scaling service."""
        ...

    def get_current_action_usage(self, container, status: str) -> Tuple[float, int, float, float]:
        """Return the container's current resource usage."""
        ...

    def get_sub_containers_by_label(self, label_key: str, label_value: str) -> list:
        """Return running containers carrying the given label key/value pair."""
        ...

    def update_actions_resources(self) -> None:
        """
        Process both running and exited containers.

        Containers are never removed (their logs are kept); only resource
        usage is tracked.
        """
        ...

    def update_max_action_usage(self, action_record_id: str, current_gpu_utilization: float, current_gpu_memory: int, current_cpu_utilization: float, current_memory_utilization: float) -> Tuple[float, int, float, float]:
        """Record and return the peak resource usage values for an action."""
        ...
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
# From resources_tracker
|
|
1291
|
+
class ContainerResourceMonitor:
    """
    Monitors per-container resource utilization and publishes it to Kafka.

    Runs as an independent thread reporting CPU, memory, and GPU usage for
    all running containers.
    """

    def __init__(self, instance_id: Optional[str] = None, kafka_bootstrap: Optional[str] = None, interval_seconds: int = 30) -> None:
        """
        Initialize ContainerResourceMonitor.

        Args:
            instance_id: Instance identifier for the Kafka topic. Defaults to
                the INSTANCE_ID env var.
            kafka_bootstrap: Kafka bootstrap servers. Required — should come
                from Scaling.get_kafka_bootstrap_servers().
            interval_seconds: Seconds between container checks. Defaults to 30.
        """
        ...

    def is_running(self) -> bool:
        """
        Report whether the monitor thread is currently running.

        Returns:
            bool: True if running, False otherwise.
        """
        ...

    # NOTE: annotation tightened from Any to bool to match the documented return.
    def start(self) -> bool:
        """
        Start the container resource monitoring thread.

        Returns:
            bool: True if started successfully, False otherwise.
        """
        ...

    # NOTE: annotation tightened from Any to bool to match the documented return.
    def stop(self, timeout: int = 10) -> bool:
        """
        Stop the monitoring thread gracefully.

        Args:
            timeout: Maximum seconds to wait for the thread to stop.

        Returns:
            bool: True if stopped successfully, False otherwise.
        """
        ...
|
|
1337
|
+
|
|
1338
|
+
|
|
1339
|
+
# From resources_tracker
|
|
1340
|
+
class KafkaResourceMonitor:
    """
    Monitors system resources and publishes them to Kafka in a separate
    thread, with thread-safe start/stop operations.
    """

    def __init__(self, instance_id: Optional[str] = None, kafka_bootstrap: Optional[str] = None, interval_seconds: int = 60) -> None:
        """
        Initialize KafkaResourceMonitor.

        Args:
            instance_id: Instance identifier for the Kafka topic. Defaults to
                the INSTANCE_ID env var.
            kafka_bootstrap: Kafka bootstrap servers. Required — should come
                from Scaling.get_kafka_bootstrap_servers().
            interval_seconds: Seconds between resource checks. Defaults to 60.
        """
        ...

    # NOTE: declared without `self` in the original stub — only valid as a
    # @staticmethod; the decorator was evidently dropped by the generator.
    @staticmethod
    def get_all_gpu_memory() -> Dict[int, tuple]:
        """
        Get GPU memory usage and totals for all GPUs.

        Returns:
            Dict[int, tuple]: Maps GPU ID to (used_gb, total_gb). Empty when
            nvidia-smi is unavailable.
        """
        ...

    # NOTE: declared without `self` in the original stub — only valid as a
    # @staticmethod; the decorator was evidently dropped by the generator.
    @staticmethod
    def get_all_storage_info() -> Dict[str, tuple]:
        """
        Get storage information for all mounted drives.

        Returns:
            Dict[str, tuple]: Maps mount point to (free_gb, total_gb).
        """
        ...

    def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
        """
        Collect current system resource statistics.

        Returns:
            Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
                CPU usage %, CPU cores, RAM total GB, RAM used GB,
                GPU memory dict (used, total), storage dict (free, total).
        """
        ...

    def is_running(self) -> bool:
        """
        Report whether the monitor is currently running.

        Returns:
            bool: True if running, False otherwise.
        """
        ...

    # NOTE: annotation tightened from Any to bool to match the documented return.
    def start(self) -> bool:
        """
        Start the resource monitoring thread.

        Returns:
            bool: True if started successfully, False otherwise.
        """
        ...

    # NOTE: annotation tightened from Any to bool to match the documented return.
    def stop(self, timeout: int = 10) -> bool:
        """
        Stop the monitoring thread gracefully.

        Args:
            timeout: Maximum seconds to wait for the thread to stop.

        Returns:
            bool: True if stopped successfully, False otherwise.
        """
        ...
|
|
1415
|
+
|
|
1416
|
+
|
|
1417
|
+
# From resources_tracker
|
|
1418
|
+
class MachineResourcesTracker:
    """Tracks machine-level resources: CPU, memory and GPU."""

    def __init__(self, scaling) -> None:
        """Set up the tracker with the given scaling service."""
        ...

    def update_available_resources(self) -> Any:
        """Refresh the record of available machine resources."""
        ...
|
|
1434
|
+
|
|
1435
|
+
|
|
1436
|
+
# From resources_tracker
|
|
1437
|
+
class ResourcesTracker:
    """
    Tracks machine and container resources.

    GPU Utilization Note:
        GPU utilization is tracked at the DEVICE level, not per container —
        NVIDIA exposes no reliable per-process GPU utilization. Per-container
        GPU MEMORY is accurate; per-container GPU UTILIZATION is best-effort.
    """

    def __init__(self) -> None:
        """Initialize ResourcesTracker."""
        ...

    def get_all_container_pids(self, container_id: str) -> set:
        """
        Collect every PID belonging to a container, including children.

        Several methods are combined for robustness:
        1. docker top (most reliable for standard Docker)
        2. Docker API inspect plus process-tree enumeration
        3. cgroup procs files (v1 and v2)

        Known limitations: processes may be missed under rootless Docker, and
        CRI-O/containerd can use different layouts.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            set: All PIDs (as strings) that belong to the container.
        """
        ...

    def get_available_resources(self) -> Tuple[float, float, int, float]:
        """
        Report available machine resources.

        The CPU measurement is non-blocking (interval=0); for better accuracy
        call periodically and track the trend.

        Returns:
            Tuple[float, float, int, float]:
                - available memory in GB
                - available CPU percentage (100 - current_usage)
                - free GPU memory in MB
                - GPU utilization percentage (0-100)
        """
        ...

    def get_container_cpu_and_memory(self, container) -> Tuple[float, float]:
        """
        Report a container's CPU and memory usage.

        Args:
            container (docker.models.containers.Container): Docker container.

        Returns:
            Tuple[float, float]: CPU utilization percentage (0-100 per core
            used) and memory usage in MB.
        """
        ...

    def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
        """
        Report CPU and memory usage for the container with the given ID.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            Tuple[float, float]: CPU utilization percentage and memory in MB.
        """
        ...

    def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
        """
        Report GPU usage for a specific container.

        IMPORTANT limitations:
        - per-container GPU MEMORY is ACCURATE (nvidia-smi per-process data)
        - per-container GPU UTILIZATION is BEST-EFFORT (NVIDIA exposes no
          per-process SM usage)

        Utilization is reported for the GPUs hosting the container's
        processes, so containers sharing a GPU report similar values.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            Tuple[float, int]:
                - GPU utilization percentage (device-level, for the GPUs the
                  container uses)
                - GPU memory usage in MB (accurate per container)
        """
        ...

    def get_container_gpu_memory_usage(self, container_pid: str) -> int:
        """
        Report GPU memory usage for a container PID.

        Args:
            container_pid (str): PID of the Docker container.

        Returns:
            int: GPU memory usage in MB.
        """
        ...

    def get_container_gpu_memory_usage_multi_pid(self, container_pids: set) -> int:
        """
        Report GPU memory usage across multiple container PIDs.

        Args:
            container_pids (set): Container PIDs (as strings).

        Returns:
            int: Total GPU memory usage in MB over all matching processes.
        """
        ...

    def get_container_gpu_usage(self, container_pid: str) -> float:
        """
        Report GPU usage for a container PID.

        Args:
            container_pid (str): PID of the Docker container.

        Returns:
            float: GPU utilization percentage.
        """
        ...

    def get_container_gpu_usage_multi_pid(self, container_pids: set) -> float:
        """
        Report GPU usage across multiple container PIDs.

        Args:
            container_pids (set): Container PIDs (as strings).

        Returns:
            float: Total GPU utilization percentage over matching processes.
        """
        ...

    def get_pid_id_by_container_id(self, container_id: str) -> str:
        """
        Look up the PID for a container ID.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            str: PID of the container.
        """
        ...
|
|
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
# From scaling
|
|
1597
|
+
class Scaling:
|
|
1598
|
+
"""
|
|
1599
|
+
Class providing scaling functionality for compute instances.
|
|
1600
|
+
"""
|
|
1601
|
+
|
|
1602
|
+
def __init__(self, session, instance_id = None, enable_kafka = False) -> None:
|
|
1603
|
+
"""
|
|
1604
|
+
Initialize Scaling instance.
|
|
1605
|
+
|
|
1606
|
+
Args:
|
|
1607
|
+
session: Session object for making RPC calls
|
|
1608
|
+
instance_id: ID of the compute instance
|
|
1609
|
+
enable_kafka: Enable Kafka communication (default True)
|
|
1610
|
+
|
|
1611
|
+
Raises:
|
|
1612
|
+
Exception: If instance_id is not provided
|
|
1613
|
+
"""
|
|
1614
|
+
...
|
|
1615
|
+
|
|
1616
|
+
def add_account_compute_instance(self, account_number, alias, service_provider, instance_type, shut_down_time, lease_type, launch_duration) -> Any:
|
|
1617
|
+
"""
|
|
1618
|
+
Add a compute instance for an account.
|
|
1619
|
+
|
|
1620
|
+
Args:
|
|
1621
|
+
account_number: Account number
|
|
1622
|
+
alias: Instance alias
|
|
1623
|
+
service_provider: Cloud service provider
|
|
1624
|
+
instance_type: Type of instance
|
|
1625
|
+
shut_down_time: Time to shutdown
|
|
1626
|
+
lease_type: Type of lease
|
|
1627
|
+
launch_duration: Duration to launch
|
|
1628
|
+
|
|
1629
|
+
Returns:
|
|
1630
|
+
Tuple of (data, error, message) from API response
|
|
1631
|
+
"""
|
|
1632
|
+
...
|
|
1633
|
+
|
|
1634
|
+
def assign_jobs(self, is_gpu) -> Any:
|
|
1635
|
+
"""
|
|
1636
|
+
Assign jobs to the instance using REST API.
|
|
1637
|
+
|
|
1638
|
+
Args:
|
|
1639
|
+
is_gpu: Boolean or any value indicating if this is a GPU instance.
|
|
1640
|
+
Will be converted to proper boolean.
|
|
1641
|
+
|
|
1642
|
+
Returns:
|
|
1643
|
+
Tuple of (data, error, message) from API response
|
|
1644
|
+
"""
|
|
1645
|
+
...
|
|
1646
|
+
|
|
1647
|
+
def delete_account_compute(self, account_number, alias) -> Any:
|
|
1648
|
+
"""
|
|
1649
|
+
Delete a compute instance for an account.
|
|
1650
|
+
|
|
1651
|
+
Args:
|
|
1652
|
+
account_number: Account number
|
|
1653
|
+
alias: Instance alias
|
|
1654
|
+
|
|
1655
|
+
Returns:
|
|
1656
|
+
Tuple of (data, error, message) from API response
|
|
1657
|
+
"""
|
|
1658
|
+
...
|
|
1659
|
+
|
|
1660
|
+
def get_action_details(self, action_status_id) -> Any:
|
|
1661
|
+
"""
|
|
1662
|
+
Get details for a specific action using Kafka (with REST fallback).
|
|
1663
|
+
|
|
1664
|
+
Args:
|
|
1665
|
+
action_status_id: ID of the action status to fetch
|
|
1666
|
+
|
|
1667
|
+
Returns:
|
|
1668
|
+
Tuple of (data, error, message) from API response
|
|
1669
|
+
"""
|
|
1670
|
+
...
|
|
1671
|
+
|
|
1672
|
+
def get_all_instances_type(self) -> Any:
|
|
1673
|
+
"""
|
|
1674
|
+
Get all instance types using Kafka (with REST fallback).
|
|
1675
|
+
|
|
1676
|
+
Returns:
|
|
1677
|
+
Tuple of (data, error, message) from API response
|
|
1678
|
+
"""
|
|
1679
|
+
...
|
|
1680
|
+
|
|
1681
|
+
def get_compute_details(self) -> Any:
|
|
1682
|
+
"""
|
|
1683
|
+
Get compute instance details using Kafka (with REST fallback).
|
|
1684
|
+
|
|
1685
|
+
Returns:
|
|
1686
|
+
Tuple of (data, error, message) from API response
|
|
1687
|
+
"""
|
|
1688
|
+
...
|
|
1689
|
+
|
|
1690
|
+
def get_data_processing_image(self) -> Any:
|
|
1691
|
+
"""
|
|
1692
|
+
Get data processing image name.
|
|
1693
|
+
|
|
1694
|
+
Returns:
|
|
1695
|
+
Full image name including repository and tag
|
|
1696
|
+
"""
|
|
1697
|
+
...
|
|
1698
|
+
|
|
1699
|
+
def get_docker_hub_credentials(self) -> Any:
|
|
1700
|
+
"""
|
|
1701
|
+
Get Docker Hub credentials using Kafka (with REST fallback).
|
|
1702
|
+
|
|
1703
|
+
Returns:
|
|
1704
|
+
Tuple of (data, error, message) from API response
|
|
1705
|
+
"""
|
|
1706
|
+
...
|
|
1707
|
+
|
|
1708
|
+
def get_downscaled_ids(self) -> Any:
|
|
1709
|
+
"""
|
|
1710
|
+
Get IDs of downscaled instances using Kafka (with REST fallback).
|
|
1711
|
+
|
|
1712
|
+
Returns:
|
|
1713
|
+
Tuple of (data, error, message) from API response
|
|
1714
|
+
"""
|
|
1715
|
+
...
|
|
1716
|
+
|
|
1717
|
+
def get_internal_api_key(self, action_id) -> Any:
|
|
1718
|
+
"""
|
|
1719
|
+
Get internal API key using Kafka (with REST fallback).
|
|
1720
|
+
|
|
1721
|
+
Args:
|
|
1722
|
+
action_id: ID of the action
|
|
1723
|
+
|
|
1724
|
+
Returns:
|
|
1725
|
+
Tuple of (data, error, message) from API response
|
|
1726
|
+
"""
|
|
1727
|
+
...
|
|
1728
|
+
|
|
1729
|
+
def get_kafka_bootstrap_servers(self) -> Any:
|
|
1730
|
+
"""
|
|
1731
|
+
Get Kafka bootstrap servers from API and decode base64 fields.
|
|
1732
|
+
|
|
1733
|
+
Returns:
|
|
1734
|
+
str: Kafka bootstrap servers in format "ip:port"
|
|
1735
|
+
|
|
1736
|
+
Raises:
|
|
1737
|
+
ValueError: If unable to fetch Kafka configuration
|
|
1738
|
+
"""
|
|
1739
|
+
...
|
|
1740
|
+
|
|
1741
|
+
def get_model_codebase(self, model_family_id) -> Any:
|
|
1742
|
+
"""
|
|
1743
|
+
Get model codebase.
|
|
1744
|
+
|
|
1745
|
+
Args:
|
|
1746
|
+
model_family_id: ID of the model family
|
|
1747
|
+
|
|
1748
|
+
Returns:
|
|
1749
|
+
Tuple of (data, error, message) from API response
|
|
1750
|
+
"""
|
|
1751
|
+
...
|
|
1752
|
+
|
|
1753
|
+
def get_model_codebase_requirements(self, dockerId) -> Any:
|
|
1754
|
+
"""
|
|
1755
|
+
Get model codebase requirements.
|
|
1756
|
+
|
|
1757
|
+
Args:
|
|
1758
|
+
dockerId: ID of the docker
|
|
1759
|
+
|
|
1760
|
+
Returns:
|
|
1761
|
+
Tuple of (data, error, message) from API response
|
|
1762
|
+
"""
|
|
1763
|
+
...
|
|
1764
|
+
|
|
1765
|
+
def get_model_codebase_script(self, model_family_id) -> Any:
|
|
1766
|
+
"""
|
|
1767
|
+
Get model codebase script.
|
|
1768
|
+
|
|
1769
|
+
Args:
|
|
1770
|
+
model_family_id: ID of the model family
|
|
1771
|
+
|
|
1772
|
+
Returns:
|
|
1773
|
+
Tuple of (data, error, message) from API response
|
|
1774
|
+
"""
|
|
1775
|
+
...
|
|
1776
|
+
|
|
1777
|
+
def get_model_secret_keys(self, secret_name) -> Any:
|
|
1778
|
+
"""
|
|
1779
|
+
Get model secret keys using Kafka (with REST fallback).
|
|
1780
|
+
|
|
1781
|
+
Args:
|
|
1782
|
+
secret_name: Name of the secret
|
|
1783
|
+
|
|
1784
|
+
Returns:
|
|
1785
|
+
Tuple of (data, error, message) from API response
|
|
1786
|
+
"""
|
|
1787
|
+
...
|
|
1788
|
+
|
|
1789
|
+
def get_open_port(self) -> Any:
|
|
1790
|
+
"""
|
|
1791
|
+
Get an available open port.
|
|
1792
|
+
|
|
1793
|
+
Returns:
|
|
1794
|
+
Port number if available, None otherwise
|
|
1795
|
+
"""
|
|
1796
|
+
...
|
|
1797
|
+
|
|
1798
|
+
def get_open_ports_config(self) -> Any:
|
|
1799
|
+
"""
|
|
1800
|
+
Get open ports configuration using Kafka (with REST fallback).
|
|
1801
|
+
|
|
1802
|
+
Returns:
|
|
1803
|
+
Tuple of (data, error, message) from API response
|
|
1804
|
+
"""
|
|
1805
|
+
...
|
|
1806
|
+
|
|
1807
|
+
def get_shutdown_details(self) -> Any:
    """Fetch shutdown details for this instance over Kafka, falling back to REST.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def get_tasks_details(self) -> Any:
    """Fetch task details for this instance over Kafka, falling back to REST.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def get_user_access_key_pair(self, user_id) -> Any:
    """Fetch a user's access key pair over Kafka, falling back to REST.

    Args:
        user_id: Identifier of the user.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def handle_response(self, resp, success_message, error_message) -> Any:
    """Normalize an API response into a standard result triple.

    Args:
        resp: Raw response from the API call.
        success_message: Message logged when the call succeeded.
        error_message: Message logged when the call failed.

    Returns:
        ``(data, error, message)`` tuple.
    """
    ...
def refresh_presigned_url(self, url: str) -> Any:
    """Obtain a fresh presigned URL for one that may have expired.

    Args:
        url: The presigned URL to refresh.

    Returns:
        ``(refreshed_url, error, message)`` tuple from the API response.
    """
    ...
def report_architecture_info(self) -> Any:
    """Collect host architecture information and report it to the compute service."""
    ...
def restart_account_compute(self, account_number, alias) -> Any:
    """Restart an account's compute instance over Kafka, falling back to REST.

    Args:
        account_number: Account number owning the instance.
        alias: Alias of the instance to restart.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def shutdown(self) -> Any:
    """Close Kafka connections gracefully."""
    ...
def stop_account_compute(self, account_number, alias) -> Any:
    """Stop an account's compute instance over Kafka, falling back to REST.

    Args:
        account_number: Account number owning the instance.
        alias: Alias of the instance to stop.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def stop_instance(self) -> Any:
    """Stop this compute instance over Kafka, falling back to REST.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_action(self, id='', step_code='', action_type='', status='', sub_action='', status_description='', service='', job_params=None) -> Any:
    """Update an action record over Kafka, falling back to REST.

    Args:
        id: Action identifier.
        step_code: Code of the current step.
        action_type: Kind of action being updated.
        status: New status value.
        sub_action: Sub-action details.
        status_description: Human-readable status description.
        service: Name of the owning service.
        job_params: Optional dictionary of job parameters.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_action_container_id(self, action_record_id, container_id) -> Any:
    """Record a container ID against an action over Kafka, falling back to REST.

    Args:
        action_record_id: Identifier of the action record.
        container_id: Container ID to store.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_action_docker_logs(self, action_record_id, log_content) -> Any:
    """Push docker log content for an action over Kafka, falling back to REST.

    Args:
        action_record_id: Identifier of the action record.
        log_content: Log text to store.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_action_status(self, service_provider='', action_record_id='', isRunning=True, status='', docker_start_time=None, action_duration=0, cpuUtilisation=0.0, gpuUtilisation=0.0, memoryUtilisation=0.0, gpuMemoryUsed=0, createdAt=None, updatedAt=None) -> Any:
    """Report an action's runtime status over Kafka, falling back to REST.

    Args:
        service_provider: Provider of the service.
        action_record_id: Identifier of the action record.
        isRunning: Whether the action is currently running.
        status: Current status value.
        docker_start_time: When the docker container started.
        action_duration: How long the action has run.
        cpuUtilisation: CPU utilization percentage.
        gpuUtilisation: GPU utilization percentage.
        memoryUtilisation: Memory utilization percentage.
        gpuMemoryUsed: GPU memory in use.
        createdAt: Creation timestamp.
        updatedAt: Last-update timestamp.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_available_resources(self, availableCPU=0, availableGPU=0, availableMemory=0, availableGPUMemory=0) -> Any:
    """Report the instance's free resources over Kafka, falling back to REST.

    Args:
        availableCPU: Free CPU capacity.
        availableGPU: Free GPU capacity.
        availableMemory: Free system memory.
        availableGPUMemory: Free GPU memory.

    Returns:
        ``(data, error, message)`` tuple from the API response.
    """
    ...
def update_jupyter_token(self, token='') -> Any:
    """Publish the Jupyter notebook token over Kafka, falling back to REST."""
    ...
def update_status(self, action_record_id, action_type, service_name, stepCode, status, status_description) -> None:
    """Publish an action status update over Kafka, falling back to REST.

    Args:
        action_record_id: Identifier of the action record.
        action_type: Kind of action being updated.
        service_name: Name of the owning service.
        stepCode: Code identifying the step in the process.
        status: New status value.
        status_description: Human-readable status description.
    """
    ...
# From shutdown_manager
class ShutdownManager:
    """Coordinates idle tracking and shutdown of a compute instance."""

    def __init__(self, scaling) -> None:
        """Create a manager bound to a scaling controller.

        Args:
            scaling (Scaling): Scaling instance used to drive the shutdown.
        """
        ...

    def do_cleanup_and_shutdown(self) -> bool:
        """Release resources and power the instance down.

        Tries progressively more forceful strategies:
        1. Notify the scaling service via API.
        2. Issue a graceful OS shutdown command.
        3. Escalate to aggressive shutdown methods.
        4. Force an emergency shutdown as the last resort.

        Returns:
            bool: ``True`` if a shutdown was successfully initiated.
        """
        ...

    def handle_shutdown(self, tasks_running: bool) -> None:
        """Evaluate idle time and shut down once the threshold is exceeded.

        Args:
            tasks_running: ``True`` if the instance still has active tasks.
        """
        ...
|
2054
|
+
from . import action_instance, actions_manager, actions_scaledown_manager, compute_operations_handler, instance_manager, instance_utils, k8s_scheduler, prechecks, resources_tracker, scaling, shutdown_manager, task_utils
|
|
2055
|
+
|
|
2056
|
+
def __getattr__(name: str) -> Any: ...
|