matrice-compute 0.1.43__py3-none-any.whl → 0.1.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2056 @@
1
+ """Auto-generated stubs for package: matrice_compute."""
2
+ from typing import Any, Dict, List, Optional, Set, Tuple
3
+
4
+ from cryptography.hazmat.backends import default_backend
5
+ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
6
+ from datetime import datetime
7
+ from datetime import datetime, timezone
8
+ from docker.client import DockerClient
9
+ from docker.models.containers import Container
10
+ from kafka import KafkaProducer
11
+ from kafka import KafkaProducer, KafkaConsumer
12
+ from kubernetes import client, config
13
+ from kubernetes.client.rest import ApiException
14
+ from matrice.docker_utils import check_docker
15
+ from matrice_common.session import Session
16
+ from matrice_common.stream.event_listener import EventListener
17
+ from matrice_common.utils import log_errors
18
+ from matrice_compute.action_instance import ActionInstance
19
+ from matrice_compute.actions_manager import ActionsManager
20
+ from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
21
+ from matrice_compute.compute_operations_handler import ComputeOperationsHandler
22
+ from matrice_compute.instance_utils import get_docker_disk_space_usage
23
+ from matrice_compute.instance_utils import get_gpu_with_sufficient_memory_for_action, get_decrypted_access_key_pair, get_max_file_system, get_best_service_ip_and_network
24
+ from matrice_compute.instance_utils import get_instance_info, cleanup_docker_storage, get_cpu_memory_usage, get_gpu_memory_usage, get_mem_usage, get_gpu_with_sufficient_memory_for_action, get_max_file_system, has_gpu
25
+ from matrice_compute.instance_utils import get_instance_info, get_decrypted_access_key_pair
26
+ from matrice_compute.instance_utils import has_gpu, get_gpu_info, calculate_time_difference
27
+ from matrice_compute.instance_utils import has_gpu, get_mem_usage, cleanup_docker_storage
28
+ from matrice_compute.resources_tracker import MachineResourcesTracker, ActionsResourcesTracker, KafkaResourceMonitor, ContainerResourceMonitor
29
+ from matrice_compute.resources_tracker import ResourcesTracker, MachineResourcesTracker, ActionsResourcesTracker
30
+ from matrice_compute.scaling import Scaling
31
+ from matrice_compute.shutdown_manager import ShutdownManager
32
+ from matrice_compute.task_utils import setup_workspace_and_run_task
33
+ import base64
34
+ import docker
35
+ import json
36
+ import logging
37
+ import os
38
+ import platform
39
+ import psutil
40
+ import re
41
+ import shlex
42
+ import shutil
43
+ import signal
44
+ import socket
45
+ import subprocess
46
+ import sys
47
+ import threading
48
+ import time
49
+ import time as time_module
50
+ import torch
51
+ import traceback
52
+ import urllib.parse
53
+ import urllib.request
54
+ import uuid
55
+ import zipfile
56
+
57
+ # Constants
58
+ logger: Any = ... # From compute_operations_handler
59
+ logger: Any = ... # From k8s_scheduler
60
+
61
+ # Functions
62
+ # From action_instance
63
+ def augmentation_server_creation_execute(self) -> Any:
64
+ """
65
+ Create Augmentation Server
66
+ """
67
+ ...
68
+
69
+ # From action_instance
70
+ def data_preparation_execute(self) -> Any:
71
+ """
72
+ Execute data preparation task.
73
+ """
74
+ ...
75
+
76
+ # From action_instance
77
+ def data_processing_execute(self) -> Any:
78
+ """
79
+ Execute data processing task.
80
+ """
81
+ ...
82
+
83
+ # From action_instance
84
+ def data_split_execute(self) -> Any:
85
+ """
86
+ Execute data split task.
87
+ """
88
+ ...
89
+
90
+ # From action_instance
91
+ def database_setup_execute(self) -> Any:
92
+ """
93
+ Creates and sets up the database for the facial recognition server.
94
+ MongoDB runs on port 27020:27017 (localhost only with --net=host).
95
+ Qdrant runs on port 6334 (localhost only with --net=host).
96
+ """
97
+ ...
98
+
99
+ # From action_instance
100
+ def dataset_annotation_execute(self) -> Any:
101
+ """
102
+ Execute dataset annotation task.
103
+ """
104
+ ...
105
+
106
+ # From action_instance
107
+ def dataset_augmentation_execute(self) -> Any:
108
+ """
109
+ Execute dataset augmentation task.
110
+ """
111
+ ...
112
+
113
+ # From action_instance
114
+ def deploy_aggregator_execute(self) -> Any:
115
+ """
116
+ Execute deploy aggregator task.
117
+ """
118
+ ...
119
+
120
+ # From action_instance
121
+ def facial_recognition_setup_execute(self) -> Any:
122
+ """
123
+ Creates and sets up the facial recognition worker server.
124
+ Facial recognition worker runs on port 8081 (localhost only with --net=host).
125
+ """
126
+ ...
127
+
128
+ # From action_instance
129
+ def fe_analytics_service_execute(self) -> Any:
130
+ """
131
+ Creates and sets up the frontend analytics service.
132
+ Frontend analytics service runs on port 3001 (localhost only with --net=host).
133
+ """
134
+ ...
135
+
136
+ # From action_instance
137
+ def fe_fs_streaming_execute(self) -> Any:
138
+ """
139
+ Creates and sets up the frontend for fs streaming.
140
+ Frontend streaming runs on port 3000 (localhost only with --net=host).
141
+ """
142
+ ...
143
+
144
+ # From action_instance
145
+ def image_build_execute(self) -> Any:
146
+ """
147
+ Execute image building task.
148
+ """
149
+ ...
150
+
151
+ # From action_instance
152
+ def inference_tracker_setup_execute(self) -> Any:
153
+ """
154
+ Creates and starts the inference tracker.
155
+ Inference tracker runs on port 8110 (localhost only with --net=host).
156
+ """
157
+ ...
158
+
159
+ # From action_instance
160
+ def inference_ws_server_execute(self) -> Any:
161
+ """
162
+ Creates and starts the inference pipeline.
163
+ Inference WebSocket server runs on port 8102 (localhost only with --net=host).
164
+ """
165
+ ...
166
+
167
+ # From action_instance
168
+ def kafka_setup_execute(self) -> Any:
169
+ """
170
+ Execute Kafka server setup task.
171
+ Kafka runs on port 9092 (SASL_PLAINTEXT) and 9093 (CONTROLLER) - localhost only with --net=host.
172
+ """
173
+ ...
174
+
175
+ # From action_instance
176
+ def lpr_setup_execute(self) -> Any:
177
+ """
178
+ Creates and sets up the license plate recognition server.
179
+ LPR worker runs on port 8082 (localhost only with --net=host).
180
+ """
181
+ ...
182
+
183
+ # From action_instance
184
+ def model_deploy_execute(self) -> Any:
185
+ """
186
+ Execute model deployment task.
187
+ """
188
+ ...
189
+
190
+ # From action_instance
191
+ def model_eval_execute(self) -> Any:
192
+ """
193
+ Execute model evaluation task.
194
+ """
195
+ ...
196
+
197
+ # From action_instance
198
+ def model_export_execute(self) -> Any:
199
+ """
200
+ Execute model export task.
201
+ """
202
+ ...
203
+
204
+ # From action_instance
205
+ def model_train_execute(self) -> Any:
206
+ """
207
+ Execute model training task.
208
+ """
209
+ ...
210
+
211
+ # From action_instance
212
+ def redis_setup_execute(self) -> Any:
213
+ """
214
+ Creates and starts a Redis container using Docker.
215
+ Redis runs on port 6379 (localhost only with --net=host).
216
+ """
217
+ ...
218
+
219
+ # From action_instance
220
+ def resource_clone_execute(self) -> Any:
221
+ """
222
+ Execute resource clone task.
223
+ """
224
+ ...
225
+
226
+ # From action_instance
227
+ def streaming_gateway_execute(self) -> Any:
228
+ """
229
+ Execute streaming gateway task.
230
+ """
231
+ ...
232
+
233
+ # From action_instance
234
+ def synthetic_data_setup_execute(self) -> Any:
235
+ """
236
+ Execute synthetic data setup task.
237
+ """
238
+ ...
239
+
240
+ # From action_instance
241
+ def synthetic_dataset_generation_execute(self) -> Any:
242
+ """
243
+ Execute synthetic dataset generation task.
244
+ """
245
+ ...
246
+
247
+ # From action_instance
248
+ def video_storage_setup_execute(self) -> Any:
249
+ """
250
+ Creates and starts the Video Storage service.
251
+ Video Storage runs on port 8106 (localhost only with --net=host).
252
+ """
253
+ ...
254
+
255
+ # From instance_utils
256
+ def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
257
+ """
258
+ Calculate time difference between start and finish times.
259
+
260
+ Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
261
+ and different precision levels (nanoseconds, microseconds, milliseconds).
262
+
263
+ Args:
264
+ start_time_str (str): Start time string in ISO format
265
+ finish_time_str (str): Finish time string in ISO format
266
+
267
+ Returns:
268
+ int: Time difference in seconds
269
+ """
270
+ ...
271
+
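For illustration, here is a minimal sketch (not the package's actual implementation) of the mixed-precision handling described above: it normalizes a trailing Z and truncates nanosecond fractions to microseconds before parsing.

```python
from datetime import datetime, timezone

def _parse_iso(ts: str) -> datetime:
    # Hypothetical helper: normalize 'Z' and drop sub-microsecond digits,
    # since some providers emit nanosecond precision that fromisoformat() rejects.
    ts = ts.strip().replace("Z", "+00:00")
    if "." in ts:
        head, frac = ts.split(".", 1)
        digits = ""
        i = 0
        while i < len(frac) and frac[i].isdigit():
            digits += frac[i]
            i += 1
        ts = head + "." + digits[:6] + frac[i:]
    dt = datetime.fromisoformat(ts)
    return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)

def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
    return int((_parse_iso(finish_time_str) - _parse_iso(start_time_str)).total_seconds())

# Nanosecond-precision (GCP-style) timestamp vs. a plain UTC timestamp
print(calculate_time_difference("2024-01-01T00:00:00.123456789Z",
                                "2024-01-01T00:05:30Z"))  # -> 329
```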
272
+ # From instance_utils
273
+ def cleanup_docker_storage() -> None:
274
+ """
275
+ Clean up Docker storage if space is low.
276
+ """
277
+ ...
278
+
279
+ # From instance_utils
280
+ def get_best_service_ip_and_network(port: int) -> tuple:
281
+ """
282
+ Determine the best IP address and network configuration for a service.
283
+
284
+ This function intelligently selects the best IP to bind a service to:
285
+
286
+ Priority:
287
+ 1. Public IP if it's actually on a local interface (cloud servers)
288
+ 2. Private/LAN IP (NAT, local network, Docker)
289
+ 3. localhost with --net=host (fallback)
290
+
291
+ Args:
292
+ port (int): Port number for the service
293
+
294
+ Returns:
295
+ tuple: (ip_address, use_host_network) where:
296
+ - ip_address: The IP address to use (public, private, or localhost)
297
+ - use_host_network: True if should use --net=host, False if should use port mapping
298
+ """
299
+ ...
300
+
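A hedged sketch of the selection priority documented above, assuming psutil interface enumeration and a simplified private-range check; whether a private IP implies port mapping rather than host networking is an assumption made here, not something the stubs state.

```python
import socket
import psutil

def pick_service_ip(port: int) -> tuple:
    # Illustrative only: collect IPv4 addresses bound to local interfaces.
    local_ips = {
        addr.address
        for _, addrs in psutil.net_if_addrs().items()
        for addr in addrs
        if addr.family == socket.AF_INET
    }
    # 1. Public IP that is actually on a local interface (typical cloud VM)
    #    (simplified private-range check for the sketch)
    public = [ip for ip in local_ips
              if not ip.startswith(("10.", "172.", "192.168.", "127."))]
    if public:
        return public[0], True          # bind directly, --net=host
    # 2. Private/LAN IP (NAT, local network, Docker)
    private = [ip for ip in local_ips if not ip.startswith("127.")]
    if private:
        return private[0], False        # assume port mapping, e.g. {port: port}
    # 3. Fallback: localhost with --net=host
    return "127.0.0.1", True

ip, use_host_network = pick_service_ip(8102)
```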
301
+ # From instance_utils
302
+ def get_cpu_memory_usage() -> float:
303
+ """
304
+ Get CPU memory usage.
305
+
306
+ Returns:
307
+ float: Memory usage between 0 and 1
308
+ """
309
+ ...
310
+
311
+ # From instance_utils
312
+ def get_decrypted_access_key_pair(enc_access_key: str, enc_secret_key: str, encryption_key: str = '') -> Tuple[Optional[str], Optional[str]]:
313
+ """
314
+ Get decrypted access key pair.
315
+
316
+ Args:
317
+ enc_access_key (str): Encrypted access key
318
+ enc_secret_key (str): Encrypted secret key
319
+ encryption_key (str): Encryption key
320
+
321
+ Returns:
322
+ tuple: (access_key, secret_key) strings
323
+ """
324
+ ...
325
+
326
+ # From instance_utils
327
+ def get_disk_space_usage() -> list:
328
+ """
329
+ Get disk space usage for all filesystems.
330
+
331
+ Returns:
332
+ list: List of disk usage information dictionaries
333
+ """
334
+ ...
335
+
336
+ # From instance_utils
337
+ def get_docker_disk_space_usage() -> dict:
338
+ """
339
+ Get disk space usage for Docker storage.
340
+
341
+ Returns:
342
+ dict: Docker disk usage information
343
+ """
344
+ ...
345
+
346
+ # From instance_utils
347
+ def get_encrypted_access_key_pair(access_key: str, secret_key: str, encryption_key: str = '') -> Tuple[Optional[str], Optional[str]]:
348
+ """
349
+ Get encrypted access key pair.
350
+
351
+ Args:
352
+ access_key (str): access key
353
+ secret_key (str): secret key
354
+ encryption_key (str): Encryption key
355
+
356
+ Returns:
357
+ tuple: (encrypted_access_key, encrypted_secret_key) strings
358
+ """
359
+ ...
360
+
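The actual key derivation and cipher mode are not visible in the stubs; the following is a hypothetical round-trip sketch built only from the primitives this module imports (AES via cryptography, base64), with a naive key-padding scheme that is purely illustrative.

```python
import base64
import os
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

def _encrypt(value: str, encryption_key: str) -> str:
    key = encryption_key.encode().ljust(32, b"0")[:32]   # hypothetical key padding
    iv = os.urandom(16)
    enc = Cipher(algorithms.AES(key), modes.CFB(iv), backend=default_backend()).encryptor()
    return base64.b64encode(iv + enc.update(value.encode()) + enc.finalize()).decode()

def _decrypt(token: str, encryption_key: str) -> str:
    key = encryption_key.encode().ljust(32, b"0")[:32]
    raw = base64.b64decode(token)
    dec = Cipher(algorithms.AES(key), modes.CFB(raw[:16]), backend=default_backend()).decryptor()
    return (dec.update(raw[16:]) + dec.finalize()).decode()

access_key, secret_key = "<access-key>", "<secret-key>"      # placeholders
enc_a, enc_s = _encrypt(access_key, "my-key"), _encrypt(secret_key, "my-key")
assert (_decrypt(enc_a, "my-key"), _decrypt(enc_s, "my-key")) == (access_key, secret_key)
```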
361
+ # From instance_utils
362
+ def get_gpu_config_for_deployment(action_details, is_first_deployment = False) -> Any:
363
+ """
364
+ Get GPU configuration for deployment actions.
365
+
366
+ For first deployment of a service, attempts to use all GPUs.
367
+ For subsequent deployments, uses standard GPU selection (most free memory).
368
+ Falls back gracefully to standard GPU selection if '--gpus all' is not available.
369
+
370
+ Args:
371
+ action_details (dict): Action details containing GPU requirements
372
+ is_first_deployment (bool): Whether this is the first deployment for this service
373
+
374
+ Returns:
375
+ str: GPU configuration string ('--gpus all' or '--gpus "device=X"' or '')
376
+ """
377
+ ...
378
+
379
+ # From instance_utils
380
+ def get_gpu_info() -> list:
381
+ """
382
+ Get GPU information.
383
+
384
+ Returns:
385
+ list: GPU information strings
386
+ """
387
+ ...
388
+
389
+ # From instance_utils
390
+ def get_gpu_memory_usage() -> float:
391
+ """
392
+ Get GPU memory usage percentage.
393
+
394
+ Returns:
395
+ float: Memory usage between 0 and 1
396
+ """
397
+ ...
398
+
399
+ # From instance_utils
400
+ def get_gpu_with_sufficient_memory_for_action(action_details: dict) -> list:
401
+ """
402
+ Get GPUs with sufficient memory for action.
403
+
404
+ Args:
405
+ action_details (dict): Action details
406
+
407
+ Returns:
408
+ list: List of GPU indices
409
+
410
+ Raises:
411
+ ValueError: If insufficient GPU memory
412
+ """
413
+ ...
414
+
415
+ # From instance_utils
416
+ def get_instance_id() -> str:
417
+ """
418
+ Get instance ID.
419
+
420
+ Returns:
421
+ str: Instance ID or empty string
422
+ """
423
+ ...
424
+
425
+ # From instance_utils
426
+ def get_instance_info(service_provider: Optional[str] = None, instance_id: Optional[str] = None) -> tuple:
427
+ """
428
+ Get instance provider and ID information.
429
+
430
+ Returns:
431
+ tuple: (service_provider, instance_id) strings
432
+ """
433
+ ...
434
+
435
+ # From instance_utils
436
+ def get_max_file_system() -> Optional[str]:
437
+ """
438
+ Get filesystem with maximum available space.
439
+
440
+ Returns:
441
+ str: Path to filesystem with most space or None
442
+ """
443
+ ...
444
+
445
+ # From instance_utils
446
+ def get_mem_usage() -> float:
447
+ """
448
+ Get memory usage for either GPU or CPU.
449
+
450
+ Returns:
451
+ float: Memory usage between 0 and 1
452
+ """
453
+ ...
454
+
455
+ # From instance_utils
456
+ def get_required_gpu_memory(action_details: dict) -> int:
457
+ """
458
+ Get required GPU memory from action details.
459
+
460
+ Args:
461
+ action_details (dict): Action details
462
+
463
+ Returns:
464
+ int: Required GPU memory
465
+ """
466
+ ...
467
+
468
+ # From instance_utils
469
+ def get_single_gpu_with_sufficient_memory_for_action(action_details: dict) -> list:
470
+ """
471
+ Get single GPU with sufficient memory using most-free algorithm.
472
+
473
+ Selects the GPU with the MOST free memory that meets the requirements,
474
+ to balance load across GPUs and prevent any single GPU from being overused.
475
+
476
+ Args:
477
+ action_details (dict): Action details
478
+
479
+ Returns:
480
+ list: List with single GPU index
481
+
482
+ Raises:
483
+ ValueError: If no GPU has sufficient memory
484
+ """
485
+ ...
486
+
487
+ # From instance_utils
488
+ def has_gpu() -> bool:
489
+ """
490
+ Check if the system has a GPU.
491
+
492
+ Returns:
493
+ bool: True if GPU is present, False otherwise
494
+ """
495
+ ...
496
+
497
+ # From instance_utils
498
+ def is_allowed_gpu_device(gpu_index: int) -> bool:
499
+ """
500
+ Check if GPU device is allowed based on GPUS environment variable.
501
+
502
+ The GPUS environment variable can be used to restrict which GPU devices
503
+ are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).
504
+
505
+ Args:
506
+ gpu_index (int): GPU device index
507
+
508
+ Returns:
509
+ bool: True if GPU is allowed (or no filter is set), False otherwise
510
+ """
511
+ ...
512
+
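A minimal sketch of the GPUS-based filtering described above; the exact parsing in the package may differ.

```python
import os

def is_allowed_gpu_device(gpu_index: int) -> bool:
    # GPUS="0,2" restricts allocation to GPUs 0 and 2; unset/empty means no restriction.
    gpus = os.environ.get("GPUS", "").strip()
    if not gpus:
        return True
    allowed = {int(x) for x in gpus.split(",") if x.strip().isdigit()}
    return gpu_index in allowed

os.environ["GPUS"] = "0,2"
print([i for i in range(4) if is_allowed_gpu_device(i)])  # [0, 2]
```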
513
+ # From instance_utils
514
+ def is_docker_running() -> bool:
515
+ """
516
+ Check if Docker is running.
517
+
518
+ Returns:
519
+ bool: True if Docker containers are running
520
+ """
521
+ ...
522
+
523
+ # From instance_utils
524
+ def prune_docker_images() -> None:
525
+ """
526
+ Prune Docker images.
527
+ """
528
+ ...
529
+
530
+ # From task_utils
531
+ def refresh_url_if_needed(url: Optional[str], scaling: Optional[Scaling] = None) -> Optional[str]:
532
+ """
533
+ Refresh a presigned URL if it appears to be expired or about to expire.
534
+
535
+ This function attempts to refresh presigned URLs for model codebase and requirements
536
+ to ensure they are valid before downloading.
537
+
538
+ Args:
539
+ url: The URL to potentially refresh. If None or empty, returns None.
540
+ scaling: The Scaling instance to use for API calls. If None, returns original URL.
541
+
542
+ Returns:
543
+ The refreshed URL if successful, or the original URL if refresh fails or is not needed.
544
+ """
545
+ ...
546
+
547
+ # From task_utils
548
+ def setup_workspace_and_run_task(work_fs: str, action_id: str, model_codebase_url: str, model_codebase_requirements_url: Optional[str] = None, scaling: Optional[Scaling] = None) -> None:
549
+ """
550
+ Set up workspace and run task with provided parameters.
551
+
552
+ Args:
553
+ work_fs (str): Working filesystem path.
554
+ action_id (str): Unique identifier for the action.
555
+ model_codebase_url (str): URL to download model codebase from.
556
+ model_codebase_requirements_url (Optional[str]): URL to download requirements from. Defaults to None.
557
+ scaling (Optional[Scaling]): Scaling instance for refreshing presigned URLs. Defaults to None.
558
+
559
+ Returns:
560
+ None
561
+ """
562
+ ...
563
+
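A hedged usage sketch of the task_utils entry points, assuming a default-constructible Session and using placeholder IDs and URLs.

```python
from matrice_common.session import Session
from matrice_compute.scaling import Scaling
from matrice_compute.task_utils import refresh_url_if_needed, setup_workspace_and_run_task

session = Session()                                  # assumption: default-constructible
scaling = Scaling(session, instance_id="inst-123")   # placeholder instance ID

codebase_url = refresh_url_if_needed(
    "https://example.com/codebase.zip?X-Amz-Expires=60", scaling)

setup_workspace_and_run_task(
    work_fs="/data",
    action_id="act-456",
    model_codebase_url=codebase_url,
    model_codebase_requirements_url=None,            # optional per the signature above
    scaling=scaling,
)
```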
564
+ # Classes
565
+ # From action_instance
566
+ class ActionInstance:
567
+ """
568
+ Base class for tasks that run in Action containers.
569
+ """
570
+
571
+ def __init__(self, scaling, action_info: dict) -> None:
572
+ """
573
+ Initialize an action instance.
574
+
575
+ Args:
576
+ scaling (Scaling): Scaling service instance
577
+ action_info (dict): Action information dictionary
578
+ """
579
+ ...
580
+
581
+ def execute(self) -> Any:
582
+ """
583
+ Execute the task.
584
+ """
585
+ ...
586
+
587
+ def get_action_details(self) -> Any:
588
+ """
589
+ Get action details from scaling service.
590
+
591
+ Returns:
592
+ dict: Action details if successful, None otherwise
593
+ """
594
+ ...
595
+
596
+ def get_base_docker_cmd(self, work_fs: str = '', use_gpu: str = '', mount_docker_sock: bool = False, action_id: str = '', model_key: str = '', extra_env_vars: dict = {}, port_mapping: dict = {}, network_config: str = '', destination_workspace_path: str = '/usr/src/workspace', docker_workdir: str = '', extra_pkgs: list = []) -> Any:
597
+ """
598
+ Build base Docker command with common options.
599
+
600
+ Args:
601
+ work_fs (str): Work filesystem path
602
+ use_gpu (str): GPU configuration string
603
+ mount_docker_sock (bool): Whether to mount Docker socket
604
+ action_id (str): Action ID
605
+ model_key (str): Model key
606
+ extra_env_vars (dict): Additional environment variables
607
+ port_mapping (dict): Port mappings {host_port: container_port}
608
+ destination_workspace_path (str): Container workspace path
609
+ docker_workdir (str): Docker working directory
610
+ extra_pkgs (list): List of extra packages to install
611
+ Returns:
612
+ str: Base Docker command
613
+ """
614
+ ...
615
+
616
+ def get_gpu_config(self, action_details) -> Any:
617
+ """
618
+ Get GPU configuration string based on available GPUs.
619
+
620
+ Args:
621
+ action_details (dict): Action details containing GPU requirements
622
+
623
+ Returns:
624
+ str: GPU configuration string
625
+ """
626
+ ...
627
+
628
+ def get_hugging_face_token(self, model_key) -> Any:
629
+ """
630
+ Get Hugging Face token for specific model keys.
631
+
632
+ Args:
633
+ model_key (str): Model key to check
634
+
635
+ Returns:
636
+ str: Hugging Face token if available, empty string otherwise
637
+ """
638
+ ...
639
+
640
+ def get_hugging_face_token_for_data_generation(self) -> Any: ...
641
+
642
+ def get_internal_api_key(self, action_id) -> Any:
643
+ """
644
+ Get internal API key for action.
645
+
646
+ Args:
647
+ action_id (str): Action ID
648
+
649
+ Returns:
650
+ str: Internal API key if available, empty string otherwise
651
+ """
652
+ ...
653
+
654
+ def get_log_path(self) -> Any:
655
+ """
656
+ Get log directory path, creating if needed.
657
+
658
+ Returns:
659
+ str: Path to log directory
660
+ """
661
+ ...
662
+
663
+ def is_running(self) -> bool:
664
+ """
665
+ Check if task process is running.
666
+
667
+ This method performs a thorough check to determine if the process is still running:
668
+ 1. Verifies that the process attribute exists and is not None
669
+ 2. Checks if the process has terminated using poll() method
670
+ 3. Additional safeguards against zombie processes
671
+ 4. Coordinates with log monitoring to ensure all logs are sent before cleanup
672
+
673
+ Returns:
674
+ bool: True if process exists and is still running, False if process
675
+ does not exist or has terminated
676
+ """
677
+ ...
678
+
679
+ def send_logs_continuously(self) -> Any:
680
+ """
681
+ Continuously read and send logs from the log file to the scaling service.
682
+
683
+ Enhanced version that tracks log position and handles graceful shutdown.
684
+ """
685
+ ...
686
+
687
+ def setup_action_requirements(self, action_details, work_fs = '', model_family = '', action_id = '') -> Any:
688
+ """
689
+ Setup action requirements.
690
+
691
+ Args:
692
+ action_details (dict): Action details
693
+ work_fs (str): Work filesystem path
694
+ model_family (str): Model family name
695
+ action_id (str): Action ID
696
+
697
+ Raises:
698
+ Exception: If setup fails
699
+ """
700
+ ...
701
+
702
+ def start(self, cmd: str = '', log_name: str = '') -> Any:
703
+ """
704
+ Start the process and log monitoring thread.
705
+
706
+ Args:
707
+ cmd (str): Command to execute
708
+ log_name (str): Name for log file
709
+ """
710
+ ...
711
+
712
+ def start_logger(self) -> Any:
713
+ """
714
+ Start the log monitoring thread.
715
+ """
716
+ ...
717
+
718
+ def start_process(self, cmd, log_name) -> Any:
719
+ """
720
+ Start the process and initialize logging.
721
+
722
+ Args:
723
+ cmd (str): Command to execute
724
+ log_name (str): Name for log file
725
+
726
+ Raises:
727
+ Exception: If process fails to start
728
+ """
729
+ ...
730
+
731
+ def stop(self) -> Any:
732
+ """
733
+ Stop the process and log monitoring thread.
734
+
735
+ Enhanced version that ensures proper cleanup sequencing and log completion.
736
+ """
737
+ ...
738
+
739
+
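A hedged lifecycle sketch for ActionInstance, reusing the scaling object from the previous example; the action_info keys shown are illustrative placeholders, not a documented schema.

```python
import time
from matrice_compute.action_instance import ActionInstance

instance = ActionInstance(scaling, action_info={"_id": "act-456", "action": "model_train"})
instance.start(cmd="echo training", log_name="model_train")   # spawns process + log thread

while instance.is_running():
    time.sleep(5)                                              # poll until the process exits

instance.stop()                                                # flush logs and clean up
```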
740
+ # From actions_manager
741
+ class ActionsManager:
742
+ """
743
+ Class for managing actions.
744
+ """
745
+
746
+ def __init__(self, scaling) -> None:
747
+ """
748
+ Initialize an action manager.
749
+
750
+ Args:
751
+ scaling (Scaling): Scaling service instance
752
+ """
753
+ ...
754
+
755
+ def fetch_actions(self) -> list:
756
+ """
757
+ Poll for actions and process them if memory threshold is not exceeded.
758
+
759
+ Returns:
760
+ list: List of fetched actions
761
+ """
762
+ ...
763
+
764
+ def get_all_actions(self) -> dict:
765
+ """
766
+ Get all tracked actions (both running and stopped).
767
+
768
+ Returns:
769
+ dict: All tracked actions with their status
770
+ """
771
+ ...
772
+
773
+ def get_current_actions(self) -> dict:
774
+ """
775
+ Get the current running actions.
776
+
777
+ This method:
778
+ 1. Updates action status tracking via update_actions_status()
779
+ 2. Returns only the running actions (current_actions dict)
780
+ 3. Provides detailed logging about current actions state
781
+
782
+ Returns:
783
+ dict: Current running actions only
784
+ """
785
+ ...
786
+
787
+ def get_stopped_actions(self) -> dict:
788
+ """
789
+ Get stopped actions.
790
+
791
+ Returns:
792
+ dict: Stopped actions
793
+ """
794
+ ...
795
+
796
+ def process_action(self, action: dict) -> Any:
797
+ """
798
+ Process the given action.
799
+
800
+ Args:
801
+ action (dict): Action details to process
802
+
803
+ Returns:
804
+ ActionInstance: Processed action instance or None if failed
805
+ """
806
+ ...
807
+
808
+ def process_actions(self) -> None:
809
+ """
810
+ Process fetched actions.
811
+ """
812
+ ...
813
+
814
+ def purge_unwanted(self) -> None:
815
+ """
816
+ Purge completed or failed actions.
817
+
818
+ NOTE: This now calls update_actions_status() which moves stopped actions
819
+ to a separate dict instead of deleting them. This prevents interference
820
+ with compute operations handler while maintaining accurate status.
821
+ """
822
+ ...
823
+
824
+ def restart_action(self, action_record_id: str) -> dict:
825
+ """
826
+ Restart a specific action by its record ID.
827
+
828
+ This method stops the action if it's running, then fetches fresh action
829
+ details from the backend and starts it again.
830
+
831
+ Args:
832
+ action_record_id (str): The action record ID to restart
833
+
834
+ Returns:
835
+ dict: Result dictionary with status information
836
+ """
837
+ ...
838
+
839
+ def start_actions_manager(self) -> None:
840
+ """
841
+ Start the actions manager main loop.
842
+ """
843
+ ...
844
+
845
+ def stop_action(self, action_record_id: str) -> dict:
846
+ """
847
+ Stop a specific action by its record ID.
848
+
849
+ Args:
850
+ action_record_id (str): The action record ID to stop
851
+
852
+ Returns:
853
+ dict: Result dictionary with status information
854
+ """
855
+ ...
856
+
857
+ def update_actions_status(self) -> None:
858
+ """
859
+ Update tracking of running vs stopped actions.
860
+
861
+ This method checks all actions and moves stopped ones to stopped_actions dict
862
+ without deleting them. This prevents interference with compute operations
863
+ handler while maintaining accurate status reporting.
864
+ """
865
+ ...
866
+
867
+
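A simplified view of the polling cycle that start_actions_manager() presumably runs, shown here as a bounded loop for illustration.

```python
import time
from matrice_compute.actions_manager import ActionsManager

actions_manager = ActionsManager(scaling)   # reuses the Scaling instance from the earlier sketch

for _ in range(3):                          # bounded here; the real loop runs until shutdown
    actions_manager.fetch_actions()         # poll the backend if memory headroom allows
    actions_manager.process_actions()       # launch ActionInstance objects for new actions
    actions_manager.purge_unwanted()        # move finished actions into stopped_actions
    print("running:", list(actions_manager.get_current_actions()))
    time.sleep(30)
```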
868
+ # From actions_scaledown_manager
869
+ class ActionsScaleDownManager:
870
+ """
871
+ Class for managing container scale down operations.
872
+ """
873
+
874
+ def __init__(self, scaling) -> None:
875
+ """
876
+ Initialize the scale down manager.
877
+
878
+ Args:
879
+ scaling (Scaling): Scaling service instance
880
+ """
881
+ ...
882
+
883
+ def auto_scaledown_actions(self) -> None:
884
+ """
885
+ Start polling for containers that need to be scaled down and stop them.
886
+ """
887
+ ...
888
+
889
+
890
+ # From compute_operations_handler
891
+ class ComputeOperationsHandler:
892
+ """
893
+ Handles Kafka-based compute operations for instance and action management.
894
+
895
+ This class uses EventListener from matrice_common to listen for operation
896
+ events from the 'compute_operations' Kafka topic. It delegates operations
897
+ to the ActionsManager for execution and updates status via API calls.
898
+ """
899
+
900
+ def __init__(self, actions_manager, session, scaling, instance_id: str) -> None:
901
+ """
902
+ Initialize the Compute Operations Handler.
903
+
904
+ Args:
905
+ actions_manager: Reference to the ActionsManager instance
906
+ session: Session object for authentication and Kafka configuration
907
+ scaling: Scaling service instance for API status updates
908
+ instance_id: This compute instance's ID for filtering events
909
+ """
910
+ ...
911
+
912
+ KAFKA_TOPIC: Any
913
+
914
+ def start(self) -> bool:
915
+ """
916
+ Start the operations handler using EventListener.
917
+
918
+ Returns:
919
+ bool: True if started successfully, False otherwise
920
+ """
921
+ ...
922
+
923
+ def stop(self) -> Any:
924
+ """
925
+ Stop the operations handler gracefully.
926
+ """
927
+ ...
928
+
929
+
930
+ # From instance_manager
931
+ class InstanceManager:
932
+ """
933
+ Class for managing compute instances and their associated actions.
934
+
935
+ Now includes auto streaming capabilities for specified deployment IDs.
936
+ """
937
+
938
+ def __init__(self, matrice_access_key_id: str = '', matrice_secret_access_key: str = '', encryption_key: str = '', instance_id: str = '', service_provider: str = '', env: str = '', gpus: str = '', workspace_dir: str = 'matrice_workspace', enable_kafka: bool = False) -> None:
939
+ """
940
+ Initialize an instance manager.
941
+
942
+ Args:
943
+ matrice_access_key_id (str): Access key ID for Matrice authentication.
944
+ Defaults to empty string.
945
+ matrice_secret_access_key (str): Secret access key for Matrice
946
+ authentication. Defaults to empty string.
947
+ encryption_key (str): Key used for encrypting sensitive data.
948
+ Defaults to empty string.
949
+ instance_id (str): Unique identifier for this compute instance.
950
+ Defaults to empty string.
951
+ service_provider (str): Cloud service provider being used.
952
+ Defaults to empty string.
953
+ env (str): Environment name (e.g. dev, prod).
954
+ Defaults to empty string.
955
+ gpus (str): GPU configuration string (e.g. "0,1").
956
+ Defaults to empty string.
957
+ workspace_dir (str): Directory for workspace files.
958
+ Defaults to "matrice_workspace".
959
+ enable_kafka (bool): Enable Kafka communication (default False).
960
+ """
961
+ ...
962
+
963
+ def start(self) -> tuple:
964
+ """
965
+ Start the instance manager threads.
966
+
967
+ Returns:
968
+ tuple: (instance_manager_thread, actions_manager_thread)
969
+ """
970
+ ...
971
+
972
+ def start_container_status_monitor(self) -> Any:
973
+ """
974
+ Start the background container status monitoring.
975
+ """
976
+ ...
977
+
978
+ def start_instance_manager(self) -> None:
979
+ """
980
+ Run the instance manager loop.
981
+ """
982
+ ...
983
+
984
+ def stop(self) -> Any:
985
+ """
986
+ Stop all background threads and cleanup resources.
987
+ """
988
+ ...
989
+
990
+ def stop_container_status_monitor(self) -> Any:
991
+ """
992
+ Stop the background container status monitoring.
993
+ """
994
+ ...
995
+
996
+
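A hedged usage sketch of InstanceManager with placeholder credentials.

```python
from matrice_compute.instance_manager import InstanceManager

instance_manager = InstanceManager(
    matrice_access_key_id="<access-key-id>",          # placeholders, not real credentials
    matrice_secret_access_key="<secret-access-key>",
    instance_id="inst-123",
    env="dev",
    gpus="0,1",
    workspace_dir="matrice_workspace",
    enable_kafka=False,
)
instance_thread, actions_thread = instance_manager.start()   # the two worker threads
try:
    instance_thread.join()
    actions_thread.join()
finally:
    instance_manager.stop()                                   # stop monitors and clean up
```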
997
+ # From k8s_scheduler
998
+ class K8sScheduler:
999
+ """
1000
+ Kubernetes Scheduler that polls for actions and creates K8s Jobs.
1001
+ Runs inside the cluster using in-cluster authentication.
1002
+ """
1003
+
1004
+ def __init__(self) -> None: ...
1005
+
1006
+ def check_job_status(self, action_id: str, job_name: str, namespace: str) -> Optional[str]:
1007
+ """
1008
+ Check the status of a K8s job and return status if completed. Also monitors resource usage.
1009
+ """
1010
+ ...
1011
+
1012
+ def create_k8s_job(self, action: Dict[str, Any]) -> Optional[str]:
1013
+ """
1014
+ Create a Kubernetes Job for the given action
1015
+ """
1016
+ ...
1017
+
1018
+ def monitor_running_jobs(self) -> Any:
1019
+ """
1020
+ Monitor running jobs and update action statuses
1021
+ """
1022
+ ...
1023
+
1024
+ def poll_pending_actions(self) -> List[Dict[str, Any]]:
1025
+ """
1026
+ Poll for actions assigned to this Kubernetes cluster.
1027
+
1028
+ Uses the new K8s-specific endpoint:
1029
+ - processClusterName in be-action detects K8s clusters and sets kubernetesClusterId
1030
+ - Scheduler calls /v1/actions/assign_jobs_kubernetes/{cluster_id} to fetch assigned actions
1031
+ """
1032
+ ...
1033
+
1034
+ def send_heartbeat(self) -> Any:
1035
+ """
1036
+ Send heartbeat to Matrice API with cluster health info
1037
+ """
1038
+ ...
1039
+
1040
+ def start(self) -> Any:
1041
+ """
1042
+ Main scheduler loop - matches InstanceManager.start() pattern
1043
+ """
1044
+ ...
1045
+
1046
+ def update_action_status(self, action_id: str, step_code: str, status: str, description: str, extra_details: Optional[Dict] = None) -> Any:
1047
+ """
1048
+ Update action status using the existing action update endpoint.
1049
+
1050
+ Uses the standard action record update API that accepts:
1051
+ - stepCode: The step code for the action
1052
+ - status: Status (OK, ERROR, etc.)
1053
+ - statusDescription: Human-readable description
1054
+
1055
+ Extra details are merged into the action record's actionDetails.
1056
+ """
1057
+ ...
1058
+
1059
+
1060
+ # From prechecks
1061
+ class Prechecks:
1062
+ """
1063
+ Class for running pre-checks before compute operations.
1064
+ """
1065
+
1066
+ def __init__(self, session, instance_id: Optional[str] = None) -> None:
1067
+ """
1068
+ Initialize Prechecks.
1069
+
1070
+ Args:
1071
+ session: Session object for RPC calls
1072
+ instance_id: Optional instance ID
1073
+ """
1074
+ ...
1075
+
1076
+ def check_credentials(self, access_key: Optional[str] = None, secret_key: Optional[str] = None) -> bool:
1077
+ """
1078
+ Check if access key and secret key are valid.
1079
+
1080
+ Args:
1081
+ access_key: Optional access key to validate
1082
+ secret_key: Optional secret key to validate
1083
+
1084
+ Returns:
1085
+ bool: True if credentials are valid
1086
+ """
1087
+ ...
1088
+
1089
+ def check_docker(self) -> bool:
1090
+ """
1091
+ Check if docker is installed and working.
1092
+
1093
+ Returns:
1094
+ bool: True if docker is working
1095
+ """
1096
+ ...
1097
+
1098
+ def check_fetch_actions(self) -> bool:
1099
+ """
1100
+ Test action fetching and validation.
1101
+
1102
+ Returns:
1103
+ bool: True if action fetching works
1104
+ """
1105
+ ...
1106
+
1107
+ def check_filesystem_space(self) -> bool:
1108
+ """
1109
+ Check available filesystem space and usage.
1110
+
1111
+ Returns:
1112
+ bool: True if filesystem space is sufficient
1113
+ """
1114
+ ...
1115
+
1116
+ def check_get_gpu_indices(self) -> bool:
1117
+ """
1118
+ Check if get_gpu_indices returns valid indices.
1119
+
1120
+ Returns:
1121
+ bool: True if GPU indices are valid
1122
+ """
1123
+ ...
1124
+
1125
+ def check_gpu(self) -> bool:
1126
+ """
1127
+ Check if machine has GPU and it's functioning.
1128
+
1129
+ Returns:
1130
+ bool: True if GPU check passes
1131
+ """
1132
+ ...
1133
+
1134
+ def check_instance_id(self, instance_id: Optional[str] = None) -> bool:
1135
+ """
1136
+ Validate instance ID from args or env.
1137
+
1138
+ Args:
1139
+ instance_id: Optional instance ID to validate
1140
+
1141
+ Returns:
1142
+ bool: True if instance ID is valid
1143
+ """
1144
+ ...
1145
+
1146
+ def check_resources(self) -> bool:
1147
+ """
1148
+ Validate system resource limits and availability.
1149
+
1150
+ Returns:
1151
+ bool: True if resource checks pass
1152
+ """
1153
+ ...
1154
+
1155
+ def check_resources_tracking(self) -> bool:
1156
+ """
1157
+ Test resource tracking updates and monitoring.
1158
+
1159
+ Returns:
1160
+ bool: True if resource tracking is working
1161
+ """
1162
+ ...
1163
+
1164
+ def check_scaling_status(self) -> bool:
1165
+ """
1166
+ Test scaling service status.
1167
+
1168
+ Returns:
1169
+ bool: True if scaling status is ok
1170
+ """
1171
+ ...
1172
+
1173
+ def cleanup_docker_storage(self) -> bool:
1174
+ """
1175
+ Clean up docker storage and verify space freed.
1176
+
1177
+ Returns:
1178
+ bool: True if cleanup successful
1179
+ """
1180
+ ...
1181
+
1182
+ def create_docker_volume(self) -> bool:
1183
+ """
1184
+ Create docker volume.
1185
+
1186
+ Returns:
1187
+ bool: True if volume created successfully
1188
+ """
1189
+ ...
1190
+
1191
+ def get_available_resources(self) -> bool:
1192
+ """
1193
+ Check available system resources are within valid ranges.
1194
+
1195
+ Returns:
1196
+ bool: True if resources are within valid ranges
1197
+ """
1198
+ ...
1199
+
1200
+ def get_shutdown_details(self) -> bool:
1201
+ """
1202
+ Get and validate shutdown details from response.
1203
+
1204
+ Returns:
1205
+ bool: True if shutdown details are valid
1206
+ """
1207
+ ...
1208
+
1209
+ def run_all_checks(self, instance_id: Optional[str] = None, access_key: Optional[str] = None, secret_key: Optional[str] = None) -> bool:
1210
+ """
1211
+ Run all prechecks in sequence.
1212
+
1213
+ Args:
1214
+ instance_id: Optional instance ID to validate
1215
+ access_key: Optional access key to validate
1216
+ secret_key: Optional secret key to validate
1217
+
1218
+ Returns:
1219
+ bool: True if all checks pass
1220
+ """
1221
+ ...
1222
+
1223
+ def setup_docker(self) -> bool:
1224
+ """
1225
+ Set up Docker.
1226
+
1227
+ Returns:
1228
+ bool: True if setup successful
1229
+ """
1230
+ ...
1231
+
1232
+ def test_actions_scale_down(self) -> bool:
1233
+ """
1234
+ Test actions scale down.
1235
+
1236
+ Returns:
1237
+ bool: True if scale down test passes
1238
+ """
1239
+ ...
1240
+
1241
+ def test_gpu(self) -> bool:
1242
+ """
1243
+ Test if GPU is working and has sufficient memory.
1244
+
1245
+ Returns:
1246
+ bool: True if GPU test passes
1247
+ """
1248
+ ...
1249
+
1250
+
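A hedged sketch of gating agent startup on the prechecks, again with placeholder credentials.

```python
from matrice_compute.prechecks import Prechecks

checks = Prechecks(session, instance_id="inst-123")
ok = checks.run_all_checks(
    instance_id="inst-123",
    access_key="<access-key>",      # placeholders; real keys come from the environment
    secret_key="<secret-key>",
)
if not ok:
    raise SystemExit("Prechecks failed; refusing to start the compute agent")
```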
1251
+ # From resources_tracker
1252
+ class ActionsResourcesTracker:
1253
+ """
1254
+ Tracks Docker container action resources
1255
+ """
1256
+
1257
+ def __init__(self, scaling) -> None:
1258
+ """
1259
+ Initialize ActionsResourcesTracker
1260
+ """
1261
+ ...
1262
+
1263
+ def get_current_action_usage(self, container, status: str) -> Tuple[float, int, float, float]:
1264
+ """
1265
+ Get current resource usage for a container
1266
+ """
1267
+ ...
1268
+
1269
+ def get_sub_containers_by_label(self, label_key: str, label_value: str) -> list:
1270
+ """
1271
+ Get running containers with specified label key and value
1272
+ """
1273
+ ...
1274
+
1275
+ def update_actions_resources(self) -> None:
1276
+ """
1277
+ Process both running and exited containers.
1278
+
1279
+ Note: Does not remove containers to keep logs. Only tracks resource usage.
1280
+ """
1281
+ ...
1282
+
1283
+ def update_max_action_usage(self, action_record_id: str, current_gpu_utilization: float, current_gpu_memory: int, current_cpu_utilization: float, current_memory_utilization: float) -> Tuple[float, int, float, float]:
1284
+ """
1285
+ Update and return maximum resource usage values for an action
1286
+ """
1287
+ ...
1288
+
1289
+
1290
+ # From resources_tracker
1291
+ class ContainerResourceMonitor:
1292
+ """
1293
+ Monitors individual container resource utilization and publishes to Kafka.
1294
+ This thread runs independently and reports CPU, memory, and GPU usage for all running containers.
1295
+ """
1296
+
1297
+ def __init__(self, instance_id: Optional[str] = None, kafka_bootstrap: Optional[str] = None, interval_seconds: int = 30) -> None:
1298
+ """
1299
+ Initialize ContainerResourceMonitor.
1300
+
1301
+ Args:
1302
+ instance_id: Instance identifier for Kafka topic. Defaults to INSTANCE_ID env var.
1303
+ kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
1304
+ interval_seconds: Interval between container checks in seconds. Defaults to 30.
1305
+ """
1306
+ ...
1307
+
1308
+ def is_running(self) -> bool:
1309
+ """
1310
+ Check if the container resource monitor is currently running.
1311
+
1312
+ Returns:
1313
+ bool: True if running, False otherwise.
1314
+ """
1315
+ ...
1316
+
1317
+ def start(self) -> Any:
1318
+ """
1319
+ Start the container resource monitoring thread.
1320
+
1321
+ Returns:
1322
+ bool: True if started successfully, False otherwise.
1323
+ """
1324
+ ...
1325
+
1326
+ def stop(self, timeout: int = 10) -> Any:
1327
+ """
1328
+ Stop the container resource monitoring thread gracefully.
1329
+
1330
+ Args:
1331
+ timeout: Maximum time to wait for thread to stop in seconds.
1332
+
1333
+ Returns:
1334
+ bool: True if stopped successfully, False otherwise.
1335
+ """
1336
+ ...
1337
+
1338
+
1339
+ # From resources_tracker
1340
+ class KafkaResourceMonitor:
1341
+ """
1342
+ Monitors system resources and publishes them to Kafka in a separate thread.
1343
+ This class provides thread-safe start/stop operations for resource monitoring.
1344
+ """
1345
+
1346
+ def __init__(self, instance_id: Optional[str] = None, kafka_bootstrap: Optional[str] = None, interval_seconds: int = 60) -> None:
1347
+ """
1348
+ Initialize KafkaResourceMonitor.
1349
+
1350
+ Args:
1351
+ instance_id: Instance identifier for Kafka topic. Defaults to INSTANCE_ID env var.
1352
+ kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
1353
+ interval_seconds: Interval between resource checks in seconds. Defaults to 60.
1354
+ """
1355
+ ...
1356
+
1357
+ def get_all_gpu_memory() -> Dict[int, tuple]:
1358
+ """
1359
+ Get GPU memory usage and total for all GPUs.
1360
+
1361
+ Returns:
1362
+ Dict[int, tuple]: Dictionary mapping GPU ID to (used_gb, total_gb).
1363
+ Returns empty dict if nvidia-smi is not available.
1364
+ """
1365
+ ...
1366
+
1367
+ def get_all_storage_info() -> Dict[str, tuple]:
1368
+ """
1369
+ Get storage information for all mounted drives.
1370
+
1371
+ Returns:
1372
+ Dict[str, tuple]: Dictionary mapping mount point to (free_gb, total_gb).
1373
+ """
1374
+ ...
1375
+
1376
+ def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
1377
+ """
1378
+ Collect current system resource statistics.
1379
+
1380
+ Returns:
1381
+ Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
1382
+ CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total), Storage dict (free, total)
1383
+ """
1384
+ ...
1385
+
1386
+ def is_running(self) -> bool:
1387
+ """
1388
+ Check if the resource monitor is currently running.
1389
+
1390
+ Returns:
1391
+ bool: True if running, False otherwise.
1392
+ """
1393
+ ...
1394
+
1395
+ def start(self) -> Any:
1396
+ """
1397
+ Start the resource monitoring thread.
1398
+
1399
+ Returns:
1400
+ bool: True if started successfully, False otherwise.
1401
+ """
1402
+ ...
1403
+
1404
+ def stop(self, timeout: int = 10) -> Any:
1405
+ """
1406
+ Stop the resource monitoring thread gracefully.
1407
+
1408
+ Args:
1409
+ timeout: Maximum time to wait for thread to stop in seconds.
1410
+
1411
+ Returns:
1412
+ bool: True if stopped successfully, False otherwise.
1413
+ """
1414
+ ...
1415
+
1416
+
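A hedged sketch wiring the Kafka monitors to the bootstrap servers obtained from Scaling, as the constructor docstrings above require.

```python
from matrice_compute.resources_tracker import ContainerResourceMonitor, KafkaResourceMonitor

bootstrap = scaling.get_kafka_bootstrap_servers()        # "ip:port", per the docstring above

machine_monitor = KafkaResourceMonitor(
    instance_id="inst-123", kafka_bootstrap=bootstrap, interval_seconds=60)
container_monitor = ContainerResourceMonitor(
    instance_id="inst-123", kafka_bootstrap=bootstrap, interval_seconds=30)

machine_monitor.start()
container_monitor.start()
# ... run workloads ...
container_monitor.stop(timeout=10)
machine_monitor.stop(timeout=10)
```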
1417
+ # From resources_tracker
1418
+ class MachineResourcesTracker:
1419
+ """
1420
+ Tracks machine-level resources like CPU, memory and GPU
1421
+ """
1422
+
1423
+ def __init__(self, scaling) -> None:
1424
+ """
1425
+ Initialize MachineResourcesTracker
1426
+ """
1427
+ ...
1428
+
1429
+ def update_available_resources(self) -> Any:
1430
+ """
1431
+ Update available machine resources
1432
+ """
1433
+ ...
1434
+
1435
+
1436
+ # From resources_tracker
1437
+ class ResourcesTracker:
1438
+ """
1439
+ Tracks machine and container resources.
1440
+
1441
+ GPU Utilization Note:
1442
+ GPU utilization is tracked at the DEVICE level, not per-container.
1443
+ NVIDIA does not expose reliable per-process GPU utilization.
1444
+ Per-container GPU MEMORY is accurate; per-container GPU UTILIZATION is best-effort.
1445
+ """
1446
+
1447
+ def __init__(self) -> None:
1448
+ """
1449
+ Initialize ResourcesTracker.
1450
+ """
1451
+ ...
1452
+
1453
+ def get_all_container_pids(self, container_id: str) -> set:
1454
+ """
1455
+ Get ALL PIDs belonging to a container (including child processes).
1456
+
1457
+ Uses multiple methods for robustness:
1458
+ 1. docker top (most reliable for standard Docker)
1459
+ 2. Docker API inspect + process tree enumeration
1460
+ 3. cgroup procs files (v1 and v2)
1461
+
1462
+ Known limitations:
1463
+ - May miss processes in rootless Docker
1464
+ - CRI-O/containerd may have different layouts
1465
+
1466
+ Args:
1467
+ container_id (str): ID of the Docker container.
1468
+
1469
+ Returns:
1470
+ set: Set of all PIDs (as strings) belonging to the container.
1471
+ """
1472
+ ...
1473
+
1474
+ def get_available_resources(self) -> Tuple[float, float, int, float]:
1475
+ """
1476
+ Get available machine resources.
1477
+
1478
+ Note: CPU measurement is non-blocking (uses interval=0).
1479
+ For more accurate CPU usage, call this method periodically and track trends.
1480
+
1481
+ Returns:
1482
+ Tuple[float, float, int, float]:
1483
+ - Available memory in GB
1484
+ - Available CPU percentage (100 - current_usage)
1485
+ - Free GPU memory in MB
1486
+ - GPU utilization percentage (0-100)
1487
+ """
1488
+ ...
1489
+
1490
+ def get_container_cpu_and_memory(self, container) -> Tuple[float, float]:
1491
+ """
1492
+ Get CPU and memory usage for a container.
1493
+
1494
+ Args:
1495
+ container (docker.models.containers.Container): Docker container instance.
1496
+
1497
+ Returns:
1498
+ Tuple[float, float]: CPU utilization percentage (0-100 per core used) and memory usage in MB.
1499
+ """
1500
+ ...
1501
+
1502
+ def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
1503
+ """
1504
+ Get CPU and memory usage for a specific container by its ID.
1505
+
1506
+ Args:
1507
+ container_id (str): ID of the Docker container.
1508
+
1509
+ Returns:
1510
+ Tuple[float, float]: CPU utilization percentage and memory usage in MB.
1511
+ """
1512
+ ...
1513
+
1514
+ def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
1515
+ """
1516
+ Get GPU usage for a specific container.
1517
+
1518
+ IMPORTANT: GPU utilization tracking limitations:
1519
+ - GPU MEMORY per container is ACCURATE (from nvidia-smi per-process data)
1520
+ - GPU UTILIZATION per container is BEST-EFFORT (NVIDIA doesn't expose per-process SM usage)
1521
+
1522
+ For GPU utilization, we report the utilization of GPUs that have container processes.
1523
+ If multiple containers share a GPU, they will all report similar utilization.
1524
+
1525
+ Args:
1526
+ container_id (str): ID of the Docker container.
1527
+
1528
+ Returns:
1529
+ Tuple[float, int]:
1530
+ - GPU utilization percentage (device-level, for GPUs used by container)
1531
+ - GPU memory usage in MB (accurate per-container)
1532
+ """
1533
+ ...
1534
+
1535
+ def get_container_gpu_memory_usage(self, container_pid: str) -> int:
1536
+ """
1537
+ Get GPU memory usage for a container PID.
1538
+
1539
+ Args:
1540
+ container_pid (str): PID of the Docker container.
1541
+
1542
+ Returns:
1543
+ int: GPU memory usage in MB.
1544
+ """
1545
+ ...
1546
+
1547
+ def get_container_gpu_memory_usage_multi_pid(self, container_pids: set) -> int:
1548
+ """
1549
+ Get GPU memory usage for multiple container PIDs.
1550
+
1551
+ Args:
1552
+ container_pids (set): Set of container PIDs (as strings).
1553
+
1554
+ Returns:
1555
+ int: Total GPU memory usage in MB across all matching processes.
1556
+ """
1557
+ ...
1558
+
1559
+ def get_container_gpu_usage(self, container_pid: str) -> float:
1560
+ """
1561
+ Get GPU usage for a container PID.
1562
+
1563
+ Args:
1564
+ container_pid (str): PID of the Docker container.
1565
+
1566
+ Returns:
1567
+ float: GPU utilization percentage.
1568
+ """
1569
+ ...
1570
+
1571
+ def get_container_gpu_usage_multi_pid(self, container_pids: set) -> float:
1572
+ """
1573
+ Get GPU usage for multiple container PIDs.
1574
+
1575
+ Args:
1576
+ container_pids (set): Set of container PIDs (as strings).
1577
+
1578
+ Returns:
1579
+ float: Total GPU utilization percentage across all matching processes.
1580
+ """
1581
+ ...
1582
+
1583
+ def get_pid_id_by_container_id(self, container_id: str) -> str:
1584
+ """
1585
+ Get PID for a container ID.
1586
+
1587
+ Args:
1588
+ container_id (str): ID of the Docker container.
1589
+
1590
+ Returns:
1591
+ str: PID of the container.
1592
+ """
1593
+ ...
1594
+
1595
+
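The documented return tuple of get_available_resources() can be unpacked directly; a small usage sketch:

```python
from matrice_compute.resources_tracker import ResourcesTracker

tracker = ResourcesTracker()
free_mem_gb, free_cpu_pct, free_gpu_mem_mb, gpu_util_pct = tracker.get_available_resources()
print(f"RAM free: {free_mem_gb:.1f} GB, CPU free: {free_cpu_pct:.0f}%, "
      f"GPU memory free: {free_gpu_mem_mb} MB, GPU utilization: {gpu_util_pct:.0f}%")
```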
1596
+ # From scaling
1597
+ class Scaling:
1598
+ """
1599
+ Class providing scaling functionality for compute instances.
1600
+ """
1601
+
1602
+ def __init__(self, session, instance_id = None, enable_kafka = False) -> None:
1603
+ """
1604
+ Initialize Scaling instance.
1605
+
1606
+ Args:
1607
+ session: Session object for making RPC calls
1608
+ instance_id: ID of the compute instance
1609
+ enable_kafka: Enable Kafka communication (default False)
1610
+
1611
+ Raises:
1612
+ Exception: If instance_id is not provided
1613
+ """
1614
+ ...
1615
+
1616
+ def add_account_compute_instance(self, account_number, alias, service_provider, instance_type, shut_down_time, lease_type, launch_duration) -> Any:
1617
+ """
1618
+ Add a compute instance for an account.
1619
+
1620
+ Args:
1621
+ account_number: Account number
1622
+ alias: Instance alias
1623
+ service_provider: Cloud service provider
1624
+ instance_type: Type of instance
1625
+ shut_down_time: Time to shutdown
1626
+ lease_type: Type of lease
1627
+ launch_duration: Duration to launch
1628
+
1629
+ Returns:
1630
+ Tuple of (data, error, message) from API response
1631
+ """
1632
+ ...
1633
+
1634
+ def assign_jobs(self, is_gpu) -> Any:
1635
+ """
1636
+ Assign jobs to the instance using REST API.
1637
+
1638
+ Args:
1639
+ is_gpu: Boolean or any value indicating if this is a GPU instance.
1640
+ Will be converted to proper boolean.
1641
+
1642
+ Returns:
1643
+ Tuple of (data, error, message) from API response
1644
+ """
1645
+ ...
1646
+
1647
+ def delete_account_compute(self, account_number, alias) -> Any:
1648
+ """
1649
+ Delete a compute instance for an account.
1650
+
1651
+ Args:
1652
+ account_number: Account number
1653
+ alias: Instance alias
1654
+
1655
+ Returns:
1656
+ Tuple of (data, error, message) from API response
1657
+ """
1658
+ ...
1659
+
1660
+ def get_action_details(self, action_status_id) -> Any:
1661
+ """
1662
+ Get details for a specific action using Kafka (with REST fallback).
1663
+
1664
+ Args:
1665
+ action_status_id: ID of the action status to fetch
1666
+
1667
+ Returns:
1668
+ Tuple of (data, error, message) from API response
1669
+ """
1670
+ ...
1671
+
1672
+ def get_all_instances_type(self) -> Any:
1673
+ """
1674
+ Get all instance types using Kafka (with REST fallback).
1675
+
1676
+ Returns:
1677
+ Tuple of (data, error, message) from API response
1678
+ """
1679
+ ...
1680
+
1681
+ def get_compute_details(self) -> Any:
1682
+ """
1683
+ Get compute instance details using Kafka (with REST fallback).
1684
+
1685
+ Returns:
1686
+ Tuple of (data, error, message) from API response
1687
+ """
1688
+ ...
1689
+
1690
+ def get_data_processing_image(self) -> Any:
1691
+ """
1692
+ Get data processing image name.
1693
+
1694
+ Returns:
1695
+ Full image name including repository and tag
1696
+ """
1697
+ ...
1698
+
1699
+ def get_docker_hub_credentials(self) -> Any:
1700
+ """
1701
+ Get Docker Hub credentials using Kafka (with REST fallback).
1702
+
1703
+ Returns:
1704
+ Tuple of (data, error, message) from API response
1705
+ """
1706
+ ...
1707
+
1708
+ def get_downscaled_ids(self) -> Any:
1709
+ """
1710
+ Get IDs of downscaled instances using Kafka (with REST fallback).
1711
+
1712
+ Returns:
1713
+ Tuple of (data, error, message) from API response
1714
+ """
1715
+ ...
1716
+
1717
+ def get_internal_api_key(self, action_id) -> Any:
1718
+ """
1719
+ Get internal API key using Kafka (with REST fallback).
1720
+
1721
+ Args:
1722
+ action_id: ID of the action
1723
+
1724
+ Returns:
1725
+ Tuple of (data, error, message) from API response
1726
+ """
1727
+ ...
1728
+
1729
+ def get_kafka_bootstrap_servers(self) -> Any:
1730
+ """
1731
+ Get Kafka bootstrap servers from API and decode base64 fields.
1732
+
1733
+ Returns:
1734
+ str: Kafka bootstrap servers in format "ip:port"
1735
+
1736
+ Raises:
1737
+ ValueError: If unable to fetch Kafka configuration
1738
+ """
1739
+ ...
1740
+
1741
+ def get_model_codebase(self, model_family_id) -> Any:
1742
+ """
1743
+ Get model codebase.
1744
+
1745
+ Args:
1746
+ model_family_id: ID of the model family
1747
+
1748
+ Returns:
1749
+ Tuple of (data, error, message) from API response
1750
+ """
1751
+ ...
1752
+
1753
+ def get_model_codebase_requirements(self, dockerId) -> Any:
1754
+ """
1755
+ Get model codebase requirements.
1756
+
1757
+ Args:
1758
+ dockerId: ID of the docker
1759
+
1760
+ Returns:
1761
+ Tuple of (data, error, message) from API response
1762
+ """
1763
+ ...
1764
+
1765
+ def get_model_codebase_script(self, model_family_id) -> Any:
1766
+ """
1767
+ Get model codebase script.
1768
+
1769
+ Args:
1770
+ model_family_id: ID of the model family
1771
+
1772
+ Returns:
1773
+ Tuple of (data, error, message) from API response
1774
+ """
1775
+ ...
1776
+
1777
+ def get_model_secret_keys(self, secret_name) -> Any:
1778
+ """
1779
+ Get model secret keys using Kafka (with REST fallback).
1780
+
1781
+ Args:
1782
+ secret_name: Name of the secret
1783
+
1784
+ Returns:
1785
+ Tuple of (data, error, message) from API response
1786
+ """
1787
+ ...
1788
+
1789
+ def get_open_port(self) -> Any:
1790
+ """
1791
+ Get an available open port.
1792
+
1793
+ Returns:
1794
+ Port number if available, None otherwise
1795
+ """
1796
+ ...
1797
+
1798
+ def get_open_ports_config(self) -> Any:
1799
+ """
1800
+ Get open ports configuration using Kafka (with REST fallback).
1801
+
1802
+ Returns:
1803
+ Tuple of (data, error, message) from API response
1804
+ """
1805
+ ...
1806
+
1807
+ def get_shutdown_details(self) -> Any:
1808
+ """
1809
+ Get shutdown details for the instance using Kafka (with REST fallback).
1810
+
1811
+ Returns:
1812
+ Tuple of (data, error, message) from API response
1813
+ """
1814
+ ...
1815
+
1816
+ def get_tasks_details(self) -> Any:
1817
+ """
1818
+ Get task details for the instance using Kafka (with REST fallback).
1819
+
1820
+ Returns:
1821
+ Tuple of (data, error, message) from API response
1822
+ """
1823
+ ...
1824
+
1825
+ def get_user_access_key_pair(self, user_id) -> Any:
1826
+ """
1827
+ Get user access key pair using Kafka (with REST fallback).
1828
+
1829
+ Args:
1830
+ user_id: ID of the user
1831
+
1832
+ Returns:
1833
+ Tuple of (data, error, message) from API response
1834
+ """
1835
+ ...
1836
+
1837
+ def handle_response(self, resp, success_message, error_message) -> Any:
1838
+ """
1839
+ Helper function to handle API response.
1840
+
1841
+ Args:
1842
+ resp: Response from API call
1843
+ success_message: Message to log on success
1844
+ error_message: Message to log on error
1845
+
1846
+ Returns:
1847
+ Tuple of (data, error, message)
1848
+ """
1849
+ ...
1850
+
1851
+ def refresh_presigned_url(self, url: str) -> Any:
1852
+ """
1853
+ Refresh a presigned URL that may have expired.
1854
+
1855
+ Args:
1856
+ url: The presigned URL to refresh
1857
+
1858
+ Returns:
1859
+ Tuple of (refreshed_url, error, message) from API response
1860
+ """
1861
+ ...
1862
+
1863
+ def report_architecture_info(self) -> Any:
1864
+ """
1865
+ Collects and sends architecture info to the compute service.
1866
+ """
1867
+ ...
1868
+
1869
+ def restart_account_compute(self, account_number, alias) -> Any:
1870
+ """
1871
+ Restart a compute instance for an account using Kafka (with REST fallback).
1872
+
1873
+ Args:
1874
+ account_number: Account number
1875
+ alias: Instance alias
1876
+
1877
+ Returns:
1878
+ Tuple of (data, error, message) from API response
1879
+ """
1880
+ ...
1881
+
1882
+ def shutdown(self) -> Any:
1883
+ """
1884
+ Gracefully shutdown Kafka connections.
1885
+ """
1886
+ ...
1887
+
1888
+ def stop_account_compute(self, account_number, alias) -> Any:
1889
+ """
1890
+ Stop a compute instance for an account using Kafka (with REST fallback).
1891
+
1892
+ Args:
1893
+ account_number: Account number
1894
+ alias: Instance alias
1895
+
1896
+ Returns:
1897
+ Tuple of (data, error, message) from API response
1898
+ """
1899
+ ...
1900
+
1901
+ def stop_instance(self) -> Any:
1902
+ """
1903
+ Stop the compute instance using Kafka (with REST fallback).
1904
+
1905
+ Returns:
1906
+ Tuple of (data, error, message) from API response
1907
+ """
1908
+ ...
1909
+
1910
+ def update_action(self, id = '', step_code = '', action_type = '', status = '', sub_action = '', status_description = '', service = '', job_params = None) -> Any:
1911
+ """
1912
+ Update an action using Kafka (with REST fallback).
1913
+
1914
+ Args:
1915
+ id: Action ID
1916
+ step_code: Step code
1917
+ action_type: Type of action
1918
+ status: Status of the action
1919
+ sub_action: Sub-action details
1920
+ status_description: Description of the status
1921
+ service: Service name
1922
+ job_params: Job parameters dictionary
1923
+
1924
+ Returns:
1925
+ Tuple of (data, error, message) from API response
1926
+ """
1927
+ ...
1928
+
1929
+ def update_action_container_id(self, action_record_id, container_id) -> Any:
1930
+ """
1931
+ Update container ID for an action using Kafka (with REST fallback).
1932
+
1933
+ Args:
1934
+ action_record_id: ID of the action record
1935
+ container_id: Container ID to update
1936
+
1937
+ Returns:
1938
+ Tuple of (data, error, message) from API response
1939
+ """
1940
+ ...
1941
+
1942
+ def update_action_docker_logs(self, action_record_id, log_content) -> Any:
1943
+ """
1944
+ Update docker logs for an action using Kafka (with REST fallback).
1945
+
1946
+ Args:
1947
+ action_record_id: ID of the action record
1948
+ log_content: Content of the logs to update
1949
+
1950
+ Returns:
1951
+ Tuple of (data, error, message) from API response
1952
+ """
1953
+ ...
1954
+
1955
+ def update_action_status(self, service_provider = '', action_record_id = '', isRunning = True, status = '', docker_start_time = None, action_duration = 0, cpuUtilisation = 0.0, gpuUtilisation = 0.0, memoryUtilisation = 0.0, gpuMemoryUsed = 0, createdAt = None, updatedAt = None) -> Any:
1956
+ """
1957
+ Update status of an action using Kafka (with REST fallback).
1958
+
1959
+ Args:
1960
+ service_provider: Provider of the service
1961
+ action_record_id: ID of the action record
1962
+ isRunning: Whether action is running
1963
+ status: Status of the action
1964
+ docker_start_time: Start time of docker container
1965
+ action_duration: Duration of the action
1966
+ cpuUtilisation: CPU utilization percentage
1967
+ gpuUtilisation: GPU utilization percentage
1968
+ memoryUtilisation: Memory utilization percentage
1969
+ gpuMemoryUsed: GPU memory used
1970
+ createdAt: Creation timestamp
1971
+ updatedAt: Last update timestamp
1972
+
1973
+ Returns:
1974
+ Tuple of (data, error, message) from API response
1975
+ """
1976
+ ...
1977
+
1978
+ def update_available_resources(self, availableCPU = 0, availableGPU = 0, availableMemory = 0, availableGPUMemory = 0) -> Any:
1979
+ """
1980
+ Update available resources for the instance using Kafka (with REST fallback).
1981
+
1982
+ Args:
1983
+ availableCPU: Available CPU resources
1984
+ availableGPU: Available GPU resources
1985
+ availableMemory: Available memory
1986
+ availableGPUMemory: Available GPU memory
1987
+
1988
+ Returns:
1989
+ Tuple of (data, error, message) from API response
1990
+ """
1991
+ ...
1992
+
1993
+ def update_jupyter_token(self, token = '') -> Any:
1994
+ """
1995
+ Update Jupyter notebook token using Kafka (with REST fallback).
1996
+ """
1997
+ ...
1998
+
1999
+ def update_status(self, action_record_id, action_type, service_name, stepCode, status, status_description) -> None:
2000
+ """
2001
+ Update status of an action using Kafka (with REST fallback).
2002
+
2003
+ Args:
2004
+ action_record_id: ID of the action record
2005
+ action_type: Type of action
2006
+ service_name: Name of the service
2007
+ stepCode: Code indicating step in process
2008
+ status: Status to update
2009
+ status_description: Description of the status
2010
+ """
2011
+ ...
2012
+
2013
+
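The Scaling methods above consistently return (data, error, message) tuples; a hedged usage sketch follows (field values are illustrative only).

```python
data, error, message = scaling.get_shutdown_details()
if error:
    print(f"failed to fetch shutdown details: {message}")
else:
    print("shutdown details:", data)

scaling.update_action_status(
    service_provider="aws",            # illustrative values only
    action_record_id="act-456",
    isRunning=True,
    status="OK",
    cpuUtilisation=12.5,
    memoryUtilisation=40.0,
)
```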
2014
+ # From shutdown_manager
2015
+ class ShutdownManager:
2016
+ """
2017
+ Class for managing compute instance shutdown.
2018
+ """
2019
+
2020
+ def __init__(self, scaling) -> None:
2021
+ """
2022
+ Initialize ShutdownManager.
2023
+
2024
+ Args:
2025
+ scaling (Scaling): Scaling instance to manage shutdown.
2026
+ """
2027
+ ...
2028
+
2029
+ def do_cleanup_and_shutdown(self) -> bool:
2030
+ """
2031
+ Clean up resources and shut down the instance.
2032
+
2033
+ This method attempts a coordinated shutdown with multiple fallback strategies:
2034
+ 1. API call to notify the scaling service
2035
+ 2. Graceful OS shutdown command
2036
+ 3. Aggressive shutdown methods if needed
2037
+ 4. Emergency forced shutdown as last resort
2038
+
2039
+ Returns:
2040
+ bool: True if shutdown was initiated successfully, False otherwise
2041
+ """
2042
+ ...
2043
+
2044
+ def handle_shutdown(self, tasks_running: bool) -> None:
2045
+ """
2046
+ Check idle time and trigger shutdown if threshold is exceeded.
2047
+
2048
+ Args:
2049
+ tasks_running: Boolean indicating if there are running tasks
2050
+ """
2051
+ ...
2052
+
2053
+
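A hedged sketch of an idle-watch loop around ShutdownManager, reusing the scaling and actions_manager objects from the earlier examples.

```python
import time
from matrice_compute.shutdown_manager import ShutdownManager

shutdown_manager = ShutdownManager(scaling)
while True:
    # Report whether any actions are still running; once the idle threshold is exceeded,
    # handle_shutdown() is expected to trigger do_cleanup_and_shutdown().
    tasks_running = bool(actions_manager.get_current_actions())
    shutdown_manager.handle_shutdown(tasks_running)
    time.sleep(60)
```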
2054
+ from . import action_instance, actions_manager, actions_scaledown_manager, compute_operations_handler, instance_manager, instance_utils, k8s_scheduler, prechecks, resources_tracker, scaling, shutdown_manager, task_utils
2055
+
2056
+ def __getattr__(name: str) -> Any: ...