matrice-compute 0.1.20__tar.gz → 0.1.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/PKG-INFO +1 -1
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/action_instance.py +69 -14
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/instance_utils.py +226 -28
- matrice_compute-0.1.22/src/matrice_compute/scaling.py +1235 -0
- matrice_compute-0.1.20/src/matrice_compute/scaling.py +0 -972
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/LICENSE.txt +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/README.md +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/pyproject.toml +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/setup.cfg +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/setup.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/task_utils.py +0 -0
{matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/action_instance.py

@@ -268,17 +268,68 @@ class ActionInstance:
         Returns:
             str: GPU configuration string
         """
-
+        action_id = action_details.get("_id", "unknown")
+
+        # Check if GPU is required
+        gpu_required = action_details["actionDetails"].get("gpuRequired", False)
+        if not gpu_required:
+            logging.info(
+                "Action %s does not require GPU - will run on CPU",
+                action_id
+            )
             return ""
-
-
+
+        # Get required GPU memory for logging
+        required_memory = action_details.get("actionDetails", {}).get(
+            "expectedResources", {}
+        ).get("gpuMemory", 0)
+
+        logging.info(
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
+            action_id,
+            required_memory
         )
-
-
-
-
-
-
+
+        try:
+            # Get the best-fit GPU(s) with sufficient memory
+            gpu_indices = get_gpu_with_sufficient_memory_for_action(
+                action_details=action_details
+            )
+
+            if gpu_indices:
+                gpu_str = ",".join(map(str, gpu_indices))
+                logging.info(
+                    "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
+                    action_id,
+                    gpu_str,
+                    required_memory
+                )
+
+                # Return Docker GPU configuration
+                # Format: --gpus "device=0" or --gpus "device=0,1,2"
+                return f'--gpus "device={gpu_str}"'
+            else:
+                logging.warning(
+                    "Action %s: No GPUs with sufficient memory found (required: %d MB)",
+                    action_id,
+                    required_memory
+                )
+                return ""
+
+        except ValueError as e:
+            logging.error(
+                "Action %s: Error selecting GPU - %s",
+                action_id,
+                str(e)
+            )
+            return ""
+        except Exception as e:
+            logging.error(
+                "Action %s: Unexpected error in GPU selection - %s",
+                action_id,
+                str(e)
+            )
+            return ""

     @log_errors(default_return="", raise_exception=False)
     def get_base_docker_cmd(
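For context, the string returned above is meant to be passed into the Docker command that the later hunks build. A minimal sketch, outside the package, of how a selected index list maps onto that flag format (the helper name `format_gpu_flag` is illustrative only):

```python
# Illustrative sketch only (not from the package): builds the flag format
# that get_gpu_config returns above from a list of selected device indices.
def format_gpu_flag(gpu_indices: list) -> str:
    """Return a Docker --gpus flag for the given device indices, or '' for CPU-only."""
    if not gpu_indices:
        return ""
    device_list = ",".join(str(i) for i in gpu_indices)
    return f'--gpus "device={device_list}"'

# Example: format_gpu_flag([0, 2]) -> '--gpus "device=0,2"', which would be
# spliced into a command such as:
#   docker run --gpus "device=0,2" <image> python3 deploy.py <action_id> <port>
```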
@@ -1410,13 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
     use_gpu = self.get_gpu_config(action_details)

-
-
-
-
-
+    logging.info(
+        "Action %s: Model deployment GPU config: %s",
+        action_id,
+        use_gpu if use_gpu else "CPU-only"
+    )
+
     extra_env_vars = {"INTERNAL_PORT": internal_port}
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
{matrice_compute-0.1.20 → matrice_compute-0.1.22}/src/matrice_compute/instance_utils.py

@@ -589,19 +589,42 @@ def get_required_gpu_memory(action_details: dict) -> int:

 @log_errors(default_return=True, raise_exception=False)
 def is_allowed_gpu_device(gpu_index: int) -> bool:
-    """Check if GPU device is allowed.
+    """Check if GPU device is allowed based on GPUS environment variable.
+
+    The GPUS environment variable can be used to restrict which GPU devices
+    are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).

     Args:
         gpu_index (int): GPU device index

     Returns:
-        bool: True if GPU is allowed
+        bool: True if GPU is allowed (or no filter is set), False otherwise
     """
     gpus = os.environ.get("GPUS")
     if not gpus:
+        # No filter set - all GPUs are allowed
+        return True
+
+    try:
+        allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
+        is_allowed = int(gpu_index) in allowed_gpus
+
+        if not is_allowed:
+            logging.debug(
+                "GPU %d is not in allowed GPU list: %s",
+                gpu_index,
+                allowed_gpus
+            )
+
+        return is_allowed
+
+    except ValueError as e:
+        logging.warning(
+            "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
+            gpus,
+            e
+        )
         return True
-    allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
-    return int(gpu_index) in allowed_gpus


 @log_errors(raise_exception=True, log_error=False)
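To make the `GPUS` filter concrete: it is read straight from the environment as a comma-separated list of device indices, and a malformed value falls back to allowing everything. A small standalone sketch of that behaviour (the helper name `allowed_gpu_indices` is hypothetical, not part of the package):

```python
import os

def allowed_gpu_indices(total_gpus: int) -> list:
    """Return the GPU indices permitted by the GPUS environment variable."""
    raw = os.environ.get("GPUS", "")
    if not raw:
        # No filter set: every device is allowed.
        return list(range(total_gpus))
    try:
        allowed = {int(x) for x in raw.split(",") if x.strip()}
    except ValueError:
        # Malformed filter: allow everything, mirroring is_allowed_gpu_device.
        return list(range(total_gpus))
    return [i for i in range(total_gpus) if i in allowed]

# Example: with GPUS="0,2" and 4 visible devices, allowed_gpu_indices(4) -> [0, 2]
```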
@@ -620,7 +643,15 @@ get_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If insufficient GPU memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
+
+    logging.info(
+        "Action %s: Searching for GPU(s) with %d MB available memory",
+        action_id,
+        required_gpu_memory
+    )
+
     command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
         result = subprocess.run(
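Both allocation paths below start from the same `nvidia-smi` query shown in this hunk. A self-contained sketch of that query and the CSV parsing, assuming the 5-second timeout mentioned in the new log messages (the helper name is illustrative, not the package's API):

```python
import subprocess

def query_free_memory_mb() -> list:
    """Return free memory (in MB) for each visible GPU, in device order."""
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"],
        capture_output=True,
        timeout=5,
        check=False,
    )
    if result.returncode != 0:
        raise ValueError("Failed to get GPU information - nvidia-smi command failed")
    lines = result.stdout.decode("ascii").strip().split("\n")
    # The first line is the CSV header ("memory.free [MiB]"); each following
    # line looks like "12345 MiB", so take the leading number.
    return [int(line.split()[0]) for line in lines[1:] if line.strip()]
```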
@@ -631,47 +662,137 @@ get_gpu_with_sufficient_memory_for_action(
             check=False,
         )
         if result.returncode != 0:
+            error_msg = f"nvidia-smi command failed with return code {result.returncode}"
+            logging.error("Action %s: %s", action_id, error_msg)
             raise ValueError("Failed to get GPU information - nvidia-smi command failed")
         memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.
+        logging.error(
+            "Action %s: nvidia-smi command timed out after 5 seconds",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
     except FileNotFoundError:
+        logging.error(
+            "Action %s: nvidia-smi not found on this system",
+            action_id
+        )
         raise ValueError("nvidia-smi not found - no GPU support available")
     except Exception as e:
-        logging.
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Failed to get GPU information: {e}")
-
+
     if len(memory_free_info) < 2:
+        logging.error(
+            "Action %s: No GPU information available from nvidia-smi output",
+            action_id
+        )
         raise ValueError("No GPU information available from nvidia-smi")
-
+
     try:
         memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
     except (ValueError, IndexError) as e:
+        logging.error(
+            "Action %s: Error parsing GPU memory information: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Error parsing GPU memory information: {e}")
-
+
     if not memory_free_values:
+        logging.error("Action %s: No GPU devices found", action_id)
         raise ValueError("No GPU devices found")
-
+
+    # Log all available GPUs and their free memory
+    logging.info(
+        "Action %s: Found %d GPU(s) - Free memory: %s",
+        action_id,
+        len(memory_free_values),
+        ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
+    )
+
+    # Check GPUS environment variable for allowed devices
+    allowed_gpus = os.environ.get("GPUS", "")
+    if allowed_gpus:
+        logging.info(
+            "Action %s: GPU device filter active - allowed devices: %s",
+            action_id,
+            allowed_gpus
+        )
+
+    # For smaller memory requirements, try to fit on a single GPU first
     if required_gpu_memory < 80000:
+        logging.debug(
+            "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
+            action_id,
+            required_gpu_memory
+        )
         try:
-
-
-
+            single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
+            logging.info(
+                "Action %s: Successfully allocated single GPU: %s",
+                action_id,
+                single_gpu
+            )
+            return single_gpu
+        except ValueError as e:
+            logging.debug(
+                "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
+                action_id,
+                str(e)
+            )
+
+    # Multi-GPU allocation: accumulate GPUs until we have enough memory
+    logging.info(
+        "Action %s: Attempting multi-GPU allocation for %d MB",
+        action_id,
+        required_gpu_memory
+    )
+
     selected_gpus = []
     total_memory = 0
     for i, mem in enumerate(memory_free_values):
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d - not in allowed device list",
+                action_id,
+                i
+            )
             continue
         if total_memory >= required_gpu_memory:
             break
         selected_gpus.append(i)
         total_memory += mem
+        logging.debug(
+            "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
+            action_id,
+            i,
+            mem,
+            total_memory
+        )
+
     if total_memory >= required_gpu_memory:
+        logging.info(
+            "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
+            action_id,
+            len(selected_gpus),
+            selected_gpus,
+            total_memory,
+            required_gpu_memory
+        )
         return selected_gpus
-
-
+
+    error_msg = (
+        f"Insufficient GPU memory available. "
+        f"Required: {required_gpu_memory}MB, "
+        f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
     )
+    logging.error("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)


 @log_errors(raise_exception=True, log_error=False)
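The multi-GPU path added above is a greedy accumulation over the per-device free-memory list: skip disallowed devices, keep adding devices until the requirement is covered, and fail if it never is. A condensed sketch of that loop, with `allowed` standing in for `is_allowed_gpu_device` (illustrative only, not the package's code):

```python
def accumulate_gpus(free_mb: list, required_mb: int, allowed=lambda i: True) -> list:
    """Greedily collect GPU indices until their combined free memory covers required_mb."""
    selected, total = [], 0
    for i, mem in enumerate(free_mb):
        if not allowed(i):
            continue
        if total >= required_mb:
            break
        selected.append(i)
        total += mem
    if total >= required_mb:
        return selected
    raise ValueError(
        f"Insufficient GPU memory available. "
        f"Required: {required_mb}MB, Available: {total}MB across {len(selected)} GPU(s)"
    )

# Example: accumulate_gpus([40000, 40000, 24000], 70000) -> [0, 1]
```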
@@ -679,7 +800,10 @@ def get_single_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
     """
-    Get single GPU with sufficient memory.
+    Get single GPU with sufficient memory using best-fit algorithm.
+
+    Best-fit selects the GPU with the smallest amount of free memory
+    that still meets the requirements, minimizing fragmentation.

     Args:
         action_details (dict): Action details
@@ -690,7 +814,15 @@ def get_single_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If no GPU has sufficient memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
+
+    logging.debug(
+        "Action %s: Finding best-fit single GPU for %d MB",
+        action_id,
+        required_gpu_memory
+    )
+
     command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
         result = subprocess.run(
@@ -704,38 +836,104 @@ def get_single_gpu_with_sufficient_memory_for_action(
             raise ValueError("Failed to get GPU information - nvidia-smi command failed")
         memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.
+        logging.error(
+            "Action %s: nvidia-smi timed out in single GPU selection",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
     except FileNotFoundError:
         raise ValueError("nvidia-smi not found - no GPU support available")
     except Exception as e:
-        logging.
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Failed to get GPU information: {e}")
-
+
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
-
+
     try:
         memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
     except (ValueError, IndexError) as e:
         raise ValueError(f"Error parsing GPU memory information: {e}")
-
+
     if not memory_free_values:
         raise ValueError("No GPU devices found")
-
+
+    # Best-fit algorithm: find GPU with minimum free memory that meets requirement
     best_fit_gpu = None
     best_fit_memory = float("inf")
+
     for i, mem in enumerate(memory_free_values):
+        # Check if GPU is in allowed list
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
+                action_id,
+                i,
+                mem
+            )
             continue
-
-
-
+
+        # Check if GPU has sufficient memory
+        if mem >= required_gpu_memory:
+            logging.debug(
+                "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
+
+            # Best-fit: choose GPU with smallest sufficient memory
+            if mem < best_fit_memory:
+                best_fit_gpu = i
+                best_fit_memory = mem
+                logging.debug(
+                    "Action %s: GPU %d is new best-fit candidate",
+                    action_id,
+                    i
+                )
+        else:
+            logging.debug(
+                "Action %s: GPU %d insufficient - %d MB free < %d MB required",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
+
     if best_fit_gpu is not None:
+        logging.info(
+            "Action %s: Selected best-fit GPU %d with %d MB free (required: %d MB, waste: %d MB)",
+            action_id,
+            best_fit_gpu,
+            best_fit_memory,
+            required_gpu_memory,
+            best_fit_memory - required_gpu_memory
+        )
         return [best_fit_gpu]
-
-
-
+
+    # No suitable GPU found - provide detailed error
+    suitable_gpus = [
+        f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
+        for i, mem in enumerate(memory_free_values)
+        if is_allowed_gpu_device(i)
+    ]
+
+    if not suitable_gpus:
+        error_msg = f"No allowed GPUs available (GPUS env filter active)"
+    else:
+        error_msg = (
+            f"No single GPU with sufficient memory. "
+            f"Required: {required_gpu_memory}MB. "
+            f"Available GPUs: {', '.join(suitable_gpus)}"
+        )
+
+    logging.warning("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)


 @log_errors(default_return=(None, None), raise_exception=False)