matrice-compute 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +3 -3
- matrice_compute/instance_utils.py +21 -16
- {matrice_compute-0.1.23.dist-info → matrice_compute-0.1.24.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.23.dist-info → matrice_compute-0.1.24.dist-info}/RECORD +7 -7
- {matrice_compute-0.1.23.dist-info → matrice_compute-0.1.24.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.23.dist-info → matrice_compute-0.1.24.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.23.dist-info → matrice_compute-0.1.24.dist-info}/top_level.txt +0 -0
|
@@ -285,13 +285,13 @@ class ActionInstance:
|
|
|
285
285
|
).get("gpuMemory", 0)
|
|
286
286
|
|
|
287
287
|
logging.info(
|
|
288
|
-
"Action %s requires GPU with %d MB memory - selecting
|
|
288
|
+
"Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
|
|
289
289
|
action_id,
|
|
290
290
|
required_memory
|
|
291
291
|
)
|
|
292
292
|
|
|
293
293
|
try:
|
|
294
|
-
# Get the
|
|
294
|
+
# Get the GPU(s) with most free memory that have sufficient memory
|
|
295
295
|
gpu_indices = get_gpu_with_sufficient_memory_for_action(
|
|
296
296
|
action_details=action_details
|
|
297
297
|
)
|
|
@@ -1463,7 +1463,7 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1463
1463
|
)
|
|
1464
1464
|
|
|
1465
1465
|
# Get GPU configuration based on requirements and availability
|
|
1466
|
-
# This
|
|
1466
|
+
# This selects the GPU(s) with the most free memory to balance load
|
|
1467
1467
|
use_gpu = self.get_gpu_config(action_details)
|
|
1468
1468
|
|
|
1469
1469
|
logging.info(
|
|
@@ -600,13 +600,18 @@ def is_allowed_gpu_device(gpu_index: int) -> bool:
|
|
|
600
600
|
Returns:
|
|
601
601
|
bool: True if GPU is allowed (or no filter is set), False otherwise
|
|
602
602
|
"""
|
|
603
|
-
gpus = os.environ.get("GPUS")
|
|
604
|
-
|
|
605
|
-
|
|
603
|
+
gpus = os.environ.get("GPUS", "").strip()
|
|
604
|
+
# No filter set or empty string - all GPUs are allowed
|
|
605
|
+
if not gpus or gpus == '""' or gpus == "''":
|
|
606
606
|
return True
|
|
607
607
|
|
|
608
608
|
try:
|
|
609
|
-
allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
|
|
609
|
+
allowed_gpus = [int(x.strip()) for x in gpus.split(",") if x.strip()]
|
|
610
|
+
|
|
611
|
+
# If no valid GPUs after parsing, allow all
|
|
612
|
+
if not allowed_gpus:
|
|
613
|
+
return True
|
|
614
|
+
|
|
610
615
|
is_allowed = int(gpu_index) in allowed_gpus
|
|
611
616
|
|
|
612
617
|
if not is_allowed:
|
|
@@ -727,14 +732,14 @@ def get_gpu_with_sufficient_memory_for_action(
|
|
|
727
732
|
# For smaller memory requirements, try to fit on a single GPU first
|
|
728
733
|
if required_gpu_memory < 80000:
|
|
729
734
|
logging.debug(
|
|
730
|
-
"Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
|
|
735
|
+
"Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation (selecting GPU with most free memory)",
|
|
731
736
|
action_id,
|
|
732
737
|
required_gpu_memory
|
|
733
738
|
)
|
|
734
739
|
try:
|
|
735
740
|
single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
|
|
736
741
|
logging.info(
|
|
737
|
-
"Action %s: Successfully allocated single GPU: %s",
|
|
742
|
+
"Action %s: Successfully allocated single GPU with most free memory: %s",
|
|
738
743
|
action_id,
|
|
739
744
|
single_gpu
|
|
740
745
|
)
|
|
@@ -800,10 +805,10 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
800
805
|
action_details: dict,
|
|
801
806
|
) -> list:
|
|
802
807
|
"""
|
|
803
|
-
Get single GPU with sufficient memory using
|
|
808
|
+
Get single GPU with sufficient memory using most-free algorithm.
|
|
804
809
|
|
|
805
|
-
|
|
806
|
-
|
|
810
|
+
Selects the GPU with the MOST free memory that meets the requirements,
|
|
811
|
+
to balance load across GPUs and prevent any single GPU from being overused.
|
|
807
812
|
|
|
808
813
|
Args:
|
|
809
814
|
action_details (dict): Action details
|
|
@@ -818,7 +823,7 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
818
823
|
required_gpu_memory = get_required_gpu_memory(action_details)
|
|
819
824
|
|
|
820
825
|
logging.debug(
|
|
821
|
-
"Action %s: Finding
|
|
826
|
+
"Action %s: Finding GPU with most free memory for %d MB",
|
|
822
827
|
action_id,
|
|
823
828
|
required_gpu_memory
|
|
824
829
|
)
|
|
@@ -862,9 +867,9 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
862
867
|
if not memory_free_values:
|
|
863
868
|
raise ValueError("No GPU devices found")
|
|
864
869
|
|
|
865
|
-
#
|
|
870
|
+
# Most-free algorithm: find GPU with MAXIMUM free memory that meets requirement
|
|
866
871
|
best_fit_gpu = None
|
|
867
|
-
best_fit_memory = float("inf")
|
|
872
|
+
best_fit_memory = 0 # Changed from float("inf") to 0
|
|
868
873
|
|
|
869
874
|
for i, mem in enumerate(memory_free_values):
|
|
870
875
|
# Check if GPU is in allowed list
|
|
@@ -887,12 +892,12 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
887
892
|
required_gpu_memory
|
|
888
893
|
)
|
|
889
894
|
|
|
890
|
-
#
|
|
891
|
-
if mem < best_fit_memory:
|
|
895
|
+
# Most-free: choose GPU with MOST free memory to balance load
|
|
896
|
+
if mem > best_fit_memory: # Changed from < to >
|
|
892
897
|
best_fit_gpu = i
|
|
893
898
|
best_fit_memory = mem
|
|
894
899
|
logging.debug(
|
|
895
|
-
"Action %s: GPU %d is new best
|
|
900
|
+
"Action %s: GPU %d is new best candidate (most free memory)",
|
|
896
901
|
action_id,
|
|
897
902
|
i
|
|
898
903
|
)
|
|
@@ -907,7 +912,7 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
907
912
|
|
|
908
913
|
if best_fit_gpu is not None:
|
|
909
914
|
logging.info(
|
|
910
|
-
"Action %s: Selected
|
|
915
|
+
"Action %s: Selected GPU %d with most free memory: %d MB free (required: %d MB, available: %d MB)",
|
|
911
916
|
action_id,
|
|
912
917
|
best_fit_gpu,
|
|
913
918
|
best_fit_memory,
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
2
|
+
matrice_compute/action_instance.py,sha256=NK_ZWvNDrLUeOzWwXjxrX7XP-lDHbx5-A0K8ByFpnUg,66241
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
5
|
matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
|
|
6
|
-
matrice_compute/instance_utils.py,sha256=
|
|
6
|
+
matrice_compute/instance_utils.py,sha256=xDOLo21G7unvlGTpnYQkEWSkyuAsVAcs4scOHy5Oxi4,38204
|
|
7
7
|
matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
|
|
8
8
|
matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
matrice_compute/resources_tracker.py,sha256=pkdt0aVKx_TpY_Sq---73w9INkDffZZe3mZGlp1EftE,22573
|
|
10
10
|
matrice_compute/scaling.py,sha256=CeT_lxJNkjJamRETG1lWaOtdSr5ySmcaMcqt7-lFRbo,23731
|
|
11
11
|
matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
|
|
12
12
|
matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
|
|
13
|
-
matrice_compute-0.1.
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
13
|
+
matrice_compute-0.1.24.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
14
|
+
matrice_compute-0.1.24.dist-info/METADATA,sha256=5fsmPC37r0KPPd6h0qQXnvm0dFqLqboVInQdv7KCr5Y,1038
|
|
15
|
+
matrice_compute-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
16
|
+
matrice_compute-0.1.24.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
17
|
+
matrice_compute-0.1.24.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|