matrice-compute 0.1.20__tar.gz → 0.1.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/action_instance.py +69 -14
  4. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/instance_utils.py +226 -28
  5. matrice_compute-0.1.21/src/matrice_compute/scaling.py +1224 -0
  6. matrice_compute-0.1.20/src/matrice_compute/scaling.py +0 -972
  7. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/LICENSE.txt +0 -0
  8. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/README.md +0 -0
  9. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/SOURCES.txt +0 -0
  10. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/dependency_links.txt +0 -0
  11. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/not-zip-safe +0 -0
  12. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/top_level.txt +0 -0
  13. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/pyproject.toml +0 -0
  14. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/setup.cfg +0 -0
  15. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/setup.py +0 -0
  16. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/__init__.py +0 -0
  17. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/actions_manager.py +0 -0
  18. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  19. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/instance_manager.py +0 -0
  20. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/prechecks.py +0 -0
  21. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/py.typed +0 -0
  22. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/resources_tracker.py +0 -0
  23. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.20
3
+ Version: 0.1.21
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.20
3
+ Version: 0.1.21
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -268,17 +268,68 @@ class ActionInstance:
268
268
  Returns:
269
269
  str: GPU configuration string
270
270
  """
271
- if not action_details["actionDetails"].get("gpuRequired", False):
271
+ action_id = action_details.get("_id", "unknown")
272
+
273
+ # Check if GPU is required
274
+ gpu_required = action_details["actionDetails"].get("gpuRequired", False)
275
+ if not gpu_required:
276
+ logging.info(
277
+ "Action %s does not require GPU - will run on CPU",
278
+ action_id
279
+ )
272
280
  return ""
273
- gpu_indices = get_gpu_with_sufficient_memory_for_action(
274
- action_details=action_details
281
+
282
+ # Get required GPU memory for logging
283
+ required_memory = action_details.get("actionDetails", {}).get(
284
+ "expectedResources", {}
285
+ ).get("gpuMemory", 0)
286
+
287
+ logging.info(
288
+ "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
289
+ action_id,
290
+ required_memory
275
291
  )
276
- if gpu_indices:
277
- gpu_str = ",".join(map(str, gpu_indices))
278
- logging.info("Using GPUs: %s", gpu_str)
279
- return f'--gpus "device={gpu_str}"'
280
- logging.info("No GPUs with sufficient memory found.")
281
- return ""
292
+
293
+ try:
294
+ # Get the best-fit GPU(s) with sufficient memory
295
+ gpu_indices = get_gpu_with_sufficient_memory_for_action(
296
+ action_details=action_details
297
+ )
298
+
299
+ if gpu_indices:
300
+ gpu_str = ",".join(map(str, gpu_indices))
301
+ logging.info(
302
+ "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
303
+ action_id,
304
+ gpu_str,
305
+ required_memory
306
+ )
307
+
308
+ # Return Docker GPU configuration
309
+ # Format: --gpus "device=0" or --gpus "device=0,1,2"
310
+ return f'--gpus "device={gpu_str}"'
311
+ else:
312
+ logging.warning(
313
+ "Action %s: No GPUs with sufficient memory found (required: %d MB)",
314
+ action_id,
315
+ required_memory
316
+ )
317
+ return ""
318
+
319
+ except ValueError as e:
320
+ logging.error(
321
+ "Action %s: Error selecting GPU - %s",
322
+ action_id,
323
+ str(e)
324
+ )
325
+ return ""
326
+ except Exception as e:
327
+ logging.error(
328
+ "Action %s: Unexpected error in GPU selection - %s",
329
+ action_id,
330
+ str(e)
331
+ )
332
+ return ""
282
333
 
283
334
  @log_errors(default_return="", raise_exception=False)
284
335
  def get_base_docker_cmd(
@@ -1410,13 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
1410
1461
  model_family=model_family,
1411
1462
  action_id=action_id,
1412
1463
  )
1464
+
1465
+ # Get GPU configuration based on requirements and availability
1466
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
1413
1467
  use_gpu = self.get_gpu_config(action_details)
1414
1468
 
1415
- gpuRequired = action_details["actionDetails"]["gpuRequired"]
1416
- if gpuRequired==False:
1417
- use_gpu = ""
1418
- else:
1419
- use_gpu = "--runtime=nvidia"
1469
+ logging.info(
1470
+ "Action %s: Model deployment GPU config: %s",
1471
+ action_id,
1472
+ use_gpu if use_gpu else "CPU-only"
1473
+ )
1474
+
1420
1475
  extra_env_vars = {"INTERNAL_PORT": internal_port}
1421
1476
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1422
1477
  logging.info("cmd is: %s", cmd)
@@ -589,19 +589,42 @@ def get_required_gpu_memory(action_details: dict) -> int:
589
589
 
590
590
  @log_errors(default_return=True, raise_exception=False)
591
591
  def is_allowed_gpu_device(gpu_index: int) -> bool:
592
- """Check if GPU device is allowed.
592
+ """Check if GPU device is allowed based on GPUS environment variable.
593
+
594
+ The GPUS environment variable can be used to restrict which GPU devices
595
+ are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).
593
596
 
594
597
  Args:
595
598
  gpu_index (int): GPU device index
596
599
 
597
600
  Returns:
598
- bool: True if GPU is allowed
601
+ bool: True if GPU is allowed (or no filter is set), False otherwise
599
602
  """
600
603
  gpus = os.environ.get("GPUS")
601
604
  if not gpus:
605
+ # No filter set - all GPUs are allowed
606
+ return True
607
+
608
+ try:
609
+ allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
610
+ is_allowed = int(gpu_index) in allowed_gpus
611
+
612
+ if not is_allowed:
613
+ logging.debug(
614
+ "GPU %d is not in allowed GPU list: %s",
615
+ gpu_index,
616
+ allowed_gpus
617
+ )
618
+
619
+ return is_allowed
620
+
621
+ except ValueError as e:
622
+ logging.warning(
623
+ "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
624
+ gpus,
625
+ e
626
+ )
602
627
  return True
603
- allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
604
- return int(gpu_index) in allowed_gpus
605
628
 
606
629
 
607
630
  @log_errors(raise_exception=True, log_error=False)
@@ -620,7 +643,15 @@ def get_gpu_with_sufficient_memory_for_action(
620
643
  Raises:
621
644
  ValueError: If insufficient GPU memory
622
645
  """
646
+ action_id = action_details.get("_id", "unknown")
623
647
  required_gpu_memory = get_required_gpu_memory(action_details)
648
+
649
+ logging.info(
650
+ "Action %s: Searching for GPU(s) with %d MB available memory",
651
+ action_id,
652
+ required_gpu_memory
653
+ )
654
+
624
655
  command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
625
656
  try:
626
657
  result = subprocess.run(
@@ -631,47 +662,137 @@ def get_gpu_with_sufficient_memory_for_action(
631
662
  check=False,
632
663
  )
633
664
  if result.returncode != 0:
665
+ error_msg = f"nvidia-smi command failed with return code {result.returncode}"
666
+ logging.error("Action %s: %s", action_id, error_msg)
634
667
  raise ValueError("Failed to get GPU information - nvidia-smi command failed")
635
668
  memory_free_info = result.stdout.decode("ascii").strip().split("\n")
636
669
  except subprocess.TimeoutExpired:
637
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
670
+ logging.error(
671
+ "Action %s: nvidia-smi command timed out after 5 seconds",
672
+ action_id
673
+ )
638
674
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
639
675
  except FileNotFoundError:
676
+ logging.error(
677
+ "Action %s: nvidia-smi not found on this system",
678
+ action_id
679
+ )
640
680
  raise ValueError("nvidia-smi not found - no GPU support available")
641
681
  except Exception as e:
642
- logging.warning("Error running nvidia-smi: %s", e)
682
+ logging.error(
683
+ "Action %s: Error running nvidia-smi: %s",
684
+ action_id,
685
+ e
686
+ )
643
687
  raise ValueError(f"Failed to get GPU information: {e}")
644
-
688
+
645
689
  if len(memory_free_info) < 2:
690
+ logging.error(
691
+ "Action %s: No GPU information available from nvidia-smi output",
692
+ action_id
693
+ )
646
694
  raise ValueError("No GPU information available from nvidia-smi")
647
-
695
+
648
696
  try:
649
697
  memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
650
698
  except (ValueError, IndexError) as e:
699
+ logging.error(
700
+ "Action %s: Error parsing GPU memory information: %s",
701
+ action_id,
702
+ e
703
+ )
651
704
  raise ValueError(f"Error parsing GPU memory information: {e}")
652
-
705
+
653
706
  if not memory_free_values:
707
+ logging.error("Action %s: No GPU devices found", action_id)
654
708
  raise ValueError("No GPU devices found")
655
-
709
+
710
+ # Log all available GPUs and their free memory
711
+ logging.info(
712
+ "Action %s: Found %d GPU(s) - Free memory: %s",
713
+ action_id,
714
+ len(memory_free_values),
715
+ ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
716
+ )
717
+
718
+ # Check GPUS environment variable for allowed devices
719
+ allowed_gpus = os.environ.get("GPUS", "")
720
+ if allowed_gpus:
721
+ logging.info(
722
+ "Action %s: GPU device filter active - allowed devices: %s",
723
+ action_id,
724
+ allowed_gpus
725
+ )
726
+
727
+ # For smaller memory requirements, try to fit on a single GPU first
656
728
  if required_gpu_memory < 80000:
729
+ logging.debug(
730
+ "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
731
+ action_id,
732
+ required_gpu_memory
733
+ )
657
734
  try:
658
- return get_single_gpu_with_sufficient_memory_for_action(action_details)
659
- except ValueError:
660
- pass
735
+ single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
736
+ logging.info(
737
+ "Action %s: Successfully allocated single GPU: %s",
738
+ action_id,
739
+ single_gpu
740
+ )
741
+ return single_gpu
742
+ except ValueError as e:
743
+ logging.debug(
744
+ "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
745
+ action_id,
746
+ str(e)
747
+ )
748
+
749
+ # Multi-GPU allocation: accumulate GPUs until we have enough memory
750
+ logging.info(
751
+ "Action %s: Attempting multi-GPU allocation for %d MB",
752
+ action_id,
753
+ required_gpu_memory
754
+ )
755
+
661
756
  selected_gpus = []
662
757
  total_memory = 0
663
758
  for i, mem in enumerate(memory_free_values):
664
759
  if not is_allowed_gpu_device(i):
760
+ logging.debug(
761
+ "Action %s: Skipping GPU %d - not in allowed device list",
762
+ action_id,
763
+ i
764
+ )
665
765
  continue
666
766
  if total_memory >= required_gpu_memory:
667
767
  break
668
768
  selected_gpus.append(i)
669
769
  total_memory += mem
770
+ logging.debug(
771
+ "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
772
+ action_id,
773
+ i,
774
+ mem,
775
+ total_memory
776
+ )
777
+
670
778
  if total_memory >= required_gpu_memory:
779
+ logging.info(
780
+ "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
781
+ action_id,
782
+ len(selected_gpus),
783
+ selected_gpus,
784
+ total_memory,
785
+ required_gpu_memory
786
+ )
671
787
  return selected_gpus
672
- raise ValueError(
673
- f"Insufficient GPU memory available. Required: {required_gpu_memory}MB, Available: {total_memory}MB"
788
+
789
+ error_msg = (
790
+ f"Insufficient GPU memory available. "
791
+ f"Required: {required_gpu_memory}MB, "
792
+ f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
674
793
  )
794
+ logging.error("Action %s: %s", action_id, error_msg)
795
+ raise ValueError(error_msg)
675
796
 
676
797
 
677
798
  @log_errors(raise_exception=True, log_error=False)
@@ -679,7 +800,10 @@ def get_single_gpu_with_sufficient_memory_for_action(
679
800
  action_details: dict,
680
801
  ) -> list:
681
802
  """
682
- Get single GPU with sufficient memory.
803
+ Get single GPU with sufficient memory using best-fit algorithm.
804
+
805
+ Best-fit selects the GPU with the smallest amount of free memory
806
+ that still meets the requirements, minimizing fragmentation.
683
807
 
684
808
  Args:
685
809
  action_details (dict): Action details
@@ -690,7 +814,15 @@ def get_single_gpu_with_sufficient_memory_for_action(
690
814
  Raises:
691
815
  ValueError: If no GPU has sufficient memory
692
816
  """
817
+ action_id = action_details.get("_id", "unknown")
693
818
  required_gpu_memory = get_required_gpu_memory(action_details)
819
+
820
+ logging.debug(
821
+ "Action %s: Finding best-fit single GPU for %d MB",
822
+ action_id,
823
+ required_gpu_memory
824
+ )
825
+
694
826
  command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
695
827
  try:
696
828
  result = subprocess.run(
@@ -704,38 +836,104 @@ def get_single_gpu_with_sufficient_memory_for_action(
704
836
  raise ValueError("Failed to get GPU information - nvidia-smi command failed")
705
837
  memory_free_info = result.stdout.decode("ascii").strip().split("\n")
706
838
  except subprocess.TimeoutExpired:
707
- logging.warning("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
839
+ logging.error(
840
+ "Action %s: nvidia-smi timed out in single GPU selection",
841
+ action_id
842
+ )
708
843
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
709
844
  except FileNotFoundError:
710
845
  raise ValueError("nvidia-smi not found - no GPU support available")
711
846
  except Exception as e:
712
- logging.warning("Error running nvidia-smi: %s", e)
847
+ logging.error(
848
+ "Action %s: Error running nvidia-smi: %s",
849
+ action_id,
850
+ e
851
+ )
713
852
  raise ValueError(f"Failed to get GPU information: {e}")
714
-
853
+
715
854
  if len(memory_free_info) < 2:
716
855
  raise ValueError("No GPU information available from nvidia-smi")
717
-
856
+
718
857
  try:
719
858
  memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
720
859
  except (ValueError, IndexError) as e:
721
860
  raise ValueError(f"Error parsing GPU memory information: {e}")
722
-
861
+
723
862
  if not memory_free_values:
724
863
  raise ValueError("No GPU devices found")
725
-
864
+
865
+ # Best-fit algorithm: find GPU with minimum free memory that meets requirement
726
866
  best_fit_gpu = None
727
867
  best_fit_memory = float("inf")
868
+
728
869
  for i, mem in enumerate(memory_free_values):
870
+ # Check if GPU is in allowed list
729
871
  if not is_allowed_gpu_device(i):
872
+ logging.debug(
873
+ "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
874
+ action_id,
875
+ i,
876
+ mem
877
+ )
730
878
  continue
731
- if mem >= required_gpu_memory and mem < best_fit_memory:
732
- best_fit_gpu = i
733
- best_fit_memory = mem
879
+
880
+ # Check if GPU has sufficient memory
881
+ if mem >= required_gpu_memory:
882
+ logging.debug(
883
+ "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
884
+ action_id,
885
+ i,
886
+ mem,
887
+ required_gpu_memory
888
+ )
889
+
890
+ # Best-fit: choose GPU with smallest sufficient memory
891
+ if mem < best_fit_memory:
892
+ best_fit_gpu = i
893
+ best_fit_memory = mem
894
+ logging.debug(
895
+ "Action %s: GPU %d is new best-fit candidate",
896
+ action_id,
897
+ i
898
+ )
899
+ else:
900
+ logging.debug(
901
+ "Action %s: GPU %d insufficient - %d MB free < %d MB required",
902
+ action_id,
903
+ i,
904
+ mem,
905
+ required_gpu_memory
906
+ )
907
+
734
908
  if best_fit_gpu is not None:
909
+ logging.info(
910
+ "Action %s: Selected best-fit GPU %d with %d MB free (required: %d MB, waste: %d MB)",
911
+ action_id,
912
+ best_fit_gpu,
913
+ best_fit_memory,
914
+ required_gpu_memory,
915
+ best_fit_memory - required_gpu_memory
916
+ )
735
917
  return [best_fit_gpu]
736
- raise ValueError(
737
- f"No single GPU with sufficient memory ({required_gpu_memory}MB) available"
738
- )
918
+
919
+ # No suitable GPU found - provide detailed error
920
+ suitable_gpus = [
921
+ f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
922
+ for i, mem in enumerate(memory_free_values)
923
+ if is_allowed_gpu_device(i)
924
+ ]
925
+
926
+ if not suitable_gpus:
927
+ error_msg = f"No allowed GPUs available (GPUS env filter active)"
928
+ else:
929
+ error_msg = (
930
+ f"No single GPU with sufficient memory. "
931
+ f"Required: {required_gpu_memory}MB. "
932
+ f"Available GPUs: {', '.join(suitable_gpus)}"
933
+ )
934
+
935
+ logging.warning("Action %s: %s", action_id, error_msg)
936
+ raise ValueError(error_msg)
739
937
 
740
938
 
741
939
  @log_errors(default_return=(None, None), raise_exception=False)