matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -95,28 +95,72 @@ def get_instance_info(service_provider: str = None, instance_id: str = None) ->
95
95
  return str(auto_service_provider), str(auto_instance_id)
96
96
 
97
97
 
98
+ def _normalize_timestamp(timestamp_str: str) -> str:
99
+ """
100
+ Normalize timestamp string to handle different precision levels.
101
+
102
+ Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
103
+ and various timezone formats across different cloud providers.
104
+
105
+ Args:
106
+ timestamp_str (str): Timestamp string in various formats
107
+
108
+ Returns:
109
+ str: Normalized timestamp string compatible with fromisoformat()
110
+ """
111
+ # Replace 'Z' with '+00:00' for UTC timestamps
112
+ timestamp_str = timestamp_str.replace("Z", "+00:00")
113
+
114
+ # Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
115
+ # Some providers (like OCI, GCP) may return nanoseconds (9 digits)
116
+ if "." in timestamp_str:
117
+ # Split into main part and fractional part
118
+ if "+" in timestamp_str:
119
+ main_part, tz_part = timestamp_str.rsplit("+", 1)
120
+ tz_suffix = "+" + tz_part
121
+ elif timestamp_str.count("-") > 2: # Has negative timezone offset
122
+ main_part, tz_part = timestamp_str.rsplit("-", 1)
123
+ tz_suffix = "-" + tz_part
124
+ else:
125
+ main_part = timestamp_str
126
+ tz_suffix = ""
127
+
128
+ # Split main part into date/time and fractional seconds
129
+ datetime_part, fractional = main_part.rsplit(".", 1)
130
+
131
+ # Truncate fractional seconds to 6 digits (microseconds)
132
+ if len(fractional) > 6:
133
+ fractional = fractional[:6]
134
+
135
+ # Reconstruct timestamp
136
+ timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
137
+
138
+ return timestamp_str
139
+
140
+
98
141
@log_errors(default_return=0, raise_exception=False, log_error=False)
def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
    """
    Calculate time difference between start and finish times.

    Both inputs are passed through _normalize_timestamp() and parsed with
    datetime.fromisoformat(). A timestamp that carries no timezone offset
    is interpreted as UTC, so a naive value can be subtracted from an
    aware one instead of raising TypeError.

    Args:
        start_time_str (str): Start time string in ISO format
        finish_time_str (str): Finish time string in ISO format

    Returns:
        int: Time difference in seconds (finish - start), truncated
            toward zero
    """
    # Function-scope import: only `datetime` is known to be imported at
    # module level from what is visible of this file.
    from datetime import timezone

    def _parse_as_utc_aware(raw: str):
        """Parse one timestamp; attach UTC when no offset is present."""
        parsed = datetime.fromisoformat(_normalize_timestamp(raw))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    start_time = _parse_as_utc_aware(start_time_str)
    finish_time = _parse_as_utc_aware(finish_time_str)

    return int((finish_time - start_time).total_seconds())
121
165
 
122
166
 
@@ -129,14 +173,25 @@ def has_gpu() -> bool:
129
173
  bool: True if GPU is present, False otherwise
130
174
  """
131
175
  try:
132
- subprocess.run("nvidia-smi", timeout=5)
133
- return True
176
+ result = subprocess.run(
177
+ ["nvidia-smi"],
178
+ stdout=subprocess.PIPE,
179
+ stderr=subprocess.PIPE,
180
+ timeout=5,
181
+ check=False,
182
+ )
183
+ return result.returncode == 0
134
184
  except subprocess.TimeoutExpired:
135
- logging.warning("nvidia-smi command timed out after 5 seconds")
185
+ logging.debug("nvidia-smi command timed out after 5 seconds")
186
+ return False
187
+ except FileNotFoundError:
188
+ logging.debug("nvidia-smi not found on this system")
189
+ return False
190
+ except Exception:
136
191
  return False
137
192
 
138
193
 
139
- @log_errors(default_return=0, raise_exception=False)
194
+ @log_errors(default_return=0, raise_exception=False, log_error=False)
140
195
  def get_gpu_memory_usage() -> float:
141
196
  """
142
197
  Get GPU memory usage percentage.
@@ -144,17 +199,35 @@ def get_gpu_memory_usage() -> float:
144
199
  Returns:
145
200
  float: Memory usage between 0 and 1
146
201
  """
147
- command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
202
+ command = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
148
203
  try:
149
- output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
204
+ result = subprocess.run(
205
+ command,
206
+ stdout=subprocess.PIPE,
207
+ stderr=subprocess.PIPE,
208
+ timeout=5,
209
+ check=False,
210
+ )
211
+ if result.returncode != 0:
212
+ logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
213
+ return 0
214
+ output = result.stdout.decode("ascii").strip().split("\n")
150
215
  memory_percentages = []
151
216
  for line in output:
152
- used, total = map(int, line.split(","))
153
- usage_percentage = used / total
154
- memory_percentages.append(usage_percentage)
155
- return min(memory_percentages)
217
+ if line.strip():
218
+ used, total = map(int, line.split(","))
219
+ if total > 0:
220
+ usage_percentage = used / total
221
+ memory_percentages.append(usage_percentage)
222
+ return min(memory_percentages) if memory_percentages else 0
156
223
  except subprocess.TimeoutExpired:
157
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
224
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
225
+ return 0
226
+ except (ValueError, IndexError) as e:
227
+ logging.debug("Error parsing GPU memory info: %s", e)
228
+ return 0
229
+ except Exception as e:
230
+ logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
158
231
  return 0
159
232
 
160
233
 
@@ -194,7 +267,7 @@ def get_mem_usage() -> float:
194
267
  return mem_usage
195
268
 
196
269
 
197
- @log_errors(default_return=[], raise_exception=False)
270
+ @log_errors(default_return=[], raise_exception=False, log_error=False)
198
271
  def get_gpu_info() -> list:
199
272
  """
200
273
  Get GPU information.
@@ -202,23 +275,34 @@ def get_gpu_info() -> list:
202
275
  Returns:
203
276
  list: GPU information strings
204
277
  """
205
- proc = subprocess.Popen(
206
- [
207
- "nvidia-smi",
208
- "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
209
- "--format=csv,noheader,nounits",
210
- ],
211
- stdout=subprocess.PIPE,
212
- stderr=subprocess.PIPE,
213
- )
214
278
  try:
215
- stdout, stderr = proc.communicate(timeout=5)
216
- output = stdout.decode("UTF-8")
217
- return output.split("\n")[:-1]
218
- except subprocess.TimeoutExpired:
219
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
220
- proc.kill()
221
- proc.communicate() # flush output after kill
279
+ proc = subprocess.Popen(
280
+ [
281
+ "nvidia-smi",
282
+ "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
283
+ "--format=csv,noheader,nounits",
284
+ ],
285
+ stdout=subprocess.PIPE,
286
+ stderr=subprocess.PIPE,
287
+ )
288
+ try:
289
+ stdout, stderr = proc.communicate(timeout=5)
290
+ if proc.returncode != 0:
291
+ logging.debug("nvidia-smi command failed in get_gpu_info")
292
+ return []
293
+ output = stdout.decode("UTF-8")
294
+ result = [line for line in output.split("\n") if line.strip()]
295
+ return result
296
+ except subprocess.TimeoutExpired:
297
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
298
+ proc.kill()
299
+ proc.communicate() # flush output after kill
300
+ return []
301
+ except FileNotFoundError:
302
+ logging.debug("nvidia-smi not found on this system")
303
+ return []
304
+ except Exception as e:
305
+ logging.debug("Error getting GPU info: %s", e)
222
306
  return []
223
307
 
224
308
 
@@ -241,11 +325,29 @@ def is_docker_running() -> bool:
241
325
  Returns:
242
326
  bool: True if Docker containers are running
243
327
  """
244
- command = "docker ps"
245
- docker_images = (
246
- subprocess.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
247
- )
248
- return bool(docker_images)
328
+ command = ["docker", "ps"]
329
+ try:
330
+ result = subprocess.run(
331
+ command,
332
+ stdout=subprocess.PIPE,
333
+ stderr=subprocess.PIPE,
334
+ check=False,
335
+ timeout=10,
336
+ )
337
+ if result.returncode != 0:
338
+ logging.warning("docker ps command failed")
339
+ return False
340
+ docker_images = result.stdout.decode("ascii").split("\n")[:-1][1:]
341
+ return bool(docker_images)
342
+ except subprocess.TimeoutExpired:
343
+ logging.warning("docker ps command timed out")
344
+ return False
345
+ except FileNotFoundError:
346
+ logging.warning("docker command not found")
347
+ return False
348
+ except Exception as e:
349
+ logging.warning("Error checking if docker is running: %s", e)
350
+ return False
249
351
 
250
352
 
251
353
  @log_errors(default_return=None, raise_exception=False)
@@ -487,22 +589,45 @@ def get_required_gpu_memory(action_details: dict) -> int:
487
589
 
488
590
@log_errors(default_return=True, raise_exception=False)
def is_allowed_gpu_device(gpu_index: int) -> bool:
    """Check if GPU device is allowed based on GPUS environment variable.

    The GPUS environment variable can be used to restrict which GPU devices
    are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).

    Args:
        gpu_index (int): GPU device index. Any value accepted by int()
            (such as a numeric string) is converted before comparison.

    Returns:
        bool: True if GPU is allowed (or no filter is set, or the filter /
            index cannot be parsed), False otherwise
    """
    gpus = os.environ.get("GPUS")
    if not gpus:
        # No filter set - all GPUs are allowed
        return True

    # Parse the filter on its own so a malformed GPUS value is reported
    # as such and not confused with a malformed gpu_index.
    try:
        allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
    except ValueError as e:
        logging.warning(
            "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
            gpus,
            e,
        )
        return True

    try:
        index = int(gpu_index)
    except (TypeError, ValueError) as e:
        # Fail open, consistent with the malformed-filter case above.
        logging.warning(
            "Invalid GPU index %r: %s. Allowing GPU.",
            gpu_index,
            e,
        )
        return True

    is_allowed = index in allowed_gpus

    if not is_allowed:
        # Log the converted int: '%d' would fail to format a string index.
        logging.debug(
            "GPU %d is not in allowed GPU list: %s",
            index,
            allowed_gpus,
        )

    return is_allowed
503
628
 
504
629
 
505
- @log_errors(raise_exception=True)
630
+ @log_errors(raise_exception=True, log_error=False)
506
631
  def get_gpu_with_sufficient_memory_for_action(
507
632
  action_details: dict,
508
633
  ) -> list:
@@ -518,44 +643,167 @@ def get_gpu_with_sufficient_memory_for_action(
518
643
  Raises:
519
644
  ValueError: If insufficient GPU memory
520
645
  """
646
+ action_id = action_details.get("_id", "unknown")
521
647
  required_gpu_memory = get_required_gpu_memory(action_details)
522
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
648
+
649
+ logging.info(
650
+ "Action %s: Searching for GPU(s) with %d MB available memory",
651
+ action_id,
652
+ required_gpu_memory
653
+ )
654
+
655
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
523
656
  try:
524
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
657
+ result = subprocess.run(
658
+ command,
659
+ stdout=subprocess.PIPE,
660
+ stderr=subprocess.PIPE,
661
+ timeout=5,
662
+ check=False,
663
+ )
664
+ if result.returncode != 0:
665
+ error_msg = f"nvidia-smi command failed with return code {result.returncode}"
666
+ logging.error("Action %s: %s", action_id, error_msg)
667
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
668
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
525
669
  except subprocess.TimeoutExpired:
526
- logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
670
+ logging.error(
671
+ "Action %s: nvidia-smi command timed out after 5 seconds",
672
+ action_id
673
+ )
527
674
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
528
-
675
+ except FileNotFoundError:
676
+ logging.error(
677
+ "Action %s: nvidia-smi not found on this system",
678
+ action_id
679
+ )
680
+ raise ValueError("nvidia-smi not found - no GPU support available")
681
+ except Exception as e:
682
+ logging.error(
683
+ "Action %s: Error running nvidia-smi: %s",
684
+ action_id,
685
+ e
686
+ )
687
+ raise ValueError(f"Failed to get GPU information: {e}")
688
+
529
689
  if len(memory_free_info) < 2:
690
+ logging.error(
691
+ "Action %s: No GPU information available from nvidia-smi output",
692
+ action_id
693
+ )
530
694
  raise ValueError("No GPU information available from nvidia-smi")
531
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
695
+
696
+ try:
697
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
698
+ except (ValueError, IndexError) as e:
699
+ logging.error(
700
+ "Action %s: Error parsing GPU memory information: %s",
701
+ action_id,
702
+ e
703
+ )
704
+ raise ValueError(f"Error parsing GPU memory information: {e}")
705
+
706
+ if not memory_free_values:
707
+ logging.error("Action %s: No GPU devices found", action_id)
708
+ raise ValueError("No GPU devices found")
709
+
710
+ # Log all available GPUs and their free memory
711
+ logging.info(
712
+ "Action %s: Found %d GPU(s) - Free memory: %s",
713
+ action_id,
714
+ len(memory_free_values),
715
+ ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
716
+ )
717
+
718
+ # Check GPUS environment variable for allowed devices
719
+ allowed_gpus = os.environ.get("GPUS", "")
720
+ if allowed_gpus:
721
+ logging.info(
722
+ "Action %s: GPU device filter active - allowed devices: %s",
723
+ action_id,
724
+ allowed_gpus
725
+ )
726
+
727
+ # For smaller memory requirements, try to fit on a single GPU first
532
728
  if required_gpu_memory < 80000:
729
+ logging.debug(
730
+ "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
731
+ action_id,
732
+ required_gpu_memory
733
+ )
533
734
  try:
534
- return get_single_gpu_with_sufficient_memory_for_action(action_details)
535
- except ValueError:
536
- pass
735
+ single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
736
+ logging.info(
737
+ "Action %s: Successfully allocated single GPU: %s",
738
+ action_id,
739
+ single_gpu
740
+ )
741
+ return single_gpu
742
+ except ValueError as e:
743
+ logging.debug(
744
+ "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
745
+ action_id,
746
+ str(e)
747
+ )
748
+
749
+ # Multi-GPU allocation: accumulate GPUs until we have enough memory
750
+ logging.info(
751
+ "Action %s: Attempting multi-GPU allocation for %d MB",
752
+ action_id,
753
+ required_gpu_memory
754
+ )
755
+
537
756
  selected_gpus = []
538
757
  total_memory = 0
539
758
  for i, mem in enumerate(memory_free_values):
540
759
  if not is_allowed_gpu_device(i):
760
+ logging.debug(
761
+ "Action %s: Skipping GPU %d - not in allowed device list",
762
+ action_id,
763
+ i
764
+ )
541
765
  continue
542
766
  if total_memory >= required_gpu_memory:
543
767
  break
544
768
  selected_gpus.append(i)
545
769
  total_memory += mem
770
+ logging.debug(
771
+ "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
772
+ action_id,
773
+ i,
774
+ mem,
775
+ total_memory
776
+ )
777
+
546
778
  if total_memory >= required_gpu_memory:
779
+ logging.info(
780
+ "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
781
+ action_id,
782
+ len(selected_gpus),
783
+ selected_gpus,
784
+ total_memory,
785
+ required_gpu_memory
786
+ )
547
787
  return selected_gpus
548
- raise ValueError(
549
- f"Insufficient GPU memory available. Required: {required_gpu_memory}, Available: {total_memory}"
788
+
789
+ error_msg = (
790
+ f"Insufficient GPU memory available. "
791
+ f"Required: {required_gpu_memory}MB, "
792
+ f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
550
793
  )
794
+ logging.error("Action %s: %s", action_id, error_msg)
795
+ raise ValueError(error_msg)
551
796
 
552
797
 
553
- @log_errors(raise_exception=True)
798
+ @log_errors(raise_exception=True, log_error=False)
554
799
  def get_single_gpu_with_sufficient_memory_for_action(
555
800
  action_details: dict,
556
801
  ) -> list:
557
802
  """
558
- Get single GPU with sufficient memory.
803
+ Get single GPU with sufficient memory using best-fit algorithm.
804
+
805
+ Best-fit selects the GPU with the smallest amount of free memory
806
+ that still meets the requirements, minimizing fragmentation.
559
807
 
560
808
  Args:
561
809
  action_details (dict): Action details
@@ -566,30 +814,126 @@ def get_single_gpu_with_sufficient_memory_for_action(
566
814
  Raises:
567
815
  ValueError: If no GPU has sufficient memory
568
816
  """
817
+ action_id = action_details.get("_id", "unknown")
569
818
  required_gpu_memory = get_required_gpu_memory(action_details)
570
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
819
+
820
+ logging.debug(
821
+ "Action %s: Finding best-fit single GPU for %d MB",
822
+ action_id,
823
+ required_gpu_memory
824
+ )
825
+
826
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
571
827
  try:
572
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
828
+ result = subprocess.run(
829
+ command,
830
+ stdout=subprocess.PIPE,
831
+ stderr=subprocess.PIPE,
832
+ timeout=5,
833
+ check=False,
834
+ )
835
+ if result.returncode != 0:
836
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
837
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
573
838
  except subprocess.TimeoutExpired:
574
- logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
839
+ logging.error(
840
+ "Action %s: nvidia-smi timed out in single GPU selection",
841
+ action_id
842
+ )
575
843
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
576
-
844
+ except FileNotFoundError:
845
+ raise ValueError("nvidia-smi not found - no GPU support available")
846
+ except Exception as e:
847
+ logging.error(
848
+ "Action %s: Error running nvidia-smi: %s",
849
+ action_id,
850
+ e
851
+ )
852
+ raise ValueError(f"Failed to get GPU information: {e}")
853
+
577
854
  if len(memory_free_info) < 2:
578
855
  raise ValueError("No GPU information available from nvidia-smi")
579
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
856
+
857
+ try:
858
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
859
+ except (ValueError, IndexError) as e:
860
+ raise ValueError(f"Error parsing GPU memory information: {e}")
861
+
862
+ if not memory_free_values:
863
+ raise ValueError("No GPU devices found")
864
+
865
+ # Best-fit algorithm: find GPU with minimum free memory that meets requirement
580
866
  best_fit_gpu = None
581
867
  best_fit_memory = float("inf")
868
+
582
869
  for i, mem in enumerate(memory_free_values):
870
+ # Check if GPU is in allowed list
583
871
  if not is_allowed_gpu_device(i):
872
+ logging.debug(
873
+ "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
874
+ action_id,
875
+ i,
876
+ mem
877
+ )
584
878
  continue
585
- if mem >= required_gpu_memory and mem < best_fit_memory:
586
- best_fit_gpu = i
587
- best_fit_memory = mem
879
+
880
+ # Check if GPU has sufficient memory
881
+ if mem >= required_gpu_memory:
882
+ logging.debug(
883
+ "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
884
+ action_id,
885
+ i,
886
+ mem,
887
+ required_gpu_memory
888
+ )
889
+
890
+ # Best-fit: choose GPU with smallest sufficient memory
891
+ if mem < best_fit_memory:
892
+ best_fit_gpu = i
893
+ best_fit_memory = mem
894
+ logging.debug(
895
+ "Action %s: GPU %d is new best-fit candidate",
896
+ action_id,
897
+ i
898
+ )
899
+ else:
900
+ logging.debug(
901
+ "Action %s: GPU %d insufficient - %d MB free < %d MB required",
902
+ action_id,
903
+ i,
904
+ mem,
905
+ required_gpu_memory
906
+ )
907
+
588
908
  if best_fit_gpu is not None:
909
+ logging.info(
910
+ "Action %s: Selected best-fit GPU %d with %d MB free (required: %d MB, waste: %d MB)",
911
+ action_id,
912
+ best_fit_gpu,
913
+ best_fit_memory,
914
+ required_gpu_memory,
915
+ best_fit_memory - required_gpu_memory
916
+ )
589
917
  return [best_fit_gpu]
590
- raise ValueError(
591
- f"No single GPU with sufficient memory ({required_gpu_memory}MB) available"
592
- )
918
+
919
+ # No suitable GPU found - provide detailed error
920
+ suitable_gpus = [
921
+ f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
922
+ for i, mem in enumerate(memory_free_values)
923
+ if is_allowed_gpu_device(i)
924
+ ]
925
+
926
+ if not suitable_gpus:
927
+ error_msg = f"No allowed GPUs available (GPUS env filter active)"
928
+ else:
929
+ error_msg = (
930
+ f"No single GPU with sufficient memory. "
931
+ f"Required: {required_gpu_memory}MB. "
932
+ f"Available GPUs: {', '.join(suitable_gpus)}"
933
+ )
934
+
935
+ logging.warning("Action %s: %s", action_id, error_msg)
936
+ raise ValueError(error_msg)
593
937
 
594
938
 
595
939
  @log_errors(default_return=(None, None), raise_exception=False)
@@ -692,47 +1036,112 @@ def get_encrypted_access_key_pair(
692
1036
 
693
1037
  return encoded_access_key, encoded_secret_key
694
1038
 
695
- @log_errors(default_return=False, raise_exception=False)
696
- def check_public_port_exposure(port: int) -> bool:
1039
def _get_private_ip() -> str:
    """
    Return the local address the OS would use for an outbound connection.

    A UDP socket is "connected" to 8.8.8.8:80 and the socket's own address
    is then read back with getsockname().

    Returns:
        str: Local IP address, or None if any step fails
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            # connect() on a datagram socket only fixes the peer address;
            # the local address is what we are after.
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
    except Exception:
        return None
699
1056
 
700
- Args:
701
- port (int): Port number to check
702
1057
 
703
- Returns:
704
- bool: True if port is publicly accessible
705
- """
706
- is_public_exposed = False
707
- is_locally_available = False
708
- # Check if port is publicly accessible
709
- public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
710
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn_sock:
711
- conn_sock.settimeout(3)
712
- result = conn_sock.connect_ex((public_ip, port))
713
- is_public_exposed = result == 0
1058
def _public_ip_is_local(public_ip: str) -> bool:
    """
    Report whether *public_ip* is assigned to one of this host's interfaces.

    Every IPv4 address returned by psutil.net_if_addrs() is compared with
    the given string.

    Args:
        public_ip (str): The public IP to check

    Returns:
        bool: True if some local IPv4 interface address equals public_ip;
            False otherwise, including when the lookup raises
    """
    try:
        return any(
            entry.family == socket.AF_INET and entry.address == public_ip
            for entries in psutil.net_if_addrs().values()
            for entry in entries
        )
    except Exception:
        return False
1078
+
1079
+
1080
@log_errors(default_return=("localhost", True), raise_exception=False)
def get_best_service_ip_and_network(port: int) -> tuple:
    """
    Determine the best IP address and network configuration for a service.

    Priority:
        1. Public IP if it's actually on a local interface (cloud servers)
        2. Private/LAN IP (NAT, local network, Docker)
        3. localhost with --net=host (fallback)

    If the port cannot be bound on 0.0.0.0, the function returns the
    localhost fallback immediately without probing any addresses.

    Args:
        port (int): Port number for the service

    Returns:
        tuple: (ip_address, use_host_network) where:
            - ip_address: The IP address to use (public, private, or localhost)
            - use_host_network: True if should use --net=host, False if should
              use port mapping
    """
    try:
        # Check if port is available (not already in use)
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
                test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                test_sock.bind(("0.0.0.0", port))
                test_sock.listen(1)
                # Port is available - socket closes automatically
        except OSError as e:
            logging.warning(
                "Port %s is already in use or cannot be bound: %s, will use --net=host",
                port,
                e,
            )
            return "localhost", True

        # Get the actual private/LAN IP
        private_ip = _get_private_ip()
        if private_ip:
            logging.info("Determined private/LAN IP: %s", private_ip)
        else:
            logging.debug("Could not determine private IP")

        # Try to get public IP from external service
        public_ip = None
        try:
            # Context manager ensures the HTTP response is closed
            with urllib.request.urlopen("https://ident.me", timeout=10) as response:
                candidate_ip = response.read().decode("utf8").strip()
            # Validate it's a proper IPv4 address; only a validated value is
            # kept, so a malformed response leaves public_ip as None.
            socket.inet_aton(candidate_ip)
            public_ip = candidate_ip
            logging.info("Determined external/public IP: %s", public_ip)
        except Exception as e:
            logging.debug("Could not determine public IP: %s", e)

        # 1. If public IP is on a local interface, use it
        #    (cloud server with real public IP)
        if public_ip and _public_ip_is_local(public_ip):
            logging.info(
                "Public IP %s is on local interface, using it for port %s",
                public_ip,
                port,
            )
            return public_ip, False

        # 2. If we have a valid private IP, use it
        #    (most common case: NAT, LAN, Docker)
        if private_ip and not private_ip.startswith("127."):
            logging.info("Using private/LAN IP %s for port %s", private_ip, port)
            return private_ip, False

        # 3. Fall back to localhost with --net=host
        logging.info(
            "No suitable IP found, using localhost with --net=host for port %s",
            port,
        )
        return "localhost", True

    except Exception as e:
        logging.warning(
            "Error determining best IP for port %s: %s, falling back to localhost",
            port,
            e,
        )
        return "localhost", True