matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +104 -19
- matrice_compute/instance_utils.py +520 -111
- matrice_compute/resources_tracker.py +125 -53
- matrice_compute/scaling.py +658 -406
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/RECORD +9 -9
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/top_level.txt +0 -0
@@ -95,28 +95,72 @@ def get_instance_info(service_provider: str = None, instance_id: str = None) ->
     return str(auto_service_provider), str(auto_instance_id)


+def _normalize_timestamp(timestamp_str: str) -> str:
+    """
+    Normalize timestamp string to handle different precision levels.
+
+    Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
+    and various timezone formats across different cloud providers.
+
+    Args:
+        timestamp_str (str): Timestamp string in various formats
+
+    Returns:
+        str: Normalized timestamp string compatible with fromisoformat()
+    """
+    # Replace 'Z' with '+00:00' for UTC timestamps
+    timestamp_str = timestamp_str.replace("Z", "+00:00")
+
+    # Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
+    # Some providers (like OCI, GCP) may return nanoseconds (9 digits)
+    if "." in timestamp_str:
+        # Split into main part and fractional part
+        if "+" in timestamp_str:
+            main_part, tz_part = timestamp_str.rsplit("+", 1)
+            tz_suffix = "+" + tz_part
+        elif timestamp_str.count("-") > 2:  # Has negative timezone offset
+            main_part, tz_part = timestamp_str.rsplit("-", 1)
+            tz_suffix = "-" + tz_part
+        else:
+            main_part = timestamp_str
+            tz_suffix = ""
+
+        # Split main part into date/time and fractional seconds
+        datetime_part, fractional = main_part.rsplit(".", 1)
+
+        # Truncate fractional seconds to 6 digits (microseconds)
+        if len(fractional) > 6:
+            fractional = fractional[:6]
+
+        # Reconstruct timestamp
+        timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
+
+    return timestamp_str
+
+
 @log_errors(default_return=0, raise_exception=False, log_error=False)
 def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
     """
     Calculate time difference between start and finish times.
+
+    Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
+    and different precision levels (nanoseconds, microseconds, milliseconds).

     Args:
-        start_time_str (str): Start time string
-        finish_time_str (str): Finish time string
+        start_time_str (str): Start time string in ISO format
+        finish_time_str (str): Finish time string in ISO format

     Returns:
         int: Time difference in seconds
     """
-
-
-
-
-
-
-
-
-    start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
-    finish_time = datetime.fromisoformat(finish_time_str.replace("Z", "+00:00"))
+    # Normalize both timestamps to handle different formats
+    normalized_start = _normalize_timestamp(start_time_str)
+    normalized_finish = _normalize_timestamp(finish_time_str)
+
+    # Parse the normalized timestamps
+    start_time = datetime.fromisoformat(normalized_start)
+    finish_time = datetime.fromisoformat(normalized_finish)
+
     return int((finish_time - start_time).total_seconds())

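The normalization above matters because `datetime.fromisoformat()` on Python 3.10 and earlier accepts neither a trailing `Z` nor fractional seconds longer than six digits. A minimal sketch of the intended behavior, using a hypothetical nanosecond-precision timestamp (not taken from the package):

    from datetime import datetime

    # Hypothetical OCI/GCP-style timestamp with 9 fractional digits
    raw = "2024-01-15T10:30:45.123456789Z"
    # What _normalize_timestamp produces: 'Z' -> '+00:00', fraction cut to 6 digits
    normalized = "2024-01-15T10:30:45.123456+00:00"

    print(datetime.fromisoformat(normalized))
    # 2024-01-15 10:30:45.123456+00:00
    # datetime.fromisoformat(raw) would raise ValueError on Python <= 3.10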
@@ -129,14 +173,25 @@ def has_gpu() -> bool:
         bool: True if GPU is present, False otherwise
     """
     try:
-        subprocess.run(
-
+        result = subprocess.run(
+            ["nvidia-smi"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        return result.returncode == 0
     except subprocess.TimeoutExpired:
-        logging.
+        logging.debug("nvidia-smi command timed out after 5 seconds")
+        return False
+    except FileNotFoundError:
+        logging.debug("nvidia-smi not found on this system")
+        return False
+    except Exception:
         return False


-@log_errors(default_return=0, raise_exception=False)
+@log_errors(default_return=0, raise_exception=False, log_error=False)
 def get_gpu_memory_usage() -> float:
     """
     Get GPU memory usage.
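The rewritten `has_gpu` treats a missing, hanging, or failing `nvidia-smi` as "no GPU" rather than an error. On a POSIX system the three outcomes can be sanity-checked in isolation with ordinary commands standing in for `nvidia-smi` (a sketch, not part of the package):

    import subprocess

    def probe(cmd: list) -> bool:
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE, timeout=5, check=False)
            return result.returncode == 0
        except (subprocess.TimeoutExpired, FileNotFoundError):
            return False

    print(probe(["true"]))          # True  - zero exit code
    print(probe(["false"]))         # False - nonzero exit code
    print(probe(["no-such-tool"]))  # False - FileNotFoundError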
@@ -144,17 +199,35 @@ def get_gpu_memory_usage() -> float:
     Returns:
         float: Memory usage between 0 and 1
     """
-    command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
+    command = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
+            return 0
+        output = result.stdout.decode("ascii").strip().split("\n")
         memory_percentages = []
         for line in output:
-
-
-
-
+            if line.strip():
+                used, total = map(int, line.split(","))
+                if total > 0:
+                    usage_percentage = used / total
+                    memory_percentages.append(usage_percentage)
+        return min(memory_percentages) if memory_percentages else 0
     except subprocess.TimeoutExpired:
-        logging.
+        logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+        return 0
+    except (ValueError, IndexError) as e:
+        logging.debug("Error parsing GPU memory info: %s", e)
+        return 0
+    except Exception as e:
+        logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
         return 0

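Note the aggregation choice: `get_gpu_memory_usage` returns the minimum usage fraction across devices, i.e. the load of the least-busy GPU. A sketch of the parsing against hypothetical `nvidia-smi` CSV output:

    # Hypothetical output of:
    #   nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader
    output = ["8192, 16384", "1024, 16384"]

    memory_percentages = []
    for line in output:
        if line.strip():
            used, total = map(int, line.split(","))  # int() tolerates the space
            if total > 0:
                memory_percentages.append(used / total)

    print(min(memory_percentages) if memory_percentages else 0)  # 0.0625 (GPU 1)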
@@ -194,7 +267,7 @@ def get_mem_usage() -> float:
     return mem_usage


-@log_errors(default_return=[], raise_exception=False)
+@log_errors(default_return=[], raise_exception=False, log_error=False)
 def get_gpu_info() -> list:
     """
     Get GPU information.
@@ -202,23 +275,34 @@ def get_gpu_info() -> list:
     Returns:
         list: GPU information strings
     """
-    proc = subprocess.Popen(
-        [
-            "nvidia-smi",
-            "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
-            "--format=csv,noheader,nounits",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
     try:
-
-
-
-
-
-
-
+        proc = subprocess.Popen(
+            [
+                "nvidia-smi",
+                "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
+                "--format=csv,noheader,nounits",
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=5)
+            if proc.returncode != 0:
+                logging.debug("nvidia-smi command failed in get_gpu_info")
+                return []
+            output = stdout.decode("UTF-8")
+            result = [line for line in output.split("\n") if line.strip()]
+            return result
+        except subprocess.TimeoutExpired:
+            logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
+            proc.kill()
+            proc.communicate()  # flush output after kill
+            return []
+    except FileNotFoundError:
+        logging.debug("nvidia-smi not found on this system")
+        return []
+    except Exception as e:
+        logging.debug("Error getting GPU info: %s", e)
         return []

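The timeout handler follows the pattern recommended in the `subprocess` documentation: after `communicate(timeout=...)` expires, the child is still alive, so it must be killed and then reaped with a second `communicate()` call. A standalone sketch with `sleep` standing in for a hung `nvidia-smi`:

    import subprocess

    proc = subprocess.Popen(["sleep", "60"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        stdout, stderr = proc.communicate(timeout=1)
    except subprocess.TimeoutExpired:
        proc.kill()         # the child is still running; terminate it
        proc.communicate()  # reap it and drain the pipes
        print("timed out, cleaned up")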
@@ -241,11 +325,29 @@ def is_docker_running() -> bool:
     Returns:
         bool: True if Docker containers are running
     """
-    command = "docker ps"
-
-    subprocess.
-
-
+    command = ["docker", "ps"]
+    try:
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+            timeout=10,
+        )
+        if result.returncode != 0:
+            logging.warning("docker ps command failed")
+            return False
+        docker_images = result.stdout.decode("ascii").split("\n")[:-1][1:]
+        return bool(docker_images)
+    except subprocess.TimeoutExpired:
+        logging.warning("docker ps command timed out")
+        return False
+    except FileNotFoundError:
+        logging.warning("docker command not found")
+        return False
+    except Exception as e:
+        logging.warning("Error checking if docker is running: %s", e)
+        return False


 @log_errors(default_return=None, raise_exception=False)
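Two details are worth noting in this rewrite. First, the switch from the string `"docker ps"` to the list `["docker", "ps"]`: without `shell=True`, `subprocess.run` treats a plain string as a single executable name, so the list form is required. Second, the slicing `[:-1][1:]` drops the empty element left after the final newline, then the header row. A sketch on hypothetical output:

    # Hypothetical `docker ps` stdout: header row, one container, trailing newline
    stdout = b"CONTAINER ID  IMAGE  COMMAND\nabc123  nginx  nginx -g\n"

    lines = stdout.decode("ascii").split("\n")  # [header, 'abc123...', '']
    docker_images = lines[:-1][1:]              # drop trailing '' and the header
    print(bool(docker_images))                  # True - at least one container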
@@ -487,22 +589,45 @@ def get_required_gpu_memory(action_details: dict) -> int:

 @log_errors(default_return=True, raise_exception=False)
 def is_allowed_gpu_device(gpu_index: int) -> bool:
-    """Check if GPU device is allowed.
+    """Check if GPU device is allowed based on GPUS environment variable.
+
+    The GPUS environment variable can be used to restrict which GPU devices
+    are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).

     Args:
         gpu_index (int): GPU device index

     Returns:
-        bool: True if GPU is allowed
+        bool: True if GPU is allowed (or no filter is set), False otherwise
     """
     gpus = os.environ.get("GPUS")
     if not gpus:
+        # No filter set - all GPUs are allowed
+        return True
+
+    try:
+        allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
+        is_allowed = int(gpu_index) in allowed_gpus
+
+        if not is_allowed:
+            logging.debug(
+                "GPU %d is not in allowed GPU list: %s",
+                gpu_index,
+                allowed_gpus
+            )
+
+        return is_allowed
+
+    except ValueError as e:
+        logging.warning(
+            "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
+            gpus,
+            e
+        )
         return True
-    allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
-    return int(gpu_index) in allowed_gpus


-@log_errors(raise_exception=True)
+@log_errors(raise_exception=True, log_error=False)
 def get_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
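A quick illustration of the filter semantics, with the same parsing logic inlined and logging omitted (hypothetical values; note that a malformed filter deliberately fails open):

    def allowed(gpu_index: int, gpus_env: str) -> bool:
        if not gpus_env:
            return True
        try:
            allowed_gpus = [int(x) for x in gpus_env.split(",") if x.strip()]
            return int(gpu_index) in allowed_gpus
        except ValueError:
            return True  # malformed filter allows all GPUs

    print(allowed(1, ""))        # True  - no filter set
    print(allowed(1, "0,2"))     # False - not in the allow-list
    print(allowed(2, "0,2"))     # True
    print(allowed(1, "0,oops"))  # True  - invalid filter falls back to allow-all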
@@ -518,44 +643,167 @@ def get_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If insufficient GPU memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
-
+
+    logging.info(
+        "Action %s: Searching for GPU(s) with %d MB available memory",
+        action_id,
+        required_gpu_memory
+    )
+
+    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            error_msg = f"nvidia-smi command failed with return code {result.returncode}"
+            logging.error("Action %s: %s", action_id, error_msg)
+            raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+        memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.error(
+        logging.error(
+            "Action %s: nvidia-smi command timed out after 5 seconds",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
-
+    except FileNotFoundError:
+        logging.error(
+            "Action %s: nvidia-smi not found on this system",
+            action_id
+        )
+        raise ValueError("nvidia-smi not found - no GPU support available")
+    except Exception as e:
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
+        raise ValueError(f"Failed to get GPU information: {e}")
+
     if len(memory_free_info) < 2:
+        logging.error(
+            "Action %s: No GPU information available from nvidia-smi output",
+            action_id
+        )
         raise ValueError("No GPU information available from nvidia-smi")
-
+
+    try:
+        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+    except (ValueError, IndexError) as e:
+        logging.error(
+            "Action %s: Error parsing GPU memory information: %s",
+            action_id,
+            e
+        )
+        raise ValueError(f"Error parsing GPU memory information: {e}")
+
+    if not memory_free_values:
+        logging.error("Action %s: No GPU devices found", action_id)
+        raise ValueError("No GPU devices found")
+
+    # Log all available GPUs and their free memory
+    logging.info(
+        "Action %s: Found %d GPU(s) - Free memory: %s",
+        action_id,
+        len(memory_free_values),
+        ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
+    )
+
+    # Check GPUS environment variable for allowed devices
+    allowed_gpus = os.environ.get("GPUS", "")
+    if allowed_gpus:
+        logging.info(
+            "Action %s: GPU device filter active - allowed devices: %s",
+            action_id,
+            allowed_gpus
+        )
+
+    # For smaller memory requirements, try to fit on a single GPU first
     if required_gpu_memory < 80000:
+        logging.debug(
+            "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
+            action_id,
+            required_gpu_memory
+        )
         try:
-
-
-
+            single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
+            logging.info(
+                "Action %s: Successfully allocated single GPU: %s",
+                action_id,
+                single_gpu
+            )
+            return single_gpu
+        except ValueError as e:
+            logging.debug(
+                "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
+                action_id,
+                str(e)
+            )
+
+    # Multi-GPU allocation: accumulate GPUs until we have enough memory
+    logging.info(
+        "Action %s: Attempting multi-GPU allocation for %d MB",
+        action_id,
+        required_gpu_memory
+    )
+
     selected_gpus = []
     total_memory = 0
     for i, mem in enumerate(memory_free_values):
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d - not in allowed device list",
+                action_id,
+                i
+            )
             continue
         if total_memory >= required_gpu_memory:
             break
         selected_gpus.append(i)
         total_memory += mem
+        logging.debug(
+            "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
+            action_id,
+            i,
+            mem,
+            total_memory
+        )
+
     if total_memory >= required_gpu_memory:
+        logging.info(
+            "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
+            action_id,
+            len(selected_gpus),
+            selected_gpus,
+            total_memory,
+            required_gpu_memory
+        )
         return selected_gpus
-
-
+
+    error_msg = (
+        f"Insufficient GPU memory available. "
+        f"Required: {required_gpu_memory}MB, "
+        f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
     )
+    logging.error("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)


-@log_errors(raise_exception=True)
+@log_errors(raise_exception=True, log_error=False)
 def get_single_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
     """
-    Get single GPU with sufficient memory.
+    Get single GPU with sufficient memory using best-fit algorithm.
+
+    Best-fit selects the GPU with the smallest amount of free memory
+    that still meets the requirements, minimizing fragmentation.

     Args:
         action_details (dict): Action details
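The multi-GPU fallback above is a greedy accumulation in index order: allowed GPUs are appended until their combined free memory covers the request. A worked example with hypothetical free-memory values (allow-list check and logging omitted):

    memory_free_values = [24000, 8000, 40000]  # hypothetical MB free per GPU
    required_gpu_memory = 60000

    selected_gpus, total_memory = [], 0
    for i, mem in enumerate(memory_free_values):
        if total_memory >= required_gpu_memory:
            break
        selected_gpus.append(i)
        total_memory += mem

    print(selected_gpus, total_memory)  # [0, 1, 2] 72000 - request satisfied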
@@ -566,30 +814,126 @@ def get_single_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If no GPU has sufficient memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
-
+
+    logging.debug(
+        "Action %s: Finding best-fit single GPU for %d MB",
+        action_id,
+        required_gpu_memory
+    )
+
+    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+        memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.error(
+        logging.error(
+            "Action %s: nvidia-smi timed out in single GPU selection",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
-
+    except FileNotFoundError:
+        raise ValueError("nvidia-smi not found - no GPU support available")
+    except Exception as e:
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
+        raise ValueError(f"Failed to get GPU information: {e}")
+
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
-
+
+    try:
+        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+    except (ValueError, IndexError) as e:
+        raise ValueError(f"Error parsing GPU memory information: {e}")
+
+    if not memory_free_values:
+        raise ValueError("No GPU devices found")
+
+    # Best-fit algorithm: find GPU with minimum free memory that meets requirement
     best_fit_gpu = None
     best_fit_memory = float("inf")
+
     for i, mem in enumerate(memory_free_values):
+        # Check if GPU is in allowed list
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
+                action_id,
+                i,
+                mem
+            )
             continue
-
-
-
+
+        # Check if GPU has sufficient memory
+        if mem >= required_gpu_memory:
+            logging.debug(
+                "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
+
+            # Best-fit: choose GPU with smallest sufficient memory
+            if mem < best_fit_memory:
+                best_fit_gpu = i
+                best_fit_memory = mem
+                logging.debug(
+                    "Action %s: GPU %d is new best-fit candidate",
+                    action_id,
+                    i
+                )
+        else:
+            logging.debug(
+                "Action %s: GPU %d insufficient - %d MB free < %d MB required",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
+
     if best_fit_gpu is not None:
+        logging.info(
+            "Action %s: Selected best-fit GPU %d with %d MB free (required: %d MB, waste: %d MB)",
+            action_id,
+            best_fit_gpu,
+            best_fit_memory,
+            required_gpu_memory,
+            best_fit_memory - required_gpu_memory
+        )
         return [best_fit_gpu]
-
-
-
+
+    # No suitable GPU found - provide detailed error
+    suitable_gpus = [
+        f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
+        for i, mem in enumerate(memory_free_values)
+        if is_allowed_gpu_device(i)
+    ]
+
+    if not suitable_gpus:
+        error_msg = f"No allowed GPUs available (GPUS env filter active)"
+    else:
+        error_msg = (
+            f"No single GPU with sufficient memory. "
+            f"Required: {required_gpu_memory}MB. "
+            f"Available GPUs: {', '.join(suitable_gpus)}"
+        )
+
+    logging.warning("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)


 @log_errors(default_return=(None, None), raise_exception=False)
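Best-fit keeps the candidate with the least free memory that still satisfies the request, so larger GPUs stay free for larger jobs. A worked example with hypothetical values (allow-list check and logging omitted):

    memory_free_values = [24000, 12000, 80000]  # hypothetical MB free per GPU
    required_gpu_memory = 10000

    best_fit_gpu, best_fit_memory = None, float("inf")
    for i, mem in enumerate(memory_free_values):
        if mem >= required_gpu_memory and mem < best_fit_memory:
            best_fit_gpu, best_fit_memory = i, mem

    print(best_fit_gpu, best_fit_memory)  # 1 12000 - smallest GPU that fits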
@@ -692,47 +1036,112 @@ def get_encrypted_access_key_pair(

     return encoded_access_key, encoded_secret_key

-
-def check_public_port_exposure(port: int) -> bool:
+def _get_private_ip() -> str:
     """
-
+    Get the actual private/LAN IP address using UDP socket trick.
+    This works reliably even in Docker, NAT, VPN, etc.
+
+    Returns:
+        str: Private IP address or None if not available
+    """
+    try:
+        # Use UDP socket to determine which interface would be used for external connection
+        # No actual packets are sent
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))
+            private_ip = s.getsockname()[0]
+            return private_ip
+    except Exception:
+        return None

-    Args:
-        port (int): Port number to check

-
-
-
-
-    is_locally_available = False
-    # Check if port is publicly accessible
-    public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn_sock:
-        conn_sock.settimeout(3)
-        result = conn_sock.connect_ex((public_ip, port))
-        is_public_exposed = result == 0
+def _public_ip_is_local(public_ip: str) -> bool:
+    """
+    Check if a public IP address is actually assigned to a local network interface.
+    This is true on cloud servers with real public IPs, false behind NAT.

-
-
-
-
-
-
-
-
-
-
-
-
-        logging.debug(
-            "Port %d is not publicly exposed",
-            port,
-        )
+    Args:
+        public_ip (str): The public IP to check
+
+    Returns:
+        bool: True if the public IP is on a local interface
+    """
+    try:
+        for iface, addrs in psutil.net_if_addrs().items():
+            for addr in addrs:
+                if addr.family == socket.AF_INET:
+                    if addr.address == public_ip:
+                        return True
         return False
-
-        logging.debug(
-            "Port %d is not locally available",
-            port,
-        )
+    except Exception:
         return False
-
+
+
+@log_errors(default_return=("localhost", True), raise_exception=False)
+def get_best_service_ip_and_network(port: int) -> tuple:
+    """
+    Determine the best IP address and network configuration for a service.
+
+    This function intelligently selects the best IP to bind a service to:
+
+    Priority:
+    1. Public IP if it's actually on a local interface (cloud servers)
+    2. Private/LAN IP (NAT, local network, Docker)
+    3. localhost with --net=host (fallback)
+
+    Args:
+        port (int): Port number for the service
+
+    Returns:
+        tuple: (ip_address, use_host_network) where:
+            - ip_address: The IP address to use (public, private, or localhost)
+            - use_host_network: True if should use --net=host, False if should use port mapping
+    """
+    try:
+        # Check if port is available (not already in use)
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
+                test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                test_sock.bind(("0.0.0.0", port))
+                test_sock.listen(1)
+                # Port is available - socket closes automatically
+        except OSError as e:
+            logging.warning(f"Port {port} is already in use or cannot be bound: {e}, will use --net=host")
+            return "localhost", True
+
+        # Get the actual private/LAN IP
+        private_ip = _get_private_ip()
+        if private_ip:
+            logging.info(f"Determined private/LAN IP: {private_ip}")
+        else:
+            logging.debug("Could not determine private IP")
+
+        # Try to get public IP from external service
+        public_ip = None
+        try:
+            public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8").strip()
+            # Validate it's a proper IP address
+            socket.inet_aton(public_ip)
+            logging.info(f"Determined external/public IP: {public_ip}")
+        except Exception as e:
+            logging.debug(f"Could not determine public IP: {e}")
+
+        # Decision logic: Choose the best IP
+
+        # 1. If public IP is on a local interface, use it (cloud server with real public IP)
+        if public_ip and _public_ip_is_local(public_ip):
+            logging.info(f"Public IP {public_ip} is on local interface, using it for port {port}")
+            return public_ip, False
+
+        # 2. If we have a valid private IP, use it (most common case: NAT, LAN, Docker)
+        if private_ip and not private_ip.startswith("127."):
+            logging.info(f"Using private/LAN IP {private_ip} for port {port}")
+            return private_ip, False
+
+        # 3. Fall back to localhost with --net=host
+        logging.info(f"No suitable IP found, using localhost with --net=host for port {port}")
+        return "localhost", True
+
+    except Exception as e:
+        logging.warning(f"Error determining best IP for port {port}: {e}, falling back to localhost")
+        return "localhost", True