matrice-compute 0.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +20 -0
- matrice_compute/action_instance.py +2023 -0
- matrice_compute/actions_manager.py +467 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/compute_operations_handler.py +490 -0
- matrice_compute/instance_manager.py +470 -0
- matrice_compute/instance_utils.py +1266 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +842 -0
- matrice_compute/scaling.py +1395 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.29.dist-info/METADATA +28 -0
- matrice_compute-0.1.29.dist-info/RECORD +18 -0
- matrice_compute-0.1.29.dist-info/WHEEL +5 -0
- matrice_compute-0.1.29.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.29.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1266 @@
|
|
|
1
|
+
"""Module providing instance utilities functionality."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import socket
|
|
5
|
+
import urllib.request
|
|
6
|
+
import subprocess
|
|
7
|
+
import logging
|
|
8
|
+
import base64
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
import psutil
|
|
11
|
+
from cryptography.hazmat.primitives.ciphers import (
|
|
12
|
+
Cipher,
|
|
13
|
+
algorithms,
|
|
14
|
+
modes,
|
|
15
|
+
)
|
|
16
|
+
from cryptography.hazmat.backends import default_backend
|
|
17
|
+
from matrice_common.utils import log_errors
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_instance_info(service_provider: str = None, instance_id: str = None) -> tuple:
    """
    Get instance provider and ID information.

    Probes the metadata endpoints of GCP, Azure, OCI and AWS in turn; the
    last provider whose endpoint responds wins. When no endpoint answers,
    falls back to the explicit arguments, then the SERVICE_PROVIDER /
    INSTANCE_ID environment variables, then ("LOCAL", "").

    Args:
        service_provider (str): Optional explicit provider override.
        instance_id (str): Optional explicit instance ID override.

    Returns:
        tuple: (service_provider, instance_id) strings
    """

    def _probe(command: str):
        """Run a metadata curl command; return stripped stdout, or None on failure.

        check=True makes a non-zero exit raise CalledProcessError, so no
        separate returncode test is needed (the former `returncode == 0`
        checks were always true). OSError covers a missing shell binary.
        """
        try:
            completed = subprocess.run(
                command,
                shell=True,  # fixed command strings only - no untrusted input
                capture_output=True,
                check=True,
            )
        except (subprocess.CalledProcessError, OSError):
            return None
        return completed.stdout.decode().strip()

    auto_service_provider = service_provider or os.environ.get("SERVICE_PROVIDER") or "LOCAL"
    auto_instance_id = instance_id or os.environ.get("INSTANCE_ID") or ""

    gcp_id = _probe(
        "curl -s -m 1 -H 'Metadata-Flavor: Google' 'http://metadata.google.internal/computeMetadata/v1/instance/id'"
    )
    if gcp_id is not None:
        auto_service_provider = "GCP"
        auto_instance_id = gcp_id

    azure_check = _probe(
        "curl -s -m 1 -H Metadata:true 'http://169.254.169.254/metadata/instance?api-version=2020-09-01'"
    )
    if azure_check is not None:
        auto_service_provider = "AZURE"
        azure_id = _probe(
            "curl -s -H Metadata:true 'http://169.254.169.254/metadata/instance/compute/vmId?api-version=2017-08-01&format=text'"
        )
        if azure_id is not None:
            auto_instance_id = azure_id

    oci_check = _probe(
        "curl -s -m 1 -H 'Authorization: Bearer OracleCloud' 'http://169.254.169.254/opc/v1/instance/'"
    )
    if oci_check is not None:
        auto_service_provider = "OCI"
        oci_id = _probe("curl -s http://169.254.169.254/opc/v1/instance/id")
        if oci_id is not None:
            auto_instance_id = oci_id

    aws_check = _probe("curl -s -m 1 http://169.254.169.254/latest/meta-data/")
    if aws_check is not None:
        auto_service_provider = "AWS"
        aws_id = _probe("curl -s http://169.254.169.254/latest/meta-data/instance-id")
        if aws_id is not None:
            auto_instance_id = aws_id

    return str(auto_service_provider), str(auto_instance_id)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _normalize_timestamp(timestamp_str: str) -> str:
|
|
99
|
+
"""
|
|
100
|
+
Normalize timestamp string to handle different precision levels.
|
|
101
|
+
|
|
102
|
+
Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
|
|
103
|
+
and various timezone formats across different cloud providers.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
timestamp_str (str): Timestamp string in various formats
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
str: Normalized timestamp string compatible with fromisoformat()
|
|
110
|
+
"""
|
|
111
|
+
# Replace 'Z' with '+00:00' for UTC timestamps
|
|
112
|
+
timestamp_str = timestamp_str.replace("Z", "+00:00")
|
|
113
|
+
|
|
114
|
+
# Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
|
|
115
|
+
# Some providers (like OCI, GCP) may return nanoseconds (9 digits)
|
|
116
|
+
if "." in timestamp_str:
|
|
117
|
+
# Split into main part and fractional part
|
|
118
|
+
if "+" in timestamp_str:
|
|
119
|
+
main_part, tz_part = timestamp_str.rsplit("+", 1)
|
|
120
|
+
tz_suffix = "+" + tz_part
|
|
121
|
+
elif timestamp_str.count("-") > 2: # Has negative timezone offset
|
|
122
|
+
main_part, tz_part = timestamp_str.rsplit("-", 1)
|
|
123
|
+
tz_suffix = "-" + tz_part
|
|
124
|
+
else:
|
|
125
|
+
main_part = timestamp_str
|
|
126
|
+
tz_suffix = ""
|
|
127
|
+
|
|
128
|
+
# Split main part into date/time and fractional seconds
|
|
129
|
+
datetime_part, fractional = main_part.rsplit(".", 1)
|
|
130
|
+
|
|
131
|
+
# Truncate fractional seconds to 6 digits (microseconds)
|
|
132
|
+
if len(fractional) > 6:
|
|
133
|
+
fractional = fractional[:6]
|
|
134
|
+
|
|
135
|
+
# Reconstruct timestamp
|
|
136
|
+
timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
|
|
137
|
+
|
|
138
|
+
return timestamp_str
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@log_errors(default_return=0, raise_exception=False, log_error=False)
def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
    """
    Calculate time difference between start and finish times.

    Robust handling of timestamps from different cloud providers
    (AWS, GCP, Azure, OCI) and different precision levels
    (nanoseconds, microseconds, milliseconds).

    Args:
        start_time_str (str): Start time string in ISO format
        finish_time_str (str): Finish time string in ISO format

    Returns:
        int: Time difference in seconds (0 on parse failure, via decorator)
    """
    # Normalize first: providers emit differing precisions and tz notations.
    start_dt = datetime.fromisoformat(_normalize_timestamp(start_time_str))
    finish_dt = datetime.fromisoformat(_normalize_timestamp(finish_time_str))
    elapsed = finish_dt - start_dt
    return int(elapsed.total_seconds())
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@log_errors(default_return=False, raise_exception=False, log_error=False)
def has_gpu() -> bool:
    """
    Check if the system has a GPU.

    Returns:
        bool: True if `nvidia-smi` runs and exits cleanly, False otherwise
    """
    try:
        completed = subprocess.run(
            ["nvidia-smi"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=5,
            check=False,
        )
    except subprocess.TimeoutExpired:
        logging.debug("nvidia-smi command timed out after 5 seconds")
        return False
    except FileNotFoundError:
        logging.debug("nvidia-smi not found on this system")
        return False
    except Exception:
        # Any other failure is treated the same as "no GPU".
        return False
    return completed.returncode == 0
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@log_errors(default_return=0, raise_exception=False, log_error=False)
def get_gpu_memory_usage() -> float:
    """
    Get GPU memory usage percentage.

    Queries used/total memory per GPU via nvidia-smi and reports the
    smallest per-GPU usage ratio (the least-loaded GPU).

    Returns:
        float: Memory usage between 0 and 1
    """
    query = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
    try:
        completed = subprocess.run(
            query,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=5,
            check=False,
        )
        if completed.returncode != 0:
            logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
            return 0
        ratios = []
        for row in completed.stdout.decode("ascii").strip().split("\n"):
            if not row.strip():
                continue
            used_mb, total_mb = (int(field) for field in row.split(","))
            if total_mb > 0:
                ratios.append(used_mb / total_mb)
        return min(ratios, default=0)
    except subprocess.TimeoutExpired:
        logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
        return 0
    except (ValueError, IndexError) as e:
        logging.debug("Error parsing GPU memory info: %s", e)
        return 0
    except Exception as e:
        logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
        return 0
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
@log_errors(default_return=0, raise_exception=False)
def get_cpu_memory_usage() -> float:
    """
    Get CPU memory usage.

    Returns:
        float: Memory usage between 0 and 1
    """
    # psutil reports percent (0-100); scale to a 0-1 fraction.
    return psutil.virtual_memory().percent / 100
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
@log_errors(default_return=0, raise_exception=False)
def get_mem_usage() -> float:
    """
    Get memory usage for either GPU or CPU.

    Prefers GPU memory usage when a GPU is present; on GPU query failure
    (or when no GPU exists) falls back to CPU memory usage.

    Returns:
        float: Memory usage between 0 and 1
    """
    if not has_gpu():
        usage = get_cpu_memory_usage()
    else:
        try:
            usage = get_gpu_memory_usage()
        except Exception as err:
            logging.error(
                "Error getting GPU memory usage: %s",
                err,
            )
            usage = get_cpu_memory_usage()
    # Guard against helpers yielding None so callers always get a number.
    return 0 if usage is None else usage
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@log_errors(default_return=[], raise_exception=False, log_error=False)
def get_gpu_info() -> list:
    """
    Get GPU information.

    Returns:
        list: one CSV line per GPU (index, uuid, utilization, memory stats,
        driver version, name, serial, display state, temperature)
    """
    query = [
        "nvidia-smi",
        "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
        "--format=csv,noheader,nounits",
    ]
    try:
        proc = subprocess.Popen(query, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        try:
            stdout, _stderr = proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
            proc.kill()
            # Reap the killed process and drain its pipes.
            proc.communicate()
            return []
        if proc.returncode != 0:
            logging.debug("nvidia-smi command failed in get_gpu_info")
            return []
        return [row for row in stdout.decode("UTF-8").split("\n") if row.strip()]
    except FileNotFoundError:
        logging.debug("nvidia-smi not found on this system")
        return []
    except Exception as e:
        logging.debug("Error getting GPU info: %s", e)
        return []
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
@log_errors(default_return="", raise_exception=False)
def get_instance_id() -> str:
    """
    Get instance ID.

    Reads the INSTANCE_ID environment variable directly; when it is unset,
    the resulting KeyError is absorbed by the log_errors decorator, which
    returns the default "".

    Returns:
        str: Instance ID or empty string
    """
    return os.environ["INSTANCE_ID"]
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
@log_errors(default_return=False, raise_exception=False, log_error=False)
def is_docker_running() -> bool:
    """
    Check if Docker is running.

    Returns:
        bool: True if Docker containers are running
    """
    try:
        completed = subprocess.run(
            ["docker", "ps"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
            timeout=10,
        )
    except subprocess.TimeoutExpired:
        logging.warning("docker ps command timed out")
        return False
    except FileNotFoundError:
        logging.warning("docker command not found")
        return False
    except Exception as e:
        logging.warning("Error checking if docker is running: %s", e)
        return False
    if completed.returncode != 0:
        logging.warning("docker ps command failed")
        return False
    # Drop the trailing empty entry and the header row; anything
    # remaining is a running container line.
    container_rows = completed.stdout.decode("ascii").split("\n")[:-1][1:]
    return bool(container_rows)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
@log_errors(default_return=None, raise_exception=False)
def prune_docker_images() -> None:
    """Prune Docker images.

    Runs `docker image prune -a -f`; a non-zero exit raises
    CalledProcessError, which the log_errors decorator absorbs.
    """
    subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True)
    logging.info("Docker images pruned successfully.")
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
@log_errors(default_return=0.0, raise_exception=False)
def _normalize_disk_usage_to_gb(disk_space: str) -> float:
    """
    Normalize disk usage to GB.

    Args:
        disk_space (str): Disk space figure with an optional K/M/G/T
            suffix, as printed by `df -h`

    Returns:
        float: Disk space in GB
    """
    # Multiplier converting each df unit suffix into gigabytes.
    # All factors are exact powers of two, so results match plain division.
    unit_factors = {
        "G": 1.0,
        "T": 1024.0,
        "M": 1.0 / 1024,
        "K": 1.0 / (1024 * 1024),
    }
    suffix = disk_space[-1:]
    if suffix in unit_factors:
        result = float(disk_space[:-1]) * unit_factors[suffix]
    else:
        # No recognized suffix: value is assumed to be GB already.
        result = float(disk_space)
    logging.debug(
        "Normalized disk space value to %f GB",
        result,
    )
    return result
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
@log_errors(default_return=None, raise_exception=False)
def _parse_disk_usage_info(line: str) -> dict:
    """
    Parse disk usage information.

    Args:
        line (str): Disk usage line from df command

    Returns:
        dict: Parsed disk usage information (sizes in GB, usage in percent),
        or None (via decorator) when the line cannot be parsed
    """
    fields = line.split()
    # df -h column order: filesystem, size, used, available, use%, mount.
    parsed_info = {
        "filesystem": fields[0],
        "size": _normalize_disk_usage_to_gb(fields[1]),
        "used": _normalize_disk_usage_to_gb(fields[2]),
        "available": _normalize_disk_usage_to_gb(fields[3]),
        "use_percentage": float(fields[4].rstrip("%")),
        "mounted_on": fields[5],
    }
    logging.debug(
        "Successfully parsed disk usage info: %s",
        parsed_info,
    )
    return parsed_info
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
@log_errors(default_return=None, raise_exception=False)
def get_disk_space_usage() -> list:
    """
    Get disk space usage for all filesystems.

    Returns:
        list: List of disk usage information dictionaries
    """
    logging.info("Getting disk space usage information")
    df_result = subprocess.run(
        ["df", "-h"],
        capture_output=True,
        text=True,
        check=True,
    )
    # Skip the header row; parse each filesystem line, dropping failures
    # (the parser returns None for unparseable rows).
    data_rows = df_result.stdout.strip().split("\n")[1:]
    disk_usage = [entry for entry in map(_parse_disk_usage_info, data_rows) if entry]
    logging.info(
        "Found disk usage info for %d filesystems",
        len(disk_usage),
    )
    return disk_usage
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
@log_errors(default_return=None, raise_exception=False)
def get_max_file_system() -> str:
    """
    Get filesystem with maximum available space.

    Scans `df` output, ignoring /boot/efi, overlay filesystems, and mounts
    with no free space, and picks the mount point with the most available
    space. If that mount is "/", unknown, or not writable, falls back to
    creating a workspace directory under the user's home (requires the
    WORKSPACE_DIR environment variable to be set).

    Returns:
        str: Path to filesystem with most space or None
    """
    logging.info("Finding filesystem with maximum available space")
    disk_usage = get_disk_space_usage()
    if not disk_usage:
        logging.warning("No disk usage information available")
        return None
    # Exclude pseudo/virtual mounts that should never hold workspace data.
    filtered_disks = [
        disk
        for disk in disk_usage
        if disk["mounted_on"] != "/boot/efi"
        and "overlay" not in disk["filesystem"]
        and disk["available"] > 0
    ]
    if not filtered_disks:
        logging.warning("No suitable filesystems found after filtering")
        # Empty string triggers the home-directory fallback below.
        max_available_filesystem = ""
    else:
        max_disk = max(
            filtered_disks,
            key=lambda x: x["available"],
        )
        max_available_filesystem = max_disk["mounted_on"]
        logging.info(
            "Found filesystem with maximum space: %s (%f GB available)",
            max_available_filesystem,
            max_disk["available"],
        )
    # Check if filesystem is writable, or if it's root/empty
    if max_available_filesystem in ["/", ""] or not os.access(max_available_filesystem, os.W_OK):
        if max_available_filesystem not in ["/", ""]:
            logging.warning(
                "Filesystem %s is not writable, falling back to home directory",
                max_available_filesystem,
            )
        home_dir = os.path.expanduser("~")
        if not os.environ.get("WORKSPACE_DIR"):
            logging.error("WORKSPACE_DIR environment variable not set")
            return None
        workspace_dir = os.path.join(
            home_dir,
            os.environ["WORKSPACE_DIR"],
        )
        os.makedirs(workspace_dir, exist_ok=True)
        logging.info(
            "Created workspace directory at: %s",
            workspace_dir,
        )
        return workspace_dir
    return max_available_filesystem
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
@log_errors(default_return=None, raise_exception=False)
def get_docker_disk_space_usage() -> dict:
    """
    Get disk space usage for Docker storage.

    Locates the Docker root directory via `docker info`, then parses
    `df -h` output for that path.

    Returns:
        dict: Docker disk usage information (see _parse_disk_usage_info),
        or None (via the log_errors decorator) on failure

    Raises:
        ValueError: If the Docker root directory or its disk usage cannot
            be determined (absorbed by the decorator).
    """
    result = subprocess.run(
        ["docker", "info"],
        capture_output=True,
        text=True,
        check=True,
    )
    docker_info = result.stdout
    docker_root_dir = None
    for line in docker_info.split("\n"):
        if line.strip().startswith("Docker Root Dir"):
            # Split on the first colon only so paths containing ':' are
            # not truncated (plain split(":") dropped everything after
            # the second colon).
            docker_root_dir = line.split(":", 1)[1].strip()
            break
    if docker_root_dir is None:
        logging.error("Unable to find Docker root directory")
        raise ValueError("Unable to find Docker root directory")
    logging.debug(
        "Found Docker root directory: %s",
        docker_root_dir,
    )
    result = subprocess.run(
        ["df", "-h", docker_root_dir],
        capture_output=True,
        text=True,
        check=True,
    )
    lines = result.stdout.strip().split("\n")[1:]
    if not lines:
        logging.error("No disk usage information found for Docker root directory")
        raise ValueError("No disk usage information found for Docker root directory")
    docker_disk_usage = _parse_disk_usage_info(lines[0])
    if docker_disk_usage is None:
        logging.error("Failed to parse Docker disk usage information")
        raise ValueError("Failed to parse Docker disk usage information")
    logging.info(
        "Successfully retrieved Docker disk usage: %s",
        docker_disk_usage,
    )
    return docker_disk_usage
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
@log_errors(raise_exception=False)
def cleanup_docker_storage() -> None:
    """Clean up Docker storage if space is low.

    Prunes all unused images when the Docker filesystem is at least 90%
    used or has 30 GB or less available.
    """
    usage = get_docker_disk_space_usage()
    if usage is None:
        logging.error("Failed to get Docker disk space usage, skipping cleanup")
        return
    space_is_low = usage["use_percentage"] >= 90 or usage["available"] <= 30
    if space_is_low:
        logging.info(
            "Pruning Docker images. Disk space is low: %s",
            usage,
        )
        prune_docker_images()
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
@log_errors(default_return=0, raise_exception=False)
def get_required_gpu_memory(action_details: dict) -> int:
    """
    Get required GPU memory from action details.

    Args:
        action_details (dict): Action details

    Returns:
        int: Required GPU memory in MB, or 0 when the
        actionDetails.expectedResources.gpuMemory key path is absent
    """
    try:
        expected_resources = action_details["actionDetails"]["expectedResources"]
        return expected_resources["gpuMemory"]
    except KeyError:
        # Any missing key along the path means no GPU memory requirement.
        return 0
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
@log_errors(default_return=True, raise_exception=False)
def is_allowed_gpu_device(gpu_index: int) -> bool:
    """Check if GPU device is allowed based on GPUS environment variable.

    The GPUS environment variable can be used to restrict which GPU devices
    are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).

    Args:
        gpu_index (int): GPU device index

    Returns:
        bool: True if GPU is allowed (or no filter is set), False otherwise
    """
    gpus_filter = os.environ.get("GPUS", "").strip()

    # No filter, or a quoted empty string, means every GPU is allowed.
    if gpus_filter in ("", '""', "''"):
        return True

    try:
        allowed = [int(token.strip()) for token in gpus_filter.split(",") if token.strip()]
        # Filter parsed to nothing usable - allow all.
        if not allowed:
            return True
        index_is_allowed = int(gpu_index) in allowed
    except ValueError as e:
        logging.warning(
            "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
            gpus_filter,
            e
        )
        return True

    if not index_is_allowed:
        logging.debug(
            "GPU %d is not in allowed GPU list: %s",
            gpu_index,
            allowed
        )
    return index_is_allowed
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
@log_errors(raise_exception=True, log_error=False)
def get_gpu_with_sufficient_memory_for_action(
    action_details: dict,
) -> list:
    """
    Get GPUs with sufficient memory for action.

    Queries per-GPU free memory via nvidia-smi. For requirements under
    80000 MB it first tries to place the action on a single GPU (the one
    with the most free memory); otherwise, or when that fails, it
    accumulates allowed GPUs in index order until their combined free
    memory covers the requirement. GPU selection respects the GPUS
    environment variable filter (see is_allowed_gpu_device).

    Args:
        action_details (dict): Action details

    Returns:
        list: List of GPU indices

    Raises:
        ValueError: If insufficient GPU memory, or if nvidia-smi is
            missing, fails, times out, or produces unparseable output
    """
    action_id = action_details.get("_id", "unknown")
    required_gpu_memory = get_required_gpu_memory(action_details)

    logging.info(
        "Action %s: Searching for GPU(s) with %d MB available memory",
        action_id,
        required_gpu_memory
    )

    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=5,
            check=False,
        )
        if result.returncode != 0:
            error_msg = f"nvidia-smi command failed with return code {result.returncode}"
            logging.error("Action %s: %s", action_id, error_msg)
            raise ValueError("Failed to get GPU information - nvidia-smi command failed")
        memory_free_info = result.stdout.decode("ascii").strip().split("\n")
    except subprocess.TimeoutExpired:
        logging.error(
            "Action %s: nvidia-smi command timed out after 5 seconds",
            action_id
        )
        raise ValueError("Failed to get GPU information - nvidia-smi timed out")
    except FileNotFoundError:
        logging.error(
            "Action %s: nvidia-smi not found on this system",
            action_id
        )
        raise ValueError("nvidia-smi not found - no GPU support available")
    except Exception as e:
        logging.error(
            "Action %s: Error running nvidia-smi: %s",
            action_id,
            e
        )
        raise ValueError(f"Failed to get GPU information: {e}")

    # First line of --format=csv output is the header, so fewer than two
    # lines means no GPU rows at all.
    if len(memory_free_info) < 2:
        logging.error(
            "Action %s: No GPU information available from nvidia-smi output",
            action_id
        )
        raise ValueError("No GPU information available from nvidia-smi")

    try:
        # Each data row looks like "12345 MiB"; take the numeric field.
        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
    except (ValueError, IndexError) as e:
        logging.error(
            "Action %s: Error parsing GPU memory information: %s",
            action_id,
            e
        )
        raise ValueError(f"Error parsing GPU memory information: {e}")

    if not memory_free_values:
        logging.error("Action %s: No GPU devices found", action_id)
        raise ValueError("No GPU devices found")

    # Log all available GPUs and their free memory
    logging.info(
        "Action %s: Found %d GPU(s) - Free memory: %s",
        action_id,
        len(memory_free_values),
        ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
    )

    # Check GPUS environment variable for allowed devices
    allowed_gpus = os.environ.get("GPUS", "")
    if allowed_gpus:
        logging.info(
            "Action %s: GPU device filter active - allowed devices: %s",
            action_id,
            allowed_gpus
        )

    # For smaller memory requirements, try to fit on a single GPU first
    if required_gpu_memory < 80000:
        logging.debug(
            "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation (selecting GPU with most free memory)",
            action_id,
            required_gpu_memory
        )
        try:
            single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
            logging.info(
                "Action %s: Successfully allocated single GPU with most free memory: %s",
                action_id,
                single_gpu
            )
            return single_gpu
        except ValueError as e:
            # Single-GPU placement failed; fall through to multi-GPU below.
            logging.debug(
                "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
                action_id,
                str(e)
            )

    # Multi-GPU allocation: accumulate GPUs until we have enough memory
    logging.info(
        "Action %s: Attempting multi-GPU allocation for %d MB",
        action_id,
        required_gpu_memory
    )

    selected_gpus = []
    total_memory = 0
    for i, mem in enumerate(memory_free_values):
        if not is_allowed_gpu_device(i):
            logging.debug(
                "Action %s: Skipping GPU %d - not in allowed device list",
                action_id,
                i
            )
            continue
        # Stop adding GPUs once the accumulated memory is sufficient.
        if total_memory >= required_gpu_memory:
            break
        selected_gpus.append(i)
        total_memory += mem
        logging.debug(
            "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
            action_id,
            i,
            mem,
            total_memory
        )

    if total_memory >= required_gpu_memory:
        logging.info(
            "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
            action_id,
            len(selected_gpus),
            selected_gpus,
            total_memory,
            required_gpu_memory
        )
        return selected_gpus

    error_msg = (
        f"Insufficient GPU memory available. "
        f"Required: {required_gpu_memory}MB, "
        f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
    )
    logging.error("Action %s: %s", action_id, error_msg)
    raise ValueError(error_msg)
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
@log_errors(raise_exception=True, log_error=False)
def get_single_gpu_with_sufficient_memory_for_action(
    action_details: dict,
) -> list:
    """
    Get single GPU with sufficient memory using most-free algorithm.

    Selects the GPU with the MOST free memory that meets the requirements,
    to balance load across GPUs and prevent any single GPU from being overused.

    Args:
        action_details (dict): Action details

    Returns:
        list: List with single GPU index

    Raises:
        ValueError: If no GPU has sufficient memory
    """
    action_id = action_details.get("_id", "unknown")
    required_gpu_memory = get_required_gpu_memory(action_details)

    logging.debug(
        "Action %s: Finding GPU with most free memory for %d MB",
        action_id,
        required_gpu_memory,
    )

    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=5,
            check=False,
        )
    except subprocess.TimeoutExpired:
        logging.error(
            "Action %s: nvidia-smi timed out in single GPU selection",
            action_id,
        )
        raise ValueError("Failed to get GPU information - nvidia-smi timed out")
    except FileNotFoundError:
        raise ValueError("nvidia-smi not found - no GPU support available")
    except Exception as e:
        logging.error(
            "Action %s: Error running nvidia-smi: %s",
            action_id,
            e,
        )
        raise ValueError(f"Failed to get GPU information: {e}")

    # Checked OUTSIDE the try block: previously this ValueError was caught by
    # the generic `except Exception` above and re-raised with a doubly-wrapped
    # message ("Failed to get GPU information: Failed to get GPU ...").
    if result.returncode != 0:
        raise ValueError("Failed to get GPU information - nvidia-smi command failed")
    memory_free_info = result.stdout.decode("ascii").strip().split("\n")

    # First row is the CSV header, so at least two rows are needed for one GPU.
    if len(memory_free_info) < 2:
        raise ValueError("No GPU information available from nvidia-smi")

    try:
        # Each data row looks like "12345 MiB"; keep the numeric part only.
        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
    except (ValueError, IndexError) as e:
        raise ValueError(f"Error parsing GPU memory information: {e}")

    if not memory_free_values:
        raise ValueError("No GPU devices found")

    # Most-free algorithm: find GPU with MAXIMUM free memory that meets requirement
    best_fit_gpu = None
    best_fit_memory = 0

    for i, mem in enumerate(memory_free_values):
        # Check if GPU is in allowed list (GPUS env filter)
        if not is_allowed_gpu_device(i):
            logging.debug(
                "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
                action_id,
                i,
                mem,
            )
            continue

        # Check if GPU has sufficient memory
        if mem >= required_gpu_memory:
            logging.debug(
                "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
                action_id,
                i,
                mem,
                required_gpu_memory,
            )

            # Most-free: choose GPU with MOST free memory to balance load
            if mem > best_fit_memory:
                best_fit_gpu = i
                best_fit_memory = mem
                logging.debug(
                    "Action %s: GPU %d is new best candidate (most free memory)",
                    action_id,
                    i,
                )
        else:
            logging.debug(
                "Action %s: GPU %d insufficient - %d MB free < %d MB required",
                action_id,
                i,
                mem,
                required_gpu_memory,
            )

    if best_fit_gpu is not None:
        logging.info(
            "Action %s: Selected GPU %d with most free memory: %d MB free (required: %d MB, available: %d MB)",
            action_id,
            best_fit_gpu,
            best_fit_memory,
            required_gpu_memory,
            best_fit_memory - required_gpu_memory,
        )
        return [best_fit_gpu]

    # No suitable GPU found - provide detailed error.
    # These are the *allowed* GPUs (none had enough memory), reported so the
    # operator can see how far short each one fell.
    allowed_gpu_reports = [
        f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
        for i, mem in enumerate(memory_free_values)
        if is_allowed_gpu_device(i)
    ]

    if not allowed_gpu_reports:
        # Was an f-string with no placeholders; plain literal is correct.
        error_msg = "No allowed GPUs available (GPUS env filter active)"
    else:
        error_msg = (
            f"No single GPU with sufficient memory. "
            f"Required: {required_gpu_memory}MB. "
            f"Available GPUs: {', '.join(allowed_gpu_reports)}"
        )

    logging.warning("Action %s: %s", action_id, error_msg)
    raise ValueError(error_msg)
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
@log_errors(default_return="", raise_exception=False)
def get_gpu_config_for_deployment(action_details, is_first_deployment=False):
    """Get GPU configuration for deployment actions.

    For first deployment of a service, attempts to use all GPUs.
    For subsequent deployments, uses standard GPU selection (most free memory).
    Falls back gracefully to standard GPU selection if '--gpus all' is not available.

    Args:
        action_details (dict): Action details containing GPU requirements
        is_first_deployment (bool): Whether this is the first deployment for this service

    Returns:
        str: GPU configuration string ('--gpus all' or '--gpus "device=X"' or '')
    """
    action_id = action_details.get("_id", "unknown")

    # Check if GPU is required; CPU-only actions get an empty docker GPU flag.
    gpu_required = action_details.get("actionDetails", {}).get("gpuRequired", False)
    if not gpu_required:
        logging.info(
            "Action %s does not require GPU - will run on CPU",
            action_id
        )
        return ""

    # First deployment: try to use all GPUs
    if is_first_deployment:
        logging.info(
            "Action %s: First deployment - attempting to use all GPUs",
            action_id
        )

        try:
            # Check if GPUs are available.
            # Only presence is checked (returncode + non-empty stdout);
            # the actual count value is not parsed.
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=5,
                check=False,
            )

            if result.returncode == 0 and result.stdout.strip():
                # GPUs are available, use all of them
                logging.info(
                    "Action %s: Using all GPUs for first deployment",
                    action_id
                )
                return '--gpus all'
            else:
                # Failure here does NOT return: control falls through to the
                # standard (most-free-memory) selection below.
                logging.warning(
                    "Action %s: No GPUs detected via nvidia-smi for first deployment, falling back to standard GPU selection",
                    action_id
                )
        except Exception as e:
            # Broad catch on purpose: covers TimeoutExpired, FileNotFoundError
            # (nvidia-smi absent), etc. - any failure just means "fall back".
            logging.warning(
                "Action %s: Error checking GPU availability (%s), falling back to standard GPU selection",
                action_id,
                str(e)
            )

    # Fall back to standard GPU selection (most free memory)
    # This also handles subsequent deployments
    logging.info(
        "Action %s: Using standard GPU allocation (most free memory)",
        action_id
    )

    # Required memory read here is for log messages only; the selection helper
    # below derives its own requirement from action_details.
    required_memory = action_details.get("actionDetails", {}).get(
        "expectedResources", {}
    ).get("gpuMemory", 0)

    try:
        # Get the GPU(s) with most free memory that have sufficient memory
        gpu_indices = get_gpu_with_sufficient_memory_for_action(
            action_details=action_details
        )

        if gpu_indices:
            gpu_str = ",".join(map(str, gpu_indices))
            logging.info(
                "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
                action_id,
                gpu_str,
                required_memory
            )

            # Return Docker GPU configuration
            return f'--gpus "device={gpu_str}"'
        else:
            logging.warning(
                "Action %s: No GPUs with sufficient memory found (required: %d MB)",
                action_id,
                required_memory
            )
            return ""

    except ValueError as e:
        # Selection helper raises ValueError when no GPU fits; downgrade to
        # an empty config so the deployment can proceed CPU-only.
        logging.error(
            "Action %s: Error selecting GPU - %s",
            action_id,
            str(e)
        )
        return ""
    except Exception as e:
        logging.error(
            "Action %s: Unexpected error in GPU selection - %s",
            action_id,
            str(e)
        )
        return ""
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def _aes_gcm_decrypt(encrypted_blob: bytes, key: bytes) -> bytes:
    """Decrypt one AES-GCM payload laid out as nonce(12) | ciphertext | tag(16).

    Args:
        encrypted_blob (bytes): Raw encrypted payload (already base64-decoded)
        key (bytes): AES key bytes

    Returns:
        bytes: Decrypted plaintext
    """
    nonce = encrypted_blob[:12]
    tag = encrypted_blob[-16:]
    ciphertext = encrypted_blob[12:-16]
    cipher = Cipher(
        algorithms.AES(key),
        modes.GCM(nonce, tag),
        backend=default_backend(),
    )
    decryptor = cipher.decryptor()
    return decryptor.update(ciphertext) + decryptor.finalize()


@log_errors(default_return=(None, None), raise_exception=False)
def get_decrypted_access_key_pair(
    enc_access_key: str,
    enc_secret_key: str,
    encryption_key: str = "",
) -> tuple:
    """
    Get decrypted access key pair.

    Args:
        enc_access_key (str): Encrypted access key
        enc_secret_key (str): Encrypted secret key
        encryption_key (str): Encryption key

    Returns:
        tuple: (access_key, secret_key) strings
    """
    encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
    if not encryption_key:
        # Best-effort passthrough: without a key we assume plaintext input.
        logging.warning("Encryption key is not set, Will assume that the keys are not encrypted")
        return enc_access_key, enc_secret_key
    key = encryption_key.encode()
    # Identical decryption sequence for both keys - shared via _aes_gcm_decrypt
    # instead of the previous copy-pasted nonce/tag/ciphertext handling.
    decrypted_access_key = _aes_gcm_decrypt(base64.b64decode(enc_access_key), key)
    decrypted_secret_key = _aes_gcm_decrypt(base64.b64decode(enc_secret_key), key)
    # errors="replace" avoids raising on malformed plaintext bytes.
    access_key = decrypted_access_key.decode("utf-8", errors="replace")
    secret_key = decrypted_secret_key.decode("utf-8", errors="replace")
    return access_key, secret_key
|
|
1104
|
+
|
|
1105
|
+
def _aes_gcm_encrypt(plaintext: bytes, key: bytes) -> bytes:
    """Encrypt plaintext with AES-GCM; returns nonce(12) | ciphertext | tag(16).

    A fresh random nonce is generated per call, so the same plaintext never
    produces the same output twice.

    Args:
        plaintext (bytes): Data to encrypt
        key (bytes): AES key bytes

    Returns:
        bytes: Concatenated nonce, ciphertext and GCM tag
    """
    nonce = os.urandom(12)
    cipher = Cipher(
        algorithms.AES(key),
        modes.GCM(nonce),
        backend=default_backend()
    )
    encryptor = cipher.encryptor()
    ciphertext = encryptor.update(plaintext) + encryptor.finalize()
    return nonce + ciphertext + encryptor.tag


@log_errors(default_return=(None, None), raise_exception=False)
def get_encrypted_access_key_pair(
    access_key: str,
    secret_key: str,
    encryption_key: str = "",
) -> tuple:
    """
    Get encrypted access key pair.

    Args:
        access_key (str): access key
        secret_key (str): secret key
        encryption_key (str): Encryption key

    Returns:
        tuple: (encrypted_access_key, encrypted_secret_key) strings
    """
    encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
    if not encryption_key:
        # No key configured: callers receive the values unchanged.
        logging.warning("Encryption key is not set, returning unencrypted keys")
        return access_key, secret_key

    # Convert encryption key to bytes
    key = encryption_key.encode()

    # Identical encryption sequence for both keys - shared via _aes_gcm_encrypt
    # instead of the previous copy-pasted nonce/cipher/tag handling.
    encrypted_access_key_with_nonce = _aes_gcm_encrypt(access_key.encode(), key)
    encrypted_secret_key_with_nonce = _aes_gcm_encrypt(secret_key.encode(), key)

    # Encode to base64 for storage
    encoded_access_key = base64.b64encode(encrypted_access_key_with_nonce).decode()
    encoded_secret_key = base64.b64encode(encrypted_secret_key_with_nonce).decode()

    return encoded_access_key, encoded_secret_key
|
|
1157
|
+
|
|
1158
|
+
def _get_private_ip() -> str:
|
|
1159
|
+
"""
|
|
1160
|
+
Get the actual private/LAN IP address using UDP socket trick.
|
|
1161
|
+
This works reliably even in Docker, NAT, VPN, etc.
|
|
1162
|
+
|
|
1163
|
+
Returns:
|
|
1164
|
+
str: Private IP address or None if not available
|
|
1165
|
+
"""
|
|
1166
|
+
try:
|
|
1167
|
+
# Use UDP socket to determine which interface would be used for external connection
|
|
1168
|
+
# No actual packets are sent
|
|
1169
|
+
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
|
|
1170
|
+
s.connect(("8.8.8.8", 80))
|
|
1171
|
+
private_ip = s.getsockname()[0]
|
|
1172
|
+
return private_ip
|
|
1173
|
+
except Exception:
|
|
1174
|
+
return None
|
|
1175
|
+
|
|
1176
|
+
|
|
1177
|
+
def _public_ip_is_local(public_ip: str) -> bool:
|
|
1178
|
+
"""
|
|
1179
|
+
Check if a public IP address is actually assigned to a local network interface.
|
|
1180
|
+
This is true on cloud servers with real public IPs, false behind NAT.
|
|
1181
|
+
|
|
1182
|
+
Args:
|
|
1183
|
+
public_ip (str): The public IP to check
|
|
1184
|
+
|
|
1185
|
+
Returns:
|
|
1186
|
+
bool: True if the public IP is on a local interface
|
|
1187
|
+
"""
|
|
1188
|
+
try:
|
|
1189
|
+
for iface, addrs in psutil.net_if_addrs().items():
|
|
1190
|
+
for addr in addrs:
|
|
1191
|
+
if addr.family == socket.AF_INET:
|
|
1192
|
+
if addr.address == public_ip:
|
|
1193
|
+
return True
|
|
1194
|
+
return False
|
|
1195
|
+
except Exception:
|
|
1196
|
+
return False
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
@log_errors(default_return=("localhost", True), raise_exception=False)
def get_best_service_ip_and_network(port: int) -> tuple:
    """
    Determine the best IP address and network configuration for a service.

    This function intelligently selects the best IP to bind a service to:

    Priority:
    1. Public IP if it's actually on a local interface (cloud servers)
    2. Private/LAN IP (NAT, local network, Docker)
    3. localhost with --net=host (fallback)

    Args:
        port (int): Port number for the service

    Returns:
        tuple: (ip_address, use_host_network) where:
            - ip_address: The IP address to use (public, private, or localhost)
            - use_host_network: True if should use --net=host, False if should use port mapping
    """
    try:
        # Check if port is available (not already in use).
        # SO_REUSEADDR prevents a lingering TIME_WAIT socket from producing
        # a false "in use" result.
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
                test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                test_sock.bind(("0.0.0.0", port))
                test_sock.listen(1)
                # Port is available - socket closes automatically
        except OSError as e:
            logging.warning(f"Port {port} is already in use or cannot be bound: {e}, will use --net=host")
            return "localhost", True

        # Get the actual private/LAN IP
        private_ip = _get_private_ip()
        if private_ip:
            logging.info(f"Determined private/LAN IP: {private_ip}")
        else:
            logging.debug("Could not determine private IP")

        # Try to get public IP from external service.
        # Best-effort: a failed lookup just leaves public_ip as None.
        public_ip = None
        try:
            public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8").strip()
            # Validate it's a proper IP address.
            # NOTE(review): inet_aton also accepts short forms like "1.1";
            # assumes ident.me always returns a full dotted quad - confirm.
            socket.inet_aton(public_ip)
            logging.info(f"Determined external/public IP: {public_ip}")
        except Exception as e:
            logging.debug(f"Could not determine public IP: {e}")

        # Decision logic: Choose the best IP

        # 1. If public IP is on a local interface, use it (cloud server with real public IP)
        if public_ip and _public_ip_is_local(public_ip):
            logging.info(f"Public IP {public_ip} is on local interface, using it for port {port}")
            return public_ip, False

        # 2. If we have a valid private IP, use it (most common case: NAT, LAN, Docker)
        # Loopback addresses (127.x) are excluded - they are not reachable
        # from other hosts.
        if private_ip and not private_ip.startswith("127."):
            logging.info(f"Using private/LAN IP {private_ip} for port {port}")
            return private_ip, False

        # 3. Fall back to localhost with --net=host
        logging.info(f"No suitable IP found, using localhost with --net=host for port {port}")
        return "localhost", True

    except Exception as e:
        # Same fallback as the decorator's default_return, logged explicitly.
        logging.warning(f"Error determining best IP for port {port}: {e}, falling back to localhost")
        return "localhost", True
|