matrice-compute 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +9 -0
- matrice_compute/action_instance.py +1508 -0
- matrice_compute/actions_manager.py +226 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/instance_manager.py +270 -0
- matrice_compute/instance_utils.py +707 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +478 -0
- matrice_compute/scaling.py +880 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.1.dist-info/METADATA +28 -0
- matrice_compute-0.1.1.dist-info/RECORD +17 -0
- matrice_compute-0.1.1.dist-info/WHEEL +5 -0
- matrice_compute-0.1.1.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
"""Module providing instance utilities functionality."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import socket
|
|
5
|
+
import urllib.request
|
|
6
|
+
import subprocess
|
|
7
|
+
import logging
|
|
8
|
+
import base64
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
import psutil
|
|
11
|
+
from cryptography.hazmat.primitives.ciphers import (
|
|
12
|
+
Cipher,
|
|
13
|
+
algorithms,
|
|
14
|
+
modes,
|
|
15
|
+
)
|
|
16
|
+
from cryptography.hazmat.backends import default_backend
|
|
17
|
+
from matrice_common.utils import log_errors
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_instance_info(service_provider: str = None, instance_id: str = None) -> tuple:
    """
    Get instance provider and ID information.

    Probes the GCP, Azure, OCI and AWS metadata endpoints in turn; the last
    endpoint that answers wins (matching the original probe order).  Falls
    back to the explicit arguments or the SERVICE_PROVIDER / INSTANCE_ID
    environment variables, defaulting to ("LOCAL", "").

    Args:
        service_provider (str): Optional explicit provider name.
        instance_id (str): Optional explicit instance ID.

    Returns:
        tuple: (service_provider, instance_id) strings
    """

    def _probe(command: list) -> str:
        """Run one metadata curl probe; return stripped stdout, or None on failure.

        Arguments are passed as a list (shell=False) so no shell parses the
        URL; FileNotFoundError/OSError cover a missing curl binary, which the
        old shell=True form surfaced as a CalledProcessError instead.
        """
        try:
            result = subprocess.run(command, capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError, OSError):
            return None
        return result.stdout.decode().strip()

    auto_service_provider = service_provider or os.environ.get("SERVICE_PROVIDER") or "LOCAL"
    auto_instance_id = instance_id or os.environ.get("INSTANCE_ID") or ""
    gcp_id = _probe([
        "curl", "-s", "-m", "1", "-H", "Metadata-Flavor: Google",
        "http://metadata.google.internal/computeMetadata/v1/instance/id",
    ])
    if gcp_id is not None:
        auto_service_provider = "GCP"
        auto_instance_id = gcp_id
    azure_meta = _probe([
        "curl", "-s", "-m", "1", "-H", "Metadata:true",
        "http://169.254.169.254/metadata/instance?api-version=2020-09-01",
    ])
    if azure_meta is not None:
        auto_service_provider = "AZURE"
        azure_id = _probe([
            "curl", "-s", "-H", "Metadata:true",
            "http://169.254.169.254/metadata/instance/compute/vmId?api-version=2017-08-01&format=text",
        ])
        # As in the original, a failed ID fetch leaves the previous ID intact.
        if azure_id is not None:
            auto_instance_id = azure_id
    oci_meta = _probe([
        "curl", "-s", "-m", "1", "-H", "Authorization: Bearer OracleCloud",
        "http://169.254.169.254/opc/v1/instance/",
    ])
    if oci_meta is not None:
        auto_service_provider = "OCI"
        oci_id = _probe(["curl", "-s", "http://169.254.169.254/opc/v1/instance/id"])
        if oci_id is not None:
            auto_instance_id = oci_id
    aws_meta = _probe(["curl", "-s", "-m", "1", "http://169.254.169.254/latest/meta-data/"])
    if aws_meta is not None:
        auto_service_provider = "AWS"
        aws_id = _probe(["curl", "-s", "http://169.254.169.254/latest/meta-data/instance-id"])
        if aws_id is not None:
            auto_instance_id = aws_id
    return str(auto_service_provider), str(auto_instance_id)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@log_errors(default_return=0, raise_exception=False, log_error=False)
|
|
99
|
+
def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
    """
    Calculate time difference between start and finish times.

    Args:
        start_time_str (str): Start time string (ISO 8601)
        finish_time_str (str): Finish time string (ISO 8601)

    Returns:
        int: Time difference in seconds
    """
    # Use .get() so an unset SERVICE_PROVIDER falls through to the generic
    # parser; the original indexed os.environ directly, so a missing variable
    # raised KeyError and the @log_errors decorator silently returned 0.
    provider = os.environ.get("SERVICE_PROVIDER", "")
    if provider in ("AWS", "OCI", "LAMBDA"):
        # These providers emit fractional-second timestamps; drop the fraction
        # and pin the zone to UTC before parsing.
        start_time = datetime.fromisoformat(start_time_str.split(".")[0] + "+00:00")
        finish_time = datetime.fromisoformat(finish_time_str.split(".")[0] + "+00:00")
    else:
        start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
        finish_time = datetime.fromisoformat(finish_time_str.replace("Z", "+00:00"))
    return int((finish_time - start_time).total_seconds())
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@log_errors(default_return=False, raise_exception=False, log_error=False)
|
|
124
|
+
def has_gpu() -> bool:
    """
    Check if the system has a GPU.

    Runs nvidia-smi and treats a zero exit status as "GPU present".  Any
    failure (missing binary, non-zero exit) propagates to the @log_errors
    decorator, which converts it to False.

    Returns:
        bool: True if GPU is present, False otherwise
    """
    # capture_output keeps the full nvidia-smi table out of stdout; the
    # original let it print on every probe.
    subprocess.run(["nvidia-smi"], check=True, capture_output=True)
    return True
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@log_errors(default_return=0, raise_exception=False)
|
|
136
|
+
def get_gpu_memory_usage() -> float:
    """
    Get GPU memory usage percentage.

    Queries nvidia-smi for used/total memory of every GPU and returns the
    usage fraction of the least-loaded device.
    NOTE(review): returning min() reports the *least* used GPU — presumably
    so callers see the best-case device; confirm this is intentional.

    Returns:
        float: Memory usage between 0 and 1
    """
    query = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
    raw_lines = subprocess.check_output(query.split()).decode("ascii").strip().split("\n")
    fractions = [
        int(used) / int(total)
        for used, total in (entry.split(",") for entry in raw_lines)
    ]
    return min(fractions)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@log_errors(default_return=0, raise_exception=False)
|
|
154
|
+
def get_cpu_memory_usage() -> float:
    """
    Get CPU memory usage.

    Returns:
        float: Fraction of system RAM in use, between 0 and 1.
    """
    # psutil reports percent on a 0-100 scale; normalise to a 0-1 fraction.
    return psutil.virtual_memory().percent / 100
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@log_errors(default_return=0, raise_exception=False)
|
|
166
|
+
def get_mem_usage() -> float:
    """
    Get memory usage for either GPU or CPU.

    Prefers GPU memory when a GPU is present, falling back to CPU memory if
    the GPU query fails for any reason.

    Returns:
        float: Memory usage between 0 and 1
    """
    usage = None
    if has_gpu():
        try:
            usage = get_gpu_memory_usage()
        except Exception as err:  # fall back to CPU stats on any GPU error
            logging.error(
                "Error getting GPU memory usage: %s",
                err,
            )
            usage = get_cpu_memory_usage()
    else:
        usage = get_cpu_memory_usage()
    # The decorated helpers may return None on failure; coerce to 0.
    return 0 if usage is None else usage
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@log_errors(default_return=[], raise_exception=False)
|
|
190
|
+
def get_gpu_info() -> list:
    """
    Get GPU information.

    Returns:
        list: One CSV string per GPU (index, uuid, utilization, memory
        totals, driver version, name, serial, display state, temperature).
    """
    query_fields = (
        "index,uuid,utilization.gpu,memory.total,memory.used,memory.free,"
        "driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu"
    )
    with subprocess.Popen(
        ["nvidia-smi", f"--query-gpu={query_fields}", "--format=csv,noheader,nounits"],
        stdout=subprocess.PIPE,
    ) as proc:
        raw, _ = proc.communicate()
    # Drop the trailing empty element produced by the final newline.
    return raw.decode("UTF-8").split("\n")[:-1]
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@log_errors(default_return="", raise_exception=False)
|
|
211
|
+
def get_instance_id() -> str:
    """
    Get instance ID.

    Returns:
        str: Instance ID from the INSTANCE_ID environment variable, or an
        empty string when it is unset.  The original indexed os.environ and
        relied on the decorator to convert the KeyError into ""; .get()
        honours the documented contract directly.
    """
    return os.environ.get("INSTANCE_ID", "")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@log_errors(default_return=False, raise_exception=False, log_error=False)
|
|
222
|
+
def is_docker_running() -> bool:
    """
    Check if Docker is running.

    Returns:
        bool: True if at least one Docker container is currently running.
    """
    listing = subprocess.check_output(["docker", "ps"]).decode("ascii")
    # Drop the trailing empty line, then the column-header row; anything
    # left is a running container.
    container_rows = listing.split("\n")[:-1][1:]
    return bool(container_rows)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
@log_errors(default_return=None, raise_exception=False)
|
|
237
|
+
def prune_docker_images() -> None:
    """Prune all unused Docker images (equivalent to `docker image prune -a -f`)."""
    subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True)
    logging.info("Docker images pruned successfully.")
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@log_errors(default_return=0.0, raise_exception=False)
|
|
253
|
+
def _normalize_disk_usage_to_gb(disk_space: str) -> float:
|
|
254
|
+
"""
|
|
255
|
+
Normalize disk usage to GB.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
disk_space (str): Disk space with unit
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
float: Disk space in GB
|
|
262
|
+
"""
|
|
263
|
+
if disk_space.endswith("G"):
|
|
264
|
+
result = float(disk_space[:-1])
|
|
265
|
+
elif disk_space.endswith("T"):
|
|
266
|
+
result = float(disk_space[:-1]) * 1024
|
|
267
|
+
elif disk_space.endswith("M"):
|
|
268
|
+
result = float(disk_space[:-1]) / 1024
|
|
269
|
+
elif disk_space.endswith("K"):
|
|
270
|
+
result = float(disk_space[:-1]) / (1024 * 1024)
|
|
271
|
+
else:
|
|
272
|
+
result = float(disk_space)
|
|
273
|
+
logging.debug(
|
|
274
|
+
"Normalized disk space value to %f GB",
|
|
275
|
+
result,
|
|
276
|
+
)
|
|
277
|
+
return result
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@log_errors(default_return=None, raise_exception=False)
|
|
281
|
+
def _parse_disk_usage_info(line: str) -> dict:
    """
    Parse one data row of `df -h` output.

    Args:
        line (str): Disk usage line from df command

    Returns:
        dict: filesystem, size/used/available (GB), use_percentage,
        mounted_on.  A malformed row raises and is converted to None by
        the @log_errors decorator.
    """
    fields = line.split()
    parsed_info = {
        "filesystem": fields[0],
        "size": _normalize_disk_usage_to_gb(fields[1]),
        "used": _normalize_disk_usage_to_gb(fields[2]),
        "available": _normalize_disk_usage_to_gb(fields[3]),
        "use_percentage": float(fields[4].rstrip("%")),
        "mounted_on": fields[5],
    }
    logging.debug(
        "Successfully parsed disk usage info: %s",
        parsed_info,
    )
    return parsed_info
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@log_errors(default_return=None, raise_exception=False)
|
|
308
|
+
def get_disk_space_usage() -> list:
    """
    Get disk space usage for all filesystems.

    Returns:
        list: Disk usage dictionaries (see _parse_disk_usage_info); rows
        that fail to parse are skipped.
    """
    logging.info("Getting disk space usage information")
    df_result = subprocess.run(
        ["df", "-h"],
        capture_output=True,
        text=True,
        check=True,
    )
    data_rows = df_result.stdout.strip().split("\n")[1:]  # skip header row
    parsed_rows = (_parse_disk_usage_info(row) for row in data_rows)
    disk_usage = [entry for entry in parsed_rows if entry]
    logging.info(
        "Found disk usage info for %d filesystems",
        len(disk_usage),
    )
    return disk_usage
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
@log_errors(default_return=None, raise_exception=False)
|
|
336
|
+
def get_max_file_system() -> str:
    """
    Get filesystem with maximum available space.

    When the best candidate is the root filesystem (or nothing qualifies),
    falls back to creating a workspace directory under the user's home.

    Returns:
        str: Mount point with the most free space, a created workspace
        directory, or None when nothing usable is found.
    """
    logging.info("Finding filesystem with maximum available space")
    disk_usage = get_disk_space_usage()
    if not disk_usage:
        logging.warning("No disk usage information available")
        return None
    # Ignore the EFI partition, overlay mounts, and full filesystems.
    candidates = [
        entry
        for entry in disk_usage
        if entry["mounted_on"] != "/boot/efi"
        and "overlay" not in entry["filesystem"]
        and entry["available"] > 0
    ]
    if candidates:
        best = max(
            candidates,
            key=lambda entry: entry["available"],
        )
        best_mount = best["mounted_on"]
        logging.info(
            "Found filesystem with maximum space: %s (%f GB available)",
            best_mount,
            best["available"],
        )
    else:
        logging.warning("No suitable filesystems found after filtering")
        best_mount = ""
    if best_mount not in ("/", ""):
        return best_mount
    # Root (or nothing) won: place a workspace directory under $HOME instead.
    if not os.environ.get("WORKSPACE_DIR"):
        logging.error("WORKSPACE_DIR environment variable not set")
        return None
    workspace_dir = os.path.join(
        os.path.expanduser("~"),
        os.environ["WORKSPACE_DIR"],
    )
    os.makedirs(workspace_dir, exist_ok=True)
    logging.info(
        "Created workspace directory at: %s",
        workspace_dir,
    )
    return workspace_dir
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@log_errors(default_return=None, raise_exception=False)
|
|
388
|
+
def get_docker_disk_space_usage() -> dict:
    """
    Get disk space usage for Docker storage.

    Locates the Docker root directory via `docker info`, then reports the
    df statistics of the filesystem hosting it.

    Returns:
        dict: Docker disk usage information (see _parse_disk_usage_info).

    Raises:
        ValueError: If the Docker root directory or its disk usage cannot
        be determined (converted to None by the @log_errors decorator).
    """
    info_result = subprocess.run(
        ["docker", "info"],
        capture_output=True,
        text=True,
        check=True,
    )
    docker_root_dir = None
    for line in info_result.stdout.split("\n"):
        if line.strip().startswith("Docker Root Dir"):
            # split(":", 1) keeps root paths that themselves contain a colon
            # intact; the original split(":")[1] truncated such paths.
            docker_root_dir = line.split(":", 1)[1].strip()
            break
    if docker_root_dir is None:
        logging.error("Unable to find Docker root directory")
        raise ValueError("Unable to find Docker root directory")
    logging.debug(
        "Found Docker root directory: %s",
        docker_root_dir,
    )
    df_result = subprocess.run(
        ["df", "-h", docker_root_dir],
        capture_output=True,
        text=True,
        check=True,
    )
    lines = df_result.stdout.strip().split("\n")[1:]
    if not lines:
        logging.error("No disk usage information found for Docker root directory")
        raise ValueError("No disk usage information found for Docker root directory")
    docker_disk_usage = _parse_disk_usage_info(lines[0])
    if docker_disk_usage is None:
        logging.error("Failed to parse Docker disk usage information")
        raise ValueError("Failed to parse Docker disk usage information")
    logging.info(
        "Successfully retrieved Docker disk usage: %s",
        docker_disk_usage,
    )
    return docker_disk_usage
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
@log_errors(raise_exception=False)
|
|
436
|
+
def cleanup_docker_storage() -> None:
    """Clean up Docker storage when space is low (>= 90% used or <= 30 GB free)."""
    usage = get_docker_disk_space_usage()
    if usage is None:
        logging.error("Failed to get Docker disk space usage, skipping cleanup")
        return
    space_is_low = usage["use_percentage"] >= 90 or usage["available"] <= 30
    if space_is_low:
        logging.info(
            "Pruning Docker images. Disk space is low: %s",
            usage,
        )
        prune_docker_images()
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
@log_errors(default_return=0, raise_exception=False)
|
|
451
|
+
def get_required_gpu_memory(action_details: dict) -> int:
    """
    Get required GPU memory from action details.

    Args:
        action_details (dict): Action details, expected to contain
            actionDetails.expectedResources.gpuMemory.

    Returns:
        int: Required GPU memory, or 0 when the field is absent or the
        nested structure is malformed.
    """
    try:
        return action_details["actionDetails"]["expectedResources"]["gpuMemory"]
    except (KeyError, TypeError):
        # TypeError covers intermediate values that are None or not dicts;
        # the original caught only KeyError and leaked TypeError to the
        # decorator.
        return 0
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
@log_errors(default_return=True, raise_exception=False)
|
|
468
|
+
def is_allowed_gpu_device(gpu_index: int) -> bool:
    """Check if GPU device is allowed.

    An unset or empty GPUS environment variable allows every device;
    otherwise GPUS is a comma-separated whitelist of device indices.

    Args:
        gpu_index (int): GPU device index

    Returns:
        bool: True if GPU is allowed
    """
    whitelist_spec = os.environ.get("GPUS")
    if not whitelist_spec:
        return True
    whitelist = {int(token) for token in whitelist_spec.split(",") if token.strip()}
    return int(gpu_index) in whitelist
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
@log_errors(raise_exception=True)
|
|
485
|
+
def get_gpu_with_sufficient_memory_for_action(
    action_details: dict,
) -> list:
    """
    Get GPUs with sufficient memory for action.

    Requests under 80000 MB first try a single best-fit GPU; otherwise (or
    when no single device fits) allowed GPUs are accumulated in index order
    until their combined free memory covers the requirement.

    Args:
        action_details (dict): Action details

    Returns:
        list: List of GPU indices

    Raises:
        ValueError: If insufficient GPU memory
    """
    required_gpu_memory = get_required_gpu_memory(action_details)
    smi_query = "nvidia-smi --query-gpu=memory.free --format=csv"
    free_info = subprocess.check_output(smi_query.split()).decode("ascii").split("\n")
    if len(free_info) < 2:
        raise ValueError("No GPU information available from nvidia-smi")
    # Skip the CSV header row and the trailing empty line.
    free_memory = [int(row.split()[0]) for row in free_info[1:-1]]
    if required_gpu_memory < 80000:
        try:
            return get_single_gpu_with_sufficient_memory_for_action(action_details)
        except ValueError:
            pass  # no single device fits; fall through to multi-GPU packing
    selected_gpus = []
    total_memory = 0
    for index, free in enumerate(free_memory):
        if not is_allowed_gpu_device(index):
            continue
        # Stop adding devices once the requirement is already met.
        if total_memory >= required_gpu_memory:
            break
        selected_gpus.append(index)
        total_memory += free
    if total_memory >= required_gpu_memory:
        return selected_gpus
    raise ValueError(
        f"Insufficient GPU memory available. Required: {required_gpu_memory}, Available: {total_memory}"
    )
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
@log_errors(raise_exception=True)
|
|
528
|
+
def get_single_gpu_with_sufficient_memory_for_action(
    action_details: dict,
) -> list:
    """
    Get single GPU with sufficient memory.

    Chooses the allowed GPU whose free memory is the smallest amount that
    still satisfies the requirement (best fit), keeping larger devices free.

    Args:
        action_details (dict): Action details

    Returns:
        list: List with single GPU index

    Raises:
        ValueError: If no GPU has sufficient memory
    """
    required_gpu_memory = get_required_gpu_memory(action_details)
    smi_query = "nvidia-smi --query-gpu=memory.free --format=csv"
    free_info = subprocess.check_output(smi_query.split()).decode("ascii").split("\n")
    if len(free_info) < 2:
        raise ValueError("No GPU information available from nvidia-smi")
    # Skip the CSV header row and the trailing empty line.
    free_memory = [int(row.split()[0]) for row in free_info[1:-1]]
    # Best fit: smallest free memory that still satisfies the requirement;
    # (free, index) tuples make min() break ties on the lowest index, which
    # matches the original strict "<" scan.
    candidates = [
        (free, index)
        for index, free in enumerate(free_memory)
        if is_allowed_gpu_device(index) and free >= required_gpu_memory
    ]
    if candidates:
        return [min(candidates)[1]]
    raise ValueError(
        f"No single GPU with sufficient memory ({required_gpu_memory}MB) available"
    )
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
@log_errors(default_return=(None, None), raise_exception=False)
|
|
565
|
+
def get_decrypted_access_key_pair(
    enc_access_key: str,
    enc_secret_key: str,
    encryption_key: str = "",
) -> tuple:
    """
    Get decrypted access key pair.

    Each encrypted key is base64(nonce[12] + ciphertext + tag[16]) produced
    by AES-GCM (the layout written by get_encrypted_access_key_pair).

    Args:
        enc_access_key (str): Encrypted access key
        enc_secret_key (str): Encrypted secret key
        encryption_key (str): Encryption key; falls back to the
            MATRICE_ENCRYPTION_KEY environment variable.

    Returns:
        tuple: (access_key, secret_key) strings; the inputs are returned
        unchanged when no encryption key is configured.
    """
    encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
    if not encryption_key:
        logging.warning("Encryption key is not set, Will assume that the keys are not encrypted")
        return enc_access_key, enc_secret_key

    def _decrypt(enc_value: str) -> str:
        """Base64-decode, split nonce/ciphertext/tag, and AES-GCM decrypt.

        Extracted because the original duplicated this logic verbatim for
        both keys.
        """
        blob = base64.b64decode(enc_value)
        nonce, tag, ciphertext = blob[:12], blob[-16:], blob[12:-16]
        cipher = Cipher(
            algorithms.AES(encryption_key.encode()),
            modes.GCM(nonce, tag),
            backend=default_backend(),
        )
        decryptor = cipher.decryptor()
        plaintext = decryptor.update(ciphertext) + decryptor.finalize()
        return plaintext.decode("utf-8", errors="replace")

    return _decrypt(enc_access_key), _decrypt(enc_secret_key)
|
|
610
|
+
|
|
611
|
+
@log_errors(default_return=(None, None), raise_exception=False)
|
|
612
|
+
def get_encrypted_access_key_pair(
    access_key: str,
    secret_key: str,
    encryption_key: str = "",
) -> tuple:
    """
    Get encrypted access key pair.

    Each key is AES-GCM encrypted with a fresh random 12-byte nonce and
    stored as base64(nonce + ciphertext + tag), the layout expected by
    get_decrypted_access_key_pair.

    Args:
        access_key (str): access key
        secret_key (str): secret key
        encryption_key (str): Encryption key; falls back to the
            MATRICE_ENCRYPTION_KEY environment variable.

    Returns:
        tuple: (encrypted_access_key, encrypted_secret_key) strings; the
        inputs are returned unchanged when no encryption key is configured.
    """
    encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
    if not encryption_key:
        logging.warning("Encryption key is not set, returning unencrypted keys")
        return access_key, secret_key
    key = encryption_key.encode()

    def _encrypt(plain_value: str) -> str:
        """AES-GCM encrypt with a random nonce; return base64(nonce+ct+tag).

        Extracted because the original duplicated this logic verbatim for
        both keys.
        """
        nonce = os.urandom(12)
        cipher = Cipher(
            algorithms.AES(key),
            modes.GCM(nonce),
            backend=default_backend(),
        )
        encryptor = cipher.encryptor()
        ciphertext = encryptor.update(plain_value.encode()) + encryptor.finalize()
        return base64.b64encode(nonce + ciphertext + encryptor.tag).decode()

    return _encrypt(access_key), _encrypt(secret_key)
|
|
663
|
+
|
|
664
|
+
@log_errors(default_return=False, raise_exception=False)
|
|
665
|
+
def check_public_port_exposure(port: int) -> bool:
    """
    Check if port is publicly accessible.

    Two probes must both succeed: a TCP connect to this host's public IP on
    the port, and a local bind on the same port.  Any probe error propagates
    to the @log_errors decorator, which returns False.

    Args:
        port (int): Port number to check

    Returns:
        bool: True if port is publicly accessible
    """
    # Discover our public IP, then try to reach the port from outside.
    public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(3)
        publicly_reachable = probe.connect_ex((public_ip, port)) == 0

    # Confirm the port can still be bound locally.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as listener:
        listener.setsockopt(
            socket.SOL_SOCKET,
            socket.SO_REUSEADDR,
            1,
        )
        listener.bind(("", port))
        listener.listen(1)
        locally_bindable = True

    if not publicly_reachable:
        logging.debug(
            "Port %d is not publicly exposed",
            port,
        )
        return False
    if not locally_bindable:
        logging.debug(
            "Port %d is not locally available",
            port,
        )
        return False
    return True
|