matrice-compute 0.1.19__tar.gz → 0.1.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/PKG-INFO +1 -1
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/action_instance.py +40 -10
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/instance_utils.py +305 -94
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/resources_tracker.py +125 -53
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/LICENSE.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/README.md +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/pyproject.toml +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/setup.cfg +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/setup.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.20}/src/matrice_compute/task_utils.py +0 -0
src/matrice_compute/action_instance.py

```diff
@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
     get_gpu_with_sufficient_memory_for_action,
     get_decrypted_access_key_pair,
     get_max_file_system,
+    get_best_service_ip_and_network,
 )
 from matrice_compute.task_utils import (
     setup_workspace_and_run_task,
```
```diff
@@ -526,13 +527,18 @@ class ActionInstance:
 
             if username and password:
                 login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
-                subprocess.run(login_cmd, shell=True, check=True)
+                result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+                if result.returncode != 0:
+                    raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
                 logging.info("Docker login successful")
             else:
                 logging.warning(
                     "Docker credentials not available, skipping Docker login"
                 )
 
+        except subprocess.TimeoutExpired:
+            logging.error("Docker login timed out after 30 seconds")
+            raise Exception("Docker login timed out")
         except Exception as err:
             logging.error(
                 "Docker login failed: %s",
```
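The replacement trades `check=True` for an explicit return-code test so the raised error can carry the captured stderr, and bounds the call with a timeout. A minimal sketch of the same pattern, using a hypothetical command rather than the package's login call:

```python
import subprocess

def run_checked(cmd: str, timeout: int = 30) -> str:
    """Run a shell command; surface stderr in the raised error instead of CalledProcessError."""
    try:
        result = subprocess.run(
            cmd, shell=True, check=False,
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        raise RuntimeError(f"command timed out after {timeout}s")
    if result.returncode != 0:
        # capture_output=True is what makes result.stderr available here
        raise RuntimeError(f"command failed ({result.returncode}): {result.stderr}")
    return result.stdout

# hypothetical usage: run_checked("docker login -u user -p token")
```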
```diff
@@ -1151,9 +1157,17 @@ def inference_ws_server_execute(self: ActionInstance):
         return
     image = action_details["actionDetails"].get("docker")
 
-
     self.setup_action_requirements(action_details)
 
+    # Get the best IP and network configuration for port 8102
+    ws_host, use_host_network = get_best_service_ip_and_network(8102)
+
+    # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
+    if not os.environ.get("INFERENCE_WS_HOST"):
+        os.environ["INFERENCE_WS_HOST"] = ws_host
+
+    logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
+
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
@@ -1164,7 +1178,6 @@ def inference_ws_server_execute(self: ActionInstance):
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
-
     )
     logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
 
```
```diff
@@ -1185,7 +1198,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
     image = action_details["actionDetails"].get("docker")
 
     self.setup_action_requirements(action_details)
-
+
+    # Get the ws_host from environment variable set by inference_ws_server_execute
+    ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
+    ws_url = f"{ws_host}:8102"
+
+    logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
@@ -1195,9 +1214,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
+        f'-e WS_HOST="{ws_url}" '
         f"{image}"
     )
-    logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
+    logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
 
     # Docker Command run
     self.start(worker_cmd, "fe_fs_streaming")
```
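The handoff between the two actions is a plain process-environment variable: `inference_ws_server_execute` publishes `INFERENCE_WS_HOST` (first writer wins), and `fe_fs_streaming_execute` reads it back with a `localhost` fallback. A sketch of the flow with a hypothetical address; note that it relies on both actions sharing the same process environment (the variable is also inherited by child processes started afterwards):

```python
import os

# Producer side (inference_ws_server_execute): publish once, first writer wins.
ws_host = "10.0.0.5"  # hypothetical result of get_best_service_ip_and_network(8102)
if not os.environ.get("INFERENCE_WS_HOST"):
    os.environ["INFERENCE_WS_HOST"] = ws_host

# Consumer side (fe_fs_streaming_execute): read back with a fallback.
ws_url = f"{os.environ.get('INFERENCE_WS_HOST', 'localhost')}:8102"
print(ws_url)  # 10.0.0.5:8102
```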
```diff
@@ -1304,6 +1324,11 @@ def redis_setup_execute(self: ActionInstance):
         action_id=action_id,
     )
 
+    # Get the best IP for Redis (port 6379)
+    redis_host, _ = get_best_service_ip_and_network(6379)
+
+    logging.info(f"Redis will use IP: {redis_host} on port 6379")
+
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
 
     # Redis container with --net=host (Port: 6379)
@@ -1315,7 +1340,7 @@ def redis_setup_execute(self: ActionInstance):
         f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
 
-    logging.info("Starting Redis container
+    logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
 
     # Start Redis container first
     redis_process = subprocess.Popen(
@@ -1324,13 +1349,13 @@ def redis_setup_execute(self: ActionInstance):
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
     )
-    logging.info("Redis container started successfully on
+    logging.info("Redis container started successfully on %s:6379", redis_host)
 
     # Wait for Redis to be ready
     time.sleep(5)
 
     env_vars = {
-        "REDIS_URL": f"
+        "REDIS_URL": f"{redis_host}:6379",
         "REDIS_PASSWORD": redis_password,
     }
 
@@ -1348,7 +1373,7 @@ def redis_setup_execute(self: ActionInstance):
         f"{self.action_record_id} "
     )
 
-    logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
+    logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)
 
     self.start(cmd, "redis_setup")
 
```
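Note the shape of the published value: `REDIS_URL` is a bare `host:port` pair, not a `redis://` URL. A small sketch with hypothetical values, including the full-URL form a client that expects a scheme would need:

```python
# Hypothetical values; redis_host would come from get_best_service_ip_and_network(6379).
redis_host = "10.0.0.5"
redis_password = "s3cret"

env_vars = {
    "REDIS_URL": f"{redis_host}:6379",   # bare host:port, no redis:// scheme
    "REDIS_PASSWORD": redis_password,
}

# Equivalent full-URL form, if a consumer requires a scheme:
full_url = f"redis://:{redis_password}@{redis_host}:6379/0"
```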
```diff
@@ -1386,7 +1411,12 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )
     use_gpu = self.get_gpu_config(action_details)
-
+
+    gpuRequired = action_details["actionDetails"]["gpuRequired"]
+    if gpuRequired==False:
+        use_gpu = ""
+    else:
+        use_gpu = "--runtime=nvidia"
     extra_env_vars = {"INTERNAL_PORT": internal_port}
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
```
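The added block overrides whatever `self.get_gpu_config(action_details)` returned: `use_gpu` ends up as either an empty string or the `--runtime=nvidia` flag, driven solely by the `gpuRequired` field. A condensed sketch of the effective logic, with a hypothetical `action_details`:

```python
# Hypothetical action_details; only the gpuRequired field matters here.
action_details = {"actionDetails": {"gpuRequired": False}}

use_gpu = "--runtime=nvidia" if action_details["actionDetails"]["gpuRequired"] else ""
# use_gpu is spliced into the docker run command line; an empty string
# simply omits the NVIDIA runtime flag.
print(repr(use_gpu))  # ''
```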
src/matrice_compute/instance_utils.py

```diff
@@ -95,28 +95,72 @@ def get_instance_info(service_provider: str = None, instance_id: str = None) ->
     return str(auto_service_provider), str(auto_instance_id)
 
 
+def _normalize_timestamp(timestamp_str: str) -> str:
+    """
+    Normalize timestamp string to handle different precision levels.
+
+    Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
+    and various timezone formats across different cloud providers.
+
+    Args:
+        timestamp_str (str): Timestamp string in various formats
+
+    Returns:
+        str: Normalized timestamp string compatible with fromisoformat()
+    """
+    # Replace 'Z' with '+00:00' for UTC timestamps
+    timestamp_str = timestamp_str.replace("Z", "+00:00")
+
+    # Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
+    # Some providers (like OCI, GCP) may return nanoseconds (9 digits)
+    if "." in timestamp_str:
+        # Split into main part and fractional part
+        if "+" in timestamp_str:
+            main_part, tz_part = timestamp_str.rsplit("+", 1)
+            tz_suffix = "+" + tz_part
+        elif timestamp_str.count("-") > 2:  # Has negative timezone offset
+            main_part, tz_part = timestamp_str.rsplit("-", 1)
+            tz_suffix = "-" + tz_part
+        else:
+            main_part = timestamp_str
+            tz_suffix = ""
+
+        # Split main part into date/time and fractional seconds
+        datetime_part, fractional = main_part.rsplit(".", 1)
+
+        # Truncate fractional seconds to 6 digits (microseconds)
+        if len(fractional) > 6:
+            fractional = fractional[:6]
+
+        # Reconstruct timestamp
+        timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
+
+    return timestamp_str
+
+
 @log_errors(default_return=0, raise_exception=False, log_error=False)
 def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
     """
     Calculate time difference between start and finish times.
+
+    Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
+    and different precision levels (nanoseconds, microseconds, milliseconds).
 
     Args:
-        start_time_str (str): Start time string
-        finish_time_str (str): Finish time string
+        start_time_str (str): Start time string in ISO format
+        finish_time_str (str): Finish time string in ISO format
 
     Returns:
         int: Time difference in seconds
     """
-
-
-
-
-
-
-
-
-    start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
-    finish_time = datetime.fromisoformat(finish_time_str.replace("Z", "+00:00"))
+    # Normalize both timestamps to handle different formats
+    normalized_start = _normalize_timestamp(start_time_str)
+    normalized_finish = _normalize_timestamp(finish_time_str)
+
+    # Parse the normalized timestamps
+    start_time = datetime.fromisoformat(normalized_start)
+    finish_time = datetime.fromisoformat(normalized_finish)
+
     return int((finish_time - start_time).total_seconds())
 
 
```
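A worked example of the normalization: a nanosecond-precision UTC timestamp is truncated to microseconds and its `Z` suffix rewritten so `fromisoformat()` accepts it (the raw form fails on Python < 3.11):

```python
from datetime import datetime
# Import path follows this package's layout; _normalize_timestamp is private.
from matrice_compute.instance_utils import _normalize_timestamp

raw = "2024-05-01T12:00:00.123456789Z"   # 9-digit fraction, as OCI/GCP may emit
norm = _normalize_timestamp(raw)

print(norm)                          # 2024-05-01T12:00:00.123456+00:00
print(datetime.fromisoformat(norm))  # 2024-05-01 12:00:00.123456+00:00
```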
```diff
@@ -129,14 +173,25 @@ def has_gpu() -> bool:
         bool: True if GPU is present, False otherwise
     """
     try:
-        subprocess.run(
-
+        result = subprocess.run(
+            ["nvidia-smi"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        return result.returncode == 0
     except subprocess.TimeoutExpired:
-        logging.
+        logging.debug("nvidia-smi command timed out after 5 seconds")
+        return False
+    except FileNotFoundError:
+        logging.debug("nvidia-smi not found on this system")
+        return False
+    except Exception:
         return False
 
 
-@log_errors(default_return=0, raise_exception=False)
+@log_errors(default_return=0, raise_exception=False, log_error=False)
 def get_gpu_memory_usage() -> float:
     """
     Get GPU memory usage percentage.
```
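The probe is now a bounded, non-raising `subprocess.run` whose only signal is the return code. An equivalent sketch that pre-checks for the binary with `shutil.which`, which avoids ever reaching the `FileNotFoundError` branch (the helper name is illustrative, not part of the package):

```python
import shutil
import subprocess

def tool_responds(binary: str = "nvidia-smi") -> bool:
    if shutil.which(binary) is None:   # binary absent: no GPU driver installed
        return False
    try:
        result = subprocess.run(
            [binary], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            timeout=5, check=False,
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
```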
```diff
@@ -144,17 +199,35 @@ def get_gpu_memory_usage() -> float:
     Returns:
         float: Memory usage between 0 and 1
     """
-    command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
+    command = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
+            return 0
+        output = result.stdout.decode("ascii").strip().split("\n")
         memory_percentages = []
         for line in output:
-
-
-
-
+            if line.strip():
+                used, total = map(int, line.split(","))
+                if total > 0:
+                    usage_percentage = used / total
+                    memory_percentages.append(usage_percentage)
+        return min(memory_percentages) if memory_percentages else 0
     except subprocess.TimeoutExpired:
-        logging.
+        logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+        return 0
+    except (ValueError, IndexError) as e:
+        logging.debug("Error parsing GPU memory info: %s", e)
+        return 0
+    except Exception as e:
+        logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
         return 0
 
 
```
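The parsing loop is easiest to see against canned output. Note that the function returns `min()` of the per-GPU ratios, i.e. the utilization of the least-loaded GPU, not an average. A sketch with two hypothetical GPUs (`used, total` in MiB):

```python
sample = "8000, 16000\n2000, 16000\n"   # canned nvidia-smi csv output, two GPUs

memory_percentages = []
for line in sample.strip().split("\n"):
    if line.strip():
        used, total = map(int, line.split(","))   # int() tolerates the leading space
        if total > 0:
            memory_percentages.append(used / total)

print(memory_percentages)   # [0.5, 0.125]
print(min(memory_percentages))   # 0.125 -- the least-loaded GPU
```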
```diff
@@ -194,7 +267,7 @@ def get_mem_usage() -> float:
     return mem_usage
 
 
-@log_errors(default_return=[], raise_exception=False)
+@log_errors(default_return=[], raise_exception=False, log_error=False)
 def get_gpu_info() -> list:
     """
     Get GPU information.
@@ -202,23 +275,34 @@ def get_gpu_info() -> list:
     Returns:
         list: GPU information strings
     """
-    proc = subprocess.Popen(
-        [
-            "nvidia-smi",
-            "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
-            "--format=csv,noheader,nounits",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
     try:
-
-
-
-
-
-
-
+        proc = subprocess.Popen(
+            [
+                "nvidia-smi",
+                "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
+                "--format=csv,noheader,nounits",
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=5)
+            if proc.returncode != 0:
+                logging.debug("nvidia-smi command failed in get_gpu_info")
+                return []
+            output = stdout.decode("UTF-8")
+            result = [line for line in output.split("\n") if line.strip()]
+            return result
+        except subprocess.TimeoutExpired:
+            logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
+            proc.kill()
+            proc.communicate()  # flush output after kill
+            return []
+    except FileNotFoundError:
+        logging.debug("nvidia-smi not found on this system")
+        return []
+    except Exception as e:
+        logging.debug("Error getting GPU info: %s", e)
         return []
 
 
```
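The timeout path follows the pattern recommended in the `subprocess` docs: kill the child on `TimeoutExpired`, then call `communicate()` once more to reap it and flush the pipes. The pattern in isolation, with a stand-in command:

```python
import subprocess

proc = subprocess.Popen(
    ["sleep", "60"],                 # stand-in for a slow nvidia-smi call
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
try:
    stdout, stderr = proc.communicate(timeout=1)
except subprocess.TimeoutExpired:
    proc.kill()
    proc.communicate()               # reaps the child and avoids a zombie
```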
```diff
@@ -241,11 +325,29 @@ def is_docker_running() -> bool:
     Returns:
         bool: True if Docker containers are running
     """
-    command = "docker ps"
-
-    subprocess.
-
-
+    command = ["docker", "ps"]
+    try:
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+            timeout=10,
+        )
+        if result.returncode != 0:
+            logging.warning("docker ps command failed")
+            return False
+        docker_images = result.stdout.decode("ascii").split("\n")[:-1][1:]
+        return bool(docker_images)
+    except subprocess.TimeoutExpired:
+        logging.warning("docker ps command timed out")
+        return False
+    except FileNotFoundError:
+        logging.warning("docker command not found")
+        return False
+    except Exception as e:
+        logging.warning("Error checking if docker is running: %s", e)
+        return False
 
 
 @log_errors(default_return=None, raise_exception=False)
```
```diff
@@ -502,7 +604,7 @@ def is_allowed_gpu_device(gpu_index: int) -> bool:
     return int(gpu_index) in allowed_gpus
 
 
-@log_errors(raise_exception=True)
+@log_errors(raise_exception=True, log_error=False)
 def get_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
@@ -519,16 +621,38 @@ def get_gpu_with_sufficient_memory_for_action(
         ValueError: If insufficient GPU memory
     """
     required_gpu_memory = get_required_gpu_memory(action_details)
-    command = "nvidia-smi --query-gpu=memory.free --format=csv"
+    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+        memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.
+        logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+    except FileNotFoundError:
+        raise ValueError("nvidia-smi not found - no GPU support available")
+    except Exception as e:
+        logging.warning("Error running nvidia-smi: %s", e)
+        raise ValueError(f"Failed to get GPU information: {e}")
 
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
-
+
+    try:
+        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+    except (ValueError, IndexError) as e:
+        raise ValueError(f"Error parsing GPU memory information: {e}")
+
+    if not memory_free_values:
+        raise ValueError("No GPU devices found")
+
     if required_gpu_memory < 80000:
         try:
             return get_single_gpu_with_sufficient_memory_for_action(action_details)
```
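Because this query uses `--format=csv` without `noheader` or `nounits`, the first line is a header and each value carries a unit suffix, which is why the parser skips `memory_free_info[0]` and takes `split()[0]`. A parsing sketch over canned output:

```python
sample = "memory.free [MiB]\n16000 MiB\n4000 MiB\n"   # canned nvidia-smi csv output

memory_free_info = sample.strip().split("\n")
memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
print(memory_free_values)   # [16000, 4000]
```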
```diff
@@ -546,11 +670,11 @@ def get_gpu_with_sufficient_memory_for_action(
     if total_memory >= required_gpu_memory:
         return selected_gpus
     raise ValueError(
-        f"Insufficient GPU memory available. Required: {required_gpu_memory}, Available: {total_memory}"
+        f"Insufficient GPU memory available. Required: {required_gpu_memory}MB, Available: {total_memory}MB"
     )
 
 
-@log_errors(raise_exception=True)
+@log_errors(raise_exception=True, log_error=False)
 def get_single_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
@@ -567,16 +691,38 @@ def get_single_gpu_with_sufficient_memory_for_action(
         ValueError: If no GPU has sufficient memory
     """
     required_gpu_memory = get_required_gpu_memory(action_details)
-    command = "nvidia-smi --query-gpu=memory.free --format=csv"
+    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
-
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode != 0:
+            raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+        memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.
+        logging.warning("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+    except FileNotFoundError:
+        raise ValueError("nvidia-smi not found - no GPU support available")
+    except Exception as e:
+        logging.warning("Error running nvidia-smi: %s", e)
+        raise ValueError(f"Failed to get GPU information: {e}")
 
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
-
+
+    try:
+        memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+    except (ValueError, IndexError) as e:
+        raise ValueError(f"Error parsing GPU memory information: {e}")
+
+    if not memory_free_values:
+        raise ValueError("No GPU devices found")
+
     best_fit_gpu = None
     best_fit_memory = float("inf")
     for i, mem in enumerate(memory_free_values):
```
```diff
@@ -692,47 +838,112 @@ def get_encrypted_access_key_pair(
 
     return encoded_access_key, encoded_secret_key
 
-
-def check_public_port_exposure(port: int) -> bool:
+def _get_private_ip() -> str:
     """
-
+    Get the actual private/LAN IP address using UDP socket trick.
+    This works reliably even in Docker, NAT, VPN, etc.
+
+    Returns:
+        str: Private IP address or None if not available
+    """
+    try:
+        # Use UDP socket to determine which interface would be used for external connection
+        # No actual packets are sent
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))
+            private_ip = s.getsockname()[0]
+            return private_ip
+    except Exception:
+        return None
 
-    Args:
-        port (int): Port number to check
 
-
-
-
-
-    is_locally_available = False
-    # Check if port is publicly accessible
-    public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn_sock:
-        conn_sock.settimeout(3)
-        result = conn_sock.connect_ex((public_ip, port))
-        is_public_exposed = result == 0
+def _public_ip_is_local(public_ip: str) -> bool:
+    """
+    Check if a public IP address is actually assigned to a local network interface.
+    This is true on cloud servers with real public IPs, false behind NAT.
 
-
-
-
-
-
-
-
-
-
-
-
-
-    logging.debug(
-        "Port %d is not publicly exposed",
-        port,
-    )
+    Args:
+        public_ip (str): The public IP to check
+
+    Returns:
+        bool: True if the public IP is on a local interface
+    """
+    try:
+        for iface, addrs in psutil.net_if_addrs().items():
+            for addr in addrs:
+                if addr.family == socket.AF_INET:
+                    if addr.address == public_ip:
+                        return True
         return False
-
-    logging.debug(
-        "Port %d is not locally available",
-        port,
-    )
+    except Exception:
         return False
-
+
+
+@log_errors(default_return=("localhost", True), raise_exception=False)
+def get_best_service_ip_and_network(port: int) -> tuple:
+    """
+    Determine the best IP address and network configuration for a service.
+
+    This function intelligently selects the best IP to bind a service to:
+
+    Priority:
+    1. Public IP if it's actually on a local interface (cloud servers)
+    2. Private/LAN IP (NAT, local network, Docker)
+    3. localhost with --net=host (fallback)
+
+    Args:
+        port (int): Port number for the service
+
+    Returns:
+        tuple: (ip_address, use_host_network) where:
+            - ip_address: The IP address to use (public, private, or localhost)
+            - use_host_network: True if should use --net=host, False if should use port mapping
+    """
+    try:
+        # Check if port is available (not already in use)
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
+                test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                test_sock.bind(("0.0.0.0", port))
+                test_sock.listen(1)
+                # Port is available - socket closes automatically
+        except OSError as e:
+            logging.warning(f"Port {port} is already in use or cannot be bound: {e}, will use --net=host")
+            return "localhost", True
+
+        # Get the actual private/LAN IP
+        private_ip = _get_private_ip()
+        if private_ip:
+            logging.info(f"Determined private/LAN IP: {private_ip}")
+        else:
+            logging.debug("Could not determine private IP")
+
+        # Try to get public IP from external service
+        public_ip = None
+        try:
+            public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8").strip()
+            # Validate it's a proper IP address
+            socket.inet_aton(public_ip)
+            logging.info(f"Determined external/public IP: {public_ip}")
+        except Exception as e:
+            logging.debug(f"Could not determine public IP: {e}")
+
+        # Decision logic: Choose the best IP
+
+        # 1. If public IP is on a local interface, use it (cloud server with real public IP)
+        if public_ip and _public_ip_is_local(public_ip):
+            logging.info(f"Public IP {public_ip} is on local interface, using it for port {port}")
+            return public_ip, False
+
+        # 2. If we have a valid private IP, use it (most common case: NAT, LAN, Docker)
+        if private_ip and not private_ip.startswith("127."):
+            logging.info(f"Using private/LAN IP {private_ip} for port {port}")
+            return private_ip, False
+
+        # 3. Fall back to localhost with --net=host
+        logging.info(f"No suitable IP found, using localhost with --net=host for port {port}")
+        return "localhost", True
+
+    except Exception as e:
+        logging.warning(f"Error determining best IP for port {port}: {e}, falling back to localhost")
+        return "localhost", True
```
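The core of `_get_private_ip` is the classic UDP-connect trick: `connect()` on a datagram socket transmits nothing, but it forces the kernel to pick the outbound interface, whose address `getsockname()` then reports. Demonstrated on its own:

```python
import socket

with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.connect(("8.8.8.8", 80))   # any routable address works; no packet is sent
    print(s.getsockname()[0])    # e.g. 192.168.1.23 on a typical LAN
```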
src/matrice_compute/resources_tracker.py

```diff
@@ -56,7 +56,7 @@ class ResourcesTracker:
             return cpu_utilization, memory_utilization
         return 0, 0
 
-    @log_errors(default_return=(0, 0), raise_exception=False)
+    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
         """
         Get CPU and memory usage for a specific container by its ID.
```
```diff
@@ -67,32 +67,46 @@ class ResourcesTracker:
         Returns:
             Tuple[float, float]: CPU utilization percentage and memory usage in MB.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            stats_result = subprocess.run(
+                [
+                    "docker",
+                    "stats",
+                    "--no-stream",
+                    "--format",
+                    "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if stats_result.returncode != 0:
+                logging.debug("docker stats command failed for container %s", container_id)
+                return 0, 0
+            stats = stats_result.stdout.strip().split(": ")[1].split(", ")
+            cpu_usage = float(stats[0].replace("% CPU", "").strip())
+            memory_usage = stats[1].split(" / ")[0]
+            mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
+            if mem_unit == "KiB":
+                memory_usage_mb = float(mem_value) / 1024
+            elif mem_unit == "MiB":
+                memory_usage_mb = float(mem_value)
+            elif mem_unit == "GiB":
+                memory_usage_mb = float(mem_value) * 1024
+            else:
+                memory_usage_mb = float(mem_value)
+            return cpu_usage, memory_usage_mb
+        except subprocess.TimeoutExpired:
+            logging.debug("docker stats command timed out for container %s", container_id)
+            return 0, 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing docker stats for container %s: %s", container_id, e)
+            return 0, 0
+        except Exception as e:
+            logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
+            return 0, 0
 
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
```
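The string surgery on `docker stats` output is clearer against a canned line in the `--format` used above (`{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM`). A parsing sketch with hypothetical numbers:

```python
line = "3f2c1a: 12.34% CPU, 512MiB / 8GiB RAM"   # canned docker stats output

stats = line.split(": ")[1].split(", ")
cpu_usage = float(stats[0].replace("% CPU", "").strip())   # 12.34
memory_usage = stats[1].split(" / ")[0]                    # "512MiB"
mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
to_mib = {"KiB": 1 / 1024, "MiB": 1.0, "GiB": 1024.0}
print(cpu_usage, float(mem_value) * to_mib.get(mem_unit, 1.0))   # 12.34 512.0
```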
```diff
@@ -110,7 +124,7 @@ class ResourcesTracker:
         gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
         return gpu_util, gpu_mem_used
 
-    @log_errors(default_return="", raise_exception=False)
+    @log_errors(default_return="", raise_exception=False, log_error=False)
     def get_pid_id_by_container_id(self, container_id: str) -> str:
         """
         Get PID for a container ID.
```
```diff
@@ -121,20 +135,31 @@ class ResourcesTracker:
         Returns:
             str: PID of the container.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            pid_result = subprocess.run(
+                [
+                    "docker",
+                    "inspect",
+                    "--format",
+                    "{{.State.Pid}}",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if pid_result.returncode != 0:
+                logging.debug("docker inspect command failed for container %s", container_id)
+                return ""
+            container_pid = pid_result.stdout.strip()
+            return container_pid
+        except subprocess.TimeoutExpired:
+            logging.debug("docker inspect command timed out for container %s", container_id)
+            return ""
+        except Exception as e:
+            logging.debug("Error getting PID for container %s: %s", container_id, e)
+            return ""
 
     @log_errors(default_return=0, raise_exception=False, log_error=False)
     def get_container_gpu_usage(self, container_pid: str) -> float:
```
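A usage sketch for the new PID lookup, with a hypothetical container ID; `{{.State.Pid}}` asks `docker inspect` for the container's init process PID on the host:

```python
import subprocess

cid = "3f2c1a9b8d7e"   # hypothetical container ID
out = subprocess.run(
    ["docker", "inspect", "--format", "{{.State.Pid}}", cid],
    capture_output=True, text=True, check=False, timeout=10,
)
pid = out.stdout.strip() if out.returncode == 0 else ""
print(pid or "container not found")
```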
```diff
@@ -155,9 +180,12 @@ class ResourcesTracker:
                 ["nvidia-smi", "pmon", "-c", "1"],
                 capture_output=True,
                 text=True,
-                check=
+                check=False,
                 timeout=5,
             )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi pmon command failed in get_container_gpu_usage")
+                return 0
             pmon_output = result.stdout.strip().split("\n")
             for line in pmon_output[2:]:
                 parts = line.split()
@@ -167,7 +195,16 @@ class ResourcesTracker:
                     if pid == str(container_pid):
                         gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
             return 0
         return gpu_util
```
```diff
@@ -196,9 +233,12 @@ class ResourcesTracker:
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
-            check=
+            check=False,
             timeout=5,
         )
+        if result.returncode != 0:
+            logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
+            return 0
         for line in result.stdout.splitlines():
             parts = line.strip().split(", ")
             if len(parts) == 2:
@@ -206,7 +246,16 @@ class ResourcesTracker:
                     if process_pid == str(container_pid):
                         total_memory += int(used_memory)
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU memory usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
             return 0
         return total_memory
 
```
```diff
@@ -238,17 +287,40 @@ class ResourcesTracker:
             return gpu_memory_free, gpu_utilization
 
         try:
-            subprocess.
+            result = subprocess.run(
+                ["nvidia-smi"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=5,
+                check=False,
+            )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi command failed in _get_gpu_resources")
+                return 0, 0.0
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+            return 0, 0.0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0, 0.0
+        except Exception as e:
+            logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
             return 0, 0.0
 
         info_list = get_gpu_info()
-
-
-
-
-
+        if not info_list:
+            return 0, 0.0
+
+        try:
+            for info in info_list:
+                info_split = info.split(", ")
+                if len(info_split) >= 6:
+                    gpu_memory_free += int(info_split[5])
+                    gpu_utilization += float(info_split[2])
+            gpu_utilization /= len(info_list) if info_list else 1
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU resources: %s", e)
+            return 0, 0.0
 
         return gpu_memory_free, gpu_utilization
 
```