matrice-compute 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/PKG-INFO +1 -1
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/action_instance.py +118 -103
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/instance_utils.py +46 -15
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/resources_tracker.py +59 -32
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/scaling.py +5 -2
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/task_utils.py +2 -2
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/LICENSE.txt +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/README.md +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/pyproject.toml +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/setup.cfg +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/setup.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/shutdown_manager.py +0 -0
|
@@ -575,75 +575,75 @@ class ActionInstance:
|
|
|
575
575
|
)
|
|
576
576
|
raise
|
|
577
577
|
|
|
578
|
-
@log_errors(raise_exception=False)
|
|
579
|
-
def create_redis_container(self, redis_image=None, redis_password=None):
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
578
|
+
# @log_errors(raise_exception=False)
|
|
579
|
+
# def create_redis_container(self, redis_image=None, redis_password=None):
|
|
580
|
+
# """Create and start a Redis container using Docker.
|
|
581
|
+
|
|
582
|
+
# Args:
|
|
583
|
+
# redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
|
|
584
|
+
|
|
585
|
+
# Returns:
|
|
586
|
+
# tuple: (container_info, error, message)
|
|
587
|
+
# """
|
|
588
|
+
# if redis_image is None:
|
|
589
|
+
# redis_image = "redis:latest"
|
|
590
|
+
|
|
591
|
+
# network_name = f"redis_network_{int(time.time())}"
|
|
592
|
+
# subprocess.run(f"docker network create {network_name}", shell=True, check=True)
|
|
593
|
+
|
|
594
|
+
# try:
|
|
595
|
+
# # Get an available port for Redis
|
|
596
|
+
# external_port = "6379"
|
|
597
|
+
|
|
598
|
+
# # Generate a unique container name and password
|
|
599
|
+
# container_name = f"redis_container_{int(time.time())}"
|
|
600
|
+
|
|
601
|
+
# # Build the docker command to create Redis container with password
|
|
602
|
+
# cmd = (
|
|
603
|
+
# f"docker run -d "
|
|
604
|
+
# f"--network {network_name} "
|
|
605
|
+
# f"--name {container_name} "
|
|
606
|
+
# f"-p {external_port}:6379 "
|
|
607
|
+
# f"--restart unless-stopped "
|
|
608
|
+
# f"{redis_image} "
|
|
609
|
+
# f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
|
|
610
|
+
# )
|
|
611
|
+
|
|
612
|
+
# logging.info("Creating Redis container with command: %s", cmd)
|
|
613
|
+
|
|
614
|
+
# # Execute the command
|
|
615
|
+
# result = subprocess.run(
|
|
616
|
+
# cmd, shell=True, capture_output=True, text=True, timeout=60
|
|
617
|
+
# )
|
|
618
|
+
|
|
619
|
+
# if result.returncode == 0:
|
|
620
|
+
# container_id = result.stdout.strip()
|
|
621
|
+
# container_info = {
|
|
622
|
+
# "container_id": container_id,
|
|
623
|
+
# "container_name": container_name,
|
|
624
|
+
# "network_name": network_name,
|
|
625
|
+
# "external_port": external_port,
|
|
626
|
+
# "internal_port": 6379,
|
|
627
|
+
# "password": redis_password,
|
|
628
|
+
# "image": redis_image,
|
|
629
|
+
# "status": "running",
|
|
630
|
+
# }
|
|
631
|
+
|
|
632
|
+
# logging.info("Redis container created successfully: %s", container_info)
|
|
633
|
+
# return container_info, None, "Redis container created successfully"
|
|
634
|
+
# else:
|
|
635
|
+
# error_message = f"Failed to create Redis container: {result.stderr}"
|
|
636
|
+
# logging.error(error_message)
|
|
637
|
+
# return None, "ContainerCreationError", error_message
|
|
638
|
+
|
|
639
|
+
# except subprocess.TimeoutExpired:
|
|
640
|
+
# error_message = "Timeout while creating Redis container"
|
|
641
|
+
# logging.error(error_message)
|
|
642
|
+
# return None, "TimeoutError", error_message
|
|
643
|
+
# except Exception as e:
|
|
644
|
+
# error_message = f"Unexpected error creating Redis container: {str(e)}"
|
|
645
|
+
# logging.error(error_message)
|
|
646
|
+
# return None, "UnexpectedError", error_message
|
|
647
647
|
|
|
648
648
|
@log_errors(raise_exception=False, log_error=False)
|
|
649
649
|
def send_logs_continuously(self):
|
|
@@ -1053,7 +1053,7 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1053
1053
|
f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
|
|
1054
1054
|
f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
|
|
1055
1055
|
f"-e PROJECT_ID={project_id} "
|
|
1056
|
-
f
|
|
1056
|
+
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1057
1057
|
f"{image} "
|
|
1058
1058
|
)
|
|
1059
1059
|
print("Docker command", cmd)
|
|
@@ -1117,13 +1117,14 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1117
1117
|
|
|
1118
1118
|
# Add worker container run command
|
|
1119
1119
|
worker_cmd = (
|
|
1120
|
-
f"docker run -d --pull=always "
|
|
1120
|
+
f"docker run -d --net=host --pull=always "
|
|
1121
1121
|
f"--name lpr-worker "
|
|
1122
1122
|
f"-p {external_port}:8082 "
|
|
1123
1123
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1124
1124
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1125
1125
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1126
1126
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1127
|
+
f'-e PORT={external_port} '
|
|
1127
1128
|
f"{image}"
|
|
1128
1129
|
)
|
|
1129
1130
|
print("Worker docker run command:", worker_cmd)
|
|
@@ -1134,27 +1135,31 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1134
1135
|
@log_errors(raise_exception=False)
|
|
1135
1136
|
def inference_ws_server_execute(self: ActionInstance):
|
|
1136
1137
|
"""
|
|
1137
|
-
Creates and start inference
|
|
1138
|
+
Creates and start inference pipeline.
|
|
1139
|
+
Inference WebSocket server runs on port 8102 (localhost only with --net=host).
|
|
1138
1140
|
"""
|
|
1139
1141
|
action_details = self.get_action_details()
|
|
1140
1142
|
|
|
1141
1143
|
if not action_details:
|
|
1142
1144
|
return
|
|
1143
1145
|
image = action_details["actionDetails"].get("docker")
|
|
1146
|
+
|
|
1144
1147
|
|
|
1145
1148
|
self.setup_action_requirements(action_details)
|
|
1146
1149
|
|
|
1147
|
-
#
|
|
1150
|
+
# Inference WebSocket server with --net=host (Port: 8102)
|
|
1148
1151
|
worker_cmd = (
|
|
1149
|
-
f"docker run -d --pull=always "
|
|
1152
|
+
f"docker run -d --pull=always --net=host "
|
|
1150
1153
|
f"--name inference "
|
|
1151
|
-
f"-p 8102:8102 "
|
|
1152
1154
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1153
1155
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1154
1156
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1155
|
-
f"{image}"
|
|
1157
|
+
f"{image} "
|
|
1158
|
+
f"./app "
|
|
1159
|
+
f"{self.action_record_id} "
|
|
1160
|
+
|
|
1156
1161
|
)
|
|
1157
|
-
|
|
1162
|
+
logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
|
|
1158
1163
|
|
|
1159
1164
|
# Docker Command run
|
|
1160
1165
|
self.start(worker_cmd, "inference_ws_server")
|
|
@@ -1163,7 +1168,8 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1163
1168
|
@log_errors(raise_exception=False)
|
|
1164
1169
|
def fe_fs_streaming_execute(self: ActionInstance):
|
|
1165
1170
|
"""
|
|
1166
|
-
Creates and setup the frontend for fs streaming
|
|
1171
|
+
Creates and setup the frontend for fs streaming.
|
|
1172
|
+
Frontend streaming runs on port 3000 (localhost only with --net=host).
|
|
1167
1173
|
"""
|
|
1168
1174
|
action_details = self.get_action_details()
|
|
1169
1175
|
|
|
@@ -1173,17 +1179,16 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1173
1179
|
|
|
1174
1180
|
self.setup_action_requirements(action_details)
|
|
1175
1181
|
|
|
1176
|
-
#
|
|
1182
|
+
# Frontend streaming with --net=host (Port: 3000)
|
|
1177
1183
|
worker_cmd = (
|
|
1178
|
-
f"docker run -d --pull=always "
|
|
1184
|
+
f"docker run -d --pull=always --net=host "
|
|
1179
1185
|
f"--name fe_streaming "
|
|
1180
|
-
f"-p 3000:3000 "
|
|
1181
1186
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1182
1187
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1183
1188
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1184
1189
|
f"{image}"
|
|
1185
1190
|
)
|
|
1186
|
-
|
|
1191
|
+
logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
|
|
1187
1192
|
|
|
1188
1193
|
# Docker Command run
|
|
1189
1194
|
self.start(worker_cmd, "fe_fs_streaming")
|
|
@@ -1235,8 +1240,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
|
|
|
1235
1240
|
def redis_setup_execute(self: ActionInstance):
|
|
1236
1241
|
"""
|
|
1237
1242
|
Creates and starts a Redis container using Docker.
|
|
1243
|
+
Redis runs on port 6379 (localhost only with --net=host).
|
|
1238
1244
|
"""
|
|
1239
|
-
external_port = self.scaling.get_open_port()
|
|
1240
1245
|
work_fs = get_max_file_system()
|
|
1241
1246
|
|
|
1242
1247
|
action_details = self.get_action_details()
|
|
@@ -1248,18 +1253,6 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1248
1253
|
"password", f"redis_pass_{int(time.time())}"
|
|
1249
1254
|
)
|
|
1250
1255
|
|
|
1251
|
-
container_info, error, message = self.create_redis_container(
|
|
1252
|
-
action_details["actionDetails"].get("redis_image", "redis:latest"),
|
|
1253
|
-
redis_password=redis_password,
|
|
1254
|
-
)
|
|
1255
|
-
if error:
|
|
1256
|
-
logging.error(
|
|
1257
|
-
"Error creating Redis container: %s",
|
|
1258
|
-
message,
|
|
1259
|
-
)
|
|
1260
|
-
return
|
|
1261
|
-
logging.info("Redis container created successfully: %s", container_info)
|
|
1262
|
-
|
|
1263
1256
|
# Initialize redis container
|
|
1264
1257
|
self.setup_action_requirements(
|
|
1265
1258
|
action_details,
|
|
@@ -1268,17 +1261,39 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1268
1261
|
action_id=action_id,
|
|
1269
1262
|
)
|
|
1270
1263
|
|
|
1264
|
+
redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
|
|
1265
|
+
|
|
1266
|
+
# Redis container with --net=host (Port: 6379)
|
|
1267
|
+
redis_cmd = (
|
|
1268
|
+
f"docker run -d --net=host "
|
|
1269
|
+
f"--name redis_container_{int(time.time())} "
|
|
1270
|
+
f"--restart unless-stopped "
|
|
1271
|
+
f"{redis_image} "
|
|
1272
|
+
f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
|
|
1273
|
+
)
|
|
1274
|
+
|
|
1275
|
+
logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
|
|
1276
|
+
|
|
1277
|
+
# Start Redis container first
|
|
1278
|
+
redis_process = subprocess.Popen(
|
|
1279
|
+
redis_cmd,
|
|
1280
|
+
shell=True,
|
|
1281
|
+
stdout=subprocess.PIPE,
|
|
1282
|
+
stderr=subprocess.PIPE,
|
|
1283
|
+
)
|
|
1284
|
+
logging.info("Redis container started successfully on localhost:6379")
|
|
1285
|
+
|
|
1286
|
+
# Wait for Redis to be ready
|
|
1287
|
+
time.sleep(5)
|
|
1288
|
+
|
|
1271
1289
|
env_vars = {
|
|
1272
|
-
"REDIS_URL": f"
|
|
1273
|
-
"REDIS_PASSWORD":
|
|
1290
|
+
"REDIS_URL": f"localhost:6379",
|
|
1291
|
+
"REDIS_PASSWORD": redis_password,
|
|
1274
1292
|
}
|
|
1275
1293
|
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
# Make the docker file here
|
|
1294
|
+
# bg-redis management container with --net=host (Port: 8082)
|
|
1279
1295
|
cmd = (
|
|
1280
|
-
f"docker run "
|
|
1281
|
-
f"{network_config} "
|
|
1296
|
+
f"docker run --net=host "
|
|
1282
1297
|
f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
|
|
1283
1298
|
f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
|
|
1284
1299
|
f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
|
|
@@ -1290,7 +1305,7 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1290
1305
|
f"{self.action_record_id} "
|
|
1291
1306
|
)
|
|
1292
1307
|
|
|
1293
|
-
logging.info("
|
|
1308
|
+
logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
|
|
1294
1309
|
|
|
1295
1310
|
self.start(cmd, "redis_setup")
|
|
1296
1311
|
|
|
@@ -128,8 +128,12 @@ def has_gpu() -> bool:
|
|
|
128
128
|
Returns:
|
|
129
129
|
bool: True if GPU is present, False otherwise
|
|
130
130
|
"""
|
|
131
|
-
|
|
132
|
-
|
|
131
|
+
try:
|
|
132
|
+
subprocess.run("nvidia-smi", timeout=5)
|
|
133
|
+
return True
|
|
134
|
+
except subprocess.TimeoutExpired:
|
|
135
|
+
logging.warning("nvidia-smi command timed out after 5 seconds")
|
|
136
|
+
return False
|
|
133
137
|
|
|
134
138
|
|
|
135
139
|
@log_errors(default_return=0, raise_exception=False)
|
|
@@ -141,13 +145,17 @@ def get_gpu_memory_usage() -> float:
|
|
|
141
145
|
float: Memory usage between 0 and 1
|
|
142
146
|
"""
|
|
143
147
|
command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
148
|
+
try:
|
|
149
|
+
output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
|
|
150
|
+
memory_percentages = []
|
|
151
|
+
for line in output:
|
|
152
|
+
used, total = map(int, line.split(","))
|
|
153
|
+
usage_percentage = used / total
|
|
154
|
+
memory_percentages.append(usage_percentage)
|
|
155
|
+
return min(memory_percentages)
|
|
156
|
+
except subprocess.TimeoutExpired:
|
|
157
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
|
|
158
|
+
return 0
|
|
151
159
|
|
|
152
160
|
|
|
153
161
|
@log_errors(default_return=0, raise_exception=False)
|
|
@@ -194,17 +202,24 @@ def get_gpu_info() -> list:
|
|
|
194
202
|
Returns:
|
|
195
203
|
list: GPU information strings
|
|
196
204
|
"""
|
|
197
|
-
|
|
205
|
+
proc = subprocess.Popen(
|
|
198
206
|
[
|
|
199
207
|
"nvidia-smi",
|
|
200
208
|
"--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
|
|
201
209
|
"--format=csv,noheader,nounits",
|
|
202
210
|
],
|
|
203
211
|
stdout=subprocess.PIPE,
|
|
204
|
-
|
|
205
|
-
|
|
212
|
+
stderr=subprocess.PIPE,
|
|
213
|
+
)
|
|
214
|
+
try:
|
|
215
|
+
stdout, stderr = proc.communicate(timeout=5)
|
|
206
216
|
output = stdout.decode("UTF-8")
|
|
207
217
|
return output.split("\n")[:-1]
|
|
218
|
+
except subprocess.TimeoutExpired:
|
|
219
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
|
|
220
|
+
proc.kill()
|
|
221
|
+
proc.communicate() # flush output after kill
|
|
222
|
+
return []
|
|
208
223
|
|
|
209
224
|
|
|
210
225
|
@log_errors(default_return="", raise_exception=False)
|
|
@@ -366,7 +381,13 @@ def get_max_file_system() -> str:
|
|
|
366
381
|
max_available_filesystem,
|
|
367
382
|
max_disk["available"],
|
|
368
383
|
)
|
|
369
|
-
if
|
|
384
|
+
# Check if filesystem is writable, or if it's root/empty
|
|
385
|
+
if max_available_filesystem in ["/", ""] or not os.access(max_available_filesystem, os.W_OK):
|
|
386
|
+
if max_available_filesystem not in ["/", ""]:
|
|
387
|
+
logging.warning(
|
|
388
|
+
"Filesystem %s is not writable, falling back to home directory",
|
|
389
|
+
max_available_filesystem,
|
|
390
|
+
)
|
|
370
391
|
home_dir = os.path.expanduser("~")
|
|
371
392
|
if not os.environ.get("WORKSPACE_DIR"):
|
|
372
393
|
logging.error("WORKSPACE_DIR environment variable not set")
|
|
@@ -499,7 +520,12 @@ def get_gpu_with_sufficient_memory_for_action(
|
|
|
499
520
|
"""
|
|
500
521
|
required_gpu_memory = get_required_gpu_memory(action_details)
|
|
501
522
|
command = "nvidia-smi --query-gpu=memory.free --format=csv"
|
|
502
|
-
|
|
523
|
+
try:
|
|
524
|
+
memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
|
|
525
|
+
except subprocess.TimeoutExpired:
|
|
526
|
+
logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
|
|
527
|
+
raise ValueError("Failed to get GPU information - nvidia-smi timed out")
|
|
528
|
+
|
|
503
529
|
if len(memory_free_info) < 2:
|
|
504
530
|
raise ValueError("No GPU information available from nvidia-smi")
|
|
505
531
|
memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
|
|
@@ -542,7 +568,12 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
542
568
|
"""
|
|
543
569
|
required_gpu_memory = get_required_gpu_memory(action_details)
|
|
544
570
|
command = "nvidia-smi --query-gpu=memory.free --format=csv"
|
|
545
|
-
|
|
571
|
+
try:
|
|
572
|
+
memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
|
|
573
|
+
except subprocess.TimeoutExpired:
|
|
574
|
+
logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
|
|
575
|
+
raise ValueError("Failed to get GPU information - nvidia-smi timed out")
|
|
576
|
+
|
|
546
577
|
if len(memory_free_info) < 2:
|
|
547
578
|
raise ValueError("No GPU information available from nvidia-smi")
|
|
548
579
|
memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
|
|
@@ -150,20 +150,25 @@ class ResourcesTracker:
|
|
|
150
150
|
if not has_gpu():
|
|
151
151
|
return 0
|
|
152
152
|
gpu_util = 0
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
153
|
+
try:
|
|
154
|
+
result = subprocess.run(
|
|
155
|
+
["nvidia-smi", "pmon", "-c", "1"],
|
|
156
|
+
capture_output=True,
|
|
157
|
+
text=True,
|
|
158
|
+
check=True,
|
|
159
|
+
timeout=5,
|
|
160
|
+
)
|
|
161
|
+
pmon_output = result.stdout.strip().split("\n")
|
|
162
|
+
for line in pmon_output[2:]:
|
|
163
|
+
parts = line.split()
|
|
164
|
+
if len(parts) >= 8:
|
|
165
|
+
pid = parts[1]
|
|
166
|
+
gpu_usage = parts[3]
|
|
167
|
+
if pid == str(container_pid):
|
|
168
|
+
gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
|
|
169
|
+
except subprocess.TimeoutExpired:
|
|
170
|
+
logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
|
|
171
|
+
return 0
|
|
167
172
|
return gpu_util
|
|
168
173
|
|
|
169
174
|
@log_errors(default_return=0, raise_exception=False, log_error=False)
|
|
@@ -185,19 +190,24 @@ class ResourcesTracker:
|
|
|
185
190
|
"--format=csv,noheader,nounits",
|
|
186
191
|
]
|
|
187
192
|
total_memory = 0
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
if
|
|
200
|
-
|
|
193
|
+
try:
|
|
194
|
+
result = subprocess.run(
|
|
195
|
+
cmd,
|
|
196
|
+
stdout=subprocess.PIPE,
|
|
197
|
+
stderr=subprocess.PIPE,
|
|
198
|
+
text=True,
|
|
199
|
+
check=True,
|
|
200
|
+
timeout=5,
|
|
201
|
+
)
|
|
202
|
+
for line in result.stdout.splitlines():
|
|
203
|
+
parts = line.strip().split(", ")
|
|
204
|
+
if len(parts) == 2:
|
|
205
|
+
process_pid, used_memory = parts
|
|
206
|
+
if process_pid == str(container_pid):
|
|
207
|
+
total_memory += int(used_memory)
|
|
208
|
+
except subprocess.TimeoutExpired:
|
|
209
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
|
|
210
|
+
return 0
|
|
201
211
|
return total_memory
|
|
202
212
|
|
|
203
213
|
@log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
|
|
@@ -227,7 +237,12 @@ class ResourcesTracker:
|
|
|
227
237
|
if not has_gpu():
|
|
228
238
|
return gpu_memory_free, gpu_utilization
|
|
229
239
|
|
|
230
|
-
|
|
240
|
+
try:
|
|
241
|
+
subprocess.check_output("nvidia-smi", timeout=5)
|
|
242
|
+
except subprocess.TimeoutExpired:
|
|
243
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
|
|
244
|
+
return 0, 0.0
|
|
245
|
+
|
|
231
246
|
info_list = get_gpu_info()
|
|
232
247
|
for info in info_list:
|
|
233
248
|
info_split = info.split(", ")
|
|
@@ -247,10 +262,14 @@ class ActionsResourcesTracker:
|
|
|
247
262
|
self.max_actions_usage = {}
|
|
248
263
|
self.resources_tracker = ResourcesTracker()
|
|
249
264
|
self.client = docker.from_env()
|
|
265
|
+
self.logged_stopped_containers = []
|
|
250
266
|
|
|
251
267
|
@log_errors(raise_exception=False, log_error=True)
|
|
252
268
|
def update_actions_resources(self) -> None:
|
|
253
|
-
"""Process both running and exited containers
|
|
269
|
+
"""Process both running and exited containers.
|
|
270
|
+
|
|
271
|
+
Note: Does not remove containers to keep logs. Only tracks resource usage.
|
|
272
|
+
"""
|
|
254
273
|
exited_containers = self.client.containers.list(
|
|
255
274
|
filters={"status": "exited"},
|
|
256
275
|
all=True,
|
|
@@ -259,8 +278,12 @@ class ActionsResourcesTracker:
|
|
|
259
278
|
if exited_containers:
|
|
260
279
|
for container in exited_containers:
|
|
261
280
|
try:
|
|
281
|
+
if container.id in self.logged_stopped_containers:
|
|
282
|
+
continue
|
|
262
283
|
self._update_container_action_status(container, "completed")
|
|
263
|
-
container.
|
|
284
|
+
self.logged_stopped_containers.append(container.id)
|
|
285
|
+
# COMMENTED OUT: Do not remove containers to keep logs
|
|
286
|
+
# container.remove()
|
|
264
287
|
except Exception as err:
|
|
265
288
|
logging.error(
|
|
266
289
|
"Error processing exited container %s: %s",
|
|
@@ -310,7 +333,7 @@ class ActionsResourcesTracker:
|
|
|
310
333
|
args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
|
|
311
334
|
action_record_id = args_24[-1] if args_24 else None
|
|
312
335
|
if not action_record_id:
|
|
313
|
-
logging.
|
|
336
|
+
logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
|
|
314
337
|
duration = calculate_time_difference(start_time, finish_time)
|
|
315
338
|
(
|
|
316
339
|
current_gpu_utilization,
|
|
@@ -320,6 +343,8 @@ class ActionsResourcesTracker:
|
|
|
320
343
|
) = self.get_current_action_usage(container, status)
|
|
321
344
|
sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
|
|
322
345
|
for sub_container in sub_containers:
|
|
346
|
+
if sub_container.id in self.logged_stopped_containers:
|
|
347
|
+
continue
|
|
323
348
|
(
|
|
324
349
|
sub_container_gpu_utilization,
|
|
325
350
|
sub_container_gpu_memory,
|
|
@@ -330,10 +355,12 @@ class ActionsResourcesTracker:
|
|
|
330
355
|
current_gpu_memory += sub_container_gpu_memory
|
|
331
356
|
current_cpu_utilization += sub_container_cpu_utilization
|
|
332
357
|
current_memory_utilization += sub_container_memory_utilization
|
|
358
|
+
# COMMENTED OUT: Do not stop/remove sub-containers to keep logs
|
|
333
359
|
if status == "completed":
|
|
334
360
|
try:
|
|
335
361
|
sub_container.stop()
|
|
336
|
-
sub_container.
|
|
362
|
+
self.logged_stopped_containers.append(sub_container.id)
|
|
363
|
+
# sub_container.remove(force=True)
|
|
337
364
|
except Exception as err:
|
|
338
365
|
logging.error(
|
|
339
366
|
"Error removing sub-container %s: %s",
|
|
@@ -9,6 +9,7 @@ import json
|
|
|
9
9
|
import time
|
|
10
10
|
import base64
|
|
11
11
|
|
|
12
|
+
# TODO: update /scaling to /compute
|
|
12
13
|
|
|
13
14
|
class Scaling:
|
|
14
15
|
|
|
@@ -185,6 +186,8 @@ class Scaling:
|
|
|
185
186
|
Returns:
|
|
186
187
|
Tuple of (data, error, message) from API response
|
|
187
188
|
"""
|
|
189
|
+
if not action_record_id:
|
|
190
|
+
return None, "Action record id is required", "Action record id is required"
|
|
188
191
|
logging.info(
|
|
189
192
|
"Updating action status for action %s",
|
|
190
193
|
action_record_id,
|
|
@@ -499,7 +502,7 @@ class Scaling:
|
|
|
499
502
|
|
|
500
503
|
# Using REST API directly
|
|
501
504
|
try:
|
|
502
|
-
path = f"/v1/
|
|
505
|
+
path = f"/v1/compute/update_available_resources/{self.instance_id}"
|
|
503
506
|
resp = self.rpc.put(path=path, payload=payload)
|
|
504
507
|
return self.handle_response(
|
|
505
508
|
resp,
|
|
@@ -644,7 +647,7 @@ class Scaling:
|
|
|
644
647
|
Returns:
|
|
645
648
|
Tuple of (data, error, message) from API response
|
|
646
649
|
"""
|
|
647
|
-
path = f"/v1/
|
|
650
|
+
path = f"/v1/compute/get_models_secret_keys?secret_name={secret_name}"
|
|
648
651
|
resp = self.rpc.get(path=path)
|
|
649
652
|
return self.handle_response(
|
|
650
653
|
resp,
|
|
@@ -29,8 +29,8 @@ def setup_workspace_and_run_task(
|
|
|
29
29
|
workspace_dir = f"{work_fs}/{action_id}"
|
|
30
30
|
codebase_zip_path = f"{workspace_dir}/file.zip"
|
|
31
31
|
requirements_txt_path = f"{workspace_dir}/requirements.txt"
|
|
32
|
-
if os.path.exists(workspace_dir):
|
|
33
|
-
|
|
32
|
+
# if os.path.exists(workspace_dir): # don't skip if workspace already exists, override it
|
|
33
|
+
# return
|
|
34
34
|
os.makedirs(workspace_dir, exist_ok=True)
|
|
35
35
|
|
|
36
36
|
# Download codebase ZIP file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/actions_scaledown_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|