matrice-compute 0.1.37__py3-none-any.whl → 0.1.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +240 -114
- matrice_compute/resources_tracker.py +17 -4
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/RECORD +7 -7
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/top_level.txt +0 -0
matrice_compute/action_instance.py

@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""
 
+    # Class-level dictionary to track deployed services and their ports
+    # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
+    _deployed_services = {}
+
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.
 
@@ -85,6 +89,52 @@ class ActionInstance:
         raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]
 
+    @classmethod
+    def get_or_create_triton_ports(cls, service_id, scaling_instance):
+        """Get existing TRITON_PORTS for a service or create new ones.
+
+        Args:
+            service_id (str): Service ID (_idService)
+            scaling_instance: Scaling instance to get open ports
+
+        Returns:
+            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
+        """
+        if not service_id:
+            # No service_id, generate new ports
+            port1 = scaling_instance.get_open_port()
+            port2 = scaling_instance.get_open_port()
+            port3 = scaling_instance.get_open_port()
+            return f"{port1},{port2},{port3}"
+
+        # Check if ports already exist for this service
+        if service_id in cls._deployed_services:
+            triton_ports = cls._deployed_services[service_id]["triton_ports"]
+            logging.info(
+                "Reusing TRITON_PORTS for service %s: %s",
+                service_id,
+                triton_ports
+            )
+            return triton_ports
+
+        # First deployment: generate new ports and store them
+        port1 = scaling_instance.get_open_port()
+        port2 = scaling_instance.get_open_port()
+        port3 = scaling_instance.get_open_port()
+        triton_ports = f"{port1},{port2},{port3}"
+
+        # Store for future use
+        cls._deployed_services[service_id] = {
+            "triton_ports": triton_ports,
+        }
+
+        logging.info(
+            "First deployment for service %s - generated TRITON_PORTS: %s",
+            service_id,
+            triton_ports
+        )
+        return triton_ports
+
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
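Note: the new classmethod caches Triton ports per `_idService` in a class-level dict, so redeploying the same service reuses the same three ports instead of allocating new ones. A minimal sketch of that behavior; `StubScaling` is a hypothetical stand-in for the real `Scaling` client, used here only to make the example runnable:

```python
import itertools

class StubScaling:
    """Hypothetical stand-in for the Scaling client used by ActionInstance."""
    _ports = itertools.count(8001)

    def get_open_port(self) -> int:
        return next(self._ports)

scaling = StubScaling()
first = ActionInstance.get_or_create_triton_ports("svc-123", scaling)
again = ActionInstance.get_or_create_triton_ports("svc-123", scaling)
assert first == again        # cached per service, e.g. "8001,8002,8003"
fresh = ActionInstance.get_or_create_triton_ports(None, scaling)
assert fresh != first        # no service_id: fresh ports on every call
```

Since `_deployed_services` is an in-process class attribute, the cache does not survive a restart of the compute agent.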
@@ -346,6 +396,7 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
+        container_name: str = "",
     ):
         """Build base Docker command with common options.
 
@@ -360,6 +411,7 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
+            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
@@ -426,17 +478,19 @@ class ActionInstance:
 
         # if the service provider is local, then put --restart unless-stopped
         if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
-            use_restart_policy = "--restart
+            use_restart_policy = "--restart=unless-stopped "
         else:
             use_restart_policy = ""
 
+        # Build container name option if provided
+        name_option = f"--name {container_name}" if container_name else ""
+
         cmd_parts = [
-            f"docker run {use_gpu} {use_restart_policy} ",
+            f"docker run -d {use_gpu} {use_restart_policy} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
-            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
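Note: `get_base_docker_cmd` now always runs containers detached (`-d`), drops the `--cidfile` bookkeeping, and can stamp a deterministic `--name` onto the container. A minimal restatement of the option logic (standalone, outside the class):

```python
def build_name_option(container_name: str = "") -> str:
    # Emit --name only when a name was supplied; callers that pass nothing
    # keep the old anonymous-container behavior.
    return f"--name {container_name}" if container_name else ""

assert build_name_option("data_prep_66f1a2c3") == "--name data_prep_66f1a2c3"
assert build_name_option() == ""
```

The deterministic `{action_type}_{action_id}` names are what let the restart paths later in this diff run `docker restart <name>` without knowing a container ID.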
@@ -838,55 +892,50 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
 
-        with
-
-
-
-
-
-
+        # Run docker with -d flag to get container ID from stdout
+        process = subprocess.Popen(
+            shlex.split(self.cmd),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            env={**os.environ},
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-                    self.action_record_id,
-                    self.container_id,
-                )
-                break
-            except FileNotFoundError:
-                logging.warning(
-                    "CID file not found for action %s, attempt %d/%d",
-                    self.action_record_id,
-                    attempt + 1,
-                    max_retries,
-                )
-                time.sleep(retry_delay)
-            except Exception as e:
-                logging.error(
-                    "Error reading CID file for action %s: %s",
-                    self.action_record_id,
-                    str(e),
-                )
-                time.sleep(retry_delay)
-        else:
+        # Use a longer timeout for docker run since --pull=always may need to
+        # download large images on first run. Default: 30 minutes (1800 seconds)
+        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
+        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
+        logging.info(
+            "Waiting for docker container to start for action %s (timeout: %d seconds)",
+            self.action_record_id,
+            docker_start_timeout,
+        )
+        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+
+        if process.returncode != 0:
             logging.error(
-                "
+                "Docker run failed for action %s: %s",
                 self.action_record_id,
-
+                stderr,
             )
-            raise
+            raise RuntimeError(f"Docker run failed: {stderr}")
 
-
+        self.container_id = stdout.strip()
+        logging.info(
+            "Started container for action %s with ID: %s",
+            self.action_record_id,
+            self.container_id,
+        )
+
+        # Start following container logs in background
+        self.process = subprocess.Popen(
+            ["docker", "logs", "-f", self.container_id],
+            stdout=open(self.log_path, "wb"),
+            stderr=subprocess.STDOUT,
+            start_new_session=True,
+        )
+
+        # Report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
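Note: the old CID-file polling loop is gone. With `docker run -d`, the container ID arrives on stdout, so the code reads it directly and then tails the container with `docker logs -f`. A standalone sketch of the same pattern (image, name, and log path are placeholders):

```python
import shlex
import subprocess

cmd = "docker run -d --name demo_sleep alpine:3 sleep 60"  # placeholder
proc = subprocess.Popen(
    shlex.split(cmd),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
stdout, stderr = proc.communicate(timeout=1800)
if proc.returncode != 0:
    raise RuntimeError(f"Docker run failed: {stderr}")

container_id = stdout.strip()  # `docker run -d` prints the new container's ID

# Follow the container's logs into a file, detached from this process group.
log_file = open("demo_sleep.log", "wb")
subprocess.Popen(
    ["docker", "logs", "-f", container_id],
    stdout=log_file,
    stderr=subprocess.STDOUT,
    start_new_session=True,
)
```

One consequence of combining `-d` with `--pull=always` is that image pulls now happen inside the timed `communicate()` call, which is why the timeout is configurable and defaults to 30 minutes.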
@@ -1052,7 +1101,8 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
+    container_name = f"data_prep_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")
 
@@ -1081,7 +1131,8 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
+    container_name = f"data_processing_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")
 
@@ -1094,7 +1145,8 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
+    container_name = f"data_split_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")
 
@@ -1109,7 +1161,8 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"dataset_annotation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")
 
@@ -1124,7 +1177,8 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"dataset_augmentation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")
 
@@ -1140,7 +1194,8 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"augmentation_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")
 
@@ -1161,32 +1216,34 @@ def database_setup_execute(self: ActionInstance):
 
     project_id = action_details["_idProject"]
 
+    # Define container names with action_record_id for uniqueness
+    mongodb_container_name = f"database_setup_{self.action_record_id}"
+    qdrant_container_name = f"qdrant_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for database setup: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
         cmd = "docker restart " + self.docker_container
-        self.start(cmd, "
+        self.start(cmd, "database_setup")
 
-        #qdrant restart
-        qdrant_cmd = "docker restart
-        self.start(qdrant_cmd,
+        # qdrant restart
+        qdrant_cmd = f"docker restart {qdrant_container_name}"
+        self.start(qdrant_cmd, "qdrant_setup")
 
         return
-
-
-    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
 
+    dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
 
     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
-        f"docker run --pull=always --net=host "
+        f"docker run -d --pull=always --net=host "
+        f"--name {mongodb_container_name} "
+        f"-v matrice_myvol:/matrice_data "
         f"-v {dbPath}:{dbPath} "
-        f"--name database_setup_{self.action_record_id} "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
-        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1196,6 +1253,23 @@ def database_setup_execute(self: ActionInstance):
     )
     logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
 
+    # Qdrant container with --net=host (Port: 6334)
+    qdrant_cmd = (
+        f"docker run -d --pull=always --net=host "
+        f"--name {qdrant_container_name} "
+        f"-v matrice_myvol:/matrice_data "
+        f"qdrant/qdrant:latest "
+    )
+    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
+
+    # Start Qdrant container
+    qdrant_process = subprocess.Popen(
+        qdrant_cmd,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    logging.info("Qdrant container started successfully")
 
     # Docker Command run
     self.start(cmd, "database_setup")
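Note: `subprocess.Popen(..., shell=True)` returns as soon as the shell is spawned, so the "started successfully" log line only confirms the command was launched, not that Qdrant is up. A hedged sketch of one way to verify the container state afterwards (assumes only that the `docker` CLI is on PATH):

```python
import subprocess

def container_is_running(name: str) -> bool:
    """True if `docker inspect` reports the named container as running."""
    result = subprocess.run(
        ["docker", "inspect", "-f", "{{.State.Running}}", name],
        capture_output=True,
        text=True,
    )
    return result.returncode == 0 and result.stdout.strip() == "true"

# e.g. container_is_running(f"qdrant_{action_record_id}")
```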
@@ -1215,6 +1289,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
+    container_name = f"facial_recognition_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for facial recognition worker: %s",
@@ -1228,15 +1304,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
-        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1258,6 +1332,8 @@ def lpr_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
+    container_name = f"lpr_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for LPR worker: %s",
@@ -1271,15 +1347,14 @@ def lpr_setup_execute(self: ActionInstance):
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f'-e PORT=8082 '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1310,6 +1385,8 @@ def inference_ws_server_execute(self: ActionInstance):
 
     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
 
+    container_name = f"inference_ws_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference WebSocket server: %s",
@@ -1323,12 +1400,11 @@ def inference_ws_server_execute(self: ActionInstance):
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
@@ -1359,6 +1435,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
 
     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
 
+    container_name = f"fe_streaming_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend streaming: %s",
@@ -1372,15 +1450,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
         f'-e WS_HOST="{ws_url}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1405,6 +1482,8 @@ def fe_analytics_service_execute(self: ActionInstance):
 
     project_id = action_details["_idProject"]
 
+    container_name = f"fe_analytics_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend analytics service: %s",
@@ -1418,15 +1497,14 @@ def fe_analytics_service_execute(self: ActionInstance):
     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f"-e PORT=3001 "
         f'-e PROJECT_ID="{project_id}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1451,7 +1529,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
+    container_name = f"dataset_generation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")
 
@@ -1472,7 +1551,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
+    container_name = f"synthetic_data_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")
 
@@ -1509,6 +1589,8 @@ def redis_setup_execute(self: ActionInstance):
 
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
 
+    # Define container names with action_record_id for uniqueness
+    redis_container_name = f"redis_{self.action_record_id}"
 
     if action_details["actionDetails"].get("containerId"):
         logging.info(
@@ -1520,18 +1602,34 @@ def redis_setup_execute(self: ActionInstance):
         self.start(cmd, "redis_setup")
 
         # Redis container restart
-        redis_restart_cmd = "docker restart
+        redis_restart_cmd = f"docker restart {redis_container_name}"
         self.start(redis_restart_cmd, "redis")
 
         return
 
-    # Redis container with --net=host (Port: 6379)
+    # Redis container with --net=host (Port: 6379) with optimized configuration
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name {redis_container_name} "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0
+        f"redis-server --bind 0.0.0.0 "
+        f"--appendonly no "
+        f'--save "" '
+        f"--maxmemory 30gb "
+        f"--maxmemory-policy allkeys-lru "
+        f"--io-threads 4 "
+        f"--io-threads-do-reads yes "
+        f"--stream-node-max-bytes 8192 "
+        f"--stream-node-max-entries 1000 "
+        f"--hz 100 "
+        f"--tcp-backlog 2048 "
+        f"--timeout 0 "
+        f"--lazyfree-lazy-eviction yes "
+        f"--lazyfree-lazy-expire yes "
+        f"--lazyfree-lazy-server-del yes "
+        f"--activedefrag yes "
+        f"--requirepass {redis_password}"
     )
 
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
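Note: the Redis instance is now tuned as a pure in-memory cache: persistence is disabled (`--appendonly no`, `--save ""`), memory is capped at 30 GB with `allkeys-lru` eviction, and I/O threading plus lazy freeing are enabled. A quick way to confirm the flags took effect, sketched with the `redis-py` client (host and password are placeholders):

```python
import redis  # pip install redis

r = redis.Redis(host="127.0.0.1", port=6379, password="<redis_password>")
for key in ("maxmemory", "maxmemory-policy", "appendonly", "io-threads"):
    print(r.config_get(key))
# maxmemory should come back as 32212254720 (Redis parses "30gb" as 30 * 2**30)
```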
@@ -1555,8 +1653,9 @@ def redis_setup_execute(self: ActionInstance):
 
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
-        f"docker run --net=host "
-
+        f"docker run -d --net=host "
+        f"--restart unless-stopped "
+        f"--name bg-redis_{self.action_record_id} "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1583,7 +1682,8 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"deploy_aggregator_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")
 
@@ -1599,6 +1699,10 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
+
+    # Get the service ID to track deployments
+    service_id = action_details.get("_idService")
+
     self.setup_action_requirements(
         action_details,
         work_fs,
@@ -1606,17 +1710,29 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )
 
-    #
-    # This uses the best-fit algorithm to select the most appropriate GPU(s)
-    use_gpu = self.get_gpu_config(action_details)
-
-    # Override: If GPU is required, use all available GPUs
+    # Use all GPUs if GPU is required
     gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
     if gpuRequired:
         use_gpu = "--runtime=nvidia --gpus all"
+    else:
+        use_gpu = ""
+
+    logging.info(
+        "Action %s: Model deployment GPU config: %s",
+        action_id,
+        use_gpu if use_gpu else "CPU-only"
+    )
+
+    # Get or create TRITON_PORTS (uses utility method)
+    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
 
-    extra_env_vars = {
-
+    extra_env_vars = {
+        "INTERNAL_PORT": internal_port,
+        "TRITON_PORTS": triton_ports
+    }
+
+    container_name = f"model_deploy_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")
 
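Note: inside the deployed container, `TRITON_PORTS` arrives as the comma-separated string produced by `get_or_create_triton_ports`, alongside `INTERNAL_PORT`. A consumer-side sketch; the diff does not show how `deploy.py` consumes these, so the parsing below is an assumption:

```python
import os

# e.g. TRITON_PORTS="8001,8002,8003" as set by the compute agent
port_a, port_b, port_c = (int(p) for p in os.environ["TRITON_PORTS"].split(","))
internal_port = int(os.environ["INTERNAL_PORT"])
```

Because ports are keyed by `_idService`, a second deployment of the same service on the same agent receives identical `TRITON_PORTS` values.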
@@ -1649,7 +1765,8 @@ def model_train_execute(self: ActionInstance):
         self.start(cmd, "train_log")
         return
 
-
+    container_name = f"model_train_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")
 
@@ -1672,7 +1789,7 @@ def model_eval_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for evaluation: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1680,7 +1797,8 @@ def model_eval_execute(self: ActionInstance):
         self.start(cmd, "eval_log")
         return
 
-
+    container_name = f"model_eval_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")
 
@@ -1706,7 +1824,7 @@ def model_export_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for export: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1714,7 +1832,8 @@ def model_export_execute(self: ActionInstance):
         self.start(cmd, "export_log")
         return
 
-
+    container_name = f"model_export_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")
 
@@ -1730,7 +1849,8 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
+    container_name = f"image_build_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")
 
@@ -1742,7 +1862,8 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
+    container_name = f"resource_clone_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")
 
@@ -1760,7 +1881,7 @@ def streaming_gateway_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for streaming gateway: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1768,7 +1889,8 @@ def streaming_gateway_execute(self: ActionInstance):
         self.start(cmd, "streaming_gateway")
         return
 
-
+    container_name = f"streaming_gateway_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")
 
@@ -1864,7 +1986,7 @@ def kafka_setup_execute(self: ActionInstance):
 
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for kafka: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1872,10 +1994,12 @@ def kafka_setup_execute(self: ActionInstance):
         self.start(cmd, "kafka_setup")
         return
 
+    container_name = f"kafka_{self.action_record_id}"
 
     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
-        f"docker run --net=host "
+        f"docker run -d --net=host "
+        f"--name {container_name} "
         f"{env_args} "
         f"--shm-size=30G --pull=always "
         f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -1908,6 +2032,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
+    container_name = f"inference_tracker_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference tracker: %s",
@@ -1921,14 +2047,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name inference-tracker-worker "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
 
@@ -1950,9 +2075,11 @@ def video_storage_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
+    container_name = f"video_storage_{self.action_record_id}"
+
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for video storage: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1963,14 +2090,13 @@ def video_storage_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name media_server "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'
+        f'--restart=unless-stopped '
         f"{image}"
     )
 
matrice_compute/resources_tracker.py

@@ -916,14 +916,27 @@ class ResourcesTracker:
         gpu_count = 0
 
         for gpu in gpu_data['gpus']:
-
+            # Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
+            # which would otherwise produce negative "free" memory.
+            total_mb = gpu.get('memory_total', 0) or 0
+            used_mb = gpu.get('memory_used', 0) or 0
+            free_mb = total_mb - used_mb
+            if free_mb < 0:
+                logging.debug(
+                    "Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
+                    gpu.get('idx'),
+                    total_mb,
+                    used_mb,
+                )
+                free_mb = 0
+            gpu_memory_free += free_mb
             gpu_utilization += gpu['utilization']
             gpu_count += 1
 
         if gpu_count > 0:
             gpu_utilization /= gpu_count
-
-        return gpu_memory_free, gpu_utilization
+
+        return max(0, gpu_memory_free), gpu_utilization
 
     @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
     def _get_gpu_resources_direct(self) -> Tuple[int, float]:
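Note: free memory is now computed defensively per GPU and clamped at zero before being accumulated, so one device reporting a bogus `memory_total` can no longer drive the machine-level total negative. A worked illustration of the new arithmetic on made-up `gpu_data`:

```python
gpu_data = {"gpus": [
    {"idx": 0, "memory_total": 24576, "memory_used": 1024, "utilization": 10},
    {"idx": 1, "memory_total": 0, "memory_used": 512, "utilization": 5},  # bad total
]}

gpu_memory_free = 0
for gpu in gpu_data["gpus"]:
    free_mb = (gpu.get("memory_total", 0) or 0) - (gpu.get("memory_used", 0) or 0)
    gpu_memory_free += max(0, free_mb)  # GPU 1 contributes 0, not -512

print(gpu_memory_free)  # 23552 (previously this sum would have been 23040)
```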
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
             availableCPU=available_cpu,
             availableMemory=available_memory,
             availableGPU=100 - gpu_utilization,
-            availableGPUMemory=gpu_memory_free,
+            availableGPUMemory=max(0, gpu_memory_free),
         )
         if err is not None:
             logging.error(
{matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/RECORD

@@ -1,5 +1,5 @@
 matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
-matrice_compute/action_instance.py,sha256=
+matrice_compute/action_instance.py,sha256=u98XGbDuO7eIwbn1kEFOd_FjSZvs5QFa5W4tgJvt384,81184
 matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
 matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
 matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
@@ -7,12 +7,12 @@ matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWI
 matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
 matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
 matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=
+matrice_compute/resources_tracker.py,sha256=AG_lnxoSi1TIDD0atBybntGyvyenwmP7sGCf4shBL4c,59276
 matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
 matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
 matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
+matrice_compute-0.1.39.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.39.dist-info/METADATA,sha256=c1MwRdZ3ReZrT0-yfuxSCwJI-JYFhDNCS6Oq6KbY_ic,1038
+matrice_compute-0.1.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.39.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.39.dist-info/RECORD,,
{matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/WHEEL: file without changes
{matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/licenses/LICENSE.txt: file without changes
{matrice_compute-0.1.37.dist-info → matrice_compute-0.1.39.dist-info}/top_level.txt: file without changes