matrice-compute 0.1.37__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +240 -113
- matrice_compute/resources_tracker.py +17 -4
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.38.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.38.dist-info}/RECORD +7 -7
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.38.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.38.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.37.dist-info → matrice_compute-0.1.38.dist-info}/top_level.txt +0 -0
|
@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
|
|
|
26
26
|
class ActionInstance:
|
|
27
27
|
"""Base class for tasks that run in Action containers."""
|
|
28
28
|
|
|
29
|
+
# Class-level dictionary to track deployed services and their ports
|
|
30
|
+
# Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
|
|
31
|
+
_deployed_services = {}
|
|
32
|
+
|
|
29
33
|
def __init__(self, scaling: Scaling, action_info: dict):
|
|
30
34
|
"""Initialize an action instance.
|
|
31
35
|
|
|
@@ -85,6 +89,52 @@ class ActionInstance:
|
|
|
85
89
|
raise ValueError(f"Unknown action type: {self.action_type}")
|
|
86
90
|
self.task = self.actions_map[self.action_type]
|
|
87
91
|
|
|
92
|
+
@classmethod
def get_or_create_triton_ports(cls, service_id, scaling_instance):
    """Return the TRITON_PORTS value for a service, allocating it on first use.

    Ports allocated for a truthy ``service_id`` are remembered in the
    class-level ``_deployed_services`` mapping, so a later call with the
    same id gets the same string back instead of new ports.

    Args:
        service_id (str): Service ID (_idService). When falsy, fresh ports
            are allocated and returned without being stored or logged.
        scaling_instance: Object exposing ``get_open_port()``; it is called
            three times whenever new ports have to be allocated.

    Returns:
        str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
    """
    tracked = bool(service_id)

    # Fast path: this service was deployed before, so hand back its ports.
    if tracked and service_id in cls._deployed_services:
        cached_ports = cls._deployed_services[service_id]["triton_ports"]
        logging.info(
            "Reusing TRITON_PORTS for service %s: %s",
            service_id,
            cached_ports,
        )
        return cached_ports

    # Request the three ports one after another, then join them in the
    # order in which they were handed out.
    allocated = [scaling_instance.get_open_port() for _ in range(3)]
    fresh_ports = ",".join(f"{port}" for port in allocated)

    # Without a service id there is nothing to key the cache on.
    if not tracked:
        return fresh_ports

    # First deployment of this service: remember the ports for later calls.
    cls._deployed_services[service_id] = {"triton_ports": fresh_ports}
    logging.info(
        "First deployment for service %s - generated TRITON_PORTS: %s",
        service_id,
        fresh_ports,
    )
    return fresh_ports
|
|
137
|
+
|
|
88
138
|
@log_errors(default_return={}, raise_exception=True, log_error=False)
|
|
89
139
|
def _init_credentials(self):
|
|
90
140
|
"""Initialize Matrice credentials.
|
|
@@ -346,6 +396,7 @@ class ActionInstance:
|
|
|
346
396
|
destination_workspace_path: str = "/usr/src/workspace",
|
|
347
397
|
docker_workdir: str = "",
|
|
348
398
|
extra_pkgs: list = [],
|
|
399
|
+
container_name: str = "",
|
|
349
400
|
):
|
|
350
401
|
"""Build base Docker command with common options.
|
|
351
402
|
|
|
@@ -360,6 +411,7 @@ class ActionInstance:
|
|
|
360
411
|
destination_workspace_path (str): Container workspace path
|
|
361
412
|
docker_workdir (str): Docker working directory
|
|
362
413
|
extra_pkgs (list): List of extra packages to install
|
|
414
|
+
container_name (str): Docker container name (format: {action_type}_{action_id})
|
|
363
415
|
Returns:
|
|
364
416
|
str: Base Docker command
|
|
365
417
|
"""
|
|
@@ -426,17 +478,20 @@ class ActionInstance:
|
|
|
426
478
|
|
|
427
479
|
# if the service provider is local, then put --restart unless-stopped
|
|
428
480
|
if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
|
|
481
|
+
env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
|
|
429
482
|
use_restart_policy = "--restart unless-stopped"
|
|
430
483
|
else:
|
|
431
484
|
use_restart_policy = ""
|
|
432
485
|
|
|
486
|
+
# Build container name option if provided
|
|
487
|
+
name_option = f"--name {container_name}" if container_name else ""
|
|
488
|
+
|
|
433
489
|
cmd_parts = [
|
|
434
|
-
f"docker run {use_gpu} {use_restart_policy} ",
|
|
490
|
+
f"docker run -d {use_gpu} {use_restart_policy} ",
|
|
435
491
|
network_config,
|
|
436
492
|
*[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
|
|
437
493
|
*volumes,
|
|
438
494
|
# Container configuration and startup commands
|
|
439
|
-
f"--cidfile ./{self.action_record_id}.cid ",
|
|
440
495
|
f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
|
|
441
496
|
f'/bin/bash -c "cd {docker_workdir} && '
|
|
442
497
|
f"{env_exports} && "
|
|
@@ -838,55 +893,50 @@ class ActionInstance:
|
|
|
838
893
|
self.cmd = cmd
|
|
839
894
|
self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
|
|
840
895
|
|
|
841
|
-
with
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
896
|
+
# Run docker with -d flag to get container ID from stdout
|
|
897
|
+
process = subprocess.Popen(
|
|
898
|
+
shlex.split(self.cmd),
|
|
899
|
+
stdout=subprocess.PIPE,
|
|
900
|
+
stderr=subprocess.PIPE,
|
|
901
|
+
text=True,
|
|
902
|
+
env={**os.environ},
|
|
903
|
+
)
|
|
849
904
|
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
self.action_record_id,
|
|
863
|
-
self.container_id,
|
|
864
|
-
)
|
|
865
|
-
break
|
|
866
|
-
except FileNotFoundError:
|
|
867
|
-
logging.warning(
|
|
868
|
-
"CID file not found for action %s, attempt %d/%d",
|
|
869
|
-
self.action_record_id,
|
|
870
|
-
attempt + 1,
|
|
871
|
-
max_retries,
|
|
872
|
-
)
|
|
873
|
-
time.sleep(retry_delay)
|
|
874
|
-
except Exception as e:
|
|
875
|
-
logging.error(
|
|
876
|
-
"Error reading CID file for action %s: %s",
|
|
877
|
-
self.action_record_id,
|
|
878
|
-
str(e),
|
|
879
|
-
)
|
|
880
|
-
time.sleep(retry_delay)
|
|
881
|
-
else:
|
|
905
|
+
# Use a longer timeout for docker run since --pull=always may need to
|
|
906
|
+
# download large images on first run. Default: 30 minutes (1800 seconds)
|
|
907
|
+
# Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
|
|
908
|
+
docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
|
|
909
|
+
logging.info(
|
|
910
|
+
"Waiting for docker container to start for action %s (timeout: %d seconds)",
|
|
911
|
+
self.action_record_id,
|
|
912
|
+
docker_start_timeout,
|
|
913
|
+
)
|
|
914
|
+
stdout, stderr = process.communicate(timeout=docker_start_timeout)
|
|
915
|
+
|
|
916
|
+
if process.returncode != 0:
|
|
882
917
|
logging.error(
|
|
883
|
-
"
|
|
918
|
+
"Docker run failed for action %s: %s",
|
|
884
919
|
self.action_record_id,
|
|
885
|
-
|
|
920
|
+
stderr,
|
|
886
921
|
)
|
|
887
|
-
raise
|
|
922
|
+
raise RuntimeError(f"Docker run failed: {stderr}")
|
|
888
923
|
|
|
889
|
-
|
|
924
|
+
self.container_id = stdout.strip()
|
|
925
|
+
logging.info(
|
|
926
|
+
"Started container for action %s with ID: %s",
|
|
927
|
+
self.action_record_id,
|
|
928
|
+
self.container_id,
|
|
929
|
+
)
|
|
930
|
+
|
|
931
|
+
# Start following container logs in background
|
|
932
|
+
self.process = subprocess.Popen(
|
|
933
|
+
["docker", "logs", "-f", self.container_id],
|
|
934
|
+
stdout=open(self.log_path, "wb"),
|
|
935
|
+
stderr=subprocess.STDOUT,
|
|
936
|
+
start_new_session=True,
|
|
937
|
+
)
|
|
938
|
+
|
|
939
|
+
# Report container id to scaling service
|
|
890
940
|
self.scaling.update_action_container_id(
|
|
891
941
|
action_record_id=self.action_record_id,
|
|
892
942
|
container_id=self.container_id,
|
|
@@ -1052,7 +1102,8 @@ def data_preparation_execute(
|
|
|
1052
1102
|
"Started pulling Docker image with PID: %s",
|
|
1053
1103
|
process.pid,
|
|
1054
1104
|
)
|
|
1055
|
-
|
|
1105
|
+
container_name = f"data_prep_{self.action_record_id}"
|
|
1106
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
|
|
1056
1107
|
logging.info("cmd is: %s", cmd)
|
|
1057
1108
|
self.start(cmd, "data_preparation_log")
|
|
1058
1109
|
|
|
@@ -1081,7 +1132,8 @@ def data_processing_execute(self: ActionInstance):
|
|
|
1081
1132
|
service="bg-job-scheduler",
|
|
1082
1133
|
job_params=action["jobParams"],
|
|
1083
1134
|
)
|
|
1084
|
-
|
|
1135
|
+
container_name = f"data_processing_{self.action_record_id}"
|
|
1136
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
|
|
1085
1137
|
logging.info("cmd: %s", cmd)
|
|
1086
1138
|
self.start(cmd, "data_processing_log")
|
|
1087
1139
|
|
|
@@ -1094,7 +1146,8 @@ def data_split_execute(self: ActionInstance):
|
|
|
1094
1146
|
if not action_details:
|
|
1095
1147
|
return
|
|
1096
1148
|
self.setup_action_requirements(action_details, work_fs, model_family="")
|
|
1097
|
-
|
|
1149
|
+
container_name = f"data_split_{self.action_record_id}"
|
|
1150
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
|
|
1098
1151
|
logging.info("cmd: %s", cmd)
|
|
1099
1152
|
self.start(cmd, "data_split")
|
|
1100
1153
|
|
|
@@ -1109,7 +1162,8 @@ def dataset_annotation_execute(
|
|
|
1109
1162
|
if not action_details:
|
|
1110
1163
|
return
|
|
1111
1164
|
self.setup_action_requirements(action_details, work_fs)
|
|
1112
|
-
|
|
1165
|
+
container_name = f"dataset_annotation_{self.action_record_id}"
|
|
1166
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
|
|
1113
1167
|
logging.info("cmd: %s", cmd)
|
|
1114
1168
|
self.start(cmd, "dataset_annotation")
|
|
1115
1169
|
|
|
@@ -1124,7 +1178,8 @@ def dataset_augmentation_execute(
|
|
|
1124
1178
|
if not action_details:
|
|
1125
1179
|
return
|
|
1126
1180
|
self.setup_action_requirements(action_details, work_fs)
|
|
1127
|
-
|
|
1181
|
+
container_name = f"dataset_augmentation_{self.action_record_id}"
|
|
1182
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
|
|
1128
1183
|
logging.info("cmd: %s", cmd)
|
|
1129
1184
|
self.start(cmd, "dataset_augmentation")
|
|
1130
1185
|
|
|
@@ -1140,7 +1195,8 @@ def augmentation_server_creation_execute(
|
|
|
1140
1195
|
if not action_details:
|
|
1141
1196
|
return
|
|
1142
1197
|
self.setup_action_requirements(action_details, work_fs)
|
|
1143
|
-
|
|
1198
|
+
container_name = f"augmentation_setup_{self.action_record_id}"
|
|
1199
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
|
|
1144
1200
|
logging.info("cmd: %s", cmd)
|
|
1145
1201
|
self.start(cmd, "augmentation_setup")
|
|
1146
1202
|
|
|
@@ -1161,32 +1217,34 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1161
1217
|
|
|
1162
1218
|
project_id = action_details["_idProject"]
|
|
1163
1219
|
|
|
1220
|
+
# Define container names with action_record_id for uniqueness
|
|
1221
|
+
mongodb_container_name = f"database_setup_{self.action_record_id}"
|
|
1222
|
+
qdrant_container_name = f"qdrant_{self.action_record_id}"
|
|
1223
|
+
|
|
1164
1224
|
if action_details["actionDetails"].get("containerId"):
|
|
1165
1225
|
logging.info(
|
|
1166
|
-
"Using existing container ID for
|
|
1226
|
+
"Using existing container ID for database setup: %s",
|
|
1167
1227
|
action_details["actionDetails"]["containerId"],
|
|
1168
1228
|
)
|
|
1169
1229
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1170
1230
|
cmd = "docker restart " + self.docker_container
|
|
1171
|
-
self.start(cmd, "
|
|
1231
|
+
self.start(cmd, "database_setup")
|
|
1172
1232
|
|
|
1173
|
-
#qdrant restart
|
|
1174
|
-
qdrant_cmd = "docker restart
|
|
1175
|
-
self.start(qdrant_cmd,
|
|
1233
|
+
# qdrant restart
|
|
1234
|
+
qdrant_cmd = f"docker restart {qdrant_container_name}"
|
|
1235
|
+
self.start(qdrant_cmd, "qdrant_setup")
|
|
1176
1236
|
|
|
1177
1237
|
return
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
|
|
1181
1238
|
|
|
1239
|
+
dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
|
|
1182
1240
|
|
|
1183
1241
|
# MongoDB container with --net=host (Port: 27020:27017)
|
|
1184
1242
|
cmd = (
|
|
1185
|
-
f"docker run --pull=always --net=host "
|
|
1243
|
+
f"docker run -d --pull=always --net=host "
|
|
1244
|
+
f"--name {mongodb_container_name} "
|
|
1245
|
+
f"-v matrice_myvol:/matrice_data "
|
|
1186
1246
|
f"-v {dbPath}:{dbPath} "
|
|
1187
|
-
f"--name database_setup_{self.action_record_id} "
|
|
1188
1247
|
f"-v /var/run/docker.sock:/var/run/docker.sock "
|
|
1189
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1190
1248
|
f"-e ACTION_RECORD_ID={self.action_record_id} "
|
|
1191
1249
|
f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
|
|
1192
1250
|
f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
|
|
@@ -1196,6 +1254,23 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1196
1254
|
)
|
|
1197
1255
|
logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
|
|
1198
1256
|
|
|
1257
|
+
# Qdrant container with --net=host (Port: 6334)
|
|
1258
|
+
qdrant_cmd = (
|
|
1259
|
+
f"docker run -d --pull=always --net=host "
|
|
1260
|
+
f"--name {qdrant_container_name} "
|
|
1261
|
+
f"-v matrice_myvol:/matrice_data "
|
|
1262
|
+
f"qdrant/qdrant:latest "
|
|
1263
|
+
)
|
|
1264
|
+
logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
|
|
1265
|
+
|
|
1266
|
+
# Start Qdrant container
|
|
1267
|
+
qdrant_process = subprocess.Popen(
|
|
1268
|
+
qdrant_cmd,
|
|
1269
|
+
shell=True,
|
|
1270
|
+
stdout=subprocess.PIPE,
|
|
1271
|
+
stderr=subprocess.PIPE,
|
|
1272
|
+
)
|
|
1273
|
+
logging.info("Qdrant container started successfully")
|
|
1199
1274
|
|
|
1200
1275
|
# Docker Command run
|
|
1201
1276
|
self.start(cmd, "database_setup")
|
|
@@ -1215,6 +1290,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1215
1290
|
|
|
1216
1291
|
self.setup_action_requirements(action_details)
|
|
1217
1292
|
|
|
1293
|
+
container_name = f"facial_recognition_{self.action_record_id}"
|
|
1294
|
+
|
|
1218
1295
|
if action_details["actionDetails"].get("containerId"):
|
|
1219
1296
|
logging.info(
|
|
1220
1297
|
"Using existing container ID for facial recognition worker: %s",
|
|
@@ -1228,15 +1305,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1228
1305
|
# Facial recognition worker container with --net=host (Port: 8081)
|
|
1229
1306
|
worker_cmd = (
|
|
1230
1307
|
f"docker run -d --pull=always --net=host "
|
|
1231
|
-
f"--name
|
|
1232
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1308
|
+
f"--name {container_name} "
|
|
1233
1309
|
f"-v matrice_myvol:/matrice_data "
|
|
1234
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1235
1310
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1236
1311
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1237
1312
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1238
1313
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1239
|
-
f'
|
|
1314
|
+
f'--restart=unless-stopped '
|
|
1240
1315
|
f"{image}"
|
|
1241
1316
|
)
|
|
1242
1317
|
logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
|
|
@@ -1258,6 +1333,8 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1258
1333
|
|
|
1259
1334
|
self.setup_action_requirements(action_details)
|
|
1260
1335
|
|
|
1336
|
+
container_name = f"lpr_{self.action_record_id}"
|
|
1337
|
+
|
|
1261
1338
|
if action_details["actionDetails"].get("containerId"):
|
|
1262
1339
|
logging.info(
|
|
1263
1340
|
"Using existing container ID for LPR worker: %s",
|
|
@@ -1271,15 +1348,14 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1271
1348
|
# LPR worker container with --net=host (Port: 8082)
|
|
1272
1349
|
worker_cmd = (
|
|
1273
1350
|
f"docker run -d --net=host --pull=always "
|
|
1274
|
-
f"--name
|
|
1275
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1351
|
+
f"--name {container_name} "
|
|
1276
1352
|
f"-v matrice_myvol:/matrice_data "
|
|
1277
1353
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1278
1354
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1279
1355
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1280
1356
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1281
1357
|
f'-e PORT=8082 '
|
|
1282
|
-
f'
|
|
1358
|
+
f'--restart=unless-stopped '
|
|
1283
1359
|
f"{image}"
|
|
1284
1360
|
)
|
|
1285
1361
|
logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
|
|
@@ -1310,6 +1386,8 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1310
1386
|
|
|
1311
1387
|
logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
|
|
1312
1388
|
|
|
1389
|
+
container_name = f"inference_ws_{self.action_record_id}"
|
|
1390
|
+
|
|
1313
1391
|
if action_details["actionDetails"].get("containerId"):
|
|
1314
1392
|
logging.info(
|
|
1315
1393
|
"Using existing container ID for inference WebSocket server: %s",
|
|
@@ -1323,12 +1401,11 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1323
1401
|
# Inference WebSocket server with --net=host (Port: 8102)
|
|
1324
1402
|
worker_cmd = (
|
|
1325
1403
|
f"docker run -d --pull=always --net=host "
|
|
1326
|
-
f"--name
|
|
1327
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1404
|
+
f"--name {container_name} "
|
|
1328
1405
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1329
1406
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1330
1407
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1331
|
-
f'
|
|
1408
|
+
f'--restart=unless-stopped '
|
|
1332
1409
|
f"{image} "
|
|
1333
1410
|
f"./app "
|
|
1334
1411
|
f"{self.action_record_id} "
|
|
@@ -1359,6 +1436,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1359
1436
|
|
|
1360
1437
|
logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
|
|
1361
1438
|
|
|
1439
|
+
container_name = f"fe_streaming_{self.action_record_id}"
|
|
1440
|
+
|
|
1362
1441
|
if action_details["actionDetails"].get("containerId"):
|
|
1363
1442
|
logging.info(
|
|
1364
1443
|
"Using existing container ID for frontend streaming: %s",
|
|
@@ -1372,15 +1451,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1372
1451
|
# Frontend streaming with --net=host (Port: 3000)
|
|
1373
1452
|
worker_cmd = (
|
|
1374
1453
|
f"docker run -d --pull=always --net=host "
|
|
1375
|
-
f"--name
|
|
1376
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1454
|
+
f"--name {container_name} "
|
|
1377
1455
|
f"-v matrice_myvol:/matrice_data "
|
|
1378
1456
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1379
1457
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1380
1458
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1381
1459
|
f"-e PORT=3000 "
|
|
1382
1460
|
f'-e WS_HOST="{ws_url}" '
|
|
1383
|
-
f'
|
|
1461
|
+
f'--restart=unless-stopped '
|
|
1384
1462
|
f"{image}"
|
|
1385
1463
|
)
|
|
1386
1464
|
logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
|
|
@@ -1405,6 +1483,8 @@ def fe_analytics_service_execute(self: ActionInstance):
|
|
|
1405
1483
|
|
|
1406
1484
|
project_id = action_details["_idProject"]
|
|
1407
1485
|
|
|
1486
|
+
container_name = f"fe_analytics_{self.action_record_id}"
|
|
1487
|
+
|
|
1408
1488
|
if action_details["actionDetails"].get("containerId"):
|
|
1409
1489
|
logging.info(
|
|
1410
1490
|
"Using existing container ID for frontend analytics service: %s",
|
|
@@ -1418,15 +1498,14 @@ def fe_analytics_service_execute(self: ActionInstance):
|
|
|
1418
1498
|
# Frontend analytics service with --net=host (Port: 3001)
|
|
1419
1499
|
worker_cmd = (
|
|
1420
1500
|
f"docker run -d --pull=always --net=host "
|
|
1421
|
-
f"--name
|
|
1422
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1501
|
+
f"--name {container_name} "
|
|
1423
1502
|
f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
|
|
1424
1503
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1425
1504
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1426
1505
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1427
1506
|
f"-e PORT=3001 "
|
|
1428
1507
|
f'-e PROJECT_ID="{project_id}" '
|
|
1429
|
-
f'
|
|
1508
|
+
f'--restart=unless-stopped '
|
|
1430
1509
|
f"{image}"
|
|
1431
1510
|
)
|
|
1432
1511
|
logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
|
|
@@ -1451,7 +1530,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
|
|
|
1451
1530
|
else:
|
|
1452
1531
|
return
|
|
1453
1532
|
use_gpu = self.get_gpu_config(action_details)
|
|
1454
|
-
|
|
1533
|
+
container_name = f"dataset_generation_{self.action_record_id}"
|
|
1534
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
|
|
1455
1535
|
logging.info("cmd is: %s", cmd)
|
|
1456
1536
|
self.start(cmd, "dataset_generation")
|
|
1457
1537
|
|
|
@@ -1472,7 +1552,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
|
|
|
1472
1552
|
else:
|
|
1473
1553
|
return
|
|
1474
1554
|
use_gpu = self.get_gpu_config(action_details)
|
|
1475
|
-
|
|
1555
|
+
container_name = f"synthetic_data_setup_{self.action_record_id}"
|
|
1556
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
|
|
1476
1557
|
logging.info("cmd is: %s", cmd)
|
|
1477
1558
|
self.start(cmd, "synthetic_data_setup")
|
|
1478
1559
|
|
|
@@ -1509,6 +1590,8 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1509
1590
|
|
|
1510
1591
|
redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
|
|
1511
1592
|
|
|
1593
|
+
# Define container names with action_record_id for uniqueness
|
|
1594
|
+
redis_container_name = f"redis_{self.action_record_id}"
|
|
1512
1595
|
|
|
1513
1596
|
if action_details["actionDetails"].get("containerId"):
|
|
1514
1597
|
logging.info(
|
|
@@ -1520,18 +1603,34 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1520
1603
|
self.start(cmd, "redis_setup")
|
|
1521
1604
|
|
|
1522
1605
|
# Redis container restart
|
|
1523
|
-
redis_restart_cmd = "docker restart
|
|
1606
|
+
redis_restart_cmd = f"docker restart {redis_container_name}"
|
|
1524
1607
|
self.start(redis_restart_cmd, "redis")
|
|
1525
1608
|
|
|
1526
1609
|
return
|
|
1527
1610
|
|
|
1528
|
-
# Redis container with --net=host (Port: 6379)
|
|
1611
|
+
# Redis container with --net=host (Port: 6379) with optimized configuration
|
|
1529
1612
|
redis_cmd = (
|
|
1530
1613
|
f"docker run -d --net=host "
|
|
1531
|
-
f"--name
|
|
1614
|
+
f"--name {redis_container_name} "
|
|
1532
1615
|
f"--restart unless-stopped "
|
|
1533
1616
|
f"{redis_image} "
|
|
1534
|
-
f"redis-server --bind 0.0.0.0
|
|
1617
|
+
f"redis-server --bind 0.0.0.0 "
|
|
1618
|
+
f"--appendonly no "
|
|
1619
|
+
f'--save "" '
|
|
1620
|
+
f"--maxmemory 30gb "
|
|
1621
|
+
f"--maxmemory-policy allkeys-lru "
|
|
1622
|
+
f"--io-threads 4 "
|
|
1623
|
+
f"--io-threads-do-reads yes "
|
|
1624
|
+
f"--stream-node-max-bytes 8192 "
|
|
1625
|
+
f"--stream-node-max-entries 1000 "
|
|
1626
|
+
f"--hz 100 "
|
|
1627
|
+
f"--tcp-backlog 2048 "
|
|
1628
|
+
f"--timeout 0 "
|
|
1629
|
+
f"--lazyfree-lazy-eviction yes "
|
|
1630
|
+
f"--lazyfree-lazy-expire yes "
|
|
1631
|
+
f"--lazyfree-lazy-server-del yes "
|
|
1632
|
+
f"--activedefrag yes "
|
|
1633
|
+
f"--requirepass {redis_password}"
|
|
1535
1634
|
)
|
|
1536
1635
|
|
|
1537
1636
|
logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
|
|
@@ -1555,8 +1654,9 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1555
1654
|
|
|
1556
1655
|
# bg-redis management container with --net=host (Port: 8082)
|
|
1557
1656
|
cmd = (
|
|
1558
|
-
f"docker run --net=host "
|
|
1559
|
-
|
|
1657
|
+
f"docker run -d --net=host "
|
|
1658
|
+
f"--restart unless-stopped "
|
|
1659
|
+
f"--name bg-redis_{self.action_record_id} "
|
|
1560
1660
|
f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
|
|
1561
1661
|
f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
|
|
1562
1662
|
f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
|
|
@@ -1583,7 +1683,8 @@ def deploy_aggregator_execute(
|
|
|
1583
1683
|
if not action_details:
|
|
1584
1684
|
return
|
|
1585
1685
|
self.setup_action_requirements(action_details, work_fs)
|
|
1586
|
-
|
|
1686
|
+
container_name = f"deploy_aggregator_{self.action_record_id}"
|
|
1687
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
|
|
1587
1688
|
logging.info("cmd: %s", cmd)
|
|
1588
1689
|
self.start(cmd, "deploy_aggregator")
|
|
1589
1690
|
|
|
@@ -1599,6 +1700,10 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1599
1700
|
return
|
|
1600
1701
|
action_id = action_details["_id"]
|
|
1601
1702
|
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1703
|
+
|
|
1704
|
+
# Get the service ID to track deployments
|
|
1705
|
+
service_id = action_details.get("_idService")
|
|
1706
|
+
|
|
1602
1707
|
self.setup_action_requirements(
|
|
1603
1708
|
action_details,
|
|
1604
1709
|
work_fs,
|
|
@@ -1606,17 +1711,29 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1606
1711
|
action_id=action_id,
|
|
1607
1712
|
)
|
|
1608
1713
|
|
|
1609
|
-
#
|
|
1610
|
-
# This uses the best-fit algorithm to select the most appropriate GPU(s)
|
|
1611
|
-
use_gpu = self.get_gpu_config(action_details)
|
|
1612
|
-
|
|
1613
|
-
# Override: If GPU is required, use all available GPUs
|
|
1714
|
+
# Use all GPUs if GPU is required
|
|
1614
1715
|
gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
|
|
1615
1716
|
if gpuRequired:
|
|
1616
1717
|
use_gpu = "--runtime=nvidia --gpus all"
|
|
1718
|
+
else:
|
|
1719
|
+
use_gpu = ""
|
|
1720
|
+
|
|
1721
|
+
logging.info(
|
|
1722
|
+
"Action %s: Model deployment GPU config: %s",
|
|
1723
|
+
action_id,
|
|
1724
|
+
use_gpu if use_gpu else "CPU-only"
|
|
1725
|
+
)
|
|
1726
|
+
|
|
1727
|
+
# Get or create TRITON_PORTS (uses utility method)
|
|
1728
|
+
triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
|
|
1617
1729
|
|
|
1618
|
-
extra_env_vars = {
|
|
1619
|
-
|
|
1730
|
+
extra_env_vars = {
|
|
1731
|
+
"INTERNAL_PORT": internal_port,
|
|
1732
|
+
"TRITON_PORTS": triton_ports
|
|
1733
|
+
}
|
|
1734
|
+
|
|
1735
|
+
container_name = f"model_deploy_{self.action_record_id}"
|
|
1736
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
|
|
1620
1737
|
logging.info("cmd is: %s", cmd)
|
|
1621
1738
|
self.start(cmd, "deploy_log")
|
|
1622
1739
|
|
|
@@ -1649,7 +1766,8 @@ def model_train_execute(self: ActionInstance):
|
|
|
1649
1766
|
self.start(cmd, "train_log")
|
|
1650
1767
|
return
|
|
1651
1768
|
|
|
1652
|
-
|
|
1769
|
+
container_name = f"model_train_{self.action_record_id}"
|
|
1770
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
|
|
1653
1771
|
logging.info("cmd is: %s", cmd)
|
|
1654
1772
|
self.start(cmd, "train_log")
|
|
1655
1773
|
|
|
@@ -1672,7 +1790,7 @@ def model_eval_execute(self: ActionInstance):
|
|
|
1672
1790
|
)
|
|
1673
1791
|
if action_details["actionDetails"].get("containerId"):
|
|
1674
1792
|
logging.info(
|
|
1675
|
-
"Using existing container ID for
|
|
1793
|
+
"Using existing container ID for evaluation: %s",
|
|
1676
1794
|
action_details["actionDetails"]["containerId"],
|
|
1677
1795
|
)
|
|
1678
1796
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1680,7 +1798,8 @@ def model_eval_execute(self: ActionInstance):
|
|
|
1680
1798
|
self.start(cmd, "eval_log")
|
|
1681
1799
|
return
|
|
1682
1800
|
|
|
1683
|
-
|
|
1801
|
+
container_name = f"model_eval_{self.action_record_id}"
|
|
1802
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
|
|
1684
1803
|
logging.info("cmd is: %s", cmd)
|
|
1685
1804
|
self.start(cmd, "eval_log")
|
|
1686
1805
|
|
|
@@ -1706,7 +1825,7 @@ def model_export_execute(self: ActionInstance):
|
|
|
1706
1825
|
)
|
|
1707
1826
|
if action_details["actionDetails"].get("containerId"):
|
|
1708
1827
|
logging.info(
|
|
1709
|
-
"Using existing container ID for
|
|
1828
|
+
"Using existing container ID for export: %s",
|
|
1710
1829
|
action_details["actionDetails"]["containerId"],
|
|
1711
1830
|
)
|
|
1712
1831
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1714,7 +1833,8 @@ def model_export_execute(self: ActionInstance):
|
|
|
1714
1833
|
self.start(cmd, "export_log")
|
|
1715
1834
|
return
|
|
1716
1835
|
|
|
1717
|
-
|
|
1836
|
+
container_name = f"model_export_{self.action_record_id}"
|
|
1837
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
|
|
1718
1838
|
logging.info("cmd is: %s", cmd)
|
|
1719
1839
|
self.start(cmd, "export_log")
|
|
1720
1840
|
|
|
@@ -1730,7 +1850,8 @@ def image_build_execute(self: ActionInstance):
|
|
|
1730
1850
|
action_id = action_details["_id"]
|
|
1731
1851
|
internal_api_key = self.get_internal_api_key(action_id)
|
|
1732
1852
|
extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
|
|
1733
|
-
|
|
1853
|
+
container_name = f"image_build_{self.action_record_id}"
|
|
1854
|
+
cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
|
|
1734
1855
|
logging.info("cmd is: %s", cmd)
|
|
1735
1856
|
self.start(cmd, "image_build_log")
|
|
1736
1857
|
|
|
@@ -1742,7 +1863,8 @@ def resource_clone_execute(self: ActionInstance):
|
|
|
1742
1863
|
if not action_details:
|
|
1743
1864
|
return
|
|
1744
1865
|
self.setup_action_requirements(action_details)
|
|
1745
|
-
|
|
1866
|
+
container_name = f"resource_clone_{self.action_record_id}"
|
|
1867
|
+
cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
|
|
1746
1868
|
logging.info("cmd is: %s", cmd)
|
|
1747
1869
|
self.start(cmd, "resource_clone")
|
|
1748
1870
|
|
|
@@ -1760,7 +1882,7 @@ def streaming_gateway_execute(self: ActionInstance):
|
|
|
1760
1882
|
)
|
|
1761
1883
|
if action_details["actionDetails"].get("containerId"):
|
|
1762
1884
|
logging.info(
|
|
1763
|
-
"Using existing container ID for
|
|
1885
|
+
"Using existing container ID for streaming gateway: %s",
|
|
1764
1886
|
action_details["actionDetails"]["containerId"],
|
|
1765
1887
|
)
|
|
1766
1888
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1768,7 +1890,8 @@ def streaming_gateway_execute(self: ActionInstance):
|
|
|
1768
1890
|
self.start(cmd, "streaming_gateway")
|
|
1769
1891
|
return
|
|
1770
1892
|
|
|
1771
|
-
|
|
1893
|
+
container_name = f"streaming_gateway_{self.action_record_id}"
|
|
1894
|
+
cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
|
|
1772
1895
|
logging.info("cmd is: %s", cmd)
|
|
1773
1896
|
self.start(cmd, "streaming_gateway")
|
|
1774
1897
|
|
|
@@ -1864,7 +1987,7 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1864
1987
|
|
|
1865
1988
|
if action_details["actionDetails"].get("containerId"):
|
|
1866
1989
|
logging.info(
|
|
1867
|
-
"Using existing container ID for
|
|
1990
|
+
"Using existing container ID for kafka: %s",
|
|
1868
1991
|
action_details["actionDetails"]["containerId"],
|
|
1869
1992
|
)
|
|
1870
1993
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1872,10 +1995,12 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1872
1995
|
self.start(cmd, "kafka_setup")
|
|
1873
1996
|
return
|
|
1874
1997
|
|
|
1998
|
+
container_name = f"kafka_{self.action_record_id}"
|
|
1875
1999
|
|
|
1876
2000
|
# Kafka container with --net=host (Ports: 9092, 9093)
|
|
1877
2001
|
cmd = (
|
|
1878
|
-
f"docker run --net=host "
|
|
2002
|
+
f"docker run -d --net=host "
|
|
2003
|
+
f"--name {container_name} "
|
|
1879
2004
|
f"{env_args} "
|
|
1880
2005
|
f"--shm-size=30G --pull=always "
|
|
1881
2006
|
f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
|
|
@@ -1908,6 +2033,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
1908
2033
|
|
|
1909
2034
|
self.setup_action_requirements(action_details)
|
|
1910
2035
|
|
|
2036
|
+
container_name = f"inference_tracker_{self.action_record_id}"
|
|
2037
|
+
|
|
1911
2038
|
if action_details["actionDetails"].get("containerId"):
|
|
1912
2039
|
logging.info(
|
|
1913
2040
|
"Using existing container ID for inference tracker: %s",
|
|
@@ -1921,14 +2048,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
1921
2048
|
# This is the existing Docker run command
|
|
1922
2049
|
worker_cmd = (
|
|
1923
2050
|
f"docker run -d --pull=always --net=host "
|
|
1924
|
-
|
|
1925
|
-
f"--name inference-tracker-worker "
|
|
2051
|
+
f"--name {container_name} "
|
|
1926
2052
|
f"-v matrice_myvol:/matrice_data "
|
|
1927
2053
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1928
2054
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1929
2055
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1930
2056
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1931
|
-
f'
|
|
2057
|
+
f'--restart=unless-stopped '
|
|
1932
2058
|
f"{image}"
|
|
1933
2059
|
)
|
|
1934
2060
|
|
|
@@ -1950,9 +2076,11 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
1950
2076
|
|
|
1951
2077
|
self.setup_action_requirements(action_details)
|
|
1952
2078
|
|
|
2079
|
+
container_name = f"video_storage_{self.action_record_id}"
|
|
2080
|
+
|
|
1953
2081
|
if action_details["actionDetails"].get("containerId"):
|
|
1954
2082
|
logging.info(
|
|
1955
|
-
"Using existing container ID for
|
|
2083
|
+
"Using existing container ID for video storage: %s",
|
|
1956
2084
|
action_details["actionDetails"]["containerId"],
|
|
1957
2085
|
)
|
|
1958
2086
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1963,14 +2091,13 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
1963
2091
|
# This is the existing Docker run command
|
|
1964
2092
|
worker_cmd = (
|
|
1965
2093
|
f"docker run -d --pull=always --net=host "
|
|
1966
|
-
|
|
1967
|
-
f"--name media_server "
|
|
2094
|
+
f"--name {container_name} "
|
|
1968
2095
|
f"-v matrice_myvol:/matrice_data "
|
|
1969
2096
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1970
2097
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1971
2098
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1972
2099
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1973
|
-
f'
|
|
2100
|
+
f'--restart=unless-stopped '
|
|
1974
2101
|
f"{image}"
|
|
1975
2102
|
)
|
|
1976
2103
|
|
|
@@ -916,14 +916,27 @@ class ResourcesTracker:
|
|
|
916
916
|
gpu_count = 0
|
|
917
917
|
|
|
918
918
|
for gpu in gpu_data['gpus']:
|
|
919
|
-
|
|
919
|
+
# Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
|
|
920
|
+
# which would otherwise produce negative "free" memory.
|
|
921
|
+
total_mb = gpu.get('memory_total', 0) or 0
|
|
922
|
+
used_mb = gpu.get('memory_used', 0) or 0
|
|
923
|
+
free_mb = total_mb - used_mb
|
|
924
|
+
if free_mb < 0:
|
|
925
|
+
logging.debug(
|
|
926
|
+
"Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
|
|
927
|
+
gpu.get('idx'),
|
|
928
|
+
total_mb,
|
|
929
|
+
used_mb,
|
|
930
|
+
)
|
|
931
|
+
free_mb = 0
|
|
932
|
+
gpu_memory_free += free_mb
|
|
920
933
|
gpu_utilization += gpu['utilization']
|
|
921
934
|
gpu_count += 1
|
|
922
935
|
|
|
923
936
|
if gpu_count > 0:
|
|
924
937
|
gpu_utilization /= gpu_count
|
|
925
|
-
|
|
926
|
-
return gpu_memory_free, gpu_utilization
|
|
938
|
+
|
|
939
|
+
return max(0, gpu_memory_free), gpu_utilization
|
|
927
940
|
|
|
928
941
|
@log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
|
|
929
942
|
def _get_gpu_resources_direct(self) -> Tuple[int, float]:
|
|
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
|
|
|
1218
1231
|
availableCPU=available_cpu,
|
|
1219
1232
|
availableMemory=available_memory,
|
|
1220
1233
|
availableGPU=100 - gpu_utilization,
|
|
1221
|
-
availableGPUMemory=gpu_memory_free,
|
|
1234
|
+
availableGPUMemory=max(0, gpu_memory_free),
|
|
1222
1235
|
)
|
|
1223
1236
|
if err is not None:
|
|
1224
1237
|
logging.error(
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
2
|
+
matrice_compute/action_instance.py,sha256=8kN5NREUCGOwexkk-V0zTcOhOO5bq9sV6dF-NdH5rc8,81273
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
5
|
matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
|
|
@@ -7,12 +7,12 @@ matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWI
|
|
|
7
7
|
matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
|
|
8
8
|
matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
|
|
9
9
|
matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
matrice_compute/resources_tracker.py,sha256=
|
|
10
|
+
matrice_compute/resources_tracker.py,sha256=AG_lnxoSi1TIDD0atBybntGyvyenwmP7sGCf4shBL4c,59276
|
|
11
11
|
matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
|
|
12
12
|
matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
|
|
13
13
|
matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
18
|
-
matrice_compute-0.1.
|
|
14
|
+
matrice_compute-0.1.38.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
15
|
+
matrice_compute-0.1.38.dist-info/METADATA,sha256=xKPifKrVdGeI9CWY5gAjQYoXYh1RCusoHo9Iwv2_5TM,1038
|
|
16
|
+
matrice_compute-0.1.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
matrice_compute-0.1.38.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
18
|
+
matrice_compute-0.1.38.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|