matrice-compute 0.1.36__tar.gz → 0.1.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/PKG-INFO +1 -1
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/action_instance.py +242 -113
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/resources_tracker.py +17 -4
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/LICENSE.txt +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/README.md +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/pyproject.toml +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/setup.cfg +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/setup.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/task_utils.py +0 -0
|
@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
|
|
|
26
26
|
class ActionInstance:
|
|
27
27
|
"""Base class for tasks that run in Action containers."""
|
|
28
28
|
|
|
29
|
+
# Class-level dictionary to track deployed services and their ports
|
|
30
|
+
# Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
|
|
31
|
+
_deployed_services = {}
|
|
32
|
+
|
|
29
33
|
def __init__(self, scaling: Scaling, action_info: dict):
|
|
30
34
|
"""Initialize an action instance.
|
|
31
35
|
|
|
@@ -85,6 +89,52 @@ class ActionInstance:
|
|
|
85
89
|
raise ValueError(f"Unknown action type: {self.action_type}")
|
|
86
90
|
self.task = self.actions_map[self.action_type]
|
|
87
91
|
|
|
92
|
+
@classmethod
|
|
93
|
+
def get_or_create_triton_ports(cls, service_id, scaling_instance):
|
|
94
|
+
"""Get existing TRITON_PORTS for a service or create new ones.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
service_id (str): Service ID (_idService)
|
|
98
|
+
scaling_instance: Scaling instance to get open ports
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
|
|
102
|
+
"""
|
|
103
|
+
if not service_id:
|
|
104
|
+
# No service_id, generate new ports
|
|
105
|
+
port1 = scaling_instance.get_open_port()
|
|
106
|
+
port2 = scaling_instance.get_open_port()
|
|
107
|
+
port3 = scaling_instance.get_open_port()
|
|
108
|
+
return f"{port1},{port2},{port3}"
|
|
109
|
+
|
|
110
|
+
# Check if ports already exist for this service
|
|
111
|
+
if service_id in cls._deployed_services:
|
|
112
|
+
triton_ports = cls._deployed_services[service_id]["triton_ports"]
|
|
113
|
+
logging.info(
|
|
114
|
+
"Reusing TRITON_PORTS for service %s: %s",
|
|
115
|
+
service_id,
|
|
116
|
+
triton_ports
|
|
117
|
+
)
|
|
118
|
+
return triton_ports
|
|
119
|
+
|
|
120
|
+
# First deployment: generate new ports and store them
|
|
121
|
+
port1 = scaling_instance.get_open_port()
|
|
122
|
+
port2 = scaling_instance.get_open_port()
|
|
123
|
+
port3 = scaling_instance.get_open_port()
|
|
124
|
+
triton_ports = f"{port1},{port2},{port3}"
|
|
125
|
+
|
|
126
|
+
# Store for future use
|
|
127
|
+
cls._deployed_services[service_id] = {
|
|
128
|
+
"triton_ports": triton_ports,
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
logging.info(
|
|
132
|
+
"First deployment for service %s - generated TRITON_PORTS: %s",
|
|
133
|
+
service_id,
|
|
134
|
+
triton_ports
|
|
135
|
+
)
|
|
136
|
+
return triton_ports
|
|
137
|
+
|
|
88
138
|
@log_errors(default_return={}, raise_exception=True, log_error=False)
|
|
89
139
|
def _init_credentials(self):
|
|
90
140
|
"""Initialize Matrice credentials.
|
|
@@ -346,6 +396,7 @@ class ActionInstance:
|
|
|
346
396
|
destination_workspace_path: str = "/usr/src/workspace",
|
|
347
397
|
docker_workdir: str = "",
|
|
348
398
|
extra_pkgs: list = [],
|
|
399
|
+
container_name: str = "",
|
|
349
400
|
):
|
|
350
401
|
"""Build base Docker command with common options.
|
|
351
402
|
|
|
@@ -360,6 +411,7 @@ class ActionInstance:
|
|
|
360
411
|
destination_workspace_path (str): Container workspace path
|
|
361
412
|
docker_workdir (str): Docker working directory
|
|
362
413
|
extra_pkgs (list): List of extra packages to install
|
|
414
|
+
container_name (str): Docker container name (format: {action_type}_{action_id})
|
|
363
415
|
Returns:
|
|
364
416
|
str: Base Docker command
|
|
365
417
|
"""
|
|
@@ -427,14 +479,19 @@ class ActionInstance:
|
|
|
427
479
|
# if the service provider is local, then put --restart unless-stopped
|
|
428
480
|
if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
|
|
429
481
|
env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
|
|
482
|
+
use_restart_policy = "--restart unless-stopped"
|
|
483
|
+
else:
|
|
484
|
+
use_restart_policy = ""
|
|
485
|
+
|
|
486
|
+
# Build container name option if provided
|
|
487
|
+
name_option = f"--name {container_name}" if container_name else ""
|
|
430
488
|
|
|
431
489
|
cmd_parts = [
|
|
432
|
-
f"docker run {use_gpu} ",
|
|
490
|
+
f"docker run -d {use_gpu} {use_restart_policy} ",
|
|
433
491
|
network_config,
|
|
434
492
|
*[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
|
|
435
493
|
*volumes,
|
|
436
494
|
# Container configuration and startup commands
|
|
437
|
-
f"--cidfile ./{self.action_record_id}.cid ",
|
|
438
495
|
f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
|
|
439
496
|
f'/bin/bash -c "cd {docker_workdir} && '
|
|
440
497
|
f"{env_exports} && "
|
|
@@ -836,55 +893,50 @@ class ActionInstance:
|
|
|
836
893
|
self.cmd = cmd
|
|
837
894
|
self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
|
|
838
895
|
|
|
839
|
-
with
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
896
|
+
# Run docker with -d flag to get container ID from stdout
|
|
897
|
+
process = subprocess.Popen(
|
|
898
|
+
shlex.split(self.cmd),
|
|
899
|
+
stdout=subprocess.PIPE,
|
|
900
|
+
stderr=subprocess.PIPE,
|
|
901
|
+
text=True,
|
|
902
|
+
env={**os.environ},
|
|
903
|
+
)
|
|
847
904
|
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
self.action_record_id,
|
|
861
|
-
self.container_id,
|
|
862
|
-
)
|
|
863
|
-
break
|
|
864
|
-
except FileNotFoundError:
|
|
865
|
-
logging.warning(
|
|
866
|
-
"CID file not found for action %s, attempt %d/%d",
|
|
867
|
-
self.action_record_id,
|
|
868
|
-
attempt + 1,
|
|
869
|
-
max_retries,
|
|
870
|
-
)
|
|
871
|
-
time.sleep(retry_delay)
|
|
872
|
-
except Exception as e:
|
|
873
|
-
logging.error(
|
|
874
|
-
"Error reading CID file for action %s: %s",
|
|
875
|
-
self.action_record_id,
|
|
876
|
-
str(e),
|
|
877
|
-
)
|
|
878
|
-
time.sleep(retry_delay)
|
|
879
|
-
else:
|
|
905
|
+
# Use a longer timeout for docker run since --pull=always may need to
|
|
906
|
+
# download large images on first run. Default: 30 minutes (1800 seconds)
|
|
907
|
+
# Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
|
|
908
|
+
docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
|
|
909
|
+
logging.info(
|
|
910
|
+
"Waiting for docker container to start for action %s (timeout: %d seconds)",
|
|
911
|
+
self.action_record_id,
|
|
912
|
+
docker_start_timeout,
|
|
913
|
+
)
|
|
914
|
+
stdout, stderr = process.communicate(timeout=docker_start_timeout)
|
|
915
|
+
|
|
916
|
+
if process.returncode != 0:
|
|
880
917
|
logging.error(
|
|
881
|
-
"
|
|
918
|
+
"Docker run failed for action %s: %s",
|
|
882
919
|
self.action_record_id,
|
|
883
|
-
|
|
920
|
+
stderr,
|
|
884
921
|
)
|
|
885
|
-
raise
|
|
922
|
+
raise RuntimeError(f"Docker run failed: {stderr}")
|
|
923
|
+
|
|
924
|
+
self.container_id = stdout.strip()
|
|
925
|
+
logging.info(
|
|
926
|
+
"Started container for action %s with ID: %s",
|
|
927
|
+
self.action_record_id,
|
|
928
|
+
self.container_id,
|
|
929
|
+
)
|
|
886
930
|
|
|
887
|
-
#
|
|
931
|
+
# Start following container logs in background
|
|
932
|
+
self.process = subprocess.Popen(
|
|
933
|
+
["docker", "logs", "-f", self.container_id],
|
|
934
|
+
stdout=open(self.log_path, "wb"),
|
|
935
|
+
stderr=subprocess.STDOUT,
|
|
936
|
+
start_new_session=True,
|
|
937
|
+
)
|
|
938
|
+
|
|
939
|
+
# Report container id to scaling service
|
|
888
940
|
self.scaling.update_action_container_id(
|
|
889
941
|
action_record_id=self.action_record_id,
|
|
890
942
|
container_id=self.container_id,
|
|
@@ -1050,7 +1102,8 @@ def data_preparation_execute(
|
|
|
1050
1102
|
"Started pulling Docker image with PID: %s",
|
|
1051
1103
|
process.pid,
|
|
1052
1104
|
)
|
|
1053
|
-
|
|
1105
|
+
container_name = f"data_prep_{self.action_record_id}"
|
|
1106
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
|
|
1054
1107
|
logging.info("cmd is: %s", cmd)
|
|
1055
1108
|
self.start(cmd, "data_preparation_log")
|
|
1056
1109
|
|
|
@@ -1079,7 +1132,8 @@ def data_processing_execute(self: ActionInstance):
|
|
|
1079
1132
|
service="bg-job-scheduler",
|
|
1080
1133
|
job_params=action["jobParams"],
|
|
1081
1134
|
)
|
|
1082
|
-
|
|
1135
|
+
container_name = f"data_processing_{self.action_record_id}"
|
|
1136
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
|
|
1083
1137
|
logging.info("cmd: %s", cmd)
|
|
1084
1138
|
self.start(cmd, "data_processing_log")
|
|
1085
1139
|
|
|
@@ -1092,7 +1146,8 @@ def data_split_execute(self: ActionInstance):
|
|
|
1092
1146
|
if not action_details:
|
|
1093
1147
|
return
|
|
1094
1148
|
self.setup_action_requirements(action_details, work_fs, model_family="")
|
|
1095
|
-
|
|
1149
|
+
container_name = f"data_split_{self.action_record_id}"
|
|
1150
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
|
|
1096
1151
|
logging.info("cmd: %s", cmd)
|
|
1097
1152
|
self.start(cmd, "data_split")
|
|
1098
1153
|
|
|
@@ -1107,7 +1162,8 @@ def dataset_annotation_execute(
|
|
|
1107
1162
|
if not action_details:
|
|
1108
1163
|
return
|
|
1109
1164
|
self.setup_action_requirements(action_details, work_fs)
|
|
1110
|
-
|
|
1165
|
+
container_name = f"dataset_annotation_{self.action_record_id}"
|
|
1166
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
|
|
1111
1167
|
logging.info("cmd: %s", cmd)
|
|
1112
1168
|
self.start(cmd, "dataset_annotation")
|
|
1113
1169
|
|
|
@@ -1122,7 +1178,8 @@ def dataset_augmentation_execute(
|
|
|
1122
1178
|
if not action_details:
|
|
1123
1179
|
return
|
|
1124
1180
|
self.setup_action_requirements(action_details, work_fs)
|
|
1125
|
-
|
|
1181
|
+
container_name = f"dataset_augmentation_{self.action_record_id}"
|
|
1182
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
|
|
1126
1183
|
logging.info("cmd: %s", cmd)
|
|
1127
1184
|
self.start(cmd, "dataset_augmentation")
|
|
1128
1185
|
|
|
@@ -1138,7 +1195,8 @@ def augmentation_server_creation_execute(
|
|
|
1138
1195
|
if not action_details:
|
|
1139
1196
|
return
|
|
1140
1197
|
self.setup_action_requirements(action_details, work_fs)
|
|
1141
|
-
|
|
1198
|
+
container_name = f"augmentation_setup_{self.action_record_id}"
|
|
1199
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
|
|
1142
1200
|
logging.info("cmd: %s", cmd)
|
|
1143
1201
|
self.start(cmd, "augmentation_setup")
|
|
1144
1202
|
|
|
@@ -1159,32 +1217,34 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1159
1217
|
|
|
1160
1218
|
project_id = action_details["_idProject"]
|
|
1161
1219
|
|
|
1220
|
+
# Define container names with action_record_id for uniqueness
|
|
1221
|
+
mongodb_container_name = f"database_setup_{self.action_record_id}"
|
|
1222
|
+
qdrant_container_name = f"qdrant_{self.action_record_id}"
|
|
1223
|
+
|
|
1162
1224
|
if action_details["actionDetails"].get("containerId"):
|
|
1163
1225
|
logging.info(
|
|
1164
|
-
"Using existing container ID for
|
|
1226
|
+
"Using existing container ID for database setup: %s",
|
|
1165
1227
|
action_details["actionDetails"]["containerId"],
|
|
1166
1228
|
)
|
|
1167
1229
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1168
1230
|
cmd = "docker restart " + self.docker_container
|
|
1169
|
-
self.start(cmd, "
|
|
1231
|
+
self.start(cmd, "database_setup")
|
|
1170
1232
|
|
|
1171
|
-
#qdrant restart
|
|
1172
|
-
qdrant_cmd = "docker restart
|
|
1173
|
-
self.start(qdrant_cmd,
|
|
1233
|
+
# qdrant restart
|
|
1234
|
+
qdrant_cmd = f"docker restart {qdrant_container_name}"
|
|
1235
|
+
self.start(qdrant_cmd, "qdrant_setup")
|
|
1174
1236
|
|
|
1175
1237
|
return
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
|
|
1179
1238
|
|
|
1239
|
+
dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
|
|
1180
1240
|
|
|
1181
1241
|
# MongoDB container with --net=host (Port: 27020:27017)
|
|
1182
1242
|
cmd = (
|
|
1183
|
-
f"docker run --pull=always --net=host "
|
|
1243
|
+
f"docker run -d --pull=always --net=host "
|
|
1244
|
+
f"--name {mongodb_container_name} "
|
|
1245
|
+
f"-v matrice_myvol:/matrice_data "
|
|
1184
1246
|
f"-v {dbPath}:{dbPath} "
|
|
1185
|
-
f"--name database_setup_{self.action_record_id} "
|
|
1186
1247
|
f"-v /var/run/docker.sock:/var/run/docker.sock "
|
|
1187
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1188
1248
|
f"-e ACTION_RECORD_ID={self.action_record_id} "
|
|
1189
1249
|
f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
|
|
1190
1250
|
f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
|
|
@@ -1194,6 +1254,23 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1194
1254
|
)
|
|
1195
1255
|
logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
|
|
1196
1256
|
|
|
1257
|
+
# Qdrant container with --net=host (Port: 6334)
|
|
1258
|
+
qdrant_cmd = (
|
|
1259
|
+
f"docker run -d --pull=always --net=host "
|
|
1260
|
+
f"--name {qdrant_container_name} "
|
|
1261
|
+
f"-v matrice_myvol:/matrice_data "
|
|
1262
|
+
f"qdrant/qdrant:latest "
|
|
1263
|
+
)
|
|
1264
|
+
logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
|
|
1265
|
+
|
|
1266
|
+
# Start Qdrant container
|
|
1267
|
+
qdrant_process = subprocess.Popen(
|
|
1268
|
+
qdrant_cmd,
|
|
1269
|
+
shell=True,
|
|
1270
|
+
stdout=subprocess.PIPE,
|
|
1271
|
+
stderr=subprocess.PIPE,
|
|
1272
|
+
)
|
|
1273
|
+
logging.info("Qdrant container started successfully")
|
|
1197
1274
|
|
|
1198
1275
|
# Docker Command run
|
|
1199
1276
|
self.start(cmd, "database_setup")
|
|
@@ -1213,6 +1290,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1213
1290
|
|
|
1214
1291
|
self.setup_action_requirements(action_details)
|
|
1215
1292
|
|
|
1293
|
+
container_name = f"facial_recognition_{self.action_record_id}"
|
|
1294
|
+
|
|
1216
1295
|
if action_details["actionDetails"].get("containerId"):
|
|
1217
1296
|
logging.info(
|
|
1218
1297
|
"Using existing container ID for facial recognition worker: %s",
|
|
@@ -1226,15 +1305,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1226
1305
|
# Facial recognition worker container with --net=host (Port: 8081)
|
|
1227
1306
|
worker_cmd = (
|
|
1228
1307
|
f"docker run -d --pull=always --net=host "
|
|
1229
|
-
f"--name
|
|
1230
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1308
|
+
f"--name {container_name} "
|
|
1231
1309
|
f"-v matrice_myvol:/matrice_data "
|
|
1232
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1233
1310
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1234
1311
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1235
1312
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1236
1313
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1237
|
-
f'
|
|
1314
|
+
f'--restart=unless-stopped '
|
|
1238
1315
|
f"{image}"
|
|
1239
1316
|
)
|
|
1240
1317
|
logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
|
|
@@ -1256,6 +1333,8 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1256
1333
|
|
|
1257
1334
|
self.setup_action_requirements(action_details)
|
|
1258
1335
|
|
|
1336
|
+
container_name = f"lpr_{self.action_record_id}"
|
|
1337
|
+
|
|
1259
1338
|
if action_details["actionDetails"].get("containerId"):
|
|
1260
1339
|
logging.info(
|
|
1261
1340
|
"Using existing container ID for LPR worker: %s",
|
|
@@ -1269,15 +1348,14 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1269
1348
|
# LPR worker container with --net=host (Port: 8082)
|
|
1270
1349
|
worker_cmd = (
|
|
1271
1350
|
f"docker run -d --net=host --pull=always "
|
|
1272
|
-
f"--name
|
|
1273
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1351
|
+
f"--name {container_name} "
|
|
1274
1352
|
f"-v matrice_myvol:/matrice_data "
|
|
1275
1353
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1276
1354
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1277
1355
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1278
1356
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1279
1357
|
f'-e PORT=8082 '
|
|
1280
|
-
f'
|
|
1358
|
+
f'--restart=unless-stopped '
|
|
1281
1359
|
f"{image}"
|
|
1282
1360
|
)
|
|
1283
1361
|
logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
|
|
@@ -1308,6 +1386,8 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1308
1386
|
|
|
1309
1387
|
logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
|
|
1310
1388
|
|
|
1389
|
+
container_name = f"inference_ws_{self.action_record_id}"
|
|
1390
|
+
|
|
1311
1391
|
if action_details["actionDetails"].get("containerId"):
|
|
1312
1392
|
logging.info(
|
|
1313
1393
|
"Using existing container ID for inference WebSocket server: %s",
|
|
@@ -1321,12 +1401,11 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1321
1401
|
# Inference WebSocket server with --net=host (Port: 8102)
|
|
1322
1402
|
worker_cmd = (
|
|
1323
1403
|
f"docker run -d --pull=always --net=host "
|
|
1324
|
-
f"--name
|
|
1325
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1404
|
+
f"--name {container_name} "
|
|
1326
1405
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1327
1406
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1328
1407
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1329
|
-
f'
|
|
1408
|
+
f'--restart=unless-stopped '
|
|
1330
1409
|
f"{image} "
|
|
1331
1410
|
f"./app "
|
|
1332
1411
|
f"{self.action_record_id} "
|
|
@@ -1357,6 +1436,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1357
1436
|
|
|
1358
1437
|
logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
|
|
1359
1438
|
|
|
1439
|
+
container_name = f"fe_streaming_{self.action_record_id}"
|
|
1440
|
+
|
|
1360
1441
|
if action_details["actionDetails"].get("containerId"):
|
|
1361
1442
|
logging.info(
|
|
1362
1443
|
"Using existing container ID for frontend streaming: %s",
|
|
@@ -1370,15 +1451,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1370
1451
|
# Frontend streaming with --net=host (Port: 3000)
|
|
1371
1452
|
worker_cmd = (
|
|
1372
1453
|
f"docker run -d --pull=always --net=host "
|
|
1373
|
-
f"--name
|
|
1374
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1454
|
+
f"--name {container_name} "
|
|
1375
1455
|
f"-v matrice_myvol:/matrice_data "
|
|
1376
1456
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1377
1457
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1378
1458
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1379
1459
|
f"-e PORT=3000 "
|
|
1380
1460
|
f'-e WS_HOST="{ws_url}" '
|
|
1381
|
-
f'
|
|
1461
|
+
f'--restart=unless-stopped '
|
|
1382
1462
|
f"{image}"
|
|
1383
1463
|
)
|
|
1384
1464
|
logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
|
|
@@ -1403,6 +1483,8 @@ def fe_analytics_service_execute(self: ActionInstance):
|
|
|
1403
1483
|
|
|
1404
1484
|
project_id = action_details["_idProject"]
|
|
1405
1485
|
|
|
1486
|
+
container_name = f"fe_analytics_{self.action_record_id}"
|
|
1487
|
+
|
|
1406
1488
|
if action_details["actionDetails"].get("containerId"):
|
|
1407
1489
|
logging.info(
|
|
1408
1490
|
"Using existing container ID for frontend analytics service: %s",
|
|
@@ -1416,15 +1498,14 @@ def fe_analytics_service_execute(self: ActionInstance):
|
|
|
1416
1498
|
# Frontend analytics service with --net=host (Port: 3001)
|
|
1417
1499
|
worker_cmd = (
|
|
1418
1500
|
f"docker run -d --pull=always --net=host "
|
|
1419
|
-
f"--name
|
|
1420
|
-
f"--cidfile ./{self.action_record_id}.cid "
|
|
1501
|
+
f"--name {container_name} "
|
|
1421
1502
|
f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
|
|
1422
1503
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1423
1504
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1424
1505
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1425
1506
|
f"-e PORT=3001 "
|
|
1426
1507
|
f'-e PROJECT_ID="{project_id}" '
|
|
1427
|
-
f'
|
|
1508
|
+
f'--restart=unless-stopped '
|
|
1428
1509
|
f"{image}"
|
|
1429
1510
|
)
|
|
1430
1511
|
logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
|
|
@@ -1449,7 +1530,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
|
|
|
1449
1530
|
else:
|
|
1450
1531
|
return
|
|
1451
1532
|
use_gpu = self.get_gpu_config(action_details)
|
|
1452
|
-
|
|
1533
|
+
container_name = f"dataset_generation_{self.action_record_id}"
|
|
1534
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
|
|
1453
1535
|
logging.info("cmd is: %s", cmd)
|
|
1454
1536
|
self.start(cmd, "dataset_generation")
|
|
1455
1537
|
|
|
@@ -1470,7 +1552,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
|
|
|
1470
1552
|
else:
|
|
1471
1553
|
return
|
|
1472
1554
|
use_gpu = self.get_gpu_config(action_details)
|
|
1473
|
-
|
|
1555
|
+
container_name = f"synthetic_data_setup_{self.action_record_id}"
|
|
1556
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
|
|
1474
1557
|
logging.info("cmd is: %s", cmd)
|
|
1475
1558
|
self.start(cmd, "synthetic_data_setup")
|
|
1476
1559
|
|
|
@@ -1507,6 +1590,8 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1507
1590
|
|
|
1508
1591
|
redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
|
|
1509
1592
|
|
|
1593
|
+
# Define container names with action_record_id for uniqueness
|
|
1594
|
+
redis_container_name = f"redis_{self.action_record_id}"
|
|
1510
1595
|
|
|
1511
1596
|
if action_details["actionDetails"].get("containerId"):
|
|
1512
1597
|
logging.info(
|
|
@@ -1518,18 +1603,34 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1518
1603
|
self.start(cmd, "redis_setup")
|
|
1519
1604
|
|
|
1520
1605
|
# Redis container restart
|
|
1521
|
-
redis_restart_cmd = "docker restart
|
|
1606
|
+
redis_restart_cmd = f"docker restart {redis_container_name}"
|
|
1522
1607
|
self.start(redis_restart_cmd, "redis")
|
|
1523
1608
|
|
|
1524
1609
|
return
|
|
1525
1610
|
|
|
1526
|
-
# Redis container with --net=host (Port: 6379)
|
|
1611
|
+
# Redis container with --net=host (Port: 6379) with optimized configuration
|
|
1527
1612
|
redis_cmd = (
|
|
1528
1613
|
f"docker run -d --net=host "
|
|
1529
|
-
f"--name
|
|
1614
|
+
f"--name {redis_container_name} "
|
|
1530
1615
|
f"--restart unless-stopped "
|
|
1531
1616
|
f"{redis_image} "
|
|
1532
|
-
f"redis-server --bind 0.0.0.0
|
|
1617
|
+
f"redis-server --bind 0.0.0.0 "
|
|
1618
|
+
f"--appendonly no "
|
|
1619
|
+
f'--save "" '
|
|
1620
|
+
f"--maxmemory 30gb "
|
|
1621
|
+
f"--maxmemory-policy allkeys-lru "
|
|
1622
|
+
f"--io-threads 4 "
|
|
1623
|
+
f"--io-threads-do-reads yes "
|
|
1624
|
+
f"--stream-node-max-bytes 8192 "
|
|
1625
|
+
f"--stream-node-max-entries 1000 "
|
|
1626
|
+
f"--hz 100 "
|
|
1627
|
+
f"--tcp-backlog 2048 "
|
|
1628
|
+
f"--timeout 0 "
|
|
1629
|
+
f"--lazyfree-lazy-eviction yes "
|
|
1630
|
+
f"--lazyfree-lazy-expire yes "
|
|
1631
|
+
f"--lazyfree-lazy-server-del yes "
|
|
1632
|
+
f"--activedefrag yes "
|
|
1633
|
+
f"--requirepass {redis_password}"
|
|
1533
1634
|
)
|
|
1534
1635
|
|
|
1535
1636
|
logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
|
|
@@ -1553,8 +1654,9 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1553
1654
|
|
|
1554
1655
|
# bg-redis management container with --net=host (Port: 8082)
|
|
1555
1656
|
cmd = (
|
|
1556
|
-
f"docker run --net=host "
|
|
1557
|
-
|
|
1657
|
+
f"docker run -d --net=host "
|
|
1658
|
+
f"--restart unless-stopped "
|
|
1659
|
+
f"--name bg-redis_{self.action_record_id} "
|
|
1558
1660
|
f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
|
|
1559
1661
|
f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
|
|
1560
1662
|
f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
|
|
@@ -1581,7 +1683,8 @@ def deploy_aggregator_execute(
|
|
|
1581
1683
|
if not action_details:
|
|
1582
1684
|
return
|
|
1583
1685
|
self.setup_action_requirements(action_details, work_fs)
|
|
1584
|
-
|
|
1686
|
+
container_name = f"deploy_aggregator_{self.action_record_id}"
|
|
1687
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
|
|
1585
1688
|
logging.info("cmd: %s", cmd)
|
|
1586
1689
|
self.start(cmd, "deploy_aggregator")
|
|
1587
1690
|
|
|
@@ -1597,6 +1700,10 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1597
1700
|
return
|
|
1598
1701
|
action_id = action_details["_id"]
|
|
1599
1702
|
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1703
|
+
|
|
1704
|
+
# Get the service ID to track deployments
|
|
1705
|
+
service_id = action_details.get("_idService")
|
|
1706
|
+
|
|
1600
1707
|
self.setup_action_requirements(
|
|
1601
1708
|
action_details,
|
|
1602
1709
|
work_fs,
|
|
@@ -1604,17 +1711,29 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1604
1711
|
action_id=action_id,
|
|
1605
1712
|
)
|
|
1606
1713
|
|
|
1607
|
-
#
|
|
1608
|
-
# This uses the best-fit algorithm to select the most appropriate GPU(s)
|
|
1609
|
-
use_gpu = self.get_gpu_config(action_details)
|
|
1610
|
-
|
|
1611
|
-
# Override: If GPU is required, use all available GPUs
|
|
1714
|
+
# Use all GPUs if GPU is required
|
|
1612
1715
|
gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
|
|
1613
1716
|
if gpuRequired:
|
|
1614
1717
|
use_gpu = "--runtime=nvidia --gpus all"
|
|
1718
|
+
else:
|
|
1719
|
+
use_gpu = ""
|
|
1720
|
+
|
|
1721
|
+
logging.info(
|
|
1722
|
+
"Action %s: Model deployment GPU config: %s",
|
|
1723
|
+
action_id,
|
|
1724
|
+
use_gpu if use_gpu else "CPU-only"
|
|
1725
|
+
)
|
|
1726
|
+
|
|
1727
|
+
# Get or create TRITON_PORTS (uses utility method)
|
|
1728
|
+
triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
|
|
1615
1729
|
|
|
1616
|
-
extra_env_vars = {
|
|
1617
|
-
|
|
1730
|
+
extra_env_vars = {
|
|
1731
|
+
"INTERNAL_PORT": internal_port,
|
|
1732
|
+
"TRITON_PORTS": triton_ports
|
|
1733
|
+
}
|
|
1734
|
+
|
|
1735
|
+
container_name = f"model_deploy_{self.action_record_id}"
|
|
1736
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
|
|
1618
1737
|
logging.info("cmd is: %s", cmd)
|
|
1619
1738
|
self.start(cmd, "deploy_log")
|
|
1620
1739
|
|
|
@@ -1647,7 +1766,8 @@ def model_train_execute(self: ActionInstance):
|
|
|
1647
1766
|
self.start(cmd, "train_log")
|
|
1648
1767
|
return
|
|
1649
1768
|
|
|
1650
|
-
|
|
1769
|
+
container_name = f"model_train_{self.action_record_id}"
|
|
1770
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
|
|
1651
1771
|
logging.info("cmd is: %s", cmd)
|
|
1652
1772
|
self.start(cmd, "train_log")
|
|
1653
1773
|
|
|
@@ -1670,7 +1790,7 @@ def model_eval_execute(self: ActionInstance):
|
|
|
1670
1790
|
)
|
|
1671
1791
|
if action_details["actionDetails"].get("containerId"):
|
|
1672
1792
|
logging.info(
|
|
1673
|
-
"Using existing container ID for
|
|
1793
|
+
"Using existing container ID for evaluation: %s",
|
|
1674
1794
|
action_details["actionDetails"]["containerId"],
|
|
1675
1795
|
)
|
|
1676
1796
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1678,7 +1798,8 @@ def model_eval_execute(self: ActionInstance):
|
|
|
1678
1798
|
self.start(cmd, "eval_log")
|
|
1679
1799
|
return
|
|
1680
1800
|
|
|
1681
|
-
|
|
1801
|
+
container_name = f"model_eval_{self.action_record_id}"
|
|
1802
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
|
|
1682
1803
|
logging.info("cmd is: %s", cmd)
|
|
1683
1804
|
self.start(cmd, "eval_log")
|
|
1684
1805
|
|
|
@@ -1704,7 +1825,7 @@ def model_export_execute(self: ActionInstance):
|
|
|
1704
1825
|
)
|
|
1705
1826
|
if action_details["actionDetails"].get("containerId"):
|
|
1706
1827
|
logging.info(
|
|
1707
|
-
"Using existing container ID for
|
|
1828
|
+
"Using existing container ID for export: %s",
|
|
1708
1829
|
action_details["actionDetails"]["containerId"],
|
|
1709
1830
|
)
|
|
1710
1831
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1712,7 +1833,8 @@ def model_export_execute(self: ActionInstance):
|
|
|
1712
1833
|
self.start(cmd, "export_log")
|
|
1713
1834
|
return
|
|
1714
1835
|
|
|
1715
|
-
|
|
1836
|
+
container_name = f"model_export_{self.action_record_id}"
|
|
1837
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
|
|
1716
1838
|
logging.info("cmd is: %s", cmd)
|
|
1717
1839
|
self.start(cmd, "export_log")
|
|
1718
1840
|
|
|
@@ -1728,7 +1850,8 @@ def image_build_execute(self: ActionInstance):
|
|
|
1728
1850
|
action_id = action_details["_id"]
|
|
1729
1851
|
internal_api_key = self.get_internal_api_key(action_id)
|
|
1730
1852
|
extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
|
|
1731
|
-
|
|
1853
|
+
container_name = f"image_build_{self.action_record_id}"
|
|
1854
|
+
cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
|
|
1732
1855
|
logging.info("cmd is: %s", cmd)
|
|
1733
1856
|
self.start(cmd, "image_build_log")
|
|
1734
1857
|
|
|
@@ -1740,7 +1863,8 @@ def resource_clone_execute(self: ActionInstance):
|
|
|
1740
1863
|
if not action_details:
|
|
1741
1864
|
return
|
|
1742
1865
|
self.setup_action_requirements(action_details)
|
|
1743
|
-
|
|
1866
|
+
container_name = f"resource_clone_{self.action_record_id}"
|
|
1867
|
+
cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
|
|
1744
1868
|
logging.info("cmd is: %s", cmd)
|
|
1745
1869
|
self.start(cmd, "resource_clone")
|
|
1746
1870
|
|
|
@@ -1758,7 +1882,7 @@ def streaming_gateway_execute(self: ActionInstance):
|
|
|
1758
1882
|
)
|
|
1759
1883
|
if action_details["actionDetails"].get("containerId"):
|
|
1760
1884
|
logging.info(
|
|
1761
|
-
"Using existing container ID for
|
|
1885
|
+
"Using existing container ID for streaming gateway: %s",
|
|
1762
1886
|
action_details["actionDetails"]["containerId"],
|
|
1763
1887
|
)
|
|
1764
1888
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1766,7 +1890,8 @@ def streaming_gateway_execute(self: ActionInstance):
|
|
|
1766
1890
|
self.start(cmd, "streaming_gateway")
|
|
1767
1891
|
return
|
|
1768
1892
|
|
|
1769
|
-
|
|
1893
|
+
container_name = f"streaming_gateway_{self.action_record_id}"
|
|
1894
|
+
cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
|
|
1770
1895
|
logging.info("cmd is: %s", cmd)
|
|
1771
1896
|
self.start(cmd, "streaming_gateway")
|
|
1772
1897
|
|
|
@@ -1862,7 +1987,7 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1862
1987
|
|
|
1863
1988
|
if action_details["actionDetails"].get("containerId"):
|
|
1864
1989
|
logging.info(
|
|
1865
|
-
"Using existing container ID for
|
|
1990
|
+
"Using existing container ID for kafka: %s",
|
|
1866
1991
|
action_details["actionDetails"]["containerId"],
|
|
1867
1992
|
)
|
|
1868
1993
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1870,10 +1995,12 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1870
1995
|
self.start(cmd, "kafka_setup")
|
|
1871
1996
|
return
|
|
1872
1997
|
|
|
1998
|
+
container_name = f"kafka_{self.action_record_id}"
|
|
1873
1999
|
|
|
1874
2000
|
# Kafka container with --net=host (Ports: 9092, 9093)
|
|
1875
2001
|
cmd = (
|
|
1876
|
-
f"docker run --net=host "
|
|
2002
|
+
f"docker run -d --net=host "
|
|
2003
|
+
f"--name {container_name} "
|
|
1877
2004
|
f"{env_args} "
|
|
1878
2005
|
f"--shm-size=30G --pull=always "
|
|
1879
2006
|
f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
|
|
@@ -1906,6 +2033,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
1906
2033
|
|
|
1907
2034
|
self.setup_action_requirements(action_details)
|
|
1908
2035
|
|
|
2036
|
+
container_name = f"inference_tracker_{self.action_record_id}"
|
|
2037
|
+
|
|
1909
2038
|
if action_details["actionDetails"].get("containerId"):
|
|
1910
2039
|
logging.info(
|
|
1911
2040
|
"Using existing container ID for inference tracker: %s",
|
|
@@ -1919,14 +2048,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
1919
2048
|
# This is the existing Docker run command
|
|
1920
2049
|
worker_cmd = (
|
|
1921
2050
|
f"docker run -d --pull=always --net=host "
|
|
1922
|
-
|
|
1923
|
-
f"--name inference-tracker-worker "
|
|
2051
|
+
f"--name {container_name} "
|
|
1924
2052
|
f"-v matrice_myvol:/matrice_data "
|
|
1925
2053
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1926
2054
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1927
2055
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1928
2056
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1929
|
-
f'
|
|
2057
|
+
f'--restart=unless-stopped '
|
|
1930
2058
|
f"{image}"
|
|
1931
2059
|
)
|
|
1932
2060
|
|
|
@@ -1948,9 +2076,11 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
1948
2076
|
|
|
1949
2077
|
self.setup_action_requirements(action_details)
|
|
1950
2078
|
|
|
2079
|
+
container_name = f"video_storage_{self.action_record_id}"
|
|
2080
|
+
|
|
1951
2081
|
if action_details["actionDetails"].get("containerId"):
|
|
1952
2082
|
logging.info(
|
|
1953
|
-
"Using existing container ID for
|
|
2083
|
+
"Using existing container ID for video storage: %s",
|
|
1954
2084
|
action_details["actionDetails"]["containerId"],
|
|
1955
2085
|
)
|
|
1956
2086
|
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
@@ -1961,14 +2091,13 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
1961
2091
|
# This is the existing Docker run command
|
|
1962
2092
|
worker_cmd = (
|
|
1963
2093
|
f"docker run -d --pull=always --net=host "
|
|
1964
|
-
|
|
1965
|
-
f"--name media_server "
|
|
2094
|
+
f"--name {container_name} "
|
|
1966
2095
|
f"-v matrice_myvol:/matrice_data "
|
|
1967
2096
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1968
2097
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1969
2098
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1970
2099
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1971
|
-
f'
|
|
2100
|
+
f'--restart=unless-stopped '
|
|
1972
2101
|
f"{image}"
|
|
1973
2102
|
)
|
|
1974
2103
|
|
|
@@ -916,14 +916,27 @@ class ResourcesTracker:
|
|
|
916
916
|
gpu_count = 0
|
|
917
917
|
|
|
918
918
|
for gpu in gpu_data['gpus']:
|
|
919
|
-
|
|
919
|
+
# Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
|
|
920
|
+
# which would otherwise produce negative "free" memory.
|
|
921
|
+
total_mb = gpu.get('memory_total', 0) or 0
|
|
922
|
+
used_mb = gpu.get('memory_used', 0) or 0
|
|
923
|
+
free_mb = total_mb - used_mb
|
|
924
|
+
if free_mb < 0:
|
|
925
|
+
logging.debug(
|
|
926
|
+
"Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
|
|
927
|
+
gpu.get('idx'),
|
|
928
|
+
total_mb,
|
|
929
|
+
used_mb,
|
|
930
|
+
)
|
|
931
|
+
free_mb = 0
|
|
932
|
+
gpu_memory_free += free_mb
|
|
920
933
|
gpu_utilization += gpu['utilization']
|
|
921
934
|
gpu_count += 1
|
|
922
935
|
|
|
923
936
|
if gpu_count > 0:
|
|
924
937
|
gpu_utilization /= gpu_count
|
|
925
|
-
|
|
926
|
-
return gpu_memory_free, gpu_utilization
|
|
938
|
+
|
|
939
|
+
return max(0, gpu_memory_free), gpu_utilization
|
|
927
940
|
|
|
928
941
|
@log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
|
|
929
942
|
def _get_gpu_resources_direct(self) -> Tuple[int, float]:
|
|
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
|
|
|
1218
1231
|
availableCPU=available_cpu,
|
|
1219
1232
|
availableMemory=available_memory,
|
|
1220
1233
|
availableGPU=100 - gpu_utilization,
|
|
1221
|
-
availableGPUMemory=gpu_memory_free,
|
|
1234
|
+
availableGPUMemory=max(0, gpu_memory_free),
|
|
1222
1235
|
)
|
|
1223
1236
|
if err is not None:
|
|
1224
1237
|
logging.error(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.36 → matrice_compute-0.1.38}/matrice_compute.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/actions_scaledown_manager.py
RENAMED
|
File without changes
|
{matrice_compute-0.1.36 → matrice_compute-0.1.38}/src/matrice_compute/compute_operations_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|