matrice-compute 0.1.37__tar.gz → 0.1.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/action_instance.py +240 -114
  4. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/resources_tracker.py +17 -4
  5. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/LICENSE.txt +0 -0
  6. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/README.md +0 -0
  7. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/matrice_compute.egg-info/SOURCES.txt +0 -0
  8. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/matrice_compute.egg-info/dependency_links.txt +0 -0
  9. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/matrice_compute.egg-info/not-zip-safe +0 -0
  10. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/matrice_compute.egg-info/top_level.txt +0 -0
  11. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/pyproject.toml +0 -0
  12. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/setup.cfg +0 -0
  13. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/setup.py +0 -0
  14. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/__init__.py +0 -0
  15. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/actions_manager.py +0 -0
  16. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  17. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/compute_operations_handler.py +0 -0
  18. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/instance_manager.py +0 -0
  19. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/instance_utils.py +0 -0
  20. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/prechecks.py +0 -0
  21. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/py.typed +0 -0
  22. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.37 → matrice_compute-0.1.39}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.37
3
+ Version: 0.1.39
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.37
3
+ Version: 0.1.39
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
26
26
  class ActionInstance:
27
27
  """Base class for tasks that run in Action containers."""
28
28
 
29
+ # Class-level dictionary to track deployed services and their ports
30
+ # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
31
+ _deployed_services = {}
32
+
29
33
  def __init__(self, scaling: Scaling, action_info: dict):
30
34
  """Initialize an action instance.
31
35
 
@@ -85,6 +89,52 @@ class ActionInstance:
85
89
  raise ValueError(f"Unknown action type: {self.action_type}")
86
90
  self.task = self.actions_map[self.action_type]
87
91
 
92
@classmethod
def get_or_create_triton_ports(cls, service_id, scaling_instance):
    """Return the TRITON_PORTS string for a service, allocating it on first use.

    Ports are remembered per service in the class-level ``_deployed_services``
    dict, so every later call with the same ``service_id`` gets the exact
    string produced by the first call. A falsy ``service_id`` (``None``,
    ``""``) is never cached: each such call allocates a fresh set of ports.

    Args:
        service_id (str): Service ID (``_idService``); may be falsy.
        scaling_instance: Object exposing ``get_open_port()``; it is called
            three times whenever a new set of ports is allocated.

    Returns:
        str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003"),
        in the order ``get_open_port()`` returned them.
    """

    def _allocate_ports():
        # Single place that defines "a set of Triton ports": three
        # consecutive get_open_port() results joined in call order.
        return ",".join(str(scaling_instance.get_open_port()) for _ in range(3))

    if not service_id:
        # Nothing to key the cache on, so hand out fresh ports every time.
        return _allocate_ports()

    if service_id in cls._deployed_services:
        triton_ports = cls._deployed_services[service_id]["triton_ports"]
        logging.info(
            "Reusing TRITON_PORTS for service %s: %s",
            service_id,
            triton_ports,
        )
        return triton_ports

    # First deployment for this service: allocate and remember the ports so
    # subsequent deployments of the same service reuse them.
    triton_ports = _allocate_ports()
    cls._deployed_services[service_id] = {
        "triton_ports": triton_ports,
    }

    logging.info(
        "First deployment for service %s - generated TRITON_PORTS: %s",
        service_id,
        triton_ports,
    )
    return triton_ports
137
+
88
138
  @log_errors(default_return={}, raise_exception=True, log_error=False)
89
139
  def _init_credentials(self):
90
140
  """Initialize Matrice credentials.
@@ -346,6 +396,7 @@ class ActionInstance:
346
396
  destination_workspace_path: str = "/usr/src/workspace",
347
397
  docker_workdir: str = "",
348
398
  extra_pkgs: list = [],
399
+ container_name: str = "",
349
400
  ):
350
401
  """Build base Docker command with common options.
351
402
 
@@ -360,6 +411,7 @@ class ActionInstance:
360
411
  destination_workspace_path (str): Container workspace path
361
412
  docker_workdir (str): Docker working directory
362
413
  extra_pkgs (list): List of extra packages to install
414
+ container_name (str): Docker container name (format: {action_type}_{action_id})
363
415
  Returns:
364
416
  str: Base Docker command
365
417
  """
@@ -426,17 +478,19 @@ class ActionInstance:
426
478
 
427
479
  # if the service provider is local, then put --restart unless-stopped
428
480
  if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
429
- use_restart_policy = "--restart unless-stopped"
481
+ use_restart_policy = "--restart=unless-stopped "
430
482
  else:
431
483
  use_restart_policy = ""
432
484
 
485
+ # Build container name option if provided
486
+ name_option = f"--name {container_name}" if container_name else ""
487
+
433
488
  cmd_parts = [
434
- f"docker run {use_gpu} {use_restart_policy} ",
489
+ f"docker run -d {use_gpu} {use_restart_policy} ",
435
490
  network_config,
436
491
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
437
492
  *volumes,
438
493
  # Container configuration and startup commands
439
- f"--cidfile ./{self.action_record_id}.cid ",
440
494
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
441
495
  f'/bin/bash -c "cd {docker_workdir} && '
442
496
  f"{env_exports} && "
@@ -838,55 +892,50 @@ class ActionInstance:
838
892
  self.cmd = cmd
839
893
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
840
894
 
841
- with open(self.log_path, "wb") as out:
842
- self.process = subprocess.Popen(
843
- shlex.split(self.cmd),
844
- stdout=out,
845
- stderr=out,
846
- env={**os.environ},
847
- start_new_session=True,
848
- )
895
+ # Run docker with -d flag to get container ID from stdout
896
+ process = subprocess.Popen(
897
+ shlex.split(self.cmd),
898
+ stdout=subprocess.PIPE,
899
+ stderr=subprocess.PIPE,
900
+ text=True,
901
+ env={**os.environ},
902
+ )
849
903
 
850
- self.container_id = None
851
-
852
- cid_file_path = f"./{self.action_record_id}.cid"
853
- max_retries = 5
854
- retry_delay = 1 # seconds
855
- for attempt in range(max_retries):
856
- try:
857
- with open(cid_file_path, "r") as cid_file:
858
- container_id = cid_file.read().strip()
859
- self.container_id = container_id
860
- logging.info(
861
- "Started process for action %s with container ID: %s",
862
- self.action_record_id,
863
- self.container_id,
864
- )
865
- break
866
- except FileNotFoundError:
867
- logging.warning(
868
- "CID file not found for action %s, attempt %d/%d",
869
- self.action_record_id,
870
- attempt + 1,
871
- max_retries,
872
- )
873
- time.sleep(retry_delay)
874
- except Exception as e:
875
- logging.error(
876
- "Error reading CID file for action %s: %s",
877
- self.action_record_id,
878
- str(e),
879
- )
880
- time.sleep(retry_delay)
881
- else:
904
+ # Use a longer timeout for docker run since --pull=always may need to
905
+ # download large images on first run. Default: 30 minutes (1800 seconds)
906
+ # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
907
+ docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
908
+ logging.info(
909
+ "Waiting for docker container to start for action %s (timeout: %d seconds)",
910
+ self.action_record_id,
911
+ docker_start_timeout,
912
+ )
913
+ stdout, stderr = process.communicate(timeout=docker_start_timeout)
914
+
915
+ if process.returncode != 0:
882
916
  logging.error(
883
- "Failed to read CID file for action %s after %d attempts",
917
+ "Docker run failed for action %s: %s",
884
918
  self.action_record_id,
885
- max_retries,
919
+ stderr,
886
920
  )
887
- raise Exception("Failed to start process: CID file not found")
921
+ raise RuntimeError(f"Docker run failed: {stderr}")
888
922
 
889
- # report container id to scaling service
923
+ self.container_id = stdout.strip()
924
+ logging.info(
925
+ "Started container for action %s with ID: %s",
926
+ self.action_record_id,
927
+ self.container_id,
928
+ )
929
+
930
+ # Start following container logs in background
931
+ self.process = subprocess.Popen(
932
+ ["docker", "logs", "-f", self.container_id],
933
+ stdout=open(self.log_path, "wb"),
934
+ stderr=subprocess.STDOUT,
935
+ start_new_session=True,
936
+ )
937
+
938
+ # Report container id to scaling service
890
939
  self.scaling.update_action_container_id(
891
940
  action_record_id=self.action_record_id,
892
941
  container_id=self.container_id,
@@ -1052,7 +1101,8 @@ def data_preparation_execute(
1052
1101
  "Started pulling Docker image with PID: %s",
1053
1102
  process.pid,
1054
1103
  )
1055
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1104
+ container_name = f"data_prep_{self.action_record_id}"
1105
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1056
1106
  logging.info("cmd is: %s", cmd)
1057
1107
  self.start(cmd, "data_preparation_log")
1058
1108
 
@@ -1081,7 +1131,8 @@ def data_processing_execute(self: ActionInstance):
1081
1131
  service="bg-job-scheduler",
1082
1132
  job_params=action["jobParams"],
1083
1133
  )
1084
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1134
+ container_name = f"data_processing_{self.action_record_id}"
1135
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1085
1136
  logging.info("cmd: %s", cmd)
1086
1137
  self.start(cmd, "data_processing_log")
1087
1138
 
@@ -1094,7 +1145,8 @@ def data_split_execute(self: ActionInstance):
1094
1145
  if not action_details:
1095
1146
  return
1096
1147
  self.setup_action_requirements(action_details, work_fs, model_family="")
1097
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1148
+ container_name = f"data_split_{self.action_record_id}"
1149
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1098
1150
  logging.info("cmd: %s", cmd)
1099
1151
  self.start(cmd, "data_split")
1100
1152
 
@@ -1109,7 +1161,8 @@ def dataset_annotation_execute(
1109
1161
  if not action_details:
1110
1162
  return
1111
1163
  self.setup_action_requirements(action_details, work_fs)
1112
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1164
+ container_name = f"dataset_annotation_{self.action_record_id}"
1165
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1113
1166
  logging.info("cmd: %s", cmd)
1114
1167
  self.start(cmd, "dataset_annotation")
1115
1168
 
@@ -1124,7 +1177,8 @@ def dataset_augmentation_execute(
1124
1177
  if not action_details:
1125
1178
  return
1126
1179
  self.setup_action_requirements(action_details, work_fs)
1127
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1180
+ container_name = f"dataset_augmentation_{self.action_record_id}"
1181
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1128
1182
  logging.info("cmd: %s", cmd)
1129
1183
  self.start(cmd, "dataset_augmentation")
1130
1184
 
@@ -1140,7 +1194,8 @@ def augmentation_server_creation_execute(
1140
1194
  if not action_details:
1141
1195
  return
1142
1196
  self.setup_action_requirements(action_details, work_fs)
1143
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1197
+ container_name = f"augmentation_setup_{self.action_record_id}"
1198
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1144
1199
  logging.info("cmd: %s", cmd)
1145
1200
  self.start(cmd, "augmentation_setup")
1146
1201
 
@@ -1161,32 +1216,34 @@ def database_setup_execute(self: ActionInstance):
1161
1216
 
1162
1217
  project_id = action_details["_idProject"]
1163
1218
 
1219
+ # Define container names with action_record_id for uniqueness
1220
+ mongodb_container_name = f"database_setup_{self.action_record_id}"
1221
+ qdrant_container_name = f"qdrant_{self.action_record_id}"
1222
+
1164
1223
  if action_details["actionDetails"].get("containerId"):
1165
1224
  logging.info(
1166
- "Using existing container ID for inference tracker: %s",
1225
+ "Using existing container ID for database setup: %s",
1167
1226
  action_details["actionDetails"]["containerId"],
1168
1227
  )
1169
1228
  self.docker_container = action_details["actionDetails"]["containerId"]
1170
1229
  cmd = "docker restart " + self.docker_container
1171
- self.start(cmd, "qdrant_setup")
1230
+ self.start(cmd, "database_setup")
1172
1231
 
1173
- #qdrant restart
1174
- qdrant_cmd = "docker restart qdrant"
1175
- self.start(qdrant_cmd, 'qdrant_setup')
1232
+ # qdrant restart
1233
+ qdrant_cmd = f"docker restart {qdrant_container_name}"
1234
+ self.start(qdrant_cmd, "qdrant_setup")
1176
1235
 
1177
1236
  return
1178
-
1179
-
1180
- dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
1181
1237
 
1238
+ dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
1182
1239
 
1183
1240
  # MongoDB container with --net=host (Port: 27020:27017)
1184
1241
  cmd = (
1185
- f"docker run --pull=always --net=host "
1242
+ f"docker run -d --pull=always --net=host "
1243
+ f"--name {mongodb_container_name} "
1244
+ f"-v matrice_myvol:/matrice_data "
1186
1245
  f"-v {dbPath}:{dbPath} "
1187
- f"--name database_setup_{self.action_record_id} "
1188
1246
  f"-v /var/run/docker.sock:/var/run/docker.sock "
1189
- f"--cidfile ./{self.action_record_id}.cid "
1190
1247
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1191
1248
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1192
1249
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1196,6 +1253,23 @@ def database_setup_execute(self: ActionInstance):
1196
1253
  )
1197
1254
  logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
1198
1255
 
1256
+ # Qdrant container with --net=host (Port: 6334)
1257
+ qdrant_cmd = (
1258
+ f"docker run -d --pull=always --net=host "
1259
+ f"--name {qdrant_container_name} "
1260
+ f"-v matrice_myvol:/matrice_data "
1261
+ f"qdrant/qdrant:latest "
1262
+ )
1263
+ logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
1264
+
1265
+ # Start Qdrant container
1266
+ qdrant_process = subprocess.Popen(
1267
+ qdrant_cmd,
1268
+ shell=True,
1269
+ stdout=subprocess.PIPE,
1270
+ stderr=subprocess.PIPE,
1271
+ )
1272
+ logging.info("Qdrant container started successfully")
1199
1273
 
1200
1274
  # Docker Command run
1201
1275
  self.start(cmd, "database_setup")
@@ -1215,6 +1289,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
1215
1289
 
1216
1290
  self.setup_action_requirements(action_details)
1217
1291
 
1292
+ container_name = f"facial_recognition_{self.action_record_id}"
1293
+
1218
1294
  if action_details["actionDetails"].get("containerId"):
1219
1295
  logging.info(
1220
1296
  "Using existing container ID for facial recognition worker: %s",
@@ -1228,15 +1304,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
1228
1304
  # Facial recognition worker container with --net=host (Port: 8081)
1229
1305
  worker_cmd = (
1230
1306
  f"docker run -d --pull=always --net=host "
1231
- f"--name worker "
1232
- f"--cidfile ./{self.action_record_id}.cid "
1307
+ f"--name {container_name} "
1233
1308
  f"-v matrice_myvol:/matrice_data "
1234
- f"--cidfile ./{self.action_record_id}.cid "
1235
1309
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1236
1310
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1237
1311
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1238
1312
  f'-e ACTION_ID="{self.action_record_id}" '
1239
- f' --restart=unless-stopped '
1313
+ f'--restart=unless-stopped '
1240
1314
  f"{image}"
1241
1315
  )
1242
1316
  logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1258,6 +1332,8 @@ def lpr_setup_execute(self: ActionInstance):
1258
1332
 
1259
1333
  self.setup_action_requirements(action_details)
1260
1334
 
1335
+ container_name = f"lpr_{self.action_record_id}"
1336
+
1261
1337
  if action_details["actionDetails"].get("containerId"):
1262
1338
  logging.info(
1263
1339
  "Using existing container ID for LPR worker: %s",
@@ -1271,15 +1347,14 @@ def lpr_setup_execute(self: ActionInstance):
1271
1347
  # LPR worker container with --net=host (Port: 8082)
1272
1348
  worker_cmd = (
1273
1349
  f"docker run -d --net=host --pull=always "
1274
- f"--name lpr-worker "
1275
- f"--cidfile ./{self.action_record_id}.cid "
1350
+ f"--name {container_name} "
1276
1351
  f"-v matrice_myvol:/matrice_data "
1277
1352
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1278
1353
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1279
1354
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1280
1355
  f'-e ACTION_ID="{self.action_record_id}" '
1281
1356
  f'-e PORT=8082 '
1282
- f' --restart=unless-stopped '
1357
+ f'--restart=unless-stopped '
1283
1358
  f"{image}"
1284
1359
  )
1285
1360
  logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1310,6 +1385,8 @@ def inference_ws_server_execute(self: ActionInstance):
1310
1385
 
1311
1386
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1312
1387
 
1388
+ container_name = f"inference_ws_{self.action_record_id}"
1389
+
1313
1390
  if action_details["actionDetails"].get("containerId"):
1314
1391
  logging.info(
1315
1392
  "Using existing container ID for inference WebSocket server: %s",
@@ -1323,12 +1400,11 @@ def inference_ws_server_execute(self: ActionInstance):
1323
1400
  # Inference WebSocket server with --net=host (Port: 8102)
1324
1401
  worker_cmd = (
1325
1402
  f"docker run -d --pull=always --net=host "
1326
- f"--name inference "
1327
- f"--cidfile ./{self.action_record_id}.cid "
1403
+ f"--name {container_name} "
1328
1404
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1329
1405
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1330
1406
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1331
- f' --restart=unless-stopped '
1407
+ f'--restart=unless-stopped '
1332
1408
  f"{image} "
1333
1409
  f"./app "
1334
1410
  f"{self.action_record_id} "
@@ -1359,6 +1435,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
1359
1435
 
1360
1436
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1361
1437
 
1438
+ container_name = f"fe_streaming_{self.action_record_id}"
1439
+
1362
1440
  if action_details["actionDetails"].get("containerId"):
1363
1441
  logging.info(
1364
1442
  "Using existing container ID for frontend streaming: %s",
@@ -1372,15 +1450,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
1372
1450
  # Frontend streaming with --net=host (Port: 3000)
1373
1451
  worker_cmd = (
1374
1452
  f"docker run -d --pull=always --net=host "
1375
- f"--name fe_streaming "
1376
- f"--cidfile ./{self.action_record_id}.cid "
1453
+ f"--name {container_name} "
1377
1454
  f"-v matrice_myvol:/matrice_data "
1378
1455
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1379
1456
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1380
1457
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1381
1458
  f"-e PORT=3000 "
1382
1459
  f'-e WS_HOST="{ws_url}" '
1383
- f' --restart=unless-stopped '
1460
+ f'--restart=unless-stopped '
1384
1461
  f"{image}"
1385
1462
  )
1386
1463
  logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1405,6 +1482,8 @@ def fe_analytics_service_execute(self: ActionInstance):
1405
1482
 
1406
1483
  project_id = action_details["_idProject"]
1407
1484
 
1485
+ container_name = f"fe_analytics_{self.action_record_id}"
1486
+
1408
1487
  if action_details["actionDetails"].get("containerId"):
1409
1488
  logging.info(
1410
1489
  "Using existing container ID for frontend analytics service: %s",
@@ -1418,15 +1497,14 @@ def fe_analytics_service_execute(self: ActionInstance):
1418
1497
  # Frontend analytics service with --net=host (Port: 3001)
1419
1498
  worker_cmd = (
1420
1499
  f"docker run -d --pull=always --net=host "
1421
- f"--name fe-analytics "
1422
- f"--cidfile ./{self.action_record_id}.cid "
1500
+ f"--name {container_name} "
1423
1501
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1424
1502
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1425
1503
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1426
1504
  f'-e ACTION_ID="{self.action_record_id}" '
1427
1505
  f"-e PORT=3001 "
1428
1506
  f'-e PROJECT_ID="{project_id}" '
1429
- f' --restart=unless-stopped '
1507
+ f'--restart=unless-stopped '
1430
1508
  f"{image}"
1431
1509
  )
1432
1510
  logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1451,7 +1529,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1451
1529
  else:
1452
1530
  return
1453
1531
  use_gpu = self.get_gpu_config(action_details)
1454
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1532
+ container_name = f"dataset_generation_{self.action_record_id}"
1533
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1455
1534
  logging.info("cmd is: %s", cmd)
1456
1535
  self.start(cmd, "dataset_generation")
1457
1536
 
@@ -1472,7 +1551,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1472
1551
  else:
1473
1552
  return
1474
1553
  use_gpu = self.get_gpu_config(action_details)
1475
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1554
+ container_name = f"synthetic_data_setup_{self.action_record_id}"
1555
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1476
1556
  logging.info("cmd is: %s", cmd)
1477
1557
  self.start(cmd, "synthetic_data_setup")
1478
1558
 
@@ -1509,6 +1589,8 @@ def redis_setup_execute(self: ActionInstance):
1509
1589
 
1510
1590
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1511
1591
 
1592
+ # Define container names with action_record_id for uniqueness
1593
+ redis_container_name = f"redis_{self.action_record_id}"
1512
1594
 
1513
1595
  if action_details["actionDetails"].get("containerId"):
1514
1596
  logging.info(
@@ -1520,18 +1602,34 @@ def redis_setup_execute(self: ActionInstance):
1520
1602
  self.start(cmd, "redis_setup")
1521
1603
 
1522
1604
  # Redis container restart
1523
- redis_restart_cmd = "docker restart redis_container"
1605
+ redis_restart_cmd = f"docker restart {redis_container_name}"
1524
1606
  self.start(redis_restart_cmd, "redis")
1525
1607
 
1526
1608
  return
1527
1609
 
1528
- # Redis container with --net=host (Port: 6379)
1610
+ # Redis container with --net=host (Port: 6379) with optimized configuration
1529
1611
  redis_cmd = (
1530
1612
  f"docker run -d --net=host "
1531
- f"--name redis_container "
1613
+ f"--name {redis_container_name} "
1532
1614
  f"--restart unless-stopped "
1533
1615
  f"{redis_image} "
1534
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1616
+ f"redis-server --bind 0.0.0.0 "
1617
+ f"--appendonly no "
1618
+ f'--save "" '
1619
+ f"--maxmemory 30gb "
1620
+ f"--maxmemory-policy allkeys-lru "
1621
+ f"--io-threads 4 "
1622
+ f"--io-threads-do-reads yes "
1623
+ f"--stream-node-max-bytes 8192 "
1624
+ f"--stream-node-max-entries 1000 "
1625
+ f"--hz 100 "
1626
+ f"--tcp-backlog 2048 "
1627
+ f"--timeout 0 "
1628
+ f"--lazyfree-lazy-eviction yes "
1629
+ f"--lazyfree-lazy-expire yes "
1630
+ f"--lazyfree-lazy-server-del yes "
1631
+ f"--activedefrag yes "
1632
+ f"--requirepass {redis_password}"
1535
1633
  )
1536
1634
 
1537
1635
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
@@ -1555,8 +1653,9 @@ def redis_setup_execute(self: ActionInstance):
1555
1653
 
1556
1654
  # bg-redis management container with --net=host (Port: 8082)
1557
1655
  cmd = (
1558
- f"docker run --net=host "
1559
- f"--cidfile ./{self.action_record_id}.cid "
1656
+ f"docker run -d --net=host "
1657
+ f"--restart unless-stopped "
1658
+ f"--name bg-redis_{self.action_record_id} "
1560
1659
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1561
1660
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1562
1661
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1583,7 +1682,8 @@ def deploy_aggregator_execute(
1583
1682
  if not action_details:
1584
1683
  return
1585
1684
  self.setup_action_requirements(action_details, work_fs)
1586
- cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1685
+ container_name = f"deploy_aggregator_{self.action_record_id}"
1686
+ cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1587
1687
  logging.info("cmd: %s", cmd)
1588
1688
  self.start(cmd, "deploy_aggregator")
1589
1689
 
@@ -1599,6 +1699,10 @@ def model_deploy_execute(self: ActionInstance):
1599
1699
  return
1600
1700
  action_id = action_details["_id"]
1601
1701
  model_family = action_details["actionDetails"]["modelFamily"]
1702
+
1703
+ # Get the service ID to track deployments
1704
+ service_id = action_details.get("_idService")
1705
+
1602
1706
  self.setup_action_requirements(
1603
1707
  action_details,
1604
1708
  work_fs,
@@ -1606,17 +1710,29 @@ def model_deploy_execute(self: ActionInstance):
1606
1710
  action_id=action_id,
1607
1711
  )
1608
1712
 
1609
- # Get GPU configuration based on requirements and availability
1610
- # This uses the best-fit algorithm to select the most appropriate GPU(s)
1611
- use_gpu = self.get_gpu_config(action_details)
1612
-
1613
- # Override: If GPU is required, use all available GPUs
1713
+ # Use all GPUs if GPU is required
1614
1714
  gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
1615
1715
  if gpuRequired:
1616
1716
  use_gpu = "--runtime=nvidia --gpus all"
1717
+ else:
1718
+ use_gpu = ""
1719
+
1720
+ logging.info(
1721
+ "Action %s: Model deployment GPU config: %s",
1722
+ action_id,
1723
+ use_gpu if use_gpu else "CPU-only"
1724
+ )
1725
+
1726
+ # Get or create TRITON_PORTS (uses utility method)
1727
+ triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1617
1728
 
1618
- extra_env_vars = {"INTERNAL_PORT": internal_port}
1619
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1729
+ extra_env_vars = {
1730
+ "INTERNAL_PORT": internal_port,
1731
+ "TRITON_PORTS": triton_ports
1732
+ }
1733
+
1734
+ container_name = f"model_deploy_{self.action_record_id}"
1735
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1620
1736
  logging.info("cmd is: %s", cmd)
1621
1737
  self.start(cmd, "deploy_log")
1622
1738
 
@@ -1649,7 +1765,8 @@ def model_train_execute(self: ActionInstance):
1649
1765
  self.start(cmd, "train_log")
1650
1766
  return
1651
1767
 
1652
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1768
+ container_name = f"model_train_{self.action_record_id}"
1769
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1653
1770
  logging.info("cmd is: %s", cmd)
1654
1771
  self.start(cmd, "train_log")
1655
1772
 
@@ -1672,7 +1789,7 @@ def model_eval_execute(self: ActionInstance):
1672
1789
  )
1673
1790
  if action_details["actionDetails"].get("containerId"):
1674
1791
  logging.info(
1675
- "Using existing container ID for training: %s",
1792
+ "Using existing container ID for evaluation: %s",
1676
1793
  action_details["actionDetails"]["containerId"],
1677
1794
  )
1678
1795
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1680,7 +1797,8 @@ def model_eval_execute(self: ActionInstance):
1680
1797
  self.start(cmd, "eval_log")
1681
1798
  return
1682
1799
 
1683
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1800
+ container_name = f"model_eval_{self.action_record_id}"
1801
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1684
1802
  logging.info("cmd is: %s", cmd)
1685
1803
  self.start(cmd, "eval_log")
1686
1804
 
@@ -1706,7 +1824,7 @@ def model_export_execute(self: ActionInstance):
1706
1824
  )
1707
1825
  if action_details["actionDetails"].get("containerId"):
1708
1826
  logging.info(
1709
- "Using existing container ID for training: %s",
1827
+ "Using existing container ID for export: %s",
1710
1828
  action_details["actionDetails"]["containerId"],
1711
1829
  )
1712
1830
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1714,7 +1832,8 @@ def model_export_execute(self: ActionInstance):
1714
1832
  self.start(cmd, "export_log")
1715
1833
  return
1716
1834
 
1717
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1835
+ container_name = f"model_export_{self.action_record_id}"
1836
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1718
1837
  logging.info("cmd is: %s", cmd)
1719
1838
  self.start(cmd, "export_log")
1720
1839
 
@@ -1730,7 +1849,8 @@ def image_build_execute(self: ActionInstance):
1730
1849
  action_id = action_details["_id"]
1731
1850
  internal_api_key = self.get_internal_api_key(action_id)
1732
1851
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1733
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1852
+ container_name = f"image_build_{self.action_record_id}"
1853
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1734
1854
  logging.info("cmd is: %s", cmd)
1735
1855
  self.start(cmd, "image_build_log")
1736
1856
 
@@ -1742,7 +1862,8 @@ def resource_clone_execute(self: ActionInstance):
1742
1862
  if not action_details:
1743
1863
  return
1744
1864
  self.setup_action_requirements(action_details)
1745
- cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1865
+ container_name = f"resource_clone_{self.action_record_id}"
1866
+ cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1746
1867
  logging.info("cmd is: %s", cmd)
1747
1868
  self.start(cmd, "resource_clone")
1748
1869
 
@@ -1760,7 +1881,7 @@ def streaming_gateway_execute(self: ActionInstance):
1760
1881
  )
1761
1882
  if action_details["actionDetails"].get("containerId"):
1762
1883
  logging.info(
1763
- "Using existing container ID for training: %s",
1884
+ "Using existing container ID for streaming gateway: %s",
1764
1885
  action_details["actionDetails"]["containerId"],
1765
1886
  )
1766
1887
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1768,7 +1889,8 @@ def streaming_gateway_execute(self: ActionInstance):
1768
1889
  self.start(cmd, "streaming_gateway")
1769
1890
  return
1770
1891
 
1771
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1892
+ container_name = f"streaming_gateway_{self.action_record_id}"
1893
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1772
1894
  logging.info("cmd is: %s", cmd)
1773
1895
  self.start(cmd, "streaming_gateway")
1774
1896
 
@@ -1864,7 +1986,7 @@ def kafka_setup_execute(self: ActionInstance):
1864
1986
 
1865
1987
  if action_details["actionDetails"].get("containerId"):
1866
1988
  logging.info(
1867
- "Using existing container ID for training: %s",
1989
+ "Using existing container ID for kafka: %s",
1868
1990
  action_details["actionDetails"]["containerId"],
1869
1991
  )
1870
1992
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1872,10 +1994,12 @@ def kafka_setup_execute(self: ActionInstance):
1872
1994
  self.start(cmd, "kafka_setup")
1873
1995
  return
1874
1996
 
1997
+ container_name = f"kafka_{self.action_record_id}"
1875
1998
 
1876
1999
  # Kafka container with --net=host (Ports: 9092, 9093)
1877
2000
  cmd = (
1878
- f"docker run --net=host "
2001
+ f"docker run -d --net=host "
2002
+ f"--name {container_name} "
1879
2003
  f"{env_args} "
1880
2004
  f"--shm-size=30G --pull=always "
1881
2005
  f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -1908,6 +2032,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
1908
2032
 
1909
2033
  self.setup_action_requirements(action_details)
1910
2034
 
2035
+ container_name = f"inference_tracker_{self.action_record_id}"
2036
+
1911
2037
  if action_details["actionDetails"].get("containerId"):
1912
2038
  logging.info(
1913
2039
  "Using existing container ID for inference tracker: %s",
@@ -1921,14 +2047,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
1921
2047
  # This is the existing Docker run command
1922
2048
  worker_cmd = (
1923
2049
  f"docker run -d --pull=always --net=host "
1924
- f"--cidfile ./{self.action_record_id}.cid "
1925
- f"--name inference-tracker-worker "
2050
+ f"--name {container_name} "
1926
2051
  f"-v matrice_myvol:/matrice_data "
1927
2052
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1928
2053
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1929
2054
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1930
2055
  f'-e ACTION_ID="{self.action_record_id}" '
1931
- f' --restart=unless-stopped '
2056
+ f'--restart=unless-stopped '
1932
2057
  f"{image}"
1933
2058
  )
1934
2059
 
@@ -1950,9 +2075,11 @@ def video_storage_setup_execute(self: ActionInstance):
1950
2075
 
1951
2076
  self.setup_action_requirements(action_details)
1952
2077
 
2078
+ container_name = f"video_storage_{self.action_record_id}"
2079
+
1953
2080
  if action_details["actionDetails"].get("containerId"):
1954
2081
  logging.info(
1955
- "Using existing container ID for inference tracker: %s",
2082
+ "Using existing container ID for video storage: %s",
1956
2083
  action_details["actionDetails"]["containerId"],
1957
2084
  )
1958
2085
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1963,14 +2090,13 @@ def video_storage_setup_execute(self: ActionInstance):
1963
2090
  # This is the existing Docker run command
1964
2091
  worker_cmd = (
1965
2092
  f"docker run -d --pull=always --net=host "
1966
- f"--cidfile ./{self.action_record_id}.cid "
1967
- f"--name media_server "
2093
+ f"--name {container_name} "
1968
2094
  f"-v matrice_myvol:/matrice_data "
1969
2095
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1970
2096
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1971
2097
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1972
2098
  f'-e ACTION_ID="{self.action_record_id}" '
1973
- f' --restart=unless-stopped '
2099
+ f'--restart=unless-stopped '
1974
2100
  f"{image}"
1975
2101
  )
1976
2102
 
@@ -916,14 +916,27 @@ class ResourcesTracker:
916
916
  gpu_count = 0
917
917
 
918
918
  for gpu in gpu_data['gpus']:
919
- gpu_memory_free += gpu['memory_total'] - gpu['memory_used']
919
+ # Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
920
+ # which would otherwise produce negative "free" memory.
921
+ total_mb = gpu.get('memory_total', 0) or 0
922
+ used_mb = gpu.get('memory_used', 0) or 0
923
+ free_mb = total_mb - used_mb
924
+ if free_mb < 0:
925
+ logging.debug(
926
+ "Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
927
+ gpu.get('idx'),
928
+ total_mb,
929
+ used_mb,
930
+ )
931
+ free_mb = 0
932
+ gpu_memory_free += free_mb
920
933
  gpu_utilization += gpu['utilization']
921
934
  gpu_count += 1
922
935
 
923
936
  if gpu_count > 0:
924
937
  gpu_utilization /= gpu_count
925
-
926
- return gpu_memory_free, gpu_utilization
938
+
939
+ return max(0, gpu_memory_free), gpu_utilization
927
940
 
928
941
  @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
929
942
  def _get_gpu_resources_direct(self) -> Tuple[int, float]:
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
1218
1231
  availableCPU=available_cpu,
1219
1232
  availableMemory=available_memory,
1220
1233
  availableGPU=100 - gpu_utilization,
1221
- availableGPUMemory=gpu_memory_free,
1234
+ availableGPUMemory=max(0, gpu_memory_free),
1222
1235
  )
1223
1236
  if err is not None:
1224
1237
  logging.error(