matrice-compute 0.1.37__tar.gz → 0.1.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/action_instance.py +240 -113
  4. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/resources_tracker.py +17 -4
  5. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/LICENSE.txt +0 -0
  6. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/README.md +0 -0
  7. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/SOURCES.txt +0 -0
  8. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/dependency_links.txt +0 -0
  9. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/not-zip-safe +0 -0
  10. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/top_level.txt +0 -0
  11. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/pyproject.toml +0 -0
  12. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/setup.cfg +0 -0
  13. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/setup.py +0 -0
  14. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/__init__.py +0 -0
  15. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/actions_manager.py +0 -0
  16. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  17. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/compute_operations_handler.py +0 -0
  18. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/instance_manager.py +0 -0
  19. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/instance_utils.py +0 -0
  20. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/prechecks.py +0 -0
  21. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/py.typed +0 -0
  22. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.37
3
+ Version: 0.1.38
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.37
3
+ Version: 0.1.38
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
26
26
  class ActionInstance:
27
27
  """Base class for tasks that run in Action containers."""
28
28
 
29
+ # Class-level dictionary to track deployed services and their ports
30
+ # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
31
+ _deployed_services = {}
32
+
29
33
  def __init__(self, scaling: Scaling, action_info: dict):
30
34
  """Initialize an action instance.
31
35
 
@@ -85,6 +89,52 @@ class ActionInstance:
85
89
  raise ValueError(f"Unknown action type: {self.action_type}")
86
90
  self.task = self.actions_map[self.action_type]
87
91
 
92
+ @classmethod
93
+ def get_or_create_triton_ports(cls, service_id, scaling_instance):
94
+ """Get existing TRITON_PORTS for a service or create new ones.
95
+
96
+ Args:
97
+ service_id (str): Service ID (_idService)
98
+ scaling_instance: Scaling instance to get open ports
99
+
100
+ Returns:
101
+ str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
102
+ """
103
+ if not service_id:
104
+ # No service_id, generate new ports
105
+ port1 = scaling_instance.get_open_port()
106
+ port2 = scaling_instance.get_open_port()
107
+ port3 = scaling_instance.get_open_port()
108
+ return f"{port1},{port2},{port3}"
109
+
110
+ # Check if ports already exist for this service
111
+ if service_id in cls._deployed_services:
112
+ triton_ports = cls._deployed_services[service_id]["triton_ports"]
113
+ logging.info(
114
+ "Reusing TRITON_PORTS for service %s: %s",
115
+ service_id,
116
+ triton_ports
117
+ )
118
+ return triton_ports
119
+
120
+ # First deployment: generate new ports and store them
121
+ port1 = scaling_instance.get_open_port()
122
+ port2 = scaling_instance.get_open_port()
123
+ port3 = scaling_instance.get_open_port()
124
+ triton_ports = f"{port1},{port2},{port3}"
125
+
126
+ # Store for future use
127
+ cls._deployed_services[service_id] = {
128
+ "triton_ports": triton_ports,
129
+ }
130
+
131
+ logging.info(
132
+ "First deployment for service %s - generated TRITON_PORTS: %s",
133
+ service_id,
134
+ triton_ports
135
+ )
136
+ return triton_ports
137
+
88
138
  @log_errors(default_return={}, raise_exception=True, log_error=False)
89
139
  def _init_credentials(self):
90
140
  """Initialize Matrice credentials.
@@ -346,6 +396,7 @@ class ActionInstance:
346
396
  destination_workspace_path: str = "/usr/src/workspace",
347
397
  docker_workdir: str = "",
348
398
  extra_pkgs: list = [],
399
+ container_name: str = "",
349
400
  ):
350
401
  """Build base Docker command with common options.
351
402
 
@@ -360,6 +411,7 @@ class ActionInstance:
360
411
  destination_workspace_path (str): Container workspace path
361
412
  docker_workdir (str): Docker working directory
362
413
  extra_pkgs (list): List of extra packages to install
414
+ container_name (str): Docker container name (format: {action_type}_{action_id})
363
415
  Returns:
364
416
  str: Base Docker command
365
417
  """
@@ -426,17 +478,20 @@ class ActionInstance:
426
478
 
427
479
  # if the service provider is local, then put --restart unless-stopped
428
480
  if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
481
+ env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
429
482
  use_restart_policy = "--restart unless-stopped"
430
483
  else:
431
484
  use_restart_policy = ""
432
485
 
486
+ # Build container name option if provided
487
+ name_option = f"--name {container_name}" if container_name else ""
488
+
433
489
  cmd_parts = [
434
- f"docker run {use_gpu} {use_restart_policy} ",
490
+ f"docker run -d {use_gpu} {use_restart_policy} ",
435
491
  network_config,
436
492
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
437
493
  *volumes,
438
494
  # Container configuration and startup commands
439
- f"--cidfile ./{self.action_record_id}.cid ",
440
495
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
441
496
  f'/bin/bash -c "cd {docker_workdir} && '
442
497
  f"{env_exports} && "
@@ -838,55 +893,50 @@ class ActionInstance:
838
893
  self.cmd = cmd
839
894
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
840
895
 
841
- with open(self.log_path, "wb") as out:
842
- self.process = subprocess.Popen(
843
- shlex.split(self.cmd),
844
- stdout=out,
845
- stderr=out,
846
- env={**os.environ},
847
- start_new_session=True,
848
- )
896
+ # Run docker with -d flag to get container ID from stdout
897
+ process = subprocess.Popen(
898
+ shlex.split(self.cmd),
899
+ stdout=subprocess.PIPE,
900
+ stderr=subprocess.PIPE,
901
+ text=True,
902
+ env={**os.environ},
903
+ )
849
904
 
850
- self.container_id = None
851
-
852
- cid_file_path = f"./{self.action_record_id}.cid"
853
- max_retries = 5
854
- retry_delay = 1 # seconds
855
- for attempt in range(max_retries):
856
- try:
857
- with open(cid_file_path, "r") as cid_file:
858
- container_id = cid_file.read().strip()
859
- self.container_id = container_id
860
- logging.info(
861
- "Started process for action %s with container ID: %s",
862
- self.action_record_id,
863
- self.container_id,
864
- )
865
- break
866
- except FileNotFoundError:
867
- logging.warning(
868
- "CID file not found for action %s, attempt %d/%d",
869
- self.action_record_id,
870
- attempt + 1,
871
- max_retries,
872
- )
873
- time.sleep(retry_delay)
874
- except Exception as e:
875
- logging.error(
876
- "Error reading CID file for action %s: %s",
877
- self.action_record_id,
878
- str(e),
879
- )
880
- time.sleep(retry_delay)
881
- else:
905
+ # Use a longer timeout for docker run since --pull=always may need to
906
+ # download large images on first run. Default: 30 minutes (1800 seconds)
907
+ # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
908
+ docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
909
+ logging.info(
910
+ "Waiting for docker container to start for action %s (timeout: %d seconds)",
911
+ self.action_record_id,
912
+ docker_start_timeout,
913
+ )
914
+ stdout, stderr = process.communicate(timeout=docker_start_timeout)
915
+
916
+ if process.returncode != 0:
882
917
  logging.error(
883
- "Failed to read CID file for action %s after %d attempts",
918
+ "Docker run failed for action %s: %s",
884
919
  self.action_record_id,
885
- max_retries,
920
+ stderr,
886
921
  )
887
- raise Exception("Failed to start process: CID file not found")
922
+ raise RuntimeError(f"Docker run failed: {stderr}")
888
923
 
889
- # report container id to scaling service
924
+ self.container_id = stdout.strip()
925
+ logging.info(
926
+ "Started container for action %s with ID: %s",
927
+ self.action_record_id,
928
+ self.container_id,
929
+ )
930
+
931
+ # Start following container logs in background
932
+ self.process = subprocess.Popen(
933
+ ["docker", "logs", "-f", self.container_id],
934
+ stdout=open(self.log_path, "wb"),
935
+ stderr=subprocess.STDOUT,
936
+ start_new_session=True,
937
+ )
938
+
939
+ # Report container id to scaling service
890
940
  self.scaling.update_action_container_id(
891
941
  action_record_id=self.action_record_id,
892
942
  container_id=self.container_id,
@@ -1052,7 +1102,8 @@ def data_preparation_execute(
1052
1102
  "Started pulling Docker image with PID: %s",
1053
1103
  process.pid,
1054
1104
  )
1055
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1105
+ container_name = f"data_prep_{self.action_record_id}"
1106
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1056
1107
  logging.info("cmd is: %s", cmd)
1057
1108
  self.start(cmd, "data_preparation_log")
1058
1109
 
@@ -1081,7 +1132,8 @@ def data_processing_execute(self: ActionInstance):
1081
1132
  service="bg-job-scheduler",
1082
1133
  job_params=action["jobParams"],
1083
1134
  )
1084
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1135
+ container_name = f"data_processing_{self.action_record_id}"
1136
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1085
1137
  logging.info("cmd: %s", cmd)
1086
1138
  self.start(cmd, "data_processing_log")
1087
1139
 
@@ -1094,7 +1146,8 @@ def data_split_execute(self: ActionInstance):
1094
1146
  if not action_details:
1095
1147
  return
1096
1148
  self.setup_action_requirements(action_details, work_fs, model_family="")
1097
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1149
+ container_name = f"data_split_{self.action_record_id}"
1150
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1098
1151
  logging.info("cmd: %s", cmd)
1099
1152
  self.start(cmd, "data_split")
1100
1153
 
@@ -1109,7 +1162,8 @@ def dataset_annotation_execute(
1109
1162
  if not action_details:
1110
1163
  return
1111
1164
  self.setup_action_requirements(action_details, work_fs)
1112
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1165
+ container_name = f"dataset_annotation_{self.action_record_id}"
1166
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1113
1167
  logging.info("cmd: %s", cmd)
1114
1168
  self.start(cmd, "dataset_annotation")
1115
1169
 
@@ -1124,7 +1178,8 @@ def dataset_augmentation_execute(
1124
1178
  if not action_details:
1125
1179
  return
1126
1180
  self.setup_action_requirements(action_details, work_fs)
1127
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1181
+ container_name = f"dataset_augmentation_{self.action_record_id}"
1182
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1128
1183
  logging.info("cmd: %s", cmd)
1129
1184
  self.start(cmd, "dataset_augmentation")
1130
1185
 
@@ -1140,7 +1195,8 @@ def augmentation_server_creation_execute(
1140
1195
  if not action_details:
1141
1196
  return
1142
1197
  self.setup_action_requirements(action_details, work_fs)
1143
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1198
+ container_name = f"augmentation_setup_{self.action_record_id}"
1199
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1144
1200
  logging.info("cmd: %s", cmd)
1145
1201
  self.start(cmd, "augmentation_setup")
1146
1202
 
@@ -1161,32 +1217,34 @@ def database_setup_execute(self: ActionInstance):
1161
1217
 
1162
1218
  project_id = action_details["_idProject"]
1163
1219
 
1220
+ # Define container names with action_record_id for uniqueness
1221
+ mongodb_container_name = f"database_setup_{self.action_record_id}"
1222
+ qdrant_container_name = f"qdrant_{self.action_record_id}"
1223
+
1164
1224
  if action_details["actionDetails"].get("containerId"):
1165
1225
  logging.info(
1166
- "Using existing container ID for inference tracker: %s",
1226
+ "Using existing container ID for database setup: %s",
1167
1227
  action_details["actionDetails"]["containerId"],
1168
1228
  )
1169
1229
  self.docker_container = action_details["actionDetails"]["containerId"]
1170
1230
  cmd = "docker restart " + self.docker_container
1171
- self.start(cmd, "qdrant_setup")
1231
+ self.start(cmd, "database_setup")
1172
1232
 
1173
- #qdrant restart
1174
- qdrant_cmd = "docker restart qdrant"
1175
- self.start(qdrant_cmd, 'qdrant_setup')
1233
+ # qdrant restart
1234
+ qdrant_cmd = f"docker restart {qdrant_container_name}"
1235
+ self.start(qdrant_cmd, "qdrant_setup")
1176
1236
 
1177
1237
  return
1178
-
1179
-
1180
- dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
1181
1238
 
1239
+ dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
1182
1240
 
1183
1241
  # MongoDB container with --net=host (Port: 27020:27017)
1184
1242
  cmd = (
1185
- f"docker run --pull=always --net=host "
1243
+ f"docker run -d --pull=always --net=host "
1244
+ f"--name {mongodb_container_name} "
1245
+ f"-v matrice_myvol:/matrice_data "
1186
1246
  f"-v {dbPath}:{dbPath} "
1187
- f"--name database_setup_{self.action_record_id} "
1188
1247
  f"-v /var/run/docker.sock:/var/run/docker.sock "
1189
- f"--cidfile ./{self.action_record_id}.cid "
1190
1248
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1191
1249
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1192
1250
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1196,6 +1254,23 @@ def database_setup_execute(self: ActionInstance):
1196
1254
  )
1197
1255
  logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
1198
1256
 
1257
+ # Qdrant container with --net=host (Port: 6334)
1258
+ qdrant_cmd = (
1259
+ f"docker run -d --pull=always --net=host "
1260
+ f"--name {qdrant_container_name} "
1261
+ f"-v matrice_myvol:/matrice_data "
1262
+ f"qdrant/qdrant:latest "
1263
+ )
1264
+ logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
1265
+
1266
+ # Start Qdrant container
1267
+ qdrant_process = subprocess.Popen(
1268
+ qdrant_cmd,
1269
+ shell=True,
1270
+ stdout=subprocess.PIPE,
1271
+ stderr=subprocess.PIPE,
1272
+ )
1273
+ logging.info("Qdrant container started successfully")
1199
1274
 
1200
1275
  # Docker Command run
1201
1276
  self.start(cmd, "database_setup")
@@ -1215,6 +1290,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
1215
1290
 
1216
1291
  self.setup_action_requirements(action_details)
1217
1292
 
1293
+ container_name = f"facial_recognition_{self.action_record_id}"
1294
+
1218
1295
  if action_details["actionDetails"].get("containerId"):
1219
1296
  logging.info(
1220
1297
  "Using existing container ID for facial recognition worker: %s",
@@ -1228,15 +1305,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
1228
1305
  # Facial recognition worker container with --net=host (Port: 8081)
1229
1306
  worker_cmd = (
1230
1307
  f"docker run -d --pull=always --net=host "
1231
- f"--name worker "
1232
- f"--cidfile ./{self.action_record_id}.cid "
1308
+ f"--name {container_name} "
1233
1309
  f"-v matrice_myvol:/matrice_data "
1234
- f"--cidfile ./{self.action_record_id}.cid "
1235
1310
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1236
1311
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1237
1312
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1238
1313
  f'-e ACTION_ID="{self.action_record_id}" '
1239
- f' --restart=unless-stopped '
1314
+ f'--restart=unless-stopped '
1240
1315
  f"{image}"
1241
1316
  )
1242
1317
  logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1258,6 +1333,8 @@ def lpr_setup_execute(self: ActionInstance):
1258
1333
 
1259
1334
  self.setup_action_requirements(action_details)
1260
1335
 
1336
+ container_name = f"lpr_{self.action_record_id}"
1337
+
1261
1338
  if action_details["actionDetails"].get("containerId"):
1262
1339
  logging.info(
1263
1340
  "Using existing container ID for LPR worker: %s",
@@ -1271,15 +1348,14 @@ def lpr_setup_execute(self: ActionInstance):
1271
1348
  # LPR worker container with --net=host (Port: 8082)
1272
1349
  worker_cmd = (
1273
1350
  f"docker run -d --net=host --pull=always "
1274
- f"--name lpr-worker "
1275
- f"--cidfile ./{self.action_record_id}.cid "
1351
+ f"--name {container_name} "
1276
1352
  f"-v matrice_myvol:/matrice_data "
1277
1353
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1278
1354
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1279
1355
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1280
1356
  f'-e ACTION_ID="{self.action_record_id}" '
1281
1357
  f'-e PORT=8082 '
1282
- f' --restart=unless-stopped '
1358
+ f'--restart=unless-stopped '
1283
1359
  f"{image}"
1284
1360
  )
1285
1361
  logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1310,6 +1386,8 @@ def inference_ws_server_execute(self: ActionInstance):
1310
1386
 
1311
1387
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1312
1388
 
1389
+ container_name = f"inference_ws_{self.action_record_id}"
1390
+
1313
1391
  if action_details["actionDetails"].get("containerId"):
1314
1392
  logging.info(
1315
1393
  "Using existing container ID for inference WebSocket server: %s",
@@ -1323,12 +1401,11 @@ def inference_ws_server_execute(self: ActionInstance):
1323
1401
  # Inference WebSocket server with --net=host (Port: 8102)
1324
1402
  worker_cmd = (
1325
1403
  f"docker run -d --pull=always --net=host "
1326
- f"--name inference "
1327
- f"--cidfile ./{self.action_record_id}.cid "
1404
+ f"--name {container_name} "
1328
1405
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1329
1406
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1330
1407
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1331
- f' --restart=unless-stopped '
1408
+ f'--restart=unless-stopped '
1332
1409
  f"{image} "
1333
1410
  f"./app "
1334
1411
  f"{self.action_record_id} "
@@ -1359,6 +1436,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
1359
1436
 
1360
1437
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1361
1438
 
1439
+ container_name = f"fe_streaming_{self.action_record_id}"
1440
+
1362
1441
  if action_details["actionDetails"].get("containerId"):
1363
1442
  logging.info(
1364
1443
  "Using existing container ID for frontend streaming: %s",
@@ -1372,15 +1451,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
1372
1451
  # Frontend streaming with --net=host (Port: 3000)
1373
1452
  worker_cmd = (
1374
1453
  f"docker run -d --pull=always --net=host "
1375
- f"--name fe_streaming "
1376
- f"--cidfile ./{self.action_record_id}.cid "
1454
+ f"--name {container_name} "
1377
1455
  f"-v matrice_myvol:/matrice_data "
1378
1456
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1379
1457
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1380
1458
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1381
1459
  f"-e PORT=3000 "
1382
1460
  f'-e WS_HOST="{ws_url}" '
1383
- f' --restart=unless-stopped '
1461
+ f'--restart=unless-stopped '
1384
1462
  f"{image}"
1385
1463
  )
1386
1464
  logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1405,6 +1483,8 @@ def fe_analytics_service_execute(self: ActionInstance):
1405
1483
 
1406
1484
  project_id = action_details["_idProject"]
1407
1485
 
1486
+ container_name = f"fe_analytics_{self.action_record_id}"
1487
+
1408
1488
  if action_details["actionDetails"].get("containerId"):
1409
1489
  logging.info(
1410
1490
  "Using existing container ID for frontend analytics service: %s",
@@ -1418,15 +1498,14 @@ def fe_analytics_service_execute(self: ActionInstance):
1418
1498
  # Frontend analytics service with --net=host (Port: 3001)
1419
1499
  worker_cmd = (
1420
1500
  f"docker run -d --pull=always --net=host "
1421
- f"--name fe-analytics "
1422
- f"--cidfile ./{self.action_record_id}.cid "
1501
+ f"--name {container_name} "
1423
1502
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1424
1503
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1425
1504
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1426
1505
  f'-e ACTION_ID="{self.action_record_id}" '
1427
1506
  f"-e PORT=3001 "
1428
1507
  f'-e PROJECT_ID="{project_id}" '
1429
- f' --restart=unless-stopped '
1508
+ f'--restart=unless-stopped '
1430
1509
  f"{image}"
1431
1510
  )
1432
1511
  logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1451,7 +1530,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1451
1530
  else:
1452
1531
  return
1453
1532
  use_gpu = self.get_gpu_config(action_details)
1454
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1533
+ container_name = f"dataset_generation_{self.action_record_id}"
1534
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1455
1535
  logging.info("cmd is: %s", cmd)
1456
1536
  self.start(cmd, "dataset_generation")
1457
1537
 
@@ -1472,7 +1552,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1472
1552
  else:
1473
1553
  return
1474
1554
  use_gpu = self.get_gpu_config(action_details)
1475
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1555
+ container_name = f"synthetic_data_setup_{self.action_record_id}"
1556
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1476
1557
  logging.info("cmd is: %s", cmd)
1477
1558
  self.start(cmd, "synthetic_data_setup")
1478
1559
 
@@ -1509,6 +1590,8 @@ def redis_setup_execute(self: ActionInstance):
1509
1590
 
1510
1591
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1511
1592
 
1593
+ # Define container names with action_record_id for uniqueness
1594
+ redis_container_name = f"redis_{self.action_record_id}"
1512
1595
 
1513
1596
  if action_details["actionDetails"].get("containerId"):
1514
1597
  logging.info(
@@ -1520,18 +1603,34 @@ def redis_setup_execute(self: ActionInstance):
1520
1603
  self.start(cmd, "redis_setup")
1521
1604
 
1522
1605
  # Redis container restart
1523
- redis_restart_cmd = "docker restart redis_container"
1606
+ redis_restart_cmd = f"docker restart {redis_container_name}"
1524
1607
  self.start(redis_restart_cmd, "redis")
1525
1608
 
1526
1609
  return
1527
1610
 
1528
- # Redis container with --net=host (Port: 6379)
1611
+ # Redis container with --net=host (Port: 6379) with optimized configuration
1529
1612
  redis_cmd = (
1530
1613
  f"docker run -d --net=host "
1531
- f"--name redis_container "
1614
+ f"--name {redis_container_name} "
1532
1615
  f"--restart unless-stopped "
1533
1616
  f"{redis_image} "
1534
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1617
+ f"redis-server --bind 0.0.0.0 "
1618
+ f"--appendonly no "
1619
+ f'--save "" '
1620
+ f"--maxmemory 30gb "
1621
+ f"--maxmemory-policy allkeys-lru "
1622
+ f"--io-threads 4 "
1623
+ f"--io-threads-do-reads yes "
1624
+ f"--stream-node-max-bytes 8192 "
1625
+ f"--stream-node-max-entries 1000 "
1626
+ f"--hz 100 "
1627
+ f"--tcp-backlog 2048 "
1628
+ f"--timeout 0 "
1629
+ f"--lazyfree-lazy-eviction yes "
1630
+ f"--lazyfree-lazy-expire yes "
1631
+ f"--lazyfree-lazy-server-del yes "
1632
+ f"--activedefrag yes "
1633
+ f"--requirepass {redis_password}"
1535
1634
  )
1536
1635
 
1537
1636
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
@@ -1555,8 +1654,9 @@ def redis_setup_execute(self: ActionInstance):
1555
1654
 
1556
1655
  # bg-redis management container with --net=host (Port: 8082)
1557
1656
  cmd = (
1558
- f"docker run --net=host "
1559
- f"--cidfile ./{self.action_record_id}.cid "
1657
+ f"docker run -d --net=host "
1658
+ f"--restart unless-stopped "
1659
+ f"--name bg-redis_{self.action_record_id} "
1560
1660
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1561
1661
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1562
1662
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1583,7 +1683,8 @@ def deploy_aggregator_execute(
1583
1683
  if not action_details:
1584
1684
  return
1585
1685
  self.setup_action_requirements(action_details, work_fs)
1586
- cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1686
+ container_name = f"deploy_aggregator_{self.action_record_id}"
1687
+ cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1587
1688
  logging.info("cmd: %s", cmd)
1588
1689
  self.start(cmd, "deploy_aggregator")
1589
1690
 
@@ -1599,6 +1700,10 @@ def model_deploy_execute(self: ActionInstance):
1599
1700
  return
1600
1701
  action_id = action_details["_id"]
1601
1702
  model_family = action_details["actionDetails"]["modelFamily"]
1703
+
1704
+ # Get the service ID to track deployments
1705
+ service_id = action_details.get("_idService")
1706
+
1602
1707
  self.setup_action_requirements(
1603
1708
  action_details,
1604
1709
  work_fs,
@@ -1606,17 +1711,29 @@ def model_deploy_execute(self: ActionInstance):
1606
1711
  action_id=action_id,
1607
1712
  )
1608
1713
 
1609
- # Get GPU configuration based on requirements and availability
1610
- # This uses the best-fit algorithm to select the most appropriate GPU(s)
1611
- use_gpu = self.get_gpu_config(action_details)
1612
-
1613
- # Override: If GPU is required, use all available GPUs
1714
+ # Use all GPUs if GPU is required
1614
1715
  gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
1615
1716
  if gpuRequired:
1616
1717
  use_gpu = "--runtime=nvidia --gpus all"
1718
+ else:
1719
+ use_gpu = ""
1720
+
1721
+ logging.info(
1722
+ "Action %s: Model deployment GPU config: %s",
1723
+ action_id,
1724
+ use_gpu if use_gpu else "CPU-only"
1725
+ )
1726
+
1727
+ # Get or create TRITON_PORTS (uses utility method)
1728
+ triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1617
1729
 
1618
- extra_env_vars = {"INTERNAL_PORT": internal_port}
1619
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1730
+ extra_env_vars = {
1731
+ "INTERNAL_PORT": internal_port,
1732
+ "TRITON_PORTS": triton_ports
1733
+ }
1734
+
1735
+ container_name = f"model_deploy_{self.action_record_id}"
1736
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1620
1737
  logging.info("cmd is: %s", cmd)
1621
1738
  self.start(cmd, "deploy_log")
1622
1739
 
@@ -1649,7 +1766,8 @@ def model_train_execute(self: ActionInstance):
1649
1766
  self.start(cmd, "train_log")
1650
1767
  return
1651
1768
 
1652
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1769
+ container_name = f"model_train_{self.action_record_id}"
1770
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1653
1771
  logging.info("cmd is: %s", cmd)
1654
1772
  self.start(cmd, "train_log")
1655
1773
 
@@ -1672,7 +1790,7 @@ def model_eval_execute(self: ActionInstance):
1672
1790
  )
1673
1791
  if action_details["actionDetails"].get("containerId"):
1674
1792
  logging.info(
1675
- "Using existing container ID for training: %s",
1793
+ "Using existing container ID for evaluation: %s",
1676
1794
  action_details["actionDetails"]["containerId"],
1677
1795
  )
1678
1796
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1680,7 +1798,8 @@ def model_eval_execute(self: ActionInstance):
1680
1798
  self.start(cmd, "eval_log")
1681
1799
  return
1682
1800
 
1683
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1801
+ container_name = f"model_eval_{self.action_record_id}"
1802
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1684
1803
  logging.info("cmd is: %s", cmd)
1685
1804
  self.start(cmd, "eval_log")
1686
1805
 
@@ -1706,7 +1825,7 @@ def model_export_execute(self: ActionInstance):
1706
1825
  )
1707
1826
  if action_details["actionDetails"].get("containerId"):
1708
1827
  logging.info(
1709
- "Using existing container ID for training: %s",
1828
+ "Using existing container ID for export: %s",
1710
1829
  action_details["actionDetails"]["containerId"],
1711
1830
  )
1712
1831
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1714,7 +1833,8 @@ def model_export_execute(self: ActionInstance):
1714
1833
  self.start(cmd, "export_log")
1715
1834
  return
1716
1835
 
1717
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1836
+ container_name = f"model_export_{self.action_record_id}"
1837
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1718
1838
  logging.info("cmd is: %s", cmd)
1719
1839
  self.start(cmd, "export_log")
1720
1840
 
@@ -1730,7 +1850,8 @@ def image_build_execute(self: ActionInstance):
1730
1850
  action_id = action_details["_id"]
1731
1851
  internal_api_key = self.get_internal_api_key(action_id)
1732
1852
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1733
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1853
+ container_name = f"image_build_{self.action_record_id}"
1854
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1734
1855
  logging.info("cmd is: %s", cmd)
1735
1856
  self.start(cmd, "image_build_log")
1736
1857
 
@@ -1742,7 +1863,8 @@ def resource_clone_execute(self: ActionInstance):
1742
1863
  if not action_details:
1743
1864
  return
1744
1865
  self.setup_action_requirements(action_details)
1745
- cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1866
+ container_name = f"resource_clone_{self.action_record_id}"
1867
+ cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1746
1868
  logging.info("cmd is: %s", cmd)
1747
1869
  self.start(cmd, "resource_clone")
1748
1870
 
@@ -1760,7 +1882,7 @@ def streaming_gateway_execute(self: ActionInstance):
1760
1882
  )
1761
1883
  if action_details["actionDetails"].get("containerId"):
1762
1884
  logging.info(
1763
- "Using existing container ID for training: %s",
1885
+ "Using existing container ID for streaming gateway: %s",
1764
1886
  action_details["actionDetails"]["containerId"],
1765
1887
  )
1766
1888
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1768,7 +1890,8 @@ def streaming_gateway_execute(self: ActionInstance):
1768
1890
  self.start(cmd, "streaming_gateway")
1769
1891
  return
1770
1892
 
1771
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1893
+ container_name = f"streaming_gateway_{self.action_record_id}"
1894
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1772
1895
  logging.info("cmd is: %s", cmd)
1773
1896
  self.start(cmd, "streaming_gateway")
1774
1897
 
@@ -1864,7 +1987,7 @@ def kafka_setup_execute(self: ActionInstance):
1864
1987
 
1865
1988
  if action_details["actionDetails"].get("containerId"):
1866
1989
  logging.info(
1867
- "Using existing container ID for training: %s",
1990
+ "Using existing container ID for kafka: %s",
1868
1991
  action_details["actionDetails"]["containerId"],
1869
1992
  )
1870
1993
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1872,10 +1995,12 @@ def kafka_setup_execute(self: ActionInstance):
1872
1995
  self.start(cmd, "kafka_setup")
1873
1996
  return
1874
1997
 
1998
+ container_name = f"kafka_{self.action_record_id}"
1875
1999
 
1876
2000
  # Kafka container with --net=host (Ports: 9092, 9093)
1877
2001
  cmd = (
1878
- f"docker run --net=host "
2002
+ f"docker run -d --net=host "
2003
+ f"--name {container_name} "
1879
2004
  f"{env_args} "
1880
2005
  f"--shm-size=30G --pull=always "
1881
2006
  f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -1908,6 +2033,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
1908
2033
 
1909
2034
  self.setup_action_requirements(action_details)
1910
2035
 
2036
+ container_name = f"inference_tracker_{self.action_record_id}"
2037
+
1911
2038
  if action_details["actionDetails"].get("containerId"):
1912
2039
  logging.info(
1913
2040
  "Using existing container ID for inference tracker: %s",
@@ -1921,14 +2048,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
1921
2048
  # This is the existing Docker run command
1922
2049
  worker_cmd = (
1923
2050
  f"docker run -d --pull=always --net=host "
1924
- f"--cidfile ./{self.action_record_id}.cid "
1925
- f"--name inference-tracker-worker "
2051
+ f"--name {container_name} "
1926
2052
  f"-v matrice_myvol:/matrice_data "
1927
2053
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1928
2054
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1929
2055
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1930
2056
  f'-e ACTION_ID="{self.action_record_id}" '
1931
- f' --restart=unless-stopped '
2057
+ f'--restart=unless-stopped '
1932
2058
  f"{image}"
1933
2059
  )
1934
2060
 
@@ -1950,9 +2076,11 @@ def video_storage_setup_execute(self: ActionInstance):
1950
2076
 
1951
2077
  self.setup_action_requirements(action_details)
1952
2078
 
2079
+ container_name = f"video_storage_{self.action_record_id}"
2080
+
1953
2081
  if action_details["actionDetails"].get("containerId"):
1954
2082
  logging.info(
1955
- "Using existing container ID for inference tracker: %s",
2083
+ "Using existing container ID for video storage: %s",
1956
2084
  action_details["actionDetails"]["containerId"],
1957
2085
  )
1958
2086
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1963,14 +2091,13 @@ def video_storage_setup_execute(self: ActionInstance):
1963
2091
  # This is the existing Docker run command
1964
2092
  worker_cmd = (
1965
2093
  f"docker run -d --pull=always --net=host "
1966
- f"--cidfile ./{self.action_record_id}.cid "
1967
- f"--name media_server "
2094
+ f"--name {container_name} "
1968
2095
  f"-v matrice_myvol:/matrice_data "
1969
2096
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1970
2097
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1971
2098
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1972
2099
  f'-e ACTION_ID="{self.action_record_id}" '
1973
- f' --restart=unless-stopped '
2100
+ f'--restart=unless-stopped '
1974
2101
  f"{image}"
1975
2102
  )
1976
2103
 
@@ -916,14 +916,27 @@ class ResourcesTracker:
916
916
  gpu_count = 0
917
917
 
918
918
  for gpu in gpu_data['gpus']:
919
- gpu_memory_free += gpu['memory_total'] - gpu['memory_used']
919
+ # Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
920
+ # which would otherwise produce negative "free" memory.
921
+ total_mb = gpu.get('memory_total', 0) or 0
922
+ used_mb = gpu.get('memory_used', 0) or 0
923
+ free_mb = total_mb - used_mb
924
+ if free_mb < 0:
925
+ logging.debug(
926
+ "Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
927
+ gpu.get('idx'),
928
+ total_mb,
929
+ used_mb,
930
+ )
931
+ free_mb = 0
932
+ gpu_memory_free += free_mb
920
933
  gpu_utilization += gpu['utilization']
921
934
  gpu_count += 1
922
935
 
923
936
  if gpu_count > 0:
924
937
  gpu_utilization /= gpu_count
925
-
926
- return gpu_memory_free, gpu_utilization
938
+
939
+ return max(0, gpu_memory_free), gpu_utilization
927
940
 
928
941
  @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
929
942
  def _get_gpu_resources_direct(self) -> Tuple[int, float]:
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
1218
1231
  availableCPU=available_cpu,
1219
1232
  availableMemory=available_memory,
1220
1233
  availableGPU=100 - gpu_utilization,
1221
- availableGPUMemory=gpu_memory_free,
1234
+ availableGPUMemory=max(0, gpu_memory_free),
1222
1235
  )
1223
1236
  if err is not None:
1224
1237
  logging.error(