matrice-compute 0.1.30__tar.gz → 0.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/__init__.py +4 -0
  4. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/action_instance.py +375 -208
  5. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/actions_manager.py +1 -1
  6. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/instance_manager.py +1 -1
  7. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/scaling.py +1 -1
  8. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/shutdown_manager.py +2 -2
  9. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/LICENSE.txt +0 -0
  10. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/README.md +0 -0
  11. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/matrice_compute.egg-info/SOURCES.txt +0 -0
  12. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/matrice_compute.egg-info/dependency_links.txt +0 -0
  13. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/matrice_compute.egg-info/not-zip-safe +0 -0
  14. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/matrice_compute.egg-info/top_level.txt +0 -0
  15. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/pyproject.toml +0 -0
  16. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/setup.cfg +0 -0
  17. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/setup.py +0 -0
  18. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  19. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/compute_operations_handler.py +0 -0
  20. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/instance_utils.py +0 -0
  21. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/prechecks.py +0 -0
  22. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/py.typed +0 -0
  23. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/resources_tracker.py +0 -0
  24. {matrice_compute-0.1.30 → matrice_compute-0.1.32}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.30
3
+ Version: 0.1.32
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.30
3
+ Version: 0.1.32
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,7 @@
1
1
  """Module providing __init__ functionality."""
2
2
 
3
3
  import subprocess
4
+ import logging
4
5
 
5
6
  from matrice_common.utils import dependencies_check
6
7
 
@@ -17,4 +18,7 @@ subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection erro
17
18
 
18
19
  from matrice_compute.instance_manager import InstanceManager # noqa: E402
19
20
 
21
+ logging.getLogger("kafka").setLevel(logging.INFO)
22
+ logging.getLogger("confluent_kafka").setLevel(logging.INFO)
23
+
20
24
  __all__ = ["InstanceManager"]
@@ -296,7 +296,7 @@ class ActionInstance:
296
296
  getattr(self, "action_record_id", "unknown"),
297
297
  )
298
298
  else:
299
- logging.debug(
299
+ logging.info(
300
300
  "No additional logs to send for action %s",
301
301
  getattr(self, "action_record_id", "unknown"),
302
302
  )
@@ -411,6 +411,7 @@ class ActionInstance:
411
411
  destination_workspace_path: str = "/usr/src/workspace",
412
412
  docker_workdir: str = "",
413
413
  extra_pkgs: list = [],
414
+ container_name: str = "",
414
415
  ):
415
416
  """Build base Docker command with common options.
416
417
 
@@ -425,6 +426,7 @@ class ActionInstance:
425
426
  destination_workspace_path (str): Container workspace path
426
427
  docker_workdir (str): Docker working directory
427
428
  extra_pkgs (list): List of extra packages to install
429
+ container_name (str): Docker container name (format: {action_type}_{action_id})
428
430
  Returns:
429
431
  str: Base Docker command
430
432
  """
@@ -489,13 +491,16 @@ class ActionInstance:
489
491
  ]
490
492
  )
491
493
 
494
+ # Build container name option if provided
495
+ name_option = f"--name {container_name}" if container_name else ""
496
+
492
497
  cmd_parts = [
493
- f"docker run {use_gpu} ",
498
+ f"docker run -d {use_gpu} ",
499
+ name_option,
494
500
  network_config,
495
501
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
502
  *volumes,
497
503
  # Container configuration and startup commands
498
- f"--cidfile ./{self.action_record_id}.cid ",
499
504
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
500
505
  f'/bin/bash -c "cd {docker_workdir} && '
501
506
  f"{env_exports} && "
@@ -883,6 +888,34 @@ class ActionInstance:
883
888
  job_params=action_details["jobParams"],
884
889
  )
885
890
 
891
+ @staticmethod
892
+ def container_exists(container_id: str) -> bool:
893
+ """Check if a Docker container exists.
894
+
895
+ Args:
896
+ container_id (str): Container ID or name to check
897
+
898
+ Returns:
899
+ bool: True if container exists, False otherwise
900
+ """
901
+ if not container_id:
902
+ return False
903
+ try:
904
+ result = subprocess.run(
905
+ ["docker", "inspect", container_id],
906
+ capture_output=True,
907
+ text=True,
908
+ timeout=10
909
+ )
910
+ return result.returncode == 0
911
+ except Exception as e:
912
+ logging.warning(
913
+ "Error checking if container %s exists: %s",
914
+ container_id,
915
+ str(e)
916
+ )
917
+ return False
918
+
886
919
  @log_errors(raise_exception=True)
887
920
  def start_process(self, cmd, log_name):
888
921
  """Start the process and initialize logging.
@@ -897,60 +930,45 @@ class ActionInstance:
897
930
  self.cmd = cmd
898
931
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
899
932
 
900
- with open(self.log_path, "wb") as out:
901
- self.process = subprocess.Popen(
902
- shlex.split(self.cmd),
903
- stdout=out,
904
- stderr=out,
905
- env={**os.environ},
906
- start_new_session=True,
907
- )
933
+ # Run docker with -d flag to get container ID from stdout
934
+ process = subprocess.Popen(
935
+ shlex.split(self.cmd),
936
+ stdout=subprocess.PIPE,
937
+ stderr=subprocess.PIPE,
938
+ text=True,
939
+ env={**os.environ},
940
+ )
908
941
 
909
- self.container_id = None
910
-
911
- cid_file_path = f"./{self.action_record_id}.cid"
912
- max_retries = 5
913
- retry_delay = 1 # seconds
914
- for attempt in range(max_retries):
915
- try:
916
- with open(cid_file_path, "r") as cid_file:
917
- container_id = cid_file.read().strip()
918
- self.container_id = container_id
919
- logging.info(
920
- "Started process for action %s with container ID: %s",
921
- self.action_record_id,
922
- self.container_id,
923
- )
924
- break
925
- except FileNotFoundError:
926
- logging.warning(
927
- "CID file not found for action %s, attempt %d/%d",
928
- self.action_record_id,
929
- attempt + 1,
930
- max_retries,
931
- )
932
- time.sleep(retry_delay)
933
- except Exception as e:
934
- logging.error(
935
- "Error reading CID file for action %s: %s",
936
- self.action_record_id,
937
- str(e),
938
- )
939
- time.sleep(retry_delay)
940
- else:
942
+ stdout, stderr = process.communicate(timeout=120)
943
+
944
+ if process.returncode != 0:
941
945
  logging.error(
942
- "Failed to read CID file for action %s after %d attempts",
946
+ "Docker run failed for action %s: %s",
943
947
  self.action_record_id,
944
- max_retries,
948
+ stderr,
945
949
  )
946
- raise Exception("Failed to start process: CID file not found")
950
+ raise RuntimeError(f"Docker run failed: {stderr}")
947
951
 
948
- # report container id to scaling service
952
+ self.container_id = stdout.strip()
953
+ logging.info(
954
+ "Started container for action %s with ID: %s",
955
+ self.action_record_id,
956
+ self.container_id,
957
+ )
958
+
959
+ # Start following container logs in background
960
+ self.process = subprocess.Popen(
961
+ ["docker", "logs", "-f", self.container_id],
962
+ stdout=open(self.log_path, "wb"),
963
+ stderr=subprocess.STDOUT,
964
+ start_new_session=True,
965
+ )
966
+
967
+ # Report container id to scaling service
949
968
  self.scaling.update_action_container_id(
950
969
  action_record_id=self.action_record_id,
951
970
  container_id=self.container_id,
952
971
  )
953
-
954
972
 
955
973
  @log_errors(raise_exception=False)
956
974
  def start_logger(self):
@@ -1111,7 +1129,8 @@ def data_preparation_execute(
1111
1129
  "Started pulling Docker image with PID: %s",
1112
1130
  process.pid,
1113
1131
  )
1114
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1132
+ container_name = f"data_prep_{self.action_record_id}"
1133
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1115
1134
  logging.info("cmd is: %s", cmd)
1116
1135
  self.start(cmd, "data_preparation_log")
1117
1136
 
@@ -1140,7 +1159,8 @@ def data_processing_execute(self: ActionInstance):
1140
1159
  service="bg-job-scheduler",
1141
1160
  job_params=action["jobParams"],
1142
1161
  )
1143
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1162
+ container_name = f"data_processing_{self.action_record_id}"
1163
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1144
1164
  logging.info("cmd: %s", cmd)
1145
1165
  self.start(cmd, "data_processing_log")
1146
1166
 
@@ -1153,7 +1173,8 @@ def data_split_execute(self: ActionInstance):
1153
1173
  if not action_details:
1154
1174
  return
1155
1175
  self.setup_action_requirements(action_details, work_fs, model_family="")
1156
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1176
+ container_name = f"data_split_{self.action_record_id}"
1177
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1157
1178
  logging.info("cmd: %s", cmd)
1158
1179
  self.start(cmd, "data_split")
1159
1180
 
@@ -1168,7 +1189,8 @@ def dataset_annotation_execute(
1168
1189
  if not action_details:
1169
1190
  return
1170
1191
  self.setup_action_requirements(action_details, work_fs)
1171
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1192
+ container_name = f"dataset_annotation_{self.action_record_id}"
1193
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1172
1194
  logging.info("cmd: %s", cmd)
1173
1195
  self.start(cmd, "dataset_annotation")
1174
1196
 
@@ -1183,7 +1205,8 @@ def dataset_augmentation_execute(
1183
1205
  if not action_details:
1184
1206
  return
1185
1207
  self.setup_action_requirements(action_details, work_fs)
1186
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1208
+ container_name = f"dataset_augmentation_{self.action_record_id}"
1209
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1187
1210
  logging.info("cmd: %s", cmd)
1188
1211
  self.start(cmd, "dataset_augmentation")
1189
1212
 
@@ -1199,7 +1222,8 @@ def augmentation_server_creation_execute(
1199
1222
  if not action_details:
1200
1223
  return
1201
1224
  self.setup_action_requirements(action_details, work_fs)
1202
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1225
+ container_name = f"augmentation_setup_{self.action_record_id}"
1226
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1203
1227
  logging.info("cmd: %s", cmd)
1204
1228
  self.start(cmd, "augmentation_setup")
1205
1229
 
@@ -1220,25 +1244,41 @@ def database_setup_execute(self: ActionInstance):
1220
1244
 
1221
1245
  project_id = action_details["_idProject"]
1222
1246
 
1223
- if action_details["actionDetails"].get("containerId"):
1224
- logging.info(
1225
- "Using existing container ID for inference tracker: %s",
1226
- action_details["actionDetails"]["containerId"],
1227
- )
1228
- self.docker_container = action_details["actionDetails"]["containerId"]
1229
- cmd = "docker restart " + self.docker_container
1230
- self.start(cmd, "qdrant_setup")
1247
+ # Define container names with action_record_id for uniqueness
1248
+ mongodb_container_name = f"database_setup_{self.action_record_id}"
1249
+ qdrant_container_name = f"qdrant_{self.action_record_id}"
1231
1250
 
1232
- #qdrant restart
1233
- qdrant_cmd = "docker restart qdrant"
1234
- self.start(qdrant_cmd, 'qdrant_setup')
1251
+ existing_container_id = action_details["actionDetails"].get("containerId")
1252
+ if existing_container_id:
1253
+ # Check if both containers actually exist before trying to restart
1254
+ mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
1255
+ qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
1235
1256
 
1236
- return
1257
+ if mongodb_container_exists and qdrant_container_exists:
1258
+ logging.info(
1259
+ "Using existing container ID for database setup: %s",
1260
+ existing_container_id,
1261
+ )
1262
+ self.docker_container = existing_container_id
1263
+ cmd = "docker restart " + self.docker_container
1264
+ self.start(cmd, "qdrant_setup")
1265
+
1266
+ # qdrant restart
1267
+ qdrant_cmd = f"docker restart {qdrant_container_name}"
1268
+ self.start(qdrant_cmd, "qdrant_setup")
1269
+ return
1270
+ else:
1271
+ logging.warning(
1272
+ "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
1273
+ mongodb_container_exists,
1274
+ qdrant_container_exists
1275
+ )
1276
+ # Fall through to create new containers
1237
1277
 
1238
1278
  # MongoDB container with --net=host (Port: 27020:27017)
1239
1279
  cmd = (
1240
1280
  f"docker run --pull=always --net=host "
1241
- f"--name mongodbdatabase "
1281
+ f"--name {mongodb_container_name} "
1242
1282
  f"-v matrice_myvol:/matrice_data "
1243
1283
  f"--cidfile ./{self.action_record_id}.cid "
1244
1284
  f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1253,7 +1293,7 @@ def database_setup_execute(self: ActionInstance):
1253
1293
  # Qdrant container with --net=host (Port: 6334)
1254
1294
  qdrant_cmd = (
1255
1295
  f"docker run --pull=always --net=host "
1256
- f"--name qdrant "
1296
+ f"--name {qdrant_container_name} "
1257
1297
  f"-v matrice_myvol:/matrice_data "
1258
1298
  f"{'qdrant/qdrant:latest'} "
1259
1299
  )
@@ -1279,23 +1319,32 @@ def facial_recognition_setup_execute(self: ActionInstance):
1279
1319
 
1280
1320
  self.setup_action_requirements(action_details)
1281
1321
 
1282
- if action_details["actionDetails"].get("containerId"):
1283
- logging.info(
1284
- "Using existing container ID for facial recognition worker: %s",
1285
- action_details["actionDetails"]["containerId"],
1286
- )
1287
- self.docker_container = action_details["actionDetails"]["containerId"]
1288
- cmd = "docker restart " + self.docker_container
1289
- self.start(cmd, "facial_recognition_setup")
1290
- return
1322
+ existing_container_id = action_details["actionDetails"].get("containerId")
1323
+ if existing_container_id:
1324
+ # Check if container actually exists before trying to restart
1325
+ if ActionInstance.container_exists(existing_container_id):
1326
+ logging.info(
1327
+ "Using existing container ID for facial recognition worker: %s",
1328
+ existing_container_id,
1329
+ )
1330
+ self.docker_container = existing_container_id
1331
+ cmd = "docker restart " + self.docker_container
1332
+ self.start(cmd, "facial_recognition_setup")
1333
+ return
1334
+ else:
1335
+ logging.warning(
1336
+ "Container %s not found. Creating new container.",
1337
+ existing_container_id
1338
+ )
1339
+ # Fall through to create new container
1291
1340
 
1292
1341
  # Facial recognition worker container with --net=host (Port: 8081)
1342
+ container_name = f"facial_recognition_{self.action_record_id}"
1293
1343
  worker_cmd = (
1294
1344
  f"docker run -d --pull=always --net=host "
1295
- f"--name worker "
1296
- f"--cidfile ./{self.action_record_id}.cid "
1297
- f"-v matrice_myvol:/matrice_data "
1345
+ f"--name {container_name} "
1298
1346
  f"--cidfile ./{self.action_record_id}.cid "
1347
+ f"-v matrice_myvol:/matrice_data "
1299
1348
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1300
1349
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1301
1350
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,20 +1370,30 @@ def lpr_setup_execute(self: ActionInstance):
1321
1370
 
1322
1371
  self.setup_action_requirements(action_details)
1323
1372
 
1324
- if action_details["actionDetails"].get("containerId"):
1325
- logging.info(
1326
- "Using existing container ID for LPR worker: %s",
1327
- action_details["actionDetails"]["containerId"],
1328
- )
1329
- self.docker_container = action_details["actionDetails"]["containerId"]
1330
- cmd = "docker restart " + self.docker_container
1331
- self.start(cmd, "lpr_setup")
1332
- return
1373
+ existing_container_id = action_details["actionDetails"].get("containerId")
1374
+ if existing_container_id:
1375
+ # Check if container actually exists before trying to restart
1376
+ if ActionInstance.container_exists(existing_container_id):
1377
+ logging.info(
1378
+ "Using existing container ID for LPR worker: %s",
1379
+ existing_container_id,
1380
+ )
1381
+ self.docker_container = existing_container_id
1382
+ cmd = "docker restart " + self.docker_container
1383
+ self.start(cmd, "lpr_setup")
1384
+ return
1385
+ else:
1386
+ logging.warning(
1387
+ "Container %s not found. Creating new container.",
1388
+ existing_container_id
1389
+ )
1390
+ # Fall through to create new container
1333
1391
 
1334
1392
  # LPR worker container with --net=host (Port: 8082)
1393
+ container_name = f"lpr_{self.action_record_id}"
1335
1394
  worker_cmd = (
1336
1395
  f"docker run -d --net=host --pull=always "
1337
- f"--name lpr-worker "
1396
+ f"--name {container_name} "
1338
1397
  f"--cidfile ./{self.action_record_id}.cid "
1339
1398
  f"-v matrice_myvol:/matrice_data "
1340
1399
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1372,20 +1431,30 @@ def inference_ws_server_execute(self: ActionInstance):
1372
1431
 
1373
1432
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1374
1433
 
1375
- if action_details["actionDetails"].get("containerId"):
1376
- logging.info(
1377
- "Using existing container ID for inference WebSocket server: %s",
1378
- action_details["actionDetails"]["containerId"],
1379
- )
1380
- self.docker_container = action_details["actionDetails"]["containerId"]
1381
- cmd = "docker restart " + self.docker_container
1382
- self.start(cmd, "inference_ws_server")
1383
- return
1434
+ existing_container_id = action_details["actionDetails"].get("containerId")
1435
+ if existing_container_id:
1436
+ # Check if container actually exists before trying to restart
1437
+ if ActionInstance.container_exists(existing_container_id):
1438
+ logging.info(
1439
+ "Using existing container ID for inference WebSocket server: %s",
1440
+ existing_container_id,
1441
+ )
1442
+ self.docker_container = existing_container_id
1443
+ cmd = "docker restart " + self.docker_container
1444
+ self.start(cmd, "inference_ws_server")
1445
+ return
1446
+ else:
1447
+ logging.warning(
1448
+ "Container %s not found. Creating new container.",
1449
+ existing_container_id
1450
+ )
1451
+ # Fall through to create new container
1384
1452
 
1385
1453
  # Inference WebSocket server with --net=host (Port: 8102)
1454
+ container_name = f"inference_ws_{self.action_record_id}"
1386
1455
  worker_cmd = (
1387
1456
  f"docker run -d --pull=always --net=host "
1388
- f"--name inference "
1457
+ f"--name {container_name} "
1389
1458
  f"--cidfile ./{self.action_record_id}.cid "
1390
1459
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1391
1460
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1420,20 +1489,30 @@ def fe_fs_streaming_execute(self: ActionInstance):
1420
1489
 
1421
1490
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1422
1491
 
1423
- if action_details["actionDetails"].get("containerId"):
1424
- logging.info(
1425
- "Using existing container ID for frontend streaming: %s",
1426
- action_details["actionDetails"]["containerId"],
1427
- )
1428
- self.docker_container = action_details["actionDetails"]["containerId"]
1429
- cmd = "docker restart " + self.docker_container
1430
- self.start(cmd, "fe_fs_streaming")
1431
- return
1432
-
1492
+ existing_container_id = action_details["actionDetails"].get("containerId")
1493
+ if existing_container_id:
1494
+ # Check if container actually exists before trying to restart
1495
+ if ActionInstance.container_exists(existing_container_id):
1496
+ logging.info(
1497
+ "Using existing container ID for frontend streaming: %s",
1498
+ existing_container_id,
1499
+ )
1500
+ self.docker_container = existing_container_id
1501
+ cmd = "docker restart " + self.docker_container
1502
+ self.start(cmd, "fe_fs_streaming")
1503
+ return
1504
+ else:
1505
+ logging.warning(
1506
+ "Container %s not found. Creating new container.",
1507
+ existing_container_id
1508
+ )
1509
+ # Fall through to create new container
1510
+
1433
1511
  # Frontend streaming with --net=host (Port: 3000)
1512
+ container_name = f"fe_streaming_{self.action_record_id}"
1434
1513
  worker_cmd = (
1435
1514
  f"docker run -d --pull=always --net=host "
1436
- f"--name fe_streaming "
1515
+ f"--name {container_name} "
1437
1516
  f"--cidfile ./{self.action_record_id}.cid "
1438
1517
  f"-v matrice_myvol:/matrice_data "
1439
1518
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1465,20 +1544,30 @@ def fe_analytics_service_execute(self: ActionInstance):
1465
1544
 
1466
1545
  project_id = action_details["_idProject"]
1467
1546
 
1468
- if action_details["actionDetails"].get("containerId"):
1469
- logging.info(
1470
- "Using existing container ID for frontend analytics service: %s",
1471
- action_details["actionDetails"]["containerId"],
1472
- )
1473
- self.docker_container = action_details["actionDetails"]["containerId"]
1474
- cmd = "docker restart " + self.docker_container
1475
- self.start(cmd, "fe_analytics_service")
1476
- return
1477
-
1547
+ existing_container_id = action_details["actionDetails"].get("containerId")
1548
+ if existing_container_id:
1549
+ # Check if container actually exists before trying to restart
1550
+ if ActionInstance.container_exists(existing_container_id):
1551
+ logging.info(
1552
+ "Using existing container ID for frontend analytics service: %s",
1553
+ existing_container_id,
1554
+ )
1555
+ self.docker_container = existing_container_id
1556
+ cmd = "docker restart " + self.docker_container
1557
+ self.start(cmd, "fe_analytics_service")
1558
+ return
1559
+ else:
1560
+ logging.warning(
1561
+ "Container %s not found. Creating new container.",
1562
+ existing_container_id
1563
+ )
1564
+ # Fall through to create new container
1565
+
1478
1566
  # Frontend analytics service with --net=host (Port: 3001)
1567
+ container_name = f"fe_analytics_{self.action_record_id}"
1479
1568
  worker_cmd = (
1480
1569
  f"docker run -d --pull=always --net=host "
1481
- f"--name fe-analytics "
1570
+ f"--name {container_name} "
1482
1571
  f"--cidfile ./{self.action_record_id}.cid "
1483
1572
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1484
1573
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1510,7 +1599,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1510
1599
  else:
1511
1600
  return
1512
1601
  use_gpu = self.get_gpu_config(action_details)
1513
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1602
+ container_name = f"dataset_generation_{self.action_record_id}"
1603
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1514
1604
  logging.info("cmd is: %s", cmd)
1515
1605
  self.start(cmd, "dataset_generation")
1516
1606
 
@@ -1531,7 +1621,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1531
1621
  else:
1532
1622
  return
1533
1623
  use_gpu = self.get_gpu_config(action_details)
1534
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1624
+ container_name = f"synthetic_data_setup_{self.action_record_id}"
1625
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1535
1626
  logging.info("cmd is: %s", cmd)
1536
1627
  self.start(cmd, "synthetic_data_setup")
1537
1628
 
@@ -1568,26 +1659,40 @@ def redis_setup_execute(self: ActionInstance):
1568
1659
 
1569
1660
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1570
1661
 
1662
+ # Define container names with action_record_id for uniqueness
1663
+ redis_container_name = f"redis_{self.action_record_id}"
1571
1664
 
1572
- if action_details["actionDetails"].get("containerId"):
1573
- logging.info(
1574
- "Using existing container ID for redis management: %s",
1575
- action_details["actionDetails"]["containerId"],
1576
- )
1577
- self.docker_container = action_details["actionDetails"]["containerId"]
1578
- cmd = "docker restart " + self.docker_container
1579
- self.start(cmd, "redis_setup")
1665
+ existing_container_id = action_details["actionDetails"].get("containerId")
1666
+ if existing_container_id:
1667
+ # Check if both containers actually exist before trying to restart
1668
+ management_container_exists = ActionInstance.container_exists(existing_container_id)
1669
+ redis_container_exists = ActionInstance.container_exists(redis_container_name)
1580
1670
 
1581
- # Redis container restart
1582
- redis_restart_cmd = "docker restart redis_container"
1583
- self.start(redis_restart_cmd, "redis")
1671
+ if management_container_exists and redis_container_exists:
1672
+ logging.info(
1673
+ "Using existing container ID for redis management: %s",
1674
+ existing_container_id,
1675
+ )
1676
+ self.docker_container = existing_container_id
1677
+ cmd = "docker restart " + self.docker_container
1678
+ self.start(cmd, "redis_setup")
1679
+
1680
+ # Redis container restart
1681
+ redis_restart_cmd = f"docker restart {redis_container_name}"
1682
+ self.start(redis_restart_cmd, "redis")
1683
+ return
1684
+ else:
1685
+ logging.warning(
1686
+ "Container(s) not found (management=%s, redis=%s). Creating new containers.",
1687
+ management_container_exists,
1688
+ redis_container_exists
1689
+ )
1690
+ # Fall through to create new containers
1584
1691
 
1585
- return
1586
-
1587
1692
  # Redis container with --net=host (Port: 6379)
1588
1693
  redis_cmd = (
1589
1694
  f"docker run -d --net=host "
1590
- f"--name redis_container "
1695
+ f"--name {redis_container_name} "
1591
1696
  f"--restart unless-stopped "
1592
1697
  f"{redis_image} "
1593
1698
  f"redis-server --bind 0.0.0.0 "
@@ -1657,7 +1762,8 @@ def deploy_aggregator_execute(
1657
1762
  if not action_details:
1658
1763
  return
1659
1764
  self.setup_action_requirements(action_details, work_fs)
1660
- cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1765
+ container_name = f"deploy_aggregator_{self.action_record_id}"
1766
+ cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1661
1767
  logging.info("cmd: %s", cmd)
1662
1768
  self.start(cmd, "deploy_aggregator")
1663
1769
 
@@ -1705,7 +1811,8 @@ def model_deploy_execute(self: ActionInstance):
1705
1811
  "TRITON_PORTS": triton_ports
1706
1812
  }
1707
1813
 
1708
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1814
+ container_name = f"model_deploy_{self.action_record_id}"
1815
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1709
1816
  logging.info("cmd is: %s", cmd)
1710
1817
  self.start(cmd, "deploy_log")
1711
1818
 
@@ -1728,17 +1835,27 @@ def model_train_execute(self: ActionInstance):
1728
1835
  action_id=action_id,
1729
1836
  )
1730
1837
 
1731
- if action_details["actionDetails"].get("containerId"):
1732
- logging.info(
1733
- "Using existing container ID for training: %s",
1734
- action_details["actionDetails"]["containerId"],
1735
- )
1736
- self.docker_container = action_details["actionDetails"]["containerId"]
1737
- cmd = "docker restart " + self.docker_container
1738
- self.start(cmd, "train_log")
1739
- return
1740
-
1741
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1838
+ existing_container_id = action_details["actionDetails"].get("containerId")
1839
+ if existing_container_id:
1840
+ # Check if container actually exists before trying to restart
1841
+ if ActionInstance.container_exists(existing_container_id):
1842
+ logging.info(
1843
+ "Using existing container ID for training: %s",
1844
+ existing_container_id,
1845
+ )
1846
+ self.docker_container = existing_container_id
1847
+ cmd = "docker restart " + self.docker_container
1848
+ self.start(cmd, "train_log")
1849
+ return
1850
+ else:
1851
+ logging.warning(
1852
+ "Container %s not found. Creating new container.",
1853
+ existing_container_id
1854
+ )
1855
+ # Fall through to create new container
1856
+
1857
+ container_name = f"model_train_{self.action_record_id}"
1858
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1742
1859
  logging.info("cmd is: %s", cmd)
1743
1860
  self.start(cmd, "train_log")
1744
1861
 
@@ -1759,17 +1876,27 @@ def model_eval_execute(self: ActionInstance):
1759
1876
  model_family=model_family,
1760
1877
  action_id=action_id,
1761
1878
  )
1762
- if action_details["actionDetails"].get("containerId"):
1763
- logging.info(
1764
- "Using existing container ID for training: %s",
1765
- action_details["actionDetails"]["containerId"],
1766
- )
1767
- self.docker_container = action_details["actionDetails"]["containerId"]
1768
- cmd = "docker restart " + self.docker_container
1769
- self.start(cmd, "eval_log")
1770
- return
1771
-
1772
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1879
+ existing_container_id = action_details["actionDetails"].get("containerId")
1880
+ if existing_container_id:
1881
+ # Check if container actually exists before trying to restart
1882
+ if ActionInstance.container_exists(existing_container_id):
1883
+ logging.info(
1884
+ "Using existing container ID for evaluation: %s",
1885
+ existing_container_id,
1886
+ )
1887
+ self.docker_container = existing_container_id
1888
+ cmd = "docker restart " + self.docker_container
1889
+ self.start(cmd, "eval_log")
1890
+ return
1891
+ else:
1892
+ logging.warning(
1893
+ "Container %s not found. Creating new container.",
1894
+ existing_container_id
1895
+ )
1896
+ # Fall through to create new container
1897
+
1898
+ container_name = f"model_eval_{self.action_record_id}"
1899
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1773
1900
  logging.info("cmd is: %s", cmd)
1774
1901
  self.start(cmd, "eval_log")
1775
1902
 
@@ -1793,17 +1920,27 @@ def model_export_execute(self: ActionInstance):
1793
1920
  model_family=model_family,
1794
1921
  action_id=action_id,
1795
1922
  )
1796
- if action_details["actionDetails"].get("containerId"):
1797
- logging.info(
1798
- "Using existing container ID for training: %s",
1799
- action_details["actionDetails"]["containerId"],
1800
- )
1801
- self.docker_container = action_details["actionDetails"]["containerId"]
1802
- cmd = "docker restart " + self.docker_container
1803
- self.start(cmd, "export_log")
1804
- return
1805
-
1806
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1923
+ existing_container_id = action_details["actionDetails"].get("containerId")
1924
+ if existing_container_id:
1925
+ # Check if container actually exists before trying to restart
1926
+ if ActionInstance.container_exists(existing_container_id):
1927
+ logging.info(
1928
+ "Using existing container ID for export: %s",
1929
+ existing_container_id,
1930
+ )
1931
+ self.docker_container = existing_container_id
1932
+ cmd = "docker restart " + self.docker_container
1933
+ self.start(cmd, "export_log")
1934
+ return
1935
+ else:
1936
+ logging.warning(
1937
+ "Container %s not found. Creating new container.",
1938
+ existing_container_id
1939
+ )
1940
+ # Fall through to create new container
1941
+
1942
+ container_name = f"model_export_{self.action_record_id}"
1943
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1807
1944
  logging.info("cmd is: %s", cmd)
1808
1945
  self.start(cmd, "export_log")
1809
1946
 
@@ -1819,7 +1956,8 @@ def image_build_execute(self: ActionInstance):
1819
1956
  action_id = action_details["_id"]
1820
1957
  internal_api_key = self.get_internal_api_key(action_id)
1821
1958
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1822
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1959
+ container_name = f"image_build_{self.action_record_id}"
1960
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1823
1961
  logging.info("cmd is: %s", cmd)
1824
1962
  self.start(cmd, "image_build_log")
1825
1963
 
@@ -1831,7 +1969,8 @@ def resource_clone_execute(self: ActionInstance):
1831
1969
  if not action_details:
1832
1970
  return
1833
1971
  self.setup_action_requirements(action_details)
1834
- cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1972
+ container_name = f"resource_clone_{self.action_record_id}"
1973
+ cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1835
1974
  logging.info("cmd is: %s", cmd)
1836
1975
  self.start(cmd, "resource_clone")
1837
1976
 
@@ -1847,17 +1986,27 @@ def streaming_gateway_execute(self: ActionInstance):
1847
1986
  self.docker_container = (
1848
1987
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1849
1988
  )
1850
- if action_details["actionDetails"].get("containerId"):
1851
- logging.info(
1852
- "Using existing container ID for training: %s",
1853
- action_details["actionDetails"]["containerId"],
1854
- )
1855
- self.docker_container = action_details["actionDetails"]["containerId"]
1856
- cmd = "docker restart " + self.docker_container
1857
- self.start(cmd, "streaming_gateway")
1858
- return
1859
-
1860
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1989
+ existing_container_id = action_details["actionDetails"].get("containerId")
1990
+ if existing_container_id:
1991
+ # Check if container actually exists before trying to restart
1992
+ if ActionInstance.container_exists(existing_container_id):
1993
+ logging.info(
1994
+ "Using existing container ID for streaming gateway: %s",
1995
+ existing_container_id,
1996
+ )
1997
+ self.docker_container = existing_container_id
1998
+ cmd = "docker restart " + self.docker_container
1999
+ self.start(cmd, "streaming_gateway")
2000
+ return
2001
+ else:
2002
+ logging.warning(
2003
+ "Container %s not found. Creating new container.",
2004
+ existing_container_id
2005
+ )
2006
+ # Fall through to create new container
2007
+
2008
+ container_name = f"streaming_gateway_{self.action_record_id}"
2009
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1861
2010
  logging.info("cmd is: %s", cmd)
1862
2011
  self.start(cmd, "streaming_gateway")
1863
2012
 
@@ -1951,16 +2100,24 @@ def kafka_setup_execute(self: ActionInstance):
1951
2100
  else:
1952
2101
  pkgs = f"matrice_common matrice"
1953
2102
 
1954
- if action_details["actionDetails"].get("containerId"):
1955
- logging.info(
1956
- "Using existing container ID for training: %s",
1957
- action_details["actionDetails"]["containerId"],
1958
- )
1959
- self.docker_container = action_details["actionDetails"]["containerId"]
1960
- cmd = "docker restart " + self.docker_container
1961
- self.start(cmd, "kafka_setup")
1962
- return
1963
-
2103
+ existing_container_id = action_details["actionDetails"].get("containerId")
2104
+ if existing_container_id:
2105
+ # Check if container actually exists before trying to restart
2106
+ if ActionInstance.container_exists(existing_container_id):
2107
+ logging.info(
2108
+ "Using existing container ID for kafka: %s",
2109
+ existing_container_id,
2110
+ )
2111
+ self.docker_container = existing_container_id
2112
+ cmd = "docker restart " + self.docker_container
2113
+ self.start(cmd, "kafka_setup")
2114
+ return
2115
+ else:
2116
+ logging.warning(
2117
+ "Container %s not found. Creating new container.",
2118
+ existing_container_id
2119
+ )
2120
+ # Fall through to create new container
1964
2121
 
1965
2122
  # Kafka container with --net=host (Ports: 9092, 9093)
1966
2123
  cmd = (
@@ -1997,21 +2154,31 @@ def inference_tracker_setup_execute(self: ActionInstance):
1997
2154
 
1998
2155
  self.setup_action_requirements(action_details)
1999
2156
 
2000
- if action_details["actionDetails"].get("containerId"):
2001
- logging.info(
2002
- "Using existing container ID for inference tracker: %s",
2003
- action_details["actionDetails"]["containerId"],
2004
- )
2005
- self.docker_container = action_details["actionDetails"]["containerId"]
2006
- cmd = "docker restart " + self.docker_container
2007
- self.start(cmd, "inference_tracker_setup")
2008
- return
2009
-
2157
+ existing_container_id = action_details["actionDetails"].get("containerId")
2158
+ if existing_container_id:
2159
+ # Check if container actually exists before trying to restart
2160
+ if ActionInstance.container_exists(existing_container_id):
2161
+ logging.info(
2162
+ "Using existing container ID for inference tracker: %s",
2163
+ existing_container_id,
2164
+ )
2165
+ self.docker_container = existing_container_id
2166
+ cmd = "docker restart " + self.docker_container
2167
+ self.start(cmd, "inference_tracker_setup")
2168
+ return
2169
+ else:
2170
+ logging.warning(
2171
+ "Container %s not found. Creating new container.",
2172
+ existing_container_id
2173
+ )
2174
+ # Fall through to create new container
2175
+
2010
2176
  # This is the existing Docker run command
2177
+ container_name = f"inference_tracker_{self.action_record_id}"
2011
2178
  worker_cmd = (
2012
2179
  f"docker run -d --pull=always --net=host "
2013
- f"--cidfile ./{self.action_record_id}.cid "
2014
- f"--name inference-tracker-worker "
2180
+ f"--cidfile ./{self.action_record_id}.cid "
2181
+ f"--name {container_name} "
2015
2182
  f"-v matrice_myvol:/matrice_data "
2016
2183
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2017
2184
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -224,7 +224,7 @@ class ActionsManager:
224
224
  action_ids
225
225
  )
226
226
  else:
227
- logging.debug("No actions currently running")
227
+ logging.info("No actions currently running")
228
228
 
229
229
  return self.current_actions
230
230
 
@@ -404,7 +404,7 @@ class InstanceManager:
404
404
  if self.container_kafka_producer:
405
405
  try:
406
406
  self.container_kafka_producer.send(topic_name, status_message)
407
- logging.debug("Container status monitor: Sent status for %d containers", len(containers))
407
+ logging.info("Container status monitor: Sent status for %d containers", len(containers))
408
408
  except Exception as e:
409
409
  logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
410
410
 
@@ -295,7 +295,7 @@ class Scaling:
295
295
  logging.warning(f"Kafka returned error for {api}, falling back to REST")
296
296
 
297
297
  # Kafka failed or disabled, try REST
298
- logging.info(f"Using REST API for {api}")
298
+ logging.debug(f"Using REST API for {api}")
299
299
  try:
300
300
  rest_response = rest_fallback_func()
301
301
 
@@ -185,7 +185,7 @@ class ShutdownManager:
185
185
  time.sleep(2)
186
186
  return True
187
187
  except Exception as e:
188
- logging.debug("Aggressive command failed: %s", str(e))
188
+ logging.info("Aggressive command failed: %s", str(e))
189
189
  except Exception as e:
190
190
  logging.error("Error in aggressive shutdown methods: %s", str(e))
191
191
  return False
@@ -271,7 +271,7 @@ class ShutdownManager:
271
271
  """
272
272
  # CRITICAL: Check if this is a reserved instance that should not be shut down
273
273
  # if self.reserved_instance:
274
- # logging.debug("Reserved instance detected, skipping shutdown check")
274
+ # logging.info("Reserved instance detected, skipping shutdown check")
275
275
  # return
276
276
 
277
277
  # Update idle time tracking