matrice-compute 0.1.31__tar.gz → 0.1.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/__init__.py +4 -0
  4. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/action_instance.py +354 -162
  5. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/actions_manager.py +6 -2
  6. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/instance_manager.py +1 -1
  7. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/scaling.py +1 -1
  8. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/shutdown_manager.py +2 -2
  9. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/LICENSE.txt +0 -0
  10. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/README.md +0 -0
  11. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/matrice_compute.egg-info/SOURCES.txt +0 -0
  12. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/matrice_compute.egg-info/dependency_links.txt +0 -0
  13. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/matrice_compute.egg-info/not-zip-safe +0 -0
  14. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/matrice_compute.egg-info/top_level.txt +0 -0
  15. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/pyproject.toml +0 -0
  16. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/setup.cfg +0 -0
  17. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/setup.py +0 -0
  18. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  19. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/compute_operations_handler.py +0 -0
  20. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/instance_utils.py +0 -0
  21. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/prechecks.py +0 -0
  22. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/py.typed +0 -0
  23. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/resources_tracker.py +0 -0
  24. {matrice_compute-0.1.31 → matrice_compute-0.1.33}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.31
3
+ Version: 0.1.33
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.31
3
+ Version: 0.1.33
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,7 @@
1
1
  """Module providing __init__ functionality."""
2
2
 
3
3
  import subprocess
4
+ import logging
4
5
 
5
6
  from matrice_common.utils import dependencies_check
6
7
 
@@ -17,4 +18,7 @@ subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection erro
17
18
 
18
19
  from matrice_compute.instance_manager import InstanceManager # noqa: E402
19
20
 
21
+ logging.getLogger("kafka").setLevel(logging.INFO)
22
+ logging.getLogger("confluent_kafka").setLevel(logging.INFO)
23
+
20
24
  __all__ = ["InstanceManager"]
@@ -296,7 +296,7 @@ class ActionInstance:
296
296
  getattr(self, "action_record_id", "unknown"),
297
297
  )
298
298
  else:
299
- logging.debug(
299
+ logging.info(
300
300
  "No additional logs to send for action %s",
301
301
  getattr(self, "action_record_id", "unknown"),
302
302
  )
@@ -411,6 +411,7 @@ class ActionInstance:
411
411
  destination_workspace_path: str = "/usr/src/workspace",
412
412
  docker_workdir: str = "",
413
413
  extra_pkgs: list = [],
414
+ container_name: str = "",
414
415
  ):
415
416
  """Build base Docker command with common options.
416
417
 
@@ -425,6 +426,7 @@ class ActionInstance:
425
426
  destination_workspace_path (str): Container workspace path
426
427
  docker_workdir (str): Docker working directory
427
428
  extra_pkgs (list): List of extra packages to install
429
+ container_name (str): Docker container name (format: {action_type}_{action_id})
428
430
  Returns:
429
431
  str: Base Docker command
430
432
  """
@@ -489,8 +491,12 @@ class ActionInstance:
489
491
  ]
490
492
  )
491
493
 
494
+ # Build container name option if provided
495
+ name_option = f"--name {container_name}" if container_name else ""
496
+
492
497
  cmd_parts = [
493
498
  f"docker run -d {use_gpu} ",
499
+ name_option,
494
500
  network_config,
495
501
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
502
  *volumes,
@@ -882,6 +888,34 @@ class ActionInstance:
882
888
  job_params=action_details["jobParams"],
883
889
  )
884
890
 
891
+ @staticmethod
892
+ def container_exists(container_id: str) -> bool:
893
+ """Check if a Docker container exists.
894
+
895
+ Args:
896
+ container_id (str): Container ID or name to check
897
+
898
+ Returns:
899
+ bool: True if container exists, False otherwise
900
+ """
901
+ if not container_id:
902
+ return False
903
+ try:
904
+ result = subprocess.run(
905
+ ["docker", "inspect", container_id],
906
+ capture_output=True,
907
+ text=True,
908
+ timeout=10
909
+ )
910
+ return result.returncode == 0
911
+ except Exception as e:
912
+ logging.warning(
913
+ "Error checking if container %s exists: %s",
914
+ container_id,
915
+ str(e)
916
+ )
917
+ return False
918
+
885
919
  @log_errors(raise_exception=True)
886
920
  def start_process(self, cmd, log_name):
887
921
  """Start the process and initialize logging.
@@ -905,7 +939,16 @@ class ActionInstance:
905
939
  env={**os.environ},
906
940
  )
907
941
 
908
- stdout, stderr = process.communicate(timeout=120)
942
+ # Use a longer timeout for docker run since --pull=always may need to
943
+ # download large images on first run. Default: 30 minutes (1800 seconds)
944
+ # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
945
+ docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
946
+ logging.info(
947
+ "Waiting for docker container to start for action %s (timeout: %d seconds)",
948
+ self.action_record_id,
949
+ docker_start_timeout,
950
+ )
951
+ stdout, stderr = process.communicate(timeout=docker_start_timeout)
909
952
 
910
953
  if process.returncode != 0:
911
954
  logging.error(
@@ -1095,7 +1138,8 @@ def data_preparation_execute(
1095
1138
  "Started pulling Docker image with PID: %s",
1096
1139
  process.pid,
1097
1140
  )
1098
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1141
+ container_name = f"data_prep_{self.action_record_id}"
1142
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1099
1143
  logging.info("cmd is: %s", cmd)
1100
1144
  self.start(cmd, "data_preparation_log")
1101
1145
 
@@ -1124,7 +1168,8 @@ def data_processing_execute(self: ActionInstance):
1124
1168
  service="bg-job-scheduler",
1125
1169
  job_params=action["jobParams"],
1126
1170
  )
1127
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1171
+ container_name = f"data_processing_{self.action_record_id}"
1172
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1128
1173
  logging.info("cmd: %s", cmd)
1129
1174
  self.start(cmd, "data_processing_log")
1130
1175
 
@@ -1137,7 +1182,8 @@ def data_split_execute(self: ActionInstance):
1137
1182
  if not action_details:
1138
1183
  return
1139
1184
  self.setup_action_requirements(action_details, work_fs, model_family="")
1140
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1185
+ container_name = f"data_split_{self.action_record_id}"
1186
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1141
1187
  logging.info("cmd: %s", cmd)
1142
1188
  self.start(cmd, "data_split")
1143
1189
 
@@ -1152,7 +1198,8 @@ def dataset_annotation_execute(
1152
1198
  if not action_details:
1153
1199
  return
1154
1200
  self.setup_action_requirements(action_details, work_fs)
1155
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1201
+ container_name = f"dataset_annotation_{self.action_record_id}"
1202
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1156
1203
  logging.info("cmd: %s", cmd)
1157
1204
  self.start(cmd, "dataset_annotation")
1158
1205
 
@@ -1167,7 +1214,8 @@ def dataset_augmentation_execute(
1167
1214
  if not action_details:
1168
1215
  return
1169
1216
  self.setup_action_requirements(action_details, work_fs)
1170
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1217
+ container_name = f"dataset_augmentation_{self.action_record_id}"
1218
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1171
1219
  logging.info("cmd: %s", cmd)
1172
1220
  self.start(cmd, "dataset_augmentation")
1173
1221
 
@@ -1183,7 +1231,8 @@ def augmentation_server_creation_execute(
1183
1231
  if not action_details:
1184
1232
  return
1185
1233
  self.setup_action_requirements(action_details, work_fs)
1186
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1234
+ container_name = f"augmentation_setup_{self.action_record_id}"
1235
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1187
1236
  logging.info("cmd: %s", cmd)
1188
1237
  self.start(cmd, "augmentation_setup")
1189
1238
 
@@ -1204,25 +1253,41 @@ def database_setup_execute(self: ActionInstance):
1204
1253
 
1205
1254
  project_id = action_details["_idProject"]
1206
1255
 
1207
- if action_details["actionDetails"].get("containerId"):
1208
- logging.info(
1209
- "Using existing container ID for inference tracker: %s",
1210
- action_details["actionDetails"]["containerId"],
1211
- )
1212
- self.docker_container = action_details["actionDetails"]["containerId"]
1213
- cmd = "docker restart " + self.docker_container
1214
- self.start(cmd, "qdrant_setup")
1256
+ # Define container names with action_record_id for uniqueness
1257
+ mongodb_container_name = f"database_setup_{self.action_record_id}"
1258
+ qdrant_container_name = f"qdrant_{self.action_record_id}"
1215
1259
 
1216
- #qdrant restart
1217
- qdrant_cmd = "docker restart qdrant"
1218
- self.start(qdrant_cmd, 'qdrant_setup')
1260
+ existing_container_id = action_details["actionDetails"].get("containerId")
1261
+ if existing_container_id:
1262
+ # Check if both containers actually exist before trying to restart
1263
+ mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
1264
+ qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
1219
1265
 
1220
- return
1266
+ if mongodb_container_exists and qdrant_container_exists:
1267
+ logging.info(
1268
+ "Using existing container ID for database setup: %s",
1269
+ existing_container_id,
1270
+ )
1271
+ self.docker_container = existing_container_id
1272
+ cmd = "docker restart " + self.docker_container
1273
+ self.start(cmd, "qdrant_setup")
1274
+
1275
+ # qdrant restart
1276
+ qdrant_cmd = f"docker restart {qdrant_container_name}"
1277
+ self.start(qdrant_cmd, "qdrant_setup")
1278
+ return
1279
+ else:
1280
+ logging.warning(
1281
+ "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
1282
+ mongodb_container_exists,
1283
+ qdrant_container_exists
1284
+ )
1285
+ # Fall through to create new containers
1221
1286
 
1222
1287
  # MongoDB container with --net=host (Port: 27020:27017)
1223
1288
  cmd = (
1224
1289
  f"docker run --pull=always --net=host "
1225
- f"--name mongodbdatabase "
1290
+ f"--name {mongodb_container_name} "
1226
1291
  f"-v matrice_myvol:/matrice_data "
1227
1292
  f"--cidfile ./{self.action_record_id}.cid "
1228
1293
  f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1237,7 +1302,7 @@ def database_setup_execute(self: ActionInstance):
1237
1302
  # Qdrant container with --net=host (Port: 6334)
1238
1303
  qdrant_cmd = (
1239
1304
  f"docker run --pull=always --net=host "
1240
- f"--name qdrant "
1305
+ f"--name {qdrant_container_name} "
1241
1306
  f"-v matrice_myvol:/matrice_data "
1242
1307
  f"{'qdrant/qdrant:latest'} "
1243
1308
  )
@@ -1263,23 +1328,32 @@ def facial_recognition_setup_execute(self: ActionInstance):
1263
1328
 
1264
1329
  self.setup_action_requirements(action_details)
1265
1330
 
1266
- if action_details["actionDetails"].get("containerId"):
1267
- logging.info(
1268
- "Using existing container ID for facial recognition worker: %s",
1269
- action_details["actionDetails"]["containerId"],
1270
- )
1271
- self.docker_container = action_details["actionDetails"]["containerId"]
1272
- cmd = "docker restart " + self.docker_container
1273
- self.start(cmd, "facial_recognition_setup")
1274
- return
1331
+ existing_container_id = action_details["actionDetails"].get("containerId")
1332
+ if existing_container_id:
1333
+ # Check if container actually exists before trying to restart
1334
+ if ActionInstance.container_exists(existing_container_id):
1335
+ logging.info(
1336
+ "Using existing container ID for facial recognition worker: %s",
1337
+ existing_container_id,
1338
+ )
1339
+ self.docker_container = existing_container_id
1340
+ cmd = "docker restart " + self.docker_container
1341
+ self.start(cmd, "facial_recognition_setup")
1342
+ return
1343
+ else:
1344
+ logging.warning(
1345
+ "Container %s not found. Creating new container.",
1346
+ existing_container_id
1347
+ )
1348
+ # Fall through to create new container
1275
1349
 
1276
1350
  # Facial recognition worker container with --net=host (Port: 8081)
1351
+ container_name = f"facial_recognition_{self.action_record_id}"
1277
1352
  worker_cmd = (
1278
1353
  f"docker run -d --pull=always --net=host "
1279
- f"--name worker "
1280
- f"--cidfile ./{self.action_record_id}.cid "
1281
- f"-v matrice_myvol:/matrice_data "
1354
+ f"--name {container_name} "
1282
1355
  f"--cidfile ./{self.action_record_id}.cid "
1356
+ f"-v matrice_myvol:/matrice_data "
1283
1357
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1284
1358
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1285
1359
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1305,20 +1379,30 @@ def lpr_setup_execute(self: ActionInstance):
1305
1379
 
1306
1380
  self.setup_action_requirements(action_details)
1307
1381
 
1308
- if action_details["actionDetails"].get("containerId"):
1309
- logging.info(
1310
- "Using existing container ID for LPR worker: %s",
1311
- action_details["actionDetails"]["containerId"],
1312
- )
1313
- self.docker_container = action_details["actionDetails"]["containerId"]
1314
- cmd = "docker restart " + self.docker_container
1315
- self.start(cmd, "lpr_setup")
1316
- return
1382
+ existing_container_id = action_details["actionDetails"].get("containerId")
1383
+ if existing_container_id:
1384
+ # Check if container actually exists before trying to restart
1385
+ if ActionInstance.container_exists(existing_container_id):
1386
+ logging.info(
1387
+ "Using existing container ID for LPR worker: %s",
1388
+ existing_container_id,
1389
+ )
1390
+ self.docker_container = existing_container_id
1391
+ cmd = "docker restart " + self.docker_container
1392
+ self.start(cmd, "lpr_setup")
1393
+ return
1394
+ else:
1395
+ logging.warning(
1396
+ "Container %s not found. Creating new container.",
1397
+ existing_container_id
1398
+ )
1399
+ # Fall through to create new container
1317
1400
 
1318
1401
  # LPR worker container with --net=host (Port: 8082)
1402
+ container_name = f"lpr_{self.action_record_id}"
1319
1403
  worker_cmd = (
1320
1404
  f"docker run -d --net=host --pull=always "
1321
- f"--name lpr-worker "
1405
+ f"--name {container_name} "
1322
1406
  f"--cidfile ./{self.action_record_id}.cid "
1323
1407
  f"-v matrice_myvol:/matrice_data "
1324
1408
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1356,20 +1440,30 @@ def inference_ws_server_execute(self: ActionInstance):
1356
1440
 
1357
1441
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1358
1442
 
1359
- if action_details["actionDetails"].get("containerId"):
1360
- logging.info(
1361
- "Using existing container ID for inference WebSocket server: %s",
1362
- action_details["actionDetails"]["containerId"],
1363
- )
1364
- self.docker_container = action_details["actionDetails"]["containerId"]
1365
- cmd = "docker restart " + self.docker_container
1366
- self.start(cmd, "inference_ws_server")
1367
- return
1443
+ existing_container_id = action_details["actionDetails"].get("containerId")
1444
+ if existing_container_id:
1445
+ # Check if container actually exists before trying to restart
1446
+ if ActionInstance.container_exists(existing_container_id):
1447
+ logging.info(
1448
+ "Using existing container ID for inference WebSocket server: %s",
1449
+ existing_container_id,
1450
+ )
1451
+ self.docker_container = existing_container_id
1452
+ cmd = "docker restart " + self.docker_container
1453
+ self.start(cmd, "inference_ws_server")
1454
+ return
1455
+ else:
1456
+ logging.warning(
1457
+ "Container %s not found. Creating new container.",
1458
+ existing_container_id
1459
+ )
1460
+ # Fall through to create new container
1368
1461
 
1369
1462
  # Inference WebSocket server with --net=host (Port: 8102)
1463
+ container_name = f"inference_ws_{self.action_record_id}"
1370
1464
  worker_cmd = (
1371
1465
  f"docker run -d --pull=always --net=host "
1372
- f"--name inference "
1466
+ f"--name {container_name} "
1373
1467
  f"--cidfile ./{self.action_record_id}.cid "
1374
1468
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1375
1469
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1404,20 +1498,30 @@ def fe_fs_streaming_execute(self: ActionInstance):
1404
1498
 
1405
1499
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1406
1500
 
1407
- if action_details["actionDetails"].get("containerId"):
1408
- logging.info(
1409
- "Using existing container ID for frontend streaming: %s",
1410
- action_details["actionDetails"]["containerId"],
1411
- )
1412
- self.docker_container = action_details["actionDetails"]["containerId"]
1413
- cmd = "docker restart " + self.docker_container
1414
- self.start(cmd, "fe_fs_streaming")
1415
- return
1416
-
1501
+ existing_container_id = action_details["actionDetails"].get("containerId")
1502
+ if existing_container_id:
1503
+ # Check if container actually exists before trying to restart
1504
+ if ActionInstance.container_exists(existing_container_id):
1505
+ logging.info(
1506
+ "Using existing container ID for frontend streaming: %s",
1507
+ existing_container_id,
1508
+ )
1509
+ self.docker_container = existing_container_id
1510
+ cmd = "docker restart " + self.docker_container
1511
+ self.start(cmd, "fe_fs_streaming")
1512
+ return
1513
+ else:
1514
+ logging.warning(
1515
+ "Container %s not found. Creating new container.",
1516
+ existing_container_id
1517
+ )
1518
+ # Fall through to create new container
1519
+
1417
1520
  # Frontend streaming with --net=host (Port: 3000)
1521
+ container_name = f"fe_streaming_{self.action_record_id}"
1418
1522
  worker_cmd = (
1419
1523
  f"docker run -d --pull=always --net=host "
1420
- f"--name fe_streaming "
1524
+ f"--name {container_name} "
1421
1525
  f"--cidfile ./{self.action_record_id}.cid "
1422
1526
  f"-v matrice_myvol:/matrice_data "
1423
1527
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1449,20 +1553,30 @@ def fe_analytics_service_execute(self: ActionInstance):
1449
1553
 
1450
1554
  project_id = action_details["_idProject"]
1451
1555
 
1452
- if action_details["actionDetails"].get("containerId"):
1453
- logging.info(
1454
- "Using existing container ID for frontend analytics service: %s",
1455
- action_details["actionDetails"]["containerId"],
1456
- )
1457
- self.docker_container = action_details["actionDetails"]["containerId"]
1458
- cmd = "docker restart " + self.docker_container
1459
- self.start(cmd, "fe_analytics_service")
1460
- return
1461
-
1556
+ existing_container_id = action_details["actionDetails"].get("containerId")
1557
+ if existing_container_id:
1558
+ # Check if container actually exists before trying to restart
1559
+ if ActionInstance.container_exists(existing_container_id):
1560
+ logging.info(
1561
+ "Using existing container ID for frontend analytics service: %s",
1562
+ existing_container_id,
1563
+ )
1564
+ self.docker_container = existing_container_id
1565
+ cmd = "docker restart " + self.docker_container
1566
+ self.start(cmd, "fe_analytics_service")
1567
+ return
1568
+ else:
1569
+ logging.warning(
1570
+ "Container %s not found. Creating new container.",
1571
+ existing_container_id
1572
+ )
1573
+ # Fall through to create new container
1574
+
1462
1575
  # Frontend analytics service with --net=host (Port: 3001)
1576
+ container_name = f"fe_analytics_{self.action_record_id}"
1463
1577
  worker_cmd = (
1464
1578
  f"docker run -d --pull=always --net=host "
1465
- f"--name fe-analytics "
1579
+ f"--name {container_name} "
1466
1580
  f"--cidfile ./{self.action_record_id}.cid "
1467
1581
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1468
1582
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1494,7 +1608,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1494
1608
  else:
1495
1609
  return
1496
1610
  use_gpu = self.get_gpu_config(action_details)
1497
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1611
+ container_name = f"dataset_generation_{self.action_record_id}"
1612
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1498
1613
  logging.info("cmd is: %s", cmd)
1499
1614
  self.start(cmd, "dataset_generation")
1500
1615
 
@@ -1515,7 +1630,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1515
1630
  else:
1516
1631
  return
1517
1632
  use_gpu = self.get_gpu_config(action_details)
1518
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1633
+ container_name = f"synthetic_data_setup_{self.action_record_id}"
1634
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1519
1635
  logging.info("cmd is: %s", cmd)
1520
1636
  self.start(cmd, "synthetic_data_setup")
1521
1637
 
@@ -1552,26 +1668,40 @@ def redis_setup_execute(self: ActionInstance):
1552
1668
 
1553
1669
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1554
1670
 
1671
+ # Define container names with action_record_id for uniqueness
1672
+ redis_container_name = f"redis_{self.action_record_id}"
1555
1673
 
1556
- if action_details["actionDetails"].get("containerId"):
1557
- logging.info(
1558
- "Using existing container ID for redis management: %s",
1559
- action_details["actionDetails"]["containerId"],
1560
- )
1561
- self.docker_container = action_details["actionDetails"]["containerId"]
1562
- cmd = "docker restart " + self.docker_container
1563
- self.start(cmd, "redis_setup")
1674
+ existing_container_id = action_details["actionDetails"].get("containerId")
1675
+ if existing_container_id:
1676
+ # Check if both containers actually exist before trying to restart
1677
+ management_container_exists = ActionInstance.container_exists(existing_container_id)
1678
+ redis_container_exists = ActionInstance.container_exists(redis_container_name)
1564
1679
 
1565
- # Redis container restart
1566
- redis_restart_cmd = "docker restart redis_container"
1567
- self.start(redis_restart_cmd, "redis")
1680
+ if management_container_exists and redis_container_exists:
1681
+ logging.info(
1682
+ "Using existing container ID for redis management: %s",
1683
+ existing_container_id,
1684
+ )
1685
+ self.docker_container = existing_container_id
1686
+ cmd = "docker restart " + self.docker_container
1687
+ self.start(cmd, "redis_setup")
1688
+
1689
+ # Redis container restart
1690
+ redis_restart_cmd = f"docker restart {redis_container_name}"
1691
+ self.start(redis_restart_cmd, "redis")
1692
+ return
1693
+ else:
1694
+ logging.warning(
1695
+ "Container(s) not found (management=%s, redis=%s). Creating new containers.",
1696
+ management_container_exists,
1697
+ redis_container_exists
1698
+ )
1699
+ # Fall through to create new containers
1568
1700
 
1569
- return
1570
-
1571
1701
  # Redis container with --net=host (Port: 6379)
1572
1702
  redis_cmd = (
1573
1703
  f"docker run -d --net=host "
1574
- f"--name redis_container "
1704
+ f"--name {redis_container_name} "
1575
1705
  f"--restart unless-stopped "
1576
1706
  f"{redis_image} "
1577
1707
  f"redis-server --bind 0.0.0.0 "
@@ -1641,7 +1771,8 @@ def deploy_aggregator_execute(
1641
1771
  if not action_details:
1642
1772
  return
1643
1773
  self.setup_action_requirements(action_details, work_fs)
1644
- cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1774
+ container_name = f"deploy_aggregator_{self.action_record_id}"
1775
+ cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1645
1776
  logging.info("cmd: %s", cmd)
1646
1777
  self.start(cmd, "deploy_aggregator")
1647
1778
 
@@ -1689,7 +1820,8 @@ def model_deploy_execute(self: ActionInstance):
1689
1820
  "TRITON_PORTS": triton_ports
1690
1821
  }
1691
1822
 
1692
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1823
+ container_name = f"model_deploy_{self.action_record_id}"
1824
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1693
1825
  logging.info("cmd is: %s", cmd)
1694
1826
  self.start(cmd, "deploy_log")
1695
1827
 
@@ -1712,17 +1844,27 @@ def model_train_execute(self: ActionInstance):
1712
1844
  action_id=action_id,
1713
1845
  )
1714
1846
 
1715
- if action_details["actionDetails"].get("containerId"):
1716
- logging.info(
1717
- "Using existing container ID for training: %s",
1718
- action_details["actionDetails"]["containerId"],
1719
- )
1720
- self.docker_container = action_details["actionDetails"]["containerId"]
1721
- cmd = "docker restart " + self.docker_container
1722
- self.start(cmd, "train_log")
1723
- return
1724
-
1725
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1847
+ existing_container_id = action_details["actionDetails"].get("containerId")
1848
+ if existing_container_id:
1849
+ # Check if container actually exists before trying to restart
1850
+ if ActionInstance.container_exists(existing_container_id):
1851
+ logging.info(
1852
+ "Using existing container ID for training: %s",
1853
+ existing_container_id,
1854
+ )
1855
+ self.docker_container = existing_container_id
1856
+ cmd = "docker restart " + self.docker_container
1857
+ self.start(cmd, "train_log")
1858
+ return
1859
+ else:
1860
+ logging.warning(
1861
+ "Container %s not found. Creating new container.",
1862
+ existing_container_id
1863
+ )
1864
+ # Fall through to create new container
1865
+
1866
+ container_name = f"model_train_{self.action_record_id}"
1867
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1726
1868
  logging.info("cmd is: %s", cmd)
1727
1869
  self.start(cmd, "train_log")
1728
1870
 
@@ -1743,17 +1885,27 @@ def model_eval_execute(self: ActionInstance):
1743
1885
  model_family=model_family,
1744
1886
  action_id=action_id,
1745
1887
  )
1746
- if action_details["actionDetails"].get("containerId"):
1747
- logging.info(
1748
- "Using existing container ID for training: %s",
1749
- action_details["actionDetails"]["containerId"],
1750
- )
1751
- self.docker_container = action_details["actionDetails"]["containerId"]
1752
- cmd = "docker restart " + self.docker_container
1753
- self.start(cmd, "eval_log")
1754
- return
1755
-
1756
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1888
+ existing_container_id = action_details["actionDetails"].get("containerId")
1889
+ if existing_container_id:
1890
+ # Check if container actually exists before trying to restart
1891
+ if ActionInstance.container_exists(existing_container_id):
1892
+ logging.info(
1893
+ "Using existing container ID for evaluation: %s",
1894
+ existing_container_id,
1895
+ )
1896
+ self.docker_container = existing_container_id
1897
+ cmd = "docker restart " + self.docker_container
1898
+ self.start(cmd, "eval_log")
1899
+ return
1900
+ else:
1901
+ logging.warning(
1902
+ "Container %s not found. Creating new container.",
1903
+ existing_container_id
1904
+ )
1905
+ # Fall through to create new container
1906
+
1907
+ container_name = f"model_eval_{self.action_record_id}"
1908
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1757
1909
  logging.info("cmd is: %s", cmd)
1758
1910
  self.start(cmd, "eval_log")
1759
1911
 
@@ -1777,17 +1929,27 @@ def model_export_execute(self: ActionInstance):
1777
1929
  model_family=model_family,
1778
1930
  action_id=action_id,
1779
1931
  )
1780
- if action_details["actionDetails"].get("containerId"):
1781
- logging.info(
1782
- "Using existing container ID for training: %s",
1783
- action_details["actionDetails"]["containerId"],
1784
- )
1785
- self.docker_container = action_details["actionDetails"]["containerId"]
1786
- cmd = "docker restart " + self.docker_container
1787
- self.start(cmd, "export_log")
1788
- return
1789
-
1790
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1932
+ existing_container_id = action_details["actionDetails"].get("containerId")
1933
+ if existing_container_id:
1934
+ # Check if container actually exists before trying to restart
1935
+ if ActionInstance.container_exists(existing_container_id):
1936
+ logging.info(
1937
+ "Using existing container ID for export: %s",
1938
+ existing_container_id,
1939
+ )
1940
+ self.docker_container = existing_container_id
1941
+ cmd = "docker restart " + self.docker_container
1942
+ self.start(cmd, "export_log")
1943
+ return
1944
+ else:
1945
+ logging.warning(
1946
+ "Container %s not found. Creating new container.",
1947
+ existing_container_id
1948
+ )
1949
+ # Fall through to create new container
1950
+
1951
+ container_name = f"model_export_{self.action_record_id}"
1952
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1791
1953
  logging.info("cmd is: %s", cmd)
1792
1954
  self.start(cmd, "export_log")
1793
1955
 
@@ -1803,7 +1965,8 @@ def image_build_execute(self: ActionInstance):
1803
1965
  action_id = action_details["_id"]
1804
1966
  internal_api_key = self.get_internal_api_key(action_id)
1805
1967
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1806
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1968
+ container_name = f"image_build_{self.action_record_id}"
1969
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1807
1970
  logging.info("cmd is: %s", cmd)
1808
1971
  self.start(cmd, "image_build_log")
1809
1972
 
@@ -1815,7 +1978,8 @@ def resource_clone_execute(self: ActionInstance):
1815
1978
  if not action_details:
1816
1979
  return
1817
1980
  self.setup_action_requirements(action_details)
1818
- cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1981
+ container_name = f"resource_clone_{self.action_record_id}"
1982
+ cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1819
1983
  logging.info("cmd is: %s", cmd)
1820
1984
  self.start(cmd, "resource_clone")
1821
1985
 
@@ -1831,17 +1995,27 @@ def streaming_gateway_execute(self: ActionInstance):
1831
1995
  self.docker_container = (
1832
1996
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1833
1997
  )
1834
- if action_details["actionDetails"].get("containerId"):
1835
- logging.info(
1836
- "Using existing container ID for training: %s",
1837
- action_details["actionDetails"]["containerId"],
1838
- )
1839
- self.docker_container = action_details["actionDetails"]["containerId"]
1840
- cmd = "docker restart " + self.docker_container
1841
- self.start(cmd, "streaming_gateway")
1842
- return
1843
-
1844
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1998
+ existing_container_id = action_details["actionDetails"].get("containerId")
1999
+ if existing_container_id:
2000
+ # Check if container actually exists before trying to restart
2001
+ if ActionInstance.container_exists(existing_container_id):
2002
+ logging.info(
2003
+ "Using existing container ID for streaming gateway: %s",
2004
+ existing_container_id,
2005
+ )
2006
+ self.docker_container = existing_container_id
2007
+ cmd = "docker restart " + self.docker_container
2008
+ self.start(cmd, "streaming_gateway")
2009
+ return
2010
+ else:
2011
+ logging.warning(
2012
+ "Container %s not found. Creating new container.",
2013
+ existing_container_id
2014
+ )
2015
+ # Fall through to create new container
2016
+
2017
+ container_name = f"streaming_gateway_{self.action_record_id}"
2018
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1845
2019
  logging.info("cmd is: %s", cmd)
1846
2020
  self.start(cmd, "streaming_gateway")
1847
2021
 
@@ -1935,16 +2109,24 @@ def kafka_setup_execute(self: ActionInstance):
1935
2109
  else:
1936
2110
  pkgs = f"matrice_common matrice"
1937
2111
 
1938
- if action_details["actionDetails"].get("containerId"):
1939
- logging.info(
1940
- "Using existing container ID for training: %s",
1941
- action_details["actionDetails"]["containerId"],
1942
- )
1943
- self.docker_container = action_details["actionDetails"]["containerId"]
1944
- cmd = "docker restart " + self.docker_container
1945
- self.start(cmd, "kafka_setup")
1946
- return
1947
-
2112
+ existing_container_id = action_details["actionDetails"].get("containerId")
2113
+ if existing_container_id:
2114
+ # Check if container actually exists before trying to restart
2115
+ if ActionInstance.container_exists(existing_container_id):
2116
+ logging.info(
2117
+ "Using existing container ID for kafka: %s",
2118
+ existing_container_id,
2119
+ )
2120
+ self.docker_container = existing_container_id
2121
+ cmd = "docker restart " + self.docker_container
2122
+ self.start(cmd, "kafka_setup")
2123
+ return
2124
+ else:
2125
+ logging.warning(
2126
+ "Container %s not found. Creating new container.",
2127
+ existing_container_id
2128
+ )
2129
+ # Fall through to create new container
1948
2130
 
1949
2131
  # Kafka container with --net=host (Ports: 9092, 9093)
1950
2132
  cmd = (
@@ -1981,21 +2163,31 @@ def inference_tracker_setup_execute(self: ActionInstance):
1981
2163
 
1982
2164
  self.setup_action_requirements(action_details)
1983
2165
 
1984
- if action_details["actionDetails"].get("containerId"):
1985
- logging.info(
1986
- "Using existing container ID for inference tracker: %s",
1987
- action_details["actionDetails"]["containerId"],
1988
- )
1989
- self.docker_container = action_details["actionDetails"]["containerId"]
1990
- cmd = "docker restart " + self.docker_container
1991
- self.start(cmd, "inference_tracker_setup")
1992
- return
1993
-
2166
+ existing_container_id = action_details["actionDetails"].get("containerId")
2167
+ if existing_container_id:
2168
+ # Check if container actually exists before trying to restart
2169
+ if ActionInstance.container_exists(existing_container_id):
2170
+ logging.info(
2171
+ "Using existing container ID for inference tracker: %s",
2172
+ existing_container_id,
2173
+ )
2174
+ self.docker_container = existing_container_id
2175
+ cmd = "docker restart " + self.docker_container
2176
+ self.start(cmd, "inference_tracker_setup")
2177
+ return
2178
+ else:
2179
+ logging.warning(
2180
+ "Container %s not found. Creating new container.",
2181
+ existing_container_id
2182
+ )
2183
+ # Fall through to create new container
2184
+
1994
2185
  # This is the existing Docker run command
2186
+ container_name = f"inference_tracker_{self.action_record_id}"
1995
2187
  worker_cmd = (
1996
2188
  f"docker run -d --pull=always --net=host "
1997
- f"--cidfile ./{self.action_record_id}.cid "
1998
- f"--name inference-tracker-worker "
2189
+ f"--cidfile ./{self.action_record_id}.cid "
2190
+ f"--name {container_name} "
1999
2191
  f"-v matrice_myvol:/matrice_data "
2000
2192
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2001
2193
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -43,7 +43,11 @@ class ActionsManager:
43
43
  """
44
44
  actions = []
45
45
  logging.info("Polling backend for new jobs")
46
- fetched_actions, error, _ = self.scaling.assign_jobs(has_gpu())
46
+ result = self.scaling.assign_jobs(has_gpu())
47
+ if result is None:
48
+ logging.error("assign_jobs returned None")
49
+ return actions
50
+ fetched_actions, error, _ = result
47
51
  if error:
48
52
  logging.error("Error assigning jobs: %s", error)
49
53
  return actions
@@ -224,7 +228,7 @@ class ActionsManager:
224
228
  action_ids
225
229
  )
226
230
  else:
227
- logging.debug("No actions currently running")
231
+ logging.info("No actions currently running")
228
232
 
229
233
  return self.current_actions
230
234
 
@@ -404,7 +404,7 @@ class InstanceManager:
404
404
  if self.container_kafka_producer:
405
405
  try:
406
406
  self.container_kafka_producer.send(topic_name, status_message)
407
- logging.debug("Container status monitor: Sent status for %d containers", len(containers))
407
+ logging.info("Container status monitor: Sent status for %d containers", len(containers))
408
408
  except Exception as e:
409
409
  logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
410
410
 
@@ -295,7 +295,7 @@ class Scaling:
295
295
  logging.warning(f"Kafka returned error for {api}, falling back to REST")
296
296
 
297
297
  # Kafka failed or disabled, try REST
298
- logging.info(f"Using REST API for {api}")
298
+ logging.debug(f"Using REST API for {api}")
299
299
  try:
300
300
  rest_response = rest_fallback_func()
301
301
 
@@ -185,7 +185,7 @@ class ShutdownManager:
185
185
  time.sleep(2)
186
186
  return True
187
187
  except Exception as e:
188
- logging.debug("Aggressive command failed: %s", str(e))
188
+ logging.info("Aggressive command failed: %s", str(e))
189
189
  except Exception as e:
190
190
  logging.error("Error in aggressive shutdown methods: %s", str(e))
191
191
  return False
@@ -271,7 +271,7 @@ class ShutdownManager:
271
271
  """
272
272
  # CRITICAL: Check if this is a reserved instance that should not be shut down
273
273
  # if self.reserved_instance:
274
- # logging.debug("Reserved instance detected, skipping shutdown check")
274
+ # logging.info("Reserved instance detected, skipping shutdown check")
275
275
  # return
276
276
 
277
277
  # Update idle time tracking