matrice-compute 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  """Module providing __init__ functionality."""
2
2
 
3
3
  import subprocess
4
+ import logging
4
5
 
5
6
  from matrice_common.utils import dependencies_check
6
7
 
@@ -17,4 +18,7 @@ subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection erro
17
18
 
18
19
  from matrice_compute.instance_manager import InstanceManager # noqa: E402
19
20
 
21
+ logging.getLogger("kafka").setLevel(logging.INFO)
22
+ logging.getLogger("confluent_kafka").setLevel(logging.INFO)
23
+
20
24
  __all__ = ["InstanceManager"]
@@ -296,7 +296,7 @@ class ActionInstance:
296
296
  getattr(self, "action_record_id", "unknown"),
297
297
  )
298
298
  else:
299
- logging.debug(
299
+ logging.info(
300
300
  "No additional logs to send for action %s",
301
301
  getattr(self, "action_record_id", "unknown"),
302
302
  )
@@ -411,6 +411,7 @@ class ActionInstance:
411
411
  destination_workspace_path: str = "/usr/src/workspace",
412
412
  docker_workdir: str = "",
413
413
  extra_pkgs: list = [],
414
+ container_name: str = "",
414
415
  ):
415
416
  """Build base Docker command with common options.
416
417
 
@@ -425,6 +426,7 @@ class ActionInstance:
425
426
  destination_workspace_path (str): Container workspace path
426
427
  docker_workdir (str): Docker working directory
427
428
  extra_pkgs (list): List of extra packages to install
429
+ container_name (str): Docker container name (format: {action_type}_{action_id})
428
430
  Returns:
429
431
  str: Base Docker command
430
432
  """
@@ -489,8 +491,12 @@ class ActionInstance:
489
491
  ]
490
492
  )
491
493
 
494
+ # Build container name option if provided
495
+ name_option = f"--name {container_name}" if container_name else ""
496
+
492
497
  cmd_parts = [
493
498
  f"docker run -d {use_gpu} ",
499
+ name_option,
494
500
  network_config,
495
501
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
502
  *volumes,
@@ -882,6 +888,34 @@ class ActionInstance:
882
888
  job_params=action_details["jobParams"],
883
889
  )
884
890
 
891
+ @staticmethod
892
+ def container_exists(container_id: str) -> bool:
893
+ """Check if a Docker container exists.
894
+
895
+ Args:
896
+ container_id (str): Container ID or name to check
897
+
898
+ Returns:
899
+ bool: True if container exists, False otherwise
900
+ """
901
+ if not container_id:
902
+ return False
903
+ try:
904
+ result = subprocess.run(
905
+ ["docker", "inspect", container_id],
906
+ capture_output=True,
907
+ text=True,
908
+ timeout=10
909
+ )
910
+ return result.returncode == 0
911
+ except Exception as e:
912
+ logging.warning(
913
+ "Error checking if container %s exists: %s",
914
+ container_id,
915
+ str(e)
916
+ )
917
+ return False
918
+
885
919
  @log_errors(raise_exception=True)
886
920
  def start_process(self, cmd, log_name):
887
921
  """Start the process and initialize logging.
@@ -1095,7 +1129,8 @@ def data_preparation_execute(
1095
1129
  "Started pulling Docker image with PID: %s",
1096
1130
  process.pid,
1097
1131
  )
1098
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1132
+ container_name = f"data_prep_{self.action_record_id}"
1133
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1099
1134
  logging.info("cmd is: %s", cmd)
1100
1135
  self.start(cmd, "data_preparation_log")
1101
1136
 
@@ -1124,7 +1159,8 @@ def data_processing_execute(self: ActionInstance):
1124
1159
  service="bg-job-scheduler",
1125
1160
  job_params=action["jobParams"],
1126
1161
  )
1127
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1162
+ container_name = f"data_processing_{self.action_record_id}"
1163
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1128
1164
  logging.info("cmd: %s", cmd)
1129
1165
  self.start(cmd, "data_processing_log")
1130
1166
 
@@ -1137,7 +1173,8 @@ def data_split_execute(self: ActionInstance):
1137
1173
  if not action_details:
1138
1174
  return
1139
1175
  self.setup_action_requirements(action_details, work_fs, model_family="")
1140
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1176
+ container_name = f"data_split_{self.action_record_id}"
1177
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1141
1178
  logging.info("cmd: %s", cmd)
1142
1179
  self.start(cmd, "data_split")
1143
1180
 
@@ -1152,7 +1189,8 @@ def dataset_annotation_execute(
1152
1189
  if not action_details:
1153
1190
  return
1154
1191
  self.setup_action_requirements(action_details, work_fs)
1155
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1192
+ container_name = f"dataset_annotation_{self.action_record_id}"
1193
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1156
1194
  logging.info("cmd: %s", cmd)
1157
1195
  self.start(cmd, "dataset_annotation")
1158
1196
 
@@ -1167,7 +1205,8 @@ def dataset_augmentation_execute(
1167
1205
  if not action_details:
1168
1206
  return
1169
1207
  self.setup_action_requirements(action_details, work_fs)
1170
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1208
+ container_name = f"dataset_augmentation_{self.action_record_id}"
1209
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1171
1210
  logging.info("cmd: %s", cmd)
1172
1211
  self.start(cmd, "dataset_augmentation")
1173
1212
 
@@ -1183,7 +1222,8 @@ def augmentation_server_creation_execute(
1183
1222
  if not action_details:
1184
1223
  return
1185
1224
  self.setup_action_requirements(action_details, work_fs)
1186
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1225
+ container_name = f"augmentation_setup_{self.action_record_id}"
1226
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1187
1227
  logging.info("cmd: %s", cmd)
1188
1228
  self.start(cmd, "augmentation_setup")
1189
1229
 
@@ -1204,25 +1244,41 @@ def database_setup_execute(self: ActionInstance):
1204
1244
 
1205
1245
  project_id = action_details["_idProject"]
1206
1246
 
1207
- if action_details["actionDetails"].get("containerId"):
1208
- logging.info(
1209
- "Using existing container ID for inference tracker: %s",
1210
- action_details["actionDetails"]["containerId"],
1211
- )
1212
- self.docker_container = action_details["actionDetails"]["containerId"]
1213
- cmd = "docker restart " + self.docker_container
1214
- self.start(cmd, "qdrant_setup")
1247
+ # Define container names with action_record_id for uniqueness
1248
+ mongodb_container_name = f"database_setup_{self.action_record_id}"
1249
+ qdrant_container_name = f"qdrant_{self.action_record_id}"
1215
1250
 
1216
- #qdrant restart
1217
- qdrant_cmd = "docker restart qdrant"
1218
- self.start(qdrant_cmd, 'qdrant_setup')
1251
+ existing_container_id = action_details["actionDetails"].get("containerId")
1252
+ if existing_container_id:
1253
+ # Check if both containers actually exist before trying to restart
1254
+ mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
1255
+ qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
1219
1256
 
1220
- return
1257
+ if mongodb_container_exists and qdrant_container_exists:
1258
+ logging.info(
1259
+ "Using existing container ID for database setup: %s",
1260
+ existing_container_id,
1261
+ )
1262
+ self.docker_container = existing_container_id
1263
+ cmd = "docker restart " + self.docker_container
1264
+ self.start(cmd, "qdrant_setup")
1265
+
1266
+ # qdrant restart
1267
+ qdrant_cmd = f"docker restart {qdrant_container_name}"
1268
+ self.start(qdrant_cmd, "qdrant_setup")
1269
+ return
1270
+ else:
1271
+ logging.warning(
1272
+ "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
1273
+ mongodb_container_exists,
1274
+ qdrant_container_exists
1275
+ )
1276
+ # Fall through to create new containers
1221
1277
 
1222
1278
  # MongoDB container with --net=host (Port: 27020:27017)
1223
1279
  cmd = (
1224
1280
  f"docker run --pull=always --net=host "
1225
- f"--name mongodbdatabase "
1281
+ f"--name {mongodb_container_name} "
1226
1282
  f"-v matrice_myvol:/matrice_data "
1227
1283
  f"--cidfile ./{self.action_record_id}.cid "
1228
1284
  f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1237,7 +1293,7 @@ def database_setup_execute(self: ActionInstance):
1237
1293
  # Qdrant container with --net=host (Port: 6334)
1238
1294
  qdrant_cmd = (
1239
1295
  f"docker run --pull=always --net=host "
1240
- f"--name qdrant "
1296
+ f"--name {qdrant_container_name} "
1241
1297
  f"-v matrice_myvol:/matrice_data "
1242
1298
  f"{'qdrant/qdrant:latest'} "
1243
1299
  )
@@ -1263,23 +1319,32 @@ def facial_recognition_setup_execute(self: ActionInstance):
1263
1319
 
1264
1320
  self.setup_action_requirements(action_details)
1265
1321
 
1266
- if action_details["actionDetails"].get("containerId"):
1267
- logging.info(
1268
- "Using existing container ID for facial recognition worker: %s",
1269
- action_details["actionDetails"]["containerId"],
1270
- )
1271
- self.docker_container = action_details["actionDetails"]["containerId"]
1272
- cmd = "docker restart " + self.docker_container
1273
- self.start(cmd, "facial_recognition_setup")
1274
- return
1322
+ existing_container_id = action_details["actionDetails"].get("containerId")
1323
+ if existing_container_id:
1324
+ # Check if container actually exists before trying to restart
1325
+ if ActionInstance.container_exists(existing_container_id):
1326
+ logging.info(
1327
+ "Using existing container ID for facial recognition worker: %s",
1328
+ existing_container_id,
1329
+ )
1330
+ self.docker_container = existing_container_id
1331
+ cmd = "docker restart " + self.docker_container
1332
+ self.start(cmd, "facial_recognition_setup")
1333
+ return
1334
+ else:
1335
+ logging.warning(
1336
+ "Container %s not found. Creating new container.",
1337
+ existing_container_id
1338
+ )
1339
+ # Fall through to create new container
1275
1340
 
1276
1341
  # Facial recognition worker container with --net=host (Port: 8081)
1342
+ container_name = f"facial_recognition_{self.action_record_id}"
1277
1343
  worker_cmd = (
1278
1344
  f"docker run -d --pull=always --net=host "
1279
- f"--name worker "
1280
- f"--cidfile ./{self.action_record_id}.cid "
1281
- f"-v matrice_myvol:/matrice_data "
1345
+ f"--name {container_name} "
1282
1346
  f"--cidfile ./{self.action_record_id}.cid "
1347
+ f"-v matrice_myvol:/matrice_data "
1283
1348
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1284
1349
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1285
1350
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1305,20 +1370,30 @@ def lpr_setup_execute(self: ActionInstance):
1305
1370
 
1306
1371
  self.setup_action_requirements(action_details)
1307
1372
 
1308
- if action_details["actionDetails"].get("containerId"):
1309
- logging.info(
1310
- "Using existing container ID for LPR worker: %s",
1311
- action_details["actionDetails"]["containerId"],
1312
- )
1313
- self.docker_container = action_details["actionDetails"]["containerId"]
1314
- cmd = "docker restart " + self.docker_container
1315
- self.start(cmd, "lpr_setup")
1316
- return
1373
+ existing_container_id = action_details["actionDetails"].get("containerId")
1374
+ if existing_container_id:
1375
+ # Check if container actually exists before trying to restart
1376
+ if ActionInstance.container_exists(existing_container_id):
1377
+ logging.info(
1378
+ "Using existing container ID for LPR worker: %s",
1379
+ existing_container_id,
1380
+ )
1381
+ self.docker_container = existing_container_id
1382
+ cmd = "docker restart " + self.docker_container
1383
+ self.start(cmd, "lpr_setup")
1384
+ return
1385
+ else:
1386
+ logging.warning(
1387
+ "Container %s not found. Creating new container.",
1388
+ existing_container_id
1389
+ )
1390
+ # Fall through to create new container
1317
1391
 
1318
1392
  # LPR worker container with --net=host (Port: 8082)
1393
+ container_name = f"lpr_{self.action_record_id}"
1319
1394
  worker_cmd = (
1320
1395
  f"docker run -d --net=host --pull=always "
1321
- f"--name lpr-worker "
1396
+ f"--name {container_name} "
1322
1397
  f"--cidfile ./{self.action_record_id}.cid "
1323
1398
  f"-v matrice_myvol:/matrice_data "
1324
1399
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1356,20 +1431,30 @@ def inference_ws_server_execute(self: ActionInstance):
1356
1431
 
1357
1432
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1358
1433
 
1359
- if action_details["actionDetails"].get("containerId"):
1360
- logging.info(
1361
- "Using existing container ID for inference WebSocket server: %s",
1362
- action_details["actionDetails"]["containerId"],
1363
- )
1364
- self.docker_container = action_details["actionDetails"]["containerId"]
1365
- cmd = "docker restart " + self.docker_container
1366
- self.start(cmd, "inference_ws_server")
1367
- return
1434
+ existing_container_id = action_details["actionDetails"].get("containerId")
1435
+ if existing_container_id:
1436
+ # Check if container actually exists before trying to restart
1437
+ if ActionInstance.container_exists(existing_container_id):
1438
+ logging.info(
1439
+ "Using existing container ID for inference WebSocket server: %s",
1440
+ existing_container_id,
1441
+ )
1442
+ self.docker_container = existing_container_id
1443
+ cmd = "docker restart " + self.docker_container
1444
+ self.start(cmd, "inference_ws_server")
1445
+ return
1446
+ else:
1447
+ logging.warning(
1448
+ "Container %s not found. Creating new container.",
1449
+ existing_container_id
1450
+ )
1451
+ # Fall through to create new container
1368
1452
 
1369
1453
  # Inference WebSocket server with --net=host (Port: 8102)
1454
+ container_name = f"inference_ws_{self.action_record_id}"
1370
1455
  worker_cmd = (
1371
1456
  f"docker run -d --pull=always --net=host "
1372
- f"--name inference "
1457
+ f"--name {container_name} "
1373
1458
  f"--cidfile ./{self.action_record_id}.cid "
1374
1459
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1375
1460
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1404,20 +1489,30 @@ def fe_fs_streaming_execute(self: ActionInstance):
1404
1489
 
1405
1490
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1406
1491
 
1407
- if action_details["actionDetails"].get("containerId"):
1408
- logging.info(
1409
- "Using existing container ID for frontend streaming: %s",
1410
- action_details["actionDetails"]["containerId"],
1411
- )
1412
- self.docker_container = action_details["actionDetails"]["containerId"]
1413
- cmd = "docker restart " + self.docker_container
1414
- self.start(cmd, "fe_fs_streaming")
1415
- return
1416
-
1492
+ existing_container_id = action_details["actionDetails"].get("containerId")
1493
+ if existing_container_id:
1494
+ # Check if container actually exists before trying to restart
1495
+ if ActionInstance.container_exists(existing_container_id):
1496
+ logging.info(
1497
+ "Using existing container ID for frontend streaming: %s",
1498
+ existing_container_id,
1499
+ )
1500
+ self.docker_container = existing_container_id
1501
+ cmd = "docker restart " + self.docker_container
1502
+ self.start(cmd, "fe_fs_streaming")
1503
+ return
1504
+ else:
1505
+ logging.warning(
1506
+ "Container %s not found. Creating new container.",
1507
+ existing_container_id
1508
+ )
1509
+ # Fall through to create new container
1510
+
1417
1511
  # Frontend streaming with --net=host (Port: 3000)
1512
+ container_name = f"fe_streaming_{self.action_record_id}"
1418
1513
  worker_cmd = (
1419
1514
  f"docker run -d --pull=always --net=host "
1420
- f"--name fe_streaming "
1515
+ f"--name {container_name} "
1421
1516
  f"--cidfile ./{self.action_record_id}.cid "
1422
1517
  f"-v matrice_myvol:/matrice_data "
1423
1518
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1449,20 +1544,30 @@ def fe_analytics_service_execute(self: ActionInstance):
1449
1544
 
1450
1545
  project_id = action_details["_idProject"]
1451
1546
 
1452
- if action_details["actionDetails"].get("containerId"):
1453
- logging.info(
1454
- "Using existing container ID for frontend analytics service: %s",
1455
- action_details["actionDetails"]["containerId"],
1456
- )
1457
- self.docker_container = action_details["actionDetails"]["containerId"]
1458
- cmd = "docker restart " + self.docker_container
1459
- self.start(cmd, "fe_analytics_service")
1460
- return
1461
-
1547
+ existing_container_id = action_details["actionDetails"].get("containerId")
1548
+ if existing_container_id:
1549
+ # Check if container actually exists before trying to restart
1550
+ if ActionInstance.container_exists(existing_container_id):
1551
+ logging.info(
1552
+ "Using existing container ID for frontend analytics service: %s",
1553
+ existing_container_id,
1554
+ )
1555
+ self.docker_container = existing_container_id
1556
+ cmd = "docker restart " + self.docker_container
1557
+ self.start(cmd, "fe_analytics_service")
1558
+ return
1559
+ else:
1560
+ logging.warning(
1561
+ "Container %s not found. Creating new container.",
1562
+ existing_container_id
1563
+ )
1564
+ # Fall through to create new container
1565
+
1462
1566
  # Frontend analytics service with --net=host (Port: 3001)
1567
+ container_name = f"fe_analytics_{self.action_record_id}"
1463
1568
  worker_cmd = (
1464
1569
  f"docker run -d --pull=always --net=host "
1465
- f"--name fe-analytics "
1570
+ f"--name {container_name} "
1466
1571
  f"--cidfile ./{self.action_record_id}.cid "
1467
1572
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1468
1573
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1494,7 +1599,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1494
1599
  else:
1495
1600
  return
1496
1601
  use_gpu = self.get_gpu_config(action_details)
1497
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1602
+ container_name = f"dataset_generation_{self.action_record_id}"
1603
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1498
1604
  logging.info("cmd is: %s", cmd)
1499
1605
  self.start(cmd, "dataset_generation")
1500
1606
 
@@ -1515,7 +1621,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1515
1621
  else:
1516
1622
  return
1517
1623
  use_gpu = self.get_gpu_config(action_details)
1518
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1624
+ container_name = f"synthetic_data_setup_{self.action_record_id}"
1625
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1519
1626
  logging.info("cmd is: %s", cmd)
1520
1627
  self.start(cmd, "synthetic_data_setup")
1521
1628
 
@@ -1552,26 +1659,40 @@ def redis_setup_execute(self: ActionInstance):
1552
1659
 
1553
1660
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1554
1661
 
1662
+ # Define container names with action_record_id for uniqueness
1663
+ redis_container_name = f"redis_{self.action_record_id}"
1555
1664
 
1556
- if action_details["actionDetails"].get("containerId"):
1557
- logging.info(
1558
- "Using existing container ID for redis management: %s",
1559
- action_details["actionDetails"]["containerId"],
1560
- )
1561
- self.docker_container = action_details["actionDetails"]["containerId"]
1562
- cmd = "docker restart " + self.docker_container
1563
- self.start(cmd, "redis_setup")
1665
+ existing_container_id = action_details["actionDetails"].get("containerId")
1666
+ if existing_container_id:
1667
+ # Check if both containers actually exist before trying to restart
1668
+ management_container_exists = ActionInstance.container_exists(existing_container_id)
1669
+ redis_container_exists = ActionInstance.container_exists(redis_container_name)
1564
1670
 
1565
- # Redis container restart
1566
- redis_restart_cmd = "docker restart redis_container"
1567
- self.start(redis_restart_cmd, "redis")
1671
+ if management_container_exists and redis_container_exists:
1672
+ logging.info(
1673
+ "Using existing container ID for redis management: %s",
1674
+ existing_container_id,
1675
+ )
1676
+ self.docker_container = existing_container_id
1677
+ cmd = "docker restart " + self.docker_container
1678
+ self.start(cmd, "redis_setup")
1679
+
1680
+ # Redis container restart
1681
+ redis_restart_cmd = f"docker restart {redis_container_name}"
1682
+ self.start(redis_restart_cmd, "redis")
1683
+ return
1684
+ else:
1685
+ logging.warning(
1686
+ "Container(s) not found (management=%s, redis=%s). Creating new containers.",
1687
+ management_container_exists,
1688
+ redis_container_exists
1689
+ )
1690
+ # Fall through to create new containers
1568
1691
 
1569
- return
1570
-
1571
1692
  # Redis container with --net=host (Port: 6379)
1572
1693
  redis_cmd = (
1573
1694
  f"docker run -d --net=host "
1574
- f"--name redis_container "
1695
+ f"--name {redis_container_name} "
1575
1696
  f"--restart unless-stopped "
1576
1697
  f"{redis_image} "
1577
1698
  f"redis-server --bind 0.0.0.0 "
@@ -1641,7 +1762,8 @@ def deploy_aggregator_execute(
1641
1762
  if not action_details:
1642
1763
  return
1643
1764
  self.setup_action_requirements(action_details, work_fs)
1644
- cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1765
+ container_name = f"deploy_aggregator_{self.action_record_id}"
1766
+ cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1645
1767
  logging.info("cmd: %s", cmd)
1646
1768
  self.start(cmd, "deploy_aggregator")
1647
1769
 
@@ -1689,7 +1811,8 @@ def model_deploy_execute(self: ActionInstance):
1689
1811
  "TRITON_PORTS": triton_ports
1690
1812
  }
1691
1813
 
1692
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1814
+ container_name = f"model_deploy_{self.action_record_id}"
1815
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1693
1816
  logging.info("cmd is: %s", cmd)
1694
1817
  self.start(cmd, "deploy_log")
1695
1818
 
@@ -1712,17 +1835,27 @@ def model_train_execute(self: ActionInstance):
1712
1835
  action_id=action_id,
1713
1836
  )
1714
1837
 
1715
- if action_details["actionDetails"].get("containerId"):
1716
- logging.info(
1717
- "Using existing container ID for training: %s",
1718
- action_details["actionDetails"]["containerId"],
1719
- )
1720
- self.docker_container = action_details["actionDetails"]["containerId"]
1721
- cmd = "docker restart " + self.docker_container
1722
- self.start(cmd, "train_log")
1723
- return
1724
-
1725
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1838
+ existing_container_id = action_details["actionDetails"].get("containerId")
1839
+ if existing_container_id:
1840
+ # Check if container actually exists before trying to restart
1841
+ if ActionInstance.container_exists(existing_container_id):
1842
+ logging.info(
1843
+ "Using existing container ID for training: %s",
1844
+ existing_container_id,
1845
+ )
1846
+ self.docker_container = existing_container_id
1847
+ cmd = "docker restart " + self.docker_container
1848
+ self.start(cmd, "train_log")
1849
+ return
1850
+ else:
1851
+ logging.warning(
1852
+ "Container %s not found. Creating new container.",
1853
+ existing_container_id
1854
+ )
1855
+ # Fall through to create new container
1856
+
1857
+ container_name = f"model_train_{self.action_record_id}"
1858
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1726
1859
  logging.info("cmd is: %s", cmd)
1727
1860
  self.start(cmd, "train_log")
1728
1861
 
@@ -1743,17 +1876,27 @@ def model_eval_execute(self: ActionInstance):
1743
1876
  model_family=model_family,
1744
1877
  action_id=action_id,
1745
1878
  )
1746
- if action_details["actionDetails"].get("containerId"):
1747
- logging.info(
1748
- "Using existing container ID for training: %s",
1749
- action_details["actionDetails"]["containerId"],
1750
- )
1751
- self.docker_container = action_details["actionDetails"]["containerId"]
1752
- cmd = "docker restart " + self.docker_container
1753
- self.start(cmd, "eval_log")
1754
- return
1755
-
1756
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1879
+ existing_container_id = action_details["actionDetails"].get("containerId")
1880
+ if existing_container_id:
1881
+ # Check if container actually exists before trying to restart
1882
+ if ActionInstance.container_exists(existing_container_id):
1883
+ logging.info(
1884
+ "Using existing container ID for evaluation: %s",
1885
+ existing_container_id,
1886
+ )
1887
+ self.docker_container = existing_container_id
1888
+ cmd = "docker restart " + self.docker_container
1889
+ self.start(cmd, "eval_log")
1890
+ return
1891
+ else:
1892
+ logging.warning(
1893
+ "Container %s not found. Creating new container.",
1894
+ existing_container_id
1895
+ )
1896
+ # Fall through to create new container
1897
+
1898
+ container_name = f"model_eval_{self.action_record_id}"
1899
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1757
1900
  logging.info("cmd is: %s", cmd)
1758
1901
  self.start(cmd, "eval_log")
1759
1902
 
@@ -1777,17 +1920,27 @@ def model_export_execute(self: ActionInstance):
1777
1920
  model_family=model_family,
1778
1921
  action_id=action_id,
1779
1922
  )
1780
- if action_details["actionDetails"].get("containerId"):
1781
- logging.info(
1782
- "Using existing container ID for training: %s",
1783
- action_details["actionDetails"]["containerId"],
1784
- )
1785
- self.docker_container = action_details["actionDetails"]["containerId"]
1786
- cmd = "docker restart " + self.docker_container
1787
- self.start(cmd, "export_log")
1788
- return
1789
-
1790
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1923
+ existing_container_id = action_details["actionDetails"].get("containerId")
1924
+ if existing_container_id:
1925
+ # Check if container actually exists before trying to restart
1926
+ if ActionInstance.container_exists(existing_container_id):
1927
+ logging.info(
1928
+ "Using existing container ID for export: %s",
1929
+ existing_container_id,
1930
+ )
1931
+ self.docker_container = existing_container_id
1932
+ cmd = "docker restart " + self.docker_container
1933
+ self.start(cmd, "export_log")
1934
+ return
1935
+ else:
1936
+ logging.warning(
1937
+ "Container %s not found. Creating new container.",
1938
+ existing_container_id
1939
+ )
1940
+ # Fall through to create new container
1941
+
1942
+ container_name = f"model_export_{self.action_record_id}"
1943
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1791
1944
  logging.info("cmd is: %s", cmd)
1792
1945
  self.start(cmd, "export_log")
1793
1946
 
@@ -1803,7 +1956,8 @@ def image_build_execute(self: ActionInstance):
1803
1956
  action_id = action_details["_id"]
1804
1957
  internal_api_key = self.get_internal_api_key(action_id)
1805
1958
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1806
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1959
+ container_name = f"image_build_{self.action_record_id}"
1960
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1807
1961
  logging.info("cmd is: %s", cmd)
1808
1962
  self.start(cmd, "image_build_log")
1809
1963
 
@@ -1815,7 +1969,8 @@ def resource_clone_execute(self: ActionInstance):
1815
1969
  if not action_details:
1816
1970
  return
1817
1971
  self.setup_action_requirements(action_details)
1818
- cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1972
+ container_name = f"resource_clone_{self.action_record_id}"
1973
+ cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1819
1974
  logging.info("cmd is: %s", cmd)
1820
1975
  self.start(cmd, "resource_clone")
1821
1976
 
@@ -1831,17 +1986,27 @@ def streaming_gateway_execute(self: ActionInstance):
1831
1986
  self.docker_container = (
1832
1987
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1833
1988
  )
1834
- if action_details["actionDetails"].get("containerId"):
1835
- logging.info(
1836
- "Using existing container ID for training: %s",
1837
- action_details["actionDetails"]["containerId"],
1838
- )
1839
- self.docker_container = action_details["actionDetails"]["containerId"]
1840
- cmd = "docker restart " + self.docker_container
1841
- self.start(cmd, "streaming_gateway")
1842
- return
1843
-
1844
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1989
+ existing_container_id = action_details["actionDetails"].get("containerId")
1990
+ if existing_container_id:
1991
+ # Check if container actually exists before trying to restart
1992
+ if ActionInstance.container_exists(existing_container_id):
1993
+ logging.info(
1994
+ "Using existing container ID for streaming gateway: %s",
1995
+ existing_container_id,
1996
+ )
1997
+ self.docker_container = existing_container_id
1998
+ cmd = "docker restart " + self.docker_container
1999
+ self.start(cmd, "streaming_gateway")
2000
+ return
2001
+ else:
2002
+ logging.warning(
2003
+ "Container %s not found. Creating new container.",
2004
+ existing_container_id
2005
+ )
2006
+ # Fall through to create new container
2007
+
2008
+ container_name = f"streaming_gateway_{self.action_record_id}"
2009
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1845
2010
  logging.info("cmd is: %s", cmd)
1846
2011
  self.start(cmd, "streaming_gateway")
1847
2012
 
@@ -1935,16 +2100,24 @@ def kafka_setup_execute(self: ActionInstance):
1935
2100
  else:
1936
2101
  pkgs = f"matrice_common matrice"
1937
2102
 
1938
- if action_details["actionDetails"].get("containerId"):
1939
- logging.info(
1940
- "Using existing container ID for training: %s",
1941
- action_details["actionDetails"]["containerId"],
1942
- )
1943
- self.docker_container = action_details["actionDetails"]["containerId"]
1944
- cmd = "docker restart " + self.docker_container
1945
- self.start(cmd, "kafka_setup")
1946
- return
1947
-
2103
+ existing_container_id = action_details["actionDetails"].get("containerId")
2104
+ if existing_container_id:
2105
+ # Check if container actually exists before trying to restart
2106
+ if ActionInstance.container_exists(existing_container_id):
2107
+ logging.info(
2108
+ "Using existing container ID for kafka: %s",
2109
+ existing_container_id,
2110
+ )
2111
+ self.docker_container = existing_container_id
2112
+ cmd = "docker restart " + self.docker_container
2113
+ self.start(cmd, "kafka_setup")
2114
+ return
2115
+ else:
2116
+ logging.warning(
2117
+ "Container %s not found. Creating new container.",
2118
+ existing_container_id
2119
+ )
2120
+ # Fall through to create new container
1948
2121
 
1949
2122
  # Kafka container with --net=host (Ports: 9092, 9093)
1950
2123
  cmd = (
@@ -1981,21 +2154,31 @@ def inference_tracker_setup_execute(self: ActionInstance):
1981
2154
 
1982
2155
  self.setup_action_requirements(action_details)
1983
2156
 
1984
- if action_details["actionDetails"].get("containerId"):
1985
- logging.info(
1986
- "Using existing container ID for inference tracker: %s",
1987
- action_details["actionDetails"]["containerId"],
1988
- )
1989
- self.docker_container = action_details["actionDetails"]["containerId"]
1990
- cmd = "docker restart " + self.docker_container
1991
- self.start(cmd, "inference_tracker_setup")
1992
- return
1993
-
2157
+ existing_container_id = action_details["actionDetails"].get("containerId")
2158
+ if existing_container_id:
2159
+ # Check if container actually exists before trying to restart
2160
+ if ActionInstance.container_exists(existing_container_id):
2161
+ logging.info(
2162
+ "Using existing container ID for inference tracker: %s",
2163
+ existing_container_id,
2164
+ )
2165
+ self.docker_container = existing_container_id
2166
+ cmd = "docker restart " + self.docker_container
2167
+ self.start(cmd, "inference_tracker_setup")
2168
+ return
2169
+ else:
2170
+ logging.warning(
2171
+ "Container %s not found. Creating new container.",
2172
+ existing_container_id
2173
+ )
2174
+ # Fall through to create new container
2175
+
1994
2176
  # This is the existing Docker run command
2177
+ container_name = f"inference_tracker_{self.action_record_id}"
1995
2178
  worker_cmd = (
1996
2179
  f"docker run -d --pull=always --net=host "
1997
- f"--cidfile ./{self.action_record_id}.cid "
1998
- f"--name inference-tracker-worker "
2180
+ f"--cidfile ./{self.action_record_id}.cid "
2181
+ f"--name {container_name} "
1999
2182
  f"-v matrice_myvol:/matrice_data "
2000
2183
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2001
2184
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -224,7 +224,7 @@ class ActionsManager:
224
224
  action_ids
225
225
  )
226
226
  else:
227
- logging.debug("No actions currently running")
227
+ logging.info("No actions currently running")
228
228
 
229
229
  return self.current_actions
230
230
 
@@ -404,7 +404,7 @@ class InstanceManager:
404
404
  if self.container_kafka_producer:
405
405
  try:
406
406
  self.container_kafka_producer.send(topic_name, status_message)
407
- logging.debug("Container status monitor: Sent status for %d containers", len(containers))
407
+ logging.info("Container status monitor: Sent status for %d containers", len(containers))
408
408
  except Exception as e:
409
409
  logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
410
410
 
@@ -295,7 +295,7 @@ class Scaling:
295
295
  logging.warning(f"Kafka returned error for {api}, falling back to REST")
296
296
 
297
297
  # Kafka failed or disabled, try REST
298
- logging.info(f"Using REST API for {api}")
298
+ logging.debug(f"Using REST API for {api}")
299
299
  try:
300
300
  rest_response = rest_fallback_func()
301
301
 
@@ -185,7 +185,7 @@ class ShutdownManager:
185
185
  time.sleep(2)
186
186
  return True
187
187
  except Exception as e:
188
- logging.debug("Aggressive command failed: %s", str(e))
188
+ logging.info("Aggressive command failed: %s", str(e))
189
189
  except Exception as e:
190
190
  logging.error("Error in aggressive shutdown methods: %s", str(e))
191
191
  return False
@@ -271,7 +271,7 @@ class ShutdownManager:
271
271
  """
272
272
  # CRITICAL: Check if this is a reserved instance that should not be shut down
273
273
  # if self.reserved_instance:
274
- # logging.debug("Reserved instance detected, skipping shutdown check")
274
+ # logging.info("Reserved instance detected, skipping shutdown check")
275
275
  # return
276
276
 
277
277
  # Update idle time tracking
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.31
3
+ Version: 0.1.32
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,18 @@
1
+ matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
2
+ matrice_compute/action_instance.py,sha256=j6_3OG82HT7WcdWMy6VjEWwYxELfLhGJ1Y6ZaoRgWig,85420
3
+ matrice_compute/actions_manager.py,sha256=14DKWfdJ145oyA0x5YVaj4ylnKE5Kd6xJZ5xzk0Jres,18147
4
+ matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
+ matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
6
+ matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWIaqMZdE,19291
7
+ matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
8
+ matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
9
+ matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ matrice_compute/resources_tracker.py,sha256=1jSLrIFlOh-vgyNzFrUrE2Ak2JAGCIfV7wcyEPJ0f2c,32246
11
+ matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
12
+ matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
13
+ matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
+ matrice_compute-0.1.32.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.32.dist-info/METADATA,sha256=DMQ2-4mfoiU0aUvxsTVe7lcvhrZ5_uiIvzkIun_6sP4,1038
16
+ matrice_compute-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.32.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.32.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
- matrice_compute/action_instance.py,sha256=NpI7uCaLJ5GKdW-2JBGCjTwijb8XBrRc7GKRC4uhQF4,76650
3
- matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
4
- matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
- matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
6
- matrice_compute/instance_manager.py,sha256=kPZYfiq3Oevs5r1xzwvDzE27zeWF9oBBxh9KhpHJuG4,19292
7
- matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
8
- matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
9
- matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- matrice_compute/resources_tracker.py,sha256=1jSLrIFlOh-vgyNzFrUrE2Ak2JAGCIfV7wcyEPJ0f2c,32246
11
- matrice_compute/scaling.py,sha256=cdEJqdVsPGDeOjkVAG85lubOn-qwDRV5qqmrNl_XpCM,55146
12
- matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
13
- matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
- matrice_compute-0.1.31.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
- matrice_compute-0.1.31.dist-info/METADATA,sha256=nhJU2AA0SxaSWMZXKjYtAthzjbjdEmmD3agMYqukQx8,1038
16
- matrice_compute-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
- matrice_compute-0.1.31.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
- matrice_compute-0.1.31.dist-info/RECORD,,