matrice-compute 0.1.26__tar.gz → 0.1.27__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/PKG-INFO +1 -1
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/action_instance.py +201 -1
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/instance_manager.py +179 -8
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/resources_tracker.py +260 -0
- matrice_compute-0.1.27/src/matrice_compute/scaling.py +1395 -0
- matrice_compute-0.1.26/src/matrice_compute/scaling.py +0 -737
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/LICENSE.txt +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/README.md +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/pyproject.toml +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/setup.cfg +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/setup.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.26 → matrice_compute-0.1.27}/src/matrice_compute/task_utils.py +0 -0
src/matrice_compute/action_instance.py

@@ -495,6 +495,7 @@ class ActionInstance:
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
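The `--cidfile` flag added to this command (and to the other `docker run` invocations below) tells Docker to write the new container's ID to the given file; the retry loop added a few hunks further down reads that file back and reports the ID to the scaling service. A minimal standalone sketch of the flag's behavior, with an illustrative image and path rather than anything taken from the package:

import pathlib
import subprocess

# Requires a local Docker daemon; image and path are illustrative only.
cid_path = pathlib.Path("./example.cid")
subprocess.run(
    ["docker", "run", "-d", "--cidfile", str(cid_path), "alpine", "sleep", "60"],
    check=True,
)
container_id = cid_path.read_text().strip()  # Docker writes the full container ID here
print(container_id)

Note that `docker run` refuses to start when the CID file already exists, so stale files from a previous run have to be removed first.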
@@ -895,6 +896,7 @@ class ActionInstance:
         """
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
+
         with open(self.log_path, "wb") as out:
             self.process = subprocess.Popen(
                 shlex.split(self.cmd),
@@ -903,6 +905,52 @@ class ActionInstance:
                 env={**os.environ},
                 start_new_session=True,
             )
+
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                self.container_id = container_id
+                logging.info(
+                    "Started process for action %s with container ID: %s",
+                    self.action_record_id,
+                    self.container_id,
+                )
+                break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
+            logging.error(
+                "Failed to read CID file for action %s after %d attempts",
+                self.action_record_id,
+                max_retries,
+            )
+            raise Exception("Failed to start process: CID file not found")
+
+        # report container id to scaling service
+        self.scaling.update_action_container_id(
+            action_record_id=self.action_record_id,
+            container_id=self.container_id,
+        )
+

     @log_errors(raise_exception=False)
     def start_logger(self):
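Two details of the block above are worth noting: the `for ... else` construct runs the `else` branch only when the loop exhausts every attempt without a `break`, so the exception is raised only after all retries fail, and the polling itself is a generic wait-for-file pattern. A hedged sketch of that pattern as a standalone helper (the name and signature are illustrative, not part of the package):

import time


def read_file_with_retries(path: str, max_retries: int = 5, retry_delay: float = 1.0) -> str:
    """Return the stripped contents of path, retrying while the file does not exist yet."""
    for _ in range(max_retries):
        try:
            with open(path, "r") as handle:
                return handle.read().strip()
        except FileNotFoundError:
            time.sleep(retry_delay)
    raise TimeoutError(f"{path} did not appear after {max_retries} attempts")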
@@ -1172,11 +1220,27 @@ def database_setup_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "qdrant_setup")
+
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')
+
+        return
+
     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
         f"--name mongodbdatabase "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
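The same short-circuit is added to each of the setup and execute entry points touched in this release: when the action already carries a containerId, the function restarts that container with `docker restart` and returns early instead of launching a fresh one. Stripped of the per-service log names, the shared pattern looks roughly like this (a sketch with a hypothetical helper name, not a literal excerpt from the package):

def maybe_restart_existing_container(instance, action_details: dict, log_name: str) -> bool:
    """Hypothetical helper: restart the previously recorded container, if any."""
    container_id = action_details["actionDetails"].get("containerId")
    if not container_id:
        return False
    instance.docker_container = container_id
    instance.start("docker restart " + container_id, log_name)  # same start() used above
    return True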
@@ -1215,11 +1279,23 @@ def facial_recognition_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for facial recognition worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "facial_recognition_setup")
+        return
+
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1245,10 +1321,21 @@ def lpr_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for LPR worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "lpr_setup")
+        return
+
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
         f"--name lpr-worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1285,10 +1372,21 @@ def inference_ws_server_execute(self: ActionInstance):

     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference WebSocket server: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_ws_server")
+        return
+
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name inference "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,11 +1419,22 @@ def fe_fs_streaming_execute(self: ActionInstance):
     ws_url = f"{ws_host}:8102"

     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend streaming: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_fs_streaming")
+        return

     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name fe_streaming "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1355,11 +1464,22 @@ def fe_analytics_service_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)

     project_id = action_details["_idProject"]
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend analytics service: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_analytics_service")
+        return

     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name fe-analytics "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1447,11 +1567,27 @@ def redis_setup_execute(self: ActionInstance):
     logging.info(f"Redis will use IP: {redis_host} on port 6379")

     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
+
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for redis management: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "redis_setup")
+
+        # Redis container restart
+        redis_restart_cmd = "docker restart redis_container"
+        self.start(redis_restart_cmd, "redis")
+
+        return

     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name redis_container"
         f"--restart unless-stopped "
         f"--memory=32g "
         f"--cpus=8 "
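These commands are assembled by implicit concatenation of adjacent f-string literals, so each fragment needs its own trailing space. The replacement line f"--name redis_container" above appears to lack one, which would fuse it with the following --restart fragment; a quick illustration of the mechanism:

# Adjacent string literals concatenate with no separator.
name = "redis_container"
cmd = (
    f"docker run -d --net=host "
    f"--name {name}"              # no trailing space here
    f"--restart unless-stopped "
)
print(cmd)  # ... --name redis_container--restart unless-stopped ...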
@@ -1496,6 +1632,7 @@ def redis_setup_execute(self: ActionInstance):
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
         f"docker run --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1592,6 +1729,17 @@ def model_train_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "train_log")
+        return
+
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")
@@ -1613,6 +1761,16 @@ def model_eval_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "eval_log")
+        return
+
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")
@@ -1637,6 +1795,16 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "export_log")
+        return
+
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")
@@ -1681,6 +1849,16 @@ def streaming_gateway_execute(self: ActionInstance):
     self.docker_container = (
         f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "streaming_gateway")
+        return
+
     cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")
@@ -1775,6 +1953,17 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"

+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "kafka_setup")
+        return
+
+
     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
         f"docker run --net=host "
@@ -1809,10 +1998,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
     image = self.docker_container

     self.setup_action_requirements(action_details)
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_tracker_setup")
+        return

     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
src/matrice_compute/instance_manager.py

@@ -3,8 +3,10 @@
 import json
 import logging
 import os
+import subprocess
 import threading
 import time
+from kafka import KafkaProducer
 from matrice_compute.actions_manager import ActionsManager
 from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
 from matrice_compute.compute_operations_handler import ComputeOperationsHandler
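The new `from kafka import KafkaProducer` import assumes the kafka-python client is available at runtime; the producer itself is only constructed later, inside the container status monitor shown further down. A minimal hedged sketch of the same producer configuration in isolation (the broker address and payload are illustrative):

import json

from kafka import KafkaProducer  # kafka-python client, assumed installed

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",                       # illustrative address
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
    max_block_ms=5000,                                        # fail fast if the broker is unreachable
)
producer.send("compute_container_status", {"hello": "world"})
producer.flush()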
@@ -15,6 +17,7 @@ from matrice_compute.instance_utils import (
 from matrice_compute.resources_tracker import (
     MachineResourcesTracker,
     ActionsResourcesTracker,
+    KafkaResourceMonitor,
 )
 from matrice_compute.scaling import Scaling
 from matrice_compute.shutdown_manager import ShutdownManager
@@ -92,6 +95,19 @@ class InstanceManager:
         self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
         logging.info("InstanceManager initialized with actions resources tracker")

+        # Initialize Kafka resource monitor using the same internal Kafka as scaling
+        try:
+            kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
+            self.kafka_resource_monitor = KafkaResourceMonitor(
+                instance_id=os.environ.get("INSTANCE_ID"),
+                kafka_bootstrap=kafka_bootstrap,
+                interval_seconds=60
+            )
+            logging.info("InstanceManager initialized with Kafka resource monitor using internal Kafka: %s", kafka_bootstrap)
+        except (ValueError, Exception) as e:
+            logging.warning("Failed to initialize Kafka resource monitor: %s", e)
+            self.kafka_resource_monitor = None
+
         # Initialize Compute Operations Handler for event-driven operations
         # Uses EventListener from matrice_common for simplified Kafka consumption
         try:
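A small aside on the exception handling introduced here and changed just below: `except (ValueError, Exception)` catches exactly the same exceptions as `except Exception`, because ValueError is a subclass of Exception, so listing both in the tuple is redundant but harmless.

try:
    raise ValueError("boom")
except (ValueError, Exception) as e:  # equivalent to: except Exception as e:
    print(type(e).__name__, e)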
@@ -103,14 +119,30 @@ class InstanceManager:
                 instance_id=instance_id
             )
             logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
-        except Exception as e:
+        except (ValueError, Exception) as e:
             logging.warning("Failed to initialize Compute Operations Handler: %s", e)
             self.compute_operations_handler = None

         self.poll_interval = 10
         # Note: encryption_key is set in _setup_env_credentials
+
+        # Initialize container monitoring
+        self.container_monitor_thread = None
+        self.container_monitor_running = False
+        self.container_kafka_producer = None
+
         logging.info("InstanceManager initialized.")

+        # report the resources at startup
+        try:
+            self.scaling.report_architecture_info()
+            logging.info("InstanceManager reported initial resources.")
+        except Exception as exc:
+            logging.error(
+                "Error reporting initial resources: %s",
+                str(exc),
+            )
+
     @log_errors(default_return=None, raise_exception=True, log_error=True)
     def _setup_env_credentials(
         self,
@@ -245,13 +277,13 @@ class InstanceManager:
             #     "Error in scale_down_manager auto_scaledown_actions: %s",
             #     str(exc),
             # )
-            try:
-                self.machine_resources_tracker.update_available_resources()
-            except Exception as exc:
-                logging.error(
-                    "Error in machine_resources_tracker update_available_resources: %s",
-                    str(exc),
-                )
+            # try:
+            #     self.machine_resources_tracker.update_available_resources()
+            # except Exception as exc:
+            #     logging.error(
+            #         "Error in machine_resources_tracker update_available_resources: %s",
+            #         str(exc),
+            #     )
             try:
                 self.actions_resources_tracker.update_actions_resources()
             except Exception as exc:
@@ -262,6 +294,130 @@ class InstanceManager:

             time.sleep(self.poll_interval)

+    @log_errors(raise_exception=False, log_error=True)
+    def start_container_status_monitor(self):
+        """Start the background container status monitoring."""
+        if self.container_monitor_running:
+            logging.info("Container status monitor is already running")
+            return
+
+        self.container_monitor_running = True
+        self.container_monitor_thread = threading.Thread(
+            target=self._container_status_monitor_worker,
+            daemon=True,
+            name="ContainerStatusMonitor"
+        )
+        self.container_monitor_thread.start()
+        logging.info("Started container status monitoring thread")
+
+    @log_errors(raise_exception=False, log_error=True)
+    def stop_container_status_monitor(self):
+        """Stop the background container status monitoring."""
+        if not self.container_monitor_running:
+            return
+
+        logging.info("Stopping container status monitor...")
+        self.container_monitor_running = False
+
+        if self.container_monitor_thread:
+            self.container_monitor_thread.join(timeout=10)
+
+        if self.container_kafka_producer:
+            self.container_kafka_producer.close()
+            self.container_kafka_producer = None
+
+        logging.info("Container status monitor stopped")
+
+    def _container_status_monitor_worker(self):
+        """Background worker function that monitors container status."""
+        # Initialize Kafka producer
+        try:
+            if self.scaling.enable_kafka:
+                bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
+                self.container_kafka_producer = KafkaProducer(
+                    bootstrap_servers=bootstrap_servers,
+                    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+                    max_block_ms=5000  # Timeout if Kafka is down
+                )
+                logging.info("Container status monitor: Kafka producer initialized")
+            else:
+                logging.warning("Container status monitor: Kafka is disabled, no monitoring will be performed")
+                return
+        except Exception as e:
+            logging.error("Container status monitor: Failed to initialize Kafka producer: %s", str(e))
+            return
+
+        instance_id = os.environ.get("INSTANCE_ID")
+        topic_name = "compute_container_status"
+
+        logging.info("Container status monitor started for instance: %s", instance_id)
+
+        while self.container_monitor_running:
+            try:
+                # Get container status using docker ps -a
+                result = subprocess.run(
+                    ["docker", "ps", "-a", "--format", "json"],
+                    capture_output=True,
+                    text=True,
+                    timeout=30
+                )
+
+                if result.returncode != 0:
+                    logging.error("Container status monitor: docker ps command failed: %s", result.stderr)
+                    time.sleep(30)  # Wait before retrying
+                    continue
+
+                # Parse container information
+                containers = []
+                if result.stdout.strip():
+                    for line in result.stdout.strip().split('\n'):
+                        try:
+                            container_info = json.loads(line)
+                            containers.append({
+                                "container_id": container_info.get("ID", ""),
+                                "image": container_info.get("Image", ""),
+                                "command": container_info.get("Command", ""),
+                                "created": container_info.get("CreatedAt", ""),
+                                "status": container_info.get("Status", ""),
+                                "ports": container_info.get("Ports", ""),
+                                "names": container_info.get("Names", ""),
+                                "size": container_info.get("Size", ""),
+                                "state": container_info.get("State", ""),
+                                "labels": container_info.get("Labels", "")
+                            })
+                        except json.JSONDecodeError as e:
+                            logging.warning("Container status monitor: Failed to parse container info: %s", str(e))
+                            continue
+
+                # Prepare message for Kafka
+                status_message = {
+                    "timestamp": time.time(),
+                    "instance_id": instance_id,
+                    "container_count": len(containers),
+                    "containers": containers
+                }
+
+                # Send to Kafka
+                if self.container_kafka_producer:
+                    try:
+                        self.container_kafka_producer.send(topic_name, status_message)
+                        logging.debug("Container status monitor: Sent status for %d containers", len(containers))
+                    except Exception as e:
+                        logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
+
+            except subprocess.TimeoutExpired:
+                logging.error("Container status monitor: docker ps command timed out")
+            except Exception as e:
+                logging.error("Container status monitor: Unexpected error: %s", str(e))
+
+            # Wait 30 seconds before next check
+            for _ in range(30):
+                if not self.container_monitor_running:
+                    break
+                time.sleep(1)
+
+        logging.info("Container status monitor worker stopped")
+
     @log_errors(default_return=(None, None), raise_exception=True)
     def start(self) -> tuple:
         """Start the instance manager threads.
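For context, `docker ps --format json` emits one JSON object per line, which is why the worker above parses the output line by line (the bare json format argument is relatively recent; older Docker CLIs need --format '{{json .}}'). A standalone sketch of the same parsing, outside the monitor thread:

import json
import subprocess

result = subprocess.run(
    ["docker", "ps", "-a", "--format", "json"],  # one JSON object per line
    capture_output=True,
    text=True,
    timeout=30,
)
for line in result.stdout.splitlines():
    if not line.strip():
        continue
    info = json.loads(line)
    print(info.get("ID", ""), info.get("Names", ""), info.get("State", ""))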
@@ -269,6 +425,14 @@ class InstanceManager:
         Returns:
             tuple: (instance_manager_thread, actions_manager_thread)
         """
+        # Start Kafka resource monitor in background thread
+        if self.kafka_resource_monitor:
+            try:
+                self.kafka_resource_monitor.start()
+                logging.info("Started Kafka resource monitor")
+            except Exception as exc:
+                logging.error("Failed to start Kafka resource monitor: %s", str(exc))
+
         # Start Compute Operations Handler in background thread
         if self.compute_operations_handler:
             try:
@@ -277,6 +441,13 @@ class InstanceManager:
             except Exception as exc:
                 logging.error("Failed to start Compute Operations Handler: %s", str(exc))

+        # Start Container Status Monitor in background thread
+        try:
+            self.start_container_status_monitor()
+            logging.info("Started Container Status Monitor")
+        except Exception as exc:
+            logging.error("Failed to start Container Status Monitor: %s", str(exc))
+
         # Create and start threads
         instance_manager_thread = threading.Thread(
             target=self.start_instance_manager,