matrice-compute 0.1.26__tar.gz → 0.1.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25):
  1. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/action_instance.py +201 -1
  4. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/instance_manager.py +182 -8
  5. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/resources_tracker.py +260 -0
  6. matrice_compute-0.1.28/src/matrice_compute/scaling.py +1395 -0
  7. matrice_compute-0.1.26/src/matrice_compute/scaling.py +0 -737
  8. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/LICENSE.txt +0 -0
  9. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/README.md +0 -0
  10. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/SOURCES.txt +0 -0
  11. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/dependency_links.txt +0 -0
  12. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/not-zip-safe +0 -0
  13. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/top_level.txt +0 -0
  14. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/pyproject.toml +0 -0
  15. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/setup.cfg +0 -0
  16. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/setup.py +0 -0
  17. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/__init__.py +0 -0
  18. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/actions_manager.py +0 -0
  19. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  20. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/compute_operations_handler.py +0 -0
  21. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/instance_utils.py +0 -0
  22. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/prechecks.py +0 -0
  23. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/py.typed +0 -0
  24. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/shutdown_manager.py +0 -0
  25. {matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.26
3
+ Version: 0.1.28
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.26
3
+ Version: 0.1.28
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -495,6 +495,7 @@ class ActionInstance:
495
495
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
496
  *volumes,
497
497
  # Container configuration and startup commands
498
+ f"--cidfile ./{self.action_record_id}.cid ",
498
499
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
499
500
  f'/bin/bash -c "cd {docker_workdir} && '
500
501
  f"{env_exports} && "
@@ -895,6 +896,7 @@ class ActionInstance:
895
896
  """
896
897
  self.cmd = cmd
897
898
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
899
+
898
900
  with open(self.log_path, "wb") as out:
899
901
  self.process = subprocess.Popen(
900
902
  shlex.split(self.cmd),
@@ -903,6 +905,52 @@ class ActionInstance:
903
905
  env={**os.environ},
904
906
  start_new_session=True,
905
907
  )
908
+
909
+ self.container_id = None
910
+
911
+ cid_file_path = f"./{self.action_record_id}.cid"
912
+ max_retries = 5
913
+ retry_delay = 1 # seconds
914
+ for attempt in range(max_retries):
915
+ try:
916
+ with open(cid_file_path, "r") as cid_file:
917
+ container_id = cid_file.read().strip()
918
+ self.container_id = container_id
919
+ logging.info(
920
+ "Started process for action %s with container ID: %s",
921
+ self.action_record_id,
922
+ self.container_id,
923
+ )
924
+ break
925
+ except FileNotFoundError:
926
+ logging.warning(
927
+ "CID file not found for action %s, attempt %d/%d",
928
+ self.action_record_id,
929
+ attempt + 1,
930
+ max_retries,
931
+ )
932
+ time.sleep(retry_delay)
933
+ except Exception as e:
934
+ logging.error(
935
+ "Error reading CID file for action %s: %s",
936
+ self.action_record_id,
937
+ str(e),
938
+ )
939
+ time.sleep(retry_delay)
940
+ else:
941
+ logging.error(
942
+ "Failed to read CID file for action %s after %d attempts",
943
+ self.action_record_id,
944
+ max_retries,
945
+ )
946
+ raise Exception("Failed to start process: CID file not found")
947
+
948
+ # report container id to scaling service
949
+ self.scaling.update_action_container_id(
950
+ action_record_id=self.action_record_id,
951
+ container_id=self.container_id,
952
+ )
953
+
906
954
 
907
955
  @log_errors(raise_exception=False)
908
956
  def start_logger(self):
@@ -1172,11 +1220,27 @@ def database_setup_execute(self: ActionInstance):
1172
1220
 
1173
1221
  project_id = action_details["_idProject"]
1174
1222
 
1223
+ if action_details["actionDetails"].get("containerId"):
1224
+ logging.info(
1225
+ "Using existing container ID for inference tracker: %s",
1226
+ action_details["actionDetails"]["containerId"],
1227
+ )
1228
+ self.docker_container = action_details["actionDetails"]["containerId"]
1229
+ cmd = "docker restart " + self.docker_container
1230
+ self.start(cmd, "qdrant_setup")
1231
+
1232
+ #qdrant restart
1233
+ qdrant_cmd = "docker restart qdrant"
1234
+ self.start(qdrant_cmd, 'qdrant_setup')
1235
+
1236
+ return
1237
+
1175
1238
  # MongoDB container with --net=host (Port: 27020:27017)
1176
1239
  cmd = (
1177
1240
  f"docker run --pull=always --net=host "
1178
1241
  f"--name mongodbdatabase "
1179
1242
  f"-v matrice_myvol:/matrice_data "
1243
+ f"--cidfile ./{self.action_record_id}.cid "
1180
1244
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1181
1245
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1182
1246
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1215,11 +1279,23 @@ def facial_recognition_setup_execute(self: ActionInstance):
1215
1279
 
1216
1280
  self.setup_action_requirements(action_details)
1217
1281
 
1282
+ if action_details["actionDetails"].get("containerId"):
1283
+ logging.info(
1284
+ "Using existing container ID for facial recognition worker: %s",
1285
+ action_details["actionDetails"]["containerId"],
1286
+ )
1287
+ self.docker_container = action_details["actionDetails"]["containerId"]
1288
+ cmd = "docker restart " + self.docker_container
1289
+ self.start(cmd, "facial_recognition_setup")
1290
+ return
1291
+
1218
1292
  # Facial recognition worker container with --net=host (Port: 8081)
1219
1293
  worker_cmd = (
1220
1294
  f"docker run -d --pull=always --net=host "
1221
1295
  f"--name worker "
1296
+ f"--cidfile ./{self.action_record_id}.cid "
1222
1297
  f"-v matrice_myvol:/matrice_data "
1298
+ f"--cidfile ./{self.action_record_id}.cid "
1223
1299
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1224
1300
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1225
1301
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1245,10 +1321,21 @@ def lpr_setup_execute(self: ActionInstance):
1245
1321
 
1246
1322
  self.setup_action_requirements(action_details)
1247
1323
 
1324
+ if action_details["actionDetails"].get("containerId"):
1325
+ logging.info(
1326
+ "Using existing container ID for LPR worker: %s",
1327
+ action_details["actionDetails"]["containerId"],
1328
+ )
1329
+ self.docker_container = action_details["actionDetails"]["containerId"]
1330
+ cmd = "docker restart " + self.docker_container
1331
+ self.start(cmd, "lpr_setup")
1332
+ return
1333
+
1248
1334
  # LPR worker container with --net=host (Port: 8082)
1249
1335
  worker_cmd = (
1250
1336
  f"docker run -d --net=host --pull=always "
1251
1337
  f"--name lpr-worker "
1338
+ f"--cidfile ./{self.action_record_id}.cid "
1252
1339
  f"-v matrice_myvol:/matrice_data "
1253
1340
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1254
1341
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1285,10 +1372,21 @@ def inference_ws_server_execute(self: ActionInstance):
1285
1372
 
1286
1373
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1287
1374
 
1375
+ if action_details["actionDetails"].get("containerId"):
1376
+ logging.info(
1377
+ "Using existing container ID for inference WebSocket server: %s",
1378
+ action_details["actionDetails"]["containerId"],
1379
+ )
1380
+ self.docker_container = action_details["actionDetails"]["containerId"]
1381
+ cmd = "docker restart " + self.docker_container
1382
+ self.start(cmd, "inference_ws_server")
1383
+ return
1384
+
1288
1385
  # Inference WebSocket server with --net=host (Port: 8102)
1289
1386
  worker_cmd = (
1290
1387
  f"docker run -d --pull=always --net=host "
1291
1388
  f"--name inference "
1389
+ f"--cidfile ./{self.action_record_id}.cid "
1292
1390
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1293
1391
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1294
1392
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,11 +1419,22 @@ def fe_fs_streaming_execute(self: ActionInstance):
1321
1419
  ws_url = f"{ws_host}:8102"
1322
1420
 
1323
1421
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1422
+
1423
+ if action_details["actionDetails"].get("containerId"):
1424
+ logging.info(
1425
+ "Using existing container ID for frontend streaming: %s",
1426
+ action_details["actionDetails"]["containerId"],
1427
+ )
1428
+ self.docker_container = action_details["actionDetails"]["containerId"]
1429
+ cmd = "docker restart " + self.docker_container
1430
+ self.start(cmd, "fe_fs_streaming")
1431
+ return
1324
1432
 
1325
1433
  # Frontend streaming with --net=host (Port: 3000)
1326
1434
  worker_cmd = (
1327
1435
  f"docker run -d --pull=always --net=host "
1328
1436
  f"--name fe_streaming "
1437
+ f"--cidfile ./{self.action_record_id}.cid "
1329
1438
  f"-v matrice_myvol:/matrice_data "
1330
1439
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1331
1440
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1355,11 +1464,22 @@ def fe_analytics_service_execute(self: ActionInstance):
1355
1464
  self.setup_action_requirements(action_details)
1356
1465
 
1357
1466
  project_id = action_details["_idProject"]
1467
+
1468
+ if action_details["actionDetails"].get("containerId"):
1469
+ logging.info(
1470
+ "Using existing container ID for frontend analytics service: %s",
1471
+ action_details["actionDetails"]["containerId"],
1472
+ )
1473
+ self.docker_container = action_details["actionDetails"]["containerId"]
1474
+ cmd = "docker restart " + self.docker_container
1475
+ self.start(cmd, "fe_analytics_service")
1476
+ return
1358
1477
 
1359
1478
  # Frontend analytics service with --net=host (Port: 3001)
1360
1479
  worker_cmd = (
1361
1480
  f"docker run -d --pull=always --net=host "
1362
1481
  f"--name fe-analytics "
1482
+ f"--cidfile ./{self.action_record_id}.cid "
1363
1483
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1364
1484
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1365
1485
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1447,11 +1567,27 @@ def redis_setup_execute(self: ActionInstance):
1447
1567
  logging.info(f"Redis will use IP: {redis_host} on port 6379")
1448
1568
 
1449
1569
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1570
+
1571
+
1572
+ if action_details["actionDetails"].get("containerId"):
1573
+ logging.info(
1574
+ "Using existing container ID for redis management: %s",
1575
+ action_details["actionDetails"]["containerId"],
1576
+ )
1577
+ self.docker_container = action_details["actionDetails"]["containerId"]
1578
+ cmd = "docker restart " + self.docker_container
1579
+ self.start(cmd, "redis_setup")
1580
+
1581
+ # Redis container restart
1582
+ redis_restart_cmd = "docker restart redis_container"
1583
+ self.start(redis_restart_cmd, "redis")
1584
+
1585
+ return
1450
1586
 
1451
1587
  # Redis container with --net=host (Port: 6379)
1452
1588
  redis_cmd = (
1453
1589
  f"docker run -d --net=host "
1454
- f"--name redis_container_{int(time.time())} "
1590
+ f"--name redis_container"
1455
1591
  f"--restart unless-stopped "
1456
1592
  f"--memory=32g "
1457
1593
  f"--cpus=8 "
@@ -1496,6 +1632,7 @@ def redis_setup_execute(self: ActionInstance):
1496
1632
  # bg-redis management container with --net=host (Port: 8082)
1497
1633
  cmd = (
1498
1634
  f"docker run --net=host "
1635
+ f"--cidfile ./{self.action_record_id}.cid "
1499
1636
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1500
1637
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1501
1638
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1592,6 +1729,17 @@ def model_train_execute(self: ActionInstance):
1592
1729
  model_family=model_family,
1593
1730
  action_id=action_id,
1594
1731
  )
1732
+
1733
+ if action_details["actionDetails"].get("containerId"):
1734
+ logging.info(
1735
+ "Using existing container ID for training: %s",
1736
+ action_details["actionDetails"]["containerId"],
1737
+ )
1738
+ self.docker_container = action_details["actionDetails"]["containerId"]
1739
+ cmd = "docker restart " + self.docker_container
1740
+ self.start(cmd, "train_log")
1741
+ return
1742
+
1595
1743
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1596
1744
  logging.info("cmd is: %s", cmd)
1597
1745
  self.start(cmd, "train_log")
@@ -1613,6 +1761,16 @@ def model_eval_execute(self: ActionInstance):
1613
1761
  model_family=model_family,
1614
1762
  action_id=action_id,
1615
1763
  )
1764
+ if action_details["actionDetails"].get("containerId"):
1765
+ logging.info(
1766
+ "Using existing container ID for training: %s",
1767
+ action_details["actionDetails"]["containerId"],
1768
+ )
1769
+ self.docker_container = action_details["actionDetails"]["containerId"]
1770
+ cmd = "docker restart " + self.docker_container
1771
+ self.start(cmd, "eval_log")
1772
+ return
1773
+
1616
1774
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1617
1775
  logging.info("cmd is: %s", cmd)
1618
1776
  self.start(cmd, "eval_log")
@@ -1637,6 +1795,16 @@ def model_export_execute(self: ActionInstance):
1637
1795
  model_family=model_family,
1638
1796
  action_id=action_id,
1639
1797
  )
1798
+ if action_details["actionDetails"].get("containerId"):
1799
+ logging.info(
1800
+ "Using existing container ID for training: %s",
1801
+ action_details["actionDetails"]["containerId"],
1802
+ )
1803
+ self.docker_container = action_details["actionDetails"]["containerId"]
1804
+ cmd = "docker restart " + self.docker_container
1805
+ self.start(cmd, "export_log")
1806
+ return
1807
+
1640
1808
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1641
1809
  logging.info("cmd is: %s", cmd)
1642
1810
  self.start(cmd, "export_log")
@@ -1681,6 +1849,16 @@ def streaming_gateway_execute(self: ActionInstance):
1681
1849
  self.docker_container = (
1682
1850
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1683
1851
  )
1852
+ if action_details["actionDetails"].get("containerId"):
1853
+ logging.info(
1854
+ "Using existing container ID for training: %s",
1855
+ action_details["actionDetails"]["containerId"],
1856
+ )
1857
+ self.docker_container = action_details["actionDetails"]["containerId"]
1858
+ cmd = "docker restart " + self.docker_container
1859
+ self.start(cmd, "streaming_gateway")
1860
+ return
1861
+
1684
1862
  cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1685
1863
  logging.info("cmd is: %s", cmd)
1686
1864
  self.start(cmd, "streaming_gateway")
@@ -1775,6 +1953,17 @@ def kafka_setup_execute(self: ActionInstance):
1775
1953
  else:
1776
1954
  pkgs = f"matrice_common matrice"
1777
1955
 
1956
+ if action_details["actionDetails"].get("containerId"):
1957
+ logging.info(
1958
+ "Using existing container ID for training: %s",
1959
+ action_details["actionDetails"]["containerId"],
1960
+ )
1961
+ self.docker_container = action_details["actionDetails"]["containerId"]
1962
+ cmd = "docker restart " + self.docker_container
1963
+ self.start(cmd, "kafka_setup")
1964
+ return
1965
+
1966
+
1778
1967
  # Kafka container with --net=host (Ports: 9092, 9093)
1779
1968
  cmd = (
1780
1969
  f"docker run --net=host "
@@ -1809,10 +1998,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
1809
1998
  image = self.docker_container
1810
1999
 
1811
2000
  self.setup_action_requirements(action_details)
2001
+
2002
+ if action_details["actionDetails"].get("containerId"):
2003
+ logging.info(
2004
+ "Using existing container ID for inference tracker: %s",
2005
+ action_details["actionDetails"]["containerId"],
2006
+ )
2007
+ self.docker_container = action_details["actionDetails"]["containerId"]
2008
+ cmd = "docker restart " + self.docker_container
2009
+ self.start(cmd, "inference_tracker_setup")
2010
+ return
1812
2011
 
1813
2012
  # This is the existing Docker run command
1814
2013
  worker_cmd = (
1815
2014
  f"docker run -d --pull=always --net=host "
2015
+ f"--cidfile ./{self.action_record_id}.cid "
1816
2016
  f"--name inference-tracker-worker "
1817
2017
  f"-v matrice_myvol:/matrice_data "
1818
2018
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -3,8 +3,10 @@
3
3
  import json
4
4
  import logging
5
5
  import os
6
+ import subprocess
6
7
  import threading
7
8
  import time
9
+ from kafka import KafkaProducer
8
10
  from matrice_compute.actions_manager import ActionsManager
9
11
  from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
10
12
  from matrice_compute.compute_operations_handler import ComputeOperationsHandler
@@ -15,6 +17,7 @@ from matrice_compute.instance_utils import (
15
17
  from matrice_compute.resources_tracker import (
16
18
  MachineResourcesTracker,
17
19
  ActionsResourcesTracker,
20
+ KafkaResourceMonitor,
18
21
  )
19
22
  from matrice_compute.scaling import Scaling
20
23
  from matrice_compute.shutdown_manager import ShutdownManager
@@ -38,6 +41,7 @@ class InstanceManager:
38
41
  env: str = "",
39
42
  gpus: str = "",
40
43
  workspace_dir: str = "matrice_workspace",
44
+ enable_kafka: bool = False,
41
45
  ):
42
46
  """Initialize an instance manager.
43
47
 
@@ -58,6 +62,7 @@ class InstanceManager:
58
62
  Defaults to empty string.
59
63
  workspace_dir (str): Directory for workspace files.
60
64
  Defaults to "matrice_workspace".
65
+ enable_kafka (bool): Enable Kafka communication (default False).
61
66
  """
62
67
  self.session = self._setup_env_credentials(
63
68
  env,
@@ -72,6 +77,7 @@ class InstanceManager:
72
77
  self.scaling = Scaling(
73
78
  self.session,
74
79
  os.environ.get("INSTANCE_ID"),
80
+ enable_kafka,
75
81
  )
76
82
  logging.info("InstanceManager initialized with scaling")
77
83
  jupyter_token = os.environ.get("JUPYTER_TOKEN")
@@ -92,6 +98,19 @@ class InstanceManager:
92
98
  self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
93
99
  logging.info("InstanceManager initialized with actions resources tracker")
94
100
 
101
+ # Initialize Kafka resource monitor using the same internal Kafka as scaling
102
+ try:
103
+ kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
104
+ self.kafka_resource_monitor = KafkaResourceMonitor(
105
+ instance_id=os.environ.get("INSTANCE_ID"),
106
+ kafka_bootstrap=kafka_bootstrap,
107
+ interval_seconds=60
108
+ )
109
+ logging.info("InstanceManager initialized with Kafka resource monitor using internal Kafka: %s", kafka_bootstrap)
110
+ except (ValueError, Exception) as e:
111
+ logging.warning("Failed to initialize Kafka resource monitor: %s", e)
112
+ self.kafka_resource_monitor = None
113
+
95
114
  # Initialize Compute Operations Handler for event-driven operations
96
115
  # Uses EventListener from matrice_common for simplified Kafka consumption
97
116
  try:
@@ -103,14 +122,30 @@ class InstanceManager:
103
122
  instance_id=instance_id
104
123
  )
105
124
  logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
106
- except Exception as e:
125
+ except (ValueError, Exception) as e:
107
126
  logging.warning("Failed to initialize Compute Operations Handler: %s", e)
108
127
  self.compute_operations_handler = None
109
128
 
110
129
  self.poll_interval = 10
111
130
  # Note: encryption_key is set in _setup_env_credentials
131
+
132
+ # Initialize container monitoring
133
+ self.container_monitor_thread = None
134
+ self.container_monitor_running = False
135
+ self.container_kafka_producer = None
136
+
112
137
  logging.info("InstanceManager initialized.")
113
138
 
139
+ # report the resources at startup
140
+ try:
141
+ self.scaling.report_architecture_info()
142
+ logging.info("InstanceManager reported initial resources.")
143
+ except Exception as exc:
144
+ logging.error(
145
+ "Error reporting initial resources: %s",
146
+ str(exc),
147
+ )
148
+
114
149
  @log_errors(default_return=None, raise_exception=True, log_error=True)
115
150
  def _setup_env_credentials(
116
151
  self,
@@ -245,13 +280,13 @@ class InstanceManager:
245
280
  # "Error in scale_down_manager auto_scaledown_actions: %s",
246
281
  # str(exc),
247
282
  # )
248
- try:
249
- self.machine_resources_tracker.update_available_resources()
250
- except Exception as exc:
251
- logging.error(
252
- "Error in machine_resources_tracker update_available_resources: %s",
253
- str(exc),
254
- )
283
+ # try:
284
+ # self.machine_resources_tracker.update_available_resources()
285
+ # except Exception as exc:
286
+ # logging.error(
287
+ # "Error in machine_resources_tracker update_available_resources: %s",
288
+ # str(exc),
289
+ # )
255
290
  try:
256
291
  self.actions_resources_tracker.update_actions_resources()
257
292
  except Exception as exc:
@@ -262,6 +297,130 @@ class InstanceManager:
262
297
 
263
298
  time.sleep(self.poll_interval)
264
299
 
300
+ @log_errors(raise_exception=False, log_error=True)
301
+ def start_container_status_monitor(self):
302
+ """Start the background container status monitoring."""
303
+ if self.container_monitor_running:
304
+ logging.info("Container status monitor is already running")
305
+ return
306
+
307
+ self.container_monitor_running = True
308
+ self.container_monitor_thread = threading.Thread(
309
+ target=self._container_status_monitor_worker,
310
+ daemon=True,
311
+ name="ContainerStatusMonitor"
312
+ )
313
+ self.container_monitor_thread.start()
314
+ logging.info("Started container status monitoring thread")
315
+
316
+ @log_errors(raise_exception=False, log_error=True)
317
+ def stop_container_status_monitor(self):
318
+ """Stop the background container status monitoring."""
319
+ if not self.container_monitor_running:
320
+ return
321
+
322
+ logging.info("Stopping container status monitor...")
323
+ self.container_monitor_running = False
324
+
325
+ if self.container_monitor_thread:
326
+ self.container_monitor_thread.join(timeout=10)
327
+
328
+ if self.container_kafka_producer:
329
+ self.container_kafka_producer.close()
330
+ self.container_kafka_producer = None
331
+
332
+ logging.info("Container status monitor stopped")
333
+
334
+ def _container_status_monitor_worker(self):
335
+ """Background worker function that monitors container status."""
336
+ # Initialize Kafka producer
337
+ try:
338
+ if self.scaling.enable_kafka:
339
+ bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
340
+ self.container_kafka_producer = KafkaProducer(
341
+ bootstrap_servers=bootstrap_servers,
342
+ value_serializer=lambda v: json.dumps(v).encode("utf-8"),
343
+ max_block_ms=5000 # Timeout if Kafka is down
344
+ )
345
+ logging.info("Container status monitor: Kafka producer initialized")
346
+ else:
347
+ logging.warning("Container status monitor: Kafka is disabled, no monitoring will be performed")
348
+ return
349
+ except Exception as e:
350
+ logging.error("Container status monitor: Failed to initialize Kafka producer: %s", str(e))
351
+ return
352
+
353
+ instance_id = os.environ.get("INSTANCE_ID")
354
+ topic_name = "compute_container_status"
355
+
356
+ logging.info("Container status monitor started for instance: %s", instance_id)
357
+
358
+ while self.container_monitor_running:
359
+ try:
360
+ # Get container status using docker ps -a
361
+ result = subprocess.run(
362
+ ["docker", "ps", "-a", "--format", "json"],
363
+ capture_output=True,
364
+ text=True,
365
+ timeout=30
366
+ )
367
+
368
+ if result.returncode != 0:
369
+ logging.error("Container status monitor: docker ps command failed: %s", result.stderr)
370
+ time.sleep(30) # Wait before retrying
371
+ continue
372
+
373
+ # Parse container information
374
+ containers = []
375
+ if result.stdout.strip():
376
+ for line in result.stdout.strip().split('\n'):
377
+ try:
378
+ container_info = json.loads(line)
379
+ containers.append({
380
+ "container_id": container_info.get("ID", ""),
381
+ "image": container_info.get("Image", ""),
382
+ "command": container_info.get("Command", ""),
383
+ "created": container_info.get("CreatedAt", ""),
384
+ "status": container_info.get("Status", ""),
385
+ "ports": container_info.get("Ports", ""),
386
+ "names": container_info.get("Names", ""),
387
+ "size": container_info.get("Size", ""),
388
+ "state": container_info.get("State", ""),
389
+ "labels": container_info.get("Labels", "")
390
+ })
391
+ except json.JSONDecodeError as e:
392
+ logging.warning("Container status monitor: Failed to parse container info: %s", str(e))
393
+ continue
394
+
395
+ # Prepare message for Kafka
396
+ status_message = {
397
+ "timestamp": time.time(),
398
+ "instance_id": instance_id,
399
+ "container_count": len(containers),
400
+ "containers": containers
401
+ }
402
+
403
+ # Send to Kafka
404
+ if self.container_kafka_producer:
405
+ try:
406
+ self.container_kafka_producer.send(topic_name, status_message)
407
+ logging.debug("Container status monitor: Sent status for %d containers", len(containers))
408
+ except Exception as e:
409
+ logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
410
+
411
+ except subprocess.TimeoutExpired:
412
+ logging.error("Container status monitor: docker ps command timed out")
413
+ except Exception as e:
414
+ logging.error("Container status monitor: Unexpected error: %s", str(e))
415
+
416
+ # Wait 30 seconds before next check
417
+ for _ in range(30):
418
+ if not self.container_monitor_running:
419
+ break
420
+ time.sleep(1)
421
+
422
+ logging.info("Container status monitor worker stopped")
423
+
265
424
  @log_errors(default_return=(None, None), raise_exception=True)
266
425
  def start(self) -> tuple:
267
426
  """Start the instance manager threads.
@@ -269,6 +428,14 @@ class InstanceManager:
269
428
  Returns:
270
429
  tuple: (instance_manager_thread, actions_manager_thread)
271
430
  """
431
+ # Start Kafka resource monitor in background thread
432
+ if self.kafka_resource_monitor:
433
+ try:
434
+ self.kafka_resource_monitor.start()
435
+ logging.info("Started Kafka resource monitor")
436
+ except Exception as exc:
437
+ logging.error("Failed to start Kafka resource monitor: %s", str(exc))
438
+
272
439
  # Start Compute Operations Handler in background thread
273
440
  if self.compute_operations_handler:
274
441
  try:
@@ -277,6 +444,13 @@ class InstanceManager:
277
444
  except Exception as exc:
278
445
  logging.error("Failed to start Compute Operations Handler: %s", str(exc))
279
446
 
447
+ # Start Container Status Monitor in background thread
448
+ try:
449
+ self.start_container_status_monitor()
450
+ logging.info("Started Container Status Monitor")
451
+ except Exception as exc:
452
+ logging.error("Failed to start Container Status Monitor: %s", str(exc))
453
+
280
454
  # Create and start threads
281
455
  instance_manager_thread = threading.Thread(
282
456
  target=self.start_instance_manager,