matrice-compute 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -495,6 +495,7 @@ class ActionInstance:
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
  *volumes,
  # Container configuration and startup commands
+ f"--cidfile ./{self.action_record_id}.cid ",
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
  f'/bin/bash -c "cd {docker_workdir} && '
  f"{env_exports} && "
@@ -895,6 +896,7 @@ class ActionInstance:
  """
  self.cmd = cmd
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
+
  with open(self.log_path, "wb") as out:
  self.process = subprocess.Popen(
  shlex.split(self.cmd),
@@ -903,6 +905,52 @@ class ActionInstance:
  env={**os.environ},
  start_new_session=True,
  )
+
+ self.container_id = None
+
+ cid_file_path = f"./{self.action_record_id}.cid"
+ max_retries = 5
+ retry_delay = 1 # seconds
+ for attempt in range(max_retries):
+ try:
+ with open(cid_file_path, "r") as cid_file:
+ container_id = cid_file.read().strip()
+ self.container_id = container_id
+ logging.info(
+ "Started process for action %s with container ID: %s",
+ self.action_record_id,
+ self.container_id,
+ )
+ break
+ except FileNotFoundError:
+ logging.warning(
+ "CID file not found for action %s, attempt %d/%d",
+ self.action_record_id,
+ attempt + 1,
+ max_retries,
+ )
+ time.sleep(retry_delay)
+ except Exception as e:
+ logging.error(
+ "Error reading CID file for action %s: %s",
+ self.action_record_id,
+ str(e),
+ )
+ time.sleep(retry_delay)
+ else:
+ logging.error(
+ "Failed to read CID file for action %s after %d attempts",
+ self.action_record_id,
+ max_retries,
+ )
+ raise Exception("Failed to start process: CID file not found")
+
+ # report container id to scaling service
+ self.scaling.update_action_container_id(
+ action_record_id=self.action_record_id,
+ container_id=self.container_id,
+ )
+

  @log_errors(raise_exception=False)
  def start_logger(self):
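The additions above all follow one pattern: the `docker run` command is given `--cidfile ./<action_record_id>.cid`, and after the process is launched the file is polled until Docker writes the container ID into it, which is then reported via `self.scaling.update_action_container_id(...)`. A minimal standalone sketch of that pattern (the function name, image argument, and retry values here are illustrative, not part of the package):

```python
import shlex
import subprocess
import time

def launch_and_read_cid(action_record_id: str, image: str,
                        max_retries: int = 5, retry_delay: float = 1.0) -> str:
    """Sketch: start a container with --cidfile and poll the file for its ID."""
    cid_path = f"./{action_record_id}.cid"
    # docker run refuses to start if the cidfile already exists, so clear stale files first
    subprocess.run(["rm", "-f", cid_path], check=False)
    cmd = f"docker run -d --cidfile {cid_path} {shlex.quote(image)}"
    subprocess.Popen(shlex.split(cmd), start_new_session=True)

    for _ in range(max_retries):
        try:
            with open(cid_path, "r") as cid_file:
                container_id = cid_file.read().strip()
            if container_id:  # the file can exist briefly before the ID is written
                return container_id
        except FileNotFoundError:
            pass  # Docker has not created the file yet; retry
        time.sleep(retry_delay)
    raise RuntimeError(f"CID file {cid_path} was not populated after {max_retries} attempts")
```

The released code keeps the retry loop inline in `start()` and raises after exhausting the attempts, as shown in the hunk above.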
@@ -1172,11 +1220,27 @@ def database_setup_execute(self: ActionInstance):

  project_id = action_details["_idProject"]

+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference tracker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "qdrant_setup")
+
+ #qdrant restart
+ qdrant_cmd = "docker restart qdrant"
+ self.start(qdrant_cmd, 'qdrant_setup')
+
+ return
+
  # MongoDB container with --net=host (Port: 27020:27017)
  cmd = (
  f"docker run --pull=always --net=host "
  f"--name mongodbdatabase "
  f"-v matrice_myvol:/matrice_data "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-e ACTION_RECORD_ID={self.action_record_id} "
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1215,11 +1279,23 @@ def facial_recognition_setup_execute(self: ActionInstance):

  self.setup_action_requirements(action_details)

+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for facial recognition worker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "facial_recognition_setup")
+ return
+
  # Facial recognition worker container with --net=host (Port: 8081)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
  f"--name worker "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-v matrice_myvol:/matrice_data "
+ f"--cidfile ./{self.action_record_id}.cid "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1245,10 +1321,21 @@ def lpr_setup_execute(self: ActionInstance):

  self.setup_action_requirements(action_details)

+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for LPR worker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "lpr_setup")
+ return
+
  # LPR worker container with --net=host (Port: 8082)
  worker_cmd = (
  f"docker run -d --net=host --pull=always "
  f"--name lpr-worker "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-v matrice_myvol:/matrice_data "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1285,10 +1372,21 @@ def inference_ws_server_execute(self: ActionInstance):

  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference WebSocket server: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "inference_ws_server")
+ return
+
  # Inference WebSocket server with --net=host (Port: 8102)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
  f"--name inference "
+ f"--cidfile ./{self.action_record_id}.cid "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,11 +1419,22 @@ def fe_fs_streaming_execute(self: ActionInstance):
  ws_url = f"{ws_host}:8102"

  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for frontend streaming: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "fe_fs_streaming")
+ return

  # Frontend streaming with --net=host (Port: 3000)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
  f"--name fe_streaming "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-v matrice_myvol:/matrice_data "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1355,11 +1464,22 @@ def fe_analytics_service_execute(self: ActionInstance):
  self.setup_action_requirements(action_details)

  project_id = action_details["_idProject"]
+
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for frontend analytics service: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "fe_analytics_service")
+ return

  # Frontend analytics service with --net=host (Port: 3001)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
  f"--name fe-analytics "
+ f"--cidfile ./{self.action_record_id}.cid "
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1447,11 +1567,27 @@ def redis_setup_execute(self: ActionInstance):
  logging.info(f"Redis will use IP: {redis_host} on port 6379")

  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
+
+
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for redis management: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "redis_setup")
+
+ # Redis container restart
+ redis_restart_cmd = "docker restart redis_container"
+ self.start(redis_restart_cmd, "redis")
+
+ return

  # Redis container with --net=host (Port: 6379)
  redis_cmd = (
  f"docker run -d --net=host "
- f"--name redis_container_{int(time.time())} "
+ f"--name redis_container"
  f"--restart unless-stopped "
  f"--memory=32g "
  f"--cpus=8 "
@@ -1496,6 +1632,7 @@ def redis_setup_execute(self: ActionInstance):
  # bg-redis management container with --net=host (Port: 8082)
  cmd = (
  f"docker run --net=host "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1592,6 +1729,17 @@ def model_train_execute(self: ActionInstance):
  model_family=model_family,
  action_id=action_id,
  )
+
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "train_log")
+ return
+
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "train_log")
@@ -1613,6 +1761,16 @@ def model_eval_execute(self: ActionInstance):
  model_family=model_family,
  action_id=action_id,
  )
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "eval_log")
+ return
+
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "eval_log")
@@ -1637,6 +1795,16 @@ def model_export_execute(self: ActionInstance):
  model_family=model_family,
  action_id=action_id,
  )
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "export_log")
+ return
+
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "export_log")
@@ -1681,6 +1849,16 @@ def streaming_gateway_execute(self: ActionInstance):
  self.docker_container = (
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
  )
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "streaming_gateway")
+ return
+
  cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "streaming_gateway")
@@ -1775,6 +1953,17 @@ def kafka_setup_execute(self: ActionInstance):
  else:
  pkgs = f"matrice_common matrice"

+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "kafka_setup")
+ return
+
+
  # Kafka container with --net=host (Ports: 9092, 9093)
  cmd = (
  f"docker run --net=host "
@@ -1809,10 +1998,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
  image = self.docker_container

  self.setup_action_requirements(action_details)
+
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference tracker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "inference_tracker_setup")
+ return

  # This is the existing Docker run command
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"--name inference-tracker-worker "
  f"-v matrice_myvol:/matrice_data "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -3,8 +3,10 @@
  import json
  import logging
  import os
+ import subprocess
  import threading
  import time
+ from kafka import KafkaProducer
  from matrice_compute.actions_manager import ActionsManager
  from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
  from matrice_compute.compute_operations_handler import ComputeOperationsHandler
@@ -15,6 +17,7 @@ from matrice_compute.instance_utils import (
  from matrice_compute.resources_tracker import (
  MachineResourcesTracker,
  ActionsResourcesTracker,
+ KafkaResourceMonitor,
  )
  from matrice_compute.scaling import Scaling
  from matrice_compute.shutdown_manager import ShutdownManager
@@ -92,6 +95,19 @@ class InstanceManager:
  self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
  logging.info("InstanceManager initialized with actions resources tracker")

+ # Initialize Kafka resource monitor using the same internal Kafka as scaling
+ try:
+ kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
+ self.kafka_resource_monitor = KafkaResourceMonitor(
+ instance_id=os.environ.get("INSTANCE_ID"),
+ kafka_bootstrap=kafka_bootstrap,
+ interval_seconds=60
+ )
+ logging.info("InstanceManager initialized with Kafka resource monitor using internal Kafka: %s", kafka_bootstrap)
+ except (ValueError, Exception) as e:
+ logging.warning("Failed to initialize Kafka resource monitor: %s", e)
+ self.kafka_resource_monitor = None
+
  # Initialize Compute Operations Handler for event-driven operations
  # Uses EventListener from matrice_common for simplified Kafka consumption
  try:
@@ -103,14 +119,30 @@ class InstanceManager:
  instance_id=instance_id
  )
  logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
- except Exception as e:
+ except (ValueError, Exception) as e:
  logging.warning("Failed to initialize Compute Operations Handler: %s", e)
  self.compute_operations_handler = None

  self.poll_interval = 10
  # Note: encryption_key is set in _setup_env_credentials
+
+ # Initialize container monitoring
+ self.container_monitor_thread = None
+ self.container_monitor_running = False
+ self.container_kafka_producer = None
+
  logging.info("InstanceManager initialized.")

+ # report the resources at startup
+ try:
+ self.scaling.report_architecture_info()
+ logging.info("InstanceManager reported initial resources.")
+ except Exception as exc:
+ logging.error(
+ "Error reporting initial resources: %s",
+ str(exc),
+ )
+
  @log_errors(default_return=None, raise_exception=True, log_error=True)
  def _setup_env_credentials(
  self,
@@ -245,13 +277,13 @@ class InstanceManager:
  # "Error in scale_down_manager auto_scaledown_actions: %s",
  # str(exc),
  # )
- try:
- self.machine_resources_tracker.update_available_resources()
- except Exception as exc:
- logging.error(
- "Error in machine_resources_tracker update_available_resources: %s",
- str(exc),
- )
+ # try:
+ # self.machine_resources_tracker.update_available_resources()
+ # except Exception as exc:
+ # logging.error(
+ # "Error in machine_resources_tracker update_available_resources: %s",
+ # str(exc),
+ # )
  try:
  self.actions_resources_tracker.update_actions_resources()
  except Exception as exc:
@@ -262,6 +294,130 @@ class InstanceManager:

  time.sleep(self.poll_interval)

+ @log_errors(raise_exception=False, log_error=True)
+ def start_container_status_monitor(self):
+ """Start the background container status monitoring."""
+ if self.container_monitor_running:
+ logging.info("Container status monitor is already running")
+ return
+
+ self.container_monitor_running = True
+ self.container_monitor_thread = threading.Thread(
+ target=self._container_status_monitor_worker,
+ daemon=True,
+ name="ContainerStatusMonitor"
+ )
+ self.container_monitor_thread.start()
+ logging.info("Started container status monitoring thread")
+
+ @log_errors(raise_exception=False, log_error=True)
+ def stop_container_status_monitor(self):
+ """Stop the background container status monitoring."""
+ if not self.container_monitor_running:
+ return
+
+ logging.info("Stopping container status monitor...")
+ self.container_monitor_running = False
+
+ if self.container_monitor_thread:
+ self.container_monitor_thread.join(timeout=10)
+
+ if self.container_kafka_producer:
+ self.container_kafka_producer.close()
+ self.container_kafka_producer = None
+
+ logging.info("Container status monitor stopped")
+
+ def _container_status_monitor_worker(self):
+ """Background worker function that monitors container status."""
+ # Initialize Kafka producer
+ try:
+ if self.scaling.enable_kafka:
+ bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
+ self.container_kafka_producer = KafkaProducer(
+ bootstrap_servers=bootstrap_servers,
+ value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+ max_block_ms=5000 # Timeout if Kafka is down
+ )
+ logging.info("Container status monitor: Kafka producer initialized")
+ else:
+ logging.warning("Container status monitor: Kafka is disabled, no monitoring will be performed")
+ return
+ except Exception as e:
+ logging.error("Container status monitor: Failed to initialize Kafka producer: %s", str(e))
+ return
+
+ instance_id = os.environ.get("INSTANCE_ID")
+ topic_name = "compute_container_status"
+
+ logging.info("Container status monitor started for instance: %s", instance_id)
+
+ while self.container_monitor_running:
+ try:
+ # Get container status using docker ps -a
+ result = subprocess.run(
+ ["docker", "ps", "-a", "--format", "json"],
+ capture_output=True,
+ text=True,
+ timeout=30
+ )
+
+ if result.returncode != 0:
+ logging.error("Container status monitor: docker ps command failed: %s", result.stderr)
+ time.sleep(30) # Wait before retrying
+ continue
+
+ # Parse container information
+ containers = []
+ if result.stdout.strip():
+ for line in result.stdout.strip().split('\n'):
+ try:
+ container_info = json.loads(line)
+ containers.append({
+ "container_id": container_info.get("ID", ""),
+ "image": container_info.get("Image", ""),
+ "command": container_info.get("Command", ""),
+ "created": container_info.get("CreatedAt", ""),
+ "status": container_info.get("Status", ""),
+ "ports": container_info.get("Ports", ""),
+ "names": container_info.get("Names", ""),
+ "size": container_info.get("Size", ""),
+ "state": container_info.get("State", ""),
+ "labels": container_info.get("Labels", "")
+ })
+ except json.JSONDecodeError as e:
+ logging.warning("Container status monitor: Failed to parse container info: %s", str(e))
+ continue
+
+ # Prepare message for Kafka
+ status_message = {
+ "timestamp": time.time(),
+ "instance_id": instance_id,
+ "container_count": len(containers),
+ "containers": containers
+ }
+
+ # Send to Kafka
+ if self.container_kafka_producer:
+ try:
+ self.container_kafka_producer.send(topic_name, status_message)
+ logging.debug("Container status monitor: Sent status for %d containers", len(containers))
+ except Exception as e:
+ logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
+
+ except subprocess.TimeoutExpired:
+ logging.error("Container status monitor: docker ps command timed out")
+ except Exception as e:
+ logging.error("Container status monitor: Unexpected error: %s", str(e))
+
+ # Wait 30 seconds before next check
+ for _ in range(30):
+ if not self.container_monitor_running:
+ break
+ time.sleep(1)
+
+ logging.info("Container status monitor worker stopped")
+
  @log_errors(default_return=(None, None), raise_exception=True)
  def start(self) -> tuple:
  """Start the instance manager threads.
@@ -269,6 +425,14 @@ class InstanceManager:
  Returns:
  tuple: (instance_manager_thread, actions_manager_thread)
  """
+ # Start Kafka resource monitor in background thread
+ if self.kafka_resource_monitor:
+ try:
+ self.kafka_resource_monitor.start()
+ logging.info("Started Kafka resource monitor")
+ except Exception as exc:
+ logging.error("Failed to start Kafka resource monitor: %s", str(exc))
+
  # Start Compute Operations Handler in background thread
  if self.compute_operations_handler:
  try:
@@ -277,6 +441,13 @@ class InstanceManager:
  except Exception as exc:
  logging.error("Failed to start Compute Operations Handler: %s", str(exc))

+ # Start Container Status Monitor in background thread
+ try:
+ self.start_container_status_monitor()
+ logging.info("Started Container Status Monitor")
+ except Exception as exc:
+ logging.error("Failed to start Container Status Monitor: %s", str(exc))
+
  # Create and start threads
  instance_manager_thread = threading.Thread(
  target=self.start_instance_manager,