matrice-compute 0.1.41-py3-none-any.whl → 0.1.43-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +11 -67
- matrice_compute/instance_manager.py +59 -0
- matrice_compute/resources_tracker.py +244 -8
- {matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/RECORD +8 -8
- {matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/top_level.txt +0 -0
**matrice_compute/action_instance.py**

```diff
@@ -433,10 +433,9 @@ class ActionInstance:
         cmd_parts = [
             f"docker run -d {use_gpu} {use_restart_policy} ",
             network_config,
+            f"--name {self.action_record_id}_{self.action_type} ",
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
-            # Container configuration and startup commands
-            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
```
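For context, the assembled `cmd_parts` list becomes a single shell command, with environment values passed through `shlex.quote` so spaces and metacharacters survive as one argument. A minimal, hypothetical sketch of that pattern (the values and the join step are illustrative assumptions, not code from the package):

```python
import shlex

# Stand-ins for self.action_record_id, self.action_type and env_vars (hypothetical values).
action_record_id = "6637f1c2"
action_type = "model_train"
env_vars = {"ENV": "prod", "EXTRA_ARGS": "--epochs 10 && echo unsafe"}

cmd_parts = [
    "docker run -d",
    f"--name {action_record_id}_{action_type}",
    # shlex.quote keeps values containing spaces or shell metacharacters as a single token
    *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
    "busybox sleep 60",
]

print(" ".join(cmd_parts))  # the single shell command string that would be executed
```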
```diff
@@ -846,51 +845,6 @@ class ActionInstance:
             env={**os.environ},
             start_new_session=True,
         )
-
-        self.container_id = None
-
-        cid_file_path = f"./{self.action_record_id}.cid"
-        max_retries = 5
-        retry_delay = 1  # seconds
-        for attempt in range(max_retries):
-            try:
-                with open(cid_file_path, "r") as cid_file:
-                    container_id = cid_file.read().strip()
-                self.container_id = container_id
-                logging.info(
-                    "Started process for action %s with container ID: %s",
-                    self.action_record_id,
-                    self.container_id,
-                )
-                break
-            except FileNotFoundError:
-                logging.warning(
-                    "CID file not found for action %s, attempt %d/%d",
-                    self.action_record_id,
-                    attempt + 1,
-                    max_retries,
-                )
-                time.sleep(retry_delay)
-            except Exception as e:
-                logging.error(
-                    "Error reading CID file for action %s: %s",
-                    self.action_record_id,
-                    str(e),
-                )
-                time.sleep(retry_delay)
-        else:
-            logging.error(
-                "Failed to read CID file for action %s after %d attempts",
-                self.action_record_id,
-                max_retries,
-            )
-            raise Exception("Failed to start process: CID file not found")
-
-        # report container id to scaling service
-        self.scaling.update_action_container_id(
-            action_record_id=self.action_record_id,
-            container_id=self.container_id,
-        )


    @log_errors(raise_exception=False)
```
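With each container now given a deterministic `--name`, the retry loop that polled a `--cidfile` on disk is no longer needed: the container can be resolved directly from the Docker daemon. A minimal sketch of that lookup using the Docker SDK (a hypothetical helper, not necessarily how the package now reports IDs to the scaling service):

```python
import docker


def resolve_container_id(action_record_id: str, action_type: str) -> str:
    """Resolve a container ID from the deterministic name used by the new docker run commands."""
    client = docker.from_env()
    # containers.get() accepts either a container ID or a container name
    container = client.containers.get(f"{action_record_id}_{action_type}")
    return container.id
```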
```diff
@@ -1184,9 +1138,8 @@ def database_setup_execute(self: ActionInstance):
     cmd = (
         f"docker run --pull=always --net=host "
         f"-v {dbPath}:{dbPath} "
-        f"--name
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
-        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
```
```diff
@@ -1228,10 +1181,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v matrice_myvol:/matrice_data "
-        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
```
```diff
@@ -1271,8 +1222,7 @@ def lpr_setup_execute(self: ActionInstance):
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
```diff
@@ -1323,8 +1273,7 @@ def inference_ws_server_execute(self: ActionInstance):
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {self.action_record_id}_{self.action_type} "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
```
```diff
@@ -1372,8 +1321,7 @@ def fe_fs_streaming_execute(self: ActionInstance):
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
```diff
@@ -1418,8 +1366,7 @@ def fe_analytics_service_execute(self: ActionInstance):
     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {self.action_record_id}_{self.action_type} "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
```
```diff
@@ -1520,7 +1467,7 @@ def redis_setup_execute(self: ActionInstance):
     self.start(cmd, "redis_setup")

     # Redis container restart
-    redis_restart_cmd = "docker restart
+    redis_restart_cmd = f"docker restart {self.action_record_id}_{self.action_type}_redis_container"
     self.start(redis_restart_cmd, "redis")

     return
```
```diff
@@ -1528,7 +1475,7 @@ def redis_setup_execute(self: ActionInstance):
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name {self.action_record_id}_{self.action_type}_redis_container "
         f"--restart unless-stopped "
         f"{redis_image} "
         f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
```
```diff
@@ -1556,7 +1503,6 @@ def redis_setup_execute(self: ActionInstance):
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
         f"docker run --net=host "
-        f"--cidfile ./{self.action_record_id}.cid "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
```
```diff
@@ -1921,8 +1867,7 @@ def inference_tracker_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--
-        f"--name inference-tracker-worker "
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
```diff
@@ -1963,8 +1908,7 @@ def video_storage_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name media_server "
+        f"--name {self.action_record_id}_{self.action_type} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
**matrice_compute/instance_manager.py**

```diff
@@ -18,6 +18,7 @@ from matrice_compute.resources_tracker import (
     MachineResourcesTracker,
     ActionsResourcesTracker,
     KafkaResourceMonitor,
+    ContainerResourceMonitor,
 )
 from matrice_compute.scaling import Scaling
 from matrice_compute.shutdown_manager import ShutdownManager
```
```diff
@@ -111,6 +112,19 @@ class InstanceManager:
             logging.warning("Failed to initialize Kafka resource monitor: %s", e)
             self.kafka_resource_monitor = None

+        # Initialize Container resource monitor using the same internal Kafka as scaling
+        try:
+            kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
+            self.container_resource_monitor = ContainerResourceMonitor(
+                instance_id=os.environ.get("INSTANCE_ID"),
+                kafka_bootstrap=kafka_bootstrap,
+                interval_seconds=30
+            )
+            logging.info("InstanceManager initialized with Container resource monitor using internal Kafka: %s", kafka_bootstrap)
+        except (ValueError, Exception) as e:
+            logging.warning("Failed to initialize Container resource monitor: %s", e)
+            self.container_resource_monitor = None
+
         # Initialize Compute Operations Handler for event-driven operations
         # Uses EventListener from matrice_common for simplified Kafka consumption
         try:
```
```diff
@@ -436,6 +450,14 @@ class InstanceManager:
         except Exception as exc:
             logging.error("Failed to start Kafka resource monitor: %s", str(exc))

+        # Start Container resource monitor in background thread
+        if self.container_resource_monitor:
+            try:
+                self.container_resource_monitor.start()
+                logging.info("Started Container resource monitor")
+            except Exception as exc:
+                logging.error("Failed to start Container resource monitor: %s", str(exc))
+
         # Start Compute Operations Handler in background thread
         if self.compute_operations_handler:
             try:
```
```diff
@@ -468,3 +490,40 @@ class InstanceManager:
             instance_manager_thread,
             actions_manager_thread,
         )
+
+    def stop(self):
+        """Stop all background threads and cleanup resources."""
+        logging.info("Stopping InstanceManager...")
+
+        # Stop Container resource monitor
+        if hasattr(self, 'container_resource_monitor') and self.container_resource_monitor:
+            try:
+                self.container_resource_monitor.stop()
+                logging.info("Stopped Container resource monitor")
+            except Exception as exc:
+                logging.error("Failed to stop Container resource monitor: %s", str(exc))
+
+        # Stop Kafka resource monitor
+        if hasattr(self, 'kafka_resource_monitor') and self.kafka_resource_monitor:
+            try:
+                self.kafka_resource_monitor.stop()
+                logging.info("Stopped Kafka resource monitor")
+            except Exception as exc:
+                logging.error("Failed to stop Kafka resource monitor: %s", str(exc))
+
+        # Stop compute operations handler
+        if hasattr(self, 'compute_operations_handler') and self.compute_operations_handler:
+            try:
+                self.compute_operations_handler.stop()
+                logging.info("Stopped Compute Operations Handler")
+            except Exception as exc:
+                logging.error("Failed to stop Compute Operations Handler: %s", str(exc))
+
+        # Stop container status monitor
+        try:
+            self.stop_container_status_monitor()
+            logging.info("Stopped Container Status Monitor")
+        except Exception as exc:
+            logging.error("Failed to stop Container Status Monitor: %s", str(exc))
+
+        logging.info("InstanceManager stopped")
```
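The new `stop()` method gives callers a single shutdown path for the container monitor, Kafka monitor, operations handler, and status monitor. One way it might be wired into process shutdown, sketched under the assumption that signal-based teardown is desired (the package may instead rely on its own `ShutdownManager`):

```python
import signal
import sys


def install_shutdown_handlers(manager: "InstanceManager") -> None:
    """Call manager.stop() when the process receives SIGTERM or SIGINT (hypothetical wiring)."""

    def _handle(signum, frame):
        manager.stop()  # stops container/Kafka monitors and the operations handler
        sys.exit(0)

    signal.signal(signal.SIGTERM, _handle)
    signal.signal(signal.SIGINT, _handle)
```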
**matrice_compute/resources_tracker.py**

```diff
@@ -1240,6 +1240,239 @@ class MachineResourcesTracker:
         )


+class ContainerResourceMonitor:
+    """
+    Monitors individual container resource utilization and publishes to Kafka.
+    This thread runs independently and reports CPU, memory, and GPU usage for all running containers.
+    """
+
+    def __init__(
+        self,
+        instance_id: Optional[str] = None,
+        kafka_bootstrap: Optional[str] = None,
+        interval_seconds: int = 30,
+    ):
+        """
+        Initialize ContainerResourceMonitor.
+
+        Args:
+            instance_id: Instance identifier for Kafka topic. Defaults to INSTANCE_ID env var.
+            kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
+            interval_seconds: Interval between container checks in seconds. Defaults to 30.
+        """
+        self.instance_id = instance_id or os.getenv("INSTANCE_ID")
+        if not self.instance_id:
+            raise ValueError("instance_id must be provided or INSTANCE_ID env var must be set")
+
+        if not kafka_bootstrap:
+            raise ValueError("kafka_bootstrap must be provided - use Scaling.get_kafka_bootstrap_servers() to get internal Kafka config")
+
+        self.kafka_bootstrap = kafka_bootstrap
+        self.interval_seconds = interval_seconds
+        self.topic_name = "instance_resource_utilisation"
+
+        self._stop_event = threading.Event()
+        self._monitor_thread: Optional[threading.Thread] = None
+        self._producer = None
+        self._is_running = False
+        self._docker_client = None
+        self._resources_tracker = ResourcesTracker()
+
+    def _get_all_running_containers(self) -> List[docker.models.containers.Container]:
+        """
+        Get all running Docker containers.
+
+        Returns:
+            List[docker.models.containers.Container]: List of running containers
+        """
+        try:
+            if not self._docker_client:
+                self._docker_client = docker.from_env()
+
+            containers = self._docker_client.containers.list(filters={"status": "running"})
+            return containers
+        except Exception as e:
+            logging.debug("Error getting running containers: %s", e)
+            return []
+
+    def _collect_container_resources(self, container: docker.models.containers.Container) -> Dict:
+        """
+        Collect resource usage for a single container.
+
+        Args:
+            container: Docker container instance
+
+        Returns:
+            Dict: Container resource data
+        """
+        try:
+            container_id = container.id
+            container_name = container.name
+
+            # Get CPU and memory usage
+            cpu_util, memory_mb = self._resources_tracker.get_container_cpu_and_memory(container)
+
+            # Get GPU usage (utilization and memory)
+            gpu_util, gpu_memory_mb = self._resources_tracker.get_container_gpu_info(container_id)
+
+            return {
+                "container_id": container_id,
+                "container_name": container_name,
+                "cpu_utilization_percent": round(cpu_util, 2),
+                "memory_usage_mb": round(memory_mb, 2),
+                "gpu_utilization_percent": round(gpu_util, 2),
+                "gpu_memory_usage_mb": gpu_memory_mb,
+                "timestamp": datetime.now(timezone.utc).isoformat()
+            }
+        except Exception as e:
+            logging.debug("Error collecting resources for container %s: %s", container.name, e)
+            return None
+
+    def _monitor_worker(self):
+        """
+        Worker function that runs in a separate thread to monitor containers and publish to Kafka.
+        """
+        try:
+            from kafka import KafkaProducer
+
+            self._producer = KafkaProducer(
+                bootstrap_servers=self.kafka_bootstrap,
+                value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+                retries=5,
+            )
+            logging.info("Container resource monitor started. Publishing to topic: %s", self.topic_name)
+
+        except ImportError:
+            logging.error("kafka-python not installed. Install with: pip install kafka-python")
+            return
+        except Exception as e:
+            logging.error("Failed to initialize Kafka producer for container monitor: %s", e)
+            return
+
+        while not self._stop_event.is_set():
+            try:
+                # Get all running containers
+                containers = self._get_all_running_containers()
+
+                if not containers:
+                    logging.debug("No running containers found")
+                else:
+                    container_data = []
+
+                    # Collect resources for each container
+                    for container in containers:
+                        resource_data = self._collect_container_resources(container)
+                        if resource_data:
+                            container_data.append(resource_data)
+
+                    # Create the payload with instance information and all container data
+                    payload = {
+                        "instance_id": self.instance_id,
+                        "container_count": len(container_data),
+                        "containers": container_data,
+                        "timestamp": datetime.now(timezone.utc).isoformat()
+                    }
+
+                    # Send to Kafka topic
+                    self._producer.send(self.topic_name, payload)
+                    self._producer.flush()
+
+                    logging.debug("Published container resource stats for %d containers", len(container_data))
+
+            except Exception as e:
+                logging.error("Error in container resource monitor loop: %s", e)
+
+            # Wait for interval or until stop event is set
+            if self._stop_event.wait(self.interval_seconds):
+                break
+
+        # Cleanup
+        if self._producer:
+            try:
+                self._producer.close()
+            except Exception as e:
+                logging.debug("Error closing Kafka producer: %s", e)
+
+        if self._docker_client:
+            try:
+                self._docker_client.close()
+            except Exception as e:
+                logging.debug("Error closing Docker client: %s", e)
+
+        logging.info("Container resource monitor stopped.")
+
+    @log_errors(raise_exception=False, log_error=True)
+    def start(self):
+        """
+        Start the container resource monitoring thread.
+
+        Returns:
+            bool: True if started successfully, False otherwise.
+        """
+        if self._is_running:
+            logging.warning("Container resource monitor is already running.")
+            return False
+
+        self._stop_event.clear()
+        self._monitor_thread = threading.Thread(
+            target=self._monitor_worker,
+            daemon=True,
+            name="ContainerResourceMonitor"
+        )
+        self._monitor_thread.start()
+        self._is_running = True
+
+        logging.info("Started container resource monitor thread.")
+        return True
+
+    @log_errors(raise_exception=False, log_error=True)
+    def stop(self, timeout: int = 10):
+        """
+        Stop the container resource monitoring thread gracefully.
+
+        Args:
+            timeout: Maximum time to wait for thread to stop in seconds.
+
+        Returns:
+            bool: True if stopped successfully, False otherwise.
+        """
+        if not self._is_running:
+            logging.warning("Container resource monitor is not running.")
+            return False
+
+        logging.info("Stopping container resource monitor...")
+        self._stop_event.set()
+
+        if self._monitor_thread and self._monitor_thread.is_alive():
+            self._monitor_thread.join(timeout=timeout)
+
+            if self._monitor_thread.is_alive():
+                logging.error("Container resource monitor thread did not stop within timeout.")
+                return False
+
+        self._is_running = False
+        logging.info("Container resource monitor stopped successfully.")
+        return True
+
+    def is_running(self) -> bool:
+        """
+        Check if the container resource monitor is currently running.
+
+        Returns:
+            bool: True if running, False otherwise.
+        """
+        return self._is_running
+
+    def __enter__(self):
+        """Context manager entry."""
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.stop()
+
+
 class KafkaResourceMonitor:
     """
     Monitors system resources and publishes them to Kafka in a separate thread.
```
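Because the class exposes `start()`/`stop()` and context-manager hooks, it can also be exercised outside `InstanceManager`. A minimal usage sketch, assuming a reachable Kafka broker at `localhost:9092` (the broker address and instance ID are placeholders, not values from the package):

```python
import json

from kafka import KafkaConsumer  # kafka-python, the same client the monitor uses

monitor = ContainerResourceMonitor(
    instance_id="instance-123",
    kafka_bootstrap="localhost:9092",
    interval_seconds=30,
)

with monitor:  # __enter__ starts the background thread, __exit__ stops it
    consumer = KafkaConsumer(
        "instance_resource_utilisation",
        bootstrap_servers="localhost:9092",
        value_deserializer=lambda v: json.loads(v.decode("utf-8")),
        consumer_timeout_ms=60_000,
    )
    for message in consumer:
        stats = message.value
        print(stats["instance_id"], stats["container_count"], len(stats["containers"]))
        break
```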
```diff
@@ -1310,12 +1543,12 @@ class KafkaResourceMonitor:
         return gpu_usage

     @staticmethod
-    def get_all_storage_info() -> Dict[str,
+    def get_all_storage_info() -> Dict[str, tuple]:
         """
-        Get
+        Get storage information for all mounted drives.

         Returns:
-            Dict[str,
+            Dict[str, tuple]: Dictionary mapping mount point to (free_gb, total_gb).
         """
         storage_info = {}

```
```diff
@@ -1330,8 +1563,9 @@ class KafkaResourceMonitor:

                 # Convert bytes to GB
                 free_gb = usage.free / (1024 ** 3)
+                total_gb = usage.total / (1024 ** 3)

-                storage_info[partition.mountpoint] = round(free_gb, 2)
+                storage_info[partition.mountpoint] = (round(free_gb, 2), round(total_gb, 2))

             except PermissionError:
                 # Skip drives that we can't access (common on Windows)
```
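The storage helper now reports `(free_gb, total_gb)` per mount point instead of free space alone. A self-contained approximation of the new behaviour, based on the psutil calls visible in this hunk (details such as partition filtering may differ in the actual method):

```python
from typing import Dict, Tuple

import psutil


def all_storage_info() -> Dict[str, Tuple[float, float]]:
    """Return {mount_point: (free_gb, total_gb)} for accessible partitions."""
    storage_info: Dict[str, Tuple[float, float]] = {}
    for partition in psutil.disk_partitions():
        try:
            usage = psutil.disk_usage(partition.mountpoint)
        except PermissionError:
            # Skip drives that we can't access (common on Windows)
            continue
        free_gb = usage.free / (1024 ** 3)
        total_gb = usage.total / (1024 ** 3)
        storage_info[partition.mountpoint] = (round(free_gb, 2), round(total_gb, 2))
    return storage_info


print(all_storage_info())
```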
```diff
@@ -1347,13 +1581,13 @@ class KafkaResourceMonitor:

         return storage_info

-    def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str,
+    def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
         """
         Collect current system resource statistics.

         Returns:
-            Tuple[float, int, float, float, Dict[int, tuple], Dict[str,
-                CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total),
+            Tuple[float, int, float, float, Dict[int, tuple], Dict[str, tuple]]:
+                CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total), Storage dict (free, total)
         """
         cpu_usage = psutil.cpu_percent(interval=1)
         cpu_cores = psutil.cpu_count(logical=True)  # Total logical CPU cores
```
```diff
@@ -1394,6 +1628,8 @@ class KafkaResourceMonitor:

         # Format GPU info for output: {0: {"used_gb": x, "total_gb": y}, ...}
         gpu_memory_gb = {k: {"used_gb": v[0], "total_gb": v[1]} for k, v in gpus.items()}
+        # Format storage info for output: {"/": {"free_gb": x, "total_gb": y}, ...}
+        storage_gb = {k: {"free_gb": v[0], "total_gb": v[1]} for k, v in storage.items()}
         payload = {
             "instance_id": self.instance_id,
             "cpu_usage_percent": round(cpu, 2),
```
```diff
@@ -1401,7 +1637,7 @@ class KafkaResourceMonitor:
             "ram_total_gb": round(total, 2),
             "ram_used_gb": round(used, 2),
             "gpu_memory_gb": gpu_memory_gb,  # dict: {0: {used_gb, total_gb}, ...}
-            "
+            "storage_gb": storage_gb,  # dict: {"/": {free_gb, total_gb}, ...}
             "timestamp": datetime.now(timezone.utc).isoformat()
         }

```
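Putting the pieces together, the published resource payload now carries per-mount totals alongside free space. An illustrative shape with made-up values (keys not shown in this diff are elided):

```python
payload = {
    "instance_id": "instance-123",
    "cpu_usage_percent": 37.5,
    "ram_total_gb": 62.71,
    "ram_used_gb": 24.03,
    "gpu_memory_gb": {0: {"used_gb": 10.2, "total_gb": 24.0}},
    "storage_gb": {"/": {"free_gb": 211.4, "total_gb": 512.0}},
    "timestamp": "2025-01-01T00:00:00+00:00",
}
```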
**{matrice_compute-0.1.41.dist-info → matrice_compute-0.1.43.dist-info}/RECORD**

```diff
@@ -1,18 +1,18 @@
 matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
-matrice_compute/action_instance.py,sha256=
+matrice_compute/action_instance.py,sha256=N74jgcNm_l6zk9w8shPlgR9EWFjICXWkfOP4MKwig2o,73979
 matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
 matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
 matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
-matrice_compute/instance_manager.py,sha256=
+matrice_compute/instance_manager.py,sha256=W0BN1mkfcqCP1jxb6JjhNPUHM-iTmrDu7WoyfTTKGdY,22098
 matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
 matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
 matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=
+matrice_compute/resources_tracker.py,sha256=Hn_auCSQ2vQIc8X3PZ-KvoVOamjfEkQmbL4ekmWgbt8,68149
 matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
 matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
 matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
+matrice_compute-0.1.43.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.43.dist-info/METADATA,sha256=lp_H12wR55PdKBttrYO4yiNGDo4vuK42a9xKiEas7SU,1038
+matrice_compute-0.1.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.43.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.43.dist-info/RECORD,,
```
WHEEL, licenses/LICENSE.txt, and top_level.txt: no content changes (only the dist-info directory is renamed for 0.1.43).