matrice-compute 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +201 -1
- matrice_compute/actions_manager.py +288 -48
- matrice_compute/compute_operations_handler.py +490 -0
- matrice_compute/instance_manager.py +203 -7
- matrice_compute/resources_tracker.py +267 -2
- matrice_compute/scaling.py +821 -140
- {matrice_compute-0.1.25.dist-info → matrice_compute-0.1.27.dist-info}/METADATA +1 -1
- matrice_compute-0.1.27.dist-info/RECORD +18 -0
- matrice_compute-0.1.25.dist-info/RECORD +0 -17
- {matrice_compute-0.1.25.dist-info → matrice_compute-0.1.27.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.25.dist-info → matrice_compute-0.1.27.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.25.dist-info → matrice_compute-0.1.27.dist-info}/top_level.txt +0 -0
|
@@ -3,10 +3,13 @@
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import subprocess
|
|
6
7
|
import threading
|
|
7
8
|
import time
|
|
9
|
+
from kafka import KafkaProducer
|
|
8
10
|
from matrice_compute.actions_manager import ActionsManager
|
|
9
11
|
from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
|
|
12
|
+
from matrice_compute.compute_operations_handler import ComputeOperationsHandler
|
|
10
13
|
from matrice_compute.instance_utils import (
|
|
11
14
|
get_instance_info,
|
|
12
15
|
get_decrypted_access_key_pair,
|
|
@@ -14,6 +17,7 @@ from matrice_compute.instance_utils import (
|
|
|
14
17
|
from matrice_compute.resources_tracker import (
|
|
15
18
|
MachineResourcesTracker,
|
|
16
19
|
ActionsResourcesTracker,
|
|
20
|
+
KafkaResourceMonitor,
|
|
17
21
|
)
|
|
18
22
|
from matrice_compute.scaling import Scaling
|
|
19
23
|
from matrice_compute.shutdown_manager import ShutdownManager
|
|
@@ -90,10 +94,55 @@ class InstanceManager:
|
|
|
90
94
|
logging.info("InstanceManager initialized with machine resources tracker")
|
|
91
95
|
self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
|
|
92
96
|
logging.info("InstanceManager initialized with actions resources tracker")
|
|
97
|
+
|
|
98
|
+
# Initialize Kafka resource monitor using the same internal Kafka as scaling
|
|
99
|
+
try:
|
|
100
|
+
kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
|
|
101
|
+
self.kafka_resource_monitor = KafkaResourceMonitor(
|
|
102
|
+
instance_id=os.environ.get("INSTANCE_ID"),
|
|
103
|
+
kafka_bootstrap=kafka_bootstrap,
|
|
104
|
+
interval_seconds=60
|
|
105
|
+
)
|
|
106
|
+
logging.info("InstanceManager initialized with Kafka resource monitor using internal Kafka: %s", kafka_bootstrap)
|
|
107
|
+
except (ValueError, Exception) as e:
|
|
108
|
+
logging.warning("Failed to initialize Kafka resource monitor: %s", e)
|
|
109
|
+
self.kafka_resource_monitor = None
|
|
110
|
+
|
|
111
|
+
# Initialize Compute Operations Handler for event-driven operations
|
|
112
|
+
# Uses EventListener from matrice_common for simplified Kafka consumption
|
|
113
|
+
try:
|
|
114
|
+
instance_id = os.environ.get("INSTANCE_ID")
|
|
115
|
+
self.compute_operations_handler = ComputeOperationsHandler(
|
|
116
|
+
actions_manager=self.actions_manager,
|
|
117
|
+
session=self.session,
|
|
118
|
+
scaling=self.scaling,
|
|
119
|
+
instance_id=instance_id
|
|
120
|
+
)
|
|
121
|
+
logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
|
|
122
|
+
except (ValueError, Exception) as e:
|
|
123
|
+
logging.warning("Failed to initialize Compute Operations Handler: %s", e)
|
|
124
|
+
self.compute_operations_handler = None
|
|
125
|
+
|
|
93
126
|
self.poll_interval = 10
|
|
94
127
|
# Note: encryption_key is set in _setup_env_credentials
|
|
128
|
+
|
|
129
|
+
# Initialize container monitoring
|
|
130
|
+
self.container_monitor_thread = None
|
|
131
|
+
self.container_monitor_running = False
|
|
132
|
+
self.container_kafka_producer = None
|
|
133
|
+
|
|
95
134
|
logging.info("InstanceManager initialized.")
|
|
96
135
|
|
|
136
|
+
# report the resources at startup
|
|
137
|
+
try:
|
|
138
|
+
self.scaling.report_architecture_info()
|
|
139
|
+
logging.info("InstanceManager reported initial resources.")
|
|
140
|
+
except Exception as exc:
|
|
141
|
+
logging.error(
|
|
142
|
+
"Error reporting initial resources: %s",
|
|
143
|
+
str(exc),
|
|
144
|
+
)
|
|
145
|
+
|
|
97
146
|
@log_errors(default_return=None, raise_exception=True, log_error=True)
|
|
98
147
|
def _setup_env_credentials(
|
|
99
148
|
self,
|
|
@@ -228,13 +277,13 @@ class InstanceManager:
|
|
|
228
277
|
# "Error in scale_down_manager auto_scaledown_actions: %s",
|
|
229
278
|
# str(exc),
|
|
230
279
|
# )
|
|
231
|
-
try:
|
|
232
|
-
|
|
233
|
-
except Exception as exc:
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
280
|
+
# try:
|
|
281
|
+
# self.machine_resources_tracker.update_available_resources()
|
|
282
|
+
# except Exception as exc:
|
|
283
|
+
# logging.error(
|
|
284
|
+
# "Error in machine_resources_tracker update_available_resources: %s",
|
|
285
|
+
# str(exc),
|
|
286
|
+
# )
|
|
238
287
|
try:
|
|
239
288
|
self.actions_resources_tracker.update_actions_resources()
|
|
240
289
|
except Exception as exc:
|
|
@@ -245,6 +294,130 @@ class InstanceManager:
|
|
|
245
294
|
|
|
246
295
|
time.sleep(self.poll_interval)
|
|
247
296
|
|
|
297
|
+
@log_errors(raise_exception=False, log_error=True)
def start_container_status_monitor(self):
    """Start the background container status monitoring thread.

    The call is a no-op only when the monitor is flagged as running AND its
    worker thread is still alive. The worker can return early (Kafka
    disabled, producer initialisation failure) without the running flag
    being cleared; checking thread liveness lets a later call restart the
    monitor instead of being refused forever.
    """
    existing = self.container_monitor_thread
    if (
        self.container_monitor_running
        and existing is not None
        and existing.is_alive()
    ):
        logging.info("Container status monitor is already running")
        return

    # Set the flag before starting the thread: the worker loop polls it.
    self.container_monitor_running = True
    self.container_monitor_thread = threading.Thread(
        target=self._container_status_monitor_worker,
        daemon=True,
        name="ContainerStatusMonitor",
    )
    self.container_monitor_thread.start()
    logging.info("Started container status monitoring thread")
|
|
312
|
+
|
|
313
|
+
@log_errors(raise_exception=False, log_error=True)
def stop_container_status_monitor(self):
    """Stop the background container status monitoring.

    Clears the running flag, waits up to 10 seconds for the worker thread
    to finish, then closes the Kafka producer. Returns immediately if the
    monitor is not flagged as running.
    """
    if not self.container_monitor_running:
        return

    logging.info("Stopping container status monitor...")
    self.container_monitor_running = False

    worker = self.container_monitor_thread
    if worker:
        worker.join(timeout=10)
        if worker.is_alive():
            # The worker may still be inside a `docker ps` call (30 s timeout).
            logging.warning(
                "Container status monitor thread did not stop within 10 seconds"
            )

    producer = self.container_kafka_producer
    if producer:
        # Drop the reference first so a failing close() cannot leave the
        # instance holding a half-closed producer.
        self.container_kafka_producer = None
        try:
            producer.close()
        except Exception as exc:
            logging.error(
                "Container status monitor: Failed to close Kafka producer: %s",
                str(exc),
            )

    logging.info("Container status monitor stopped")
|
|
330
|
+
|
|
331
|
+
def _container_status_monitor_worker(self):
    """Background worker that publishes `docker ps -a` snapshots to Kafka.

    Creates a Kafka producer from the scaling object's bootstrap servers,
    then, while ``self.container_monitor_running`` is true, lists all
    containers roughly every 30 seconds and sends one message per cycle to
    the ``compute_container_status`` topic.

    If Kafka is disabled or the producer cannot be created, the worker
    clears ``self.container_monitor_running`` and returns, so the monitor
    is not left flagged as running with no thread behind it.
    """
    # Initialize Kafka producer
    try:
        if not self.scaling.enable_kafka:
            logging.warning("Container status monitor: Kafka is disabled, no monitoring will be performed")
            self.container_monitor_running = False
            return

        bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
        self.container_kafka_producer = KafkaProducer(
            bootstrap_servers=bootstrap_servers,
            value_serializer=lambda v: json.dumps(v).encode("utf-8"),
            max_block_ms=5000,  # Timeout if Kafka is down
        )
        logging.info("Container status monitor: Kafka producer initialized")
    except Exception as e:
        logging.error("Container status monitor: Failed to initialize Kafka producer: %s", str(e))
        self.container_monitor_running = False
        return

    instance_id = os.environ.get("INSTANCE_ID")
    topic_name = "compute_container_status"

    logging.info("Container status monitor started for instance: %s", instance_id)

    while self.container_monitor_running:
        try:
            # "{{json .}}" emits one JSON object per container and is
            # understood by Docker CLIs that predate the "--format json"
            # shorthand.
            result = subprocess.run(
                ["docker", "ps", "-a", "--format", "{{json .}}"],
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                # Fall through to the interruptible wait below instead of a
                # blocking sleep, so a stop request is honoured promptly.
                logging.error("Container status monitor: docker ps command failed: %s", result.stderr)
            else:
                # Parse container information (one JSON document per line)
                containers = []
                for line in result.stdout.strip().splitlines():
                    if not line.strip():
                        continue
                    try:
                        container_info = json.loads(line)
                    except json.JSONDecodeError as e:
                        logging.warning("Container status monitor: Failed to parse container info: %s", str(e))
                        continue
                    containers.append({
                        "container_id": container_info.get("ID", ""),
                        "image": container_info.get("Image", ""),
                        "command": container_info.get("Command", ""),
                        "created": container_info.get("CreatedAt", ""),
                        "status": container_info.get("Status", ""),
                        "ports": container_info.get("Ports", ""),
                        "names": container_info.get("Names", ""),
                        "size": container_info.get("Size", ""),
                        "state": container_info.get("State", ""),
                        "labels": container_info.get("Labels", ""),
                    })

                # Prepare message for Kafka
                status_message = {
                    "timestamp": time.time(),
                    "instance_id": instance_id,
                    "container_count": len(containers),
                    "containers": containers,
                }

                # Send to Kafka; the producer may have been cleared by a
                # concurrent stop request.
                if self.container_kafka_producer:
                    try:
                        self.container_kafka_producer.send(topic_name, status_message)
                        logging.debug("Container status monitor: Sent status for %d containers", len(containers))
                    except Exception as e:
                        logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))

        except subprocess.TimeoutExpired:
            logging.error("Container status monitor: docker ps command timed out")
        except Exception as e:
            logging.error("Container status monitor: Unexpected error: %s", str(e))

        # Wait 30 seconds before the next check, waking every second so a
        # stop request does not have to wait out the full interval.
        for _ in range(30):
            if not self.container_monitor_running:
                break
            time.sleep(1)

    logging.info("Container status monitor worker stopped")
|
|
420
|
+
|
|
248
421
|
@log_errors(default_return=(None, None), raise_exception=True)
|
|
249
422
|
def start(self) -> tuple:
|
|
250
423
|
"""Start the instance manager threads.
|
|
@@ -252,6 +425,29 @@ class InstanceManager:
|
|
|
252
425
|
Returns:
|
|
253
426
|
tuple: (instance_manager_thread, actions_manager_thread)
|
|
254
427
|
"""
|
|
428
|
+
# Start Kafka resource monitor in background thread
|
|
429
|
+
if self.kafka_resource_monitor:
|
|
430
|
+
try:
|
|
431
|
+
self.kafka_resource_monitor.start()
|
|
432
|
+
logging.info("Started Kafka resource monitor")
|
|
433
|
+
except Exception as exc:
|
|
434
|
+
logging.error("Failed to start Kafka resource monitor: %s", str(exc))
|
|
435
|
+
|
|
436
|
+
# Start Compute Operations Handler in background thread
|
|
437
|
+
if self.compute_operations_handler:
|
|
438
|
+
try:
|
|
439
|
+
self.compute_operations_handler.start()
|
|
440
|
+
logging.info("Started Compute Operations Handler")
|
|
441
|
+
except Exception as exc:
|
|
442
|
+
logging.error("Failed to start Compute Operations Handler: %s", str(exc))
|
|
443
|
+
|
|
444
|
+
# Start Container Status Monitor in background thread
|
|
445
|
+
try:
|
|
446
|
+
self.start_container_status_monitor()
|
|
447
|
+
logging.info("Started Container Status Monitor")
|
|
448
|
+
except Exception as exc:
|
|
449
|
+
logging.error("Failed to start Container Status Monitor: %s", str(exc))
|
|
450
|
+
|
|
255
451
|
# Create and start threads
|
|
256
452
|
instance_manager_thread = threading.Thread(
|
|
257
453
|
target=self.start_instance_manager,
|
|
@@ -5,6 +5,8 @@ This module contains classes for tracking machine and action resources.
|
|
|
5
5
|
import os
|
|
6
6
|
import subprocess
|
|
7
7
|
import logging
|
|
8
|
+
import threading
|
|
9
|
+
import json
|
|
8
10
|
from datetime import datetime, timezone
|
|
9
11
|
import psutil
|
|
10
12
|
import docker
|
|
@@ -402,8 +404,13 @@ class ActionsResourcesTracker:
|
|
|
402
404
|
new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
|
|
403
405
|
return new_args
|
|
404
406
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
+
def is_valid_objectid(s: str) -> bool:
    """Check if string is a valid MongoDB ObjectId (24 hex characters).

    Surrounding whitespace is ignored. Anything that is not a ``str``
    (for example ``None``) is reported as invalid rather than raising.
    """
    if not isinstance(s, str):
        return False
    candidate = s.strip()
    return len(candidate) == 24 and all(
        ch in '0123456789abcdefABCDEF' for ch in candidate
    )
|
|
411
|
+
|
|
412
|
+
valid_objectids = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if is_valid_objectid(arg)]
|
|
413
|
+
action_record_id = valid_objectids[-1] if valid_objectids else None
|
|
407
414
|
if not action_record_id:
|
|
408
415
|
logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
|
|
409
416
|
duration = calculate_time_difference(start_time, finish_time)
|
|
@@ -575,3 +582,261 @@ class MachineResourcesTracker:
|
|
|
575
582
|
"Error in updating available resources: %s",
|
|
576
583
|
err,
|
|
577
584
|
)
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
class KafkaResourceMonitor:
    """
    Monitors system resources and publishes them to Kafka in a separate thread.

    A daemon worker thread samples CPU, RAM, GPU memory and free disk space
    every ``interval_seconds`` and sends one JSON message per sample to the
    ``compute_instance_resource_utilization`` topic. The worker is controlled
    through ``start()`` / ``stop()`` (or by using the instance as a context
    manager) and is woken for shutdown through a ``threading.Event``.
    """

    def __init__(
        self,
        instance_id: Optional[str] = None,
        kafka_bootstrap: Optional[str] = None,
        interval_seconds: int = 60,
    ):
        """
        Initialize KafkaResourceMonitor.

        Args:
            instance_id: Instance identifier included in every message. Defaults to INSTANCE_ID env var.
            kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
            interval_seconds: Interval between resource checks in seconds. Defaults to 60.

        Raises:
            ValueError: If no instance id can be determined or no bootstrap
                servers are given.
        """
        self.instance_id = instance_id or os.getenv("INSTANCE_ID")
        if not self.instance_id:
            raise ValueError("instance_id must be provided or INSTANCE_ID env var must be set")

        if not kafka_bootstrap:
            raise ValueError("kafka_bootstrap must be provided - use Scaling.get_kafka_bootstrap_servers() to get internal Kafka config")

        self.kafka_bootstrap = kafka_bootstrap
        self.interval_seconds = interval_seconds
        self.topic_name = "compute_instance_resource_utilization"

        self._stop_event = threading.Event()
        self._monitor_thread: Optional[threading.Thread] = None
        self._producer = None
        self._is_running = False

    @staticmethod
    def get_all_gpu_memory() -> Dict[int, tuple]:
        """
        Get GPU memory usage and total for all GPUs.

        Rows of the nvidia-smi output that cannot be parsed are skipped
        individually, so one odd row does not discard the readings of the
        other GPUs.

        Returns:
            Dict[int, tuple]: Dictionary mapping GPU ID to (used_gb, total_gb).
                             Returns empty dict if nvidia-smi is not available.
        """
        cmd = [
            "nvidia-smi",
            "--query-gpu=index,memory.used,memory.total",
            "--format=csv,noheader,nounits",
        ]
        try:
            raw = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, timeout=5)
            rows = raw.decode().strip().splitlines()
        except Exception as e:
            logging.debug("Failed to get GPU memory info: %s", e)
            return {}

        gpu_usage = {}
        for row in rows:
            if not row.strip():
                continue
            try:
                gpu_id_str, mem_used_mb_str, mem_total_mb_str = row.split(",")
                gpu_id = int(gpu_id_str.strip())
                mem_used_gb = int(mem_used_mb_str.strip()) / 1024  # MB -> GB
                mem_total_gb = int(mem_total_mb_str.strip()) / 1024  # MB -> GB
            except ValueError as e:
                # Wrong column count or a non-numeric field.
                logging.debug("Skipping unparseable nvidia-smi row %r: %s", row, e)
                continue
            gpu_usage[gpu_id] = (round(mem_used_gb, 2), round(mem_total_gb, 2))

        return gpu_usage

    @staticmethod
    def get_all_storage_info() -> Dict[str, float]:
        """
        Get free storage space for all mounted drives.

        Returns:
            Dict[str, float]: Dictionary mapping mount point to free storage space in GB.
                Partitions whose usage cannot be read are omitted; an empty
                dict is returned if the partition list itself cannot be read.
        """
        storage_info = {}

        try:
            # Get all disk partitions
            partitions = psutil.disk_partitions()

            for partition in partitions:
                try:
                    # Get usage statistics for this partition
                    usage = psutil.disk_usage(partition.mountpoint)
                except PermissionError:
                    # Skip drives that we can't access (common on Windows)
                    logging.debug("Permission denied accessing %s", partition.mountpoint)
                    continue
                except Exception as e:
                    logging.debug("Error getting storage info for %s: %s", partition.mountpoint, e)
                    continue

                # Convert bytes to GB
                storage_info[partition.mountpoint] = round(usage.free / (1024 ** 3), 2)

        except Exception as e:
            logging.debug("Failed to get storage info: %s", e)
            return {}

        return storage_info

    def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
        """
        Collect current system resource statistics.

        Note that the CPU reading uses a one-second sampling interval, so
        this call blocks for about one second.

        Returns:
            Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
            CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total), Free storage dict
        """
        cpu_usage = psutil.cpu_percent(interval=1)
        cpu_cores = psutil.cpu_count(logical=True)  # Total logical CPU cores

        mem = psutil.virtual_memory()
        ram_total = mem.total / (1024 ** 3)
        ram_used = mem.used / (1024 ** 3)

        gpu_usage = self.get_all_gpu_memory()
        storage_info = self.get_all_storage_info()

        return cpu_usage, cpu_cores, ram_total, ram_used, gpu_usage, storage_info

    def _monitor_worker(self):
        """
        Worker function that runs in a separate thread to monitor and publish resources.

        Returns without publishing anything if kafka-python is missing or
        the producer cannot be created. Otherwise loops until the stop event
        is set, and always closes the producer on the way out.
        """
        try:
            from kafka import KafkaProducer

            self._producer = KafkaProducer(
                bootstrap_servers=self.kafka_bootstrap,
                value_serializer=lambda v: json.dumps(v).encode("utf-8"),
                retries=5,
            )
            logging.info("Kafka resource monitor started. Publishing to topic: %s", self.topic_name)

        except ImportError:
            logging.error("kafka-python not installed. Install with: pip install kafka-python")
            return
        except Exception as e:
            logging.error("Failed to initialize Kafka producer: %s", e)
            return

        try:
            while not self._stop_event.is_set():
                try:
                    cpu, cpu_cores, total, used, gpus, storage = self.get_stats()

                    # Format GPU info for output: {0: {"used_gb": x, "total_gb": y}, ...}
                    gpu_memory_gb = {k: {"used_gb": v[0], "total_gb": v[1]} for k, v in gpus.items()}
                    payload = {
                        "instance_id": self.instance_id,
                        "cpu_usage_percent": round(cpu, 2),
                        "cpu_cores": cpu_cores,
                        "ram_total_gb": round(total, 2),
                        "ram_used_gb": round(used, 2),
                        "gpu_memory_gb": gpu_memory_gb,  # dict: {0: {used_gb, total_gb}, ...}
                        "free_storage_gb": storage,  # dict: {"/": 50.5, "C:": 123.4}
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    }

                    self._producer.send(self.topic_name, payload)
                    self._producer.flush()

                    logging.debug("Published resource stats: %s", payload)

                except Exception as e:
                    # Keep the loop alive; the next interval retries.
                    logging.error("Error in resource monitor loop: %s", e)

                # Wait for interval or until stop event is set
                if self._stop_event.wait(self.interval_seconds):
                    break
        finally:
            # Cleanup runs even if the loop is left by an unexpected error.
            producer, self._producer = self._producer, None
            if producer:
                try:
                    producer.close()
                except Exception as e:
                    logging.debug("Error closing Kafka producer: %s", e)

            logging.info("Kafka resource monitor stopped.")

    @log_errors(raise_exception=False, log_error=True)
    def start(self):
        """
        Start the resource monitoring thread.

        A monitor whose worker thread has already exited (for example
        because the Kafka producer could not be created) counts as not
        running and is started again.

        Returns:
            bool: True if started successfully, False otherwise.
        """
        if self.is_running():
            logging.warning("Kafka resource monitor is already running.")
            return False

        self._stop_event.clear()
        self._monitor_thread = threading.Thread(
            target=self._monitor_worker,
            daemon=True,
            name="KafkaResourceMonitor",
        )
        self._monitor_thread.start()
        self._is_running = True

        logging.info("Started Kafka resource monitor thread.")
        return True

    @log_errors(raise_exception=False, log_error=True)
    def stop(self, timeout: int = 10):
        """
        Stop the resource monitoring thread gracefully.

        Args:
            timeout: Maximum time to wait for thread to stop in seconds.

        Returns:
            bool: True if stopped successfully, False otherwise.
        """
        if not self._is_running:
            logging.warning("Kafka resource monitor is not running.")
            return False

        logging.info("Stopping Kafka resource monitor...")
        self._stop_event.set()

        worker = self._monitor_thread
        if worker and worker.is_alive():
            worker.join(timeout=timeout)

            if worker.is_alive():
                # Leave the running flag set: the thread is still there.
                logging.error("Kafka resource monitor thread did not stop within timeout.")
                return False

        self._is_running = False
        logging.info("Kafka resource monitor stopped successfully.")
        return True

    def is_running(self) -> bool:
        """
        Check if the resource monitor is currently running.

        Returns:
            bool: True if the monitor was started and its worker thread is
                still alive, False otherwise.
        """
        worker = self._monitor_thread
        return bool(self._is_running and worker is not None and worker.is_alive())

    def __enter__(self):
        """Context manager entry: start monitoring and return the monitor."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: stop monitoring; exceptions are not suppressed."""
        self.stop()
|