matrice-compute 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,8 @@ This module contains classes for tracking machine and action resources.
  import os
  import subprocess
  import logging
+ import threading
+ import json
  from datetime import datetime, timezone
  import psutil
  import docker
@@ -580,3 +582,261 @@ class MachineResourcesTracker:
                  "Error in updating available resources: %s",
                  err,
              )
+
+
+ class KafkaResourceMonitor:
+     """
+     Monitors system resources and publishes them to Kafka in a separate thread.
+     This class provides thread-safe start/stop operations for resource monitoring.
+     """
+
+     def __init__(
+         self,
+         instance_id: Optional[str] = None,
+         kafka_bootstrap: Optional[str] = None,
+         interval_seconds: int = 60,
+     ):
+         """
+         Initialize KafkaResourceMonitor.
+
+         Args:
+             instance_id: Instance identifier for Kafka topic. Defaults to INSTANCE_ID env var.
+             kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
+             interval_seconds: Interval between resource checks in seconds. Defaults to 60.
+         """
+         self.instance_id = instance_id or os.getenv("INSTANCE_ID")
+         if not self.instance_id:
+             raise ValueError("instance_id must be provided or INSTANCE_ID env var must be set")
+
+         if not kafka_bootstrap:
+             raise ValueError("kafka_bootstrap must be provided - use Scaling.get_kafka_bootstrap_servers() to get internal Kafka config")
+
+         self.kafka_bootstrap = kafka_bootstrap
+         self.interval_seconds = interval_seconds
+         self.topic_name = "compute_instance_resource_utilization"
+
+         self._stop_event = threading.Event()
+         self._monitor_thread: Optional[threading.Thread] = None
+         self._producer = None
+         self._is_running = False
+
+     @staticmethod
+     def get_all_gpu_memory() -> Dict[int, tuple]:
+         """
+         Get GPU memory usage and total for all GPUs.
+
+         Returns:
+             Dict[int, tuple]: Dictionary mapping GPU ID to (used_gb, total_gb).
+                 Returns empty dict if nvidia-smi is not available.
+         """
+         gpu_usage = {}
+
+         try:
+             cmd = [
+                 "nvidia-smi",
+                 "--query-gpu=index,memory.used,memory.total",
+                 "--format=csv,noheader,nounits"
+             ]
+             result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, timeout=5)
+             lines = result.decode().strip().split("\n")
+
+             for line in lines:
+                 gpu_id_str, mem_used_mb_str, mem_total_mb_str = line.split(",")
+                 gpu_id = int(gpu_id_str.strip())
+                 mem_used_gb = int(mem_used_mb_str.strip()) / 1024  # MB → GB
+                 mem_total_gb = int(mem_total_mb_str.strip()) / 1024  # MB → GB
+                 gpu_usage[gpu_id] = (round(mem_used_gb, 2), round(mem_total_gb, 2))
+
+         except Exception as e:
+             logging.debug("Failed to get GPU memory info: %s", e)
+             return {}
+
+         return gpu_usage
+
+     @staticmethod
+     def get_all_storage_info() -> Dict[str, float]:
+         """
+         Get free storage space for all mounted drives.
+
+         Returns:
+             Dict[str, float]: Dictionary mapping mount point to free storage space in GB.
+         """
+         storage_info = {}
+
+         try:
+             # Get all disk partitions
+             partitions = psutil.disk_partitions()
+
+             for partition in partitions:
+                 try:
+                     # Get usage statistics for this partition
+                     usage = psutil.disk_usage(partition.mountpoint)
+
+                     # Convert bytes to GB
+                     free_gb = usage.free / (1024 ** 3)
+
+                     storage_info[partition.mountpoint] = round(free_gb, 2)
+
+                 except PermissionError:
+                     # Skip drives that we can't access (common on Windows)
+                     logging.debug("Permission denied accessing %s", partition.mountpoint)
+                     continue
+                 except Exception as e:
+                     logging.debug("Error getting storage info for %s: %s", partition.mountpoint, e)
+                     continue
+
+         except Exception as e:
+             logging.debug("Failed to get storage info: %s", e)
+             return {}
+
+         return storage_info
+
+     def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
+         """
+         Collect current system resource statistics.
+
+         Returns:
+             Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
+                 CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total), free storage dict.
+         """
+         cpu_usage = psutil.cpu_percent(interval=1)
+         cpu_cores = psutil.cpu_count(logical=True)  # Total logical CPU cores
+
+         mem = psutil.virtual_memory()
+         ram_total = mem.total / (1024 ** 3)
+         ram_used = mem.used / (1024 ** 3)
+
+         gpu_usage = self.get_all_gpu_memory()
+         storage_info = self.get_all_storage_info()
+
+         return cpu_usage, cpu_cores, ram_total, ram_used, gpu_usage, storage_info
+
+     def _monitor_worker(self):
+         """
+         Worker function that runs in a separate thread to monitor and publish resources.
+         """
+         try:
+             from kafka import KafkaProducer
+
+             self._producer = KafkaProducer(
+                 bootstrap_servers=self.kafka_bootstrap,
+                 value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+                 retries=5,
+             )
+             logging.info("Kafka resource monitor started. Publishing to topic: %s", self.topic_name)
+
+         except ImportError:
+             logging.error("kafka-python not installed. Install with: pip install kafka-python")
+             return
+         except Exception as e:
+             logging.error("Failed to initialize Kafka producer: %s", e)
+             return
+
+         while not self._stop_event.is_set():
+             try:
+                 cpu, cpu_cores, total, used, gpus, storage = self.get_stats()
+
+                 # Format GPU info for output: {0: {"used_gb": x, "total_gb": y}, ...}
+                 gpu_memory_gb = {k: {"used_gb": v[0], "total_gb": v[1]} for k, v in gpus.items()}
+                 payload = {
+                     "instance_id": self.instance_id,
+                     "cpu_usage_percent": round(cpu, 2),
+                     "cpu_cores": cpu_cores,
+                     "ram_total_gb": round(total, 2),
+                     "ram_used_gb": round(used, 2),
+                     "gpu_memory_gb": gpu_memory_gb,  # dict: {0: {used_gb, total_gb}, ...}
+                     "free_storage_gb": storage,  # dict: {"/": 50.5, "C:": 123.4}
+                     "timestamp": datetime.now(timezone.utc).isoformat()
+                 }
+
+                 self._producer.send(self.topic_name, payload)
+                 self._producer.flush()
+
+                 logging.debug("Published resource stats: %s", payload)
+
+             except Exception as e:
+                 logging.error("Error in resource monitor loop: %s", e)
+
+             # Wait for interval or until stop event is set
+             if self._stop_event.wait(self.interval_seconds):
+                 break
+
+         # Cleanup
+         if self._producer:
+             try:
+                 self._producer.close()
+             except Exception as e:
+                 logging.debug("Error closing Kafka producer: %s", e)
+
+         logging.info("Kafka resource monitor stopped.")
+
+     @log_errors(raise_exception=False, log_error=True)
+     def start(self):
+         """
+         Start the resource monitoring thread.
+
+         Returns:
+             bool: True if started successfully, False otherwise.
+         """
+         if self._is_running:
+             logging.warning("Kafka resource monitor is already running.")
+             return False
+
+         self._stop_event.clear()
+         self._monitor_thread = threading.Thread(
+             target=self._monitor_worker,
+             daemon=True,
+             name="KafkaResourceMonitor"
+         )
+         self._monitor_thread.start()
+         self._is_running = True
+
+         logging.info("Started Kafka resource monitor thread.")
+         return True
+
+     @log_errors(raise_exception=False, log_error=True)
+     def stop(self, timeout: int = 10):
+         """
+         Stop the resource monitoring thread gracefully.
+
+         Args:
+             timeout: Maximum time to wait for the thread to stop, in seconds.
+
+         Returns:
+             bool: True if stopped successfully, False otherwise.
+         """
+         if not self._is_running:
+             logging.warning("Kafka resource monitor is not running.")
+             return False
+
+         logging.info("Stopping Kafka resource monitor...")
+         self._stop_event.set()
+
+         if self._monitor_thread and self._monitor_thread.is_alive():
+             self._monitor_thread.join(timeout=timeout)
+
+             if self._monitor_thread.is_alive():
+                 logging.error("Kafka resource monitor thread did not stop within timeout.")
+                 return False
+
+         self._is_running = False
+         logging.info("Kafka resource monitor stopped successfully.")
+         return True
+
+     def is_running(self) -> bool:
+         """
+         Check if the resource monitor is currently running.
+
+         Returns:
+             bool: True if running, False otherwise.
+         """
+         return self._is_running
+
+     def __enter__(self):
+         """Context manager entry."""
+         self.start()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.stop()
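
For reference, a minimal usage sketch of the new class follows. The import path is an assumption (the diff does not show which module exports KafkaResourceMonitor), and Scaling.get_kafka_bootstrap_servers() is mentioned only because the constructor docstring points to it; adjust both to your installation.

```python
import time

# Hypothetical import path; the diff does not reveal the module that exposes the class.
from matrice_compute.resources_tracker import KafkaResourceMonitor

# Explicit start/stop. kafka_bootstrap would normally come from
# Scaling.get_kafka_bootstrap_servers(), per the constructor docstring.
monitor = KafkaResourceMonitor(
    instance_id="instance-123",    # falls back to the INSTANCE_ID env var if omitted
    kafka_bootstrap="kafka:9092",  # assumed bootstrap address, for illustration only
    interval_seconds=30,           # publish stats every 30 seconds
)
monitor.start()
time.sleep(120)                    # ... application work runs here ...
monitor.stop(timeout=10)

# Or as a context manager: start() is called on enter, stop() on exit.
with KafkaResourceMonitor(instance_id="instance-123", kafka_bootstrap="kafka:9092"):
    time.sleep(120)
```

Each message is published as JSON to the compute_instance_resource_utilization topic and carries instance_id, cpu_usage_percent, cpu_cores, ram_total_gb, ram_used_gb, gpu_memory_gb, free_storage_gb, and a UTC timestamp, as shown in the payload built by _monitor_worker above.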