arbor-ai 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbor/__init__.py +17 -0
- arbor/cli.py +83 -43
- arbor/client/arbor_client.py +259 -0
- arbor/server/api/models/schemas.py +3 -1
- arbor/server/api/routes/grpo.py +2 -6
- arbor/server/api/routes/inference.py +7 -3
- arbor/server/core/config.py +293 -7
- arbor/server/core/config_manager.py +100 -0
- arbor/server/main.py +26 -1
- arbor/server/services/comms/comms.py +13 -9
- arbor/server/services/file_manager.py +7 -4
- arbor/server/services/grpo_manager.py +98 -62
- arbor/server/services/health_manager.py +171 -0
- arbor/server/services/inference/vllm_client.py +6 -4
- arbor/server/services/inference_manager.py +40 -38
- arbor/server/services/job_manager.py +2 -2
- arbor/server/services/scripts/grpo_training.py +62 -281
- arbor/server/services/scripts/mmgrpo_training.py +510 -0
- arbor/server/services/scripts/sft_training.py +8 -5
- arbor/server/services/scripts/utils/callbacks.py +33 -0
- arbor/server/services/scripts/utils/comms_monitors.py +169 -0
- arbor/server/services/scripts/utils/dataset.py +176 -0
- arbor/server/services/scripts/utils/ingestion_monitor.py +35 -0
- arbor/server/services/scripts/utils/mock_server.py +124 -0
- arbor/server/services/training_manager.py +4 -4
- arbor/server/utils/logging.py +298 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.3.dist-info}/METADATA +17 -19
- arbor_ai-0.2.3.dist-info/RECORD +51 -0
- arbor_ai-0.2.1.dist-info/RECORD +0 -42
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.3.dist-info}/WHEEL +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.3.dist-info}/entry_points.txt +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.3.dist-info}/top_level.txt +0 -0
arbor/server/services/grpo_manager.py

@@ -3,7 +3,6 @@ import json
 import os
 import random
 import signal
-import socket
 import string
 import subprocess
 import sys
@@ -20,14 +19,17 @@ from arbor.server.api.models.schemas import (
     GRPOConfigRequest,
     GRPORequest,
 )
-from arbor.server.core.config import
+from arbor.server.core.config import Config
 from arbor.server.services.comms.comms import ArborServerCommsHandler
 from arbor.server.services.inference_manager import InferenceManager
+from arbor.server.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 
 class GRPOManager:
-    def __init__(self,
-        self.
+    def __init__(self, config: Config):
+        self.config = config
         self.training_process = None
         self.current_model = None
         self.train_kwargs = None
@@ -47,7 +49,7 @@ class GRPOManager:
 
     def _signal_handler(self, signum, frame):
         """Handle keyboard interrupt (SIGINT) gracefully."""
-
+        logger.info("Received keyboard interrupt. Shutting down gracefully...")
         # Sleep for a bit to let async operations go through
         time.sleep(2)
         if self.training_process is not None:
@@ -65,16 +67,14 @@ class GRPOManager:
         )
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         name = f"grpo:{model_name}:{suffix}:{timestamp}"
-        return name, str(Path(self.
+        return name, str(Path(self.config.STORAGE_PATH).resolve() / "models" / name)
 
     def find_training_args(self, request: GRPOConfigRequest) -> dict:
         """Process the config request and return training arguments."""
         name, output_dir = self.make_output_dir(request.model, request.suffix)
 
         # Here are defaults for training. We can adjust them if we disagree w the huggingface defaults
-        default_train_kwargs = {
-            "output_dir": output_dir,
-        }
+        default_train_kwargs = {"output_dir": output_dir, "grpo_flavor": "grpo"}
 
         train_kwargs = request.model_dump(exclude_unset=True)
         return {**default_train_kwargs, **(train_kwargs or {})}
@@ -108,7 +108,7 @@ class GRPOManager:
             key: train_kwargs[key] for key in trl_keys if key in train_kwargs
         }
 
-        arbor_keys = ["max_context_length", "lora"]
+        arbor_keys = ["max_context_length", "lora", "wandb_kwargs", "grpo_flavor"]
         arbor_train_kwargs = {
             key: train_kwargs[key] for key in arbor_keys if key in train_kwargs
         }
@@ -121,7 +121,7 @@ class GRPOManager:
         """Initialize the training process with ZMQ-based communication."""
         self.train_kwargs = self.find_training_args(request)
 
-        trl_train_kwargs, arbor_train_kwargs = self.process_training_args(
+        self.trl_train_kwargs, self.arbor_train_kwargs = self.process_training_args(
             self.train_kwargs
         )
 
@@ -132,31 +132,34 @@ class GRPOManager:
         # launch_kwargs = {
         # k: v for k, v in arbor_train_kwargs.items() if k in ["max_context_length"]
         # }
-        inference_manager.launch_kwargs["max_context_length"] =
-            "max_context_length", None
+        inference_manager.launch_kwargs["max_context_length"] = (
+            self.arbor_train_kwargs.get("max_context_length", None)
         )
-
+        logger.info("Launching inference server...")
         inference_manager.launch(self.current_model)
 
         # Initialize ZMQ socket manager - no need for connection acceptance thread anymore
         self.server_comms_handler = ArborServerCommsHandler()
 
         script_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts")
-
+        script_name = {"mmgrpo": "mmgrpo_training.py", "grpo": "grpo_training.py"}[
+            self.arbor_train_kwargs["grpo_flavor"]
+        ]
+        script_path = os.path.join(script_dir, script_name)
 
         # Start the training process with ZMQ ports
         my_env = os.environ.copy()
-        my_env["CUDA_VISIBLE_DEVICES"] = self.
+        my_env["CUDA_VISIBLE_DEVICES"] = self.config.arbor_config.training.gpu_ids
         # WandB can block the training process for login, so we silence it
         my_env["WANDB_SILENT"] = "true"
 
-        num_processes = self.
+        num_processes = self.config.arbor_config.training.gpu_ids.count(",") + 1
 
         # This is the port for the accelerate main process
        main_process_port = get_free_port()
 
         params = [
-
+            sys.executable,
             "-m",
             "accelerate.commands.launch",
             "--num_processes",
@@ -164,11 +167,11 @@ class GRPOManager:
             "--main_process_port",
             str(main_process_port),
         ]
-        if self.
+        if self.config.arbor_config.training.accelerate_config:
             params.extend(
                 [
                     "--config_file",
-                    self.
+                    self.config.arbor_config.training.accelerate_config,
                 ]
             )
         params.extend(
@@ -195,12 +198,12 @@ class GRPOManager:
                 "--model",
                 self.current_model,
                 "--trl_train_kwargs",
-                json.dumps(trl_train_kwargs),
+                json.dumps(self.trl_train_kwargs),
                 "--arbor_train_kwargs",
-                json.dumps(arbor_train_kwargs),
+                json.dumps(self.arbor_train_kwargs),
             ]
         )
-
+        logger.info(f"Running GRPO training command: {' '.join(params)}")
 
         self.training_process = subprocess.Popen(
             params,
@@ -222,9 +225,9 @@ class GRPOManager:
                     break
                 if line:
                     buffer.append(line)
-                    #
+                    # Log only if stop_event is not set
                     if not stop_event.is_set():
-
+                        logger.info(f"[GRPO LOG] {line.strip()}")
 
         # Start a background thread to read from the process continuously
         thread = threading.Thread(
@@ -256,10 +259,10 @@ class GRPOManager:
 
     def _handle_status_updates(self, inference_manager: InferenceManager):
         """Handle status updates from training process using ZMQ SUB socket"""
-
+        logger.info("Starting status update handler...")
         try:
             for status in self.server_comms_handler.receive_status():
-
+                logger.debug(f"Received status update: {status}")
                 if status["status"] == "weight_update_start":
                     # Block inference calls by incrementing counter
                     inference_manager.start_weight_update()
@@ -267,44 +270,80 @@ class GRPOManager:
                     # Decrement counter to potentially allow inference calls again
                     inference_manager.complete_weight_update()
                 elif status["status"] == "model_saved":
-
+                    logger.info("Updating inference model...")
                     # There is a case where this status is sent multiple times
                     # We need to make sure we only update the model once
                     self.saving_model = False
-
+                    logger.info("Model update complete")
                 elif status["status"] == "checkpoint_saved":
-
+                    logger.info("Received checkpoint saved status")
                     self.checkpoints[status["checkpoint_name"]] = status["output_dir"]
                     self.last_checkpoint = status["checkpoint_name"]
                     self.saving_checkpoint = False
-
+                    logger.info("Checkpoint saved")
                 elif status["status"] == "error":
-
+                    error_msg = status.get("error", "Unknown error")
+                    logger.error(f"Training error: {error_msg}")
                 elif status["status"] == "terminated":
                     self.terminating = False
-
+                    logger.info("Training process terminated")
         except Exception as e:
-
+            logger.error(f"Error in status update handler: {e}")
             # Make sure to allow inference if there's an error
             try:
                 inference_manager.complete_weight_update()
             except:
                 pass
 
+    def validate_batch(self, batch):
+        if not isinstance(batch, list):
+            raise ValueError("Batch must be a list")
+
+        if self.arbor_train_kwargs["grpo_flavor"] == "mmgrpo":
+            for group in batch:
+                if not isinstance(group, list):
+                    raise ValueError("Each group in batch must be a list")
+                for item in group:
+                    if not isinstance(item, dict):
+                        raise ValueError("Each item in group must be a dictionary")
+                    required_keys = {"messages", "completion", "advantage"}
+                    if not all(key in item for key in required_keys):
+                        raise ValueError(
+                            f"Each item must contain keys: {required_keys}"
+                        )
+            return True
+        elif self.arbor_train_kwargs["grpo_flavor"] == "grpo":
+            for item in batch:
+                if not isinstance(item, dict):
+                    raise ValueError("Each item in batch must be a dictionary")
+                required_keys = {"messages", "completion", "reward"}
+                if not all(key in item for key in required_keys):
+                    raise ValueError(f"Each item must contain keys: {required_keys}")
+            return True
+        else:
+            raise NotImplementedError(
+                f"GRPO flavor batch validation not implemented for {self.arbor_train_kwargs['grpo_flavor']}"
+            )
+
     def grpo_step(
         self, request: GRPORequest, inference_manager: InferenceManager
     ) -> str:
         while self.saving_checkpoint:
-
+            logger.info(
+                "Saving checkpoint, pausing GRPO steps until checkpoint is saved..."
+            )
             time.sleep(5)
 
+        self.validate_batch(request.batch)
+
         try:
+
             # Send the batch to the training process
             self.server_comms_handler.send_data(request.batch)
             self.data_count += 1
 
         except Exception as e:
-
+            logger.error(f"Failed to send batch to training process: {e}")
             raise
 
         self.current_model = self.train_kwargs["output_dir"]
@@ -322,7 +361,7 @@ class GRPOManager:
         while (
             inference_manager.is_updating
         ):  # Use the property instead of direct access
-
+            logger.info("Waiting for weight updates to finish before checkpointing...")
             time.sleep(5)
 
         self.saving_checkpoint = True
@@ -330,7 +369,7 @@ class GRPOManager:
             {"command": "save_checkpoint", "checkpoint_name": request.checkpoint_name}
         )
         while self.saving_checkpoint:
-
+            logger.info("Waiting for checkpoint to be saved...")
             time.sleep(5)
         return {
             "current_model": self.current_model,
@@ -345,14 +384,14 @@ class GRPOManager:
         while (
             inference_manager and inference_manager.is_updating
         ):  # Use the property instead of direct access
-
+            logger.info("Waiting for final weight updates to finish before saving...")
            time.sleep(5)
 
-
+        logger.info("Sending save model command")
         self.saving_model = True
         self.server_comms_handler.send_command({"command": "save_model"})
         while self.saving_model:
-
+            logger.info("Waiting for final model to be saved...")
             time.sleep(5)
 
         termination_data = {
@@ -361,37 +400,34 @@ class GRPOManager:
             "last_checkpoint": self.last_checkpoint,
         }
 
-
+        logger.info("Sending termination command")
         self.terminating = True
         self.server_comms_handler.send_command({"command": "terminate"})
-
+        logger.info("Waiting for training process to finish...")
 
         # Wait for at most 15 seconds for termination
         start_time = time.time()
         while self.terminating:
             if time.time() - start_time > 15:
-
+                logger.warning(
                     "Termination wait timed out after 15 seconds, proceeding with cleanup..."
                 )
                 break
-
+            logger.info("Waiting for run to be terminated...")
             time.sleep(3)
 
-
+        logger.info("Starting cleanup")
         self.cleanup_termination(inference_manager)
 
         if self.train_kwargs and "output_dir" in self.train_kwargs:
-            print(
-                f"Training completed. Model saved to {self.train_kwargs['output_dir']}"
-            )
-            if not os.path.exists(self.train_kwargs["output_dir"]):
-                print(
-                    f"Warning: Output directory {self.train_kwargs['output_dir']} does not exist"
-                )
             output_dir = self.train_kwargs["output_dir"]
+            logger.info(f"Training completed. Model saved to {output_dir}")
+            logger.info(f"Training logs and checkpoints are stored in: {output_dir}")
+            if not os.path.exists(output_dir):
+                logger.warning(f"Output directory {output_dir} does not exist")
             self.train_kwargs = None
         else:
-
+            logger.info("Training terminated, no output directory specified")
             self.train_kwargs = None
 
         return termination_data
@@ -400,7 +436,7 @@ class GRPOManager:
         try:
             # Kill training process and all its children (accelerate launcher creates multiple processes)
             if self.training_process:
-
+                logger.info("Terminating training process and its children...")
                 try:
                     parent = psutil.Process(self.training_process.pid)
                     # Get all child processes including grandchildren
@@ -427,9 +463,9 @@ class GRPOManager:
                         pass
 
                 except psutil.NoSuchProcess:
-
+                    logger.warning(f"Process {self.training_process.pid} not found")
                 except Exception as e:
-
+                    logger.error(f"Error killing training process tree: {e}")
                     # Fallback to basic termination
                     self.training_process.terminate()
                     try:
@@ -440,11 +476,11 @@ class GRPOManager:
 
             # Clean up ZMQ connections
             if self.server_comms_handler:
-
+                logger.debug("Closing ZMQ connections...")
                 self.server_comms_handler.close()
 
             if inference_manager and inference_manager.process is not None:
-
+                logger.info("Killing inference manager...")
                 inference_manager.kill()
 
             # Reinitialize in case we want to start a new training run
@@ -453,9 +489,9 @@ class GRPOManager:
             self.server_comms_handler = None
             self.status_thread = None
             self.data_count = 0
-
+            logger.info("Cleanup completed successfully")
         except Exception as e:
-
+            logger.error(f"Error during cleanup: {e}")
             # Still reset state even if cleanup fails
             self.training_process = None
             self.current_model = None
@@ -478,5 +514,5 @@ def get_free_port() -> int:
             s.bind(("localhost", 0))
             ports.append(s.getsockname()[1])
         except Exception as e:
-
+            logger.error(f"Error binding to port: {e}")
     return random.choice(ports)
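Note on the new `validate_batch` method above: batch shape now depends on the `grpo_flavor` training argument. The default "grpo" flavor expects a flat list of dicts carrying `messages`, `completion`, and `reward`; the "mmgrpo" flavor expects a list of groups, each itself a list of dicts carrying `messages`, `completion`, and `advantage`. A minimal sketch of batches that would pass these checks (the field values are illustrative; the code only checks types and key presence):

# Illustrative payloads only; validate_batch checks types and key presence, not values.
grpo_batch = [
    {
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "completion": {"role": "assistant", "content": "4"},
        "reward": 1.0,
    }
]

mmgrpo_batch = [
    [  # each group is itself a list of items
        {
            "messages": [{"role": "user", "content": "What is 2 + 2?"}],
            "completion": {"role": "assistant", "content": "4"},
            "advantage": 0.7,
        }
    ]
]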
arbor/server/services/health_manager.py (new file)

@@ -0,0 +1,171 @@
+import platform
+from datetime import datetime
+from typing import Any, Dict
+
+import psutil
+
+from arbor.server.core.config import Config
+
+try:
+    import GPUtil
+
+    GPU_AVAILABLE = True
+except ImportError:
+    GPU_AVAILABLE = False
+
+try:
+    import torch
+
+    TORCH_AVAILABLE = True
+except ImportError:
+    TORCH_AVAILABLE = False
+
+
+class HealthManager:
+    """Manages system health checks including GPU monitoring."""
+
+    def __init__(self, config: Config = None):
+        self.config = config
+
+    def get_gpu_info(self) -> Dict[str, Any]:
+        """Get GPU information including available and used GPUs."""
+        gpu_info = {
+            "gpus_available": 0,
+            "gpus_used": 0,
+            "gpu_details": [],
+            "cuda_available": False,
+            "gpu_library": "none",
+        }
+
+        # Try GPUtil first
+        if GPU_AVAILABLE:
+            try:
+                gpus = GPUtil.getGPUs()
+                gpu_info["gpus_available"] = len(gpus)
+                gpu_info["gpu_library"] = "GPUtil"
+
+                for i, gpu in enumerate(gpus):
+                    gpu_detail = {
+                        "id": gpu.id,
+                        "name": gpu.name,
+                        "memory_total": f"{gpu.memoryTotal}MB",
+                        "memory_used": f"{gpu.memoryUsed}MB",
+                        "memory_free": f"{gpu.memoryFree}MB",
+                        "utilization": f"{gpu.load * 100:.1f}%",
+                        "temperature": f"{gpu.temperature}°C",
+                    }
+                    gpu_info["gpu_details"].append(gpu_detail)
+
+                    # Consider GPU "used" if utilization > 10% or memory usage > 10%
+                    if gpu.load > 0.1 or (gpu.memoryUsed / gpu.memoryTotal) > 0.1:
+                        gpu_info["gpus_used"] += 1
+
+            except Exception as e:
+                gpu_info["error"] = f"GPUtil error: {str(e)}"
+
+        # Try PyTorch as fallback/additional info
+        if TORCH_AVAILABLE:
+            try:
+                gpu_info["cuda_available"] = torch.cuda.is_available()
+                if torch.cuda.is_available():
+                    cuda_count = torch.cuda.device_count()
+                    if not GPU_AVAILABLE:  # Only use torch info if GPUtil not available
+                        gpu_info["gpus_available"] = cuda_count
+                        gpu_info["gpu_library"] = "PyTorch"
+
+                    for i in range(cuda_count):
+                        props = torch.cuda.get_device_properties(i)
+                        memory_allocated = (
+                            torch.cuda.memory_allocated(i) / 1024**2
+                        )  # MB
+                        memory_cached = (
+                            torch.cuda.memory_reserved(i) / 1024**2
+                        )  # MB
+                        memory_total = props.total_memory / 1024**2  # MB
+
+                        gpu_detail = {
+                            "id": i,
+                            "name": props.name,
+                            "memory_total": f"{memory_total:.0f}MB",
+                            "memory_allocated": f"{memory_allocated:.0f}MB",
+                            "memory_cached": f"{memory_cached:.0f}MB",
+                            "compute_capability": f"{props.major}.{props.minor}",
+                        }
+                        gpu_info["gpu_details"].append(gpu_detail)
+
+                        # Consider GPU "used" if memory allocated > 100MB
+                        if memory_allocated > 100:
+                            gpu_info["gpus_used"] += 1
+
+            except Exception as e:
+                gpu_info["torch_error"] = f"PyTorch error: {str(e)}"
+
+        return gpu_info
+
+    def get_system_info(self) -> Dict[str, Any]:
+        """Get system information including CPU, memory, and disk usage."""
+        memory = psutil.virtual_memory()
+        disk = psutil.disk_usage("/")
+        cpu_percent = psutil.cpu_percent(interval=1)
+
+        return {
+            "platform": platform.system(),
+            "platform_release": platform.release(),
+            "platform_version": platform.version(),
+            "architecture": platform.machine(),
+            "processor": platform.processor(),
+            "cpu_usage": f"{cpu_percent}%",
+            "memory": {
+                "total": f"{memory.total / 1024**3:.2f}GB",
+                "available": f"{memory.available / 1024**3:.2f}GB",
+                "used": f"{memory.used / 1024**3:.2f}GB",
+                "percentage": f"{memory.percent}%",
+            },
+            "disk": {
+                "total": f"{disk.total / 1024**3:.2f}GB",
+                "free": f"{disk.free / 1024**3:.2f}GB",
+                "used": f"{disk.used / 1024**3:.2f}GB",
+                "percentage": f"{(disk.used / disk.total) * 100:.1f}%",
+            },
+            "gpu": self.get_gpu_info(),
+        }
+
+    def get_health_status(self) -> Dict[str, Any]:
+        """Get comprehensive health status including system and GPU information."""
+        version = self.config.get_arbor_version() if self.config else "unknown"
+        versions = (
+            self.config.get_system_versions() if self.config else {"arbor": version}
+        )
+
+        return {
+            "status": "healthy",
+            "version": version,  # Keep for backward compatibility
+            "versions": versions,  # Comprehensive version info
+            "timestamp": datetime.now().isoformat(),
+            "system": self.get_system_info(),
+        }
+
+    def is_healthy(self) -> bool:
+        """Check if the system is healthy based on various metrics."""
+        try:
+            # Check memory usage (unhealthy if > 90%)
+            memory = psutil.virtual_memory()
+            if memory.percent > 90:
+                print(f"Memory usage is {memory.percent}%")
+                return False
+
+            # Check disk usage (unhealthy if > 95%)
+            disk = psutil.disk_usage("/")
+            if (disk.used / disk.total) * 100 > 95:
+                print(f"Disk usage is {disk.used / disk.total * 100}%")
+                return False
+
+            # Check CPU usage (unhealthy if > 95% sustained)
+            cpu_percent = psutil.cpu_percent(interval=2)
+            if cpu_percent > 95:
+                print(f"CPU usage is {cpu_percent}%")
+                return False
+
+            return True
+        except Exception:
+            return False
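The `HealthManager` above is a self-contained service; how it is wired into the API is not shown in this section (see the `main.py` and route changes in the file list), so the call below is a rough usage sketch rather than the package's actual endpoint code:

from arbor.server.services.health_manager import HealthManager

# Without a Config, version fields fall back to "unknown".
health = HealthManager(config=None)

status = health.get_health_status()
print(status["status"])                           # "healthy"
print(status["system"]["gpu"]["gpus_available"])  # 0 if neither GPUtil nor torch sees a GPU

# is_healthy() treats memory > 90%, disk > 95%, or CPU > 95% as unhealthy.
if not health.is_healthy():
    print("System is under resource pressure")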
arbor/server/services/inference/vllm_client.py

@@ -4,6 +4,7 @@ import asyncio
 import atexit
 import logging
 import time
+import traceback
 from typing import Optional
 
 import httpx
@@ -131,11 +132,11 @@ class VLLMClient:
                     ) from exc
             else:
                 if response.status_code == 200:
-                    logger.
+                    logger.debug("Server is up!")
                     return None
 
-            # Retry logic: wait before
-            logger.
+            # Retry logic: wait before tryng again
+            logger.debug(
                 f"Server is not up yet. Retrying in {retry_interval} seconds..."
             )
             time.sleep(retry_interval)
@@ -254,7 +255,8 @@ class VLLMClient:
                     await asyncio.sleep(INFERENCE_RETRY_DELAY)
                 else:
                     logger.error(
-                        f"Request failed after {MAX_INFERENCE_RETRIES} retries"
+                        f"Request failed after {MAX_INFERENCE_RETRIES} retries. Error: {e}\n"
+                        f"Stack trace:\n{traceback.format_exc()}"
                     )
                     raise
 
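The `vllm_client.py` hunks import `traceback` so that a request which still fails after `MAX_INFERENCE_RETRIES` attempts logs the exception and a full stack trace rather than a bare message. A stripped-down sketch of that retry pattern; the constants and the helper name are stand-ins, since the surrounding method body is not shown in this diff:

import asyncio
import logging
import traceback

logger = logging.getLogger(__name__)

MAX_INFERENCE_RETRIES = 3      # assumed values; the real constants live in vllm_client.py
INFERENCE_RETRY_DELAY = 1.0


async def call_with_retries(make_request):
    """Retry an async request, logging the error and stack trace on final failure."""
    for attempt in range(MAX_INFERENCE_RETRIES + 1):
        try:
            return await make_request()
        except Exception as e:
            if attempt < MAX_INFERENCE_RETRIES:
                # Back off briefly, then try again.
                await asyncio.sleep(INFERENCE_RETRY_DELAY)
            else:
                # Out of retries: log the error plus a full stack trace, then re-raise.
                logger.error(
                    f"Request failed after {MAX_INFERENCE_RETRIES} retries. Error: {e}\n"
                    f"Stack trace:\n{traceback.format_exc()}"
                )
                raise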