arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbor/__init__.py +17 -0
- arbor/cli.py +83 -43
- arbor/client/arbor_client.py +259 -0
- arbor/server/api/models/schemas.py +3 -1
- arbor/server/api/routes/grpo.py +2 -6
- arbor/server/api/routes/inference.py +7 -3
- arbor/server/core/config.py +293 -7
- arbor/server/core/config_manager.py +100 -0
- arbor/server/main.py +26 -1
- arbor/server/services/comms/comms.py +13 -9
- arbor/server/services/file_manager.py +7 -4
- arbor/server/services/grpo_manager.py +98 -62
- arbor/server/services/health_manager.py +171 -0
- arbor/server/services/inference/vllm_client.py +6 -4
- arbor/server/services/inference_manager.py +40 -38
- arbor/server/services/job_manager.py +2 -2
- arbor/server/services/scripts/grpo_training.py +62 -281
- arbor/server/services/scripts/mmgrpo_training.py +510 -0
- arbor/server/services/scripts/sft_training.py +8 -5
- arbor/server/services/scripts/utils/callbacks.py +33 -0
- arbor/server/services/scripts/utils/comms_monitors.py +169 -0
- arbor/server/services/scripts/utils/dataset.py +176 -0
- arbor/server/services/scripts/utils/ingestion_monitor.py +35 -0
- arbor/server/services/scripts/utils/mock_server.py +124 -0
- arbor/server/services/training_manager.py +4 -4
- arbor/server/utils/logging.py +298 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/METADATA +8 -18
- arbor_ai-0.2.2.dist-info/RECORD +51 -0
- arbor_ai-0.2.1.dist-info/RECORD +0 -42
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/WHEEL +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/entry_points.txt +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/top_level.txt +0 -0
arbor/server/services/inference_manager.py

@@ -1,34 +1,34 @@
 import asyncio
 import os
-import random
 import signal
-import socket
 import subprocess
 import sys
 import threading
 import time
 from datetime import datetime
-from enum import Enum
 from typing import Any, Dict, Optional

 import psutil
 import requests

-from arbor.server.core.config import
+from arbor.server.core.config import Config
 from arbor.server.services.inference.vllm_client import VLLMClient
+from arbor.server.utils.logging import get_logger
+
+logger = get_logger(__name__)


 class InferenceManager:
-    def __init__(self,
-        self.
+    def __init__(self, config: Config):
+        self.config = config
         self.process = None
         self.launch_kwargs = {}
         self.last_activity = None
         self._shutting_down = False
-        self.launched_model = None
+        self.launched_model: Optional[str] = None
         self.inference_count = 0
         self._session = None
-        self.port = None
+        self.port: Optional[int] = None
         self.group_port = None
         self.vllm_client = None
         self._is_updating = 0 # Counter for weight updates in progress
@@ -37,21 +37,24 @@ class InferenceManager:
         signal.signal(signal.SIGTERM, self._signal_handler)

     def _signal_handler(self, signum, frame):
-
-
+        """Handle shutdown signals gracefully."""
+        logger.info(f"Received signal {signum}. Initiating graceful shutdown...")
+        try:
+            self.kill_server()
+        except Exception as e:
+            logger.error(f"Error during signal handler cleanup: {e}")
+            logger.info("Forced exit during cleanup...")
             os._exit(1)
+        logger.info("Received signal to terminate. Cleaning up...")
+        os._exit(0)

-
-
-        self.
-        sys.exit(0)
-
-    def is_server_running(self):
-        return self.process is not None
+    def is_server_running(self) -> bool:
+        """Check if vLLM server is running."""
+        return self.process is not None and self.process.poll() is None

     def launch(self, model: str, launch_kwargs: Optional[Dict[str, Any]] = None):
         if self.is_server_running():
-
+            logger.info("Server is already launched.")
             return

         launch_kwargs = launch_kwargs or self.launch_kwargs
@@ -61,18 +64,17 @@
             if model.startswith(prefix):
                 model = model[len(prefix) :]

-
+        logger.info(f"Grabbing a free port to launch a vLLM server for model {model}")
         self.port = get_free_port()
-        timeout = launch_kwargs.get("timeout", 1800)
         my_env = os.environ.copy()
-        my_env["CUDA_VISIBLE_DEVICES"] = self.
-        n_gpus = self.
-        command = f"
+        my_env["CUDA_VISIBLE_DEVICES"] = self.config.arbor_config.inference.gpu_ids
+        n_gpus = self.config.arbor_config.inference.gpu_ids.count(",") + 1
+        command = f"{sys.executable} -m arbor.server.services.inference.vllm_serve --model {model} --port {self.port} --gpu-memory-utilization 0.9 --tensor-parallel-size {n_gpus} --enable_prefix_caching True"

         if launch_kwargs.get("max_context_length"):
             command += f" --max_model_len {launch_kwargs['max_context_length']}"

-
+        logger.info(f"Running command: {command}")

         # We will manually stream & capture logs.
         process = subprocess.Popen(
@@ -85,7 +87,7 @@

         # A threading.Event to control printing after the server is ready.
         # This will store *all* lines (both before and after readiness).
-
+        logger.info(f"vLLM server process started with PID {process.pid}.")
         stop_printing_event = threading.Event()
         logs_buffer = []

@@ -97,9 +99,11 @@
                     break
                 if line:
                     buffer.append(line)
-                    #
+                    # Log only if stop_event is not set
                     if not stop_event.is_set():
-
+                        logger.info(f"[vLLM LOG] {line.strip()}")
+                    else:
+                        logger.debug(f"[vLLM LOG] {line.strip()}")

         # Start a background thread to read from the process continuously
         thread = threading.Thread(
@@ -115,7 +119,7 @@
             return "".join(logs_buffer)

         # Let the user know server is up
-
+        logger.info(f"Server ready on random port {self.port}!")

         # self.launch_kwargs["api_base"] = f"http://localhost:{port}/v1"
         # self.launch_kwargs["api_key"] = "local"
@@ -137,7 +141,7 @@

     def kill(self):
         if self.process is None:
-
+            logger.info("No running server to kill.")
             return

         process = self.process
@@ -152,19 +156,18 @@
         try:
             kill_vllm_server(process.pid)
         except Exception as e:
-
+            logger.error(f"Error during cleanup: {e}")
             try:
                 process.kill() # Final attempt to kill
             except:
                 pass

-
+        logger.info("Server killed.")

     async def run_inference(self, request_json: dict):
         # Check if weights are being updated
-        while self.
+        while self._is_updating:
             # weights are being updated...waiting
-            # print("Weights are being updated, waiting...")
             await asyncio.sleep(1) # Small sleep to prevent busy waiting

         model = request_json["model"]
@@ -172,13 +175,12 @@
         for prefix in prefixes:
             if model.startswith(prefix):
                 model = model[len(prefix) :]
-
+        logger.info(f"Running inference for model {model}")

         # Monkeypatch for GRPO runs:
         # vllm complains if we don't give it the exact model name that was launched
         # TODO: This should really throw an error unless in a GRPO run.
         if model != self.launched_model:
-            # print(f"Model changed from {model} to {self.current_model}")
             model = self.launched_model
             request_json["model"] = model

@@ -218,7 +220,7 @@ def get_free_port() -> int:
                s.bind(("localhost", 0))
                ports.append(s.getsockname()[1])
        except Exception as e:
-
+            logger.error(f"Error binding to port: {e}")
    return random.choice(ports)


@@ -272,6 +274,6 @@ def kill_vllm_server(main_process_pid):
                p.kill() # SIGKILL

    except psutil.NoSuchProcess:
-
+        logger.warning(f"Process {main_process_pid} not found")
    except Exception as e:
-
+        logger.error(f"Error killing processes: {e}")
arbor/server/services/job_manager.py

@@ -3,7 +3,7 @@ from datetime import datetime
 from typing import Literal

 from arbor.server.api.models.schemas import JobStatus
-from arbor.server.core.config import
+from arbor.server.core.config import Config


 class JobEvent:
@@ -58,7 +58,7 @@ class Job:


 class JobManager:
-    def __init__(self,
+    def __init__(self, config: Config):
         self.jobs = {}

     def get_job(self, job_id: str):