PyPI - arbor-ai - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

arbor/__init__.py +17 -0
arbor/cli.py +83 -43
arbor/client/arbor_client.py +259 -0
arbor/server/api/models/schemas.py +3 -1
arbor/server/api/routes/grpo.py +2 -6
arbor/server/api/routes/inference.py +7 -3
arbor/server/core/config.py +293 -7
arbor/server/core/config_manager.py +100 -0
arbor/server/main.py +26 -1
arbor/server/services/comms/comms.py +13 -9
arbor/server/services/file_manager.py +7 -4
arbor/server/services/grpo_manager.py +98 -62
arbor/server/services/health_manager.py +171 -0
arbor/server/services/inference/vllm_client.py +6 -4
arbor/server/services/inference_manager.py +40 -38
arbor/server/services/job_manager.py +2 -2
arbor/server/services/scripts/grpo_training.py +62 -281
arbor/server/services/scripts/mmgrpo_training.py +510 -0
arbor/server/services/scripts/sft_training.py +8 -5
arbor/server/services/scripts/utils/callbacks.py +33 -0
arbor/server/services/scripts/utils/comms_monitors.py +169 -0
arbor/server/services/scripts/utils/dataset.py +176 -0
arbor/server/services/scripts/utils/ingestion_monitor.py +35 -0
arbor/server/services/scripts/utils/mock_server.py +124 -0
arbor/server/services/training_manager.py +4 -4
arbor/server/utils/logging.py +298 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/METADATA +8 -18
arbor_ai-0.2.2.dist-info/RECORD +51 -0
arbor_ai-0.2.1.dist-info/RECORD +0 -42
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/WHEEL +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/entry_points.txt +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/licenses/LICENSE +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/top_level.txt +0 -0

arbor/__init__.py CHANGED Viewed

@@ -0,0 +1,17 @@
+"""
+Arbor - A framework for fine-tuning and managing language models
+"""
+from importlib.metadata import PackageNotFoundError, version
+try:
+    __version__ = version("arbor-ai")
+except PackageNotFoundError:
+    # Package is not installed, likely in development mode
+    __version__ = "dev"
+except Exception:
+    __version__ = "unknown"
+from arbor.client.arbor_client import is_running, serve, stop
+__all__ = ["__version__", "serve", "stop", "is_running"]

arbor/cli.py CHANGED Viewed

@@ -4,58 +4,22 @@ from datetime import datetime
 import click
 import uvicorn
-from arbor.server.core.config import Settings
+from arbor.server.core.config import Config
+from arbor.server.core.config_manager import ConfigManager
 from arbor.server.main import app
 from arbor.server.services.file_manager import FileManager
 from arbor.server.services.grpo_manager import GRPOManager
+from arbor.server.services.health_manager import HealthManager
 from arbor.server.services.inference_manager import InferenceManager
 from arbor.server.services.job_manager import JobManager
 from arbor.server.services.training_manager import TrainingManager
-def make_log_dir(storage_path: str):
-    # Create a timestamped log directory under the storage path
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    log_dir = os.path.join(storage_path, "logs", timestamp)
-    os.makedirs(log_dir, exist_ok=True)
-    return log_dir
+from arbor.client.arbor_client import create_app
 @click.group()
 def cli():
     pass
-def create_app(arbor_config_path: str):
-    """Create and configure the Arbor API application
-    Args:
-        storage_path (str): Path to store models and uploaded training files
-    Returns:
-        FastAPI: Configured FastAPI application
-    """
-    # Create new settings instance with overrides
-    settings = Settings.load_from_yaml(arbor_config_path)
-    app.state.log_dir = make_log_dir(settings.STORAGE_PATH)
-    # Initialize services with settings
-    file_manager = FileManager(settings=settings)
-    job_manager = JobManager(settings=settings)
-    training_manager = TrainingManager(settings=settings)
-    inference_manager = InferenceManager(settings=settings)
-    grpo_manager = GRPOManager(settings=settings)
-    # Inject settings into app state
-    app.state.settings = settings
-    app.state.file_manager = file_manager
-    app.state.job_manager = job_manager
-    app.state.training_manager = training_manager
-    app.state.inference_manager = inference_manager
-    app.state.grpo_manager = grpo_manager
-    return app
 def start_server(host="0.0.0.0", port=7453, storage_path="./storage", timeout=10):
     """Start the Arbor API server with a single function call"""
     import socket
@@ -72,6 +36,7 @@ def start_server(host="0.0.0.0", port=7453, storage_path="./storage", timeout=10
         raise RuntimeError(f"Port {port} is already in use")
     app = create_app(storage_path)
+    # configure_uvicorn_logging()
     config = uvicorn.Config(app, host=host, port=port, log_level="info")
     server = uvicorn.Server(config)
@@ -102,11 +67,86 @@ def stop_server(server):
 @cli.command()
 @click.option("--host", default="0.0.0.0", help="Host to bind to")
 @click.option("--port", default=7453, help="Port to bind to")
-@click.option("--arbor-config", required=True, help="Path to the Arbor config file")
+@click.option("--arbor-config", required=False, help="Path to the Arbor config file")
 def serve(host, port, arbor_config):
     """Start the Arbor API server"""
-    app = create_app(arbor_config)
-    uvicorn.run(app, host=host, port=port)
+    if arbor_config:
+        config_path = arbor_config
+    else:
+        config_path = Config.use_default_config()
+        # If no config found, run first-time setup
+        if config_path is None:
+            config_path = run_first_time_setup()
+    # Validate config exists and is readable
+    is_valid, msg = ConfigManager.validate_config_file(config_path)
+    if not is_valid:
+        click.echo(msg)
+        raise click.Abort()
+    try:
+        create_app(config_path)
+        # Temporarily disable custom uvicorn logging configuration
+        # configure_uvicorn_logging()
+        uvicorn.run(app, host=host, port=port)
+    except Exception as e:
+        click.echo(f"Failed to start server: {e}", err=True)
+        raise click.Abort()
+def run_first_time_setup() -> str:
+    """Run first-time setup and return created config path"""
+    click.echo("Welcome to Arbor!")
+    click.echo("It looks like this is your first time running Arbor.")
+    click.echo("Let's set up your configuration...\n")
+    try:
+        # Get config details
+        inference = click.prompt(
+            "Which gpu ids should be used for inference (separated by comma)",
+            default="0",
+        )
+        training = click.prompt(
+            "Which gpu ids should be used for training (separated by comma)",
+            default="1, 2",
+        )
+        click.echo()
+        # Get config file path
+        config_path = click.prompt(
+            "Enter path to save config file in. We recommend (~/.arbor/config.yaml)",
+            default=ConfigManager.get_default_config_path(),
+        )
+        logger = get_logger(__name__)
+        logger.info(f"Config path selected: {config_path}")
+        click.echo()
+        # Update or create config at path
+        config_path = ConfigManager.update_config(inference, training, config_path)
+        click.echo(f"Created configuration at: {config_path}")
+        # Check if it is a valid config file
+        is_valid, msg = ConfigManager.validate_config_file(config_path)
+        if not is_valid:
+            raise click.ClickException(f"Invalid config file: {msg}")
+        # Read and display the contents
+        _, content = ConfigManager.get_config_contents(config_path)
+        click.echo("\nConfiguration file contents:")
+        click.echo("---")
+        click.echo(content)
+        click.echo("---")
+        click.echo("\nSetup complete! Starting Arbor server...")
+        return config_path
+    except Exception as e:
+        click.echo(f"Failed initial setup of Arbor: {e}", err=True)
+        raise click.Abort()
 if __name__ == "__main__":

arbor/client/arbor_client.py ADDED Viewed

@@ -0,0 +1,259 @@
+import asyncio
+import os
+import socket
+import threading
+import time
+from datetime import datetime
+import click
+import requests
+import uvicorn
+from arbor.server.core.config import Config
+from arbor.server.core.config_manager import ConfigManager
+from arbor.server.main import app
+from arbor.server.services.file_manager import FileManager
+from arbor.server.services.grpo_manager import GRPOManager
+from arbor.server.services.health_manager import HealthManager
+from arbor.server.services.inference_manager import InferenceManager
+from arbor.server.services.job_manager import JobManager
+from arbor.server.services.training_manager import TrainingManager
+# Global server state
+_server = None
+_server_thread = None
+_server_loop = None
+_server_host = None
+_server_port = None
+def create_app(
+    config_path: str = None,
+    storage_path: str = None,
+    inference_gpus: str = None,
+    training_gpus: str = None,
+):
+    """Create and configure the Arbor API application
+    Args:
+        arbor_config_path (str): Path to config file
+        storage_path (str): Path to storage directory
+        inference_gpus (str): gpu ids to use for inference
+        training_gpus (str): gpu ids to use for training
+    Returns:
+        FastAPI: Configured FastAPI application
+    """
+    # Create new config instance with overrides
+    if config_path:
+        config = Config.load_config_from_yaml(config_path)
+    elif inference_gpus and training_gpus:
+        config = Config.load_config_directly(
+            storage_path, inference_gpus, training_gpus
+        )
+    else:
+        raise ValueError(
+            "Either 'config_path' must be provided, or 'inference_gpus', and 'training_gpus' must be provided"
+        )
+    app.state.log_dir = Config.make_log_dir(config.STORAGE_PATH)
+    # Initialize services with config
+    health_manager = HealthManager(config=config)
+    file_manager = FileManager(config=config)
+    job_manager = JobManager(config=config)
+    training_manager = TrainingManager(config=config)
+    inference_manager = InferenceManager(config=config)
+    grpo_manager = GRPOManager(config=config)
+    # Inject config into app state
+    app.state.config = config
+    app.state.file_manager = file_manager
+    app.state.job_manager = job_manager
+    app.state.training_manager = training_manager
+    app.state.inference_manager = inference_manager
+    app.state.grpo_manager = grpo_manager
+    app.state.health_manager = health_manager
+    return app
+def serve(
+    config_path: str = None,
+    storage_path: str = None,
+    inference_gpus: str = None,
+    training_gpus: str = None,
+    host: str = "0.0.0.0",
+    port: int = 7453,
+):
+    """Start the Arbor API server.
+    Starts the server in a background thread and returns once the server is ready to accept requests.
+    Use arbor.stop() to shutdown the server.
+    Args:
+        config_path: Path to YAML config file (optional)
+        storage_path: Valid storage directory path (optional)
+        inference_gpus: GPU IDs for inference, e.g. "0,1" (optional, default 0)
+        training_gpus: GPU IDs for training, e.g. "1,2,3" (optional, default 1,2)
+        host: Host to bind to (default: "0.0.0.0")
+        port: Port to bind to (default: 7453)
+    Example:
+        import arbor
+        arbor.serve(inference_gpus="0", training_gpus="1,2")
+        # Server is now ready to accept requests
+        # Later, to stop:
+        arbor.stop()
+    """
+    global _server, _server_thread, _server_loop, _server_host, _server_port
+    # Stop existing server if running
+    if _server is not None:
+        print("🌳 Stopping existing server...")
+        stop()
+    _server_host = host
+    _server_port = port
+    create_app(config_path, storage_path, inference_gpus, training_gpus)
+    # Start server in background thread
+    def run_server():
+        global _server, _server_loop
+        # Create a new event loop for this thread
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        _server_loop = loop
+        # Create uvicorn config and server
+        config = uvicorn.Config(app, host=host, port=port, loop=loop)
+        server = uvicorn.Server(config)
+        _server = server
+        # Run the server
+        try:
+            loop.run_until_complete(server.serve())
+        except Exception as e:
+            print(f"Server error: {e}")
+        finally:
+            loop.close()
+            _server = None
+            _server_loop = None
+    # Start server thread
+    _server_thread = threading.Thread(target=run_server, daemon=True)
+    _server_thread.start()
+    print(f"🌳 Starting Arbor server on http://{host}:{port}...")
+    # Wait for server to be ready
+    try:
+        _wait_for_server_ready(host, port, timeout=60)  # Increased timeout
+        print(f"🌳 Arbor server is ready and accepting requests!")
+    except TimeoutError as e:
+        print(f"❌ {e}")
+        # Try to stop the server if it failed to start properly
+        stop()
+        raise
+def stop():
+    """Stop the Arbor server if it's running."""
+    global _server, _server_thread, _server_loop
+    if _server is None:
+        print("🌳 No server running to stop.")
+        return
+    print("🌳 Stopping Arbor server...")
+    # Schedule server shutdown in the server's event loop
+    if _server_loop and _server:
+        try:
+            asyncio.run_coroutine_threadsafe(_server.shutdown(), _server_loop)
+        except Exception as e:
+            print(f"Error during shutdown: {e}")
+    # Wait for thread to finish
+    if _server_thread and _server_thread.is_alive():
+        _server_thread.join(timeout=5)
+    # Reset global state
+    _server = None
+    _server_thread = None
+    _server_loop = None
+    print("🌳 Arbor server stopped.")
+def is_running():
+    """Check if the Arbor server is currently running."""
+    return (
+        _server is not None and _server_thread is not None and _server_thread.is_alive()
+    )
+def _wait_for_server_ready(host, port, timeout=30):
+    """Wait for the server to be ready to accept requests."""
+    start_time = time.time()
+    last_error = None
+    port_open = False
+    print(f"🌳 Waiting for server to be ready at http://{host}:{port}...")
+    while time.time() - start_time < timeout:
+        # First check if the port is open
+        if not port_open:
+            try:
+                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                sock.settimeout(1)
+                result = sock.connect_ex((host, port))
+                sock.close()
+                if result == 0:
+                    port_open = True
+                    print(f"🌳 Port {port} is now open, checking health endpoint...")
+                else:
+                    last_error = f"Port {port} not yet open"
+                    time.sleep(0.5)
+                    continue
+            except Exception as e:
+                last_error = f"Socket error: {e}"
+                time.sleep(0.5)
+                continue
+        # Now try the health check
+        try:
+            response = requests.get(f"http://{host}:{port}/health/simple", timeout=2)
+            if response.status_code == 200:
+                print(f"🌳 Server ready! Response: {response.json()}")
+                return
+            else:
+                last_error = f"Health check returned status {response.status_code}"
+        except requests.exceptions.ConnectionError as e:
+            last_error = f"Connection error: {e}"
+            port_open = False  # Port might have closed
+        except requests.exceptions.Timeout as e:
+            last_error = f"Timeout error: {e}"
+        except requests.exceptions.RequestException as e:
+            last_error = f"Request error: {e}"
+        except Exception as e:
+            last_error = f"Unexpected error: {e}"
+        # Print progress every 5 seconds
+        elapsed = time.time() - start_time
+        if int(elapsed) % 5 == 0 and elapsed >= 5:
+            print(
+                f"🌳 Still waiting... ({elapsed:.1f}s elapsed, port_open={port_open}, last error: {last_error})"
+            )
+        time.sleep(0.5)
+    raise TimeoutError(
+        f"Server did not become ready within {timeout} seconds. Last error: {last_error}"
+    )
+if __name__ == "__main__":
+    serve(inference_gpus="0, 1", training_gpus="2, 3")

arbor/server/api/models/schemas.py CHANGED Viewed

@@ -178,7 +178,7 @@ class ChatCompletionModel(BaseModel):
 class GRPORequest(BaseModel):
     model: str
-    batch: List[dict]
+    batch: List[dict] | List[List[dict]]
 class GRPOConfigRequest(BaseModel):
@@ -205,6 +205,8 @@ class GRPOConfigRequest(BaseModel):
     # Arbor specific
     max_context_length: Optional[int] = None
     lora: Optional[bool] = None
+    grpo_flavor: Optional[Literal["grpo", "mmgrpo"]] = None
+    wandb_kwargs: Optional[dict] = None
     # To name the run
     suffix: Optional[str] = None
     generation_batch_size: Optional[int] = None

arbor/server/api/routes/grpo.py CHANGED Viewed

@@ -10,7 +10,6 @@ from arbor.server.api.models.schemas import (
     GRPOConfigResponse,
     GRPORequest,
     GRPOStepResponse,
-    GRPOTerminateRequest,
     GRPOTerminateResponse,
 )
@@ -27,12 +26,9 @@ def initialize_grpo(request: Request, grpo_config_request: GRPOConfigRequest):
 # Create a grpo job
 @router.post("/step", response_model=GRPOStepResponse)
-def run_grpo_step(
-    request: Request, grpo_request: GRPORequest, background_tasks: BackgroundTasks
-):
-    inference_manager = request.app.state.inference_manager
+def run_grpo_step(request: Request, grpo_request: GRPORequest):
     grpo_manager = request.app.state.grpo_manager
+    inference_manager = request.app.state.inference_manager
     step_data = grpo_manager.grpo_step(grpo_request, inference_manager)
     return GRPOStepResponse(status="success", **step_data)

arbor/server/api/routes/inference.py CHANGED Viewed

@@ -3,6 +3,10 @@ import uuid
 from fastapi import APIRouter, Request
+from arbor.server.utils.logging import get_logger
+logger = get_logger(__name__)
 router = APIRouter()
@@ -27,17 +31,17 @@ async def run_inference(
     # if a server isnt running, launch one
     if not inference_manager.is_server_running():
-        print("No model is running, launching model...")
+        logger.info("No model is running, launching model...")
         inference_manager.launch(request_model)
     # if the requested model is different from the launched model, swap the server
     if request_model != inference_manager.launched_model:
-        print(
+        logger.info(
             f"Model changed from {inference_manager.launched_model} to {request_model}, swapping server..."
         )
         inference_manager.kill()
         inference_manager.launch(request_model)
-        print(f"Model swapped to {request_model}")
+        logger.info(f"Model swapped to {request_model}")
     # forward the request to the inference server
     completion = await inference_manager.run_inference(raw_json)

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl