PyPI - arbor-ai - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

arbor/__init__.py +17 -0
arbor/cli.py +83 -43
arbor/client/arbor_client.py +259 -0
arbor/server/api/models/schemas.py +3 -1
arbor/server/api/routes/grpo.py +2 -6
arbor/server/api/routes/inference.py +7 -3
arbor/server/core/config.py +293 -7
arbor/server/core/config_manager.py +100 -0
arbor/server/main.py +26 -1
arbor/server/services/comms/comms.py +13 -9
arbor/server/services/file_manager.py +7 -4
arbor/server/services/grpo_manager.py +98 -62
arbor/server/services/health_manager.py +171 -0
arbor/server/services/inference/vllm_client.py +6 -4
arbor/server/services/inference_manager.py +40 -38
arbor/server/services/job_manager.py +2 -2
arbor/server/services/scripts/grpo_training.py +62 -281
arbor/server/services/scripts/mmgrpo_training.py +510 -0
arbor/server/services/scripts/sft_training.py +8 -5
arbor/server/services/scripts/utils/callbacks.py +33 -0
arbor/server/services/scripts/utils/comms_monitors.py +169 -0
arbor/server/services/scripts/utils/dataset.py +176 -0
arbor/server/services/scripts/utils/ingestion_monitor.py +35 -0
arbor/server/services/scripts/utils/mock_server.py +124 -0
arbor/server/services/training_manager.py +4 -4
arbor/server/utils/logging.py +298 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/METADATA +8 -18
arbor_ai-0.2.2.dist-info/RECORD +51 -0
arbor_ai-0.2.1.dist-info/RECORD +0 -42
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/WHEEL +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/entry_points.txt +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/licenses/LICENSE +0 -0
{arbor_ai-0.2.1.dist-info → arbor_ai-0.2.2.dist-info}/top_level.txt +0 -0

arbor/server/core/config.py CHANGED Viewed

@@ -1,8 +1,18 @@
+import datetime
+import os
+import subprocess
+import sys
 from pathlib import Path
-from typing import Optional
+from typing import Any, ClassVar, Dict, Optional
 import yaml
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel
+try:
+    from importlib.metadata import PackageNotFoundError, version
+except ImportError:
+    # For Python < 3.8
+    from importlib_metadata import PackageNotFoundError, version
 class InferenceConfig(BaseModel):
@@ -19,16 +29,267 @@ class ArborConfig(BaseModel):
     training: TrainingConfig
-class Settings(BaseModel):
-    STORAGE_PATH: str = "./storage"
+class Config(BaseModel):
+    STORAGE_PATH: ClassVar[str] = str(Path.home() / ".arbor" / "storage")
     INACTIVITY_TIMEOUT: int = 30  # 5 seconds
     arbor_config: ArborConfig
+    @staticmethod
+    def validate_storage_path(storage_path: str):
+        """Validates a storage path, return True for success, False if failed."""
+        try:
+            if not Path(storage_path).exists():
+                return False
+            return True
+        except Exception as e:
+            return False
+    @classmethod
+    def set_storage_path(cls, storage_path: str):
+        """Set a valid storage path to use, return True for success, False if failed."""
+        if not cls.validate_storage_path(storage_path):
+            return False
+        cls.STORAGE_PATH = storage_path
+        return True
+    @staticmethod
+    def validate_storage_path(storage_path: str) -> None:
+        """Validates a storage path, raises exception if invalid."""
+        if not storage_path:
+            raise ValueError("Storage path cannot be empty")
+        path = Path(storage_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Storage path does not exist: {storage_path}")
+        if not path.is_dir():
+            raise NotADirectoryError(f"Storage path is not a directory: {storage_path}")
+        # Check if we can write to the directory
+        if not os.access(path, os.W_OK):
+            raise PermissionError(
+                f"No write permission for storage path: {storage_path}"
+            )
+    @classmethod
+    def set_storage_path(cls, storage_path: str) -> None:
+        """Set a valid storage path to use, raises exception if invalid."""
+        cls.validate_storage_path(storage_path)  # raises if invalid
+        cls.STORAGE_PATH = storage_path
+    @classmethod
+    def make_log_dir(cls, storage_path: str = None):
+        """Create a timestamped log directory under the storage path."""
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_dir = Path(
+            storage_path if storage_path else cls.STORAGE_PATH / "logs" / timestamp
+        )
+        log_dir.mkdir(exist_ok=True)
+        return log_dir
+    @staticmethod
+    def get_arbor_version() -> str:
+        """Get the installed version of arbor package."""
+        try:
+            return version("arbor-ai")
+        except PackageNotFoundError:
+            # Fallback to a default version if package not found
+            # This might happen in development mode
+            return "dev"
+        except Exception:
+            return "unknown"
+    @staticmethod
+    def get_cuda_version() -> str:
+        """Get CUDA runtime version."""
+        try:
+            import torch
+            if torch.cuda.is_available():
+                return torch.version.cuda
+            else:
+                return "not_available"
+        except ImportError:
+            try:
+                # Try getting CUDA version from nvcc
+                result = subprocess.run(
+                    ["nvcc", "--version"], capture_output=True, text=True, timeout=5
+                )
+                if result.returncode == 0:
+                    # Parse nvcc output for version
+                    for line in result.stdout.split("\n"):
+                        if "release" in line.lower():
+                            # Extract version from line like "Cuda compilation tools, release 11.8, V11.8.89"
+                            parts = line.split("release")
+                            if len(parts) > 1:
+                                version_part = parts[1].split(",")[0].strip()
+                                return version_part
+                return "unknown"
+            except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
+                return "not_installed"
+        except Exception:
+            return "unknown"
+    @staticmethod
+    def get_nvidia_driver_version() -> str:
+        """Get NVIDIA driver version."""
+        try:
+            result = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                return result.stdout.strip().split("\n")[0]
+            return "unknown"
+        except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
+            return "not_installed"
+    @staticmethod
+    def get_python_package_version(package_name: str) -> str:
+        """Get version of a Python package."""
+        try:
+            return version(package_name)
+        except PackageNotFoundError:
+            return "not_installed"
+        except Exception:
+            return "unknown"
+    @classmethod
+    def get_ml_library_versions(cls) -> Dict[str, str]:
+        """Get versions of common ML libraries."""
+        libraries = {
+            "torch": "torch",
+            "transformers": "transformers",
+            "vllm": "vllm",
+            "trl": "trl",
+            "peft": "peft",
+            "accelerate": "accelerate",
+            "ray": "ray",
+            "wandb": "wandb",
+            "numpy": "numpy",
+            "pandas": "pandas",
+            "scikit-learn": "scikit-learn",
+        }
+        versions = {}
+        for lib_name, package_name in libraries.items():
+            versions[lib_name] = cls.get_python_package_version(package_name)
+        return versions
+    @classmethod
+    def get_cuda_library_versions(cls) -> Dict[str, str]:
+        """Get versions of CUDA-related libraries."""
+        cuda_info = {}
+        # CUDA runtime version
+        cuda_info["cuda_runtime"] = cls.get_cuda_version()
+        # NVIDIA driver version
+        cuda_info["nvidia_driver"] = cls.get_nvidia_driver_version()
+        # cuDNN version (if available through PyTorch)
+        try:
+            import torch
+            if torch.cuda.is_available() and hasattr(torch.backends.cudnn, "version"):
+                cuda_info["cudnn"] = str(torch.backends.cudnn.version())
+            else:
+                cuda_info["cudnn"] = "not_available"
+        except ImportError:
+            cuda_info["cudnn"] = "torch_not_installed"
+        except Exception:
+            cuda_info["cudnn"] = "unknown"
+        # NCCL version (if available through PyTorch)
+        try:
+            import torch
+            if torch.cuda.is_available() and hasattr(torch, "__version__"):
+                # NCCL version is often embedded in PyTorch build info
+                try:
+                    import torch.distributed as dist
+                    if hasattr(dist, "is_nccl_available") and dist.is_nccl_available():
+                        # Try to get NCCL version from PyTorch
+                        if hasattr(torch.cuda.nccl, "version"):
+                            nccl_version = torch.cuda.nccl.version()
+                            cuda_info["nccl"] = (
+                                f"{nccl_version[0]}.{nccl_version[1]}.{nccl_version[2]}"
+                            )
+                        else:
+                            cuda_info["nccl"] = "available"
+                    else:
+                        cuda_info["nccl"] = "not_available"
+                except Exception:
+                    cuda_info["nccl"] = "unknown"
+            else:
+                cuda_info["nccl"] = "cuda_not_available"
+        except ImportError:
+            cuda_info["nccl"] = "torch_not_installed"
+        except Exception:
+            cuda_info["nccl"] = "unknown"
+        return cuda_info
     @classmethod
-    def load_from_yaml(cls, yaml_path: str) -> "Settings":
+    def get_system_versions(cls) -> Dict[str, Any]:
+        """Get comprehensive version information for the system."""
+        return {
+            "arbor": cls.get_arbor_version(),
+            "python": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
+            "ml_libraries": cls.get_ml_library_versions(),
+            "cuda_stack": cls.get_cuda_library_versions(),
+        }
+    @classmethod
+    def _init_arbor_directories(cls):
+        arbor_root = Path.home() / ".arbor"
+        storage_dir = Path(cls.STORAGE_PATH)
+        arbor_root.mkdir(exist_ok=True)
+        storage_dir.mkdir(exist_ok=True)
+        (storage_dir / "logs").mkdir(exist_ok=True)
+        (storage_dir / "models").mkdir(exist_ok=True)
+        (storage_dir / "uploads").mkdir(exist_ok=True)
+    @classmethod
+    def use_default_config(cls) -> Optional[str]:
+        """Search for: ~/.arbor/config.yaml, else return None"""
+        # Check ~/.arbor/config.yaml
+        arbor_config = Path.home() / ".arbor" / "config.yaml"
+        if arbor_config.exists():
+            return str(arbor_config)
+        return None
+    @classmethod
+    def load_config_from_yaml(cls, yaml_path: str) -> "Config":
+        # If yaml file is not provided, try to use ~/.arbor/config.yaml
+        cls._init_arbor_directories()
+        if not yaml_path:
+            yaml_path = cls.use_default_config()
         if not yaml_path:
-            raise ValueError("Config file path is required")
+            raise ValueError(
+                "No config file found. Please create ~/.arbor/config.yaml or "
+                "provide a config file path with --arbor-config"
+            )
         if not Path(yaml_path).exists():
             raise ValueError(f"Config file {yaml_path} does not exist")
@@ -42,6 +303,31 @@ class Settings(BaseModel):
                     training=TrainingConfig(**config["training"]),
                 )
             )
+            storage_path = config.get("storage_path")
+            if storage_path:
+                cls.set_storage_path(storage_path)
             return settings
         except Exception as e:
             raise ValueError(f"Error loading config file {yaml_path}: {e}")
+    @classmethod
+    def load_config_directly(
+        cls,
+        storage_path: str = None,
+        inference_gpus: str = "0",
+        training_gpus: str = "1,2",
+    ):
+        cls._init_arbor_directories()
+        # create settings without yaml file
+        config = ArborConfig(
+            inference=InferenceConfig(gpu_ids=inference_gpus),
+            training=TrainingConfig(gpu_ids=training_gpus),
+        )
+        if storage_path:
+            cls.set_storage_path(storage_path)
+        return cls(arbor_config=config)

arbor/server/core/config_manager.py ADDED Viewed

@@ -0,0 +1,100 @@
+import os
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+import yaml
+from arbor.server.core.config import Config
+class ConfigManager:
+    def __init__(self):
+        self._init_arbor_directories()
+    def _init_arbor_directories(self):
+        arbor_root = Path.home() / ".arbor"
+        storage_dir = Path(self.STORAGE_PATH)
+        arbor_root.mkdir(exist_ok=True)
+        storage_dir.mkdir(exist_ok=True)
+        (storage_dir / "logs").mkdir(exist_ok=True)
+        (storage_dir / "models").mkdir(exist_ok=True)
+        (storage_dir / "uploads").mkdir(exist_ok=True)
+    @staticmethod
+    def get_default_config_path() -> Path:
+        return str(Path.home() / ".arbor" / "config.yaml")
+    @staticmethod
+    def get_config_template() -> Dict:
+        return {"inference": {"gpu_ids": "0"}, "training": {"gpu_ids": "1, 2"}}
+    @classmethod
+    def update_config(
+        cls,
+        inference_gpus: Optional[str] = None,
+        training_gpus: Optional[str] = None,
+        config_path: Optional[str] = None,
+    ) -> str:
+        """Update existing config or create new one."""
+        if config_path is None:
+            config_path = Config.use_default_config()
+            if config_path is None:
+                config_path = str(cls.get_default_config_path())
+        config_file = Path(config_path)
+        config_file.parent.mkdir(parents=True, exist_ok=True)
+        # Load existing config or use template
+        if config_file.exists():
+            with open(config_file, "r") as f:
+                config = yaml.safe_load(f) or {}
+        else:
+            config = cls.get_config_template()
+        # Update values given
+        if inference_gpus is not None:
+            if "inference" not in config:
+                config["inference"] = {}
+            config["inference"]["gpu_ids"] = str(inference_gpus)
+        if training_gpus is not None:
+            if "training" not in config:
+                config["training"] = {}
+            config["training"]["gpu_ids"] = str(training_gpus)
+        temp_path = config_file.with_suffix(".tmp")
+        try:
+            with open(temp_path, "w") as f:
+                yaml.dump(config, f, default_flow_style=False, default_style="'")
+            temp_path.rename(config_file)
+        except Exception:
+            if temp_path.exists():
+                temp_path.unlink()
+            raise
+        return str(config_file)
+    @classmethod
+    def validate_config_file(cls, config_path: str) -> Tuple[bool, str]:
+        """Validate a config file"""
+        try:
+            if not Path(config_path).exists():
+                return False, f"Config file does not exist: {config_path}"
+            # If we do have a config file, try to see if it will load
+            Config.load_config_from_yaml(config_path)
+            return True, "Config is valid"
+        except Exception as e:
+            return False, f"Invalid config: {e}"
+    @classmethod
+    def get_config_contents(cls, config_path: str) -> Tuple[bool, str]:
+        try:
+            with open(config_path, "r") as f:
+                config_content = f.read()
+            return True, config_content
+        except Exception as e:
+            return False, str(e)

arbor/server/main.py CHANGED Viewed

@@ -1,11 +1,36 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
 from arbor.server.api.routes import files, grpo, inference, jobs
+from arbor.server.utils.logging import apply_uvicorn_formatting
 app = FastAPI(title="Arbor API")
+@app.on_event("startup")
+async def startup_event():
+    """Configure uvicorn logging after the app starts up."""
+    apply_uvicorn_formatting()
 # Include routers
 app.include_router(files.router, prefix="/v1/files")
 app.include_router(jobs.router, prefix="/v1/fine_tuning/jobs")
 app.include_router(grpo.router, prefix="/v1/fine_tuning/grpo")
 app.include_router(inference.router, prefix="/v1/chat")
+@app.get("/health")
+def health_check(request: Request):
+    """Enhanced health check with system and GPU information."""
+    health_manager = request.app.state.health_manager
+    return health_manager.get_health_status()
+@app.get("/health/simple")
+def simple_health_check(request: Request):
+    """Simple health check that returns just the status."""
+    health_manager = request.app.state.health_manager
+    return {
+        "status": "healthy" if health_manager.is_healthy() else "unhealthy",
+        "timestamp": health_manager.get_health_status()["timestamp"],
+    }

arbor/server/services/comms/comms.py CHANGED Viewed

@@ -6,6 +6,10 @@ import time
 import zmq
+from arbor.server.utils.logging import get_logger
+logger = get_logger(__name__)
 class ArborServerCommsHandler:
     """Handles socket communication between manager and training process"""
@@ -64,14 +68,14 @@ class ArborServerCommsHandler:
     def wait_for_clients(self, expected_count):
         connected_clients = []
         while len(connected_clients) < expected_count:
-            print(f"Waiting for {expected_count} clients to connect...")
+            logger.info(f"Waiting for {expected_count} clients to connect...")
             msg = self.handshake_socket.recv_json()
             if msg.get("type") == "hello":
                 client_id = msg.get("client_id")
                 connected_clients.append(client_id)
                 self.handshake_socket.send_json({"status": "ack"})
-            print(f"Received handshake from {client_id}")
-        print(f"All {expected_count} clients connected!")
+            logger.info(f"Received handshake from {client_id}")
+        logger.info(f"All {expected_count} clients connected!")
 class ArborScriptCommsHandler:
@@ -138,7 +142,7 @@ class ArborScriptCommsHandler:
                     data = self.data_socket.recv_json()
                     self.data_queue.put(data)
                 except Exception as e:
-                    print(f"Error receiving data: {e}")
+                    logger.error(f"Error receiving data: {e}")
                     break
         self.receiver_thread = threading.Thread(target=_receiver, daemon=True)
@@ -170,7 +174,7 @@ class ArborScriptCommsHandler:
         return f"{socket.gethostname()}_{os.getpid()}"
     def _send_handshake(self):
-        print(f"Sending handshake to {self.handshake_socket}")
+        logger.debug(f"Sending handshake to {self.handshake_socket}")
         self.handshake_socket.send_json(
             {"type": "hello", "client_id": self._get_client_id()}
         )
@@ -187,12 +191,12 @@ if __name__ == "__main__":
     def _client_thread(script_comms):
         for data in script_comms.receive_data():
-            print("Client received data:", data)
+            logger.info("Client received data:", data)
     server_comms = ArborServerCommsHandler()
     t1 = threading.Thread(target=_server_thread, args=(server_comms,))
     t1.start()
-    print("Server started")
+    logger.info("Server started")
     client_threads = []
     script_comms_list = []
@@ -222,9 +226,9 @@ if __name__ == "__main__":
         for t in client_threads:
             t.join()
     except KeyboardInterrupt:
-        print("Keyboard interrupt")
+        logger.info("Keyboard interrupt")
     except Exception as e:
-        print(f"Error: {e}")
+        logger.error(f"Error: {e}")
     finally:
         for script_comms in script_comms_list:
             script_comms.close()

arbor/server/services/file_manager.py CHANGED Viewed

@@ -7,7 +7,10 @@ from pathlib import Path
 from fastapi import UploadFile
-from arbor.server.core.config import Settings
+from arbor.server.core.config import Config
+from arbor.server.utils.logging import get_logger
+logger = get_logger(__name__)
 class FileValidationError(Exception):
@@ -17,8 +20,8 @@ class FileValidationError(Exception):
 class FileManager:
-    def __init__(self, settings: Settings):
-        self.uploads_dir = Path(settings.STORAGE_PATH) / "uploads"
+    def __init__(self, config: Config):
+        self.uploads_dir = Path(config.STORAGE_PATH) / "uploads"
         self.uploads_dir.mkdir(parents=True, exist_ok=True)
         self.files = self.load_files_from_uploads()
@@ -284,6 +287,6 @@ class FileManager:
                     }
                     fout.write(json.dumps(new_line) + "\n")
                 except Exception as e:
-                    print(f"Error parsing line {line_num}: {e}")
+                    logger.error(f"Error parsing line {line_num}: {e}")
         os.replace(output_path, file_path)

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

arbor-ai 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl