partagpu-1.0.0.tar.gz

+++ partagpu-1.0.0/PKG-INFO
@@ -0,0 +1,54 @@
+ Metadata-Version: 2.4
+ Name: partagpu
+ Version: 1.0.0
+ Summary: Python client for PartaGPU: distributed multi-GPU training over a local network
+ Author-email: César Lizurey <cesar@lizurey.fr>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/cesar-lizurey/partagpu
+ Project-URL: Repository, https://github.com/cesar-lizurey/partagpu
+ Keywords: gpu,distributed,pytorch,training,local-network
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: requests>=2.28
+ Provides-Extra: torch
+ Requires-Dist: torch>=2.0; extra == "torch"
+
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
+++ partagpu-1.0.0/README.md
@@ -0,0 +1,35 @@
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
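
Editor's note: the README states that the PartaGPU app must already be running, and `discover()` (in `src/partagpu/discover.py` below) raises a `ConnectionError` when it is not. A minimal sketch of guarding a notebook or script against that failure mode, using only the package's documented API:

```python
import partagpu

try:
    gpus = partagpu.discover()  # queries the local PartaGPU HTTP API
except ConnectionError as err:
    raise SystemExit(f"PartaGPU is not running: {err}")

for gpu in gpus:
    print(gpu)  # e.g. GPU('local', ip='192.168.70.103', limit=100%, verified)
```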
+++ partagpu-1.0.0/pyproject.toml
@@ -0,0 +1,34 @@
+ [build-system]
+ requires = ["setuptools>=68.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "partagpu"
+ version = "1.0.0"
+ description = "Python client for PartaGPU: distributed multi-GPU training over a local network"
+ readme = "README.md"
+ license = "MIT"
+ requires-python = ">=3.9"
+ authors = [
+     { name = "César Lizurey", email = "cesar@lizurey.fr" },
+ ]
+ keywords = ["gpu", "distributed", "pytorch", "training", "local-network"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Science/Research",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Programming Language :: Python :: 3",
+ ]
+ dependencies = [
+     "requests>=2.28",
+ ]
+
+ [project.optional-dependencies]
+ torch = ["torch>=2.0"]
+
+ [project.urls]
+ Homepage = "https://github.com/cesar-lizurey/partagpu"
+ Repository = "https://github.com/cesar-lizurey/partagpu"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
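
Editor's note: the `[project.optional-dependencies]` table above declares a `torch` extra, so PyTorch is only pulled in on request; the base install depends only on `requests`. To install the client together with `torch>=2.0`:

```bash
pip install "partagpu[torch]"
```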
+++ partagpu-1.0.0/setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
+++ partagpu-1.0.0/src/partagpu/__init__.py
@@ -0,0 +1,6 @@
+ """PartaGPU: Python client for distributed multi-GPU training over a local network."""
+
+ from partagpu.discover import discover, GPUResource
+
+ __version__ = "1.0.0"
+ __all__ = ["discover", "GPUResource"]
+++ partagpu-1.0.0/src/partagpu/discover.py
@@ -0,0 +1,95 @@
+ """Discover available GPU resources via the PartaGPU local HTTP API."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import requests
+
+ API_BASE = "http://127.0.0.1:7654"
+
+
+ @dataclass
+ class GPUResource:
+     """A GPU resource available for distributed training."""
+
+     host: str
+     ip: str
+     gpu_limit_percent: float
+     verified: bool
+
+     def __repr__(self) -> str:
+         status = "verified" if self.verified else "unverified"
+         return f"GPU({self.host!r}, ip={self.ip!r}, limit={self.gpu_limit_percent}%, {status})"
+
+
+ @dataclass
+ class Peer:
+     """A machine discovered on the network by PartaGPU."""
+
+     display_name: str
+     hostname: str
+     ip: str
+     sharing_enabled: bool
+     cpu_limit: float
+     ram_limit: float
+     gpu_limit: float
+     verified: bool
+
+
+ def discover(api_base: str = API_BASE, timeout: float = 2.0) -> list[GPUResource]:
+     """Discover all available GPUs (local + remote peers).
+
+     Requires the PartaGPU desktop app to be running.
+
+     Returns:
+         List of GPUResource objects representing available GPUs.
+
+     Raises:
+         ConnectionError: If the PartaGPU app is not running.
+     """
+     try:
+         resp = requests.get(f"{api_base}/api/gpu", timeout=timeout)
+         resp.raise_for_status()
+     except requests.ConnectionError:
+         raise ConnectionError(
+             "Could not connect to PartaGPU. "
+             "Check that the app is running."
+         ) from None
+     except requests.RequestException as e:
+         raise ConnectionError(f"PartaGPU API error: {e}") from None
+
+     return [GPUResource(**gpu) for gpu in resp.json()]
+
+
+ def get_peers(api_base: str = API_BASE, timeout: float = 2.0) -> list[Peer]:
+     """Get all peers discovered by PartaGPU.
+
+     Returns:
+         List of Peer objects.
+     """
+     try:
+         resp = requests.get(f"{api_base}/api/peers", timeout=timeout)
+         resp.raise_for_status()
+     except requests.ConnectionError:
+         raise ConnectionError(
+             "Could not connect to PartaGPU. "
+             "Check that the app is running."
+         ) from None
+     except requests.RequestException as e:
+         raise ConnectionError(f"PartaGPU API error: {e}") from None
+
+     peers = []
+     for p in resp.json():
+         peers.append(
+             Peer(
+                 display_name=p.get("display_name", ""),
+                 hostname=p.get("hostname", ""),
+                 ip=p.get("ip", ""),
+                 sharing_enabled=p.get("sharing_enabled", False),
+                 cpu_limit=p.get("cpu_limit", 0),
+                 ram_limit=p.get("ram_limit", 0),
+                 gpu_limit=p.get("gpu_limit", 0),
+                 verified=p.get("verified", False),
+             )
+         )
+     return peers
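
Editor's note: `discover()` constructs each result via `GPUResource(**gpu)`, so the `/api/gpu` endpoint must return a JSON array whose objects carry exactly the four dataclass fields. A sketch of the implied payload shape, with values mirroring the README example (the concrete values are illustrative, not taken from a real response):

```python
from partagpu.discover import GPUResource

# Implied shape of GET http://127.0.0.1:7654/api/gpu: a JSON array whose
# objects map one-to-one onto GPUResource's fields.
payload = [
    {"host": "local", "ip": "192.168.70.103", "gpu_limit_percent": 100.0, "verified": True},
    {"host": "César 2", "ip": "192.168.70.105", "gpu_limit_percent": 50.0, "verified": True},
]
gpus = [GPUResource(**gpu) for gpu in payload]  # same construction as discover()
print(gpus[1])  # GPU('César 2', ip='192.168.70.105', limit=50.0%, verified)
```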
+++ partagpu-1.0.0/src/partagpu/distributed.py
@@ -0,0 +1,166 @@
+ """Helpers for distributed PyTorch training across PartaGPU peers.
+
+ Usage in a Jupyter notebook:
+
+     import partagpu
+     gpus = partagpu.discover()
+     # → [GPU('local', ip='192.168.70.103', limit=100%, verified),
+     #    GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+
+     from partagpu.distributed import setup_ddp, cleanup_ddp
+
+     # On each node, call setup_ddp with the appropriate rank
+     setup_ddp(rank=0, world_size=len(gpus), master_addr=gpus[0].ip)
+     ...
+     cleanup_ddp()
+
+ For single-machine multi-GPU or simple remote offloading, use the
+ higher-level `distribute` context manager.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import shlex
+ import subprocess
+ import sys
+ from contextlib import contextmanager
+
+ from partagpu.discover import GPUResource, discover
+
+
+ def setup_ddp(
+     rank: int,
+     world_size: int,
+     master_addr: str = "127.0.0.1",
+     master_port: int = 29500,
+     backend: str = "nccl",
+ ) -> None:
+     """Initialize a PyTorch Distributed Data Parallel process group.
+
+     Args:
+         rank: Global rank of this process.
+         world_size: Total number of processes (= number of GPUs).
+         master_addr: IP of the rank-0 node.
+         master_port: Port for the rendezvous.
+         backend: Communication backend ('nccl' for GPU, 'gloo' for CPU).
+     """
+     import torch.distributed as dist
+
+     os.environ["MASTER_ADDR"] = master_addr
+     os.environ["MASTER_PORT"] = str(master_port)
+     os.environ["RANK"] = str(rank)
+     os.environ["WORLD_SIZE"] = str(world_size)
+
+     dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+
+
+ def cleanup_ddp() -> None:
+     """Destroy the PyTorch distributed process group."""
+     import torch.distributed as dist
+
+     if dist.is_initialized():
+         dist.destroy_process_group()
+
+
+ def launch_workers(
+     script: str,
+     gpus: list[GPUResource] | None = None,
+     master_port: int = 29500,
+     args: list[str] | None = None,
+ ) -> list[subprocess.Popen]:
+     """Launch distributed training workers on available GPUs.
+
+     This starts one subprocess per GPU. The local GPU gets rank 0,
+     remote GPUs get subsequent ranks. Each worker receives RANK,
+     WORLD_SIZE, MASTER_ADDR, MASTER_PORT, and LOCAL_RANK as
+     environment variables.
+
+     Args:
+         script: Path to the training script.
+         gpus: List of GPUResource (defaults to partagpu.discover()).
+         master_port: Port for the rendezvous server.
+         args: Additional arguments to pass to the training script.
+
+     Returns:
+         List of Popen objects for each worker.
+     """
+     if gpus is None:
+         gpus = discover()
+
+     if not gpus:
+         raise RuntimeError("No GPU available. Check that PartaGPU is running.")
+
+     master_addr = gpus[0].ip
+     world_size = len(gpus)
+     workers = []
+
+     ddp_vars = ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK")
+
+     for rank, gpu in enumerate(gpus):
+         env = os.environ.copy()
+         env["MASTER_ADDR"] = master_addr
+         env["MASTER_PORT"] = str(master_port)
+         env["RANK"] = str(rank)
+         env["WORLD_SIZE"] = str(world_size)
+         env["LOCAL_RANK"] = "0"  # one GPU per process
+
+         if gpu.host == "local" or gpu.ip == master_addr:
+             # Local worker: run with the current interpreter.
+             proc = subprocess.Popen([sys.executable, script] + (args or []), env=env)
+         else:
+             # Remote worker via SSH to the partagpu account. Prefix the
+             # command with the DDP environment variables and use the remote
+             # host's python3 (the local interpreter path may not exist there).
+             env_prefix = " ".join(f"{k}={env[k]}" for k in ddp_vars)
+             remote_cmd = env_prefix + " " + shlex.join(["python3", script] + (args or []))
+             proc = subprocess.Popen(["ssh", f"partagpu@{gpu.ip}", remote_cmd])
+
+         workers.append(proc)
+
+     return workers
+
+
+ @contextmanager
+ def distribute(
+     gpus: list[GPUResource] | None = None,
+     master_port: int = 29500,
+     backend: str = "nccl",
+ ):
+     """Context manager for distributed training.
+
+     Discovers GPUs, sets up the DDP process group for rank 0, and
+     provides the list of GPUs. Cleans up on exit.
+
+     Usage:
+         with partagpu.distributed.distribute() as gpus:
+             model = DDP(model)
+             train(model)
+     """
+     if gpus is None:
+         gpus = discover()
+
+     if not gpus:
+         raise RuntimeError("No GPU available. Check that PartaGPU is running.")
+
+     master_addr = gpus[0].ip
+     world_size = len(gpus)
+
+     setup_ddp(
+         rank=0,
+         world_size=world_size,
+         master_addr=master_addr,
+         master_port=master_port,
+         backend=backend,
+     )
+
+     try:
+         yield gpus
+     finally:
+         cleanup_ddp()
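
Editor's note: `launch_workers` hands each worker its coordinates through the RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT environment variables, so the training script must read them back before joining the process group. A minimal sketch of such a `train.py` under those assumptions; the model and training loop are placeholders, and `backend="gloo"` is shown for CPU-only testing:

```python
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from partagpu.distributed import setup_ddp, cleanup_ddp

# Read the coordinates that launch_workers exported for this worker.
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

setup_ddp(
    rank=rank,
    world_size=world_size,
    master_addr=os.environ["MASTER_ADDR"],
    master_port=int(os.environ["MASTER_PORT"]),
    backend="gloo",  # use "nccl" when every worker has a CUDA GPU
)

try:
    model = DDP(torch.nn.Linear(10, 1))  # placeholder model
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    for _ in range(10):  # placeholder training loop
        opt.zero_grad()
        loss = model(torch.randn(32, 10)).pow(2).mean()
        loss.backward()  # DDP all-reduces gradients across workers here
        opt.step()
finally:
    cleanup_ddp()
```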
+++ partagpu-1.0.0/src/partagpu.egg-info/PKG-INFO
@@ -0,0 +1,54 @@
+ Metadata-Version: 2.4
+ Name: partagpu
+ Version: 1.0.0
+ Summary: Python client for PartaGPU: distributed multi-GPU training over a local network
+ Author-email: César Lizurey <cesar@lizurey.fr>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/cesar-lizurey/partagpu
+ Project-URL: Repository, https://github.com/cesar-lizurey/partagpu
+ Keywords: gpu,distributed,pytorch,training,local-network
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: requests>=2.28
+ Provides-Extra: torch
+ Requires-Dist: torch>=2.0; extra == "torch"
+
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
+++ partagpu-1.0.0/src/partagpu.egg-info/SOURCES.txt
@@ -0,0 +1,10 @@
+ README.md
+ pyproject.toml
+ src/partagpu/__init__.py
+ src/partagpu/discover.py
+ src/partagpu/distributed.py
+ src/partagpu.egg-info/PKG-INFO
+ src/partagpu.egg-info/SOURCES.txt
+ src/partagpu.egg-info/dependency_links.txt
+ src/partagpu.egg-info/requires.txt
+ src/partagpu.egg-info/top_level.txt
+++ partagpu-1.0.0/src/partagpu.egg-info/requires.txt
@@ -0,0 +1,4 @@
+ requests>=2.28
+
+ [torch]
+ torch>=2.0
+++ partagpu-1.0.0/src/partagpu.egg-info/top_level.txt
@@ -0,0 +1 @@
+ partagpu