partagpu-1.0.0.tar.gz

+++ partagpu-1.0.0/PKG-INFO
@@ -0,0 +1,54 @@
+ Metadata-Version: 2.4
+ Name: partagpu
+ Version: 1.0.0
+ Summary: Python client for PartaGPU: distributed multi-GPU training over a local network
+ Author-email: César Lizurey <cesar@lizurey.fr>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/cesar-lizurey/partagpu
+ Project-URL: Repository, https://github.com/cesar-lizurey/partagpu
+ Keywords: gpu,distributed,pytorch,training,local-network
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: requests>=2.28
+ Provides-Extra: torch
+ Requires-Dist: torch>=2.0; extra == "torch"
+
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
+++ partagpu-1.0.0/README.md
@@ -0,0 +1,35 @@
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
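
Editor's note: the README states that the PartaGPU app must already be running, and `discover()` (in `src/partagpu/discover.py` below) raises a `ConnectionError` when it is not. A minimal sketch of guarding a notebook or script against that failure mode, using only the package's documented API:

```python
import partagpu

try:
    gpus = partagpu.discover()  # queries the local PartaGPU HTTP API
except ConnectionError as err:
    raise SystemExit(f"PartaGPU is not running: {err}")

for gpu in gpus:
    print(gpu)  # e.g. GPU('local', ip='192.168.70.103', limit=100%, verified)
```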
+++ partagpu-1.0.0/pyproject.toml
@@ -0,0 +1,34 @@
+ [build-system]
+ requires = ["setuptools>=68.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "partagpu"
+ version = "1.0.0"
+ description = "Python client for PartaGPU: distributed multi-GPU training over a local network"
+ readme = "README.md"
+ license = "MIT"
+ requires-python = ">=3.9"
+ authors = [
+     { name = "César Lizurey", email = "cesar@lizurey.fr" },
+ ]
+ keywords = ["gpu", "distributed", "pytorch", "training", "local-network"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Science/Research",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Programming Language :: Python :: 3",
+ ]
+ dependencies = [
+     "requests>=2.28",
+ ]
+
+ [project.optional-dependencies]
+ torch = ["torch>=2.0"]
+
+ [project.urls]
+ Homepage = "https://github.com/cesar-lizurey/partagpu"
+ Repository = "https://github.com/cesar-lizurey/partagpu"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
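
Editor's note: the `[project.optional-dependencies]` table above declares a `torch` extra, so PyTorch is only pulled in on request; the base install depends only on `requests`. To install the client together with `torch>=2.0`:

```bash
pip install "partagpu[torch]"
```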
+++ partagpu-1.0.0/setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
+++ partagpu-1.0.0/src/partagpu/__init__.py
@@ -0,0 +1,6 @@
+ """PartaGPU: Python client for distributed multi-GPU training over a local network."""
+
+ from partagpu.discover import discover, GPUResource
+
+ __version__ = "1.0.0"
+ __all__ = ["discover", "GPUResource"]
+++ partagpu-1.0.0/src/partagpu/discover.py
@@ -0,0 +1,95 @@
+ """Discover available GPU resources via the PartaGPU local HTTP API."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import requests
+
+ API_BASE = "http://127.0.0.1:7654"
+
+
+ @dataclass
+ class GPUResource:
+     """A GPU resource available for distributed training."""
+
+     host: str
+     ip: str
+     gpu_limit_percent: float
+     verified: bool
+
+     def __repr__(self) -> str:
+         status = "verified" if self.verified else "unverified"
+         return f"GPU({self.host!r}, ip={self.ip!r}, limit={self.gpu_limit_percent}%, {status})"
+
+
+ @dataclass
+ class Peer:
+     """A machine discovered on the network by PartaGPU."""
+
+     display_name: str
+     hostname: str
+     ip: str
+     sharing_enabled: bool
+     cpu_limit: float
+     ram_limit: float
+     gpu_limit: float
+     verified: bool
+
+
+ def discover(api_base: str = API_BASE, timeout: float = 2.0) -> list[GPUResource]:
+     """Discover all available GPUs (local + remote peers).
+
+     Requires the PartaGPU desktop app to be running.
+
+     Returns:
+         List of GPUResource objects representing available GPUs.
+
+     Raises:
+         ConnectionError: If the PartaGPU app is not running.
+     """
+     try:
+         resp = requests.get(f"{api_base}/api/gpu", timeout=timeout)
+         resp.raise_for_status()
+     except requests.ConnectionError:
+         raise ConnectionError(
+             "Could not connect to PartaGPU. "
+             "Check that the app is running."
+         ) from None
+     except requests.RequestException as e:
+         raise ConnectionError(f"PartaGPU API error: {e}") from None
+
+     return [GPUResource(**gpu) for gpu in resp.json()]
+
+
+ def get_peers(api_base: str = API_BASE, timeout: float = 2.0) -> list[Peer]:
+     """Get all peers discovered by PartaGPU.
+
+     Returns:
+         List of Peer objects.
+     """
+     try:
+         resp = requests.get(f"{api_base}/api/peers", timeout=timeout)
+         resp.raise_for_status()
+     except requests.ConnectionError:
+         raise ConnectionError(
+             "Could not connect to PartaGPU. "
+             "Check that the app is running."
+         ) from None
+     except requests.RequestException as e:
+         raise ConnectionError(f"PartaGPU API error: {e}") from None
+
+     peers = []
+     for p in resp.json():
+         peers.append(
+             Peer(
+                 display_name=p.get("display_name", ""),
+                 hostname=p.get("hostname", ""),
+                 ip=p.get("ip", ""),
+                 sharing_enabled=p.get("sharing_enabled", False),
+                 cpu_limit=p.get("cpu_limit", 0),
+                 ram_limit=p.get("ram_limit", 0),
+                 gpu_limit=p.get("gpu_limit", 0),
+                 verified=p.get("verified", False),
+             )
+         )
+     return peers
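
Editor's note: `discover()` constructs each result via `GPUResource(**gpu)`, so the `/api/gpu` endpoint must return a JSON array whose objects carry exactly the four dataclass fields. A sketch of the implied payload shape, with values mirroring the README example (the concrete values are illustrative, not taken from a real response):

```python
from partagpu.discover import GPUResource

# Implied shape of GET http://127.0.0.1:7654/api/gpu: a JSON array whose
# objects map one-to-one onto GPUResource's fields.
payload = [
    {"host": "local", "ip": "192.168.70.103", "gpu_limit_percent": 100.0, "verified": True},
    {"host": "César 2", "ip": "192.168.70.105", "gpu_limit_percent": 50.0, "verified": True},
]
gpus = [GPUResource(**gpu) for gpu in payload]  # same construction as discover()
print(gpus[1])  # GPU('César 2', ip='192.168.70.105', limit=50.0%, verified)
```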
+++ partagpu-1.0.0/src/partagpu/distributed.py
@@ -0,0 +1,166 @@
+ """Helpers for distributed PyTorch training across PartaGPU peers.
+
+ Usage in a Jupyter notebook:
+
+     import partagpu
+     gpus = partagpu.discover()
+     # → [GPU('local', ip='192.168.70.103', limit=100%, verified),
+     #    GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+
+     from partagpu.distributed import setup_ddp, cleanup_ddp
+
+     # On each node, call setup_ddp with the appropriate rank
+     setup_ddp(rank=0, world_size=len(gpus), master_addr=gpus[0].ip)
+     ...
+     cleanup_ddp()
+
+ For single-machine multi-GPU or simple remote offloading, use the
+ higher-level `distribute` context manager.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import shlex
+ import subprocess
+ import sys
+ from contextlib import contextmanager
+
+ from partagpu.discover import GPUResource, discover
+
+
+ def setup_ddp(
+     rank: int,
+     world_size: int,
+     master_addr: str = "127.0.0.1",
+     master_port: int = 29500,
+     backend: str = "nccl",
+ ) -> None:
+     """Initialize a PyTorch Distributed Data Parallel process group.
+
+     Args:
+         rank: Global rank of this process.
+         world_size: Total number of processes (= number of GPUs).
+         master_addr: IP of the rank-0 node.
+         master_port: Port for the rendezvous.
+         backend: Communication backend ('nccl' for GPU, 'gloo' for CPU).
+     """
+     import torch.distributed as dist
+
+     os.environ["MASTER_ADDR"] = master_addr
+     os.environ["MASTER_PORT"] = str(master_port)
+     os.environ["RANK"] = str(rank)
+     os.environ["WORLD_SIZE"] = str(world_size)
+
+     dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+
+
+ def cleanup_ddp() -> None:
+     """Destroy the PyTorch distributed process group."""
+     import torch.distributed as dist
+
+     if dist.is_initialized():
+         dist.destroy_process_group()
+
+
+ def launch_workers(
+     script: str,
+     gpus: list[GPUResource] | None = None,
+     master_port: int = 29500,
+     args: list[str] | None = None,
+ ) -> list[subprocess.Popen]:
+     """Launch distributed training workers on available GPUs.
+
+     This starts one subprocess per GPU. The local GPU gets rank 0,
+     remote GPUs get subsequent ranks. Each worker receives RANK,
+     WORLD_SIZE, MASTER_ADDR, MASTER_PORT, and LOCAL_RANK as
+     environment variables.
+
+     Args:
+         script: Path to the training script.
+         gpus: List of GPUResource (defaults to partagpu.discover()).
+         master_port: Port for the rendezvous server.
+         args: Additional arguments to pass to the training script.
+
+     Returns:
+         List of Popen objects for each worker.
+     """
+     if gpus is None:
+         gpus = discover()
+
+     if not gpus:
+         raise RuntimeError("No GPU available. Check that PartaGPU is running.")
+
+     master_addr = gpus[0].ip
+     world_size = len(gpus)
+     workers = []
+
+     ddp_vars = ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK")
+
+     for rank, gpu in enumerate(gpus):
+         env = os.environ.copy()
+         env["MASTER_ADDR"] = master_addr
+         env["MASTER_PORT"] = str(master_port)
+         env["RANK"] = str(rank)
+         env["WORLD_SIZE"] = str(world_size)
+         env["LOCAL_RANK"] = "0"  # one GPU per process
+
+         if gpu.host == "local" or gpu.ip == master_addr:
+             # Local worker: run with the current interpreter.
+             proc = subprocess.Popen([sys.executable, script] + (args or []), env=env)
+         else:
+             # Remote worker via SSH to the partagpu account. Prefix the
+             # command with the DDP environment variables and use the remote
+             # host's python3 (the local interpreter path may not exist there).
+             env_prefix = " ".join(f"{k}={env[k]}" for k in ddp_vars)
+             remote_cmd = env_prefix + " " + shlex.join(["python3", script] + (args or []))
+             proc = subprocess.Popen(["ssh", f"partagpu@{gpu.ip}", remote_cmd])
+
+         workers.append(proc)
+
+     return workers
+
+
+ @contextmanager
+ def distribute(
+     gpus: list[GPUResource] | None = None,
+     master_port: int = 29500,
+     backend: str = "nccl",
+ ):
+     """Context manager for distributed training.
+
+     Discovers GPUs, sets up the DDP process group for rank 0, and
+     provides the list of GPUs. Cleans up on exit.
+
+     Usage:
+         with partagpu.distributed.distribute() as gpus:
+             model = DDP(model)
+             train(model)
+     """
+     if gpus is None:
+         gpus = discover()
+
+     if not gpus:
+         raise RuntimeError("No GPU available. Check that PartaGPU is running.")
+
+     master_addr = gpus[0].ip
+     world_size = len(gpus)
+
+     setup_ddp(
+         rank=0,
+         world_size=world_size,
+         master_addr=master_addr,
+         master_port=master_port,
+         backend=backend,
+     )
+
+     try:
+         yield gpus
+     finally:
+         cleanup_ddp()
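
Editor's note: `launch_workers` hands each worker its coordinates through the RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT environment variables, so the training script must read them back before joining the process group. A minimal sketch of such a `train.py` under those assumptions; the model and training loop are placeholders, and `backend="gloo"` is shown for CPU-only testing:

```python
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from partagpu.distributed import setup_ddp, cleanup_ddp

# Read the coordinates that launch_workers exported for this worker.
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

setup_ddp(
    rank=rank,
    world_size=world_size,
    master_addr=os.environ["MASTER_ADDR"],
    master_port=int(os.environ["MASTER_PORT"]),
    backend="gloo",  # use "nccl" when every worker has a CUDA GPU
)

try:
    model = DDP(torch.nn.Linear(10, 1))  # placeholder model
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    for _ in range(10):  # placeholder training loop
        opt.zero_grad()
        loss = model(torch.randn(32, 10)).pow(2).mean()
        loss.backward()  # DDP all-reduces gradients across workers here
        opt.step()
finally:
    cleanup_ddp()
```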
+++ partagpu-1.0.0/src/partagpu.egg-info/PKG-INFO
@@ -0,0 +1,54 @@
+ Metadata-Version: 2.4
+ Name: partagpu
+ Version: 1.0.0
+ Summary: Python client for PartaGPU: distributed multi-GPU training over a local network
+ Author-email: César Lizurey <cesar@lizurey.fr>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/cesar-lizurey/partagpu
+ Project-URL: Repository, https://github.com/cesar-lizurey/partagpu
+ Keywords: gpu,distributed,pytorch,training,local-network
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: requests>=2.28
+ Provides-Extra: torch
+ Requires-Dist: torch>=2.0; extra == "torch"
+
+ # partagpu
+
+ Python client for [PartaGPU](https://github.com/cesar-lizurey/partagpu): use the GPUs of several machines in a classroom for distributed training.
+
+ ## Installation
+
+ ```bash
+ pip install partagpu
+ ```
+
+ ## Usage
+
+ The PartaGPU app must be running on your machine.
+
+ ### List the available GPUs
+
+ ```python
+ import partagpu
+
+ gpus = partagpu.discover()
+ # [GPU('local', ip='192.168.70.103', limit=100%, verified),
+ #  GPU('César 2', ip='192.168.70.105', limit=50%, verified)]
+ ```
+
+ ### Launch a distributed training run
+
+ ```python
+ from partagpu.distributed import launch_workers
+
+ workers = launch_workers("train.py", args=["--epochs", "10"])
+ for w in workers:
+     w.wait()
+ ```
+
+ See the [main README](https://github.com/cesar-lizurey/partagpu#package-python--entraînement-distribué) for the full documentation.
+++ partagpu-1.0.0/src/partagpu.egg-info/SOURCES.txt
@@ -0,0 +1,10 @@
+ README.md
+ pyproject.toml
+ src/partagpu/__init__.py
+ src/partagpu/discover.py
+ src/partagpu/distributed.py
+ src/partagpu.egg-info/PKG-INFO
+ src/partagpu.egg-info/SOURCES.txt
+ src/partagpu.egg-info/dependency_links.txt
+ src/partagpu.egg-info/requires.txt
+ src/partagpu.egg-info/top_level.txt
+++ partagpu-1.0.0/src/partagpu.egg-info/requires.txt
@@ -0,0 +1,4 @@
+ requests>=2.28
+
+ [torch]
+ torch>=2.0
+++ partagpu-1.0.0/src/partagpu.egg-info/top_level.txt
@@ -0,0 +1 @@
+ partagpu