macfleet 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- macfleet/__init__.py +46 -0
- macfleet/cli/__init__.py +1 -0
- macfleet/cli/main.py +486 -0
- macfleet/comm/__init__.py +1 -0
- macfleet/comm/collectives.py +343 -0
- macfleet/comm/protocol.py +289 -0
- macfleet/comm/transport.py +321 -0
- macfleet/compression/__init__.py +43 -0
- macfleet/compression/adaptive.py +394 -0
- macfleet/compression/pipeline.py +298 -0
- macfleet/compression/quantize.py +98 -0
- macfleet/compression/topk.py +119 -0
- macfleet/engines/__init__.py +33 -0
- macfleet/engines/base.py +203 -0
- macfleet/engines/mlx_engine.py +438 -0
- macfleet/engines/torch_engine.py +244 -0
- macfleet/monitoring/__init__.py +29 -0
- macfleet/monitoring/dashboard.py +363 -0
- macfleet/monitoring/health.py +305 -0
- macfleet/monitoring/thermal.py +203 -0
- macfleet/monitoring/throughput.py +218 -0
- macfleet/pool/__init__.py +1 -0
- macfleet/pool/agent.py +326 -0
- macfleet/pool/discovery.py +302 -0
- macfleet/pool/heartbeat.py +204 -0
- macfleet/pool/network.py +280 -0
- macfleet/pool/registry.py +149 -0
- macfleet/pool/scheduler.py +148 -0
- macfleet/sdk/__init__.py +22 -0
- macfleet/sdk/decorators.py +31 -0
- macfleet/sdk/pool.py +276 -0
- macfleet/sdk/train.py +43 -0
- macfleet/training/__init__.py +1 -0
- macfleet/training/data_parallel.py +216 -0
- macfleet/training/loop.py +163 -0
- macfleet/training/sampler.py +183 -0
- macfleet/utils/__init__.py +1 -0
- macfleet-2.0.0.dist-info/LICENSE +21 -0
- macfleet-2.0.0.dist-info/METADATA +175 -0
- macfleet-2.0.0.dist-info/RECORD +43 -0
- macfleet-2.0.0.dist-info/WHEEL +5 -0
- macfleet-2.0.0.dist-info/entry_points.txt +2 -0
- macfleet-2.0.0.dist-info/top_level.txt +1 -0
macfleet/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""MacFleet v2: Pool Apple Silicon Macs into a distributed ML training cluster.
|
|
2
|
+
|
|
3
|
+
Zero-config discovery. Framework-agnostic engines. Adaptive networking.
|
|
4
|
+
|
|
5
|
+
pip install macfleet && macfleet join
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
__version__ = "2.0.0"
|
|
11
|
+
|
|
12
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def __getattr__(name: str):
    """Lazy imports for heavy modules (avoid importing torch/mlx at module load).

    PEP 562 module-level ``__getattr__``: each public symbol is imported only
    on first attribute access, so ``import macfleet`` stays cheap.
    """
    if name == "Pool":
        from macfleet.sdk.pool import Pool as resolved
    elif name == "train":
        from macfleet.sdk.train import train as resolved
    elif name == "distributed":
        from macfleet.sdk.decorators import distributed as resolved
    elif name == "DataParallel":
        from macfleet.training.data_parallel import DataParallel as resolved
    elif name == "TorchEngine":
        from macfleet.engines.torch_engine import TorchEngine as resolved
    elif name == "MLXEngine":
        from macfleet.engines.mlx_engine import MLXEngine as resolved
    else:
        raise AttributeError(f"module 'macfleet' has no attribute {name!r}")
    return resolved
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Public API surface. Every name except __version__ is resolved lazily by the
# module-level __getattr__ above, so listing them here makes them visible to
# tooling without paying the torch/mlx import cost at package import time.
__all__ = [
    "__version__",
    "Pool",
    "train",
    "distributed",
    "DataParallel",
    "TorchEngine",
    "MLXEngine",
]
|
macfleet/cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI interface for MacFleet."""
|
macfleet/cli/main.py
ADDED
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
"""MacFleet CLI: zero-config compute pool for Apple Silicon Macs.
|
|
2
|
+
|
|
3
|
+
Commands:
|
|
4
|
+
macfleet join Join the compute pool
|
|
5
|
+
macfleet leave Leave the pool gracefully
|
|
6
|
+
macfleet status Show pool members and network info
|
|
7
|
+
macfleet info Show local hardware info
|
|
8
|
+
macfleet train Submit a training job
|
|
9
|
+
macfleet bench Benchmark network + compute
|
|
10
|
+
macfleet diagnose System health check
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import signal
|
|
17
|
+
import sys
|
|
18
|
+
import time
|
|
19
|
+
|
|
20
|
+
import click
|
|
21
|
+
from rich.console import Console
|
|
22
|
+
from rich.table import Table
|
|
23
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
24
|
+
|
|
25
|
+
import macfleet
|
|
26
|
+
|
|
27
|
+
console = Console()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@click.group()
@click.version_option(version=macfleet.__version__, prog_name="macfleet")
def cli():
    """MacFleet: Pool Apple Silicon Macs for distributed ML training."""
    # Group entry point only — the subcommands registered below
    # (join/status/info/train/bench/diagnose) do all the work.
    pass
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@cli.command()
@click.option("--name", default=None, help="Custom node name")
@click.option("--port", default=50051, help="Communication port")
@click.option("--token", default=None, help="Pool authentication token")
def join(name: str | None, port: int, token: str | None):
    """Join the compute pool. Auto-discovers peers on the network.

    Blocks until SIGINT/SIGTERM, then leaves the pool gracefully.

    Args:
        name: Optional custom node name (agent picks a default otherwise).
        port: TCP port the agent listens on.
        token: Optional shared secret for pool authentication.
    """
    from macfleet.pool.agent import PoolAgent

    agent = PoolAgent(name=name, port=port, token=token)

    async def run():
        await agent.start()
        console.print("\n[dim]Press Ctrl+C to leave the pool[/dim]\n")

        # Block until a termination signal arrives, then stop the agent.
        stop_event = asyncio.Event()
        # get_running_loop() is the correct call from inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()

        for sig in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(sig, stop_event.set)

        try:
            await stop_event.wait()
        finally:
            # Always attempt a graceful leave, even if the wait is cancelled.
            await agent.stop()

    try:
        asyncio.run(run())
    except KeyboardInterrupt:
        # The signal handler normally wins, but a very early Ctrl+C can still
        # surface as KeyboardInterrupt before the handler is installed.
        pass
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@cli.command()
def info():
    """Show local hardware information."""
    from macfleet.pool.agent import profile_hardware
    from macfleet.pool.network import get_network_topology
    from macfleet.monitoring.thermal import get_thermal_state, thermal_state_to_string

    hw = profile_hardware()
    topo = get_network_topology()
    thermal = get_thermal_state()

    table = Table(title="MacFleet Node Info", show_header=False)
    table.add_column("Property", style="bold")
    table.add_column("Value")

    # Hardware / framework / thermal rows, in display order.
    rows = [
        ("Hostname", hw.hostname),
        ("Chip", hw.chip_name),
        ("GPU Cores", str(hw.gpu_cores)),
        ("RAM", f"{hw.ram_gb:.0f} GB"),
        ("Memory Bandwidth", f"~{hw.memory_bandwidth_gbps:.0f} GB/s"),
        ("Compute Score", f"{hw.compute_score:.0f}"),
        ("MPS Available", "yes" if hw.mps_available else "no"),
        ("MLX Available", "yes" if hw.mlx_available else "no"),
        ("Thermal", thermal_state_to_string(thermal)),
        ("", ""),  # blank separator row before the network section
    ]
    # One row per detected network link.
    rows.extend(
        (f"Network ({link.interface})", f"{link.link_type.value} — {link.ip_address}")
        for link in topo.links
    )

    for prop, value in rows:
        table.add_row(prop, value)

    console.print(table)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@cli.command()
def status():
    """Show pool status (discovers peers for 3 seconds)."""
    from macfleet.pool.discovery import ServiceRegistry

    console.print("[bold]Scanning for pool members...[/bold]")

    # Browse the local network for advertised nodes; always stop the
    # registry afterwards so background discovery threads don't linger.
    registry = ServiceRegistry()
    try:
        peers = registry.find_peers(timeout=3.0)
    finally:
        registry.stop()

    if not peers:
        console.print("[yellow]No pool members found on the network.[/yellow]")
        console.print("[dim]Run 'macfleet join' on this and other Macs to form a pool.[/dim]")
        return

    table = Table(title=f"MacFleet Pool ({len(peers)} nodes)")
    table.add_column("Hostname", style="bold")
    table.add_column("Chip")
    table.add_column("GPU Cores", justify="right")
    table.add_column("RAM (GB)", justify="right")
    table.add_column("IP Address")
    table.add_column("Score", justify="right")

    # Strongest nodes first (descending compute score).
    for node in sorted(peers, key=lambda n: -n.compute_score):
        table.add_row(
            node.hostname,
            node.chip_name,
            str(node.gpu_cores),
            str(node.ram_gb),
            f"{node.ip_address}:{node.port}",
            f"{node.compute_score:.0f}",
        )

    console.print(table)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@cli.command()
def diagnose():
    """Run system health checks.

    Prints PASS/FAIL lines for hardware, ML framework, thermal, and network
    checks, followed by a summary count.
    """
    import re

    from macfleet.pool.agent import profile_hardware, _check_mps_available, _check_mlx_available
    from macfleet.monitoring.thermal import get_thermal_state
    from macfleet.pool.network import detect_interfaces

    console.print("[bold]Running diagnostics...[/bold]\n")

    checks_passed = 0
    checks_total = 0

    def check(name: str, passed: bool, detail: str = ""):
        # Record and pretty-print a single PASS/FAIL line.
        nonlocal checks_passed, checks_total
        checks_total += 1
        if passed:
            checks_passed += 1
            console.print(f" [green]PASS[/green] {name}" + (f" — {detail}" if detail else ""))
        else:
            console.print(f" [red]FAIL[/red] {name}" + (f" — {detail}" if detail else ""))

    # Hardware
    console.print("[bold]Hardware[/bold]")
    hw = profile_hardware()
    # BUG FIX: the old check accepted any chip name containing the letter "m"
    # (true for nearly every string, e.g. "AMD"). Require "apple" in the name
    # or an explicit M-series marker like "M1"/"M2 Max".
    chip = hw.chip_name.lower()
    is_apple_silicon = "apple" in chip or re.search(r"\bm\d", chip) is not None
    check("Apple Silicon detected", is_apple_silicon, hw.chip_name)
    check("GPU cores detected", hw.gpu_cores > 0, f"{hw.gpu_cores} cores")
    check("RAM detected", hw.ram_gb > 0, f"{hw.ram_gb:.0f} GB")
    check("RAM >= 8 GB", hw.ram_gb >= 8, f"{hw.ram_gb:.0f} GB")

    # Frameworks
    console.print("\n[bold]ML Frameworks[/bold]")
    check("MPS available", _check_mps_available())
    check("MLX available", _check_mlx_available())

    # Thermal
    console.print("\n[bold]Thermal[/bold]")
    thermal = get_thermal_state()
    check("Not throttling", not thermal.is_throttling, thermal.pressure.value)

    # Network
    console.print("\n[bold]Network[/bold]")
    links = detect_interfaces()
    check("Network interfaces found", len(links) > 0, f"{len(links)} interfaces")
    has_non_loopback = any(l.link_type.value != "loopback" for l in links)
    check("Non-loopback interface", has_non_loopback)

    # Summary
    console.print(f"\n[bold]{checks_passed}/{checks_total} checks passed[/bold]")
    if checks_passed == checks_total:
        console.print("[green]System is ready for MacFleet![/green]")
    else:
        console.print("[yellow]Some checks failed. See above for details.[/yellow]")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@cli.command()
@click.argument("script", required=False)
@click.option("--engine", type=click.Choice(["torch", "mlx"]), default="torch")
@click.option("--epochs", default=10, help="Number of training epochs")
@click.option("--batch-size", default=128, help="Global batch size")
@click.option("--lr", default=0.001, help="Learning rate")
@click.option("--compression", default="none", help="Compression: none, topk, fp16, topk_fp16")
@click.option("--config", "config_path", default=None, help="YAML config file")
def train(
    script: str | None,
    engine: str,
    epochs: int,
    batch_size: int,
    lr: float,
    compression: str,
    config_path: str | None,
):
    """Submit a training job to the pool.

    If SCRIPT is provided, it is executed as a Python file that defines
    `model` and `dataset` variables. Otherwise, runs a built-in demo
    (small MLP on synthetic data) useful for testing the pipeline.
    """
    # NOTE(review): config_path is accepted but never used in this function —
    # confirm whether YAML config support is intended or the option is vestigial.
    # NOTE(review): compression is only forwarded on the script path; the demo
    # path ignores it — confirm intended.
    if script:
        _train_from_script(script, engine, epochs, batch_size, lr, compression)
    else:
        _train_demo(engine, epochs, batch_size, lr)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _train_demo(engine_type: str, epochs: int, batch_size: int, lr: float):
    """Run a built-in demo training on synthetic data (single-node).

    Args:
        engine_type: Requested engine name. NOTE(review): currently ignored —
            the demo always uses TorchEngine on CPU; confirm intended.
        epochs: Number of passes over the synthetic dataset.
        batch_size: DataLoader batch size.
        lr: Adam learning rate.
    """
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    # Cleanup: DataParallel / CollectiveGroup / PeerTransport were imported
    # here but never used (single-node demo) — removed.
    from macfleet.engines.torch_engine import TorchEngine

    console.print("[bold blue]MacFleet Demo Training[/bold blue]")
    console.print("[dim]Single-node training on synthetic data (no peers needed)[/dim]\n")

    # Synthetic classification: 4 features, 2 classes. Label is a linearly
    # separable function of the first two features, so accuracy should climb
    # quickly — a sanity check that the pipeline learns at all.
    torch.manual_seed(42)
    n_samples = 1000
    X = torch.randn(n_samples, 4)
    y = (X[:, 0] + X[:, 1] > 0).long()

    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Simple MLP
    model = nn.Sequential(
        nn.Linear(4, 32),
        nn.ReLU(),
        nn.Linear(32, 2),
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    eng = TorchEngine(device="cpu")
    eng.load_model(model, optimizer)

    console.print(f" Model params: {eng.param_count():,}")
    console.print(f" Dataset size: {n_samples}")
    console.print(f" Batch size: {batch_size}")
    console.print(f" Epochs: {epochs}")
    console.print(f" Device: {eng.device}\n")

    for epoch in range(epochs):
        epoch_loss = 0.0
        correct = 0
        total = 0
        t0 = time.time()

        for batch_x, batch_y in dataloader:
            eng.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            eng.backward(loss)
            eng.step()

            epoch_loss += loss.item()
            correct += (logits.argmax(1) == batch_y).sum().item()
            total += len(batch_y)

        elapsed = time.time() - t0
        acc = correct / total * 100
        # max(..., 1) guards against a zero-batch dataloader.
        avg_loss = epoch_loss / max(len(dataloader), 1)
        console.print(
            f" Epoch {epoch + 1:3d}/{epochs} "
            f"loss={avg_loss:.4f} acc={acc:.1f}% "
            f"time={elapsed:.2f}s"
        )

    console.print("\n[green]Training complete![/green]")
    console.print("[dim]To train across multiple Macs, use the Python SDK:[/dim]")
    console.print("[dim] macfleet.Pool().train(model, dataset, epochs=10)[/dim]")
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _train_from_script(
    script: str,
    engine_type: str,
    epochs: int,
    batch_size: int,
    lr: float,
    compression: str,
):
    """Run a user-provided training script.

    The script is imported as a throwaway module and its ``main()`` function
    is invoked. Exits the process with status 1 on any usage error.

    NOTE(review): engine_type / epochs / batch_size / lr / compression are
    accepted but not forwarded to the script — confirm intended.
    """
    import importlib.util
    import os

    if not os.path.isfile(script):
        console.print(f"[red]Error: Script not found: {script}[/red]")
        sys.exit(1)

    console.print(f"[bold blue]MacFleet Training[/bold blue] — {script}")

    # Load user script as a module. spec_from_file_location can return None
    # (or a spec with no loader) for paths Python cannot import — fail with a
    # clear message instead of an AttributeError.
    spec = importlib.util.spec_from_file_location("user_train", script)
    if spec is None or spec.loader is None:
        console.print(f"[red]Error: Cannot load script as a Python module: {script}[/red]")
        sys.exit(1)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Expect the script to define a `main()` function.
    if hasattr(module, "main"):
        module.main()
    else:
        console.print("[red]Error: Script must define a main() function.[/red]")
        console.print("[dim]Example:[/dim]")
        console.print("[dim] def main():[/dim]")
        console.print("[dim] model = MyModel()[/dim]")
        console.print("[dim] macfleet.train(model, dataset, epochs=10)[/dim]")
        sys.exit(1)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
@cli.command()
@click.option("--type", "bench_type", type=click.Choice(["network", "compute", "allreduce"]), default="network")
@click.option("--size-mb", default=10, help="Payload size in MB for network tests")
@click.option("--iterations", default=5, help="Number of iterations")
def bench(bench_type: str, size_mb: int, iterations: int):
    """Benchmark network and compute performance."""
    # Dispatch table; click.Choice restricts bench_type to these keys.
    runners = {
        "compute": lambda: _bench_compute(iterations),
        "network": lambda: _bench_network(size_mb, iterations),
        "allreduce": lambda: _bench_allreduce(size_mb, iterations),
    }
    runner = runners.get(bench_type)
    if runner is not None:
        runner()
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _bench_compute(iterations: int):
    """Benchmark local compute throughput."""
    import torch
    import torch.nn as nn

    console.print("[bold blue]MacFleet Compute Benchmark[/bold blue]\n")

    from macfleet.engines.torch_engine import TorchEngine

    # NOTE(review): benchmark always runs on CPU — presumably intentional as a
    # baseline, but confirm whether MPS should be preferred when available.
    eng = TorchEngine(device="cpu")
    model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    eng.load_model(model, optimizer)

    console.print(f" Model: 2-layer MLP ({eng.param_count():,} params)")
    console.print(f" Device: {eng.device}")
    console.print(f" Iterations: {iterations}\n")

    # Warmup: a few untimed steps so allocator/autograd setup costs don't
    # pollute the measured iterations.
    x = torch.randn(64, 512)
    for _ in range(3):
        eng.zero_grad()
        loss = model(x).sum()
        eng.backward(loss)
        eng.step()

    # Benchmark: time one full forward/backward/step per iteration.
    times = []
    for i in range(iterations):
        x = torch.randn(64, 512)
        t0 = time.perf_counter()
        eng.zero_grad()
        loss = model(x).sum()
        eng.backward(loss)
        eng.step()
        elapsed = time.perf_counter() - t0
        times.append(elapsed)
        console.print(f" Step {i + 1}: {elapsed * 1000:.1f} ms")

    import numpy as np

    avg = np.mean(times) * 1000
    std = np.std(times) * 1000
    console.print(f"\n [bold]Average: {avg:.1f} ms/step (std={std:.1f} ms)[/bold]")
    # 64 = batch size used above.
    console.print(f" Throughput: {64 / np.mean(times):.0f} samples/sec")
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _bench_network(size_mb: int, iterations: int):
    """Benchmark loopback network throughput."""
    import numpy as np

    console.print("[bold blue]MacFleet Network Benchmark[/bold blue]\n")
    console.print(f" Payload: {size_mb} MB, loopback")
    console.print(f" Iterations: {iterations}\n")

    from macfleet.comm.transport import PeerTransport, TransportConfig

    config = TransportConfig(recv_timeout_sec=30.0, connect_timeout_sec=10.0)

    async def run():
        # Two transports in one process, talking over 127.0.0.1.
        server = PeerTransport(local_id="bench-server", config=config)
        client = PeerTransport(local_id="bench-client", config=config)

        # Port 0 = let the OS pick a free port; read it back from the
        # listening socket. NOTE(review): reaches into the private _server
        # attribute — consider exposing the bound port on PeerTransport.
        await server.start_server("127.0.0.1", 0)
        port = server._server.sockets[0].getsockname()[1]
        await client.connect("bench-server", "127.0.0.1", port)
        # Brief pause to let the connection handshake settle.
        await asyncio.sleep(0.1)

        # Repeating 0..255 pattern sized to exactly size_mb MB.
        payload = bytes(range(256)) * (size_mb * 1024 * 1024 // 256)
        times = []

        for i in range(iterations):
            t0 = time.perf_counter()
            # Timed round: one send plus the matching receive on the server.
            await client.send("bench-server", payload)
            await server.recv("bench-client")
            elapsed = time.perf_counter() - t0
            times.append(elapsed)
            throughput = size_mb / elapsed
            console.print(f" Transfer {i + 1}: {elapsed * 1000:.1f} ms ({throughput:.0f} MB/s)")

        await client.disconnect_all()
        await server.disconnect_all()

        avg_time = np.mean(times)
        avg_throughput = size_mb / avg_time
        console.print(f"\n [bold]Average: {avg_throughput:.0f} MB/s[/bold]")

    asyncio.run(run())
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _bench_allreduce(size_mb: int, iterations: int):
    """Benchmark AllReduce over loopback (simulates 2-node)."""
    import numpy as np

    console.print("[bold blue]MacFleet AllReduce Benchmark (2-node loopback)[/bold blue]\n")
    console.print(f" Array size: {size_mb} MB")
    console.print(f" Iterations: {iterations}\n")

    from macfleet.comm.collectives import CollectiveGroup
    from macfleet.comm.transport import PeerTransport, TransportConfig

    config = TransportConfig(recv_timeout_sec=30.0, connect_timeout_sec=10.0)

    async def run():
        # Setup 2-node mesh: node-1 listens, node-0 dials in.
        t0_transport = PeerTransport(local_id="node-0", config=config)
        t1_transport = PeerTransport(local_id="node-1", config=config)

        # Port 0 = OS-assigned; read the bound port back off the socket.
        # NOTE(review): private _server access — same caveat as _bench_network.
        await t1_transport.start_server("127.0.0.1", 0)
        port = t1_transport._server.sockets[0].getsockname()[1]
        await t0_transport.connect("node-1", "127.0.0.1", port)
        await asyncio.sleep(0.1)

        # Each rank only needs the peer map for the ranks it talks to.
        group0 = CollectiveGroup(rank=0, world_size=2, transport=t0_transport, rank_to_peer={1: "node-1"})
        group1 = CollectiveGroup(rank=1, world_size=2, transport=t1_transport, rank_to_peer={0: "node-0"})

        # Create arrays: size_mb MB of float32 (4 bytes each) per rank.
        n_floats = size_mb * 1024 * 1024 // 4
        arr0 = np.random.randn(n_floats).astype(np.float32)
        arr1 = np.random.randn(n_floats).astype(np.float32)

        times = []
        for i in range(iterations):
            t0 = time.perf_counter()
            # Both ranks must participate concurrently or the collective
            # deadlocks — hence the gather.
            await asyncio.gather(
                group0.allreduce(arr0, op="mean"),
                group1.allreduce(arr1, op="mean"),
            )
            elapsed = time.perf_counter() - t0
            times.append(elapsed)
            console.print(f" AllReduce {i + 1}: {elapsed * 1000:.1f} ms")

        await t0_transport.disconnect_all()
        await t1_transport.disconnect_all()

        avg = np.mean(times) * 1000
        console.print(f"\n [bold]Average AllReduce: {avg:.1f} ms[/bold]")
        # x2: each rank moves roughly its full array both directions.
        console.print(f" Effective bandwidth: {size_mb * 2 / np.mean(times):.0f} MB/s")

    asyncio.run(run())
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
if __name__ == "__main__":
|
|
486
|
+
cli()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Communication layer: transport, protocol, and collectives."""
|