macfleet 2.0.0__tar.gz → 2.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- macfleet-2.1.1/PKG-INFO +163 -0
- macfleet-2.1.1/README.md +117 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/__init__.py +9 -1
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/cli/main.py +99 -7
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/comm/collectives.py +1 -1
- macfleet-2.1.1/macfleet/comm/protocol.py +135 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/comm/transport.py +187 -30
- macfleet-2.1.1/macfleet/compute/__init__.py +30 -0
- macfleet-2.1.1/macfleet/compute/dispatch.py +159 -0
- macfleet-2.1.1/macfleet/compute/models.py +193 -0
- macfleet-2.1.1/macfleet/compute/worker.py +153 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/engines/base.py +18 -33
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/engines/mlx_engine.py +0 -32
- macfleet-2.0.0/macfleet/comm/protocol.py → macfleet-2.1.1/macfleet/engines/serialization.py +16 -118
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/engines/torch_engine.py +1 -28
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/agent.py +155 -12
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/discovery.py +47 -26
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/heartbeat.py +50 -12
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/registry.py +6 -6
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/scheduler.py +1 -15
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/sdk/pool.py +121 -3
- macfleet-2.1.1/macfleet/security/__init__.py +41 -0
- macfleet-2.1.1/macfleet/security/auth.py +476 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/training/data_parallel.py +99 -21
- macfleet-2.1.1/macfleet.egg-info/PKG-INFO +163 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet.egg-info/SOURCES.txt +7 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet.egg-info/requires.txt +1 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/pyproject.toml +3 -2
- macfleet-2.0.0/PKG-INFO +0 -175
- macfleet-2.0.0/README.md +0 -130
- macfleet-2.0.0/macfleet.egg-info/PKG-INFO +0 -175
- {macfleet-2.0.0 → macfleet-2.1.1}/LICENSE +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/cli/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/comm/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/compression/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/compression/adaptive.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/compression/pipeline.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/compression/quantize.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/compression/topk.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/engines/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/monitoring/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/monitoring/dashboard.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/monitoring/health.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/monitoring/thermal.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/monitoring/throughput.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/pool/network.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/sdk/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/sdk/decorators.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/sdk/train.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/training/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/training/loop.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/training/sampler.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet/utils/__init__.py +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet.egg-info/dependency_links.txt +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet.egg-info/entry_points.txt +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/macfleet.egg-info/top_level.txt +0 -0
- {macfleet-2.0.0 → macfleet-2.1.1}/setup.cfg +0 -0
macfleet-2.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: macfleet
|
|
3
|
+
Version: 2.1.1
|
|
4
|
+
Summary: Pool Apple Silicon Macs for distributed compute and ML training
|
|
5
|
+
Author: MacFleet Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/vikranthreddimasu/MacFleet
|
|
8
|
+
Project-URL: Documentation, https://github.com/vikranthreddimasu/MacFleet#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/vikranthreddimasu/MacFleet
|
|
10
|
+
Project-URL: Issues, https://github.com/vikranthreddimasu/MacFleet/issues
|
|
11
|
+
Keywords: distributed,machine-learning,apple-silicon,mps,mlx,pytorch,training,gpu-pooling,data-parallel
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: zeroconf>=0.131.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Requires-Dist: click>=8.1.0
|
|
27
|
+
Requires-Dist: numpy>=1.24.0
|
|
28
|
+
Requires-Dist: msgpack>=1.0.0
|
|
29
|
+
Requires-Dist: cloudpickle>=3.0.0
|
|
30
|
+
Provides-Extra: torch
|
|
31
|
+
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
32
|
+
Provides-Extra: mlx
|
|
33
|
+
Requires-Dist: mlx>=0.5.0; extra == "mlx"
|
|
34
|
+
Provides-Extra: yaml
|
|
35
|
+
Requires-Dist: pyyaml>=6.0; extra == "yaml"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: torch>=2.1.0; extra == "all"
|
|
38
|
+
Requires-Dist: mlx>=0.5.0; extra == "all"
|
|
39
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
43
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
44
|
+
Requires-Dist: mypy>=1.8.0; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
46
|
+
|
|
47
|
+
# MacFleet
|
|
48
|
+
|
|
49
|
+
**Pool Apple Silicon Macs into a distributed ML training cluster.**
|
|
50
|
+
|
|
51
|
+
Turn spare MacBooks, Mac Minis, and Mac Studios into one big GPU. MacFleet connects them over Thunderbolt, Ethernet, or WiFi and splits training across all of them automatically.
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
macfleet join macfleet join macfleet join
|
|
55
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
56
|
+
│ MacBook Pro │◄────────►│ MacBook Air │◄────────►│ Mac Studio │
|
|
57
|
+
│ M4 Pro │ WiFi / │ M4 │ WiFi / │ M4 Ultra │
|
|
58
|
+
│ 16 GPU cores│ ETH / │ 10 GPU cores│ ETH / │ 60 GPU cores│
|
|
59
|
+
│ 48 GB RAM │ TB4 │ 16 GB RAM │ TB4 │ 192 GB RAM │
|
|
60
|
+
└──────────────┘ └──────────────┘ └──────────────┘
|
|
61
|
+
▲ ▲ ▲
|
|
62
|
+
└──────────────────────────┴──────────────────────────┘
|
|
63
|
+
Ring AllReduce (gradient sync)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Install
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install macfleet # core
|
|
70
|
+
pip install macfleet[torch] # + PyTorch
|
|
71
|
+
pip install macfleet[mlx] # + Apple MLX
|
|
72
|
+
pip install macfleet[all] # everything
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
**1. Join the pool** (run on each Mac):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
macfleet join
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
No config files, no IP addresses. Macs find each other automatically via mDNS/Bonjour.
|
|
84
|
+
|
|
85
|
+
**2. Train:**
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import macfleet
|
|
89
|
+
import torch.nn as nn
|
|
90
|
+
|
|
91
|
+
model = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
|
|
92
|
+
|
|
93
|
+
with macfleet.Pool() as pool:
|
|
94
|
+
result = pool.train(model=model, dataset=(X_train, y_train), epochs=10)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Features
|
|
98
|
+
|
|
99
|
+
- **Dual engine** — PyTorch (MPS) and Apple MLX, same pool infrastructure
|
|
100
|
+
- **Zero config** — mDNS discovery, no coordinator setup, no config files
|
|
101
|
+
- **Adaptive compression** — auto-selects TopK + FP16 based on link speed (1x–200x reduction)
|
|
102
|
+
- **Heterogeneous scheduling** — faster Macs get bigger batches, adjusts for thermal throttling
|
|
103
|
+
- **Secure by default** — auto-generated fleet tokens, HMAC mutual auth, mandatory TLS, gradient validation
|
|
104
|
+
- **Framework-agnostic core** — communication layer uses only numpy, never imports torch or mlx
|
|
105
|
+
|
|
106
|
+
## Security
|
|
107
|
+
|
|
108
|
+
Security is enabled by default. The first `macfleet join` auto-generates a fleet token and saves it to `~/.macfleet/fleet-token`:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
macfleet join # auto-generates token, prints it
|
|
112
|
+
macfleet join --token <token> # join with a specific token (copy from first node)
|
|
113
|
+
macfleet join --fleet-id lab # isolate by fleet name
|
|
114
|
+
macfleet join --open # disable security (not recommended)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
What's protected:
|
|
118
|
+
- **Fleet isolation** — nodes with different tokens are invisible to each other on the network
|
|
119
|
+
- **Mutual authentication** — HMAC-SHA256 challenge-response on every connection
|
|
120
|
+
- **Encryption** — TLS enabled automatically (mandatory with auth)
|
|
121
|
+
- **Authenticated heartbeat** — HMAC-signed liveness probes, replay-resistant
|
|
122
|
+
- **Gradient validation** — rejects NaN, Inf, and extreme magnitudes (anti-poisoning)
|
|
123
|
+
|
|
124
|
+
## CLI
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
macfleet join Join the pool (auto-discovers peers)
|
|
128
|
+
macfleet status Show pool members and network info
|
|
129
|
+
macfleet info Show local hardware profile
|
|
130
|
+
macfleet train Run training (demo or custom script)
|
|
131
|
+
macfleet bench Benchmark compute, network, or allreduce
|
|
132
|
+
macfleet diagnose System health check
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## How It Works
|
|
136
|
+
|
|
137
|
+
MacFleet uses **data parallelism**: every Mac holds a full copy of the model, trains on a weighted portion of the data, and averages gradients via Ring AllReduce after each step.
|
|
138
|
+
|
|
139
|
+
| Network | Compression | 100 MB gradients become |
|
|
140
|
+
|---------------|-----------------|-------------------------|
|
|
141
|
+
| Thunderbolt 4 | None | 100 MB |
|
|
142
|
+
| Ethernet | TopK 10% + FP16 | ~5 MB |
|
|
143
|
+
| WiFi | TopK 1% + FP16 | ~500 KB |
|
|
144
|
+
|
|
145
|
+
## Requirements
|
|
146
|
+
|
|
147
|
+
- macOS with Apple Silicon (M1/M2/M3/M4)
|
|
148
|
+
- Python 3.11+
|
|
149
|
+
- PyTorch 2.1+ or MLX 0.5+
|
|
150
|
+
|
|
151
|
+
## Development
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
git clone https://github.com/vikranthreddimasu/MacFleet.git
|
|
155
|
+
cd MacFleet
|
|
156
|
+
pip install -e ".[dev,all]"
|
|
157
|
+
make test # 373 tests
|
|
158
|
+
make lint # ruff + mypy
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
MIT
|
macfleet-2.1.1/README.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# MacFleet
|
|
2
|
+
|
|
3
|
+
**Pool Apple Silicon Macs into a distributed ML training cluster.**
|
|
4
|
+
|
|
5
|
+
Turn spare MacBooks, Mac Minis, and Mac Studios into one big GPU. MacFleet connects them over Thunderbolt, Ethernet, or WiFi and splits training across all of them automatically.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
macfleet join macfleet join macfleet join
|
|
9
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
10
|
+
│ MacBook Pro │◄────────►│ MacBook Air │◄────────►│ Mac Studio │
|
|
11
|
+
│ M4 Pro │ WiFi / │ M4 │ WiFi / │ M4 Ultra │
|
|
12
|
+
│ 16 GPU cores│ ETH / │ 10 GPU cores│ ETH / │ 60 GPU cores│
|
|
13
|
+
│ 48 GB RAM │ TB4 │ 16 GB RAM │ TB4 │ 192 GB RAM │
|
|
14
|
+
└──────────────┘ └──────────────┘ └──────────────┘
|
|
15
|
+
▲ ▲ ▲
|
|
16
|
+
└──────────────────────────┴──────────────────────────┘
|
|
17
|
+
Ring AllReduce (gradient sync)
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install macfleet # core
|
|
24
|
+
pip install macfleet[torch] # + PyTorch
|
|
25
|
+
pip install macfleet[mlx] # + Apple MLX
|
|
26
|
+
pip install macfleet[all] # everything
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
**1. Join the pool** (run on each Mac):
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
macfleet join
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
No config files, no IP addresses. Macs find each other automatically via mDNS/Bonjour.
|
|
38
|
+
|
|
39
|
+
**2. Train:**
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import macfleet
|
|
43
|
+
import torch.nn as nn
|
|
44
|
+
|
|
45
|
+
model = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
|
|
46
|
+
|
|
47
|
+
with macfleet.Pool() as pool:
|
|
48
|
+
result = pool.train(model=model, dataset=(X_train, y_train), epochs=10)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Dual engine** — PyTorch (MPS) and Apple MLX, same pool infrastructure
|
|
54
|
+
- **Zero config** — mDNS discovery, no coordinator setup, no config files
|
|
55
|
+
- **Adaptive compression** — auto-selects TopK + FP16 based on link speed (1x–200x reduction)
|
|
56
|
+
- **Heterogeneous scheduling** — faster Macs get bigger batches, adjusts for thermal throttling
|
|
57
|
+
- **Secure by default** — auto-generated fleet tokens, HMAC mutual auth, mandatory TLS, gradient validation
|
|
58
|
+
- **Framework-agnostic core** — communication layer uses only numpy, never imports torch or mlx
|
|
59
|
+
|
|
60
|
+
## Security
|
|
61
|
+
|
|
62
|
+
Security is enabled by default. The first `macfleet join` auto-generates a fleet token and saves it to `~/.macfleet/fleet-token`:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
macfleet join # auto-generates token, prints it
|
|
66
|
+
macfleet join --token <token> # join with a specific token (copy from first node)
|
|
67
|
+
macfleet join --fleet-id lab # isolate by fleet name
|
|
68
|
+
macfleet join --open # disable security (not recommended)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
What's protected:
|
|
72
|
+
- **Fleet isolation** — nodes with different tokens are invisible to each other on the network
|
|
73
|
+
- **Mutual authentication** — HMAC-SHA256 challenge-response on every connection
|
|
74
|
+
- **Encryption** — TLS enabled automatically (mandatory with auth)
|
|
75
|
+
- **Authenticated heartbeat** — HMAC-signed liveness probes, replay-resistant
|
|
76
|
+
- **Gradient validation** — rejects NaN, Inf, and extreme magnitudes (anti-poisoning)
|
|
77
|
+
|
|
78
|
+
## CLI
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
macfleet join Join the pool (auto-discovers peers)
|
|
82
|
+
macfleet status Show pool members and network info
|
|
83
|
+
macfleet info Show local hardware profile
|
|
84
|
+
macfleet train Run training (demo or custom script)
|
|
85
|
+
macfleet bench Benchmark compute, network, or allreduce
|
|
86
|
+
macfleet diagnose System health check
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## How It Works
|
|
90
|
+
|
|
91
|
+
MacFleet uses **data parallelism**: every Mac holds a full copy of the model, trains on a weighted portion of the data, and averages gradients via Ring AllReduce after each step.
|
|
92
|
+
|
|
93
|
+
| Network | Compression | 100 MB gradients become |
|
|
94
|
+
|---------------|-----------------|-------------------------|
|
|
95
|
+
| Thunderbolt 4 | None | 100 MB |
|
|
96
|
+
| Ethernet | TopK 10% + FP16 | ~5 MB |
|
|
97
|
+
| WiFi | TopK 1% + FP16 | ~500 KB |
|
|
98
|
+
|
|
99
|
+
## Requirements
|
|
100
|
+
|
|
101
|
+
- macOS with Apple Silicon (M1/M2/M3/M4)
|
|
102
|
+
- Python 3.11+
|
|
103
|
+
- PyTorch 2.1+ or MLX 0.5+
|
|
104
|
+
|
|
105
|
+
## Development
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
git clone https://github.com/vikranthreddimasu/MacFleet.git
|
|
109
|
+
cd MacFleet
|
|
110
|
+
pip install -e ".[dev,all]"
|
|
111
|
+
make test # 373 tests
|
|
112
|
+
make lint # ruff + mypy
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## License
|
|
116
|
+
|
|
117
|
+
MIT
|
|
@@ -7,7 +7,7 @@ Zero-config discovery. Framework-agnostic engines. Adaptive networking.
|
|
|
7
7
|
|
|
8
8
|
import logging
|
|
9
9
|
|
|
10
|
-
__version__ = "2.
|
|
10
|
+
__version__ = "2.1.1"
|
|
11
11
|
|
|
12
12
|
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
13
13
|
|
|
@@ -32,6 +32,12 @@ def __getattr__(name: str):
|
|
|
32
32
|
if name == "MLXEngine":
|
|
33
33
|
from macfleet.engines.mlx_engine import MLXEngine
|
|
34
34
|
return MLXEngine
|
|
35
|
+
if name == "TaskFuture":
|
|
36
|
+
from macfleet.compute.models import TaskFuture
|
|
37
|
+
return TaskFuture
|
|
38
|
+
if name == "RemoteTaskError":
|
|
39
|
+
from macfleet.compute.models import RemoteTaskError
|
|
40
|
+
return RemoteTaskError
|
|
35
41
|
raise AttributeError(f"module 'macfleet' has no attribute {name!r}")
|
|
36
42
|
|
|
37
43
|
|
|
@@ -43,4 +49,6 @@ __all__ = [
|
|
|
43
49
|
"DataParallel",
|
|
44
50
|
"TorchEngine",
|
|
45
51
|
"MLXEngine",
|
|
52
|
+
"TaskFuture",
|
|
53
|
+
"RemoteTaskError",
|
|
46
54
|
]
|
|
@@ -37,12 +37,42 @@ def cli():
|
|
|
37
37
|
@cli.command()
|
|
38
38
|
@click.option("--name", default=None, help="Custom node name")
|
|
39
39
|
@click.option("--port", default=50051, help="Communication port")
|
|
40
|
-
@click.option("--token", default=None, help="Pool
|
|
41
|
-
|
|
42
|
-
|
|
40
|
+
@click.option("--token", default=None, envvar="MACFLEET_TOKEN", help="Pool token (or set MACFLEET_TOKEN env var)")
|
|
41
|
+
@click.option("--fleet-id", default=None, help="Fleet identifier (isolates pool on network)")
|
|
42
|
+
@click.option("--tls", "use_tls", is_flag=True, default=False, help="Enable TLS encryption")
|
|
43
|
+
@click.option("--open", "open_fleet", is_flag=True, default=False, help="Disable security (open fleet, no authentication)")
|
|
44
|
+
@click.option("--peer", "peers", multiple=True, help="Peer address (IP:PORT). Use when mDNS is blocked. Repeatable.")
|
|
45
|
+
def join(name: str | None, port: int, token: str | None, fleet_id: str | None, use_tls: bool, open_fleet: bool, peers: tuple):
|
|
46
|
+
"""Join the compute pool. Auto-discovers peers on the network.
|
|
47
|
+
|
|
48
|
+
Security is enabled by default. A fleet token is auto-generated on first
|
|
49
|
+
run and saved to ~/.macfleet/fleet-token. Copy this token to other Macs
|
|
50
|
+
to let them join your fleet.
|
|
51
|
+
|
|
52
|
+
Use --open to disable security (not recommended).
|
|
53
|
+
|
|
54
|
+
\b
|
|
55
|
+
If mDNS discovery doesn't work (e.g. enterprise WiFi), use --peer:
|
|
56
|
+
Mac A: macfleet join
|
|
57
|
+
Mac B: macfleet join --token <token> --peer <Mac-A-IP>:50051
|
|
58
|
+
"""
|
|
43
59
|
from macfleet.pool.agent import PoolAgent
|
|
60
|
+
from macfleet.security.auth import resolve_token_with_file, TOKEN_FILE
|
|
61
|
+
|
|
62
|
+
if open_fleet:
|
|
63
|
+
if token:
|
|
64
|
+
console.print("[red]Error: --open and --token are mutually exclusive.[/red]")
|
|
65
|
+
sys.exit(1)
|
|
66
|
+
resolved_token = None
|
|
67
|
+
else:
|
|
68
|
+
resolved_token = resolve_token_with_file(token, auto_generate=True)
|
|
69
|
+
if token is None:
|
|
70
|
+
# Token was auto-generated or loaded from file — show it
|
|
71
|
+
console.print(f"\n[bold green]Fleet token:[/bold green] {resolved_token}")
|
|
72
|
+
console.print(f"[dim]Saved to {TOKEN_FILE}[/dim]")
|
|
73
|
+
console.print("[dim]Copy this token to other Macs: macfleet join --token <token>[/dim]\n")
|
|
44
74
|
|
|
45
|
-
agent = PoolAgent(name=name, port=port, token=
|
|
75
|
+
agent = PoolAgent(name=name, port=port, token=resolved_token, fleet_id=fleet_id, tls=use_tls, peers=list(peers))
|
|
46
76
|
|
|
47
77
|
async def run():
|
|
48
78
|
await agent.start()
|
|
@@ -98,13 +128,27 @@ def info():
|
|
|
98
128
|
|
|
99
129
|
|
|
100
130
|
@cli.command()
|
|
101
|
-
|
|
131
|
+
@click.option("--token", default=None, envvar="MACFLEET_TOKEN", help="Pool token (scopes discovery to fleet)")
|
|
132
|
+
@click.option("--fleet-id", default=None, help="Fleet identifier")
|
|
133
|
+
@click.option("--open", "open_fleet", is_flag=True, default=False, help="Scan open fleet (ignore saved token)")
|
|
134
|
+
def status(token: str | None, fleet_id: str | None, open_fleet: bool):
|
|
102
135
|
"""Show pool status (discovers peers for 3 seconds)."""
|
|
103
136
|
from macfleet.pool.discovery import ServiceRegistry
|
|
137
|
+
from macfleet.security.auth import SecurityConfig, resolve_token_with_file
|
|
138
|
+
|
|
139
|
+
if open_fleet:
|
|
140
|
+
resolved = None
|
|
141
|
+
else:
|
|
142
|
+
resolved = resolve_token_with_file(token)
|
|
104
143
|
|
|
105
|
-
|
|
144
|
+
sec = SecurityConfig(token=resolved, fleet_id=fleet_id) if resolved else None
|
|
145
|
+
if sec and sec.is_secure:
|
|
146
|
+
fleet_label = fleet_id or "default"
|
|
147
|
+
console.print(f"[bold]Scanning fleet '{fleet_label}' for members...[/bold]")
|
|
148
|
+
else:
|
|
149
|
+
console.print("[bold]Scanning for pool members...[/bold]")
|
|
106
150
|
|
|
107
|
-
registry = ServiceRegistry()
|
|
151
|
+
registry = ServiceRegistry(security=sec)
|
|
108
152
|
try:
|
|
109
153
|
peers = registry.find_peers(timeout=3.0)
|
|
110
154
|
finally:
|
|
@@ -326,6 +370,54 @@ def _train_from_script(
|
|
|
326
370
|
sys.exit(1)
|
|
327
371
|
|
|
328
372
|
|
|
373
|
+
@cli.command(name="run")
@click.argument("script")
@click.option("--fn", "fn_name", default="main", help="Function to execute (default: main)")
@click.option("--token", default=None, envvar="MACFLEET_TOKEN", help="Pool token")
@click.option("--open", "open_fleet", is_flag=True, default=False, help="Disable security")
def run_command(script: str, fn_name: str, token: str | None, open_fleet: bool):
    """Run a Python script on the pool.

    The script must define the named function (default: main).
    The function is executed across the pool's compute resources.

    \b
    Examples:
        macfleet run process.py
        macfleet run analysis.py --fn analyze
    """
    import importlib.util
    import os

    if not os.path.isfile(script):
        console.print(f"[red]Error: Script not found: {script}[/red]")
        sys.exit(1)

    # Load user script
    spec = importlib.util.spec_from_file_location("user_script", script)
    # spec_from_file_location returns None when no loader matches the file
    # (e.g. the path has no recognised Python suffix). Report that as a CLI
    # error instead of crashing with an AttributeError on `None`.
    if spec is None or spec.loader is None:
        console.print(f"[red]Error: Cannot load {script} as a Python module[/red]")
        console.print("[dim]The script must be a Python source file (e.g. ending in .py).[/dim]")
        sys.exit(1)

    module = importlib.util.module_from_spec(spec)
    # Executes the script's top-level code; exceptions raised by the script
    # propagate to the caller unchanged.
    spec.loader.exec_module(module)

    fn = getattr(module, fn_name, None)
    if fn is None or not callable(fn):
        console.print(f"[red]Error: Function '{fn_name}' not found in {script}[/red]")
        console.print(f"[dim]The script must define a callable named '{fn_name}'.[/dim]")
        sys.exit(1)

    console.print(f"[bold blue]MacFleet Run[/bold blue] — {script}:{fn_name}()")

    from macfleet.sdk.pool import Pool

    with Pool(token=token, open=open_fleet) as pool:
        # Time only the pool.run() call, not pool setup/teardown.
        t0 = time.time()
        result = pool.run(fn)
        elapsed = time.time() - t0

    console.print(f"\n[green]Completed in {elapsed:.2f}s[/green]")
    if result is not None:
        console.print(f"Result: {result}")
|
|
419
|
+
|
|
420
|
+
|
|
329
421
|
@cli.command()
|
|
330
422
|
@click.option("--type", "bench_type", type=click.Choice(["network", "compute", "allreduce"]), default="network")
|
|
331
423
|
@click.option("--size-mb", default=10, help="Payload size in MB for network tests")
|
|
@@ -174,7 +174,7 @@ class CollectiveGroup:
|
|
|
174
174
|
# Flatten for chunking
|
|
175
175
|
original_shape = array.shape
|
|
176
176
|
original_dtype = array.dtype
|
|
177
|
-
flat = array.
|
|
177
|
+
flat = array.flatten()
|
|
178
178
|
numel = len(flat)
|
|
179
179
|
|
|
180
180
|
# Pad to be evenly divisible
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Binary wire protocol for tensor transport.
|
|
2
|
+
|
|
3
|
+
Extended from MacFleet v1's 16-byte header to 24 bytes with:
|
|
4
|
+
- Stream multiplexing (stream_id)
|
|
5
|
+
- CRC32 checksums (critical for WiFi reliability)
|
|
6
|
+
- Chunking flags for large tensors
|
|
7
|
+
- Sequence numbers for ordering
|
|
8
|
+
|
|
9
|
+
Header (24 bytes):
|
|
10
|
+
stream_id: uint32 (multiplexing: control=0, tensor=1..N)
|
|
11
|
+
msg_type: uint16 (CONTROL=1, TENSOR=2, HEARTBEAT=3, GRADIENT=4, COMPRESSED_GRADIENT=5, BARRIER=6, STATE=7, TASK=8, RESULT=9)
|
|
12
|
+
flags: uint16 (bit 0: compressed, bit 1: chunked, bit 2: last_chunk)
|
|
13
|
+
payload_size: uint32 (bytes)
|
|
14
|
+
sequence: uint32 (ordering within stream)
|
|
15
|
+
checksum: uint32 (CRC32 of payload)
|
|
16
|
+
reserved: uint32 (future use)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import struct
|
|
20
|
+
import zlib
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from enum import IntEnum, IntFlag
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MessageType(IntEnum):
    """Kinds of message that can travel over the wire protocol.

    Members are plain integers (IntEnum), so they can be compared with and
    converted from raw integer values.
    """

    CONTROL = 1
    TENSOR = 2
    HEARTBEAT = 3
    GRADIENT = 4
    COMPRESSED_GRADIENT = 5
    BARRIER = 6
    STATE = 7
    TASK = 8
    RESULT = 9
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class MessageFlags(IntFlag):
    """Bit flags describing message metadata; members combine with ``|``."""

    NONE = 0
    COMPRESSED = 1 << 0
    CHUNKED = 1 << 1
    LAST_CHUNK = 1 << 2
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Header layout in network byte order ("!"), 24 bytes in total:
#   stream_id(I) msg_type(H) flags(H) payload_size(I) sequence(I) checksum(I) reserved(I)
HEADER_FORMAT = "!IHHIIII"
HEADER_SIZE = struct.calcsize(HEADER_FORMAT)  # 24 bytes

# SECURITY: upper bound on the payload size accepted from a header, so that a
# malicious or corrupt header cannot force a huge allocation (OOM).
# Note that 256 MB is *smaller* than a raw 100M-element float32 tensor
# (400 MB); the limit relies on compressed gradients being much smaller.
MAX_PAYLOAD_SIZE = 256 * 1024**2  # 256 MB
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class WireMessage:
    """A message on the wire: a fixed-size header followed by the payload.

    ``pack()`` produces the byte form; ``unpack()`` and ``read_from_stream()``
    parse it back and verify the payload's CRC32 against the header.
    """

    stream_id: int
    msg_type: MessageType
    flags: MessageFlags
    sequence: int
    payload: bytes
    # CRC32 taken from the header when a message is parsed. pack() does not
    # read this field; it always recomputes the checksum from the payload.
    checksum: int = 0

    def pack(self) -> bytes:
        """Serialize to bytes (header + payload).

        The checksum written is the CRC32 of the current payload, and the
        reserved header word is written as 0.
        """
        checksum = zlib.crc32(self.payload) & 0xFFFFFFFF
        header = struct.pack(
            HEADER_FORMAT,
            self.stream_id,
            self.msg_type,
            self.flags,
            len(self.payload),
            self.sequence,
            checksum,
            0,  # reserved
        )
        return header + self.payload

    @classmethod
    def _verified(
        cls,
        stream_id: int,
        msg_type: int,
        flags: int,
        sequence: int,
        checksum: int,
        payload: bytes,
    ) -> "WireMessage":
        """Check the payload CRC32 against the header and build the message.

        Shared by unpack() and read_from_stream() so both apply exactly the
        same validation.

        Raises:
            ValueError: if the CRC32 of ``payload`` differs from ``checksum``,
                or if ``msg_type`` is not a valid MessageType value.
        """
        actual_checksum = zlib.crc32(payload) & 0xFFFFFFFF
        if actual_checksum != checksum:
            raise ValueError(
                f"CRC32 mismatch: expected {checksum:#x}, got {actual_checksum:#x}"
            )

        return cls(
            stream_id=stream_id,
            msg_type=MessageType(msg_type),
            flags=MessageFlags(flags),
            sequence=sequence,
            payload=payload,
            checksum=checksum,
        )

    @classmethod
    def unpack(cls, data: bytes) -> "WireMessage":
        """Deserialize one message from an in-memory buffer.

        Bytes after the declared payload are ignored.

        Raises:
            struct.error: if ``data`` is shorter than the header.
            ValueError: if ``data`` holds fewer payload bytes than the header
                declares, on CRC32 mismatch, or on an unknown message type.
        """
        stream_id, msg_type, flags, payload_size, sequence, checksum, _ = struct.unpack(
            HEADER_FORMAT, data[:HEADER_SIZE]
        )
        payload = data[HEADER_SIZE : HEADER_SIZE + payload_size]

        # Slicing never fails on a short buffer, so a truncated message must
        # be detected explicitly. Without this, a header with checksum 0 and
        # a missing payload would pass the CRC check (crc32(b"") == 0).
        if len(payload) != payload_size:
            raise ValueError(
                f"Truncated message: header declares {payload_size} payload bytes, "
                f"got {len(payload)}"
            )

        return cls._verified(stream_id, msg_type, flags, sequence, checksum, payload)

    @classmethod
    async def read_from_stream(cls, reader) -> "WireMessage":
        """Read a single message from an asyncio StreamReader.

        ``reader`` must provide an awaitable ``readexactly(n)``.

        Raises:
            ValueError: if the declared payload size exceeds MAX_PAYLOAD_SIZE,
                on CRC32 mismatch, or on an unknown message type.
        """
        header_data = await reader.readexactly(HEADER_SIZE)
        stream_id, msg_type, flags, payload_size, sequence, checksum, _ = struct.unpack(
            HEADER_FORMAT, header_data
        )
        # Reject oversized declarations before reading the payload.
        if payload_size > MAX_PAYLOAD_SIZE:
            raise ValueError(
                f"Payload size {payload_size} exceeds maximum {MAX_PAYLOAD_SIZE} "
                f"— possible OOM attack or corrupt header"
            )
        payload = await reader.readexactly(payload_size)

        return cls._verified(stream_id, msg_type, flags, sequence, checksum, payload)
|
|
134
|
+
|
|
135
|
+
|