coolingcube 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: coolingcube
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: GPU cluster tail latency optimizer for DDP training and inference
|
|
5
|
+
Home-page: https://coolingcube.cc
|
|
6
|
+
Author: Cooling Cube
|
|
7
|
+
Author-email: CoolingCubeInfo@proton.me
|
|
8
|
+
Keywords: gpu ddp training inference tail-latency optimization multi-gpu
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
Requires-Dist: requests>=2.20
|
|
17
|
+
Dynamic: author
|
|
18
|
+
Dynamic: author-email
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: keywords
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# Cooling Cube
|
|
29
|
+
|
|
30
|
+
GPU cluster tail latency optimizer for DDP training and multi-GPU inference.
|
|
31
|
+
|
|
32
|
+
Reduces per-step tail latency by identifying pressure workers and computing start-time offsets. Works on 8–64 GPU clusters. The optimizer never increases step time (zero negative gains).
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install coolingcube
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## CLI usage
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# From a timing log file
|
|
44
|
+
coolingcube --logs timing.json
|
|
45
|
+
|
|
46
|
+
# Inline worker times (microseconds)
|
|
47
|
+
coolingcube --workers '{"0": 12000, "1": 11800, "2": 15500, "3": 12500}'
|
|
48
|
+
|
|
49
|
+
# JSON output
|
|
50
|
+
coolingcube --logs timing.json --json
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Python usage
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from coolingcube import optimize
|
|
57
|
+
|
|
58
|
+
result = optimize({
|
|
59
|
+
"0": 12000,
|
|
60
|
+
"1": 11800,
|
|
61
|
+
"2": 15500,
|
|
62
|
+
"3": 12500,
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
print(f"Gain: {result['gain_pct']:.3f}% ({result['gain_us']:.1f} µs)")
|
|
66
|
+
print(f"Schedule: {result['best_schedule']}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Collecting logs from PyTorch DDP
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import time
|
|
73
|
+
import torch
|
|
74
|
+
import torch.distributed as dist
|
|
75
|
+
|
|
76
|
+
timing_logs = {}
|
|
77
|
+
|
|
78
|
+
for step in range(num_steps):
|
|
79
|
+
t0 = time.perf_counter()
|
|
80
|
+
loss = model(batch)
|
|
81
|
+
loss.backward()
|
|
82
|
+
optimizer.step()
|
|
83
|
+
dist.barrier()
|
|
84
|
+
elapsed_us = (time.perf_counter() - t0) * 1e6
|
|
85
|
+
|
|
86
|
+
rank = dist.get_rank()
|
|
87
|
+
    timing_logs[str(rank)] = elapsed_us  # NOTE: overwrites each step; in practice, accumulate and average over all steps
|
|
88
|
+
|
|
89
|
+
# After training loop, on rank 0:
|
|
90
|
+
if dist.get_rank() == 0:
|
|
91
|
+
from coolingcube import optimize
|
|
92
|
+
result = optimize(timing_logs)
|
|
93
|
+
print(f"Gain: {result['gain_pct']:.3f}%")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Log file formats
|
|
97
|
+
|
|
98
|
+
Cooling Cube automatically detects and accepts several log formats:
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{"0": 12345, "1": 11800, "2": 13000}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
[{"rank": 0, "total_iter_time": 0.186}, {"rank": 1, "total_iter_time": 0.212}]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
```json
|
|
109
|
+
{"workers": [{"rank": 0, "step_time": 0.172}, ...]}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## How it works
|
|
113
|
+
|
|
114
|
+
Standard DDP holds all workers at the barrier until the slowest finishes. The bottleneck is usually not the straggler itself but the pressure workers pushing it — workers with slightly elevated times that create synchronization pressure.
|
|
115
|
+
|
|
116
|
+
Cooling Cube identifies those pressure workers and computes per-worker start-time offsets to reduce tail latency. The algorithm uses a Ridge surrogate model and converges in 40–80 oracle calls regardless of cluster size.
|
|
117
|
+
|
|
118
|
+
Typical gains: 0.2–0.9% step-time reduction on heterogeneous or PCIe-bound clusters.
|
|
119
|
+
|
|
120
|
+
## Free
|
|
121
|
+
|
|
122
|
+
Free for open source and research use. No account required.
|
|
123
|
+
|
|
124
|
+
https://coolingcube.cc · CoolingCubeInfo@proton.me
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
coolingcube-0.1.0.dist-info/METADATA,sha256=QAmdjnR_Iq947aVsj496Tt1MjAnMKvLQrIuM_147VCo,3397
|
|
2
|
+
coolingcube-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
3
|
+
coolingcube-0.1.0.dist-info/entry_points.txt,sha256=7whiLFJb1Ek49oge-mnFCHkfsDO3YgBHUpLKTQx3e3g,53
|
|
4
|
+
coolingcube-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
5
|
+
coolingcube-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|