coolingcube 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: coolingcube
3
+ Version: 0.1.0
4
+ Summary: GPU cluster tail latency optimizer for DDP training and inference
5
+ Home-page: https://coolingcube.cc
6
+ Author: Cooling Cube
7
+ Author-email: CoolingCubeInfo@proton.me
8
+ Keywords: gpu ddp training inference tail-latency optimization multi-gpu
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Classifier: Topic :: System :: Distributed Computing
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ Requires-Dist: requests>=2.20
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: requires-dist
25
+ Dynamic: requires-python
26
+ Dynamic: summary
27
+
28
+ # Cooling Cube
29
+
30
+ GPU cluster tail latency optimizer for DDP training and multi-GPU inference.
31
+
32
+ Reduces per-step tail latency by identifying pressure workers and computing start-time offsets. Works on 8–64 GPU clusters. Zero negative gains guaranteed.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install coolingcube
38
+ ```
39
+
40
+ ## CLI usage
41
+
42
+ ```bash
43
+ # From a timing log file
44
+ coolingcube --logs timing.json
45
+
46
+ # Inline worker times (microseconds)
47
+ coolingcube --workers '{"0": 12000, "1": 11800, "2": 15500, "3": 12500}'
48
+
49
+ # JSON output
50
+ coolingcube --logs timing.json --json
51
+ ```
52
+
53
+ ## Python usage
54
+
55
+ ```python
56
+ from coolingcube import optimize
57
+
58
+ result = optimize({
59
+ "0": 12000,
60
+ "1": 11800,
61
+ "2": 15500,
62
+ "3": 12500,
63
+ })
64
+
65
+ print(f"Gain: {result['gain_pct']:.3f}% ({result['gain_us']:.1f} µs)")
66
+ print(f"Schedule: {result['best_schedule']}")
67
+ ```
68
+
69
+ ## Collecting logs from PyTorch DDP
70
+
71
+ ```python
72
+ import time
73
+ import torch
74
+ import torch.distributed as dist
75
+
76
+ timing_logs = {}
77
+
78
+ for step in range(num_steps):
79
+ t0 = time.perf_counter()
80
+ loss = model(batch)
81
+ loss.backward()
82
+ optimizer.step()
83
+ dist.barrier()
84
+ elapsed_us = (time.perf_counter() - t0) * 1e6
85
+
86
+ rank = dist.get_rank()
87
+ timing_logs[str(rank)] = elapsed_us # average over steps in practice
88
+
89
+ # After training loop, on rank 0:
90
+ if dist.get_rank() == 0:
91
+ from coolingcube import optimize
92
+ result = optimize(timing_logs)
93
+ print(f"Gain: {result['gain_pct']:.3f}%")
94
+ ```
95
+
96
+ ## Log file formats
97
+
98
+ Cooling Cube accepts several formats automatically:
99
+
100
+ ```json
101
+ {"0": 12345, "1": 11800, "2": 13000}
102
+ ```
103
+
104
+ ```json
105
+ [{"rank": 0, "total_iter_time": 0.186}, {"rank": 1, "total_iter_time": 0.212}]
106
+ ```
107
+
108
+ ```json
109
+ {"workers": [{"rank": 0, "step_time": 0.172}, ...]}
110
+ ```
111
+
112
+ ## How it works
113
+
114
+ Standard DDP holds all workers at the barrier until the slowest finishes. The bottleneck is usually not the straggler itself but the pressure workers pushing it — workers with slightly elevated times that create synchronization pressure.
115
+
116
+ Cooling Cube identifies those pressure workers and computes per-worker start-time offsets to reduce tail latency. The algorithm uses a Ridge surrogate model and converges in 40–80 oracle calls regardless of cluster size.
117
+
118
+ Typical gains: 0.2–0.9% step-time reduction on heterogeneous or PCIe-bound clusters.
119
+
120
+ ## Free
121
+
122
+ Free for open source and research use. No account required.
123
+
124
+ https://coolingcube.cc · CoolingCubeInfo@proton.me
@@ -0,0 +1,97 @@
1
+ # Cooling Cube
2
+
3
+ GPU cluster tail latency optimizer for DDP training and multi-GPU inference.
4
+
5
+ Reduces per-step tail latency by identifying pressure workers and computing start-time offsets. Works on 8–64 GPU clusters. Zero negative gains guaranteed.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install coolingcube
11
+ ```
12
+
13
+ ## CLI usage
14
+
15
+ ```bash
16
+ # From a timing log file
17
+ coolingcube --logs timing.json
18
+
19
+ # Inline worker times (microseconds)
20
+ coolingcube --workers '{"0": 12000, "1": 11800, "2": 15500, "3": 12500}'
21
+
22
+ # JSON output
23
+ coolingcube --logs timing.json --json
24
+ ```
25
+
26
+ ## Python usage
27
+
28
+ ```python
29
+ from coolingcube import optimize
30
+
31
+ result = optimize({
32
+ "0": 12000,
33
+ "1": 11800,
34
+ "2": 15500,
35
+ "3": 12500,
36
+ })
37
+
38
+ print(f"Gain: {result['gain_pct']:.3f}% ({result['gain_us']:.1f} µs)")
39
+ print(f"Schedule: {result['best_schedule']}")
40
+ ```
41
+
42
+ ## Collecting logs from PyTorch DDP
43
+
44
+ ```python
45
+ import time
46
+ import torch
47
+ import torch.distributed as dist
48
+
49
+ timing_logs = {}
50
+
51
+ for step in range(num_steps):
52
+ t0 = time.perf_counter()
53
+ loss = model(batch)
54
+ loss.backward()
55
+ optimizer.step()
56
+ dist.barrier()
57
+ elapsed_us = (time.perf_counter() - t0) * 1e6
58
+
59
+ rank = dist.get_rank()
60
+ timing_logs[str(rank)] = elapsed_us # average over steps in practice
61
+
62
+ # After training loop, on rank 0:
63
+ if dist.get_rank() == 0:
64
+ from coolingcube import optimize
65
+ result = optimize(timing_logs)
66
+ print(f"Gain: {result['gain_pct']:.3f}%")
67
+ ```
68
+
69
+ ## Log file formats
70
+
71
+ Cooling Cube accepts several formats automatically:
72
+
73
+ ```json
74
+ {"0": 12345, "1": 11800, "2": 13000}
75
+ ```
76
+
77
+ ```json
78
+ [{"rank": 0, "total_iter_time": 0.186}, {"rank": 1, "total_iter_time": 0.212}]
79
+ ```
80
+
81
+ ```json
82
+ {"workers": [{"rank": 0, "step_time": 0.172}, ...]}
83
+ ```
84
+
85
+ ## How it works
86
+
87
+ Standard DDP holds all workers at the barrier until the slowest finishes. The bottleneck is usually not the straggler itself but the pressure workers pushing it — workers with slightly elevated times that create synchronization pressure.
88
+
89
+ Cooling Cube identifies those pressure workers and computes per-worker start-time offsets to reduce tail latency. The algorithm uses a Ridge surrogate model and converges in 40–80 oracle calls regardless of cluster size.
90
+
91
+ Typical gains: 0.2–0.9% step-time reduction on heterogeneous or PCIe-bound clusters.
92
+
93
+ ## Free
94
+
95
+ Free for open source and research use. No account required.
96
+
97
+ https://coolingcube.cc · CoolingCubeInfo@proton.me
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: coolingcube
3
+ Version: 0.1.0
4
+ Summary: GPU cluster tail latency optimizer for DDP training and inference
5
+ Home-page: https://coolingcube.cc
6
+ Author: Cooling Cube
7
+ Author-email: CoolingCubeInfo@proton.me
8
+ Keywords: gpu ddp training inference tail-latency optimization multi-gpu
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Classifier: Topic :: System :: Distributed Computing
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ Requires-Dist: requests>=2.20
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: requires-dist
25
+ Dynamic: requires-python
26
+ Dynamic: summary
27
+
28
+ # Cooling Cube
29
+
30
+ GPU cluster tail latency optimizer for DDP training and multi-GPU inference.
31
+
32
+ Reduces per-step tail latency by identifying pressure workers and computing start-time offsets. Works on 8–64 GPU clusters. Zero negative gains guaranteed.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install coolingcube
38
+ ```
39
+
40
+ ## CLI usage
41
+
42
+ ```bash
43
+ # From a timing log file
44
+ coolingcube --logs timing.json
45
+
46
+ # Inline worker times (microseconds)
47
+ coolingcube --workers '{"0": 12000, "1": 11800, "2": 15500, "3": 12500}'
48
+
49
+ # JSON output
50
+ coolingcube --logs timing.json --json
51
+ ```
52
+
53
+ ## Python usage
54
+
55
+ ```python
56
+ from coolingcube import optimize
57
+
58
+ result = optimize({
59
+ "0": 12000,
60
+ "1": 11800,
61
+ "2": 15500,
62
+ "3": 12500,
63
+ })
64
+
65
+ print(f"Gain: {result['gain_pct']:.3f}% ({result['gain_us']:.1f} µs)")
66
+ print(f"Schedule: {result['best_schedule']}")
67
+ ```
68
+
69
+ ## Collecting logs from PyTorch DDP
70
+
71
+ ```python
72
+ import time
73
+ import torch
74
+ import torch.distributed as dist
75
+
76
+ timing_logs = {}
77
+
78
+ for step in range(num_steps):
79
+ t0 = time.perf_counter()
80
+ loss = model(batch)
81
+ loss.backward()
82
+ optimizer.step()
83
+ dist.barrier()
84
+ elapsed_us = (time.perf_counter() - t0) * 1e6
85
+
86
+ rank = dist.get_rank()
87
+ timing_logs[str(rank)] = elapsed_us # average over steps in practice
88
+
89
+ # After training loop, on rank 0:
90
+ if dist.get_rank() == 0:
91
+ from coolingcube import optimize
92
+ result = optimize(timing_logs)
93
+ print(f"Gain: {result['gain_pct']:.3f}%")
94
+ ```
95
+
96
+ ## Log file formats
97
+
98
+ Cooling Cube accepts several formats automatically:
99
+
100
+ ```json
101
+ {"0": 12345, "1": 11800, "2": 13000}
102
+ ```
103
+
104
+ ```json
105
+ [{"rank": 0, "total_iter_time": 0.186}, {"rank": 1, "total_iter_time": 0.212}]
106
+ ```
107
+
108
+ ```json
109
+ {"workers": [{"rank": 0, "step_time": 0.172}, ...]}
110
+ ```
111
+
112
+ ## How it works
113
+
114
+ Standard DDP holds all workers at the barrier until the slowest finishes. The bottleneck is usually not the straggler itself but the pressure workers pushing it — workers with slightly elevated times that create synchronization pressure.
115
+
116
+ Cooling Cube identifies those pressure workers and computes per-worker start-time offsets to reduce tail latency. The algorithm uses a Ridge surrogate model and converges in 40–80 oracle calls regardless of cluster size.
117
+
118
+ Typical gains: 0.2–0.9% step-time reduction on heterogeneous or PCIe-bound clusters.
119
+
120
+ ## Free
121
+
122
+ Free for open source and research use. No account required.
123
+
124
+ https://coolingcube.cc · CoolingCubeInfo@proton.me
@@ -0,0 +1,8 @@
1
+ README.md
2
+ setup.py
3
+ coolingcube.egg-info/PKG-INFO
4
+ coolingcube.egg-info/SOURCES.txt
5
+ coolingcube.egg-info/dependency_links.txt
6
+ coolingcube.egg-info/entry_points.txt
7
+ coolingcube.egg-info/requires.txt
8
+ coolingcube.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ coolingcube = coolingcube.cli:main
@@ -0,0 +1 @@
1
+ requests>=2.20
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
from setuptools import setup, find_packages

# setup.py for the coolingcube distribution.
# Reads the long description from the README shipped alongside this script;
# will raise FileNotFoundError if run outside the sdist root (intentional —
# a package built without its README would have empty PyPI description).
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="coolingcube",
    version="0.1.0",
    author="Cooling Cube",
    author_email="CoolingCubeInfo@proton.me",
    description="GPU cluster tail latency optimizer for DDP training and inference",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://coolingcube.cc",
    # Explicit license field: the MIT trove classifier below implies it,
    # but packaging tools populate the metadata "License:" field from this
    # dedicated argument, which was previously missing.
    license="MIT",
    packages=find_packages(),
    python_requires=">=3.8",
    install_requires=[
        "requests>=2.20",
    ],
    # Console entry point: `coolingcube` dispatches to coolingcube.cli:main.
    entry_points={
        "console_scripts": [
            "coolingcube=coolingcube.cli:main",
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: System :: Distributed Computing",
    ],
    keywords="gpu ddp training inference tail-latency optimization multi-gpu",
)