thermal-mcp-server 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thermal_mcp_server-0.3.0/LICENSE +21 -0
- thermal_mcp_server-0.3.0/PKG-INFO +254 -0
- thermal_mcp_server-0.3.0/README.md +198 -0
- thermal_mcp_server-0.3.0/pyproject.toml +61 -0
- thermal_mcp_server-0.3.0/setup.cfg +4 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server/__init__.py +3 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server/__main__.py +5 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server/mcp_server.py +290 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server/physics.py +317 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server/schemas.py +187 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server.egg-info/PKG-INFO +254 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server.egg-info/SOURCES.txt +16 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server.egg-info/dependency_links.txt +1 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server.egg-info/requires.txt +12 -0
- thermal_mcp_server-0.3.0/src/thermal_mcp_server.egg-info/top_level.txt +2 -0
- thermal_mcp_server-0.3.0/tests/test_demo_helpers.py +69 -0
- thermal_mcp_server-0.3.0/tests/test_mcp_tools.py +244 -0
- thermal_mcp_server-0.3.0/tests/test_physics_behavior.py +445 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Riccardo Vietri
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: thermal-mcp-server
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: MCP server for datacenter GPU liquid cooling thermal analysis
|
|
5
|
+
Author: Riccardo Vietri
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Riccardo Vietri
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/riccardovietri/thermal-mcp-server
|
|
29
|
+
Project-URL: Source, https://github.com/riccardovietri/thermal-mcp-server
|
|
30
|
+
Project-URL: Bug Tracker, https://github.com/riccardovietri/thermal-mcp-server/issues
|
|
31
|
+
Keywords: mcp,thermal,cooling,gpu,datacenter,liquid-cooling,nvidia,h100,b200
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: Scientific/Engineering :: Physics
|
|
41
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Requires-Dist: fastmcp<3,>=2.0
|
|
46
|
+
Requires-Dist: pydantic>=2.6.0
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
50
|
+
Provides-Extra: demo
|
|
51
|
+
Requires-Dist: ipywidgets>=8.1.0; extra == "demo"
|
|
52
|
+
Requires-Dist: jupyter>=1.1.0; extra == "demo"
|
|
53
|
+
Requires-Dist: matplotlib>=3.8.0; extra == "demo"
|
|
54
|
+
Requires-Dist: pandas>=2.2.0; extra == "demo"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
[](https://github.com/riccardovietri/thermal-mcp-server/actions/workflows/ci.yml)
|
|
58
|
+
[](https://pypi.org/project/thermal-mcp-server/)
|
|
59
|
+
[](https://pypi.org/project/thermal-mcp-server/)
|
|
60
|
+
|
|
61
|
+
# thermal-mcp-server
|
|
62
|
+
|
|
63
|
+
**A physics engine for liquid-cooled GPU systems, exposed as an AI-callable MCP server.** Ask Claude to size a CDU for an NVL72 rack, optimize flow rates for an H100 cluster, or compare water versus glycol — and get first-principles answers backed by hand-validated thermal models.
|
|
64
|
+
|
|
65
|
+
## Why This Exists
|
|
66
|
+
|
|
67
|
+
GPU power density has grown roughly 3× in two generations: H100 SXM at 700 W, B200 at 1,200 W. A single NVL72 rack dissipates 86.4 kW — more than most commercial HVAC systems handle. Liquid cooling is no longer optional; it is the critical path for AI compute density, and it is now the infrastructure constraint that determines whether a data center can host the next generation of hardware.
|
|
68
|
+
|
|
69
|
+
The tooling hasn't kept up. CDU sizing decisions get made with vendor charts, spreadsheets, and intuition. `thermal-mcp-server` is a first-principles thermal resistance model that an AI assistant can call directly — validated against published chip specs, honest about its assumptions, and designed to answer the questions that actually matter in a procurement conversation.
|
|
70
|
+
|
|
71
|
+
## Demo
|
|
72
|
+
|
|
73
|
+
Ask Claude natural-language questions. The server handles the physics.
|
|
74
|
+
|
|
75
|
+
<img width="1768" height="1750" alt="Claude answering a liquid cooling question using thermal-mcp-server" src="https://github.com/user-attachments/assets/7e3fb436-38d2-477b-a4dd-e5a2a740d463" />
|
|
76
|
+
|
|
77
|
+
### NVL72 CDU sizing — the procurement question
|
|
78
|
+
|
|
79
|
+
**The question:** *What's the minimum CDU flow rate to keep 72 B200 GPUs below 75°C junction temperature at 1,200 W each, with 25°C supply water?*
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from thermal_mcp_server.physics import optimize_flow, analyze_rack
|
|
83
|
+
from thermal_mcp_server.schemas import OptimizeFlowRateInput, AnalyzeRackInput, Geometry
|
|
84
|
+
|
|
85
|
+
# B200-specific cold plate geometry (engineering estimate; NVIDIA does not publish)
|
|
86
|
+
# 60 × 0.7 mm × 1.5 mm channels, 100 mm length, 160 cm² contact area
|
|
87
|
+
B200_PLATE = Geometry(
|
|
88
|
+
channel_count=60, channel_width_m=0.7e-3, channel_height_m=1.5e-3,
|
|
89
|
+
channel_length_m=0.10, base_thickness_m=1.5e-3,
|
|
90
|
+
contact_area_m2=0.016, copper_k_w_mk=385.0,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Find minimum flow per cold plate
|
|
94
|
+
opt = optimize_flow(OptimizeFlowRateInput(
|
|
95
|
+
heat_load_w=1200, max_junction_temp_c=75.0,
|
|
96
|
+
inlet_temp_c=25.0, coolant="water",
|
|
97
|
+
r_jc_k_per_w=0.02, r_tim_k_per_w=0.015, # B200 estimates
|
|
98
|
+
geometry=B200_PLATE,
|
|
99
|
+
))
|
|
100
|
+
min_flow_per_gpu, analysis = opt
|
|
101
|
+
print(f"Minimum flow: {min_flow_per_gpu:.1f} LPM/GPU → {min_flow_per_gpu * 72:.0f} LPM total rack flow")
|
|
102
|
+
|
|
103
|
+
# Full rack spec at that flow
|
|
104
|
+
rack = analyze_rack(AnalyzeRackInput(
|
|
105
|
+
gpu_count=72, topology="parallel",
|
|
106
|
+
heat_load_per_gpu_w=1200, total_flow_lpm=min_flow_per_gpu * 72,
|
|
107
|
+
cdu_supply_temp_c=25.0, coolant="water",
|
|
108
|
+
r_jc_k_per_w=0.02, r_tim_k_per_w=0.015,
|
|
109
|
+
geometry=B200_PLATE,
|
|
110
|
+
))
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
Minimum flow: 9.3 LPM/GPU → 671 LPM total rack flow
|
|
115
|
+
|
|
116
|
+
Rack spec at minimum flow:
|
|
117
|
+
Max junction temp: 75.0°C (at 75°C design limit)
|
|
118
|
+
Cold plate ΔP: 0.096 bar (cold plates only; add 20–50% for manifold)
|
|
119
|
+
CDU return temp: 26.9°C
|
|
120
|
+
Pump power: 214 W
|
|
121
|
+
Total heat load: 86.4 kW
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**What this tells a procurement team:** Minimum CDU flow spec is 671 LPM. Cold plate ΔP headroom is tight at 0.096 bar — real system ΔP with manifold and headers will be higher. With a 3°C design margin (target 72°C), the spec rises to 777 LPM at 0.141 bar.
|
|
125
|
+
|
|
126
|
+
### H100 SXM — validated baseline
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from thermal_mcp_server.physics import analyze
|
|
130
|
+
from thermal_mcp_server.schemas import AnalyzeColdplateInput
|
|
131
|
+
|
|
132
|
+
result = analyze(AnalyzeColdplateInput(
|
|
133
|
+
heat_load_w=700, flow_rate_lpm=8.0, inlet_temp_c=25.0, coolant="water"
|
|
134
|
+
))
|
|
135
|
+
# junction_temp_c: 70.9 — 12.1°C margin below 83°C throttle onset
|
|
136
|
+
# regime: transitional (Re ≈ 3734)
|
|
137
|
+
# pressure_drop_pa: 16800 (0.17 bar per cold plate)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
This case is hand-calculation validated: every intermediate value — Reynolds number, Nusselt number, convection coefficient, pressure drop — is independently verified in `tests/test_physics_behavior.py`.
|
|
141
|
+
|
|
142
|
+
## How It Works
|
|
143
|
+
|
|
144
|
+
The physics engine models a cold plate as a 1D thermal resistance network:
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
T_junction = T_inlet + Q × (R_jc + R_tim + R_base + R_conv) + ΔT_coolant/2
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
- **R_jc / R_tim:** Package resistances (chip manufacturer spec or estimate)
|
|
151
|
+
- **R_base:** Copper base conduction (geometry + k = 385 W/m·K)
|
|
152
|
+
- **R_conv:** Forced convection, Dittus-Boelter (turbulent) or Nu = 4.36 (laminar), linearly blended through transition (Re 2,300–4,000)
|
|
153
|
+
- **ΔP:** Darcy-Weisbach with Blasius friction factor, same transition blend
|
|
154
|
+
|
|
155
|
+
Rack-level model stacks N single-GPU analyses in series (cumulative temperature rise) or parallel (uniform inlet, flow split) topology.
|
|
156
|
+
|
|
157
|
+
```mermaid
|
|
158
|
+
flowchart LR
|
|
159
|
+
A["Input\nchip power, flow,\ncoolant, geometry"] --> B["Physics Engine\nDittus-Boelter · Darcy-Weisbach\nR_total network"]
|
|
160
|
+
B --> C["Output\nT_junction · ΔP\nthermal margin · pump power"]
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Validation
|
|
164
|
+
|
|
165
|
+
Model outputs against published chip specs. All runs use water coolant, 25°C inlet.
|
|
166
|
+
|
|
167
|
+
| Chip | TDP | Tj Design Ceiling | Model Tj | Margin | Notes |
|
|
168
|
+
|------|-----|-------------------|----------|--------|-------|
|
|
169
|
+
| H100 SXM | 700 W | 83°C | **70.9°C** at 8 LPM | 12.1°C | Default geometry; hand-calc validated |
|
|
170
|
+
| MI300X | 750 W | ~85°C (proxy) | **74.2°C** at 8 LPM | ~10°C | AMD does not publish Tj_max |
|
|
171
|
+
| B200 NVL72 | 1,200 W | ~75°C (est.) | **75.0°C** at 9.3 LPM/GPU | 0°C at limit | R_jc=0.02 K/W est.; NVIDIA does not publish |
|
|
172
|
+
| Gaudi 3 OAM | 900 W (air) / 1,200 W (liquid) | ~85°C (proxy) | Requires B200-class geometry | — | Default H100 geometry undersized for 1,200 W |
|
|
173
|
+
|
|
174
|
+
> **On B200 and Gaudi 3 numbers:** NVIDIA and Intel do not publish cold plate geometry or R_jc for these chips. The B200 analysis uses an engineering estimate for cold plate geometry and package resistance. Treat as indicative; real procurement sizing requires vendor data.
|
|
175
|
+
|
|
176
|
+
Chip sources: [NVIDIA H100 Datasheet](https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306) · [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/) · [SemiAnalysis B200 thermal estimates](https://newsletter.semianalysis.com/p/gb200-hardware-architecture-and-component) · [AMD MI300X Data Sheet](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf) · [Intel Gaudi 3 Product Brief](https://cdrdv2-public.intel.com/817487/gaudi-3-ai-accelerator-hl-325l-oam-mezzanine-card-product-brief.pdf)
|
|
177
|
+
|
|
178
|
+
## Known Limitations
|
|
179
|
+
|
|
180
|
+
These are documented explicitly because they bound what the model can and cannot tell you:
|
|
181
|
+
|
|
182
|
+
- **No manifold or header pressure losses** — rack ΔP is cold-plate-only. Real system ΔP should add 20–50% for manifold losses. Do not use cold-plate ΔP as the CDU pump spec without this adder.
|
|
183
|
+
- **No heterogeneous racks** — all GPUs assumed identical TDP, geometry, and thermal resistance. Mixed-SKU racks require per-GPU analysis.
|
|
184
|
+
- **Steady-state only** — no transient thermal capacitance. Power-on ramps, burst workloads, and cooldown curves are not modeled.
|
|
185
|
+
- **Single-point fluid properties** — water and glycol50 properties are fixed at 25°C nominal. No correction for temperature rise along the flow path.
|
|
186
|
+
- **No flow maldistribution** — uniform flow assumed across all cold plates. Real parallel manifolds have ±10–30% flow variation depending on header design.
|
|
187
|
+
|
|
188
|
+
See [`docs/physics.md`](docs/physics.md) for the full physics documentation including equations and assumptions.
|
|
189
|
+
|
|
190
|
+
## Tools
|
|
191
|
+
|
|
192
|
+
- **`analyze_coldplate`** — Single-point thermal and hydraulic analysis. Returns junction temperature, full resistance breakdown, pressure drop, regime, and pump power.
|
|
193
|
+
|
|
194
|
+
- **`compare_coolants`** — Side-by-side water vs. 50/50 glycol comparison at identical conditions. Quantifies the thermal and hydraulic penalty of glycol for freeze-protection applications.
|
|
195
|
+
|
|
196
|
+
- **`optimize_flow_rate`** — Binary search for minimum flow rate to meet a junction temperature target. Returns the optimal flow and full thermal analysis at that point.
|
|
197
|
+
|
|
198
|
+
- **`analyze_rack`** — Rack-level model for N identical GPUs in series or parallel topology. Returns max junction temperature, per-GPU temperature list, total CDU flow requirement, system ΔP, pump power, and CDU return temperature.
|
|
199
|
+
|
|
200
|
+
See [`docs/mcp.md`](docs/mcp.md) for full input/output schemas and field definitions.
|
|
201
|
+
|
|
202
|
+
## Quick Start
|
|
203
|
+
|
|
204
|
+
**Install from PyPI:**
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
python -m venv thermal-venv
|
|
208
|
+
source thermal-venv/bin/activate
|
|
209
|
+
pip install thermal-mcp-server
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Configure in your MCP client (`claude_desktop_config.json`):
|
|
213
|
+
|
|
214
|
+
```json
|
|
215
|
+
{
|
|
216
|
+
"mcpServers": {
|
|
217
|
+
"thermal": {
|
|
218
|
+
"command": "/absolute/path/to/thermal-venv/bin/python",
|
|
219
|
+
"args": ["-m", "thermal_mcp_server"]
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
> Use the absolute path to your venv Python. Claude Desktop does not inherit your shell's `PATH`.
|
|
226
|
+
|
|
227
|
+
**Install from source:**
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
git clone https://github.com/riccardovietri/thermal-mcp-server.git
|
|
231
|
+
cd thermal-mcp-server
|
|
232
|
+
python -m venv venv && source venv/bin/activate
|
|
233
|
+
pip install -e ".[dev]"
|
|
234
|
+
pytest # 34 tests, all should pass
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Usage with Claude
|
|
238
|
+
|
|
239
|
+
Once configured, ask Claude engineering questions directly:
|
|
240
|
+
|
|
241
|
+
> *"I have 8 H100 SXM GPUs at 700 W each. Water cooling, 8 LPM per cold plate, 25°C CDU supply. What's the junction temperature and how much thermal margin do I have?"*
|
|
242
|
+
|
|
243
|
+
> *"What's the minimum flow rate to keep an H100 below 80°C at 30°C inlet temperature?"*
|
|
244
|
+
|
|
245
|
+
> *"Compare water versus 50/50 glycol for a 700 W load at 8 LPM — what's the Tj penalty of switching to glycol?"*
|
|
246
|
+
|
|
247
|
+
> *"Size a CDU for 8 H100 GPUs in a parallel manifold. I want to know total flow, system ΔP, and return water temperature."*
|
|
248
|
+
|
|
249
|
+
Claude calls the relevant tool, interprets the physics output, and answers in context. The MCP layer handles validation and error reporting; the physics stays in the Python API and is independently testable.
|
|
250
|
+
|
|
251
|
+
## Roadmap
|
|
252
|
+
|
|
253
|
+
- **Sensitivity and uncertainty output** — `margin_c` parameter for `optimize_flow_rate` targets Tj_p95 rather than nominal. ∂Tj/∂Q, ∂Tj/∂R_tim, ∂Tj/∂T_inlet available via `compute_sensitivity`. See [`examples/interactive_sizing.ipynb`](examples/interactive_sizing.ipynb) for worked examples.
|
|
254
|
+
- **ROI calculator** — financial layer: annual cooling cost delta between air and liquid, CDU payback period, per-GPU cooling tax as % of compute cost.
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
[](https://github.com/riccardovietri/thermal-mcp-server/actions/workflows/ci.yml)
|
|
2
|
+
[](https://pypi.org/project/thermal-mcp-server/)
|
|
3
|
+
[](https://pypi.org/project/thermal-mcp-server/)
|
|
4
|
+
|
|
5
|
+
# thermal-mcp-server
|
|
6
|
+
|
|
7
|
+
**A physics engine for liquid-cooled GPU systems, exposed as an AI-callable MCP server.** Ask Claude to size a CDU for an NVL72 rack, optimize flow rates for an H100 cluster, or compare water versus glycol — and get first-principles answers backed by hand-validated thermal models.
|
|
8
|
+
|
|
9
|
+
## Why This Exists
|
|
10
|
+
|
|
11
|
+
GPU power density has grown roughly 3× in two generations: H100 SXM at 700 W, B200 at 1,200 W. A single NVL72 rack dissipates 86.4 kW — more than most commercial HVAC systems handle. Liquid cooling is no longer optional; it is the critical path for AI compute density, and it is now the infrastructure constraint that determines whether a data center can host the next generation of hardware.
|
|
12
|
+
|
|
13
|
+
The tooling hasn't kept up. CDU sizing decisions get made with vendor charts, spreadsheets, and intuition. `thermal-mcp-server` is a first-principles thermal resistance model that an AI assistant can call directly — validated against published chip specs, honest about its assumptions, and designed to answer the questions that actually matter in a procurement conversation.
|
|
14
|
+
|
|
15
|
+
## Demo
|
|
16
|
+
|
|
17
|
+
Ask Claude natural-language questions. The server handles the physics.
|
|
18
|
+
|
|
19
|
+
<img width="1768" height="1750" alt="Claude answering a liquid cooling question using thermal-mcp-server" src="https://github.com/user-attachments/assets/7e3fb436-38d2-477b-a4dd-e5a2a740d463" />
|
|
20
|
+
|
|
21
|
+
### NVL72 CDU sizing — the procurement question
|
|
22
|
+
|
|
23
|
+
**The question:** *What's the minimum CDU flow rate to keep 72 B200 GPUs below 75°C junction temperature at 1,200 W each, with 25°C supply water?*
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from thermal_mcp_server.physics import optimize_flow, analyze_rack
|
|
27
|
+
from thermal_mcp_server.schemas import OptimizeFlowRateInput, AnalyzeRackInput, Geometry
|
|
28
|
+
|
|
29
|
+
# B200-specific cold plate geometry (engineering estimate; NVIDIA does not publish)
|
|
30
|
+
# 60 × 0.7 mm × 1.5 mm channels, 100 mm length, 160 cm² contact area
|
|
31
|
+
B200_PLATE = Geometry(
|
|
32
|
+
channel_count=60, channel_width_m=0.7e-3, channel_height_m=1.5e-3,
|
|
33
|
+
channel_length_m=0.10, base_thickness_m=1.5e-3,
|
|
34
|
+
contact_area_m2=0.016, copper_k_w_mk=385.0,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Find minimum flow per cold plate
|
|
38
|
+
opt = optimize_flow(OptimizeFlowRateInput(
|
|
39
|
+
heat_load_w=1200, max_junction_temp_c=75.0,
|
|
40
|
+
inlet_temp_c=25.0, coolant="water",
|
|
41
|
+
r_jc_k_per_w=0.02, r_tim_k_per_w=0.015, # B200 estimates
|
|
42
|
+
geometry=B200_PLATE,
|
|
43
|
+
))
|
|
44
|
+
min_flow_per_gpu, analysis = opt
|
|
45
|
+
print(f"Minimum flow: {min_flow_per_gpu:.1f} LPM/GPU → {min_flow_per_gpu * 72:.0f} LPM total rack flow")
|
|
46
|
+
|
|
47
|
+
# Full rack spec at that flow
|
|
48
|
+
rack = analyze_rack(AnalyzeRackInput(
|
|
49
|
+
gpu_count=72, topology="parallel",
|
|
50
|
+
heat_load_per_gpu_w=1200, total_flow_lpm=min_flow_per_gpu * 72,
|
|
51
|
+
cdu_supply_temp_c=25.0, coolant="water",
|
|
52
|
+
r_jc_k_per_w=0.02, r_tim_k_per_w=0.015,
|
|
53
|
+
geometry=B200_PLATE,
|
|
54
|
+
))
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
Minimum flow: 9.3 LPM/GPU → 671 LPM total rack flow
|
|
59
|
+
|
|
60
|
+
Rack spec at minimum flow:
|
|
61
|
+
Max junction temp: 75.0°C (at 75°C design limit)
|
|
62
|
+
Cold plate ΔP: 0.096 bar (cold plates only; add 20–50% for manifold)
|
|
63
|
+
CDU return temp: 26.9°C
|
|
64
|
+
Pump power: 214 W
|
|
65
|
+
Total heat load: 86.4 kW
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**What this tells a procurement team:** Minimum CDU flow spec is 671 LPM. Cold plate ΔP headroom is tight at 0.096 bar — real system ΔP with manifold and headers will be higher. With a 3°C design margin (target 72°C), the spec rises to 777 LPM at 0.141 bar.
|
|
69
|
+
|
|
70
|
+
### H100 SXM — validated baseline
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from thermal_mcp_server.physics import analyze
|
|
74
|
+
from thermal_mcp_server.schemas import AnalyzeColdplateInput
|
|
75
|
+
|
|
76
|
+
result = analyze(AnalyzeColdplateInput(
|
|
77
|
+
heat_load_w=700, flow_rate_lpm=8.0, inlet_temp_c=25.0, coolant="water"
|
|
78
|
+
))
|
|
79
|
+
# junction_temp_c: 70.9 — 12.1°C margin below 83°C throttle onset
|
|
80
|
+
# regime: transitional (Re ≈ 3734)
|
|
81
|
+
# pressure_drop_pa: 16800 (0.17 bar per cold plate)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
This case is hand-calculation validated: every intermediate value — Reynolds number, Nusselt number, convection coefficient, pressure drop — is independently verified in `tests/test_physics_behavior.py`.
|
|
85
|
+
|
|
86
|
+
## How It Works
|
|
87
|
+
|
|
88
|
+
The physics engine models a cold plate as a 1D thermal resistance network:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
T_junction = T_inlet + Q × (R_jc + R_tim + R_base + R_conv) + ΔT_coolant/2
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
- **R_jc / R_tim:** Package resistances (chip manufacturer spec or estimate)
|
|
95
|
+
- **R_base:** Copper base conduction (geometry + k = 385 W/m·K)
|
|
96
|
+
- **R_conv:** Forced convection, Dittus-Boelter (turbulent) or Nu = 4.36 (laminar), linearly blended through transition (Re 2,300–4,000)
|
|
97
|
+
- **ΔP:** Darcy-Weisbach with Blasius friction factor, same transition blend
|
|
98
|
+
|
|
99
|
+
Rack-level model stacks N single-GPU analyses in series (cumulative temperature rise) or parallel (uniform inlet, flow split) topology.
|
|
100
|
+
|
|
101
|
+
```mermaid
|
|
102
|
+
flowchart LR
|
|
103
|
+
A["Input\nchip power, flow,\ncoolant, geometry"] --> B["Physics Engine\nDittus-Boelter · Darcy-Weisbach\nR_total network"]
|
|
104
|
+
B --> C["Output\nT_junction · ΔP\nthermal margin · pump power"]
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Validation
|
|
108
|
+
|
|
109
|
+
Model outputs compared against published chip specs. All runs use water coolant, 25°C inlet.
|
|
110
|
+
|
|
111
|
+
| Chip | TDP | Tj Design Ceiling | Model Tj | Margin | Notes |
|
|
112
|
+
|------|-----|-------------------|----------|--------|-------|
|
|
113
|
+
| H100 SXM | 700 W | 83°C | **70.9°C** at 8 LPM | 12.1°C | Default geometry; hand-calc validated |
|
|
114
|
+
| MI300X | 750 W | ~85°C (proxy) | **74.2°C** at 8 LPM | ~10°C | AMD does not publish Tj_max |
|
|
115
|
+
| B200 NVL72 | 1,200 W | ~75°C (est.) | **75.0°C** at 9.3 LPM/GPU | 0°C at limit | R_jc=0.02 K/W est.; NVIDIA does not publish |
|
|
116
|
+
| Gaudi 3 OAM | 900 W (air) / 1,200 W (liquid) | ~85°C (proxy) | Requires B200-class geometry | — | Default H100 geometry undersized for 1,200 W |
|
|
117
|
+
|
|
118
|
+
> **On B200 and Gaudi 3 numbers:** NVIDIA and Intel do not publish cold plate geometry or R_jc for these chips. The B200 analysis uses an engineering estimate for cold plate geometry and package resistance. Treat as indicative; real procurement sizing requires vendor data.
|
|
119
|
+
|
|
120
|
+
Chip sources: [NVIDIA H100 Datasheet](https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306) · [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/) · [SemiAnalysis B200 thermal estimates](https://newsletter.semianalysis.com/p/gb200-hardware-architecture-and-component) · [AMD MI300X Data Sheet](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf) · [Intel Gaudi 3 Product Brief](https://cdrdv2-public.intel.com/817487/gaudi-3-ai-accelerator-hl-325l-oam-mezzanine-card-product-brief.pdf)
|
|
121
|
+
|
|
122
|
+
## Known Limitations
|
|
123
|
+
|
|
124
|
+
These are documented explicitly because they bound what the model can and cannot tell you:
|
|
125
|
+
|
|
126
|
+
- **No manifold or header pressure losses** — rack ΔP is cold-plate-only. Real system ΔP should add 20–50% for manifold losses. Do not use cold-plate ΔP as the CDU pump spec without this adder.
|
|
127
|
+
- **No heterogeneous racks** — all GPUs assumed identical TDP, geometry, and thermal resistance. Mixed-SKU racks require per-GPU analysis.
|
|
128
|
+
- **Steady-state only** — no transient thermal capacitance. Power-on ramps, burst workloads, and cooldown curves are not modeled.
|
|
129
|
+
- **Single-point fluid properties** — water and glycol50 properties are fixed at 25°C nominal. No correction for temperature rise along the flow path.
|
|
130
|
+
- **No flow maldistribution** — uniform flow assumed across all cold plates. Real parallel manifolds have ±10–30% flow variation depending on header design.
|
|
131
|
+
|
|
132
|
+
See [`docs/physics.md`](docs/physics.md) for the full physics documentation including equations and assumptions.
|
|
133
|
+
|
|
134
|
+
## Tools
|
|
135
|
+
|
|
136
|
+
- **`analyze_coldplate`** — Single-point thermal and hydraulic analysis. Returns junction temperature, full resistance breakdown, pressure drop, regime, and pump power.
|
|
137
|
+
|
|
138
|
+
- **`compare_coolants`** — Side-by-side water vs. 50/50 glycol comparison at identical conditions. Quantifies the thermal and hydraulic penalty of glycol for freeze-protection applications.
|
|
139
|
+
|
|
140
|
+
- **`optimize_flow_rate`** — Binary search for minimum flow rate to meet a junction temperature target. Returns the optimal flow and full thermal analysis at that point.
|
|
141
|
+
|
|
142
|
+
- **`analyze_rack`** — Rack-level model for N identical GPUs in series or parallel topology. Returns max junction temperature, per-GPU temperature list, total CDU flow requirement, system ΔP, pump power, and CDU return temperature.
|
|
143
|
+
|
|
144
|
+
See [`docs/mcp.md`](docs/mcp.md) for full input/output schemas and field definitions.
|
|
145
|
+
|
|
146
|
+
## Quick Start
|
|
147
|
+
|
|
148
|
+
**Install from PyPI:**
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
python -m venv thermal-venv
|
|
152
|
+
source thermal-venv/bin/activate
|
|
153
|
+
pip install thermal-mcp-server
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Configure in your MCP client (`claude_desktop_config.json`):
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
{
|
|
160
|
+
"mcpServers": {
|
|
161
|
+
"thermal": {
|
|
162
|
+
"command": "/absolute/path/to/thermal-venv/bin/python",
|
|
163
|
+
"args": ["-m", "thermal_mcp_server"]
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
> Use the absolute path to your venv Python. Claude Desktop does not inherit your shell's `PATH`.
|
|
170
|
+
|
|
171
|
+
**Install from source:**
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
git clone https://github.com/riccardovietri/thermal-mcp-server.git
|
|
175
|
+
cd thermal-mcp-server
|
|
176
|
+
python -m venv venv && source venv/bin/activate
|
|
177
|
+
pip install -e ".[dev]"
|
|
178
|
+
pytest # 34 tests, all should pass
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Usage with Claude
|
|
182
|
+
|
|
183
|
+
Once configured, ask Claude engineering questions directly:
|
|
184
|
+
|
|
185
|
+
> *"I have 8 H100 SXM GPUs at 700 W each. Water cooling, 8 LPM per cold plate, 25°C CDU supply. What's the junction temperature and how much thermal margin do I have?"*
|
|
186
|
+
|
|
187
|
+
> *"What's the minimum flow rate to keep an H100 below 80°C at 30°C inlet temperature?"*
|
|
188
|
+
|
|
189
|
+
> *"Compare water versus 50/50 glycol for a 700 W load at 8 LPM — what's the Tj penalty of switching to glycol?"*
|
|
190
|
+
|
|
191
|
+
> *"Size a CDU for 8 H100 GPUs in a parallel manifold. I want to know total flow, system ΔP, and return water temperature."*
|
|
192
|
+
|
|
193
|
+
Claude calls the relevant tool, interprets the physics output, and answers in context. The MCP layer handles validation and error reporting; the physics stays in the Python API and is independently testable.
|
|
194
|
+
|
|
195
|
+
## Roadmap
|
|
196
|
+
|
|
197
|
+
- **Sensitivity and uncertainty output** — `margin_c` parameter for `optimize_flow_rate` targets Tj_p95 rather than nominal. ∂Tj/∂Q, ∂Tj/∂R_tim, ∂Tj/∂T_inlet available via `compute_sensitivity`. See [`examples/interactive_sizing.ipynb`](examples/interactive_sizing.ipynb) for worked examples.
|
|
198
|
+
- **ROI calculator** — financial layer: annual cooling cost delta between air and liquid, CDU payback period, per-GPU cooling tax as % of compute cost.
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "thermal-mcp-server"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "MCP server for datacenter GPU liquid cooling thermal analysis"
|
|
9
|
+
authors = [{name = "Riccardo Vietri"}]
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
keywords = ["mcp", "thermal", "cooling", "gpu", "datacenter", "liquid-cooling", "nvidia", "h100", "b200"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Physics",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"fastmcp>=2.0,<3",
|
|
28
|
+
"pydantic>=2.6.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = [
|
|
33
|
+
"pytest>=8.0.0",
|
|
34
|
+
"pytest-cov>=4.1.0",
|
|
35
|
+
]
|
|
36
|
+
demo = [
|
|
37
|
+
"ipywidgets>=8.1.0",
|
|
38
|
+
"jupyter>=1.1.0",
|
|
39
|
+
"matplotlib>=3.8.0",
|
|
40
|
+
"pandas>=2.2.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/riccardovietri/thermal-mcp-server"
|
|
45
|
+
Source = "https://github.com/riccardovietri/thermal-mcp-server"
|
|
46
|
+
"Bug Tracker" = "https://github.com/riccardovietri/thermal-mcp-server/issues"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools]
|
|
49
|
+
package-dir = {"" = "src"}
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.packages.find]
|
|
52
|
+
where = ["src"]
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
testpaths = ["tests"]
|
|
56
|
+
|
|
57
|
+
[dependency-groups]
|
|
58
|
+
dev = [
|
|
59
|
+
"pytest>=8.0.0",
|
|
60
|
+
"pytest-cov>=4.1.0",
|
|
61
|
+
]
|