aws_bootstrap_g4dn-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aws_bootstrap/__init__.py +1 -0
- aws_bootstrap/cli.py +438 -0
- aws_bootstrap/config.py +24 -0
- aws_bootstrap/ec2.py +341 -0
- aws_bootstrap/resources/__init__.py +0 -0
- aws_bootstrap/resources/gpu_benchmark.py +839 -0
- aws_bootstrap/resources/gpu_smoke_test.ipynb +340 -0
- aws_bootstrap/resources/remote_setup.sh +188 -0
- aws_bootstrap/resources/requirements.txt +8 -0
- aws_bootstrap/ssh.py +513 -0
- aws_bootstrap/tests/__init__.py +0 -0
- aws_bootstrap/tests/test_cli.py +528 -0
- aws_bootstrap/tests/test_config.py +35 -0
- aws_bootstrap/tests/test_ec2.py +313 -0
- aws_bootstrap/tests/test_ssh_config.py +297 -0
- aws_bootstrap/tests/test_ssh_gpu.py +138 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/METADATA +308 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/RECORD +22 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/WHEEL +5 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/entry_points.txt +2 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/licenses/LICENSE +21 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/top_level.txt +1 -0
aws_bootstrap/resources/gpu_smoke_test.ipynb
@@ -0,0 +1,340 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "title",
+   "metadata": {},
+   "source": [
+    "# GPU Smoke Test\n",
+    "\n",
+    "Interactive GPU verification and quick benchmark for aws-bootstrap instances.\n",
+    "\n",
+    "Run each cell top-to-bottom to verify the CUDA stack, exercise FP32/FP16 operations,\n",
+    "train a small CNN on MNIST, and visualise loss and memory usage.\n",
+    "\n",
+    "For the full CLI benchmark (CNN + Transformer, configurable precision/batch-size),\n",
+    "run `python ~/gpu_benchmark.py` from a terminal."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "system-info",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "print(f\"Python  : {sys.version.split()[0]}\")\n",
+    "print(f\"PyTorch : {torch.__version__}\")\n",
+    "print(f\"CUDA    : {torch.version.cuda}\")\n",
+    "print(f\"cuDNN   : {torch.backends.cudnn.version()}\")\n",
+    "\n",
+    "assert torch.cuda.is_available(), \"CUDA is not available!\"\n",
+    "\n",
+    "props = torch.cuda.get_device_properties(0)\n",
+    "print(f\"\\nGPU          : {props.name}\")\n",
+    "print(f\"Compute cap. : {props.major}.{props.minor}\")\n",
+    "print(f\"Total memory : {props.total_memory / (1024**3):.1f} GB\")\n",
+    "print(f\"SM count     : {props.multi_processor_count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "smoke-header",
+   "metadata": {},
+   "source": [
+    "## CUDA Smoke Tests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "matmul-test",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "\n",
+    "N = 1024\n",
+    "\n",
+    "# --- FP32 matmul ---\n",
+    "a32 = torch.randn(N, N, device=\"cuda\")\n",
+    "b32 = torch.randn(N, N, device=\"cuda\")\n",
+    "\n",
+    "start = torch.cuda.Event(enable_timing=True)\n",
+    "end = torch.cuda.Event(enable_timing=True)\n",
+    "\n",
+    "start.record()\n",
+    "c32 = torch.mm(a32, b32)\n",
+    "end.record()\n",
+    "torch.cuda.synchronize()\n",
+    "fp32_ms = start.elapsed_time(end)\n",
+    "print(f\"FP32 matmul ({N}x{N}): {fp32_ms:.2f} ms\")\n",
+    "\n",
+    "# --- FP16 matmul ---\n",
+    "a16 = a32.half()\n",
+    "b16 = b32.half()\n",
+    "\n",
+    "start.record()\n",
+    "c16 = torch.mm(a16, b16)\n",
+    "end.record()\n",
+    "torch.cuda.synchronize()\n",
+    "fp16_ms = start.elapsed_time(end)\n",
+    "print(f\"FP16 matmul ({N}x{N}): {fp16_ms:.2f} ms\")\n",
+    "\n",
+    "# Correctness check: FP16 result should be close to FP32\n",
+    "diff = (c32 - c16.float()).abs().max().item()\n",
+    "print(f\"Max abs diff FP32 vs FP16: {diff:.4f}\")\n",
+    "assert diff < N, f\"Unexpectedly large diff: {diff}\"  # loose bound\n",
+    "print(\"PASSED\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "amp-test",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "\n",
+    "# AMP autocast: Linear + Conv2d\n",
+    "linear = nn.Linear(512, 512).cuda()\n",
+    "conv = nn.Conv2d(3, 64, 3, padding=1).cuda()\n",
+    "\n",
+    "x_lin = torch.randn(32, 512, device=\"cuda\")\n",
+    "x_conv = torch.randn(4, 3, 32, 32, device=\"cuda\")\n",
+    "\n",
+    "with torch.amp.autocast(device_type=\"cuda\"):\n",
+    "    y_lin = linear(x_lin)\n",
+    "    y_conv = conv(x_conv)\n",
+    "\n",
+    "torch.cuda.synchronize()\n",
+    "print(f\"Linear output: {y_lin.shape}, dtype={y_lin.dtype}\")\n",
+    "print(f\"Conv2d output: {y_conv.shape}, dtype={y_conv.dtype}\")\n",
+    "print(\"AMP autocast PASSED\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "memory-test",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "\n",
+    "torch.cuda.empty_cache()\n",
+    "torch.cuda.reset_peak_memory_stats()\n",
+    "\n",
+    "before = torch.cuda.memory_allocated()\n",
+    "big = torch.randn(4096, 4096, device=\"cuda\")  # ~64 MB\n",
+    "allocated = torch.cuda.memory_allocated()\n",
+    "total = torch.cuda.get_device_properties(0).total_memory\n",
+    "\n",
+    "print(f\"Before alloc : {before / 1e6:.1f} MB\")\n",
+    "print(f\"After alloc  : {allocated / 1e6:.1f} MB\")\n",
+    "print(f\"Total GPU mem: {total / 1e9:.1f} GB\")\n",
+    "\n",
+    "del big\n",
+    "torch.cuda.empty_cache()\n",
+    "after_free = torch.cuda.memory_allocated()\n",
+    "print(f\"After free   : {after_free / 1e6:.1f} MB\")\n",
+    "assert after_free <= before + 1e6, \"Memory not freed!\"\n",
+    "print(\"Memory alloc/free PASSED\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "train-header",
+   "metadata": {},
+   "source": [
+    "## Quick Training Benchmark\n",
+    "\n",
+    "Train a small CNN on MNIST for 5 epochs and collect the loss per batch."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "mnist-train",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "from torch.utils.data import DataLoader\n",
+    "from torchvision import datasets, transforms\n",
+    "\n",
+    "\n",
+    "class MNISTConvNet(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.features = nn.Sequential(\n",
+    "            nn.Conv2d(1, 64, 3, padding=1),\n",
+    "            nn.BatchNorm2d(64),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.Conv2d(64, 64, 3, padding=1),\n",
+    "            nn.BatchNorm2d(64),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.MaxPool2d(2),\n",
+    "            nn.Conv2d(64, 128, 3, padding=1),\n",
+    "            nn.BatchNorm2d(128),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.Conv2d(128, 128, 3, padding=1),\n",
+    "            nn.BatchNorm2d(128),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.MaxPool2d(2),\n",
+    "            nn.Conv2d(128, 256, 3, padding=1),\n",
+    "            nn.BatchNorm2d(256),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.AdaptiveAvgPool2d(1),\n",
+    "        )\n",
+    "        self.classifier = nn.Sequential(\n",
+    "            nn.Flatten(),\n",
+    "            nn.Linear(256, 256),\n",
+    "            nn.ReLU(inplace=True),\n",
+    "            nn.Dropout(0.5),\n",
+    "            nn.Linear(256, 10),\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        return self.classifier(self.features(x))\n",
+    "\n",
+    "\n",
+    "device = torch.device(\"cuda\")\n",
+    "torch.cuda.reset_peak_memory_stats()\n",
+    "\n",
+    "transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])\n",
+    "train_set = datasets.MNIST(\"/tmp/data\", train=True, download=True, transform=transform)\n",
+    "loader = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=2, pin_memory=True)\n",
+    "\n",
+    "model = MNISTConvNet().to(device)\n",
+    "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
+    "scaler = torch.amp.GradScaler(\"cuda\")\n",
+    "\n",
+    "NUM_EPOCHS = 5\n",
+    "losses = []\n",
+    "\n",
+    "for epoch in range(1, NUM_EPOCHS + 1):\n",
+    "    model.train()\n",
+    "    epoch_loss = 0.0\n",
+    "    for images, labels in loader:\n",
+    "        images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)\n",
+    "        optimizer.zero_grad(set_to_none=True)\n",
+    "        with torch.amp.autocast(device_type=\"cuda\"):\n",
+    "            loss = F.cross_entropy(model(images), labels)\n",
+    "        scaler.scale(loss).backward()\n",
+    "        scaler.step(optimizer)\n",
+    "        scaler.update()\n",
+    "        losses.append(loss.item())\n",
+    "        epoch_loss += loss.item()\n",
+    "    avg = epoch_loss / len(loader)\n",
+    "    print(f\"Epoch {epoch}/{NUM_EPOCHS} avg loss: {avg:.4f}\")\n",
+    "\n",
+    "peak_mb = torch.cuda.max_memory_allocated() / (1024**2)\n",
+    "print(f\"\\nPeak GPU memory during training: {peak_mb:.0f} MB\")\n",
+    "print(f\"Total batches: {len(losses)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "loss-plot",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "\n",
+    "plt.figure(figsize=(10, 4))\n",
+    "plt.plot(losses, linewidth=0.8, alpha=0.7)\n",
+    "plt.xlabel(\"Batch\")\n",
+    "plt.ylabel(\"Cross-Entropy Loss\")\n",
+    "plt.title(\"MNIST CNN Training Loss\")\n",
+    "plt.grid(True, alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "mem-header",
+   "metadata": {},
+   "source": [
+    "## GPU Memory & Utilization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "mem-chart",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "peak_mb = torch.cuda.max_memory_allocated() / (1024**2)\n",
+    "total_mb = torch.cuda.get_device_properties(0).total_memory / (1024**2)\n",
+    "free_mb = total_mb - peak_mb\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(6, 4))\n",
+    "bars = ax.bar([\"Peak Used\", \"Remaining\"], [peak_mb, free_mb], color=[\"#e74c3c\", \"#2ecc71\"])\n",
+    "ax.set_ylabel(\"MB\")\n",
+    "ax.set_title(f\"GPU Memory: {peak_mb:.0f} MB peak / {total_mb:.0f} MB total\")\n",
+    "for bar in bars:\n",
+    "    ax.text(\n",
+    "        bar.get_x() + bar.get_width() / 2,\n",
+    "        bar.get_height() + 50,\n",
+    "        f\"{bar.get_height():.0f}\",\n",
+    "        ha=\"center\",\n",
+    "        va=\"bottom\",\n",
+    "        fontsize=11,\n",
+    "    )\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "summary",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "If all cells above ran without error, the CUDA stack is healthy and the GPU is\n",
+    "ready for training workloads.\n",
+    "\n",
+    "### Next steps\n",
+    "\n",
+    "- **Full benchmark** (CNN + Transformer, configurable precision): `python ~/gpu_benchmark.py`\n",
+    "- **Jupyter tips**: use `!nvidia-smi` in a cell to check GPU utilisation at any time\n",
+    "- **VSCode Remote SSH**: connect with `ssh aws-gpu<N>` for a full IDE experience"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
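
A note on the timing cells above: `torch.cuda.Event` timing of a single cold `torch.mm` call folds one-time costs (CUDA context creation, cuBLAS initialization, kernel selection) into the measurement, so the notebook's FP32 number in particular can look inflated on a freshly booted instance. That is fine for a smoke test; for anything closer to a benchmark, the usual pattern is to warm up and average over repeats. A minimal sketch, using the same 1024x1024 workload as the notebook (the helper name `time_matmul_ms` is ours, not the package's):

```python
import torch

def time_matmul_ms(a, b, warmup=3, iters=10):
    """Average CUDA time of torch.mm(a, b) in milliseconds."""
    for _ in range(warmup):
        torch.mm(a, b)  # warm-up: context/cuBLAS init, kernel selection
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        torch.mm(a, b)
    end.record()
    torch.cuda.synchronize()  # ensure both events have completed
    return start.elapsed_time(end) / iters

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
print(f"FP32 matmul: {time_matmul_ms(a, b):.3f} ms")
print(f"FP16 matmul: {time_matmul_ms(a.half(), b.half()):.3f} ms")
```

The notebook's loose `assert diff < N` bound is also deliberate: FP16 rounds the inputs and accumulates error across the inner dimension, so a tight tolerance would fail intermittently even on healthy hardware.
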
aws_bootstrap/resources/remote_setup.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+# remote_setup.sh — Post-boot setup for Deep Learning AMI instances.
+# Runs on the EC2 instance after SSH becomes available.
+set -euo pipefail
+
+echo "=== aws-bootstrap-g4dn remote setup ==="
+
+# 1. Verify GPU
+echo ""
+echo "[1/5] Verifying GPU and CUDA..."
+if command -v nvidia-smi &>/dev/null; then
+    nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
+else
+    echo "WARNING: nvidia-smi not found"
+fi
+
+if command -v nvcc &>/dev/null; then
+    nvcc --version | grep "release"
+else
+    echo "WARNING: nvcc not found (CUDA toolkit may not be installed)"
+fi
+
+# 2. Install utilities
+echo ""
+echo "[2/5] Installing utilities..."
+sudo apt-get update -qq
+sudo apt-get install -y -qq htop tmux tree jq
+
+# 3. Set up Python environment with uv
+echo ""
+echo "[3/5] Setting up Python environment with uv..."
+if ! command -v uv &>/dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+fi
+export PATH="$HOME/.local/bin:$PATH"
+
+uv venv ~/venv
+
+# --- CUDA-aware PyTorch installation ---
+# Known PyTorch CUDA wheel tags (ascending order).
+# Update this list when PyTorch publishes new CUDA builds.
+# See: https://download.pytorch.org/whl/
+KNOWN_CUDA_TAGS=(118 121 124 126 128 129 130)
+
+detect_cuda_version() {
+    # Primary: nvcc (actual toolkit installed on the system)
+    if command -v nvcc &>/dev/null; then
+        nvcc --version | grep -oP 'release \K[\d.]+'
+        return
+    fi
+    # Fallback: nvidia-smi (max CUDA the driver supports)
+    if command -v nvidia-smi &>/dev/null; then
+        nvidia-smi | grep -oP 'CUDA Version: \K[\d.]+'
+        return
+    fi
+    echo ""
+}
+
+cuda_version_to_tag() {
+    # "12.9" → "129", "13.0" → "130"
+    echo "$1" | tr -d '.'
+}
+
+find_best_cuda_tag() {
+    local detected_tag="$1"
+    local best=""
+    for tag in "${KNOWN_CUDA_TAGS[@]}"; do
+        if [ "$tag" -le "$detected_tag" ]; then
+            best="$tag"
+        fi
+    done
+    echo "$best"
+}
+
+install_pytorch_cuda() {
+    local cuda_ver
+    cuda_ver=$(detect_cuda_version)
+
+    if [ -z "$cuda_ver" ]; then
+        echo " WARNING: No CUDA detected — installing PyTorch from PyPI (CPU or default CUDA)"
+        uv pip install --python ~/venv/bin/python torch torchvision
+        return
+    fi
+    echo " Detected CUDA version: $cuda_ver"
+
+    local detected_tag
+    detected_tag=$(cuda_version_to_tag "$cuda_ver")
+
+    local best_tag
+    best_tag=$(find_best_cuda_tag "$detected_tag")
+
+    if [ -z "$best_tag" ]; then
+        echo " WARNING: No matching PyTorch CUDA tag for cu${detected_tag} — installing from PyPI"
+        uv pip install --python ~/venv/bin/python torch torchvision
+        return
+    fi
+
+    echo " Using PyTorch CUDA index: cu${best_tag}"
+    if ! uv pip install --python ~/venv/bin/python \
+        --default-index "https://download.pytorch.org/whl/cu${best_tag}" \
+        torch torchvision; then
+        echo " WARNING: CUDA index install failed — falling back to PyPI"
+        uv pip install --python ~/venv/bin/python torch torchvision
+    fi
+}
+
+install_pytorch_cuda
+
+# Install remaining dependencies (torch/torchvision already installed above)
+uv pip install --python ~/venv/bin/python -r /tmp/requirements.txt
+
+# Copy GPU benchmark script and smoke test notebook
+cp /tmp/gpu_benchmark.py ~/gpu_benchmark.py
+cp /tmp/gpu_smoke_test.ipynb ~/gpu_smoke_test.ipynb
+
+# Auto-activate venv on login
+if ! grep -q 'source ~/venv/bin/activate' ~/.bashrc 2>/dev/null; then
+    echo 'source ~/venv/bin/activate' >> ~/.bashrc
+fi
+
+# Quick CUDA smoke test
+echo " Running CUDA smoke test..."
+if ~/venv/bin/python -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+x = torch.randn(256, 256, device='cuda')
+y = torch.mm(x, x)
+torch.cuda.synchronize()
+print(f' PyTorch {torch.__version__}, CUDA {torch.version.cuda}, GPU: {torch.cuda.get_device_name(0)}')
+print(' Quick matmul test: PASSED')
+"; then
+    echo " CUDA smoke test passed"
+else
+    echo " WARNING: CUDA smoke test failed — check PyTorch/CUDA installation"
+fi
+
+JUPYTER_CONFIG_DIR="$HOME/.jupyter"
+mkdir -p "$JUPYTER_CONFIG_DIR"
+cat > "$JUPYTER_CONFIG_DIR/jupyter_lab_config.py" << 'PYEOF'
+c.ServerApp.ip = '0.0.0.0'
+c.ServerApp.port = 8888
+c.ServerApp.open_browser = False
+c.IdentityProvider.token = ''
+c.ServerApp.allow_remote_access = True
+PYEOF
+echo " Jupyter config written to $JUPYTER_CONFIG_DIR/jupyter_lab_config.py"
+
+# 4. Jupyter systemd service
+echo ""
+echo "[4/5] Setting up Jupyter systemd service..."
+LOGIN_USER=$(whoami)
+
+sudo tee /etc/systemd/system/jupyter.service > /dev/null << SVCEOF
+[Unit]
+Description=Jupyter Lab Server
+After=network.target
+
+[Service]
+Type=simple
+User=${LOGIN_USER}
+WorkingDirectory=/home/${LOGIN_USER}
+ExecStart=/home/${LOGIN_USER}/venv/bin/python -m jupyterlab
+Restart=on-failure
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+SVCEOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable jupyter.service
+sudo systemctl start jupyter.service
+echo " Jupyter service started (port 8888)"
+
+# 5. SSH keepalive
+echo ""
+echo "[5/5] Configuring SSH keepalive..."
+if ! grep -q "ClientAliveInterval" /etc/ssh/sshd_config; then
+    echo "ClientAliveInterval 60" | sudo tee -a /etc/ssh/sshd_config > /dev/null
+    echo "ClientAliveCountMax 10" | sudo tee -a /etc/ssh/sshd_config > /dev/null
+    sudo systemctl reload sshd
+    echo " SSH keepalive configured"
+else
+    echo " SSH keepalive already configured"
+fi
+
+echo ""
+echo "=== Remote setup complete ==="
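
The core of the script's CUDA-aware install is `find_best_cuda_tag`: it scans the ascending `KNOWN_CUDA_TAGS` list and keeps the last tag that is numerically `-le` the detected version with the dot stripped, i.e. the newest PyTorch CUDA wheel index that does not exceed the installed toolkit. A Python restatement of that rule (ours, not part of the package) makes the behaviour easy to check:

```python
# Mirrors find_best_cuda_tag from remote_setup.sh (editorial sketch).
KNOWN_CUDA_TAGS = [118, 121, 124, 126, 128, 129, 130]

def find_best_cuda_tag(detected_version: str) -> str:
    """Largest known wheel tag not newer than the detected CUDA version."""
    detected_tag = int(detected_version.replace(".", ""))  # "12.4" -> 124
    candidates = [t for t in KNOWN_CUDA_TAGS if t <= detected_tag]
    return str(candidates[-1]) if candidates else ""

assert find_best_cuda_tag("12.4") == "124"  # exact match
assert find_best_cuda_tag("12.5") == "124"  # falls back to nearest older build
assert find_best_cuda_tag("13.0") == "130"
assert find_best_cuda_tag("11.7") == ""     # older than every known tag -> PyPI fallback
```

One caveat baked into the dot-stripping scheme: a hypothetical CUDA "12.10" would flatten to 1210 and compare as newer than every known tag, selecting cu130; NVIDIA's version numbering has so far avoided that case. Separately, the generated Jupyter config binds 0.0.0.0 with an empty token, so access control presumably falls to the instance's security group or an SSH tunnel (e.g. `ssh -L 8888:localhost:8888 aws-gpu<N>`, using the host alias the notebook's summary mentions).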