aws-bootstrap-g4dn 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,340 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "title",
6
+ "metadata": {},
7
+ "source": [
8
+ "# GPU Smoke Test\n",
9
+ "\n",
10
+ "Interactive GPU verification and quick benchmark for aws-bootstrap instances.\n",
11
+ "\n",
12
+ "Run each cell top-to-bottom to verify the CUDA stack, exercise FP32/FP16 operations,\n",
13
+ "train a small CNN on MNIST, and visualise loss and memory usage.\n",
14
+ "\n",
15
+ "For the full CLI benchmark (CNN + Transformer, configurable precision/batch-size),\n",
16
+ "run `python ~/gpu_benchmark.py` from a terminal."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "system-info",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "import sys\n",
27
+ "\n",
28
+ "import torch\n",
29
+ "\n",
30
+ "\n",
31
+ "print(f\"Python : {sys.version.split()[0]}\")\n",
32
+ "print(f\"PyTorch : {torch.__version__}\")\n",
33
+ "print(f\"CUDA : {torch.version.cuda}\")\n",
34
+ "print(f\"cuDNN : {torch.backends.cudnn.version()}\")\n",
35
+ "\n",
36
+ "assert torch.cuda.is_available(), \"CUDA is not available!\"\n",
37
+ "\n",
38
+ "props = torch.cuda.get_device_properties(0)\n",
39
+ "print(f\"\\nGPU : {props.name}\")\n",
40
+ "print(f\"Compute cap. : {props.major}.{props.minor}\")\n",
41
+ "print(f\"Total memory : {props.total_memory / (1024**3):.1f} GB\")\n",
42
+ "print(f\"SM count : {props.multi_processor_count}\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "smoke-header",
48
+ "metadata": {},
49
+ "source": [
50
+ "## CUDA Smoke Tests"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "matmul-test",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "import torch\n",
61
+ "\n",
62
+ "\n",
63
+ "N = 1024\n",
64
+ "\n",
65
+ "# --- FP32 matmul ---\n",
66
+ "a32 = torch.randn(N, N, device=\"cuda\")\n",
67
+ "b32 = torch.randn(N, N, device=\"cuda\")\n",
68
+ "\n",
69
+ "start = torch.cuda.Event(enable_timing=True)\n",
70
+ "end = torch.cuda.Event(enable_timing=True)\n",
71
+ "\n",
72
+ "start.record()\n",
73
+ "c32 = torch.mm(a32, b32)\n",
74
+ "end.record()\n",
75
+ "torch.cuda.synchronize()\n",
76
+ "fp32_ms = start.elapsed_time(end)\n",
77
+ "print(f\"FP32 matmul ({N}x{N}): {fp32_ms:.2f} ms\")\n",
78
+ "\n",
79
+ "# --- FP16 matmul ---\n",
80
+ "a16 = a32.half()\n",
81
+ "b16 = b32.half()\n",
82
+ "\n",
83
+ "start.record()\n",
84
+ "c16 = torch.mm(a16, b16)\n",
85
+ "end.record()\n",
86
+ "torch.cuda.synchronize()\n",
87
+ "fp16_ms = start.elapsed_time(end)\n",
88
+ "print(f\"FP16 matmul ({N}x{N}): {fp16_ms:.2f} ms\")\n",
89
+ "\n",
90
+ "# Correctness check: FP16 result should be close to FP32\n",
91
+ "diff = (c32 - c16.float()).abs().max().item()\n",
92
+ "print(f\"Max abs diff FP32 vs FP16: {diff:.4f}\")\n",
93
+ "assert diff < N, f\"Unexpectedly large diff: {diff}\" # loose bound\n",
94
+ "print(\"PASSED\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "id": "amp-test",
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "import torch\n",
105
+ "import torch.nn as nn\n",
106
+ "\n",
107
+ "\n",
108
+ "# AMP autocast: Linear + Conv2d\n",
109
+ "linear = nn.Linear(512, 512).cuda()\n",
110
+ "conv = nn.Conv2d(3, 64, 3, padding=1).cuda()\n",
111
+ "\n",
112
+ "x_lin = torch.randn(32, 512, device=\"cuda\")\n",
113
+ "x_conv = torch.randn(4, 3, 32, 32, device=\"cuda\")\n",
114
+ "\n",
115
+ "with torch.amp.autocast(device_type=\"cuda\"):\n",
116
+ " y_lin = linear(x_lin)\n",
117
+ " y_conv = conv(x_conv)\n",
118
+ "\n",
119
+ "torch.cuda.synchronize()\n",
120
+ "print(f\"Linear output: {y_lin.shape}, dtype={y_lin.dtype}\")\n",
121
+ "print(f\"Conv2d output: {y_conv.shape}, dtype={y_conv.dtype}\")\n",
122
+ "print(\"AMP autocast PASSED\")"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "memory-test",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "import torch\n",
133
+ "\n",
134
+ "\n",
135
+ "torch.cuda.empty_cache()\n",
136
+ "torch.cuda.reset_peak_memory_stats()\n",
137
+ "\n",
138
+ "before = torch.cuda.memory_allocated()\n",
139
+ "big = torch.randn(4096, 4096, device=\"cuda\") # ~64 MB\n",
140
+ "allocated = torch.cuda.memory_allocated()\n",
141
+ "total = torch.cuda.get_device_properties(0).total_memory\n",
142
+ "\n",
143
+ "print(f\"Before alloc : {before / 1e6:.1f} MB\")\n",
144
+ "print(f\"After alloc : {allocated / 1e6:.1f} MB\")\n",
145
+ "print(f\"Total GPU mem: {total / 1e9:.1f} GB\")\n",
146
+ "\n",
147
+ "del big\n",
148
+ "torch.cuda.empty_cache()\n",
149
+ "after_free = torch.cuda.memory_allocated()\n",
150
+ "print(f\"After free : {after_free / 1e6:.1f} MB\")\n",
151
+ "assert after_free <= before + 1e6, \"Memory not freed!\"\n",
152
+ "print(\"Memory alloc/free PASSED\")"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "id": "train-header",
158
+ "metadata": {},
159
+ "source": [
160
+ "## Quick Training Benchmark\n",
161
+ "\n",
162
+ "Train a small CNN on MNIST for 5 epochs and collect the loss per batch."
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "mnist-train",
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "import torch\n",
173
+ "import torch.nn as nn\n",
174
+ "import torch.nn.functional as F\n",
175
+ "from torch.utils.data import DataLoader\n",
176
+ "from torchvision import datasets, transforms\n",
177
+ "\n",
178
+ "\n",
179
+ "class MNISTConvNet(nn.Module):\n",
180
+ " def __init__(self):\n",
181
+ " super().__init__()\n",
182
+ " self.features = nn.Sequential(\n",
183
+ " nn.Conv2d(1, 64, 3, padding=1),\n",
184
+ " nn.BatchNorm2d(64),\n",
185
+ " nn.ReLU(inplace=True),\n",
186
+ " nn.Conv2d(64, 64, 3, padding=1),\n",
187
+ " nn.BatchNorm2d(64),\n",
188
+ " nn.ReLU(inplace=True),\n",
189
+ " nn.MaxPool2d(2),\n",
190
+ " nn.Conv2d(64, 128, 3, padding=1),\n",
191
+ " nn.BatchNorm2d(128),\n",
192
+ " nn.ReLU(inplace=True),\n",
193
+ " nn.Conv2d(128, 128, 3, padding=1),\n",
194
+ " nn.BatchNorm2d(128),\n",
195
+ " nn.ReLU(inplace=True),\n",
196
+ " nn.MaxPool2d(2),\n",
197
+ " nn.Conv2d(128, 256, 3, padding=1),\n",
198
+ " nn.BatchNorm2d(256),\n",
199
+ " nn.ReLU(inplace=True),\n",
200
+ " nn.AdaptiveAvgPool2d(1),\n",
201
+ " )\n",
202
+ " self.classifier = nn.Sequential(\n",
203
+ " nn.Flatten(),\n",
204
+ " nn.Linear(256, 256),\n",
205
+ " nn.ReLU(inplace=True),\n",
206
+ " nn.Dropout(0.5),\n",
207
+ " nn.Linear(256, 10),\n",
208
+ " )\n",
209
+ "\n",
210
+ " def forward(self, x):\n",
211
+ " return self.classifier(self.features(x))\n",
212
+ "\n",
213
+ "\n",
214
+ "device = torch.device(\"cuda\")\n",
215
+ "torch.cuda.reset_peak_memory_stats()\n",
216
+ "\n",
217
+ "transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])\n",
218
+ "train_set = datasets.MNIST(\"/tmp/data\", train=True, download=True, transform=transform)\n",
219
+ "loader = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=2, pin_memory=True)\n",
220
+ "\n",
221
+ "model = MNISTConvNet().to(device)\n",
222
+ "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
223
+ "scaler = torch.amp.GradScaler(\"cuda\")\n",
224
+ "\n",
225
+ "NUM_EPOCHS = 5\n",
226
+ "losses = []\n",
227
+ "\n",
228
+ "for epoch in range(1, NUM_EPOCHS + 1):\n",
229
+ " model.train()\n",
230
+ " epoch_loss = 0.0\n",
231
+ " for images, labels in loader:\n",
232
+ " images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)\n",
233
+ " optimizer.zero_grad(set_to_none=True)\n",
234
+ " with torch.amp.autocast(device_type=\"cuda\"):\n",
235
+ " loss = F.cross_entropy(model(images), labels)\n",
236
+ " scaler.scale(loss).backward()\n",
237
+ " scaler.step(optimizer)\n",
238
+ " scaler.update()\n",
239
+ " losses.append(loss.item())\n",
240
+ " epoch_loss += loss.item()\n",
241
+ " avg = epoch_loss / len(loader)\n",
242
+ " print(f\"Epoch {epoch}/{NUM_EPOCHS} avg loss: {avg:.4f}\")\n",
243
+ "\n",
244
+ "peak_mb = torch.cuda.max_memory_allocated() / (1024**2)\n",
245
+ "print(f\"\\nPeak GPU memory during training: {peak_mb:.0f} MB\")\n",
246
+ "print(f\"Total batches: {len(losses)}\")"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "loss-plot",
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "import matplotlib.pyplot as plt\n",
257
+ "\n",
258
+ "\n",
259
+ "plt.figure(figsize=(10, 4))\n",
260
+ "plt.plot(losses, linewidth=0.8, alpha=0.7)\n",
261
+ "plt.xlabel(\"Batch\")\n",
262
+ "plt.ylabel(\"Cross-Entropy Loss\")\n",
263
+ "plt.title(\"MNIST CNN Training Loss\")\n",
264
+ "plt.grid(True, alpha=0.3)\n",
265
+ "plt.tight_layout()\n",
266
+ "plt.show()"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "markdown",
271
+ "id": "mem-header",
272
+ "metadata": {},
273
+ "source": [
274
+ "## GPU Memory & Utilization"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "mem-chart",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": [
284
+ "import matplotlib.pyplot as plt\n",
285
+ "import torch\n",
286
+ "\n",
287
+ "\n",
288
+ "peak_mb = torch.cuda.max_memory_allocated() / (1024**2)\n",
289
+ "total_mb = torch.cuda.get_device_properties(0).total_memory / (1024**2)\n",
290
+ "free_mb = total_mb - peak_mb\n",
291
+ "\n",
292
+ "fig, ax = plt.subplots(figsize=(6, 4))\n",
293
+ "bars = ax.bar([\"Peak Used\", \"Remaining\"], [peak_mb, free_mb], color=[\"#e74c3c\", \"#2ecc71\"])\n",
294
+ "ax.set_ylabel(\"MB\")\n",
295
+ "ax.set_title(f\"GPU Memory: {peak_mb:.0f} MB peak / {total_mb:.0f} MB total\")\n",
296
+ "for bar in bars:\n",
297
+ " ax.text(\n",
298
+ " bar.get_x() + bar.get_width() / 2,\n",
299
+ " bar.get_height() + 50,\n",
300
+ " f\"{bar.get_height():.0f}\",\n",
301
+ " ha=\"center\",\n",
302
+ " va=\"bottom\",\n",
303
+ " fontsize=11,\n",
304
+ " )\n",
305
+ "plt.tight_layout()\n",
306
+ "plt.show()"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "markdown",
311
+ "id": "summary",
312
+ "metadata": {},
313
+ "source": [
314
+ "## Summary\n",
315
+ "\n",
316
+ "If all cells above ran without error, the CUDA stack is healthy and the GPU is\n",
317
+ "ready for training workloads.\n",
318
+ "\n",
319
+ "### Next steps\n",
320
+ "\n",
321
+ "- **Full benchmark** (CNN + Transformer, configurable precision): `python ~/gpu_benchmark.py`\n",
322
+ "- **Jupyter tips**: use `!nvidia-smi` in a cell to check GPU utilisation at any time\n",
323
+ "- **VSCode Remote SSH**: connect with `ssh aws-gpu<N>` for a full IDE experience"
324
+ ]
325
+ }
326
+ ],
327
+ "metadata": {
328
+ "kernelspec": {
329
+ "display_name": "Python 3",
330
+ "language": "python",
331
+ "name": "python3"
332
+ },
333
+ "language_info": {
334
+ "name": "python",
335
+ "version": "3.12.0"
336
+ }
337
+ },
338
+ "nbformat": 4,
339
+ "nbformat_minor": 5
340
+ }
@@ -0,0 +1,188 @@
#!/usr/bin/env bash
# remote_setup.sh — Post-boot setup for Deep Learning AMI instances.
# Executed on the EC2 instance once SSH becomes reachable.
set -euo pipefail

echo "=== aws-bootstrap-g4dn remote setup ==="

# --- Step 1: sanity-check the GPU / CUDA toolchain -------------------------
echo ""
echo "[1/5] Verifying GPU and CUDA..."
if command -v nvidia-smi &>/dev/null; then
    nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
else
    echo "WARNING: nvidia-smi not found"
fi

if command -v nvcc &>/dev/null; then
    nvcc --version | grep "release"
else
    echo "WARNING: nvcc not found (CUDA toolkit may not be installed)"
fi

# --- Step 2: convenience CLI tooling ---------------------------------------
echo ""
echo "[2/5] Installing utilities..."
sudo apt-get update -qq
sudo apt-get install -y -qq htop tmux tree jq

# --- Step 3: Python environment managed by uv ------------------------------
echo ""
echo "[3/5] Setting up Python environment with uv..."
if ! command -v uv &>/dev/null; then
    curl -LsSf https://astral.sh/uv/install.sh | sh
fi
# The uv installer drops its binaries into ~/.local/bin.
export PATH="$HOME/.local/bin:$PATH"

uv venv ~/venv

# --- CUDA-aware PyTorch installation ---
# PyTorch CUDA wheel index tags, ascending order (cu118, cu121, ...).
# Extend this list when PyTorch publishes new CUDA builds.
# See: https://download.pytorch.org/whl/
KNOWN_CUDA_TAGS=(118 121 124 126 128 129 130)
detect_cuda_version() {
    # Print the CUDA version available on this machine (e.g. "12.4"),
    # or nothing if no CUDA stack is detectable.
    #
    # Always exits 0: callers use `var=$(detect_cuda_version)` under
    # `set -euo pipefail`, so a probe whose grep finds no match must not
    # propagate a non-zero status and abort the whole setup script —
    # an empty result is the supported "not found" signal.
    if command -v nvcc &>/dev/null; then
        # Preferred source: nvcc reflects the toolkit actually installed.
        nvcc --version | grep -oP 'release \K[\d.]+' || true
        return 0
    fi
    if command -v nvidia-smi &>/dev/null; then
        # Fallback: nvidia-smi reports the max CUDA the driver supports,
        # which may be newer than any installed toolkit.
        nvidia-smi | grep -oP 'CUDA Version: \K[\d.]+' || true
        return 0
    fi
    echo ""
}
cuda_version_to_tag() {
    # Convert a CUDA version string to a PyTorch wheel tag number:
    #   "12.9" -> "129", "13.0" -> "130", "12.4.131" -> "124".
    # Only the major and minor components participate; any trailing
    # component (e.g. the patch level in nvcc's "12.4.131") is discarded.
    # The previous `tr -d '.'` approach turned "12.4.131" into "124131",
    # which breaks the numeric `-le` comparison in find_best_cuda_tag.
    local major minor
    IFS=. read -r major minor _ <<< "$1"
    echo "${major}${minor}"
}
find_best_cuda_tag() {
    # Print the newest known wheel tag that does not exceed the detected
    # tag; print nothing when even the oldest known tag is too new.
    # KNOWN_CUDA_TAGS is sorted ascending, so scanning from the end and
    # taking the first match yields the best (largest) compatible tag.
    local wanted="$1"
    local idx
    for (( idx = ${#KNOWN_CUDA_TAGS[@]} - 1; idx >= 0; idx-- )); do
        if [ "${KNOWN_CUDA_TAGS[idx]}" -le "$wanted" ]; then
            echo "${KNOWN_CUDA_TAGS[idx]}"
            return
        fi
    done
    echo ""
}
install_pytorch_cuda() {
    # Install torch/torchvision from the best-matching PyTorch CUDA wheel
    # index. Falls back to plain PyPI in three cases: no CUDA detected,
    # no known wheel tag old enough for the detected CUDA, or the CUDA
    # index install itself failing.
    local version tag chosen
    version=$(detect_cuda_version)

    if [ -z "$version" ]; then
        echo " WARNING: No CUDA detected — installing PyTorch from PyPI (CPU or default CUDA)"
        uv pip install --python ~/venv/bin/python torch torchvision
        return
    fi
    echo " Detected CUDA version: $version"

    tag=$(cuda_version_to_tag "$version")
    chosen=$(find_best_cuda_tag "$tag")

    if [ -z "$chosen" ]; then
        echo " WARNING: No matching PyTorch CUDA tag for cu${tag} — installing from PyPI"
        uv pip install --python ~/venv/bin/python torch torchvision
        return
    fi

    echo " Using PyTorch CUDA index: cu${chosen}"
    if ! uv pip install --python ~/venv/bin/python \
        --default-index "https://download.pytorch.org/whl/cu${chosen}" \
        torch torchvision; then
        echo " WARNING: CUDA index install failed — falling back to PyPI"
        uv pip install --python ~/venv/bin/python torch torchvision
    fi
}
install_pytorch_cuda

# Remaining Python dependencies (torch/torchvision were handled above,
# so the requirements file must not re-pin them to a different index).
uv pip install --python ~/venv/bin/python -r /tmp/requirements.txt

# Stage the GPU benchmark script and the smoke-test notebook in $HOME.
cp /tmp/gpu_benchmark.py ~/gpu_benchmark.py
cp /tmp/gpu_smoke_test.ipynb ~/gpu_smoke_test.ipynb

# Auto-activate the venv for interactive logins (idempotent).
if ! grep -q 'source ~/venv/bin/activate' ~/.bashrc 2>/dev/null; then
    echo 'source ~/venv/bin/activate' >> ~/.bashrc
fi

# End-to-end CUDA smoke test; a failure is reported but never fatal,
# so the rest of the setup (Jupyter, SSH) still completes.
echo " Running CUDA smoke test..."
if ~/venv/bin/python -c "
import torch
assert torch.cuda.is_available(), 'CUDA not available'
x = torch.randn(256, 256, device='cuda')
y = torch.mm(x, x)
torch.cuda.synchronize()
print(f' PyTorch {torch.__version__}, CUDA {torch.version.cuda}, GPU: {torch.cuda.get_device_name(0)}')
print(' Quick matmul test: PASSED')
"; then
    echo " CUDA smoke test passed"
else
    echo " WARNING: CUDA smoke test failed — check PyTorch/CUDA installation"
fi

# Jupyter server configuration (heredoc is quoted: no expansion wanted).
# NOTE(review): an empty IdentityProvider.token disables authentication
# entirely while the server listens on 0.0.0.0 — anyone who can reach
# port 8888 gets code execution. Safe only if the security group blocks
# 8888 and access goes through an SSH tunnel; confirm that assumption.
JUPYTER_CONFIG_DIR="$HOME/.jupyter"
mkdir -p "$JUPYTER_CONFIG_DIR"
cat > "$JUPYTER_CONFIG_DIR/jupyter_lab_config.py" << 'PYEOF'
c.ServerApp.ip = '0.0.0.0'
c.ServerApp.port = 8888
c.ServerApp.open_browser = False
c.IdentityProvider.token = ''
c.ServerApp.allow_remote_access = True
PYEOF
echo " Jupyter config written to $JUPYTER_CONFIG_DIR/jupyter_lab_config.py"

# --- Step 4: run Jupyter Lab as a systemd service --------------------------
echo ""
echo "[4/5] Setting up Jupyter systemd service..."
LOGIN_USER=$(whoami)

# Unquoted heredoc delimiter on purpose: ${LOGIN_USER} must expand here.
sudo tee /etc/systemd/system/jupyter.service > /dev/null << SVCEOF
[Unit]
Description=Jupyter Lab Server
After=network.target

[Service]
Type=simple
User=${LOGIN_USER}
WorkingDirectory=/home/${LOGIN_USER}
ExecStart=/home/${LOGIN_USER}/venv/bin/python -m jupyterlab
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
SVCEOF

sudo systemctl daemon-reload
sudo systemctl enable jupyter.service
sudo systemctl start jupyter.service
echo " Jupyter service started (port 8888)"

# --- Step 5: keep idle SSH sessions alive ----------------------------------
echo ""
echo "[5/5] Configuring SSH keepalive..."
if ! grep -q "ClientAliveInterval" /etc/ssh/sshd_config; then
    echo "ClientAliveInterval 60" | sudo tee -a /etc/ssh/sshd_config > /dev/null
    echo "ClientAliveCountMax 10" | sudo tee -a /etc/ssh/sshd_config > /dev/null
    # NOTE(review): on recent Ubuntu the canonical unit is ssh.service;
    # "sshd" resolves via alias on most images — verify if this reload fails.
    sudo systemctl reload sshd
    echo " SSH keepalive configured"
else
    echo " SSH keepalive already configured"
fi

echo ""
echo "=== Remote setup complete ==="
@@ -0,0 +1,8 @@
1
+ numpy
2
+ matplotlib
3
+ scipy
4
+ datasets
5
+ transformers
6
+ pandas
7
+ tensorboard
8
+ jupyterlab