tpuz 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tpuz-0.1.0/.github/workflows/deploy.yaml +46 -0
- tpuz-0.1.0/.github/workflows/publish.yaml +41 -0
- tpuz-0.1.0/.github/workflows/test.yaml +21 -0
- tpuz-0.1.0/.gitignore +8 -0
- tpuz-0.1.0/CLAUDE.md +51 -0
- tpuz-0.1.0/LICENSE +21 -0
- tpuz-0.1.0/PKG-INFO +306 -0
- tpuz-0.1.0/README.md +285 -0
- tpuz-0.1.0/SKILL.md +144 -0
- tpuz-0.1.0/docs/Gemfile +7 -0
- tpuz-0.1.0/docs/_config.yml +18 -0
- tpuz-0.1.0/docs/best-practices.md +210 -0
- tpuz-0.1.0/docs/getting-started.md +298 -0
- tpuz-0.1.0/docs/gpu.md +168 -0
- tpuz-0.1.0/docs/guide.md +328 -0
- tpuz-0.1.0/docs/index.md +51 -0
- tpuz-0.1.0/docs/secrets.md +165 -0
- tpuz-0.1.0/pyproject.toml +34 -0
- tpuz-0.1.0/tests/__init__.py +0 -0
- tpuz-0.1.0/tests/test_tpu.py +224 -0
- tpuz-0.1.0/tpuz/__init__.py +12 -0
- tpuz-0.1.0/tpuz/cli.py +299 -0
- tpuz-0.1.0/tpuz/costs.py +66 -0
- tpuz-0.1.0/tpuz/gce.py +359 -0
- tpuz-0.1.0/tpuz/gcs.py +142 -0
- tpuz-0.1.0/tpuz/launcher.py +72 -0
- tpuz-0.1.0/tpuz/notify.py +46 -0
- tpuz-0.1.0/tpuz/secrets.py +181 -0
- tpuz-0.1.0/tpuz/tpu.py +1105 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
name: Deploy Docs to GitHub Pages
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
paths: ["docs/**"]
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
pages: write
|
|
12
|
+
id-token: write
|
|
13
|
+
|
|
14
|
+
concurrency:
|
|
15
|
+
group: "pages"
|
|
16
|
+
cancel-in-progress: false
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
build:
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
steps:
|
|
22
|
+
- name: Checkout
|
|
23
|
+
uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- name: Setup Pages
|
|
26
|
+
uses: actions/configure-pages@v5
|
|
27
|
+
|
|
28
|
+
- name: Build with Jekyll
|
|
29
|
+
uses: actions/jekyll-build-pages@v1
|
|
30
|
+
with:
|
|
31
|
+
source: ./docs
|
|
32
|
+
destination: ./_site
|
|
33
|
+
|
|
34
|
+
- name: Upload artifact
|
|
35
|
+
uses: actions/upload-pages-artifact@v3
|
|
36
|
+
|
|
37
|
+
deploy:
|
|
38
|
+
environment:
|
|
39
|
+
name: github-pages
|
|
40
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
needs: build
|
|
43
|
+
steps:
|
|
44
|
+
- name: Deploy to GitHub Pages
|
|
45
|
+
id: deployment
|
|
46
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
push:
|
|
7
|
+
tags:
|
|
8
|
+
- "v*"
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
build:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
- run: pip install build
|
|
22
|
+
- run: python -m build
|
|
23
|
+
- uses: actions/upload-artifact@v4
|
|
24
|
+
with:
|
|
25
|
+
name: dist
|
|
26
|
+
path: dist/
|
|
27
|
+
|
|
28
|
+
publish:
|
|
29
|
+
needs: build
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
environment:
|
|
32
|
+
name: pypi
|
|
33
|
+
url: https://pypi.org/p/tpuz
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist/
|
|
41
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: pip install -e ".[dev]"
|
|
21
|
+
- run: pytest tests/ -v
|
tpuz-0.1.0/.gitignore
ADDED
tpuz-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# tpuz — Claude Code Integration
|
|
2
|
+
|
|
3
|
+
## What This Is
|
|
4
|
+
|
|
5
|
+
tpuz manages GCP TPU VMs via the gcloud CLI: create, run, debug, monitor, recover, and tear down.
|
|
6
|
+
|
|
7
|
+
## Key Files
|
|
8
|
+
|
|
9
|
+
- `tpuz/tpu.py` — Core `TPU` class (lifecycle, SSH, multi-host, debugging, secrets, costs)
|
|
10
|
+
- `tpuz/gcs.py` — GCS checkpoint sync
|
|
11
|
+
- `tpuz/secrets.py` — Google Cloud Secret Manager integration
|
|
12
|
+
- `tpuz/costs.py` — Cost tracking with TPU hourly rates
|
|
13
|
+
- `tpuz/notify.py` — Slack/webhook notifications
|
|
14
|
+
- `tpuz/launcher.py` — One-command training orchestrator
|
|
15
|
+
- `tpuz/cli.py` — CLI with 25+ commands
|
|
16
|
+
|
|
17
|
+
## Quick Usage
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from tpuz import TPU, GCS, SecretManager
|
|
21
|
+
|
|
22
|
+
tpu = TPU("my-tpu", accelerator="v4-8")
|
|
23
|
+
tpu.up()
|
|
24
|
+
tpu.setup()
|
|
25
|
+
tpu.run("python train.py", secrets=["WANDB_API_KEY"], sync="./src")
|
|
26
|
+
tpu.logs()
|
|
27
|
+
tpu.cost_summary()
|
|
28
|
+
tpu.down()
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Secrets (IMPORTANT)
|
|
32
|
+
|
|
33
|
+
Always use Cloud Secret Manager, not env vars:
|
|
34
|
+
```python
|
|
35
|
+
# GOOD: secrets never leave GCP
|
|
36
|
+
tpu.run("python train.py", secrets=["WANDB_API_KEY", "HF_TOKEN"])
|
|
37
|
+
|
|
38
|
+
# OK for quick tests only
|
|
39
|
+
tpu.run("python train.py", env={"KEY": "val"})
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Docs
|
|
43
|
+
|
|
44
|
+
- `docs/secrets.md` — Full secrets & security guide
|
|
45
|
+
- `docs/best-practices.md` — Training workflow, costs, multi-host tips
|
|
46
|
+
|
|
47
|
+
## Running Tests
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pytest tests/ -v # 35 tests, no GCP needed
|
|
51
|
+
```
|
tpuz-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Taha Bouhsine
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tpuz-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tpuz
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Manage GCP TPU VMs from your terminal — create, run, recover, teardown
|
|
5
|
+
Project-URL: Homepage, https://github.com/tahabsn/tpuz
|
|
6
|
+
Project-URL: Issues, https://github.com/tahabsn/tpuz/issues
|
|
7
|
+
Author: Taha Bouhsine
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: cloud,distributed,gcp,jax,tpu,training
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# tpuz
|
|
23
|
+
|
|
24
|
+
Manage GCP TPU VMs from your terminal. Create, train, debug, recover, teardown — one command.
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install tpuz
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Why?
|
|
31
|
+
|
|
32
|
+
Training on TPU pods takes 10+ gcloud commands and manual SSH to each worker, with no preemption handling, no cost visibility, and painful debugging. tpuz wraps it all:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from tpuz import TPU
|
|
36
|
+
|
|
37
|
+
tpu = TPU("my-tpu", accelerator="v4-8")
|
|
38
|
+
tpu.up()
|
|
39
|
+
tpu.setup()
|
|
40
|
+
tpu.run("python train.py", sync="./src")
|
|
41
|
+
tpu.logs()
|
|
42
|
+
tpu.down()
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or in one command:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
tpuz train my-tpu "python train.py" -a v4-8 --sync=. --recover --teardown
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
### Lifecycle
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
tpu.preflight() # Verify gcloud config
|
|
57
|
+
tpu.up() # Create VM (idempotent)
|
|
58
|
+
tpu.up_queued() # Queued Resources API (reliable spot)
|
|
59
|
+
tpu.down() # Delete VM
|
|
60
|
+
tpu.info() # State, IPs, accelerator
|
|
61
|
+
tpu.setup(extra_pip="jax") # Install JAX[TPU] + deps
|
|
62
|
+
tpu.verify() # Verify JAX on all workers
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Training
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
tpu.run("python train.py", sync="./src", env={"KEY": "val"})
|
|
69
|
+
tpu.logs() # Stream logs (Ctrl-C to detach)
|
|
70
|
+
tpu.logs_all() # Color-coded logs from ALL workers
|
|
71
|
+
tpu.is_running() # Check if alive
|
|
72
|
+
tpu.kill() # Stop training
|
|
73
|
+
tpu.wait() # Poll for COMPLETE/FAILED
|
|
74
|
+
tpu.collect(["model.pkl"]) # Download artifacts
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Cost Tracking
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
tpu.cost_summary() # "$4.12 (2.0h x $2.06/hr v4-8 spot)"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### GCS Checkpoint Sync
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from tpuz import GCS
|
|
87
|
+
|
|
88
|
+
gcs = GCS("gs://my-bucket")
|
|
89
|
+
gcs.upload_checkpoint("./ckpt", "run-01", step=1000)
|
|
90
|
+
gcs.latest_step("run-01") # 5000
|
|
91
|
+
gcs.list_runs() # ["run-01", "run-02"]
|
|
92
|
+
|
|
93
|
+
# Auto-resume from latest checkpoint
|
|
94
|
+
tpu.run_with_resume("python train.py", gcs=gcs, run_name="run-01")
|
|
95
|
+
# Finds step 5000 -> appends --resume-from-step=5000
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Preemption Recovery
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
tpu.watch("python train.py", max_retries=5)
|
|
102
|
+
# Polls every 60s -> on PREEMPTED: delete -> recreate -> setup -> restart
|
|
103
|
+
|
|
104
|
+
# With Slack notifications
|
|
105
|
+
tpu.watch_notify("python train.py",
|
|
106
|
+
notify_url="https://hooks.slack.com/services/...",
|
|
107
|
+
max_retries=5)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Debugging
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
tpu.repl() # Interactive Python on worker 0
|
|
114
|
+
tpu.debug("python train.py", port=5678) # VS Code debugger attach
|
|
115
|
+
tpu.logs_all(lines=20) # All workers side by side
|
|
116
|
+
tpu.health_pretty() # Worker dashboard:
|
|
117
|
+
# Worker Status Last Log
|
|
118
|
+
# -------------------------------------------
|
|
119
|
+
# worker 0 running step 1234 | loss 2.31
|
|
120
|
+
# worker 1 running step 1234 | loss 2.31
|
|
121
|
+
# worker 2 stopped (no log)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### SSH Tunnel
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
tpu.tunnel(6006) # TensorBoard: localhost:6006
|
|
128
|
+
tpu.tunnel(8888, 9999) # Jupyter: localhost:9999 -> TPU:8888
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Scaling
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
tpu.scale("v4-32") # Delete -> recreate with v4-32 -> re-setup
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Multi-Zone Failover
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
tpu = TPU.create_multi_zone("my-tpu", "v4-8",
|
|
141
|
+
zones=["us-central2-b", "us-central1-a", "europe-west4-a"])
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Availability Check
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
TPU.availability("v4-8", zone="us-central2-b")
|
|
148
|
+
# {"available": True, "spot_rate": 2.06, "on_demand_rate": 6.18}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Run-Once (Docker-like)
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
tpu.run_once("python train.py",
|
|
155
|
+
sync="./src",
|
|
156
|
+
collect_files=["model.pkl", "results.json"],
|
|
157
|
+
gcs=gcs,
|
|
158
|
+
notify_url="https://hooks.slack.com/...")
|
|
159
|
+
# up -> setup -> resume -> run -> wait -> collect -> notify -> down
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Scheduled Training
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
tpu.schedule("python train.py",
|
|
166
|
+
start_after="22:00", # Wait until 10 PM
|
|
167
|
+
max_cost=10.0) # Kill if exceeds $10
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Environment Snapshot/Restore
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
tpu.snapshot_env(gcs=gcs) # pip freeze -> GCS
|
|
174
|
+
tpu.restore_env(gcs=gcs) # Restore after preemption
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Secrets (Cloud Secret Manager)
|
|
178
|
+
|
|
179
|
+
**Recommended:** Use Google Cloud Secret Manager. Secrets never leave GCP:
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from tpuz import SecretManager
|
|
183
|
+
|
|
184
|
+
# One-time setup
|
|
185
|
+
sm = SecretManager()
|
|
186
|
+
sm.create("WANDB_API_KEY", "your-key")
|
|
187
|
+
sm.grant_tpu_access_all()
|
|
188
|
+
|
|
189
|
+
# Training: VM reads secrets directly from GCP
|
|
190
|
+
tpu.run("python train.py", secrets=["WANDB_API_KEY", "HF_TOKEN"])
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Fallback:** `env={}` writes a `.env` file via SCP (encrypted in transit, but the secrets pass through your machine).
|
|
194
|
+
|
|
195
|
+
See [docs/secrets.md](docs/secrets.md) for full setup guide and security comparison.
|
|
196
|
+
|
|
197
|
+
## Multi-Host (TPU Pods)
|
|
198
|
+
|
|
199
|
+
Worker count is auto-detected. All SSH commands run in parallel with per-worker retries:
|
|
200
|
+
|
|
201
|
+
| Accelerator | Chips | Workers | Spot $/hr |
|
|
202
|
+
|-------------|-------|---------|-----------|
|
|
203
|
+
| `v4-8` | 4 | 1 | $2.06 |
|
|
204
|
+
| `v4-32` | 16 | 4 | $8.24 |
|
|
205
|
+
| `v5litepod-8` | 8 | 1 | $9.60 |
|
|
206
|
+
| `v5litepod-64` | 64 | 8 | $76.80 |
|
|
207
|
+
| `v6e-8` | 8 | 1 | $9.60 |
|
|
208
|
+
|
|
209
|
+
## CLI
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
# Lifecycle
|
|
213
|
+
tpuz up my-tpu -a v4-8
|
|
214
|
+
tpuz down my-tpu
|
|
215
|
+
tpuz status my-tpu
|
|
216
|
+
tpuz list
|
|
217
|
+
tpuz preflight
|
|
218
|
+
tpuz avail v4-8
|
|
219
|
+
|
|
220
|
+
# Training
|
|
221
|
+
tpuz setup my-tpu --pip="flaxchat"
|
|
222
|
+
tpuz verify my-tpu
|
|
223
|
+
tpuz run my-tpu "python train.py" --sync=./src
|
|
224
|
+
tpuz logs my-tpu
|
|
225
|
+
tpuz logs-all my-tpu
|
|
226
|
+
tpuz kill my-tpu
|
|
227
|
+
tpuz wait my-tpu
|
|
228
|
+
tpuz collect my-tpu model.pkl results.json
|
|
229
|
+
|
|
230
|
+
# Debugging
|
|
231
|
+
tpuz repl my-tpu
|
|
232
|
+
tpuz debug my-tpu "python train.py"
|
|
233
|
+
tpuz health my-tpu
|
|
234
|
+
tpuz tunnel my-tpu 6006
|
|
235
|
+
tpuz scale my-tpu v4-32
|
|
236
|
+
tpuz cost my-tpu
|
|
237
|
+
|
|
238
|
+
# Recovery
|
|
239
|
+
tpuz watch my-tpu "python train.py"
|
|
240
|
+
|
|
241
|
+
# All-in-one
|
|
242
|
+
tpuz train my-tpu "python train.py" -a v4-8 --sync=. --recover --teardown
|
|
243
|
+
tpuz run-once my-tpu "python train.py" --sync=. --collect model.pkl
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Development Workflow
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
from tpuz import TPU
|
|
250
|
+
|
|
251
|
+
# 1. Develop on single host
|
|
252
|
+
dev = TPU("dev", accelerator="v4-8")
|
|
253
|
+
dev.up()
|
|
254
|
+
dev.setup()
|
|
255
|
+
dev.repl() # Interactive development
|
|
256
|
+
|
|
257
|
+
# 2. Test training
|
|
258
|
+
dev.run("python train.py --steps=10", sync="./src")
|
|
259
|
+
dev.logs()
|
|
260
|
+
|
|
261
|
+
# 3. Scale up
|
|
262
|
+
dev.scale("v4-32") # 4 workers now
|
|
263
|
+
dev.run("python train.py --steps=50000", sync="./src")
|
|
264
|
+
dev.watch("python train.py --steps=50000")
|
|
265
|
+
|
|
266
|
+
# 4. Collect and cleanup
|
|
267
|
+
dev.collect(["model.pkl", "results.json"])
|
|
268
|
+
dev.cost_summary() # $12.36
|
|
269
|
+
dev.down()
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Documentation
|
|
273
|
+
|
|
274
|
+
- **[docs/secrets.md](docs/secrets.md)** — Secrets & security guide (Cloud Secret Manager setup)
|
|
275
|
+
- **[docs/best-practices.md](docs/best-practices.md)** — Training workflow, cost optimization, multi-host tips
|
|
276
|
+
- **[SKILL.md](SKILL.md)** — Claude Code skill reference
|
|
277
|
+
- **[CLAUDE.md](CLAUDE.md)** — Quick reference for AI agents
|
|
278
|
+
|
|
279
|
+
## Requirements
|
|
280
|
+
|
|
281
|
+
- `gcloud` CLI installed and authenticated
|
|
282
|
+
- GCP project with TPU quota
|
|
283
|
+
- Python 3.10+
|
|
284
|
+
- Zero Python dependencies
|
|
285
|
+
|
|
286
|
+
## Pair with kgz
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
pip install kgz # Kaggle free GPUs
|
|
290
|
+
pip install tpuz # GCP TPU pods
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
## Claude Code Integration
|
|
294
|
+
|
|
295
|
+
```bash
|
|
296
|
+
mkdir -p ~/.claude/skills/tpuz-guide
|
|
297
|
+
cp SKILL.md ~/.claude/skills/tpuz-guide/skill.md
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## License
|
|
301
|
+
|
|
302
|
+
MIT
|
|
303
|
+
|
|
304
|
+
## Acknowledgments
|
|
305
|
+
|
|
306
|
+
Cloud TPU resources for developing and testing tpuz were provided by Google's [TPU Research Cloud (TRC)](https://sites.research.google/trc/about/) program. We gratefully acknowledge their support in making TPU access available for open-source research.
|