tuft 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/docker/docker-compose.yml +2 -1
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/unittest.yml +2 -2
- {tuft-0.1.2 → tuft-0.1.3}/PKG-INFO +9 -65
- {tuft-0.1.2 → tuft-0.1.3}/README.md +8 -64
- {tuft-0.1.2 → tuft-0.1.3}/config/tuft_config.example.yaml +24 -9
- tuft-0.1.3/docs/persistence.md +297 -0
- tuft-0.1.3/docs/telemetry.md +286 -0
- {tuft-0.1.2 → tuft-0.1.3}/pyproject.toml +1 -1
- {tuft-0.1.2 → tuft-0.1.3}/scripts/install.sh +2 -2
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backends/hf_training_model.py +184 -64
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/cli.py +120 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/config.py +58 -56
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/exceptions.py +66 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/futures.py +22 -2
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/__init__.py +33 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/persistence/__init__.py +10 -2
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/persistence/redis_store.py +352 -31
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/sampling_controller.py +34 -10
- tuft-0.1.3/src/tuft/sequence_executor.py +72 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/server.py +9 -2
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/state.py +3 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/training_controller.py +14 -4
- {tuft-0.1.2 → tuft-0.1.3}/tests/conftest.py +3 -3
- {tuft-0.1.2 → tuft-0.1.3}/tests/helpers.py +13 -36
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_futures.py +46 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_integration_persistence.py +1 -1
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_loss_fn.py +42 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_persistence.py +419 -3
- tuft-0.1.3/tests/test_sequence_executor.py +81 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_state_controllers.py +20 -20
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_telemetry.py +1 -1
- {tuft-0.1.2 → tuft-0.1.3}/.gitattributes +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/checks.yml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/docker.yml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/install-script.yml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.github/workflows/publish.yml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.gitignore +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.gitmodules +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.pre-commit-config.yaml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.python-version +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/.secrets.baseline +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/LICENSE +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/assets/countdown_rl.png +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/assets/test_nll_sft.png +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/assets/train_mean_nll_sft.png +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/assets/tuft-logo-colorful.svg +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/docker/Dockerfile +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/docs/chat_sft.md +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/docs/countdown_rl.md +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/docs/how_to_write_tests.md +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/examples/chat_sft.ipynb +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/examples/countdown_rl.ipynb +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/scripts/install_flash_attn.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/__init__.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/__main__.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/auth.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backend.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backends/__init__.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backends/base_backend.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backends/sampling_backend.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/backends/training_backend.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/checkpoints.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/cispo.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/cross_entropy.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/dro.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/importance_sampling.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/loss_fn/ppo.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/persistence/file_redis.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/telemetry/__init__.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/telemetry/metrics.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/telemetry/provider.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/src/tuft/telemetry/tracing.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/__init__.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/data/models.yaml +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_checkpoints.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_cli.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_file_redis.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_integration.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_sampling_backend.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_server.py +0 -0
- {tuft-0.1.2 → tuft-0.1.3}/tests/test_training_backend.py +0 -0
|
@@ -13,7 +13,7 @@ services:
|
|
|
13
13
|
&& ln -sf /usr/bin/python3 /usr/bin/python \
|
|
14
14
|
&& ln -sf /usr/bin/pip3 /usr/bin/pip \
|
|
15
15
|
&& bash /workspace/scripts/install.sh --local-source /workspace \
|
|
16
|
-
&& source
|
|
16
|
+
&& source /root/.local/bin/env \
|
|
17
17
|
&& source /root/.tuft/venv/bin/activate \
|
|
18
18
|
&& uv pip install .[dev] \
|
|
19
19
|
&& ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block"
|
|
@@ -24,6 +24,7 @@ services:
|
|
|
24
24
|
- TUFT_TEST_MODEL=/mnt/models/Qwen3-0.6B
|
|
25
25
|
- TUFT_TEST_MODEL_1=/mnt/models/Qwen3-0.6B
|
|
26
26
|
- TUFT_TEST_MODEL_2=/mnt/models/Qwen3-1.7B
|
|
27
|
+
- TUFT_DOCKER_UNITTEST=1
|
|
27
28
|
- TEST_REDIS_URL=redis://tuft-redis:6379
|
|
28
29
|
- VIRTUAL_ENV=/root/.tuft/venv
|
|
29
30
|
working_dir: /workspace
|
|
@@ -31,8 +31,8 @@ jobs:
|
|
|
31
31
|
- name: Check ray status
|
|
32
32
|
working-directory: tuft-${{ github.run_id }}/.github/workflows/docker
|
|
33
33
|
run: |
|
|
34
|
-
MAX_RETRIES=
|
|
35
|
-
RETRY_INTERVAL=
|
|
34
|
+
MAX_RETRIES=90
|
|
35
|
+
RETRY_INTERVAL=30
|
|
36
36
|
for i in $(seq 1 $MAX_RETRIES); do
|
|
37
37
|
if docker compose exec tuft-node-1 bash -c "source /root/.tuft/venv/bin/activate && ray status"; then
|
|
38
38
|
break
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tuft
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
|
|
5
5
|
Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
|
|
6
6
|
License: MIT License
|
|
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
|
|
|
300
300
|
The CLI starts a FastAPI server:
|
|
301
301
|
|
|
302
302
|
```bash
|
|
303
|
-
tuft --port 10610 --config /path/to/tuft_config.yaml
|
|
303
|
+
tuft launch --port 10610 --config /path/to/tuft_config.yaml
|
|
304
304
|
```
|
|
305
305
|
|
|
306
306
|
The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
|
|
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
|
|
|
340
340
|
-p 10610:10610 \
|
|
341
341
|
-v <host_dir>:/data \
|
|
342
342
|
ghcr.io/agentscope-ai/tuft:latest \
|
|
343
|
-
tuft --port 10610 --config /data/tuft_config.yaml
|
|
343
|
+
tuft launch --port 10610 --config /data/tuft_config.yaml
|
|
344
344
|
```
|
|
345
345
|
|
|
346
346
|
Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
|
|
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
|
|
|
378
378
|
|
|
379
379
|
## Persistence
|
|
380
380
|
|
|
381
|
-
TuFT supports optional
|
|
382
|
-
the server can recover sessions, training runs, and pending futures after a restart.
|
|
381
|
+
TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
|
|
383
382
|
|
|
384
|
-
|
|
383
|
+
See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
|
|
385
384
|
|
|
386
385
|
```bash
|
|
387
|
-
uv pip install tuft[persistence]
|
|
386
|
+
uv pip install "tuft[persistence]"
|
|
388
387
|
```
|
|
389
388
|
|
|
390
|
-
### Persistence Modes
|
|
391
|
-
|
|
392
|
-
TuFT provides three persistence modes:
|
|
393
|
-
|
|
394
|
-
| Mode | Description | Use Case |
|
|
395
|
-
|------|-------------|----------|
|
|
396
|
-
| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
|
|
397
|
-
| `redis_url` | External Redis server | Production, multi-instance deployments |
|
|
398
|
-
| `file_redis` | File-backed store | Demos, small-scale testing |
|
|
399
|
-
|
|
400
|
-
### Configuration
|
|
401
|
-
|
|
402
|
-
Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
|
|
403
|
-
|
|
404
|
-
#### Mode 1: Disabled (Default)
|
|
405
|
-
|
|
406
|
-
No configuration needed. All data is stored in memory and lost on restart.
|
|
407
|
-
|
|
408
|
-
```yaml
|
|
409
|
-
# tuft_config.yaml
|
|
410
|
-
persistence:
|
|
411
|
-
mode: disabled
|
|
412
|
-
```
|
|
413
|
-
|
|
414
|
-
#### Mode 2: External Redis Server
|
|
415
|
-
|
|
416
|
-
Use an external Redis server for production deployments:
|
|
417
|
-
|
|
418
389
|
```yaml
|
|
419
390
|
# tuft_config.yaml
|
|
420
391
|
persistence:
|
|
421
|
-
mode: redis_url
|
|
392
|
+
mode: REDIS
|
|
422
393
|
redis_url: "redis://localhost:6379/0"
|
|
423
|
-
namespace: "tuft"
|
|
424
|
-
```
|
|
425
|
-
|
|
426
|
-
You can start a local Redis instance using Docker:
|
|
427
|
-
|
|
428
|
-
```bash
|
|
429
|
-
docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
|
|
430
|
-
```
|
|
431
|
-
|
|
432
|
-
#### Mode 3: File-backed Store
|
|
433
|
-
|
|
434
|
-
Use the file-backed store for demos or small-scale testing:
|
|
435
|
-
|
|
436
|
-
```yaml
|
|
437
|
-
# tuft_config.yaml
|
|
438
|
-
persistence:
|
|
439
|
-
mode: file_redis
|
|
440
|
-
file_path: "~/.cache/tuft/file_redis.json"
|
|
441
|
-
namespace: "tuft"
|
|
394
|
+
namespace: "persistence-tuft-server"
|
|
442
395
|
```
|
|
443
396
|
|
|
444
397
|
## Observability (OpenTelemetry)
|
|
445
398
|
|
|
446
|
-
TuFT supports optional OpenTelemetry integration for
|
|
447
|
-
This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
|
|
448
|
-
|
|
449
|
-
### Configuration
|
|
450
|
-
|
|
451
|
-
Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
|
|
399
|
+
TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
|
|
452
400
|
|
|
453
401
|
```yaml
|
|
454
402
|
# tuft_config.yaml
|
|
@@ -457,10 +405,6 @@ telemetry:
|
|
|
457
405
|
service_name: tuft
|
|
458
406
|
otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
|
|
459
407
|
resource_attributes: {}
|
|
460
|
-
# example:
|
|
461
|
-
# deployment.environment: production
|
|
462
|
-
# service.version: 1.0.0
|
|
463
|
-
# service.namespace: my-namespace
|
|
464
408
|
```
|
|
465
409
|
|
|
466
410
|
Alternatively, use environment variables:
|
|
@@ -238,7 +238,7 @@ uv pip install "tuft[dev,backend,persistence]"
|
|
|
238
238
|
The CLI starts a FastAPI server:
|
|
239
239
|
|
|
240
240
|
```bash
|
|
241
|
-
tuft --port 10610 --config /path/to/tuft_config.yaml
|
|
241
|
+
tuft launch --port 10610 --config /path/to/tuft_config.yaml
|
|
242
242
|
```
|
|
243
243
|
|
|
244
244
|
The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
|
|
@@ -278,7 +278,7 @@ you can use the pre-built Docker image.
|
|
|
278
278
|
-p 10610:10610 \
|
|
279
279
|
-v <host_dir>:/data \
|
|
280
280
|
ghcr.io/agentscope-ai/tuft:latest \
|
|
281
|
-
tuft --port 10610 --config /data/tuft_config.yaml
|
|
281
|
+
tuft launch --port 10610 --config /data/tuft_config.yaml
|
|
282
282
|
```
|
|
283
283
|
|
|
284
284
|
Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
|
|
@@ -316,77 +316,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
|
|
|
316
316
|
|
|
317
317
|
## Persistence
|
|
318
318
|
|
|
319
|
-
TuFT supports optional
|
|
320
|
-
the server can recover sessions, training runs, and pending futures after a restart.
|
|
319
|
+
TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
|
|
321
320
|
|
|
322
|
-
|
|
321
|
+
See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
|
|
323
322
|
|
|
324
323
|
```bash
|
|
325
|
-
uv pip install tuft[persistence]
|
|
324
|
+
uv pip install "tuft[persistence]"
|
|
326
325
|
```
|
|
327
326
|
|
|
328
|
-
### Persistence Modes
|
|
329
|
-
|
|
330
|
-
TuFT provides three persistence modes:
|
|
331
|
-
|
|
332
|
-
| Mode | Description | Use Case |
|
|
333
|
-
|------|-------------|----------|
|
|
334
|
-
| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
|
|
335
|
-
| `redis_url` | External Redis server | Production, multi-instance deployments |
|
|
336
|
-
| `file_redis` | File-backed store | Demos, small-scale testing |
|
|
337
|
-
|
|
338
|
-
### Configuration
|
|
339
|
-
|
|
340
|
-
Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
|
|
341
|
-
|
|
342
|
-
#### Mode 1: Disabled (Default)
|
|
343
|
-
|
|
344
|
-
No configuration needed. All data is stored in memory and lost on restart.
|
|
345
|
-
|
|
346
|
-
```yaml
|
|
347
|
-
# tuft_config.yaml
|
|
348
|
-
persistence:
|
|
349
|
-
mode: disabled
|
|
350
|
-
```
|
|
351
|
-
|
|
352
|
-
#### Mode 2: External Redis Server
|
|
353
|
-
|
|
354
|
-
Use an external Redis server for production deployments:
|
|
355
|
-
|
|
356
327
|
```yaml
|
|
357
328
|
# tuft_config.yaml
|
|
358
329
|
persistence:
|
|
359
|
-
mode: redis_url
|
|
330
|
+
mode: REDIS
|
|
360
331
|
redis_url: "redis://localhost:6379/0"
|
|
361
|
-
namespace: "tuft"
|
|
362
|
-
```
|
|
363
|
-
|
|
364
|
-
You can start a local Redis instance using Docker:
|
|
365
|
-
|
|
366
|
-
```bash
|
|
367
|
-
docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
|
|
368
|
-
```
|
|
369
|
-
|
|
370
|
-
#### Mode 3: File-backed Store
|
|
371
|
-
|
|
372
|
-
Use the file-backed store for demos or small-scale testing:
|
|
373
|
-
|
|
374
|
-
```yaml
|
|
375
|
-
# tuft_config.yaml
|
|
376
|
-
persistence:
|
|
377
|
-
mode: file_redis
|
|
378
|
-
file_path: "~/.cache/tuft/file_redis.json"
|
|
379
|
-
namespace: "tuft"
|
|
332
|
+
namespace: "persistence-tuft-server"
|
|
380
333
|
```
|
|
381
334
|
|
|
382
335
|
## Observability (OpenTelemetry)
|
|
383
336
|
|
|
384
|
-
TuFT supports optional OpenTelemetry integration for
|
|
385
|
-
This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
|
|
386
|
-
|
|
387
|
-
### Configuration
|
|
388
|
-
|
|
389
|
-
Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
|
|
337
|
+
TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
|
|
390
338
|
|
|
391
339
|
```yaml
|
|
392
340
|
# tuft_config.yaml
|
|
@@ -395,10 +343,6 @@ telemetry:
|
|
|
395
343
|
service_name: tuft
|
|
396
344
|
otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
|
|
397
345
|
resource_attributes: {}
|
|
398
|
-
# example:
|
|
399
|
-
# deployment.environment: production
|
|
400
|
-
# service.version: 1.0.0
|
|
401
|
-
# service.namespace: my-namespace
|
|
402
346
|
```
|
|
403
347
|
|
|
404
348
|
Alternatively, use environment variables:
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# Copy this file to your desired location and modify as needed.
|
|
5
5
|
#
|
|
6
6
|
# Usage:
|
|
7
|
-
# tuft --config /path/to/your/tuft_config.yaml
|
|
7
|
+
# tuft launch --config /path/to/your/tuft_config.yaml
|
|
8
8
|
|
|
9
9
|
# =============================================================================
|
|
10
10
|
# Checkpoint Directory
|
|
@@ -79,23 +79,38 @@ authorized_users:
|
|
|
79
79
|
# Persistence Configuration
|
|
80
80
|
# =============================================================================
|
|
81
81
|
# Configure state persistence for recovery after server restart.
|
|
82
|
+
# For detailed documentation, see the "Persistence" section in README.md.
|
|
82
83
|
#
|
|
83
84
|
# Available modes:
|
|
84
|
-
# -
|
|
85
|
-
# -
|
|
86
|
-
# -
|
|
85
|
+
# - DISABLE: No persistence (default)
|
|
86
|
+
# - REDIS: External Redis server
|
|
87
|
+
# - FILE: File-backed store
|
|
87
88
|
|
|
88
89
|
persistence:
|
|
89
|
-
mode:
|
|
90
|
+
mode: DISABLE # Options: DISABLE, REDIS, FILE
|
|
90
91
|
|
|
91
|
-
# For redis_url mode:
|
|
92
|
+
# For REDIS mode:
|
|
92
93
|
# redis_url: "redis://localhost:6379/0"
|
|
93
94
|
|
|
94
|
-
# For file_redis mode:
|
|
95
|
+
# For FILE mode:
|
|
95
96
|
# file_path: "~/.cache/tuft/file_redis.json"
|
|
96
97
|
|
|
97
|
-
# Namespace prefix for Redis keys (optional)
|
|
98
|
-
# namespace: "tuft"
|
|
98
|
+
# Namespace prefix for Redis keys (optional; defaults to "persistence-tuft-server").
|
|
99
|
+
# namespace: "persistence-tuft-server"
|
|
100
|
+
|
|
101
|
+
# TTL (Time-To-Live) for future records in seconds.
|
|
102
|
+
# Futures are short-lived async operation results that expire after this duration.
|
|
103
|
+
# Set to null for no expiry (not recommended for production).
|
|
104
|
+
# Default: 86400 (1 day)
|
|
105
|
+
# future_ttl_seconds: 86400
|
|
106
|
+
|
|
107
|
+
# Fields to validate on server restart for config consistency.
|
|
108
|
+
# For detailed documentation on available fields and config validation,
|
|
109
|
+
# see the "Safety checks" section in docs/persistence.md.
|
|
110
|
+
# Defaults to ["SUPPORTED_MODELS"]. SUPPORTED_MODELS is always checked.
|
|
111
|
+
# check_fields:
|
|
112
|
+
# - SUPPORTED_MODELS
|
|
113
|
+
# - CHECKPOINT_DIR
|
|
99
114
|
|
|
100
115
|
# =============================================================================
|
|
101
116
|
# Telemetry Configuration (OpenTelemetry)
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
# Persistence
|
|
2
|
+
|
|
3
|
+
TuFT supports **optional persistence** for server state. When enabled, TuFT can recover key runtime metadata (sessions, training runs, sampling sessions, futures) after a server restart, and then **reconstruct model runtime state from checkpoints on disk**.
|
|
4
|
+
|
|
5
|
+
This document is organized into two parts:
|
|
6
|
+
|
|
7
|
+
- **Part 1: User Guide** – how to configure and use persistence
|
|
8
|
+
- **Part 2: Design & Internals** – how persistence works under the hood
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Table of Contents
|
|
13
|
+
|
|
14
|
+
### Part 1: User Guide
|
|
15
|
+
|
|
16
|
+
- [Quick start](#quick-start)
|
|
17
|
+
- [Configuration options](#configuration-options)
|
|
18
|
+
- [Persistence backends](#persistence-backends)
|
|
19
|
+
- [What is persisted](#what-is-persisted)
|
|
20
|
+
- [Operational workflows](#operational-workflows)
|
|
21
|
+
- [Troubleshooting](#troubleshooting)
|
|
22
|
+
|
|
23
|
+
### Part 2: Design & Internals
|
|
24
|
+
|
|
25
|
+
- [Goals and non-goals](#goals-and-non-goals)
|
|
26
|
+
- [Redis key design](#redis-key-design)
|
|
27
|
+
- [Startup restore semantics](#startup-restore-semantics)
|
|
28
|
+
- [Safety checks](#safety-checks)
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
# Part 1: User Guide
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick start
|
|
37
|
+
|
|
38
|
+
### Install optional dependency
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv pip install "tuft[persistence]"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Enable persistence
|
|
45
|
+
|
|
46
|
+
Add a `persistence` section to your `tuft_config.yaml`:
|
|
47
|
+
|
|
48
|
+
```yaml
|
|
49
|
+
persistence:
|
|
50
|
+
mode: REDIS
|
|
51
|
+
redis_url: "redis://localhost:6379/0"
|
|
52
|
+
namespace: "persistence-tuft-server"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
For file-backed storage (demos/tests):
|
|
56
|
+
|
|
57
|
+
```yaml
|
|
58
|
+
persistence:
|
|
59
|
+
mode: FILE
|
|
60
|
+
file_path: "~/.cache/tuft/file_redis.json"
|
|
61
|
+
namespace: "persistence-tuft-server"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Configuration options
|
|
67
|
+
|
|
68
|
+
Persistence is configured via the `persistence` section in your `tuft_config.yaml` configuration file. The following options are available:
|
|
69
|
+
|
|
70
|
+
| Option | Type | Default | Description |
|
|
71
|
+
|--------|------|---------|-------------|
|
|
72
|
+
| `mode` | string | `DISABLE` | Persistence mode: `DISABLE`, `REDIS`, or `FILE` |
|
|
73
|
+
| `redis_url` | string | `redis://localhost:6379/0` | Redis server URL (only used when `mode: REDIS`) |
|
|
74
|
+
| `file_path` | string | `~/.cache/tuft/file_redis.json` | JSON file path (only used when `mode: FILE`) |
|
|
75
|
+
| `namespace` | string | `persistence-tuft-server` | Namespace prefix applied to every persisted key |
|
|
76
|
+
| `future_ttl_seconds` | integer or null | `86400` (1 day) | TTL for future records in seconds. Set to `null` for no expiry. |
|
|
77
|
+
| `check_fields` | list | `["SUPPORTED_MODELS"]` | List of config fields to validate on restart (see [Safety checks](#safety-checks)) |
|
|
78
|
+
|
|
79
|
+
### Full configuration example
|
|
80
|
+
|
|
81
|
+
```yaml
|
|
82
|
+
persistence:
|
|
83
|
+
mode: REDIS
|
|
84
|
+
redis_url: "redis://localhost:6379/0"
|
|
85
|
+
namespace: "my-tuft-deployment"
|
|
86
|
+
future_ttl_seconds: 86400 # 1 day
|
|
87
|
+
check_fields:
|
|
88
|
+
- SUPPORTED_MODELS
|
|
89
|
+
- CHECKPOINT_DIR
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Persistence backends
|
|
95
|
+
|
|
96
|
+
TuFT exposes three modes via `persistence.mode`:
|
|
97
|
+
|
|
98
|
+
- `DISABLE`: in-memory only; everything is lost on restart.
|
|
99
|
+
- `REDIS`: external Redis via `redis-py` (recommended for production).
|
|
100
|
+
- `FILE`: a file-backed Redis-like store (intended for demos/tests; uses a JSON file and is not optimized for concurrency/performance).
|
|
101
|
+
|
|
102
|
+
Internally, all records are stored as **JSON-serialized Pydantic models**, one record per key.
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## What is persisted
|
|
107
|
+
|
|
108
|
+
TuFT persists **metadata for major server subsystems** incrementally as changes occur. Specifically, the following subsystems have their state persisted:
|
|
109
|
+
|
|
110
|
+
- **Sessions** (`SessionManager`)
|
|
111
|
+
- session metadata, tags, `user_id`, heartbeat timestamp
|
|
112
|
+
- stored as permanent records (no TTL)
|
|
113
|
+
|
|
114
|
+
- **Training runs** (`TrainingController`)
|
|
115
|
+
- training run metadata (`training_run_id`, `base_model`, `lora_rank`, `model_owner`, `next_seq_id`, etc.)
|
|
116
|
+
- **checkpoint records (metadata only)** (training checkpoints and sampler checkpoints) are stored under separate keys
|
|
117
|
+
(the actual checkpoint weight artifacts live on disk under `checkpoint_dir`, not in Redis)
|
|
118
|
+
- stored as permanent records (no TTL)
|
|
119
|
+
|
|
120
|
+
- **Sampling sessions** (`SamplingController`)
|
|
121
|
+
- sampling session metadata + **sampling history** (seq ids + prompt hashes)
|
|
122
|
+
- stored as permanent records (no TTL)
|
|
123
|
+
|
|
124
|
+
- **Futures** (`FutureStore`)
|
|
125
|
+
- request lifecycle records: `pending` / `ready` / `failed`
|
|
126
|
+
- includes `operation_type`, `operation_args`, `future_id`, payload or error
|
|
127
|
+
- stored with a **TTL** (default: 1 day, configurable via `future_ttl_seconds`)
|
|
128
|
+
|
|
129
|
+
- **Configuration signature** (`ConfigSignature`)
|
|
130
|
+
- a snapshot of selected `AppConfig` fields for restore safety
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Operational workflows
|
|
135
|
+
|
|
136
|
+
### Clearing persistence state
|
|
137
|
+
|
|
138
|
+
If you intentionally changed config and want to start fresh, clear persistence data:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
tuft clear persistence --config /path/to/tuft_config.yaml
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
This removes keys under the configured `namespace`. It does **not** delete checkpoint files on disk.
|
|
145
|
+
|
|
146
|
+
### Changing config safely
|
|
147
|
+
|
|
148
|
+
Recommended workflow when changing any field that affects restore safety:
|
|
149
|
+
|
|
150
|
+
- deploy with a **new namespace**, or
|
|
151
|
+
- clear the old namespace explicitly before restart.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Troubleshooting
|
|
156
|
+
|
|
157
|
+
### Startup fails with "Configuration Mismatch"
|
|
158
|
+
|
|
159
|
+
- **Cause**: you restarted TuFT with a config whose signature differs from the stored signature in the same namespace.
|
|
160
|
+
- **Fix**:
|
|
161
|
+
- either revert the config change,
|
|
162
|
+
- or clear persistence state (destructive) and restart,
|
|
163
|
+
- or switch to a new `persistence.namespace` for the new deployment.
|
|
164
|
+
|
|
165
|
+
### After restart, some results are marked failed
|
|
166
|
+
|
|
167
|
+
This is expected if those futures were created **after** the latest recovered checkpoint (or when no checkpoint existed). Re-run those operations from the client.
|
|
168
|
+
|
|
169
|
+
### Redis grows indefinitely
|
|
170
|
+
|
|
171
|
+
Long-lived records (sessions, training runs, sampling sessions, checkpoint metadata) do not expire. Futures expire based on the configured `future_ttl_seconds` (default: 1 day). To keep Redis usage bounded, use a distinct namespace per deployment and clear namespaces that are no longer in use (see [Clearing persistence state](#clearing-persistence-state)).
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
# Part 2: Design & Internals
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Goals and non-goals
|
|
180
|
+
|
|
181
|
+
### Goals
|
|
182
|
+
|
|
183
|
+
- **Crash/restart recovery** of server state metadata so users can:
|
|
184
|
+
- list sessions / training runs / sampling sessions after restart
|
|
185
|
+
- retrieve completed futures after restart (within TTL)
|
|
186
|
+
- continue training **from the latest checkpoint** for each training run
|
|
187
|
+
- **Safety-first restore**: prevent silent corruption when server configuration changes.
|
|
188
|
+
|
|
189
|
+
### Non-goals
|
|
190
|
+
|
|
191
|
+
- Persistence **does not** snapshot live GPU memory / in-flight model execution state.
|
|
192
|
+
- TuFT **does not** re-run pending tasks after a crash. Pending work is treated as unsafe and must be retried.
|
|
193
|
+
- Persistence **does not** store model weight blobs in Redis; weight artifacts live on disk under `checkpoint_dir` (and can be archived via the API).
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Redis key design
|
|
198
|
+
|
|
199
|
+
All keys are prefixed by a configurable `namespace` (default: `persistence-tuft-server`) and use `::` as the separator:
|
|
200
|
+
|
|
201
|
+
- Top-level records:
|
|
202
|
+
`"{namespace}::{type}::{id}"`
|
|
203
|
+
- Nested records:
|
|
204
|
+
`"{namespace}::{type}::{parent_id}::{nested_type}::{nested_id}"`
|
|
205
|
+
|
|
206
|
+
> Note: To avoid ambiguity, any literal `::` inside parts is escaped internally.
|
|
207
|
+
|
|
208
|
+
### Key families (high-level)
|
|
209
|
+
|
|
210
|
+
With the default namespace, TuFT uses these major key families:
|
|
211
|
+
|
|
212
|
+
- **Sessions**
|
|
213
|
+
`persistence-tuft-server::session::{session_id}`
|
|
214
|
+
|
|
215
|
+
- **Training runs**
|
|
216
|
+
`persistence-tuft-server::training_run::{training_run_id}`
|
|
217
|
+
|
|
218
|
+
- **Training checkpoint metadata** (nested under training runs)
|
|
219
|
+
`persistence-tuft-server::training_run::{training_run_id}::ckpt::{checkpoint_id}`
|
|
220
|
+
|
|
221
|
+
- **Sampler checkpoint metadata** (nested under training runs)
|
|
222
|
+
`persistence-tuft-server::training_run::{training_run_id}::sampler_ckpt::{checkpoint_id}`
|
|
223
|
+
|
|
224
|
+
- **Sampling sessions**
|
|
225
|
+
`persistence-tuft-server::sampling_session::{sampling_session_id}`
|
|
226
|
+
|
|
227
|
+
- **Futures** (TTL-based)
|
|
228
|
+
`persistence-tuft-server::future::{request_id}`
|
|
229
|
+
|
|
230
|
+
- **Config signature**
|
|
231
|
+
`persistence-tuft-server::config_signature`
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Startup restore semantics
|
|
236
|
+
|
|
237
|
+
Restore has **one preflight step** (Phase 0) plus **two restore phases** (Phases 1 and 2):
|
|
238
|
+
|
|
239
|
+
### Phase 0: configuration validation (before server starts)
|
|
240
|
+
|
|
241
|
+
When persistence is enabled, TuFT validates the current `AppConfig` against the stored configuration signature **before** launching the server. This is designed to prevent a restart from silently interpreting old state with a new incompatible config.
|
|
242
|
+
|
|
243
|
+
If a mismatch is detected, TuFT aborts startup with a fatal error and shows a diff.
|
|
244
|
+
|
|
245
|
+
### Phase 1: in-memory restore from persistence backend (controller construction)
|
|
246
|
+
|
|
247
|
+
On process start, these components restore their in-memory registries by scanning their key prefixes and deserializing records:
|
|
248
|
+
|
|
249
|
+
- `SessionManager` restores sessions.
|
|
250
|
+
- `TrainingController` restores training runs + checkpoint records.
|
|
251
|
+
- If a training run references a `base_model` not present in the current config, it is marked **corrupted**.
|
|
252
|
+
- `SamplingController` restores sampling sessions.
|
|
253
|
+
- Sampling sessions whose `base_model` is no longer supported are deleted from storage.
|
|
254
|
+
- `FutureStore` restores futures.
|
|
255
|
+
- Completed futures (`ready` / `failed`) are immediately marked as completed.
|
|
256
|
+
- Restored futures also rebuild `future_id` allocation state to keep ordering monotonic.
|
|
257
|
+
|
|
258
|
+
At the end of Phase 1, TuFT has restored *metadata*, but model runtime state (adapters/weights in GPU memory) is not yet reconstructed.
|
|
259
|
+
|
|
260
|
+
### Phase 2: checkpoint-based recovery (async init)
|
|
261
|
+
|
|
262
|
+
After controller restore, TuFT performs checkpoint-based recovery:
|
|
263
|
+
|
|
264
|
+
- For each training run that is not corrupted and has a usable backend:
|
|
265
|
+
- load the **latest checkpoint** on disk (and recreate adapter state)
|
|
266
|
+
- treat that checkpoint as the server's recovery boundary
|
|
267
|
+
- Futures are reconciled against this boundary:
|
|
268
|
+
- if a training run has a valid latest checkpoint with `future_id = F`, **all futures for that run with `future_id > F` are marked failed** (and must be retried)
|
|
269
|
+
- if no checkpoint exists, **all futures for that run are marked failed**
|
|
270
|
+
|
|
271
|
+
This means TuFT guarantees:
|
|
272
|
+
|
|
273
|
+
- **Training can continue from the latest checkpoint**, but
|
|
274
|
+
- any operations after that checkpoint are considered unsafe and require retries.
|
|
275
|
+
|
|
276
|
+
> Important: sequence IDs remain **monotonically increasing** even across restarts. TuFT does not "rewind" `next_seq_id` to the checkpoint boundary, by design.
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## Safety checks
|
|
281
|
+
|
|
282
|
+
TuFT includes a restart-safety check to prevent silent corruption when configuration changes across restarts.
|
|
283
|
+
|
|
284
|
+
### Config signature validation (restart safety)
|
|
285
|
+
|
|
286
|
+
TuFT stores a `ConfigSignature` derived from `AppConfig.get_config_for_persistence()` (notably excluding the persistence config itself).
|
|
287
|
+
|
|
288
|
+
On startup, TuFT compares the selected fields of the current config against the stored signature. `SUPPORTED_MODELS` is always compared; additional fields can be enabled via `check_fields`. The available fields are:
|
|
289
|
+
|
|
290
|
+
- `SUPPORTED_MODELS` (always checked; mandatory)
|
|
291
|
+
- `CHECKPOINT_DIR`
|
|
292
|
+
- `MODEL_OWNER`
|
|
293
|
+
- `TOY_BACKEND_SEED`
|
|
294
|
+
- `AUTHORIZED_USERS`
|
|
295
|
+
- `TELEMETRY`
|
|
296
|
+
|
|
297
|
+
If validation fails, startup aborts and prints a diff. This avoids cases where an old state references models no longer configured, or a new deployment accidentally points at an old namespace.
|