tuft 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. tuft-0.1.3/.github/workflows/docker/docker-compose.yml +57 -0
  2. {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/install-script.yml +51 -46
  3. {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/unittest.yml +4 -5
  4. {tuft-0.1.1 → tuft-0.1.3}/PKG-INFO +10 -66
  5. {tuft-0.1.1 → tuft-0.1.3}/README.md +8 -64
  6. {tuft-0.1.1 → tuft-0.1.3}/config/tuft_config.example.yaml +24 -9
  7. {tuft-0.1.1 → tuft-0.1.3}/docker/Dockerfile +8 -10
  8. tuft-0.1.3/docs/persistence.md +297 -0
  9. tuft-0.1.3/docs/telemetry.md +286 -0
  10. {tuft-0.1.1 → tuft-0.1.3}/pyproject.toml +2 -2
  11. {tuft-0.1.1 → tuft-0.1.3}/scripts/install.sh +65 -150
  12. tuft-0.1.3/src/tuft/__main__.py +7 -0
  13. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/hf_training_model.py +184 -64
  14. tuft-0.1.3/src/tuft/cli.py +244 -0
  15. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/config.py +63 -59
  16. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/exceptions.py +66 -0
  17. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/futures.py +22 -2
  18. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/__init__.py +33 -0
  19. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/__init__.py +10 -2
  20. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/redis_store.py +352 -31
  21. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/sampling_controller.py +37 -11
  22. tuft-0.1.3/src/tuft/sequence_executor.py +72 -0
  23. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/server.py +9 -2
  24. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/state.py +3 -0
  25. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/training_controller.py +20 -5
  26. {tuft-0.1.1 → tuft-0.1.3}/tests/conftest.py +3 -3
  27. {tuft-0.1.1 → tuft-0.1.3}/tests/helpers.py +24 -0
  28. {tuft-0.1.1 → tuft-0.1.3}/tests/test_cli.py +1 -0
  29. {tuft-0.1.1 → tuft-0.1.3}/tests/test_futures.py +46 -0
  30. {tuft-0.1.1 → tuft-0.1.3}/tests/test_integration.py +3 -7
  31. {tuft-0.1.1 → tuft-0.1.3}/tests/test_integration_persistence.py +5 -8
  32. {tuft-0.1.1 → tuft-0.1.3}/tests/test_loss_fn.py +42 -0
  33. {tuft-0.1.1 → tuft-0.1.3}/tests/test_persistence.py +419 -3
  34. {tuft-0.1.1 → tuft-0.1.3}/tests/test_sampling_backend.py +3 -1
  35. tuft-0.1.3/tests/test_sequence_executor.py +81 -0
  36. {tuft-0.1.1 → tuft-0.1.3}/tests/test_server.py +3 -1
  37. {tuft-0.1.1 → tuft-0.1.3}/tests/test_state_controllers.py +23 -23
  38. {tuft-0.1.1 → tuft-0.1.3}/tests/test_telemetry.py +4 -7
  39. {tuft-0.1.1 → tuft-0.1.3}/tests/test_training_backend.py +15 -0
  40. tuft-0.1.1/.github/workflows/docker/docker-compose.yml +0 -72
  41. tuft-0.1.1/src/tuft/cli.py +0 -91
  42. {tuft-0.1.1 → tuft-0.1.3}/.gitattributes +0 -0
  43. {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/checks.yml +0 -0
  44. {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/docker.yml +0 -0
  45. {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/publish.yml +0 -0
  46. {tuft-0.1.1 → tuft-0.1.3}/.gitignore +0 -0
  47. {tuft-0.1.1 → tuft-0.1.3}/.gitmodules +0 -0
  48. {tuft-0.1.1 → tuft-0.1.3}/.pre-commit-config.yaml +0 -0
  49. {tuft-0.1.1 → tuft-0.1.3}/.python-version +0 -0
  50. {tuft-0.1.1 → tuft-0.1.3}/.secrets.baseline +0 -0
  51. {tuft-0.1.1 → tuft-0.1.3}/LICENSE +0 -0
  52. {tuft-0.1.1 → tuft-0.1.3}/assets/countdown_rl.png +0 -0
  53. {tuft-0.1.1 → tuft-0.1.3}/assets/test_nll_sft.png +0 -0
  54. {tuft-0.1.1 → tuft-0.1.3}/assets/train_mean_nll_sft.png +0 -0
  55. {tuft-0.1.1 → tuft-0.1.3}/assets/tuft-logo-colorful.svg +0 -0
  56. {tuft-0.1.1 → tuft-0.1.3}/docs/chat_sft.md +0 -0
  57. {tuft-0.1.1 → tuft-0.1.3}/docs/countdown_rl.md +0 -0
  58. {tuft-0.1.1 → tuft-0.1.3}/docs/how_to_write_tests.md +0 -0
  59. {tuft-0.1.1 → tuft-0.1.3}/examples/chat_sft.ipynb +0 -0
  60. {tuft-0.1.1 → tuft-0.1.3}/examples/countdown_rl.ipynb +0 -0
  61. {tuft-0.1.1 → tuft-0.1.3}/scripts/install_flash_attn.py +0 -0
  62. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/__init__.py +0 -0
  63. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/auth.py +0 -0
  64. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backend.py +0 -0
  65. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/__init__.py +0 -0
  66. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/base_backend.py +0 -0
  67. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/sampling_backend.py +0 -0
  68. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/training_backend.py +0 -0
  69. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/checkpoints.py +0 -0
  70. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/cispo.py +0 -0
  71. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/cross_entropy.py +0 -0
  72. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/dro.py +0 -0
  73. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/importance_sampling.py +0 -0
  74. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/ppo.py +0 -0
  75. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/file_redis.py +0 -0
  76. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/__init__.py +0 -0
  77. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/metrics.py +0 -0
  78. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/provider.py +0 -0
  79. {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/tracing.py +0 -0
  80. {tuft-0.1.1 → tuft-0.1.3}/tests/__init__.py +0 -0
  81. {tuft-0.1.1 → tuft-0.1.3}/tests/data/models.yaml +0 -0
  82. {tuft-0.1.1 → tuft-0.1.3}/tests/test_checkpoints.py +0 -0
  83. {tuft-0.1.1 → tuft-0.1.3}/tests/test_file_redis.py +0 -0
@@ -0,0 +1,57 @@
1
+ services:
2
+ # use 2 nodes to simulate a cluster environment
3
+ tuft-node-1:
4
+ image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
5
+ pull_policy: never
6
+ command: bash -c "
7
+ chmod 1777 /tmp && apt update && apt install -y --no-install-recommends \
8
+ build-essential \
9
+ curl git wget vim tmux net-tools \
10
+ python3 python3-pip python3-dev python3-packaging python3-venv \
11
+ libomp-dev infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
12
+ && rm -rf /var/lib/apt/lists/* \
13
+ && ln -sf /usr/bin/python3 /usr/bin/python \
14
+ && ln -sf /usr/bin/pip3 /usr/bin/pip \
15
+ && bash /workspace/scripts/install.sh --local-source /workspace \
16
+ && source /root/.local/bin/env \
17
+ && source /root/.tuft/venv/bin/activate \
18
+ && uv pip install .[dev] \
19
+ && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block"
20
+ environment:
21
+ - HF_ENDPOINT=https://hf-mirror.com
22
+ - RAY_ADDRESS=auto
23
+ - TUFT_CHECKPOINT_DIR=/mnt/checkpoints
24
+ - TUFT_TEST_MODEL=/mnt/models/Qwen3-0.6B
25
+ - TUFT_TEST_MODEL_1=/mnt/models/Qwen3-0.6B
26
+ - TUFT_TEST_MODEL_2=/mnt/models/Qwen3-1.7B
27
+ - TUFT_DOCKER_UNITTEST=1
28
+ - TEST_REDIS_URL=redis://tuft-redis:6379
29
+ - VIRTUAL_ENV=/root/.tuft/venv
30
+ working_dir: /workspace
31
+ networks:
32
+ - tuft-network
33
+ volumes:
34
+ - tuft-volume:/mnt
35
+ - ../../..:/workspace
36
+ shm_size: "64G"
37
+ deploy:
38
+ resources:
39
+ reservations:
40
+ devices:
41
+ - driver: nvidia
42
+ device_ids: ['0', '1', '2', '3']
43
+ capabilities: [gpu]
44
+
45
+ tuft-redis:
46
+ image: redis:7.0
47
+ command: ["redis-server", "--save", "60", "1", "--loglevel", "warning"]
48
+ networks:
49
+ - tuft-network
50
+
51
+ networks:
52
+ tuft-network:
53
+ driver: bridge
54
+
55
+ volumes:
56
+ tuft-volume:
57
+ external: true
@@ -59,36 +59,42 @@ jobs:
59
59
  env:
60
60
  TUFT_HOME: ${{ runner.temp }}/tuft
61
61
 
62
- - name: Test tuft (dry run - check config error)
62
+ - name: Test tuft launch --help
63
63
  run: |
64
64
  export PATH="${TUFT_HOME}/bin:$PATH"
65
- # Should fail with config error, not import error
66
- tuft 2>&1 | grep -q "\-\-config" || tuft 2>&1 | grep -q "config"
65
+ tuft launch --help
67
66
  env:
68
67
  TUFT_HOME: ${{ runner.temp }}/tuft
69
68
 
70
- - name: Clean up installation
71
- run: rm -rf "${TUFT_HOME}"
72
- env:
73
- TUFT_HOME: ${{ runner.temp }}/tuft
74
-
75
- test-install-default-with-backend:
76
- runs-on: ubuntu-latest
77
-
78
- steps:
79
- - name: Checkout code
80
- uses: actions/checkout@v4
81
-
82
- - name: Run install script (default includes backend)
69
+ - name: Test tuft launch requires config
83
70
  run: |
84
- bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
71
+ export PATH="${TUFT_HOME}/bin:$PATH"
72
+ # Should fail with config error when no config provided
73
+ if tuft launch 2>&1; then
74
+ echo "Expected tuft launch to fail without config"
75
+ exit 1
76
+ fi
77
+ # Verify error message mentions config
78
+ tuft launch 2>&1 | grep -qi "config"
85
79
  env:
86
80
  TUFT_HOME: ${{ runner.temp }}/tuft
87
81
 
88
- - name: Verify backend dependencies installed
82
+ - name: Test tuft launch with config file
89
83
  run: |
90
- "${TUFT_HOME}/venv/bin/python" -c "import peft; print('peft imported successfully')"
91
- "${TUFT_HOME}/venv/bin/python" -c "import redis; print('redis imported successfully')"
84
+ export PATH="${TUFT_HOME}/bin:$PATH"
85
+ # Create a minimal config file
86
+ cat > "${TUFT_HOME}/configs/tuft_config.yaml" << 'EOF'
87
+ model_owner: test
88
+ supported_models:
89
+ - model_name: test-model
90
+ model_path: /nonexistent/path
91
+ max_model_len: 1024
92
+ authorized_users:
93
+ test-key: test-user
94
+ EOF
95
+ # Launch should fail due to missing model, but get past config validation
96
+ # We just verify it doesn't fail on config parsing
97
+ tuft launch 2>&1 | grep -v "Configuration file must be provided" || true
92
98
  env:
93
99
  TUFT_HOME: ${{ runner.temp }}/tuft
94
100
 
@@ -97,25 +103,23 @@ jobs:
97
103
  env:
98
104
  TUFT_HOME: ${{ runner.temp }}/tuft
99
105
 
100
- test-install-without-backend:
106
+ test-backend-dependencies:
101
107
  runs-on: ubuntu-latest
102
108
 
103
109
  steps:
104
110
  - name: Checkout code
105
111
  uses: actions/checkout@v4
106
112
 
107
- - name: Run install script without backend
113
+ - name: Run install script
108
114
  run: |
109
- bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --without-backend
115
+ bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
110
116
  env:
111
117
  TUFT_HOME: ${{ runner.temp }}/tuft
112
118
 
113
- - name: Verify minimal install (no peft)
119
+ - name: Verify backend dependencies installed
114
120
  run: |
115
- # peft should NOT be installed in minimal mode
116
- "${TUFT_HOME}/venv/bin/python" -c "import peft" 2>&1 && exit 1 || echo "peft not installed (expected)"
117
- # tuft should still be importable
118
- "${TUFT_HOME}/venv/bin/python" -c "import tuft; print('tuft imported successfully')"
121
+ "${TUFT_HOME}/venv/bin/python" -c "import peft; print('peft imported successfully')"
122
+ "${TUFT_HOME}/venv/bin/python" -c "import redis; print('redis imported successfully')"
119
123
  env:
120
124
  TUFT_HOME: ${{ runner.temp }}/tuft
121
125
 
@@ -150,13 +154,20 @@ jobs:
150
154
  env:
151
155
  TUFT_HOME: ${{ runner.temp }}/tuft
152
156
 
153
- - name: Test upgrade command
157
+ - name: Test upgrade command (from PyPI)
154
158
  run: |
155
159
  export PATH="${TUFT_HOME}/bin:$PATH"
156
160
  tuft upgrade
157
161
  env:
158
162
  TUFT_HOME: ${{ runner.temp }}/tuft
159
163
 
164
+ - name: Test upgrade command (from local source)
165
+ run: |
166
+ export PATH="${TUFT_HOME}/bin:$PATH"
167
+ tuft upgrade --local-source "$GITHUB_WORKSPACE"
168
+ env:
169
+ TUFT_HOME: ${{ runner.temp }}/tuft
170
+
160
171
  - name: Clean up installation
161
172
  run: rm -rf "${TUFT_HOME}"
162
173
  env:
@@ -171,7 +182,7 @@ jobs:
171
182
 
172
183
  - name: Initial install
173
184
  run: |
174
- bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --without-backend
185
+ bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
175
186
  env:
176
187
  TUFT_HOME: ${{ runner.temp }}/tuft
177
188
 
@@ -184,7 +195,7 @@ jobs:
184
195
 
185
196
  - name: Reinstall with --clean
186
197
  run: |
187
- bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --without-backend --clean
198
+ bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --clean
188
199
  env:
189
200
  TUFT_HOME: ${{ runner.temp }}/tuft
190
201
 
@@ -202,36 +213,30 @@ jobs:
202
213
  env:
203
214
  TUFT_HOME: ${{ runner.temp }}/tuft
204
215
 
205
- test-install-backend-command:
216
+ test-upgrade-from-source:
206
217
  runs-on: ubuntu-latest
207
218
 
208
219
  steps:
209
220
  - name: Checkout code
210
221
  uses: actions/checkout@v4
211
222
 
212
- - name: Install without backend first
213
- run: |
214
- bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --without-backend
215
- env:
216
- TUFT_HOME: ${{ runner.temp }}/tuft
217
-
218
- - name: Verify peft is NOT installed
219
- run: |
220
- "${TUFT_HOME}/venv/bin/python" -c "import peft" 2>&1 && exit 1 || echo "peft not installed (expected)"
223
+ - name: Install tuft
224
+ run: bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
221
225
  env:
222
226
  TUFT_HOME: ${{ runner.temp }}/tuft
223
227
 
224
- - name: Run install-backend command
228
+ - name: Test upgrade --from-source
225
229
  run: |
226
230
  export PATH="${TUFT_HOME}/bin:$PATH"
227
- tuft install-backend
231
+ tuft upgrade --from-source
228
232
  env:
229
233
  TUFT_HOME: ${{ runner.temp }}/tuft
230
234
 
231
- - name: Verify backend dependencies now installed
235
+ - name: Verify tuft still works after upgrade
232
236
  run: |
233
- "${TUFT_HOME}/venv/bin/python" -c "import peft; print('peft imported successfully')"
234
- "${TUFT_HOME}/venv/bin/python" -c "import redis; print('redis imported successfully')"
237
+ export PATH="${TUFT_HOME}/bin:$PATH"
238
+ tuft version
239
+ tuft launch --help
235
240
  env:
236
241
  TUFT_HOME: ${{ runner.temp }}/tuft
237
242
 
@@ -31,11 +31,10 @@ jobs:
31
31
  - name: Check ray status
32
32
  working-directory: tuft-${{ github.run_id }}/.github/workflows/docker
33
33
  run: |
34
- MAX_RETRIES=20
35
- RETRY_INTERVAL=5
34
+ MAX_RETRIES=90
35
+ RETRY_INTERVAL=30
36
36
  for i in $(seq 1 $MAX_RETRIES); do
37
- if docker compose exec tuft-node-1 bash -c "source /opt/venv/bin/activate && ray status" \
38
- && docker compose exec tuft-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then
37
+ if docker compose exec tuft-node-1 bash -c "source /root/.tuft/venv/bin/activate && ray status"; then
39
38
  break
40
39
  fi
41
40
  echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)"
@@ -51,7 +50,7 @@ jobs:
51
50
  # set a github env variable to indicate tests were run, so that subsequent steps can check it
52
51
  run: |
53
52
  echo "tests_run=true" >> $GITHUB_ENV
54
- docker compose exec tuft-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --gpu --basetemp /mnt/checkpoints --ctrf report.json"
53
+ docker compose exec tuft-node-1 bash -c "source /root/.tuft/venv/bin/activate && pytest tests -v -s --gpu --basetemp /mnt/checkpoints --ctrf report.json"
55
54
 
56
55
  - name: Convert report.json time to ms
57
56
  working-directory: tuft-${{ github.run_id }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tuft
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
5
5
  Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
6
6
  License: MIT License
@@ -29,6 +29,7 @@ Requires-Python: >=3.11
29
29
  Requires-Dist: fastapi>=0.125.0
30
30
  Requires-Dist: httpx>=0.28.1
31
31
  Requires-Dist: numpy<2.0.0
32
+ Requires-Dist: nvidia-ml-py>=13.0.0
32
33
  Requires-Dist: omegaconf>=2.3.0
33
34
  Requires-Dist: opentelemetry-api>=1.20.0
34
35
  Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
@@ -36,7 +37,6 @@ Requires-Dist: opentelemetry-instrumentation-fastapi>=0.41b0
36
37
  Requires-Dist: opentelemetry-instrumentation-logging>=0.41b0
37
38
  Requires-Dist: opentelemetry-sdk>=1.20.0
38
39
  Requires-Dist: psutil>=5.9.0
39
- Requires-Dist: pynvml>=11.5.0
40
40
  Requires-Dist: ray>=2.50.0
41
41
  Requires-Dist: tinker>=0.7.0
42
42
  Requires-Dist: transformers<5.0.0,>=4.57.3
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
300
300
  The CLI starts a FastAPI server:
301
301
 
302
302
  ```bash
303
- tuft --port 10610 --config /path/to/tuft_config.yaml
303
+ tuft launch --port 10610 --config /path/to/tuft_config.yaml
304
304
  ```
305
305
 
306
306
  The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
340
340
  -p 10610:10610 \
341
341
  -v <host_dir>:/data \
342
342
  ghcr.io/agentscope-ai/tuft:latest \
343
- tuft --port 10610 --config /data/tuft_config.yaml
343
+ tuft launch --port 10610 --config /data/tuft_config.yaml
344
344
  ```
345
345
 
346
346
  Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
378
378
 
379
379
  ## Persistence
380
380
 
381
- TuFT supports optional Redis-based persistence for server state. When enabled,
382
- the server can recover sessions, training runs, and pending futures after a restart.
381
+ TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
383
382
 
384
- To use persistence, install the optional dependency:
383
+ See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
385
384
 
386
385
  ```bash
387
- uv pip install tuft[persistence]
386
+ uv pip install "tuft[persistence]"
388
387
  ```
389
388
 
390
- ### Persistence Modes
391
-
392
- TuFT provides three persistence modes:
393
-
394
- | Mode | Description | Use Case |
395
- |------|-------------|----------|
396
- | `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
397
- | `redis_url` | External Redis server | Production, multi-instance deployments |
398
- | `file_redis` | File-backed store | Demos, small-scale testing |
399
-
400
- ### Configuration
401
-
402
- Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
403
-
404
- #### Mode 1: Disabled (Default)
405
-
406
- No configuration needed. All data is stored in memory and lost on restart.
407
-
408
- ```yaml
409
- # tuft_config.yaml
410
- persistence:
411
- mode: disabled
412
- ```
413
-
414
- #### Mode 2: External Redis Server
415
-
416
- Use an external Redis server for production deployments:
417
-
418
389
  ```yaml
419
390
  # tuft_config.yaml
420
391
  persistence:
421
- mode: redis_url
392
+ mode: REDIS
422
393
  redis_url: "redis://localhost:6379/0"
423
- namespace: "tuft"
424
- ```
425
-
426
- You can start a local Redis instance using Docker:
427
-
428
- ```bash
429
- docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
430
- ```
431
-
432
- #### Mode 3: File-backed Store
433
-
434
- Use the file-backed store for demos or small-scale testing:
435
-
436
- ```yaml
437
- # tuft_config.yaml
438
- persistence:
439
- mode: file_redis
440
- file_path: "~/.cache/tuft/file_redis.json"
441
- namespace: "tuft"
394
+ namespace: "persistence-tuft-server"
442
395
  ```
443
396
 
444
397
  ## Observability (OpenTelemetry)
445
398
 
446
- TuFT supports optional OpenTelemetry integration for distributed tracing, metrics, and logging.
447
- This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
448
-
449
- ### Configuration
450
-
451
- Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
399
+ TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
452
400
 
453
401
  ```yaml
454
402
  # tuft_config.yaml
@@ -457,10 +405,6 @@ telemetry:
457
405
  service_name: tuft
458
406
  otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
459
407
  resource_attributes: {}
460
- # example:
461
- # deployment.environment: production
462
- # service.version: 1.0.0
463
- # service.namespace: my-namespace
464
408
  ```
465
409
 
466
410
  Alternatively, use environment variables:
@@ -238,7 +238,7 @@ uv pip install "tuft[dev,backend,persistence]"
238
238
  The CLI starts a FastAPI server:
239
239
 
240
240
  ```bash
241
- tuft --port 10610 --config /path/to/tuft_config.yaml
241
+ tuft launch --port 10610 --config /path/to/tuft_config.yaml
242
242
  ```
243
243
 
244
244
  The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
@@ -278,7 +278,7 @@ you can use the pre-built Docker image.
278
278
  -p 10610:10610 \
279
279
  -v <host_dir>:/data \
280
280
  ghcr.io/agentscope-ai/tuft:latest \
281
- tuft --port 10610 --config /data/tuft_config.yaml
281
+ tuft launch --port 10610 --config /data/tuft_config.yaml
282
282
  ```
283
283
 
284
284
  Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
@@ -316,77 +316,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
316
316
 
317
317
  ## Persistence
318
318
 
319
- TuFT supports optional Redis-based persistence for server state. When enabled,
320
- the server can recover sessions, training runs, and pending futures after a restart.
319
+ TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
321
320
 
322
- To use persistence, install the optional dependency:
321
+ See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
323
322
 
324
323
  ```bash
325
- uv pip install tuft[persistence]
324
+ uv pip install "tuft[persistence]"
326
325
  ```
327
326
 
328
- ### Persistence Modes
329
-
330
- TuFT provides three persistence modes:
331
-
332
- | Mode | Description | Use Case |
333
- |------|-------------|----------|
334
- | `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
335
- | `redis_url` | External Redis server | Production, multi-instance deployments |
336
- | `file_redis` | File-backed store | Demos, small-scale testing |
337
-
338
- ### Configuration
339
-
340
- Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
341
-
342
- #### Mode 1: Disabled (Default)
343
-
344
- No configuration needed. All data is stored in memory and lost on restart.
345
-
346
- ```yaml
347
- # tuft_config.yaml
348
- persistence:
349
- mode: disabled
350
- ```
351
-
352
- #### Mode 2: External Redis Server
353
-
354
- Use an external Redis server for production deployments:
355
-
356
327
  ```yaml
357
328
  # tuft_config.yaml
358
329
  persistence:
359
- mode: redis_url
330
+ mode: REDIS
360
331
  redis_url: "redis://localhost:6379/0"
361
- namespace: "tuft"
362
- ```
363
-
364
- You can start a local Redis instance using Docker:
365
-
366
- ```bash
367
- docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
368
- ```
369
-
370
- #### Mode 3: File-backed Store
371
-
372
- Use the file-backed store for demos or small-scale testing:
373
-
374
- ```yaml
375
- # tuft_config.yaml
376
- persistence:
377
- mode: file_redis
378
- file_path: "~/.cache/tuft/file_redis.json"
379
- namespace: "tuft"
332
+ namespace: "persistence-tuft-server"
380
333
  ```
381
334
 
382
335
  ## Observability (OpenTelemetry)
383
336
 
384
- TuFT supports optional OpenTelemetry integration for distributed tracing, metrics, and logging.
385
- This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
386
-
387
- ### Configuration
388
-
389
- Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
337
+ TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
390
338
 
391
339
  ```yaml
392
340
  # tuft_config.yaml
@@ -395,10 +343,6 @@ telemetry:
395
343
  service_name: tuft
396
344
  otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
397
345
  resource_attributes: {}
398
- # example:
399
- # deployment.environment: production
400
- # service.version: 1.0.0
401
- # service.namespace: my-namespace
402
346
  ```
403
347
 
404
348
  Alternatively, use environment variables:
@@ -4,7 +4,7 @@
4
4
  # Copy this file to your desired location and modify as needed.
5
5
  #
6
6
  # Usage:
7
- # tuft --config /path/to/your/tuft_config.yaml
7
+ # tuft launch --config /path/to/your/tuft_config.yaml
8
8
 
9
9
  # =============================================================================
10
10
  # Checkpoint Directory
@@ -79,23 +79,38 @@ authorized_users:
79
79
  # Persistence Configuration
80
80
  # =============================================================================
81
81
  # Configure state persistence for recovery after server restart.
82
+ # For detailed documentation, see the "Persistence" section in README.md.
82
83
  #
83
84
  # Available modes:
84
- # - disabled: No persistence (default)
85
- # - redis_url: External Redis server
86
- # - file_redis: File-backed store
85
+ # - DISABLE: No persistence (default)
86
+ # - REDIS: External Redis server
87
+ # - FILE: File-backed store
87
88
 
88
89
  persistence:
89
- mode: disabled # Options: disabled, redis_url, file_redis
90
+ mode: DISABLE # Options: DISABLE, REDIS, FILE
90
91
 
91
- # For redis_url mode:
92
+ # For REDIS mode:
92
93
  # redis_url: "redis://localhost:6379/0"
93
94
 
94
- # For file_redis mode:
95
+ # For FILE mode:
95
96
  # file_path: "~/.cache/tuft/file_redis.json"
96
97
 
97
- # Namespace prefix for Redis keys (optional)
98
- # namespace: "tuft"
98
+ # Namespace prefix for Redis keys. (optional, defaults to "persistence-tuft-server".)
99
+ # namespace: "persistence-tuft-server"
100
+
101
+ # TTL (Time-To-Live) for future records in seconds.
102
+ # Futures are short-lived async operation results that expire after this duration.
103
+ # Set to null for no expiry (not recommended for production).
104
+ # Default: 86400 (1 day)
105
+ # future_ttl_seconds: 86400
106
+
107
+ # Fields to validate on server restart for config consistency.
108
+ # For detailed documentation on available fields and config validation,
109
+ # see the "Configuration Validation" section in README.md.
110
+ # Defaults to ["SUPPORTED_MODELS"]. SUPPORTED_MODELS is always checked.
111
+ # check_fields:
112
+ # - SUPPORTED_MODELS
113
+ # - CHECKPOINT_DIR
99
114
 
100
115
  # =============================================================================
101
116
  # Telemetry Configuration (OpenTelemetry)
@@ -7,7 +7,7 @@
7
7
  #
8
8
  # Note:
9
9
  # This Dockerfile uses 'uv' to create a virtual environment for better package management.
10
- # The uv virtual environment is created at `/opt/venv`, use `source /opt/venv/bin/activate` to activate it.
10
+ # The uv virtual environment is created at `/root/.tuft/venv`, use `source /root/.tuft/venv/bin/activate` to activate it.
11
11
  # Make sure to use `uv pip` to install packages within the virtual environment.
12
12
 
13
13
  FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
@@ -23,13 +23,14 @@ RUN chmod 1777 /tmp && apt update && apt install -y --no-install-recommends \
23
23
  && ln -sf /usr/bin/python3 /usr/bin/python \
24
24
  && ln -sf /usr/bin/pip3 /usr/bin/pip
25
25
 
26
- ENV VIRTUAL_ENV=/opt/venv
26
+ ENV VIRTUAL_ENV=/root/.tuft/venv
27
27
 
28
28
  # copy the TuFT dir into the workspace
29
29
  COPY ./pyproject.toml .
30
30
  COPY ./LICENSE .
31
31
  COPY ./README.md .
32
32
  COPY ./src ./src
33
+ COPY ./scripts ./scripts
33
34
 
34
35
  # Uncomment the following line if you want to use AliCloud Mirror to speed up pip install
35
36
  # ENV UV_DEFAULT_INDEX=http://mirrors.cloud.aliyuncs.com/pypi/simple/
@@ -37,14 +38,11 @@ COPY ./src ./src
37
38
  # Uncomment the following line to use a Hugging Face mirror if you have network connection problem with Hugging Face
38
39
  # ENV HF_ENDPOINT=https://hf-mirror.com
39
40
 
40
- # Install uv
41
- RUN pip install uv && uv venv ${VIRTUAL_ENV} --python=python3.12
42
-
43
- # Install minimal TuFT
44
- RUN . ${VIRTUAL_ENV}/bin/activate && uv pip install -e .[dev,backend,persistence]
45
-
46
- # Install flash_attn
47
- RUN . ${VIRTUAL_ENV}/bin/activate && uv pip install flash_attn==2.8.1 --no-build-isolation
41
+ # Install
42
+ RUN bash ./scripts/install.sh --local-source /workspace \
43
+ && . $HOME/.local/bin/env \
44
+ && . /root/.tuft/venv/bin/activate \
45
+ && uv pip install .[dev]
48
46
 
49
47
  ENTRYPOINT ["/bin/bash", "-c", "source ${VIRTUAL_ENV}/bin/activate && exec \"$@\"", "--"]
50
48
  CMD ["bash"]