tuft 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tuft-0.1.3/.github/workflows/docker/docker-compose.yml +57 -0
- {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/install-script.yml +51 -46
- {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/unittest.yml +4 -5
- {tuft-0.1.1 → tuft-0.1.3}/PKG-INFO +10 -66
- {tuft-0.1.1 → tuft-0.1.3}/README.md +8 -64
- {tuft-0.1.1 → tuft-0.1.3}/config/tuft_config.example.yaml +24 -9
- {tuft-0.1.1 → tuft-0.1.3}/docker/Dockerfile +8 -10
- tuft-0.1.3/docs/persistence.md +297 -0
- tuft-0.1.3/docs/telemetry.md +286 -0
- {tuft-0.1.1 → tuft-0.1.3}/pyproject.toml +2 -2
- {tuft-0.1.1 → tuft-0.1.3}/scripts/install.sh +65 -150
- tuft-0.1.3/src/tuft/__main__.py +7 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/hf_training_model.py +184 -64
- tuft-0.1.3/src/tuft/cli.py +244 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/config.py +63 -59
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/exceptions.py +66 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/futures.py +22 -2
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/__init__.py +33 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/__init__.py +10 -2
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/redis_store.py +352 -31
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/sampling_controller.py +37 -11
- tuft-0.1.3/src/tuft/sequence_executor.py +72 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/server.py +9 -2
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/state.py +3 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/training_controller.py +20 -5
- {tuft-0.1.1 → tuft-0.1.3}/tests/conftest.py +3 -3
- {tuft-0.1.1 → tuft-0.1.3}/tests/helpers.py +24 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_cli.py +1 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_futures.py +46 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_integration.py +3 -7
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_integration_persistence.py +5 -8
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_loss_fn.py +42 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_persistence.py +419 -3
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_sampling_backend.py +3 -1
- tuft-0.1.3/tests/test_sequence_executor.py +81 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_server.py +3 -1
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_state_controllers.py +23 -23
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_telemetry.py +4 -7
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_training_backend.py +15 -0
- tuft-0.1.1/.github/workflows/docker/docker-compose.yml +0 -72
- tuft-0.1.1/src/tuft/cli.py +0 -91
- {tuft-0.1.1 → tuft-0.1.3}/.gitattributes +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/checks.yml +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/docker.yml +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.github/workflows/publish.yml +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.gitignore +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.gitmodules +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.pre-commit-config.yaml +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.python-version +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/.secrets.baseline +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/LICENSE +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/assets/countdown_rl.png +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/assets/test_nll_sft.png +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/assets/train_mean_nll_sft.png +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/assets/tuft-logo-colorful.svg +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/docs/chat_sft.md +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/docs/countdown_rl.md +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/docs/how_to_write_tests.md +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/examples/chat_sft.ipynb +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/examples/countdown_rl.ipynb +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/scripts/install_flash_attn.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/__init__.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/auth.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backend.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/__init__.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/base_backend.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/sampling_backend.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/backends/training_backend.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/checkpoints.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/cispo.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/cross_entropy.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/dro.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/importance_sampling.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/loss_fn/ppo.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/persistence/file_redis.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/__init__.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/metrics.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/provider.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/src/tuft/telemetry/tracing.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/__init__.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/data/models.yaml +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_checkpoints.py +0 -0
- {tuft-0.1.1 → tuft-0.1.3}/tests/test_file_redis.py +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
services:
|
|
2
|
+
# one Ray head node (tuft-node-1) plus a Redis service (tuft-redis)
|
|
3
|
+
tuft-node-1:
|
|
4
|
+
image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
|
|
5
|
+
pull_policy: never
|
|
6
|
+
command: bash -c "
|
|
7
|
+
chmod 1777 /tmp && apt update && apt install -y --no-install-recommends \
|
|
8
|
+
build-essential \
|
|
9
|
+
curl git wget vim tmux net-tools \
|
|
10
|
+
python3 python3-pip python3-dev python3-packaging python3-venv \
|
|
11
|
+
libomp-dev infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
|
|
12
|
+
&& rm -rf /var/lib/apt/lists/* \
|
|
13
|
+
&& ln -sf /usr/bin/python3 /usr/bin/python \
|
|
14
|
+
&& ln -sf /usr/bin/pip3 /usr/bin/pip \
|
|
15
|
+
&& bash /workspace/scripts/install.sh --local-source /workspace \
|
|
16
|
+
&& source /root/.local/bin/env \
|
|
17
|
+
&& source /root/.tuft/venv/bin/activate \
|
|
18
|
+
&& uv pip install .[dev] \
|
|
19
|
+
&& ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block"
|
|
20
|
+
environment:
|
|
21
|
+
- HF_ENDPOINT=https://hf-mirror.com
|
|
22
|
+
- RAY_ADDRESS=auto
|
|
23
|
+
- TUFT_CHECKPOINT_DIR=/mnt/checkpoints
|
|
24
|
+
- TUFT_TEST_MODEL=/mnt/models/Qwen3-0.6B
|
|
25
|
+
- TUFT_TEST_MODEL_1=/mnt/models/Qwen3-0.6B
|
|
26
|
+
- TUFT_TEST_MODEL_2=/mnt/models/Qwen3-1.7B
|
|
27
|
+
- TUFT_DOCKER_UNITTEST=1
|
|
28
|
+
- TEST_REDIS_URL=redis://tuft-redis:6379
|
|
29
|
+
- VIRTUAL_ENV=/root/.tuft/venv
|
|
30
|
+
working_dir: /workspace
|
|
31
|
+
networks:
|
|
32
|
+
- tuft-network
|
|
33
|
+
volumes:
|
|
34
|
+
- tuft-volume:/mnt
|
|
35
|
+
- ../../..:/workspace
|
|
36
|
+
shm_size: "64G"
|
|
37
|
+
deploy:
|
|
38
|
+
resources:
|
|
39
|
+
reservations:
|
|
40
|
+
devices:
|
|
41
|
+
- driver: nvidia
|
|
42
|
+
device_ids: ['0', '1', '2', '3']
|
|
43
|
+
capabilities: [gpu]
|
|
44
|
+
|
|
45
|
+
tuft-redis:
|
|
46
|
+
image: redis:7.0
|
|
47
|
+
command: ["redis-server", "--save", "60", "1", "--loglevel", "warning"]
|
|
48
|
+
networks:
|
|
49
|
+
- tuft-network
|
|
50
|
+
|
|
51
|
+
networks:
|
|
52
|
+
tuft-network:
|
|
53
|
+
driver: bridge
|
|
54
|
+
|
|
55
|
+
volumes:
|
|
56
|
+
tuft-volume:
|
|
57
|
+
external: true
|
|
@@ -59,36 +59,42 @@ jobs:
|
|
|
59
59
|
env:
|
|
60
60
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
61
61
|
|
|
62
|
-
- name: Test tuft
|
|
62
|
+
- name: Test tuft launch --help
|
|
63
63
|
run: |
|
|
64
64
|
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
65
|
-
|
|
66
|
-
tuft 2>&1 | grep -q "\-\-config" || tuft 2>&1 | grep -q "config"
|
|
65
|
+
tuft launch --help
|
|
67
66
|
env:
|
|
68
67
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
69
68
|
|
|
70
|
-
- name:
|
|
71
|
-
run: rm -rf "${TUFT_HOME}"
|
|
72
|
-
env:
|
|
73
|
-
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
74
|
-
|
|
75
|
-
test-install-default-with-backend:
|
|
76
|
-
runs-on: ubuntu-latest
|
|
77
|
-
|
|
78
|
-
steps:
|
|
79
|
-
- name: Checkout code
|
|
80
|
-
uses: actions/checkout@v4
|
|
81
|
-
|
|
82
|
-
- name: Run install script (default includes backend)
|
|
69
|
+
- name: Test tuft launch requires config
|
|
83
70
|
run: |
|
|
84
|
-
|
|
71
|
+
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
72
|
+
# Should fail with config error when no config provided
|
|
73
|
+
if tuft launch 2>&1; then
|
|
74
|
+
echo "Expected tuft launch to fail without config"
|
|
75
|
+
exit 1
|
|
76
|
+
fi
|
|
77
|
+
# Verify error message mentions config
|
|
78
|
+
tuft launch 2>&1 | grep -qi "config"
|
|
85
79
|
env:
|
|
86
80
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
87
81
|
|
|
88
|
-
- name:
|
|
82
|
+
- name: Test tuft launch with config file
|
|
89
83
|
run: |
|
|
90
|
-
"${TUFT_HOME}/
|
|
91
|
-
|
|
84
|
+
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
85
|
+
# Create a minimal config file
|
|
86
|
+
cat > "${TUFT_HOME}/configs/tuft_config.yaml" << 'EOF'
|
|
87
|
+
model_owner: test
|
|
88
|
+
supported_models:
|
|
89
|
+
- model_name: test-model
|
|
90
|
+
model_path: /nonexistent/path
|
|
91
|
+
max_model_len: 1024
|
|
92
|
+
authorized_users:
|
|
93
|
+
test-key: test-user
|
|
94
|
+
EOF
|
|
95
|
+
# Launch should fail due to missing model, but get past config validation
|
|
96
|
+
# We just verify it doesn't fail on config parsing
|
|
97
|
+
tuft launch 2>&1 | grep -v "Configuration file must be provided" || true
|
|
92
98
|
env:
|
|
93
99
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
94
100
|
|
|
@@ -97,25 +103,23 @@ jobs:
|
|
|
97
103
|
env:
|
|
98
104
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
99
105
|
|
|
100
|
-
test-
|
|
106
|
+
test-backend-dependencies:
|
|
101
107
|
runs-on: ubuntu-latest
|
|
102
108
|
|
|
103
109
|
steps:
|
|
104
110
|
- name: Checkout code
|
|
105
111
|
uses: actions/checkout@v4
|
|
106
112
|
|
|
107
|
-
- name: Run install script
|
|
113
|
+
- name: Run install script
|
|
108
114
|
run: |
|
|
109
|
-
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
|
|
115
|
+
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
|
|
110
116
|
env:
|
|
111
117
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
112
118
|
|
|
113
|
-
- name: Verify
|
|
119
|
+
- name: Verify backend dependencies installed
|
|
114
120
|
run: |
|
|
115
|
-
|
|
116
|
-
"${TUFT_HOME}/venv/bin/python" -c "import
|
|
117
|
-
# tuft should still be importable
|
|
118
|
-
"${TUFT_HOME}/venv/bin/python" -c "import tuft; print('tuft imported successfully')"
|
|
121
|
+
"${TUFT_HOME}/venv/bin/python" -c "import peft; print('peft imported successfully')"
|
|
122
|
+
"${TUFT_HOME}/venv/bin/python" -c "import redis; print('redis imported successfully')"
|
|
119
123
|
env:
|
|
120
124
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
121
125
|
|
|
@@ -150,13 +154,20 @@ jobs:
|
|
|
150
154
|
env:
|
|
151
155
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
152
156
|
|
|
153
|
-
- name: Test upgrade command
|
|
157
|
+
- name: Test upgrade command (from PyPI)
|
|
154
158
|
run: |
|
|
155
159
|
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
156
160
|
tuft upgrade
|
|
157
161
|
env:
|
|
158
162
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
159
163
|
|
|
164
|
+
- name: Test upgrade command (from local source)
|
|
165
|
+
run: |
|
|
166
|
+
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
167
|
+
tuft upgrade --local-source "$GITHUB_WORKSPACE"
|
|
168
|
+
env:
|
|
169
|
+
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
170
|
+
|
|
160
171
|
- name: Clean up installation
|
|
161
172
|
run: rm -rf "${TUFT_HOME}"
|
|
162
173
|
env:
|
|
@@ -171,7 +182,7 @@ jobs:
|
|
|
171
182
|
|
|
172
183
|
- name: Initial install
|
|
173
184
|
run: |
|
|
174
|
-
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
|
|
185
|
+
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
|
|
175
186
|
env:
|
|
176
187
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
177
188
|
|
|
@@ -184,7 +195,7 @@ jobs:
|
|
|
184
195
|
|
|
185
196
|
- name: Reinstall with --clean
|
|
186
197
|
run: |
|
|
187
|
-
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --
|
|
198
|
+
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --clean
|
|
188
199
|
env:
|
|
189
200
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
190
201
|
|
|
@@ -202,36 +213,30 @@ jobs:
|
|
|
202
213
|
env:
|
|
203
214
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
204
215
|
|
|
205
|
-
test-
|
|
216
|
+
test-upgrade-from-source:
|
|
206
217
|
runs-on: ubuntu-latest
|
|
207
218
|
|
|
208
219
|
steps:
|
|
209
220
|
- name: Checkout code
|
|
210
221
|
uses: actions/checkout@v4
|
|
211
222
|
|
|
212
|
-
- name: Install
|
|
213
|
-
run:
|
|
214
|
-
bash scripts/install.sh --local-source "$GITHUB_WORKSPACE" --without-backend
|
|
215
|
-
env:
|
|
216
|
-
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
217
|
-
|
|
218
|
-
- name: Verify peft is NOT installed
|
|
219
|
-
run: |
|
|
220
|
-
"${TUFT_HOME}/venv/bin/python" -c "import peft" 2>&1 && exit 1 || echo "peft not installed (expected)"
|
|
223
|
+
- name: Install tuft
|
|
224
|
+
run: bash scripts/install.sh --local-source "$GITHUB_WORKSPACE"
|
|
221
225
|
env:
|
|
222
226
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
223
227
|
|
|
224
|
-
- name:
|
|
228
|
+
- name: Test upgrade --from-source
|
|
225
229
|
run: |
|
|
226
230
|
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
227
|
-
tuft
|
|
231
|
+
tuft upgrade --from-source
|
|
228
232
|
env:
|
|
229
233
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
230
234
|
|
|
231
|
-
- name: Verify
|
|
235
|
+
- name: Verify tuft still works after upgrade
|
|
232
236
|
run: |
|
|
233
|
-
"${TUFT_HOME}/
|
|
234
|
-
|
|
237
|
+
export PATH="${TUFT_HOME}/bin:$PATH"
|
|
238
|
+
tuft version
|
|
239
|
+
tuft launch --help
|
|
235
240
|
env:
|
|
236
241
|
TUFT_HOME: ${{ runner.temp }}/tuft
|
|
237
242
|
|
|
@@ -31,11 +31,10 @@ jobs:
|
|
|
31
31
|
- name: Check ray status
|
|
32
32
|
working-directory: tuft-${{ github.run_id }}/.github/workflows/docker
|
|
33
33
|
run: |
|
|
34
|
-
MAX_RETRIES=
|
|
35
|
-
RETRY_INTERVAL=
|
|
34
|
+
MAX_RETRIES=90
|
|
35
|
+
RETRY_INTERVAL=30
|
|
36
36
|
for i in $(seq 1 $MAX_RETRIES); do
|
|
37
|
-
if docker compose exec tuft-node-1 bash -c "source /
|
|
38
|
-
&& docker compose exec tuft-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then
|
|
37
|
+
if docker compose exec tuft-node-1 bash -c "source /root/.tuft/venv/bin/activate && ray status"; then
|
|
39
38
|
break
|
|
40
39
|
fi
|
|
41
40
|
echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)"
|
|
@@ -51,7 +50,7 @@ jobs:
|
|
|
51
50
|
# Set a GitHub env variable to indicate that the tests were run, so that subsequent steps can check it
|
|
52
51
|
run: |
|
|
53
52
|
echo "tests_run=true" >> $GITHUB_ENV
|
|
54
|
-
docker compose exec tuft-node-1 bash -c "source /
|
|
53
|
+
docker compose exec tuft-node-1 bash -c "source /root/.tuft/venv/bin/activate && pytest tests -v -s --gpu --basetemp /mnt/checkpoints --ctrf report.json"
|
|
55
54
|
|
|
56
55
|
- name: Convert report.json time to ms
|
|
57
56
|
working-directory: tuft-${{ github.run_id }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tuft
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
|
|
5
5
|
Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Requires-Python: >=3.11
|
|
|
29
29
|
Requires-Dist: fastapi>=0.125.0
|
|
30
30
|
Requires-Dist: httpx>=0.28.1
|
|
31
31
|
Requires-Dist: numpy<2.0.0
|
|
32
|
+
Requires-Dist: nvidia-ml-py>=13.0.0
|
|
32
33
|
Requires-Dist: omegaconf>=2.3.0
|
|
33
34
|
Requires-Dist: opentelemetry-api>=1.20.0
|
|
34
35
|
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
|
|
@@ -36,7 +37,6 @@ Requires-Dist: opentelemetry-instrumentation-fastapi>=0.41b0
|
|
|
36
37
|
Requires-Dist: opentelemetry-instrumentation-logging>=0.41b0
|
|
37
38
|
Requires-Dist: opentelemetry-sdk>=1.20.0
|
|
38
39
|
Requires-Dist: psutil>=5.9.0
|
|
39
|
-
Requires-Dist: pynvml>=11.5.0
|
|
40
40
|
Requires-Dist: ray>=2.50.0
|
|
41
41
|
Requires-Dist: tinker>=0.7.0
|
|
42
42
|
Requires-Dist: transformers<5.0.0,>=4.57.3
|
|
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
|
|
|
300
300
|
The CLI starts a FastAPI server:
|
|
301
301
|
|
|
302
302
|
```bash
|
|
303
|
-
tuft --port 10610 --config /path/to/tuft_config.yaml
|
|
303
|
+
tuft launch --port 10610 --config /path/to/tuft_config.yaml
|
|
304
304
|
```
|
|
305
305
|
|
|
306
306
|
The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
|
|
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
|
|
|
340
340
|
-p 10610:10610 \
|
|
341
341
|
-v <host_dir>:/data \
|
|
342
342
|
ghcr.io/agentscope-ai/tuft:latest \
|
|
343
|
-
tuft --port 10610 --config /data/tuft_config.yaml
|
|
343
|
+
tuft launch --port 10610 --config /data/tuft_config.yaml
|
|
344
344
|
```
|
|
345
345
|
|
|
346
346
|
Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
|
|
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
|
|
|
378
378
|
|
|
379
379
|
## Persistence
|
|
380
380
|
|
|
381
|
-
TuFT supports optional
|
|
382
|
-
the server can recover sessions, training runs, and pending futures after a restart.
|
|
381
|
+
TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
|
|
383
382
|
|
|
384
|
-
|
|
383
|
+
See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
|
|
385
384
|
|
|
386
385
|
```bash
|
|
387
|
-
uv pip install tuft[persistence]
|
|
386
|
+
uv pip install "tuft[persistence]"
|
|
388
387
|
```
|
|
389
388
|
|
|
390
|
-
### Persistence Modes
|
|
391
|
-
|
|
392
|
-
TuFT provides three persistence modes:
|
|
393
|
-
|
|
394
|
-
| Mode | Description | Use Case |
|
|
395
|
-
|------|-------------|----------|
|
|
396
|
-
| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
|
|
397
|
-
| `redis_url` | External Redis server | Production, multi-instance deployments |
|
|
398
|
-
| `file_redis` | File-backed store | Demos, small-scale testing |
|
|
399
|
-
|
|
400
|
-
### Configuration
|
|
401
|
-
|
|
402
|
-
Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
|
|
403
|
-
|
|
404
|
-
#### Mode 1: Disabled (Default)
|
|
405
|
-
|
|
406
|
-
No configuration needed. All data is stored in memory and lost on restart.
|
|
407
|
-
|
|
408
|
-
```yaml
|
|
409
|
-
# tuft_config.yaml
|
|
410
|
-
persistence:
|
|
411
|
-
mode: disabled
|
|
412
|
-
```
|
|
413
|
-
|
|
414
|
-
#### Mode 2: External Redis Server
|
|
415
|
-
|
|
416
|
-
Use an external Redis server for production deployments:
|
|
417
|
-
|
|
418
389
|
```yaml
|
|
419
390
|
# tuft_config.yaml
|
|
420
391
|
persistence:
|
|
421
|
-
mode:
|
|
392
|
+
mode: REDIS
|
|
422
393
|
redis_url: "redis://localhost:6379/0"
|
|
423
|
-
namespace: "tuft"
|
|
424
|
-
```
|
|
425
|
-
|
|
426
|
-
You can start a local Redis instance using Docker:
|
|
427
|
-
|
|
428
|
-
```bash
|
|
429
|
-
docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
|
|
430
|
-
```
|
|
431
|
-
|
|
432
|
-
#### Mode 3: File-backed Store
|
|
433
|
-
|
|
434
|
-
Use the file-backed store for demos or small-scale testing:
|
|
435
|
-
|
|
436
|
-
```yaml
|
|
437
|
-
# tuft_config.yaml
|
|
438
|
-
persistence:
|
|
439
|
-
mode: file_redis
|
|
440
|
-
file_path: "~/.cache/tuft/file_redis.json"
|
|
441
|
-
namespace: "tuft"
|
|
394
|
+
namespace: "persistence-tuft-server"
|
|
442
395
|
```
|
|
443
396
|
|
|
444
397
|
## Observability (OpenTelemetry)
|
|
445
398
|
|
|
446
|
-
TuFT supports optional OpenTelemetry integration for
|
|
447
|
-
This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
|
|
448
|
-
|
|
449
|
-
### Configuration
|
|
450
|
-
|
|
451
|
-
Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
|
|
399
|
+
TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
|
|
452
400
|
|
|
453
401
|
```yaml
|
|
454
402
|
# tuft_config.yaml
|
|
@@ -457,10 +405,6 @@ telemetry:
|
|
|
457
405
|
service_name: tuft
|
|
458
406
|
otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
|
|
459
407
|
resource_attributes: {}
|
|
460
|
-
# example:
|
|
461
|
-
# deployment.environment: production
|
|
462
|
-
# service.version: 1.0.0
|
|
463
|
-
# service.namespace: my-namespace
|
|
464
408
|
```
|
|
465
409
|
|
|
466
410
|
Alternatively, use environment variables:
|
|
@@ -238,7 +238,7 @@ uv pip install "tuft[dev,backend,persistence]"
|
|
|
238
238
|
The CLI starts a FastAPI server:
|
|
239
239
|
|
|
240
240
|
```bash
|
|
241
|
-
tuft --port 10610 --config /path/to/tuft_config.yaml
|
|
241
|
+
tuft launch --port 10610 --config /path/to/tuft_config.yaml
|
|
242
242
|
```
|
|
243
243
|
|
|
244
244
|
The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
|
|
@@ -278,7 +278,7 @@ you can use the pre-built Docker image.
|
|
|
278
278
|
-p 10610:10610 \
|
|
279
279
|
-v <host_dir>:/data \
|
|
280
280
|
ghcr.io/agentscope-ai/tuft:latest \
|
|
281
|
-
tuft --port 10610 --config /data/tuft_config.yaml
|
|
281
|
+
tuft launch --port 10610 --config /data/tuft_config.yaml
|
|
282
282
|
```
|
|
283
283
|
|
|
284
284
|
Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
|
|
@@ -316,77 +316,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
|
|
|
316
316
|
|
|
317
317
|
## Persistence
|
|
318
318
|
|
|
319
|
-
TuFT supports optional
|
|
320
|
-
the server can recover sessions, training runs, and pending futures after a restart.
|
|
319
|
+
TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
|
|
321
320
|
|
|
322
|
-
|
|
321
|
+
See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
|
|
323
322
|
|
|
324
323
|
```bash
|
|
325
|
-
uv pip install tuft[persistence]
|
|
324
|
+
uv pip install "tuft[persistence]"
|
|
326
325
|
```
|
|
327
326
|
|
|
328
|
-
### Persistence Modes
|
|
329
|
-
|
|
330
|
-
TuFT provides three persistence modes:
|
|
331
|
-
|
|
332
|
-
| Mode | Description | Use Case |
|
|
333
|
-
|------|-------------|----------|
|
|
334
|
-
| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
|
|
335
|
-
| `redis_url` | External Redis server | Production, multi-instance deployments |
|
|
336
|
-
| `file_redis` | File-backed store | Demos, small-scale testing |
|
|
337
|
-
|
|
338
|
-
### Configuration
|
|
339
|
-
|
|
340
|
-
Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
|
|
341
|
-
|
|
342
|
-
#### Mode 1: Disabled (Default)
|
|
343
|
-
|
|
344
|
-
No configuration needed. All data is stored in memory and lost on restart.
|
|
345
|
-
|
|
346
|
-
```yaml
|
|
347
|
-
# tuft_config.yaml
|
|
348
|
-
persistence:
|
|
349
|
-
mode: disabled
|
|
350
|
-
```
|
|
351
|
-
|
|
352
|
-
#### Mode 2: External Redis Server
|
|
353
|
-
|
|
354
|
-
Use an external Redis server for production deployments:
|
|
355
|
-
|
|
356
327
|
```yaml
|
|
357
328
|
# tuft_config.yaml
|
|
358
329
|
persistence:
|
|
359
|
-
mode:
|
|
330
|
+
mode: REDIS
|
|
360
331
|
redis_url: "redis://localhost:6379/0"
|
|
361
|
-
namespace: "tuft"
|
|
362
|
-
```
|
|
363
|
-
|
|
364
|
-
You can start a local Redis instance using Docker:
|
|
365
|
-
|
|
366
|
-
```bash
|
|
367
|
-
docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
|
|
368
|
-
```
|
|
369
|
-
|
|
370
|
-
#### Mode 3: File-backed Store
|
|
371
|
-
|
|
372
|
-
Use the file-backed store for demos or small-scale testing:
|
|
373
|
-
|
|
374
|
-
```yaml
|
|
375
|
-
# tuft_config.yaml
|
|
376
|
-
persistence:
|
|
377
|
-
mode: file_redis
|
|
378
|
-
file_path: "~/.cache/tuft/file_redis.json"
|
|
379
|
-
namespace: "tuft"
|
|
332
|
+
namespace: "persistence-tuft-server"
|
|
380
333
|
```
|
|
381
334
|
|
|
382
335
|
## Observability (OpenTelemetry)
|
|
383
336
|
|
|
384
|
-
TuFT supports optional OpenTelemetry integration for
|
|
385
|
-
This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
|
|
386
|
-
|
|
387
|
-
### Configuration
|
|
388
|
-
|
|
389
|
-
Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
|
|
337
|
+
TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
|
|
390
338
|
|
|
391
339
|
```yaml
|
|
392
340
|
# tuft_config.yaml
|
|
@@ -395,10 +343,6 @@ telemetry:
|
|
|
395
343
|
service_name: tuft
|
|
396
344
|
otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
|
|
397
345
|
resource_attributes: {}
|
|
398
|
-
# example:
|
|
399
|
-
# deployment.environment: production
|
|
400
|
-
# service.version: 1.0.0
|
|
401
|
-
# service.namespace: my-namespace
|
|
402
346
|
```
|
|
403
347
|
|
|
404
348
|
Alternatively, use environment variables:
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# Copy this file to your desired location and modify as needed.
|
|
5
5
|
#
|
|
6
6
|
# Usage:
|
|
7
|
-
# tuft --config /path/to/your/tuft_config.yaml
|
|
7
|
+
# tuft launch --config /path/to/your/tuft_config.yaml
|
|
8
8
|
|
|
9
9
|
# =============================================================================
|
|
10
10
|
# Checkpoint Directory
|
|
@@ -79,23 +79,38 @@ authorized_users:
|
|
|
79
79
|
# Persistence Configuration
|
|
80
80
|
# =============================================================================
|
|
81
81
|
# Configure state persistence for recovery after server restart.
|
|
82
|
+
# For detailed documentation, see the "Persistence" section in README.md.
|
|
82
83
|
#
|
|
83
84
|
# Available modes:
|
|
84
|
-
# -
|
|
85
|
-
# -
|
|
86
|
-
# -
|
|
85
|
+
# - DISABLE: No persistence (default)
|
|
86
|
+
# - REDIS: External Redis server
|
|
87
|
+
# - FILE: File-backed store
|
|
87
88
|
|
|
88
89
|
persistence:
|
|
89
|
-
mode:
|
|
90
|
+
mode: DISABLE # Options: DISABLE, REDIS, FILE
|
|
90
91
|
|
|
91
|
-
# For
|
|
92
|
+
# For REDIS mode:
|
|
92
93
|
# redis_url: "redis://localhost:6379/0"
|
|
93
94
|
|
|
94
|
-
# For
|
|
95
|
+
# For FILE mode:
|
|
95
96
|
# file_path: "~/.cache/tuft/file_redis.json"
|
|
96
97
|
|
|
97
|
-
# Namespace prefix for Redis keys (optional)
|
|
98
|
-
# namespace: "tuft"
|
|
98
|
+
# Namespace prefix for Redis keys (optional; defaults to "persistence-tuft-server").
|
|
99
|
+
# namespace: "persistence-tuft-server"
|
|
100
|
+
|
|
101
|
+
# TTL (Time-To-Live) for future records in seconds.
|
|
102
|
+
# Futures are short-lived async operation results that expire after this duration.
|
|
103
|
+
# Set to null for no expiry (not recommended for production).
|
|
104
|
+
# Default: 86400 (1 day)
|
|
105
|
+
# future_ttl_seconds: 86400
|
|
106
|
+
|
|
107
|
+
# Fields to validate on server restart for config consistency.
|
|
108
|
+
# For detailed documentation on available fields and config validation,
|
|
109
|
+
# see the "Configuration Validation" section in README.md.
|
|
110
|
+
# Defaults to ["SUPPORTED_MODELS"]. SUPPORTED_MODELS is always checked.
|
|
111
|
+
# check_fields:
|
|
112
|
+
# - SUPPORTED_MODELS
|
|
113
|
+
# - CHECKPOINT_DIR
|
|
99
114
|
|
|
100
115
|
# =============================================================================
|
|
101
116
|
# Telemetry Configuration (OpenTelemetry)
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
#
|
|
8
8
|
# Note:
|
|
9
9
|
# This Dockerfile uses 'uv' to create a virtual environment for better package management.
|
|
10
|
-
# The uv virtual environment is created at `/
|
|
10
|
+
# The uv virtual environment is created at `/root/.tuft/venv`; run `source /root/.tuft/venv/bin/activate` to activate it.
|
|
11
11
|
# Make sure to use `uv pip` to install packages within the virtual environment.
|
|
12
12
|
|
|
13
13
|
FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
|
|
@@ -23,13 +23,14 @@ RUN chmod 1777 /tmp && apt update && apt install -y --no-install-recommends \
|
|
|
23
23
|
&& ln -sf /usr/bin/python3 /usr/bin/python \
|
|
24
24
|
&& ln -sf /usr/bin/pip3 /usr/bin/pip
|
|
25
25
|
|
|
26
|
-
ENV VIRTUAL_ENV=/
|
|
26
|
+
ENV VIRTUAL_ENV=/root/.tuft/venv
|
|
27
27
|
|
|
28
28
|
# copy the TuFT dir into the workspace
|
|
29
29
|
COPY ./pyproject.toml .
|
|
30
30
|
COPY ./LICENSE .
|
|
31
31
|
COPY ./README.md .
|
|
32
32
|
COPY ./src ./src
|
|
33
|
+
COPY ./scripts ./scripts
|
|
33
34
|
|
|
34
35
|
# Uncomment the following line if you want to use AliCloud Mirror to speed up pip install
|
|
35
36
|
# ENV UV_DEFAULT_INDEX=http://mirrors.cloud.aliyuncs.com/pypi/simple/
|
|
@@ -37,14 +38,11 @@ COPY ./src ./src
|
|
|
37
38
|
# Uncomment the following line to use a Hugging Face mirror if you have network connection problem with Hugging Face
|
|
38
39
|
# ENV HF_ENDPOINT=https://hf-mirror.com
|
|
39
40
|
|
|
40
|
-
# Install
|
|
41
|
-
RUN
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
# Install flash_attn
|
|
47
|
-
RUN . ${VIRTUAL_ENV}/bin/activate && uv pip install flash_attn==2.8.1 --no-build-isolation
|
|
41
|
+
# Install
|
|
42
|
+
RUN bash ./scripts/install.sh --local-source /workspace \
|
|
43
|
+
&& . $HOME/.local/bin/env \
|
|
44
|
+
&& . /root/.tuft/venv/bin/activate \
|
|
45
|
+
&& uv pip install .[dev]
|
|
48
46
|
|
|
49
47
|
ENTRYPOINT ["/bin/bash", "-c", "source ${VIRTUAL_ENV}/bin/activate && exec \"$@\"", "--"]
|
|
50
48
|
CMD ["bash"]
|