tuft 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tuft/__main__.py +7 -0
- tuft/backends/hf_training_model.py +184 -64
- tuft/cli.py +161 -8
- tuft/config.py +63 -59
- tuft/exceptions.py +66 -0
- tuft/futures.py +22 -2
- tuft/loss_fn/__init__.py +33 -0
- tuft/persistence/__init__.py +10 -2
- tuft/persistence/redis_store.py +352 -31
- tuft/sampling_controller.py +37 -11
- tuft/sequence_executor.py +72 -0
- tuft/server.py +9 -2
- tuft/state.py +3 -0
- tuft/training_controller.py +20 -5
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/METADATA +10 -66
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/RECORD +19 -17
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/WHEEL +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/entry_points.txt +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/licenses/LICENSE +0 -0
tuft/server.py
CHANGED
|
@@ -18,7 +18,7 @@ from tinker import types
|
|
|
18
18
|
from .auth import User
|
|
19
19
|
from .config import AppConfig
|
|
20
20
|
from .exceptions import TuFTException
|
|
21
|
-
from .persistence import get_redis_store
|
|
21
|
+
from .persistence import get_redis_store, save_config_signature
|
|
22
22
|
from .state import ServerState
|
|
23
23
|
from .telemetry import shutdown_telemetry
|
|
24
24
|
|
|
@@ -76,11 +76,19 @@ def _instrument_fastapi(app: FastAPI) -> None:
|
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
def create_root_app(config: AppConfig | None = None) -> FastAPI:
|
|
79
|
+
resolved_config = config or AppConfig()
|
|
80
|
+
|
|
79
81
|
@asynccontextmanager
|
|
80
82
|
async def lifespan(app: FastAPI):
|
|
81
83
|
try:
|
|
82
84
|
await app.state.server_state.async_init()
|
|
83
85
|
logger.info("Server initialized successfully")
|
|
86
|
+
|
|
87
|
+
# After successful init/restore, save the current config signature
|
|
88
|
+
if resolved_config.persistence.enabled:
|
|
89
|
+
save_config_signature(resolved_config)
|
|
90
|
+
logger.debug("Config signature saved after successful initialization")
|
|
91
|
+
|
|
84
92
|
yield
|
|
85
93
|
finally:
|
|
86
94
|
logger.info("Server shutting down")
|
|
@@ -95,7 +103,6 @@ def create_root_app(config: AppConfig | None = None) -> FastAPI:
|
|
|
95
103
|
route.dependencies = getattr(route, "dependencies", []) + [Depends(_get_user)]
|
|
96
104
|
return route
|
|
97
105
|
|
|
98
|
-
resolved_config = config or AppConfig()
|
|
99
106
|
if resolved_config.persistence.enabled:
|
|
100
107
|
store = get_redis_store()
|
|
101
108
|
store.configure(resolved_config.persistence)
|
tuft/state.py
CHANGED
|
@@ -135,7 +135,10 @@ class ServerState:
|
|
|
135
135
|
1. For each training run restored from Redis, create adapter and load latest checkpoint
|
|
136
136
|
2. Mark ALL futures created after checkpoint's future_id as failed
|
|
137
137
|
3. For training runs without checkpoints, mark all futures as failed
|
|
138
|
+
4. Mark all pending sample futures as failed
|
|
138
139
|
"""
|
|
140
|
+
self.future_store.mark_pending_sample_futures_failed()
|
|
141
|
+
|
|
139
142
|
# Restore training runs (adapter + checkpoint)
|
|
140
143
|
for model_id, record in self.training.training_runs.items():
|
|
141
144
|
if record.backend is None or record.corrupted:
|
tuft/training_controller.py
CHANGED
|
@@ -274,7 +274,8 @@ class TrainingController:
|
|
|
274
274
|
if seq_id is not None:
|
|
275
275
|
self._reserve_seq_id(record, seq_id)
|
|
276
276
|
# Save the updated next_seq_id to Redis
|
|
277
|
-
|
|
277
|
+
loop = asyncio.get_event_loop()
|
|
278
|
+
await loop.run_in_executor(None, self._save_training_run, record.training_run_id)
|
|
278
279
|
return await operation()
|
|
279
280
|
|
|
280
281
|
def _reserve_seq_id(self, record: TrainingRunRecord, seq_id: int) -> None:
|
|
@@ -314,7 +315,8 @@ class TrainingController:
|
|
|
314
315
|
)
|
|
315
316
|
await backend.create_adapter(model_id, lora_config)
|
|
316
317
|
self.training_runs[model_id] = record
|
|
317
|
-
|
|
318
|
+
loop = asyncio.get_event_loop()
|
|
319
|
+
await loop.run_in_executor(None, self._save_training_run, model_id)
|
|
318
320
|
|
|
319
321
|
# Update metrics
|
|
320
322
|
get_metrics().training_models_active.add(1, {"base_model": base_model})
|
|
@@ -508,6 +510,7 @@ class TrainingController:
|
|
|
508
510
|
logger.info("Checkpoint save begin: %s", checkpoint_id)
|
|
509
511
|
|
|
510
512
|
setattr(training_run, counter_attr, counter + 1)
|
|
513
|
+
assert self.config.checkpoint_dir is not None
|
|
511
514
|
checkpoint = CheckpointRecord.from_training_run(
|
|
512
515
|
training_run_id=training_run.training_run_id,
|
|
513
516
|
checkpoint_name=checkpoint_name,
|
|
@@ -540,7 +543,14 @@ class TrainingController:
|
|
|
540
543
|
|
|
541
544
|
# Save training run and checkpoint atomically to prevent inconsistency
|
|
542
545
|
# if server crashes between saves
|
|
543
|
-
|
|
546
|
+
loop = asyncio.get_event_loop()
|
|
547
|
+
await loop.run_in_executor(
|
|
548
|
+
None,
|
|
549
|
+
self._save_training_run_with_checkpoint,
|
|
550
|
+
model_id,
|
|
551
|
+
checkpoint_name,
|
|
552
|
+
checkpoint_type,
|
|
553
|
+
)
|
|
544
554
|
|
|
545
555
|
# Update metrics
|
|
546
556
|
metrics = get_metrics()
|
|
@@ -567,7 +577,11 @@ class TrainingController:
|
|
|
567
577
|
) -> None:
|
|
568
578
|
"""Load a checkpoint."""
|
|
569
579
|
try:
|
|
570
|
-
|
|
580
|
+
assert self.config.checkpoint_dir is not None
|
|
581
|
+
parsed_checkpoint = CheckpointRecord.from_tinker_path(
|
|
582
|
+
path,
|
|
583
|
+
self.config.checkpoint_dir,
|
|
584
|
+
)
|
|
571
585
|
except FileNotFoundError as exc:
|
|
572
586
|
raise CheckpointNotFoundException(checkpoint_id=model_id) from exc
|
|
573
587
|
source_model_id = parsed_checkpoint.training_run_id or model_id
|
|
@@ -717,7 +731,8 @@ class TrainingController:
|
|
|
717
731
|
except Exception: # pylint: disable=broad-except
|
|
718
732
|
# If loading fails, mark as corrupted
|
|
719
733
|
record.corrupted = True
|
|
720
|
-
|
|
734
|
+
loop = asyncio.get_event_loop()
|
|
735
|
+
await loop.run_in_executor(None, self._save_training_run, model_id)
|
|
721
736
|
return None
|
|
722
737
|
|
|
723
738
|
return latest_ckpt
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tuft
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
|
|
5
5
|
Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Requires-Python: >=3.11
|
|
|
29
29
|
Requires-Dist: fastapi>=0.125.0
|
|
30
30
|
Requires-Dist: httpx>=0.28.1
|
|
31
31
|
Requires-Dist: numpy<2.0.0
|
|
32
|
+
Requires-Dist: nvidia-ml-py>=13.0.0
|
|
32
33
|
Requires-Dist: omegaconf>=2.3.0
|
|
33
34
|
Requires-Dist: opentelemetry-api>=1.20.0
|
|
34
35
|
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
|
|
@@ -36,7 +37,6 @@ Requires-Dist: opentelemetry-instrumentation-fastapi>=0.41b0
|
|
|
36
37
|
Requires-Dist: opentelemetry-instrumentation-logging>=0.41b0
|
|
37
38
|
Requires-Dist: opentelemetry-sdk>=1.20.0
|
|
38
39
|
Requires-Dist: psutil>=5.9.0
|
|
39
|
-
Requires-Dist: pynvml>=11.5.0
|
|
40
40
|
Requires-Dist: ray>=2.50.0
|
|
41
41
|
Requires-Dist: tinker>=0.7.0
|
|
42
42
|
Requires-Dist: transformers<5.0.0,>=4.57.3
|
|
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
|
|
|
300
300
|
The CLI starts a FastAPI server:
|
|
301
301
|
|
|
302
302
|
```bash
|
|
303
|
-
tuft --port 10610 --config /path/to/tuft_config.yaml
|
|
303
|
+
tuft launch --port 10610 --config /path/to/tuft_config.yaml
|
|
304
304
|
```
|
|
305
305
|
|
|
306
306
|
The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
|
|
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
|
|
|
340
340
|
-p 10610:10610 \
|
|
341
341
|
-v <host_dir>:/data \
|
|
342
342
|
ghcr.io/agentscope-ai/tuft:latest \
|
|
343
|
-
tuft --port 10610 --config /data/tuft_config.yaml
|
|
343
|
+
tuft launch --port 10610 --config /data/tuft_config.yaml
|
|
344
344
|
```
|
|
345
345
|
|
|
346
346
|
Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
|
|
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
|
|
|
378
378
|
|
|
379
379
|
## Persistence
|
|
380
380
|
|
|
381
|
-
TuFT supports optional
|
|
382
|
-
the server can recover sessions, training runs, and pending futures after a restart.
|
|
381
|
+
TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
|
|
383
382
|
|
|
384
|
-
|
|
383
|
+
See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
|
|
385
384
|
|
|
386
385
|
```bash
|
|
387
|
-
uv pip install tuft[persistence]
|
|
386
|
+
uv pip install "tuft[persistence]"
|
|
388
387
|
```
|
|
389
388
|
|
|
390
|
-
### Persistence Modes
|
|
391
|
-
|
|
392
|
-
TuFT provides three persistence modes:
|
|
393
|
-
|
|
394
|
-
| Mode | Description | Use Case |
|
|
395
|
-
|------|-------------|----------|
|
|
396
|
-
| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
|
|
397
|
-
| `redis_url` | External Redis server | Production, multi-instance deployments |
|
|
398
|
-
| `file_redis` | File-backed store | Demos, small-scale testing |
|
|
399
|
-
|
|
400
|
-
### Configuration
|
|
401
|
-
|
|
402
|
-
Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
|
|
403
|
-
|
|
404
|
-
#### Mode 1: Disabled (Default)
|
|
405
|
-
|
|
406
|
-
No configuration needed. All data is stored in memory and lost on restart.
|
|
407
|
-
|
|
408
|
-
```yaml
|
|
409
|
-
# tuft_config.yaml
|
|
410
|
-
persistence:
|
|
411
|
-
mode: disabled
|
|
412
|
-
```
|
|
413
|
-
|
|
414
|
-
#### Mode 2: External Redis Server
|
|
415
|
-
|
|
416
|
-
Use an external Redis server for production deployments:
|
|
417
|
-
|
|
418
389
|
```yaml
|
|
419
390
|
# tuft_config.yaml
|
|
420
391
|
persistence:
|
|
421
|
-
mode: redis_url
|
|
392
|
+
mode: REDIS
|
|
422
393
|
redis_url: "redis://localhost:6379/0"
|
|
423
|
-
namespace: "tuft"
|
|
424
|
-
```
|
|
425
|
-
|
|
426
|
-
You can start a local Redis instance using Docker:
|
|
427
|
-
|
|
428
|
-
```bash
|
|
429
|
-
docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
|
|
430
|
-
```
|
|
431
|
-
|
|
432
|
-
#### Mode 3: File-backed Store
|
|
433
|
-
|
|
434
|
-
Use the file-backed store for demos or small-scale testing:
|
|
435
|
-
|
|
436
|
-
```yaml
|
|
437
|
-
# tuft_config.yaml
|
|
438
|
-
persistence:
|
|
439
|
-
mode: file_redis
|
|
440
|
-
file_path: "~/.cache/tuft/file_redis.json"
|
|
441
|
-
namespace: "tuft"
|
|
394
|
+
namespace: "persistence-tuft-server"
|
|
442
395
|
```
|
|
443
396
|
|
|
444
397
|
## Observability (OpenTelemetry)
|
|
445
398
|
|
|
446
|
-
TuFT supports optional OpenTelemetry integration for
|
|
447
|
-
This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
|
|
448
|
-
|
|
449
|
-
### Configuration
|
|
450
|
-
|
|
451
|
-
Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
|
|
399
|
+
TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
|
|
452
400
|
|
|
453
401
|
```yaml
|
|
454
402
|
# tuft_config.yaml
|
|
@@ -457,10 +405,6 @@ telemetry:
|
|
|
457
405
|
service_name: tuft
|
|
458
406
|
otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
|
|
459
407
|
resource_attributes: {}
|
|
460
|
-
# example:
|
|
461
|
-
# deployment.environment: production
|
|
462
|
-
# service.version: 1.0.0
|
|
463
|
-
# service.namespace: my-namespace
|
|
464
408
|
```
|
|
465
409
|
|
|
466
410
|
Alternatively, use environment variables:
|
|
@@ -1,35 +1,37 @@
|
|
|
1
1
|
tuft/__init__.py,sha256=BJu6iJ_QGwcJXRXDgR1LjV25KgM6sVd7_WqIXVTEuVM,97
|
|
2
|
+
tuft/__main__.py,sha256=MPhC9msQXf9py5xkLPQ4JoqrvCpL_qXVwksasNUj7ig,131
|
|
2
3
|
tuft/auth.py,sha256=2Wk9ATXlAiGm1Irpj66CfIyORuHzciSNAOzVwM8PeO0,1071
|
|
3
4
|
tuft/backend.py,sha256=ftiaaNds2MXroszZW8l6DEq515qiw1KmrODI3x6AHE4,10254
|
|
4
5
|
tuft/checkpoints.py,sha256=bObo2NzDrfzp5BiS6I_FIA3frLFic_sT4o4c-PEzfpk,6917
|
|
5
|
-
tuft/cli.py,sha256=
|
|
6
|
-
tuft/config.py,sha256=
|
|
7
|
-
tuft/exceptions.py,sha256=
|
|
8
|
-
tuft/futures.py,sha256=
|
|
9
|
-
tuft/sampling_controller.py,sha256=
|
|
10
|
-
tuft/
|
|
11
|
-
tuft/
|
|
12
|
-
tuft/
|
|
6
|
+
tuft/cli.py,sha256=I5229iyre2dtOscoGNfo64sLbybyOYc0nFVBwK2EBgY,7995
|
|
7
|
+
tuft/config.py,sha256=xf7J24SjnQyqb6R91spZRFXMtA9DvF7BqxS0qTqFzs4,4729
|
|
8
|
+
tuft/exceptions.py,sha256=j_fGWNpkGZUd29RVUOuCdFCqedq8MO3c520LR9_THxQ,6575
|
|
9
|
+
tuft/futures.py,sha256=NG9OaSGkmS1FrJVzwo_2Jqn0B7xrsrXjUXdjhzW9Mg0,17263
|
|
10
|
+
tuft/sampling_controller.py,sha256=DYeqHzRHK1-k-v4k0BglHz-V-bD8_JTSszzY7-HItQE,16273
|
|
11
|
+
tuft/sequence_executor.py,sha256=mRgpM94kZozNsQr7L-QmJ5JmLlytmdWFtbhgNF6nP7A,2873
|
|
12
|
+
tuft/server.py,sha256=F4ZKAEVlB6iW8-IR88qbx7uDL6EGvAV2I2VE5F_NlKI,25295
|
|
13
|
+
tuft/state.py,sha256=eHRQLdSGRsjhSC2uAVc85Ds7dO2KIfUn-oWu3hEfNoE,12970
|
|
14
|
+
tuft/training_controller.py,sha256=yZpUNkwioWjgAWGw9lOj-QmH46gHRe020X7apg_0bFc,30451
|
|
13
15
|
tuft/backends/__init__.py,sha256=7A6Pu-vEMbcMWapAh-zkI1O5WtBHO0OxwED8qAy9kAQ,262
|
|
14
16
|
tuft/backends/base_backend.py,sha256=bdlx3hRyEj00GKFlh2fAczn7h4zANz7bdKgXb_F18y4,3462
|
|
15
|
-
tuft/backends/hf_training_model.py,sha256=
|
|
17
|
+
tuft/backends/hf_training_model.py,sha256=P7vVFo6d6Oenp5oKKg_z85XbfPnVjVnr0Kuh_kS8QCU,21293
|
|
16
18
|
tuft/backends/sampling_backend.py,sha256=gf5laCMGbk9CrFuEJB0udKywVIimyU9-lqlwKok6j_w,10178
|
|
17
19
|
tuft/backends/training_backend.py,sha256=p1w-1i9-vxlocr97eumB46WZS5LxrTnxz4y86mKA950,13149
|
|
18
|
-
tuft/loss_fn/__init__.py,sha256=
|
|
20
|
+
tuft/loss_fn/__init__.py,sha256=k1SZmAihSItEHoqzI_jKP1lyPTFRkWg0wGgj1piVJqU,2723
|
|
19
21
|
tuft/loss_fn/cispo.py,sha256=L8HhqJ0rJcfgqfEkk445bvaKsZNWNAqwqm_47M9SB1Y,1598
|
|
20
22
|
tuft/loss_fn/cross_entropy.py,sha256=e9D2U_G8TNXOlOvEw7OQj-YE1H5DldzG2HS2QjKBfe8,935
|
|
21
23
|
tuft/loss_fn/dro.py,sha256=6d3jDK1OybcoFjq5vDwiUrURyaV-EajGLMECyF_2mjE,1315
|
|
22
24
|
tuft/loss_fn/importance_sampling.py,sha256=MTxO63LBhghVCEyDQYptaziFmVvEiSTE9xeGoyo20wc,1090
|
|
23
25
|
tuft/loss_fn/ppo.py,sha256=YpIYWNWqv9Asr3tV8iq5erSDHlZD8VmgOmDJQFHVWSo,1678
|
|
24
|
-
tuft/persistence/__init__.py,sha256=
|
|
26
|
+
tuft/persistence/__init__.py,sha256=7BeKykcqeVghBJTnbcSZAGkTC8AHziWond36nuaQWD8,813
|
|
25
27
|
tuft/persistence/file_redis.py,sha256=hLGClNhd9OID9JZMP-RZTisyoXOvQ0ctv3czj01dgIY,8091
|
|
26
|
-
tuft/persistence/redis_store.py,sha256=
|
|
28
|
+
tuft/persistence/redis_store.py,sha256=r_wACoo5o9yFrx-vovKw7QKN3swUY342Ry9AcCdF9Kk,27401
|
|
27
29
|
tuft/telemetry/__init__.py,sha256=dlSGiJ_pMElhwEe31olGg88ZrjoBeGUBn2P17qFNymM,336
|
|
28
30
|
tuft/telemetry/metrics.py,sha256=Yz6s2AQ5CptFXvEm-PbO-Ib17-aF0rnoG8vZxH-Pawo,11538
|
|
29
31
|
tuft/telemetry/provider.py,sha256=jGKqTMsP-WekKGCMN9QHwt-g_1Lk1xUOy1BO-__xG5I,6700
|
|
30
32
|
tuft/telemetry/tracing.py,sha256=GL-wEEQtzM1ycgfI4sMsHUeIC7qj5MyOH-sBwHihbsE,957
|
|
31
|
-
tuft-0.1.
|
|
32
|
-
tuft-0.1.
|
|
33
|
-
tuft-0.1.
|
|
34
|
-
tuft-0.1.
|
|
35
|
-
tuft-0.1.
|
|
33
|
+
tuft-0.1.3.dist-info/METADATA,sha256=whBMfZujTyWaXku4hPWEJhsOl7PRuMpWv4YHBg3sszI,19201
|
|
34
|
+
tuft-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
35
|
+
tuft-0.1.3.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
|
|
36
|
+
tuft-0.1.3.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
|
|
37
|
+
tuft-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|