tuft 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tuft/server.py CHANGED
@@ -18,7 +18,7 @@ from tinker import types
18
18
  from .auth import User
19
19
  from .config import AppConfig
20
20
  from .exceptions import TuFTException
21
- from .persistence import get_redis_store
21
+ from .persistence import get_redis_store, save_config_signature
22
22
  from .state import ServerState
23
23
  from .telemetry import shutdown_telemetry
24
24
 
@@ -76,11 +76,19 @@ def _instrument_fastapi(app: FastAPI) -> None:
76
76
 
77
77
 
78
78
  def create_root_app(config: AppConfig | None = None) -> FastAPI:
79
+ resolved_config = config or AppConfig()
80
+
79
81
  @asynccontextmanager
80
82
  async def lifespan(app: FastAPI):
81
83
  try:
82
84
  await app.state.server_state.async_init()
83
85
  logger.info("Server initialized successfully")
86
+
87
+ # After successful init/restore, save the current config signature
88
+ if resolved_config.persistence.enabled:
89
+ save_config_signature(resolved_config)
90
+ logger.debug("Config signature saved after successful initialization")
91
+
84
92
  yield
85
93
  finally:
86
94
  logger.info("Server shutting down")
@@ -95,7 +103,6 @@ def create_root_app(config: AppConfig | None = None) -> FastAPI:
95
103
  route.dependencies = getattr(route, "dependencies", []) + [Depends(_get_user)]
96
104
  return route
97
105
 
98
- resolved_config = config or AppConfig()
99
106
  if resolved_config.persistence.enabled:
100
107
  store = get_redis_store()
101
108
  store.configure(resolved_config.persistence)
tuft/state.py CHANGED
@@ -135,7 +135,10 @@ class ServerState:
135
135
  1. For each training run restored from Redis, create adapter and load latest checkpoint
136
136
  2. Mark ALL futures created after checkpoint's future_id as failed
137
137
  3. For training runs without checkpoints, mark all futures as failed
138
+ 4. Mark all pending sample futures as failed
138
139
  """
140
+ self.future_store.mark_pending_sample_futures_failed()
141
+
139
142
  # Restore training runs (adapter + checkpoint)
140
143
  for model_id, record in self.training.training_runs.items():
141
144
  if record.backend is None or record.corrupted:
@@ -274,7 +274,8 @@ class TrainingController:
274
274
  if seq_id is not None:
275
275
  self._reserve_seq_id(record, seq_id)
276
276
  # Save the updated next_seq_id to Redis
277
- self._save_training_run(record.training_run_id)
277
+ loop = asyncio.get_event_loop()
278
+ await loop.run_in_executor(None, self._save_training_run, record.training_run_id)
278
279
  return await operation()
279
280
 
280
281
  def _reserve_seq_id(self, record: TrainingRunRecord, seq_id: int) -> None:
@@ -314,7 +315,8 @@ class TrainingController:
314
315
  )
315
316
  await backend.create_adapter(model_id, lora_config)
316
317
  self.training_runs[model_id] = record
317
- self._save_training_run(model_id)
318
+ loop = asyncio.get_event_loop()
319
+ await loop.run_in_executor(None, self._save_training_run, model_id)
318
320
 
319
321
  # Update metrics
320
322
  get_metrics().training_models_active.add(1, {"base_model": base_model})
@@ -541,7 +543,14 @@ class TrainingController:
541
543
 
542
544
  # Save training run and checkpoint atomically to prevent inconsistency
543
545
  # if server crashes between saves
544
- self._save_training_run_with_checkpoint(model_id, checkpoint_name, checkpoint_type)
546
+ loop = asyncio.get_event_loop()
547
+ await loop.run_in_executor(
548
+ None,
549
+ self._save_training_run_with_checkpoint,
550
+ model_id,
551
+ checkpoint_name,
552
+ checkpoint_type,
553
+ )
545
554
 
546
555
  # Update metrics
547
556
  metrics = get_metrics()
@@ -722,7 +731,8 @@ class TrainingController:
722
731
  except Exception: # pylint: disable=broad-except
723
732
  # If loading fails, mark as corrupted
724
733
  record.corrupted = True
725
- self._save_training_run(model_id)
734
+ loop = asyncio.get_event_loop()
735
+ await loop.run_in_executor(None, self._save_training_run, model_id)
726
736
  return None
727
737
 
728
738
  return latest_ckpt
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tuft
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
5
5
  Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
6
6
  License: MIT License
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
300
300
  The CLI starts a FastAPI server:
301
301
 
302
302
  ```bash
303
- tuft --port 10610 --config /path/to/tuft_config.yaml
303
+ tuft launch --port 10610 --config /path/to/tuft_config.yaml
304
304
  ```
305
305
 
306
306
  The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
340
340
  -p 10610:10610 \
341
341
  -v <host_dir>:/data \
342
342
  ghcr.io/agentscope-ai/tuft:latest \
343
- tuft --port 10610 --config /data/tuft_config.yaml
343
+ tuft launch --port 10610 --config /data/tuft_config.yaml
344
344
  ```
345
345
 
346
346
  Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
378
378
 
379
379
  ## Persistence
380
380
 
381
- TuFT supports optional Redis-based persistence for server state. When enabled,
382
- the server can recover sessions, training runs, and pending futures after a restart.
381
+ TuFT supports optional persistence for server state. When enabled, the server recovers sessions, training runs, sampling sessions, and futures after a restart, and then restores runtime model state from checkpoints.
383
382
 
384
- To use persistence, install the optional dependency:
383
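+ See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks). To use persistence, install the optional dependency and add a `persistence` section to your config: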
385
384
 
386
385
  ```bash
387
- uv pip install tuft[persistence]
386
+ uv pip install "tuft[persistence]"
388
387
  ```
389
388
 
390
- ### Persistence Modes
391
-
392
- TuFT provides three persistence modes:
393
-
394
- | Mode | Description | Use Case |
395
- |------|-------------|----------|
396
- | `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
397
- | `redis_url` | External Redis server | Production, multi-instance deployments |
398
- | `file_redis` | File-backed store | Demos, small-scale testing |
399
-
400
- ### Configuration
401
-
402
- Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
403
-
404
- #### Mode 1: Disabled (Default)
405
-
406
- No configuration needed. All data is stored in memory and lost on restart.
407
-
408
- ```yaml
409
- # tuft_config.yaml
410
- persistence:
411
- mode: disabled
412
- ```
413
-
414
- #### Mode 2: External Redis Server
415
-
416
- Use an external Redis server for production deployments:
417
-
418
389
  ```yaml
419
390
  # tuft_config.yaml
420
391
  persistence:
421
- mode: redis_url
392
+ mode: REDIS
422
393
  redis_url: "redis://localhost:6379/0"
423
- namespace: "tuft"
424
- ```
425
-
426
- You can start a local Redis instance using Docker:
427
-
428
- ```bash
429
- docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
430
- ```
431
-
432
- #### Mode 3: File-backed Store
433
-
434
- Use the file-backed store for demos or small-scale testing:
435
-
436
- ```yaml
437
- # tuft_config.yaml
438
- persistence:
439
- mode: file_redis
440
- file_path: "~/.cache/tuft/file_redis.json"
441
- namespace: "tuft"
394
+ namespace: "persistence-tuft-server"
442
395
  ```
443
396
 
444
397
  ## Observability (OpenTelemetry)
445
398
 
446
- TuFT supports optional OpenTelemetry integration for distributed tracing, metrics, and logging.
447
- This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
448
-
449
- ### Configuration
450
-
451
- Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
399
+ TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
452
400
 
453
401
  ```yaml
454
402
  # tuft_config.yaml
@@ -457,10 +405,6 @@ telemetry:
457
405
  service_name: tuft
458
406
  otlp_endpoint: http://localhost:4317 # Your OTLP collector endpoint
459
407
  resource_attributes: {}
460
- # example:
461
- # deployment.environment: production
462
- # service.version: 1.0.0
463
- # service.namespace: my-namespace
464
408
  ```
465
409
 
466
410
  Alternatively, use environment variables:
@@ -3,34 +3,35 @@ tuft/__main__.py,sha256=MPhC9msQXf9py5xkLPQ4JoqrvCpL_qXVwksasNUj7ig,131
3
3
  tuft/auth.py,sha256=2Wk9ATXlAiGm1Irpj66CfIyORuHzciSNAOzVwM8PeO0,1071
4
4
  tuft/backend.py,sha256=ftiaaNds2MXroszZW8l6DEq515qiw1KmrODI3x6AHE4,10254
5
5
  tuft/checkpoints.py,sha256=bObo2NzDrfzp5BiS6I_FIA3frLFic_sT4o4c-PEzfpk,6917
6
- tuft/cli.py,sha256=-WhmHGIHmWtL46LvXRlhTPVPhBUjZHVLJi0nYR_pqoE,4024
7
- tuft/config.py,sha256=bX6NuSora0Wqhk5Q5lsnc0lojeevxnLHfiijJHMdtVg,4380
8
- tuft/exceptions.py,sha256=_xdsL8bx3Y6jvC5VYHVCa73uAEWXxcl2YwVc09lJXFk,4088
9
- tuft/futures.py,sha256=0gRLgDJJQRGGmULYsKdUs3VDsrLN8QfuFfXV00kxHO4,16375
10
- tuft/sampling_controller.py,sha256=c02VQ6Qww9IQC9VJYzQO9Z9v45kK2QeaOKlknYWjSI4,15250
11
- tuft/server.py,sha256=NUapRGdQbQH6PbuCfMZeMVi_7vM6nM7xmxepCPkgyko,24996
12
- tuft/state.py,sha256=J9R5Wd9JlMtpYcaY_6t5RvgJbY3EX5ZJTZfoQhwZ9hU,12853
13
- tuft/training_controller.py,sha256=V4JMgyEnf4wYGrk72AR5rHH1iYl488vt7d0c-ubTrO0,30008
6
+ tuft/cli.py,sha256=I5229iyre2dtOscoGNfo64sLbybyOYc0nFVBwK2EBgY,7995
7
+ tuft/config.py,sha256=xf7J24SjnQyqb6R91spZRFXMtA9DvF7BqxS0qTqFzs4,4729
8
+ tuft/exceptions.py,sha256=j_fGWNpkGZUd29RVUOuCdFCqedq8MO3c520LR9_THxQ,6575
9
+ tuft/futures.py,sha256=NG9OaSGkmS1FrJVzwo_2Jqn0B7xrsrXjUXdjhzW9Mg0,17263
10
+ tuft/sampling_controller.py,sha256=DYeqHzRHK1-k-v4k0BglHz-V-bD8_JTSszzY7-HItQE,16273
11
+ tuft/sequence_executor.py,sha256=mRgpM94kZozNsQr7L-QmJ5JmLlytmdWFtbhgNF6nP7A,2873
12
+ tuft/server.py,sha256=F4ZKAEVlB6iW8-IR88qbx7uDL6EGvAV2I2VE5F_NlKI,25295
13
+ tuft/state.py,sha256=eHRQLdSGRsjhSC2uAVc85Ds7dO2KIfUn-oWu3hEfNoE,12970
14
+ tuft/training_controller.py,sha256=yZpUNkwioWjgAWGw9lOj-QmH46gHRe020X7apg_0bFc,30451
14
15
  tuft/backends/__init__.py,sha256=7A6Pu-vEMbcMWapAh-zkI1O5WtBHO0OxwED8qAy9kAQ,262
15
16
  tuft/backends/base_backend.py,sha256=bdlx3hRyEj00GKFlh2fAczn7h4zANz7bdKgXb_F18y4,3462
16
- tuft/backends/hf_training_model.py,sha256=XQa598SpY7DnYYU0rTaHjlh-5dRCPueFtcdxrcjXWIc,16993
17
+ tuft/backends/hf_training_model.py,sha256=P7vVFo6d6Oenp5oKKg_z85XbfPnVjVnr0Kuh_kS8QCU,21293
17
18
  tuft/backends/sampling_backend.py,sha256=gf5laCMGbk9CrFuEJB0udKywVIimyU9-lqlwKok6j_w,10178
18
19
  tuft/backends/training_backend.py,sha256=p1w-1i9-vxlocr97eumB46WZS5LxrTnxz4y86mKA950,13149
19
- tuft/loss_fn/__init__.py,sha256=l6wNbeqV6_WCs0jIg3H89eTUUpTf50aitLnDb9lRdM4,1620
20
+ tuft/loss_fn/__init__.py,sha256=k1SZmAihSItEHoqzI_jKP1lyPTFRkWg0wGgj1piVJqU,2723
20
21
  tuft/loss_fn/cispo.py,sha256=L8HhqJ0rJcfgqfEkk445bvaKsZNWNAqwqm_47M9SB1Y,1598
21
22
  tuft/loss_fn/cross_entropy.py,sha256=e9D2U_G8TNXOlOvEw7OQj-YE1H5DldzG2HS2QjKBfe8,935
22
23
  tuft/loss_fn/dro.py,sha256=6d3jDK1OybcoFjq5vDwiUrURyaV-EajGLMECyF_2mjE,1315
23
24
  tuft/loss_fn/importance_sampling.py,sha256=MTxO63LBhghVCEyDQYptaziFmVvEiSTE9xeGoyo20wc,1090
24
25
  tuft/loss_fn/ppo.py,sha256=YpIYWNWqv9Asr3tV8iq5erSDHlZD8VmgOmDJQFHVWSo,1678
25
- tuft/persistence/__init__.py,sha256=U-yEVEgikbrTMLdwPS2S9GUwuQV-1Fnt9Y5key0r9bA,615
26
+ tuft/persistence/__init__.py,sha256=7BeKykcqeVghBJTnbcSZAGkTC8AHziWond36nuaQWD8,813
26
27
  tuft/persistence/file_redis.py,sha256=hLGClNhd9OID9JZMP-RZTisyoXOvQ0ctv3czj01dgIY,8091
27
- tuft/persistence/redis_store.py,sha256=9z1zbtUSXzaQP3bAH18eLd4OQ8YiJtFo4TgsZoGsGX4,15904
28
+ tuft/persistence/redis_store.py,sha256=r_wACoo5o9yFrx-vovKw7QKN3swUY342Ry9AcCdF9Kk,27401
28
29
  tuft/telemetry/__init__.py,sha256=dlSGiJ_pMElhwEe31olGg88ZrjoBeGUBn2P17qFNymM,336
29
30
  tuft/telemetry/metrics.py,sha256=Yz6s2AQ5CptFXvEm-PbO-Ib17-aF0rnoG8vZxH-Pawo,11538
30
31
  tuft/telemetry/provider.py,sha256=jGKqTMsP-WekKGCMN9QHwt-g_1Lk1xUOy1BO-__xG5I,6700
31
32
  tuft/telemetry/tracing.py,sha256=GL-wEEQtzM1ycgfI4sMsHUeIC7qj5MyOH-sBwHihbsE,957
32
- tuft-0.1.2.dist-info/METADATA,sha256=UlTE_gR3cPFLzV69GyIHD6TOm-dHSmSM5NcEHt8L0Pg,20381
33
- tuft-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
34
- tuft-0.1.2.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
35
- tuft-0.1.2.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
36
- tuft-0.1.2.dist-info/RECORD,,
33
+ tuft-0.1.3.dist-info/METADATA,sha256=whBMfZujTyWaXku4hPWEJhsOl7PRuMpWv4YHBg3sszI,19201
34
+ tuft-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
35
+ tuft-0.1.3.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
36
+ tuft-0.1.3.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
37
+ tuft-0.1.3.dist-info/RECORD,,
File without changes