PyPI - tuft - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

tuft 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19)

tuft/__main__.py +7 -0
tuft/backends/hf_training_model.py +184 -64
tuft/cli.py +161 -8
tuft/config.py +63 -59
tuft/exceptions.py +66 -0
tuft/futures.py +22 -2
tuft/loss_fn/__init__.py +33 -0
tuft/persistence/__init__.py +10 -2
tuft/persistence/redis_store.py +352 -31
tuft/sampling_controller.py +37 -11
tuft/sequence_executor.py +72 -0
tuft/server.py +9 -2
tuft/state.py +3 -0
tuft/training_controller.py +20 -5
{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/METADATA +10 -66
{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/RECORD +19 -17
{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/WHEEL +0 -0
{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/entry_points.txt +0 -0
{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/licenses/LICENSE +0 -0

tuft/server.py CHANGED Viewed

@@ -18,7 +18,7 @@ from tinker import types
 from .auth import User
 from .config import AppConfig
 from .exceptions import TuFTException
-from .persistence import get_redis_store
+from .persistence import get_redis_store, save_config_signature
 from .state import ServerState
 from .telemetry import shutdown_telemetry
@@ -76,11 +76,19 @@ def _instrument_fastapi(app: FastAPI) -> None:
 def create_root_app(config: AppConfig | None = None) -> FastAPI:
+    resolved_config = config or AppConfig()
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         try:
             await app.state.server_state.async_init()
             logger.info("Server initialized successfully")
+            # After successful init/restore, save the current config signature
+            if resolved_config.persistence.enabled:
+                save_config_signature(resolved_config)
+                logger.debug("Config signature saved after successful initialization")
             yield
         finally:
             logger.info("Server shutting down")
@@ -95,7 +103,6 @@ def create_root_app(config: AppConfig | None = None) -> FastAPI:
             route.dependencies = getattr(route, "dependencies", []) + [Depends(_get_user)]
         return route
-    resolved_config = config or AppConfig()
     if resolved_config.persistence.enabled:
         store = get_redis_store()
         store.configure(resolved_config.persistence)

tuft/state.py CHANGED Viewed

@@ -135,7 +135,10 @@ class ServerState:
         1. For each training run restored from Redis, create adapter and load latest checkpoint
         2. Mark ALL futures created after checkpoint's future_id as failed
         3. For training runs without checkpoints, mark all futures as failed
+        4. Mark all pending sample futures as failed
         """
+        self.future_store.mark_pending_sample_futures_failed()
         # Restore training runs (adapter + checkpoint)
         for model_id, record in self.training.training_runs.items():
             if record.backend is None or record.corrupted:

tuft/training_controller.py CHANGED Viewed

@@ -274,7 +274,8 @@ class TrainingController:
             if seq_id is not None:
                 self._reserve_seq_id(record, seq_id)
                 # Save the updated next_seq_id to Redis
-                self._save_training_run(record.training_run_id)
+                loop = asyncio.get_event_loop()
+                await loop.run_in_executor(None, self._save_training_run, record.training_run_id)
             return await operation()
     def _reserve_seq_id(self, record: TrainingRunRecord, seq_id: int) -> None:
@@ -314,7 +315,8 @@ class TrainingController:
                 )
                 await backend.create_adapter(model_id, lora_config)
                 self.training_runs[model_id] = record
-                self._save_training_run(model_id)
+                loop = asyncio.get_event_loop()
+                await loop.run_in_executor(None, self._save_training_run, model_id)
                 # Update metrics
                 get_metrics().training_models_active.add(1, {"base_model": base_model})
@@ -508,6 +510,7 @@ class TrainingController:
                 logger.info("Checkpoint save begin: %s", checkpoint_id)
                 setattr(training_run, counter_attr, counter + 1)
+                assert self.config.checkpoint_dir is not None
                 checkpoint = CheckpointRecord.from_training_run(
                     training_run_id=training_run.training_run_id,
                     checkpoint_name=checkpoint_name,
@@ -540,7 +543,14 @@ class TrainingController:
                 # Save training run and checkpoint atomically to prevent inconsistency
                 # if server crashes between saves
-                self._save_training_run_with_checkpoint(model_id, checkpoint_name, checkpoint_type)
+                loop = asyncio.get_event_loop()
+                await loop.run_in_executor(
+                    None,
+                    self._save_training_run_with_checkpoint,
+                    model_id,
+                    checkpoint_name,
+                    checkpoint_type,
+                )
                 # Update metrics
                 metrics = get_metrics()
@@ -567,7 +577,11 @@ class TrainingController:
     ) -> None:
         """Load a checkpoint."""
         try:
-            parsed_checkpoint = CheckpointRecord.from_tinker_path(path, self.config.checkpoint_dir)
+            assert self.config.checkpoint_dir is not None
+            parsed_checkpoint = CheckpointRecord.from_tinker_path(
+                path,
+                self.config.checkpoint_dir,
+            )
         except FileNotFoundError as exc:
             raise CheckpointNotFoundException(checkpoint_id=model_id) from exc
         source_model_id = parsed_checkpoint.training_run_id or model_id
@@ -717,7 +731,8 @@ class TrainingController:
         except Exception:  # pylint: disable=broad-except
             # If loading fails, mark as corrupted
             record.corrupted = True
-            self._save_training_run(model_id)
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, self._save_training_run, model_id)
             return None
         return latest_ckpt

{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tuft
-Version: 0.1.1
+Version: 0.1.3
 Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
 Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
 License: MIT License
@@ -29,6 +29,7 @@ Requires-Python: >=3.11
 Requires-Dist: fastapi>=0.125.0
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: numpy<2.0.0
+Requires-Dist: nvidia-ml-py>=13.0.0
 Requires-Dist: omegaconf>=2.3.0
 Requires-Dist: opentelemetry-api>=1.20.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
@@ -36,7 +37,6 @@ Requires-Dist: opentelemetry-instrumentation-fastapi>=0.41b0
 Requires-Dist: opentelemetry-instrumentation-logging>=0.41b0
 Requires-Dist: opentelemetry-sdk>=1.20.0
 Requires-Dist: psutil>=5.9.0
-Requires-Dist: pynvml>=11.5.0
 Requires-Dist: ray>=2.50.0
 Requires-Dist: tinker>=0.7.0
 Requires-Dist: transformers<5.0.0,>=4.57.3
@@ -300,7 +300,7 @@ uv pip install "tuft[dev,backend,persistence]"
 The CLI starts a FastAPI server:
 ```bash
-tuft --port 10610 --config /path/to/tuft_config.yaml
+tuft launch --port 10610 --config /path/to/tuft_config.yaml
 ```
 The config file `tuft_config.yaml` specifies server settings including available base models, authentication, persistence, and telemetry. Below is a minimal example.
@@ -340,7 +340,7 @@ you can use the pre-built Docker image.
         -p 10610:10610 \
         -v <host_dir>:/data \
         ghcr.io/agentscope-ai/tuft:latest \
-        tuft --port 10610 --config /data/tuft_config.yaml
+        tuft launch --port 10610 --config /data/tuft_config.yaml
     ```
     Please replace `<host_dir>` with a directory on your host machine where you want to store model checkpoints and other data.
@@ -378,77 +378,25 @@ We provide practical examples to demonstrate how to use TuFT for training and sa
 ## Persistence
-TuFT supports optional Redis-based persistence for server state. When enabled,
-the server can recover sessions, training runs, and pending futures after a restart.
+TuFT supports optional persistence for server state. When enabled, the server can recover sessions, training runs, sampling sessions, and futures after a restart (and then restore runtime model state from checkpoints).
-To use persistence, install the optional dependency:
+See [docs/persistence.md](docs/persistence.md) for full details (key layout, restore semantics, and safety checks).
 ```bash
-uv pip install tuft[persistence]
+uv pip install "tuft[persistence]"
 ```
-### Persistence Modes
-TuFT provides three persistence modes:
-| Mode | Description | Use Case |
-|------|-------------|----------|
-| `disabled` | No persistence, data in-memory only | Development, testing without state recovery |
-| `redis_url` | External Redis server | Production, multi-instance deployments |
-| `file_redis` | File-backed store | Demos, small-scale testing |
-### Configuration
-Add a `persistence` section to your `tuft_config.yaml` configuration file and choose one of the following modes.
-#### Mode 1: Disabled (Default)
-No configuration needed. All data is stored in memory and lost on restart.
-```yaml
-# tuft_config.yaml
-persistence:
-  mode: disabled
-```
-#### Mode 2: External Redis Server
-Use an external Redis server for production deployments:
 ```yaml
 # tuft_config.yaml
 persistence:
-  mode: redis_url
+  mode: REDIS
   redis_url: "redis://localhost:6379/0"
-  namespace: "tuft"
-```
-You can start a local Redis instance using Docker:
-```bash
-docker run -d --name TuFT-redis -p 6379:6379 redis:7-alpine
-```
-#### Mode 3: File-backed Store
-Use the file-backed store for demos or small-scale testing:
-```yaml
-# tuft_config.yaml
-persistence:
-  mode: file_redis
-  file_path: "~/.cache/tuft/file_redis.json"
-  namespace: "tuft"
+  namespace: "persistence-tuft-server"
 ```
 ## Observability (OpenTelemetry)
-TuFT supports optional OpenTelemetry integration for distributed tracing, metrics, and logging.
-This allows you to monitor your TuFT server using observability tools like SigNoz, Jaeger, or Grafana.
-### Configuration
-Add the following `telemetry` section to your `tuft_config.yaml` configuration file:
+TuFT supports optional OpenTelemetry integration for tracing, metrics, and logs. See [docs/telemetry.md](docs/telemetry.md) for details (what TuFT records, correlation keys, Ray context propagation, and collector setup).
 ```yaml
 # tuft_config.yaml
@@ -457,10 +405,6 @@ telemetry:
   service_name: tuft
   otlp_endpoint: http://localhost:4317  # Your OTLP collector endpoint
   resource_attributes: {}
-    # example:
-    # deployment.environment: production
-    # service.version: 1.0.0
-    # service.namespace: my-namespace
 ```
 Alternatively, use environment variables:

{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/RECORD RENAMED Viewed

@@ -1,35 +1,37 @@
 tuft/__init__.py,sha256=BJu6iJ_QGwcJXRXDgR1LjV25KgM6sVd7_WqIXVTEuVM,97
+tuft/__main__.py,sha256=MPhC9msQXf9py5xkLPQ4JoqrvCpL_qXVwksasNUj7ig,131
 tuft/auth.py,sha256=2Wk9ATXlAiGm1Irpj66CfIyORuHzciSNAOzVwM8PeO0,1071
 tuft/backend.py,sha256=ftiaaNds2MXroszZW8l6DEq515qiw1KmrODI3x6AHE4,10254
 tuft/checkpoints.py,sha256=bObo2NzDrfzp5BiS6I_FIA3frLFic_sT4o4c-PEzfpk,6917
-tuft/cli.py,sha256=PJ89JfrJ7wB5Zd-pe9rkWLcfavmWRtWLmQ_r1Y_Qdwk,2725
-tuft/config.py,sha256=EGkDmnYNNHgtkEuffHoBE9R-hc2epAJe4sUEVBP_6Ug,4292
-tuft/exceptions.py,sha256=_xdsL8bx3Y6jvC5VYHVCa73uAEWXxcl2YwVc09lJXFk,4088
-tuft/futures.py,sha256=0gRLgDJJQRGGmULYsKdUs3VDsrLN8QfuFfXV00kxHO4,16375
-tuft/sampling_controller.py,sha256=WIQ29aVD9JWuxZ8JV4a71nYZXh8Es2wtA4QhaaGRSXQ,15151
-tuft/server.py,sha256=NUapRGdQbQH6PbuCfMZeMVi_7vM6nM7xmxepCPkgyko,24996
-tuft/state.py,sha256=J9R5Wd9JlMtpYcaY_6t5RvgJbY3EX5ZJTZfoQhwZ9hU,12853
-tuft/training_controller.py,sha256=fQI6sxtG3v2JYgbd1y501MLzuTUAp0NIvzv6cBOm-T8,29841
+tuft/cli.py,sha256=I5229iyre2dtOscoGNfo64sLbybyOYc0nFVBwK2EBgY,7995
+tuft/config.py,sha256=xf7J24SjnQyqb6R91spZRFXMtA9DvF7BqxS0qTqFzs4,4729
+tuft/exceptions.py,sha256=j_fGWNpkGZUd29RVUOuCdFCqedq8MO3c520LR9_THxQ,6575
+tuft/futures.py,sha256=NG9OaSGkmS1FrJVzwo_2Jqn0B7xrsrXjUXdjhzW9Mg0,17263
+tuft/sampling_controller.py,sha256=DYeqHzRHK1-k-v4k0BglHz-V-bD8_JTSszzY7-HItQE,16273
+tuft/sequence_executor.py,sha256=mRgpM94kZozNsQr7L-QmJ5JmLlytmdWFtbhgNF6nP7A,2873
+tuft/server.py,sha256=F4ZKAEVlB6iW8-IR88qbx7uDL6EGvAV2I2VE5F_NlKI,25295
+tuft/state.py,sha256=eHRQLdSGRsjhSC2uAVc85Ds7dO2KIfUn-oWu3hEfNoE,12970
+tuft/training_controller.py,sha256=yZpUNkwioWjgAWGw9lOj-QmH46gHRe020X7apg_0bFc,30451
 tuft/backends/__init__.py,sha256=7A6Pu-vEMbcMWapAh-zkI1O5WtBHO0OxwED8qAy9kAQ,262
 tuft/backends/base_backend.py,sha256=bdlx3hRyEj00GKFlh2fAczn7h4zANz7bdKgXb_F18y4,3462
-tuft/backends/hf_training_model.py,sha256=XQa598SpY7DnYYU0rTaHjlh-5dRCPueFtcdxrcjXWIc,16993
+tuft/backends/hf_training_model.py,sha256=P7vVFo6d6Oenp5oKKg_z85XbfPnVjVnr0Kuh_kS8QCU,21293
 tuft/backends/sampling_backend.py,sha256=gf5laCMGbk9CrFuEJB0udKywVIimyU9-lqlwKok6j_w,10178
 tuft/backends/training_backend.py,sha256=p1w-1i9-vxlocr97eumB46WZS5LxrTnxz4y86mKA950,13149
-tuft/loss_fn/__init__.py,sha256=l6wNbeqV6_WCs0jIg3H89eTUUpTf50aitLnDb9lRdM4,1620
+tuft/loss_fn/__init__.py,sha256=k1SZmAihSItEHoqzI_jKP1lyPTFRkWg0wGgj1piVJqU,2723
 tuft/loss_fn/cispo.py,sha256=L8HhqJ0rJcfgqfEkk445bvaKsZNWNAqwqm_47M9SB1Y,1598
 tuft/loss_fn/cross_entropy.py,sha256=e9D2U_G8TNXOlOvEw7OQj-YE1H5DldzG2HS2QjKBfe8,935
 tuft/loss_fn/dro.py,sha256=6d3jDK1OybcoFjq5vDwiUrURyaV-EajGLMECyF_2mjE,1315
 tuft/loss_fn/importance_sampling.py,sha256=MTxO63LBhghVCEyDQYptaziFmVvEiSTE9xeGoyo20wc,1090
 tuft/loss_fn/ppo.py,sha256=YpIYWNWqv9Asr3tV8iq5erSDHlZD8VmgOmDJQFHVWSo,1678
-tuft/persistence/__init__.py,sha256=U-yEVEgikbrTMLdwPS2S9GUwuQV-1Fnt9Y5key0r9bA,615
+tuft/persistence/__init__.py,sha256=7BeKykcqeVghBJTnbcSZAGkTC8AHziWond36nuaQWD8,813
 tuft/persistence/file_redis.py,sha256=hLGClNhd9OID9JZMP-RZTisyoXOvQ0ctv3czj01dgIY,8091
-tuft/persistence/redis_store.py,sha256=9z1zbtUSXzaQP3bAH18eLd4OQ8YiJtFo4TgsZoGsGX4,15904
+tuft/persistence/redis_store.py,sha256=r_wACoo5o9yFrx-vovKw7QKN3swUY342Ry9AcCdF9Kk,27401
 tuft/telemetry/__init__.py,sha256=dlSGiJ_pMElhwEe31olGg88ZrjoBeGUBn2P17qFNymM,336
 tuft/telemetry/metrics.py,sha256=Yz6s2AQ5CptFXvEm-PbO-Ib17-aF0rnoG8vZxH-Pawo,11538
 tuft/telemetry/provider.py,sha256=jGKqTMsP-WekKGCMN9QHwt-g_1Lk1xUOy1BO-__xG5I,6700
 tuft/telemetry/tracing.py,sha256=GL-wEEQtzM1ycgfI4sMsHUeIC7qj5MyOH-sBwHihbsE,957
-tuft-0.1.1.dist-info/METADATA,sha256=zH2lHrE8kZh2O61cvI7_uSyGCJ9obARa0FZGLnj0HQY,20375
-tuft-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-tuft-0.1.1.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
-tuft-0.1.1.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
-tuft-0.1.1.dist-info/RECORD,,
+tuft-0.1.3.dist-info/METADATA,sha256=whBMfZujTyWaXku4hPWEJhsOl7PRuMpWv4YHBg3sszI,19201
+tuft-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tuft-0.1.3.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
+tuft-0.1.3.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
+tuft-0.1.3.dist-info/RECORD,,

{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

tuft 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

tuft 0.1.1py3-none-any.whl → 0.1.3py3-none-any.whl