tuft 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tuft/__main__.py +7 -0
- tuft/backends/hf_training_model.py +184 -64
- tuft/cli.py +161 -8
- tuft/config.py +63 -59
- tuft/exceptions.py +66 -0
- tuft/futures.py +22 -2
- tuft/loss_fn/__init__.py +33 -0
- tuft/persistence/__init__.py +10 -2
- tuft/persistence/redis_store.py +352 -31
- tuft/sampling_controller.py +37 -11
- tuft/sequence_executor.py +72 -0
- tuft/server.py +9 -2
- tuft/state.py +3 -0
- tuft/training_controller.py +20 -5
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/METADATA +10 -66
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/RECORD +19 -17
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/WHEEL +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/entry_points.txt +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.3.dist-info}/licenses/LICENSE +0 -0
tuft/__main__.py
ADDED

tuft/backends/hf_training_model.py
CHANGED

@@ -1,6 +1,7 @@
 import asyncio
+import logging
 import shutil
-from typing import Dict
+from typing import Callable, Dict
 
 import ray
 import torch
@@ -14,13 +15,12 @@ from transformers import AutoModelForCausalLM
 
 from tuft.checkpoints import CheckpointRecord
 from tuft.config import ModelConfig
-from tuft.loss_fn import get_loss_fn
+from tuft.loss_fn import get_loss_fn, metrics_reduction
 from tuft.telemetry.tracing import extract_context, get_tracer
 
 
 _get_tracer = lambda: get_tracer("tuft.hf_training_model")  # noqa: E731
 
-
 MODULE_MAP = {
     "llama": {
         "attn": ["q_proj", "k_proj", "v_proj", "o_proj"],
@@ -58,6 +58,8 @@ class HFTrainingModel:
         self.model = self._init_peft_model(config)
         self.adapter_optimizer: Dict[str, torch.optim.AdamW] = {}
         self._lock = asyncio.Lock()
+        self.logger = logging.getLogger()
+        self.micro_batch_size = config.micro_batch_size
 
     async def async_init(self) -> None:
         """Do nothing for now. Just used to make sure the actor is ready."""
@@ -193,7 +195,9 @@ class HFTrainingModel:
         async with self._lock:
             if lora_id in self.adapter_optimizer:
                 self.model.delete_adapter(lora_id)
-                self.adapter_optimizer.pop(lora_id)
+                optimizer = self.adapter_optimizer.pop(lora_id)
+                del optimizer
+                torch.cuda.empty_cache()
 
     # --------------------------------
     # Training methods
@@ -207,7 +211,7 @@ class HFTrainingModel:
         backward: bool = False,
         trace_context: dict[str, str] | None = None,
     ) -> types.ForwardBackwardOutput:
-        """Forward pass
+        """Forward pass with micro-batch gradient accumulation.
 
         Args:
             data: List of Datum objects containing input data.
@@ -222,73 +226,163 @@ class HFTrainingModel:
         """
         ctx = extract_context(trace_context or {})
         span_name = "hf_model.forward_backward" if backward else "hf_model.forward"
+
         with _get_tracer().start_as_current_span(span_name, context=ctx) as span:
             span.set_attribute("tuft.lora_id", lora_id)
             span.set_attribute("tuft.backward", backward)
             span.set_attribute("tuft.data_count", len(data))
-            # Prepare input tensors
-            input_ids = [
-                torch.tensor(datum.model_input.to_ints(), dtype=torch.long) for datum in data
-            ]
-            input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
-            attention_mask = (input_ids_padded != 0).long()
-            position_ids = (
-                torch.arange(input_ids_padded.size(1), dtype=torch.long)
-                .unsqueeze(0)
-                .expand(input_ids_padded.size(0), -1)
-            )
-            # Move tensors to model device
-            device = next(self.model.parameters()).device
-            input_ids_padded = input_ids_padded.to(device)
-            attention_mask = attention_mask.to(device)
-            position_ids = position_ids.to(device)
-
-            # Activate the correct adapter
-            async with self._lock:
-                self._activate_adapter(lora_id)
-
-            # Forward pass
-            outputs = self.model(
-                input_ids=input_ids_padded,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                return_dict=True,
-            )
 
-
-
-            loss_fn_config = {}
-            loss_fn_callable = get_loss_fn(loss_fn)
-            logits = outputs.logits
-            if "temperature" in loss_fn_config:
-                temperature = loss_fn_config["temperature"]
-                logits.div_(temperature)
+            batch_size = len(data)
+            micro_batch_size = self.config.micro_batch_size
 
-
+            num_micro_batches = (batch_size + micro_batch_size - 1) // micro_batch_size
+            span.set_attribute("tuft.num_micro_batches", num_micro_batches)
 
-
-
-
-
+            if num_micro_batches > 1:
+                self.logger.debug(
+                    f"[MICRO_BATCH] Splitting batch_size={batch_size} into "
+                    f"{num_micro_batches} micro-batches of size {micro_batch_size}"
+                )
 
-
+            loss_fn_callable = get_loss_fn(loss_fn)
+            all_loss_fn_outputs = []
+            micro_batch_weights = []
+            metric_list = []
+            total_loss = 0.0
 
-
-
-            loss.backward()
+            async with self._lock:
+                self._activate_adapter(lora_id)
 
-
-
+            for micro_idx in range(num_micro_batches):
+                start_idx = micro_idx * micro_batch_size
+                end_idx = min(start_idx + micro_batch_size, batch_size)
+                micro_data = data[start_idx:end_idx]
+
+                torch.cuda.reset_peak_memory_stats()
+                self.logger.debug(
+                    f"[GPU-micro_batch_{micro_idx}] before_forward: "
+                    f"allocated={torch.cuda.memory_allocated() / 1e9:.2f}GB, "
+                    f"reserved={torch.cuda.memory_reserved() / 1e9:.2f}GB"
+                )
+
+                micro_loss, micro_metrics, micro_outputs = await self._forward_micro_batch(
+                    micro_data,
+                    loss_fn_callable,
+                    loss_fn_config,
+                    backward=backward,
+                )
+
+                total_loss += micro_loss
+                all_loss_fn_outputs.extend(micro_outputs)
+                micro_batch_weights.append(len(micro_outputs))
+
+                metric_list.append(micro_metrics)
+
+                self.logger.debug(
+                    f"[GPU-micro_batch_{micro_idx}] after_forward: "
+                    f"allocated={torch.cuda.memory_allocated() / 1e9:.2f}GB, "
+                    f"reserved={torch.cuda.memory_reserved() / 1e9:.2f}GB, "
+                    f"max_allocated={torch.cuda.max_memory_allocated() / 1e9:.2f}GB"
+                )
+
+                torch.cuda.empty_cache()
+
+            avg_loss = total_loss / num_micro_batches
+            self.logger.debug(f"Average loss: {avg_loss}")
+            metric_list = metrics_reduction(metric_list, micro_batch_weights)
+
+            self.logger.debug(
+                f"[GPU-after_micro_batches] allocated={torch.cuda.memory_allocated() / 1e9:.2f}GB"
+                f", reserved={torch.cuda.memory_reserved() / 1e9:.2f}GB"
             )
+
             return types.ForwardBackwardOutput(
                 loss_fn_output_type=loss_fn,
-                loss_fn_outputs=
-
-                    for logprobs in unpaded_logprobs
-                ],
-                metrics=metric,
+                loss_fn_outputs=all_loss_fn_outputs,
+                metrics=metric_list or {},
             )
 
+    async def _forward_micro_batch(
+        self,
+        data: list[types.Datum],
+        loss_fn_callable: Callable,
+        loss_fn_config: dict[str, float] | None,
+        backward: bool,
+    ) -> tuple[float, dict[str, float], list[dict]]:
+        """Process a single micro-batch.
+
+        Returns:
+            tuple: (loss_value, metrics_dict, loss_fn_outputs_list)
+        """
+        # Prepare input tensors
+        input_ids = [torch.tensor(datum.model_input.to_ints(), dtype=torch.long) for datum in data]
+        input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
+        attention_mask = (input_ids_padded != 0).long()
+        position_ids = (
+            torch.arange(input_ids_padded.size(1), dtype=torch.long)
+            .unsqueeze(0)
+            .expand(input_ids_padded.size(0), -1)
+        )
+
+        device = next(self.model.parameters()).device
+        input_ids_padded = input_ids_padded.to(device)
+        attention_mask = attention_mask.to(device)
+        position_ids = position_ids.to(device)
+
+        # Forward pass
+        outputs = self.model(
+            input_ids=input_ids_padded,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_dict=True,
+        )
+
+        if loss_fn_config is None:
+            loss_fn_config = {}
+
+        logits = outputs.logits
+        del outputs
+        torch.cuda.empty_cache()
+
+        if "temperature" in loss_fn_config:
+            temperature = loss_fn_config["temperature"]
+            logits = logits / temperature
+
+        loss_fn_inputs = self._prepare_loss_fn_inputs(data)
+        target_tokens = loss_fn_inputs["target_tokens"]
+
+        target_logprobs = self._compute_logprobs_from_target_tokens(logits, target_tokens)
+        del logits
+        torch.cuda.empty_cache()
+
+        loss_fn_inputs["target_logprobs"] = target_logprobs
+        loss, metric = loss_fn_callable(loss_fn_inputs, loss_fn_config)
+
+        # Backward with gradient accumulation
+        if backward:
+            loss.backward(retain_graph=False)
+            torch.cuda.empty_cache()
+
+        unpaded_logprobs = self._unpad_tensor(
+            target_logprobs.detach(),
+            [len(datum.model_input.to_ints()) for datum in data],
+        )
+        loss_fn_outputs = [
+            {"logprobs": types.TensorData.from_torch(logprobs.cpu().clone())}
+            for logprobs in unpaded_logprobs
+        ]
+
+        loss_value = loss.detach().item()
+
+        del target_logprobs
+        del unpaded_logprobs
+        del loss_fn_inputs
+        del loss
+
+        torch.cuda.empty_cache()
+
+        return loss_value, metric, loss_fn_outputs
+
     async def optim_step(
         self,
         adam_params: types.AdamParams,
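
The rewritten forward path splits each request into ceil(batch_size / micro_batch_size) micro-batches, accumulates gradients across them when `backward` is set, averages the per-micro-batch losses, and reduces metrics with per-micro-batch weights via `metrics_reduction`. Below is a minimal, self-contained sketch of that bookkeeping; `weighted_metrics_reduction` is an assumed stand-in for `tuft.loss_fn.metrics_reduction`, whose actual implementation is not shown in this diff.

# Standalone sketch of the micro-batch bookkeeping used above.
# `weighted_metrics_reduction` is a hypothetical stand-in for
# tuft.loss_fn.metrics_reduction (not shown in this diff).
def split_into_micro_batches(data: list, micro_batch_size: int) -> list[list]:
    """Split `data` into ceil(len(data) / micro_batch_size) chunks."""
    num_micro_batches = (len(data) + micro_batch_size - 1) // micro_batch_size
    return [
        data[i * micro_batch_size : (i + 1) * micro_batch_size]
        for i in range(num_micro_batches)
    ]

def weighted_metrics_reduction(
    metric_list: list[dict[str, float]], weights: list[int]
) -> dict[str, float]:
    """Weight each micro-batch metric by its number of examples (assumed behaviour)."""
    total = sum(weights)
    keys = metric_list[0].keys()
    return {
        k: sum(m[k] * w for m, w in zip(metric_list, weights)) / total for k in keys
    }

if __name__ == "__main__":
    batches = split_into_micro_batches(list(range(10)), micro_batch_size=4)
    print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    print(weighted_metrics_reduction(
        [{"loss": 1.0}, {"loss": 2.0}, {"loss": 4.0}],
        [len(b) for b in batches],
    ))  # {'loss': 2.0}
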
@@ -316,7 +410,9 @@ class HFTrainingModel:
             param_group["weight_decay"] = adam_params.weight_decay
         optimizer.step()
         optimizer.zero_grad()
-
+
+        torch.cuda.empty_cache()
+        return types.OptimStepResponse()
 
     # --------------------------------
     # Helper methods
@@ -362,11 +458,33 @@ class HFTrainingModel:
     def _compute_logprobs_from_target_tokens(
         self, logits: torch.Tensor, target_tokens: torch.Tensor
     ) -> torch.Tensor:
-
-
-
-
-
+        """Compute log probabilities of target tokens from logits with low memory usage.
+        https://github.com/OpenRLHF/OpenRLHF/pull/718
+        """
+        if logits.dtype in [torch.float32, torch.float64]:
+            logits_labels = torch.gather(logits, dim=-1, index=target_tokens.unsqueeze(-1)).squeeze(
+                -1
+            )
+            logsumexp_values = torch.stack(
+                [
+                    torch.logsumexp(logit, dim=-1) for logit in logits
+                ]  # loop to reduce peak mem consumption
+            )
+            log_probs_labels = (
+                logits_labels - logsumexp_values
+            )  # log_softmax(x_i) = x_i - logsumexp(x)
+        else:
+            log_probs_labels = []
+            for row_logits, row_labels in zip(
+                logits, target_tokens, strict=True
+            ):  # loop to reduce peak mem consumption
+                row_log_probs = torch.nn.functional.log_softmax(row_logits, dim=-1)
+                row_log_probs_labels = row_log_probs.gather(
+                    dim=-1, index=row_labels.unsqueeze(-1)
+                ).squeeze(-1)
+                log_probs_labels.append(row_log_probs_labels)
+            log_probs_labels = torch.stack(log_probs_labels)
+        return log_probs_labels
 
     def _unpad_tensor(
         self, padded_tensor: torch.Tensor, original_lengths: list[int]
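
The low-memory helper added above avoids materialising a full log-softmax over the vocabulary by gathering the target-token logits first and subtracting `logsumexp`, using the identity log_softmax(x)_i = x_i - logsumexp(x) (see the linked OpenRLHF PR). A small sanity-check sketch of that identity follows; the tensor shapes are illustrative, not taken from TuFT.

import torch

# Sanity check of the identity used above: log_softmax(x)[i] == x[i] - logsumexp(x).
# Shapes are illustrative only (batch of 2 sequences, length 5, vocab 11).
logits = torch.randn(2, 5, 11)
targets = torch.randint(0, 11, (2, 5))

# Full log-softmax, then gather the target-token column.
reference = torch.nn.functional.log_softmax(logits, dim=-1).gather(
    dim=-1, index=targets.unsqueeze(-1)
).squeeze(-1)

# Gather first, then subtract logsumexp -- avoids materialising the full log-softmax.
gathered = torch.gather(logits, dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)
low_memory = gathered - torch.logsumexp(logits, dim=-1)

assert torch.allclose(reference, low_memory, atol=1e-6)
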
@@ -383,6 +501,8 @@ class HFTrainingModel:
             dtype="auto",
             device_map="auto",
         )
+        model.enable_input_require_grads()
+        model.gradient_checkpointing_enable({"use_reentrant": False})
         peft_config = LoraConfig()
         peft_model = get_peft_model(model, peft_config=peft_config, adapter_name="default")
         return peft_model
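
The two lines added to `_init_peft_model` turn on non-reentrant gradient checkpointing and call `enable_input_require_grads()`, the usual companion step when the base weights are frozen under a LoRA adapter, so checkpointed segments still have an input that requires grad to recompute through. A hedged sketch of the same pattern outside TuFT; the model id is a placeholder.

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Placeholder model id; any causal LM follows the same pattern.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

# Make the (frozen) embedding outputs require grad so checkpointed segments
# keep a gradient path, then enable non-reentrant activation checkpointing.
model.enable_input_require_grads()
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

peft_model = get_peft_model(model, peft_config=LoraConfig(), adapter_name="default")
peft_model.print_trainable_parameters()
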
@@ -398,7 +518,7 @@ class HFTrainingModel:
             ray.remote(cls)
             .options(
                 name="training_model_" + config.model_name,
-                num_gpus=1 if not config.colocate else 1 - config.sampling_memory_fraction,
+                num_gpus=(1 if not config.colocate else 1 - config.sampling_memory_fraction),
             )
             .remote(config)
         )
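
The `num_gpus` change above keeps the same fractional-GPU request for the training actor when sampling and training are colocated, reserving `1 - sampling_memory_fraction` of the device and leaving the rest for the sampler. A hedged sketch of that Ray pattern follows; the actor, the fraction value, and the `ray.init(num_gpus=1)` logical-GPU declaration are illustrative, not TuFT code.

import ray

# Declare one logical GPU so the example runs even without physical GPUs.
ray.init(num_gpus=1)

@ray.remote
class TrainingActor:
    def ping(self) -> str:
        return "ok"

sampling_memory_fraction = 0.4  # illustrative value
colocate = True

# When colocated, leave `sampling_memory_fraction` of the GPU for the sampler.
actor = TrainingActor.options(
    num_gpus=(1 if not colocate else 1 - sampling_memory_fraction)
).remote()
print(ray.get(actor.ping.remote()))  # "ok"
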
tuft/cli.py
CHANGED
@@ -3,49 +3,198 @@
 from __future__ import annotations
 
 import logging
+import os
 from pathlib import Path
 
 import typer
 import uvicorn
 
 from .config import AppConfig, load_yaml_config
+from .exceptions import ConfigMismatchError
+from .persistence import (
+    flush_all_data,
+    get_current_namespace,
+    get_redis_store,
+    validate_config_signature,
+)
 from .server import create_root_app
 from .telemetry import init_telemetry
 from .telemetry.metrics import ResourceMetricsCollector
 
 
-app = typer.Typer(help="TuFT - Tenant-unified Fine-Tuning Server.")
+app = typer.Typer(help="TuFT - Tenant-unified Fine-Tuning Server.", no_args_is_help=True)
+clear_app = typer.Typer(help="Clear data commands.", no_args_is_help=True)
+app.add_typer(clear_app, name="clear")
+
+
+# Required for Typer to recognize subcommands when using no_args_is_help=True
+@app.callback()
+def callback() -> None:
+    """TuFT - Tenant-unified Fine-Tuning Server."""
+
+
+# Default paths based on TUFT_HOME
+_TUFT_HOME = Path(os.environ.get("TUFT_HOME", Path.home() / ".tuft"))
+_DEFAULT_CONFIG_PATH = _TUFT_HOME / "configs" / "tuft_config.yaml"
+_DEFAULT_CHECKPOINT_DIR = _TUFT_HOME / "checkpoints"
 
 _HOST_OPTION = typer.Option("127.0.0.1", "--host", help="Interface to bind", envvar="TUFT_HOST")
 _PORT_OPTION = typer.Option(10610, "--port", "-p", help="Port to bind", envvar="TUFT_PORT")
-_LOG_LEVEL_OPTION = typer.Option(
+_LOG_LEVEL_OPTION = typer.Option(
+    "info", "--log-level", help="Uvicorn log level", envvar="TUFT_LOG_LEVEL"
+)
 _RELOAD_OPTION = typer.Option(False, "--reload", help="Enable auto-reload (development only)")
 _CONFIG_OPTION = typer.Option(
     None,
     "--config",
     "-c",
-    help="Path to a TuFT configuration file (YAML)",
+    help=f"Path to a TuFT configuration file (YAML). Defaults to {_DEFAULT_CONFIG_PATH}",
+    envvar="TUFT_CONFIG",
 )
 _CHECKPOINT_DIR_OPTION = typer.Option(
     None,
     "--checkpoint-dir",
-    help="Override checkpoint_dir from config file. Defaults to
+    help=f"Override checkpoint_dir from config file. Defaults to {_DEFAULT_CHECKPOINT_DIR}",
+    envvar="TUFT_CHECKPOINT_DIR",
 )
 
 
+def _resolve_config_path(config_path: Path | None) -> Path:
+    """Resolve the config path, falling back to default if not provided."""
+    if config_path is not None:
+        return config_path
+    if _DEFAULT_CONFIG_PATH.exists():
+        return _DEFAULT_CONFIG_PATH
+    raise typer.BadParameter(
+        f"Configuration file must be provided via --config or TUFT_CONFIG, "
+        f"or create a default config at {_DEFAULT_CONFIG_PATH}"
+    )
+
+
 def _build_config(
     config_path: Path | None,
     checkpoint_dir: Path | None,
 ) -> AppConfig:
-
-
-
+    resolved_config_path = _resolve_config_path(config_path)
+    config = load_yaml_config(resolved_config_path)
+    # Apply checkpoint_dir override, or use default if not in config
     if checkpoint_dir is not None:
         config.checkpoint_dir = checkpoint_dir.expanduser()
+    elif config.checkpoint_dir is None:
+        config.checkpoint_dir = _DEFAULT_CHECKPOINT_DIR
+    # Guarantee checkpoint_dir is set after resolution
+    assert config.checkpoint_dir is not None, "checkpoint_dir must be set after config resolution"
     config.ensure_directories()
     return config
 
 
+_FORCE_OPTION = typer.Option(
+    False,
+    "--force",
+    "-f",
+    help="Skip confirmation prompts when clearing persistence data.",
+)
+
+
+@clear_app.command(name="persistence")
+def clear_persistence(
+    config_path: Path | None = _CONFIG_OPTION,
+    force: bool = _FORCE_OPTION,
+) -> None:
+    """Clear all persistence data and start fresh.
+
+    This command clears all existing persistence data in the configured namespace.
+    Use this when the configuration has changed and you want to discard old data.
+    """
+    # Build config to get persistence settings
+    try:
+        resolved_config_path = _resolve_config_path(config_path)
+        config = load_yaml_config(resolved_config_path)
+    except typer.BadParameter as e:
+        typer.secho(f"Error: {e}", fg=typer.colors.RED)
+        raise typer.Exit(1) from e
+
+    if not config.persistence.enabled:
+        typer.secho(
+            "Persistence is disabled in the configuration. Nothing to clear.",
+            fg=typer.colors.YELLOW,
+        )
+        raise typer.Exit(0)
+
+    # Configure the store
+    store = get_redis_store()
+    store.configure(config.persistence)
+    namespace = get_current_namespace()
+
+    if not force:
+        typer.secho(
+            "\n🚨🚨🚨 CRITICAL WARNING 🚨🚨🚨\n",
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        typer.secho(
+            "This command will PERMANENTLY DELETE ALL persistence data!\n",
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        typer.secho(
+            f"📦 Target namespace: '{namespace}'\n",
+            fg=typer.colors.YELLOW,
+            bold=True,
+        )
+        typer.echo(
+            f"This IRREVERSIBLE action will destroy ALL data in namespace '{namespace}':\n"
+            " ❌ All saved sessions\n"
+            " ❌ All training run records and checkpoint metadata (NOT local checkpoint files)\n"
+            " ❌ All future records\n"
+            " ❌ All sampling session records\n"
+            " ❌ Configuration signature\n"
+            "\n"
+            "⚠️ The server will start fresh with NO previous state.\n"
+            "⚠️ This action CANNOT be undone!\n"
+            "⚠️ Local checkpoint files on disk are NOT affected.\n"
+            f"⚠️ Only data in namespace '{namespace}' will be affected.\n"
+        )
+        confirmed = typer.confirm(
+            f"Do you REALLY want to delete all data in namespace '{namespace}'?",
+            default=False,
+        )
+        if not confirmed:
+            typer.echo("Aborted. No data was cleared.")
+            raise typer.Exit(0)
+
+    deleted_count, cleared_namespace = flush_all_data()
+    typer.secho(
+        f"✅ Cleared {deleted_count} keys from namespace '{cleared_namespace}'.",
+        fg=typer.colors.GREEN,
+    )
+    typer.echo("Persistence data has been cleared. You can now start the server fresh.")
+
+
+def _validate_persistence_config(config: AppConfig) -> None:
+    """Validate that persistence config matches stored config.
+
+    If config mismatch is detected, exits with an error message.
+    """
+    if not config.persistence.enabled:
+        return
+
+    # Configure the Redis store first
+    store = get_redis_store()
+    store.configure(config.persistence)
+
+    try:
+        validate_config_signature(config)
+    except ConfigMismatchError as e:
+        typer.secho(
+            "\n 🚫 FATAL ERROR: Configuration Mismatch Detected 🚫",
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        typer.echo(f"\n{e}\n")
+        raise typer.Exit(1) from e
+
+
 def _init_telemetry(config: AppConfig, log_level: str) -> None:
     """Initialize OpenTelemetry if enabled."""
     # Configure root logger level to ensure logs flow to OTel
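
With the `clear` sub-app registered above, the new maintenance command is reachable as `clear persistence` and honours `--config`/`-c` and `--force`/`-f`. A minimal invocation sketch using Typer's test runner; the YAML path is a placeholder and must point at a config with persistence enabled.

from typer.testing import CliRunner

from tuft.cli import app

runner = CliRunner()

# Skip the interactive confirmation with --force; the YAML path is a placeholder.
result = runner.invoke(
    app,
    ["clear", "persistence", "--config", "configs/tuft_config.yaml", "--force"],
)
print(result.exit_code)
print(result.output)
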
@@ -71,6 +220,10 @@ def launch(
 ) -> None:
     """Launch the TuFT server."""
     app_config = _build_config(config_path, checkpoint_dir)
+
+    # Validate persistence configuration before starting
+    _validate_persistence_config(app_config)
+
     # Initialize telemetry before starting the server
     _init_telemetry(app_config, log_level)
     logging.getLogger("tuft").info("Server starting on %s:%s", host, port)
@@ -84,7 +237,7 @@ def launch(
 
 
 def main() -> None:
-    app()
+    app(prog_name="tuft")
 
 
 if __name__ == "__main__":