PyPI - tuft - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

tuft 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

tuft/backends/hf_training_model.py +184 -64
tuft/cli.py +120 -0
tuft/config.py +58 -56
tuft/exceptions.py +66 -0
tuft/futures.py +22 -2
tuft/loss_fn/__init__.py +33 -0
tuft/persistence/__init__.py +10 -2
tuft/persistence/redis_store.py +352 -31
tuft/sampling_controller.py +34 -10
tuft/sequence_executor.py +72 -0
tuft/server.py +9 -2
tuft/state.py +3 -0
tuft/training_controller.py +14 -4
{tuft-0.1.2.dist-info → tuft-0.1.3.dist-info}/METADATA +9 -65
{tuft-0.1.2.dist-info → tuft-0.1.3.dist-info}/RECORD +18 -17
{tuft-0.1.2.dist-info → tuft-0.1.3.dist-info}/WHEEL +0 -0
{tuft-0.1.2.dist-info → tuft-0.1.3.dist-info}/entry_points.txt +0 -0
{tuft-0.1.2.dist-info → tuft-0.1.3.dist-info}/licenses/LICENSE +0 -0

tuft/exceptions.py CHANGED Viewed

@@ -1,5 +1,7 @@
 """Some custom exceptions."""
+from typing import Any
 class TuFTException(Exception):
     """Base exception for TuFT errors."""
@@ -79,6 +81,15 @@ class SequenceConflictException(FutureException):
         self.got = got
+class SequenceTimeoutException(FutureException):
+    """Timeout waiting for the expected sequence ID."""
+    def __init__(self, expected_sequence_id: int):
+        detail = f"Timeout when waiting for sequence ID {expected_sequence_id}."
+        super().__init__(detail)
+        self.sequence_id = expected_sequence_id
 class MissingSequenceIDException(FutureException):
     """Missing sequence ID in the request."""
@@ -136,3 +147,58 @@ class LossFunctionInputShapeMismatchException(LossFunctionException):
         detail = f"Input tensors must have the same shape. Got shapes: {shapes}"
         super().__init__(detail)
         self.shapes = shapes
+class LossFunctionUnknownMetricReductionException(LossFunctionException):
+    def __init__(self, reduction_type: str):
+        detail = f"Unknown metric reduction type: {reduction_type}"
+        super().__init__(detail)
+        self.reduction_type = reduction_type
+class PersistenceException(TuFTException):
+    """Base exception for Persistence related errors."""
+class ConfigMismatchError(PersistenceException):
+    """Raised when current config doesn't match the stored config in Redis.
+    This error occurs during server startup when persistence is enabled and
+    the configuration has changed since the last run. This can cause data
+    corruption when restoring persisted state.
+    """
+    def __init__(
+        self,
+        diff: dict[str, dict[str, Any]],
+    ):
+        self.diff = diff
+        # Build detailed diff message
+        diff_parts = []
+        for field_name, field_diff in diff.items():
+            # Handle scalar fields (current/stored)
+            current = field_diff.get("current")
+            stored = field_diff.get("stored")
+            parts = []
+            if current is not None or stored is not None:
+                parts.append(f"current: {current}, stored: {stored}")
+            if parts:
+                diff_parts.append(f"{field_name} ({', '.join(parts)})")
+        diff_str = "; ".join(diff_parts) if diff_parts else "unknown difference"
+        message = (
+            f"Configuration mismatch detected: {diff_str}.\n"
+            "The current configuration does not match the stored configuration in Redis.\n"
+            "This can cause data corruption when restoring persisted state.\n\n"
+            "Options:\n"
+            "  1. Use a different Redis database (change redis_url in config)\n"
+            "  2. Run `tuft clear persistence -c <config_path>` to clear existing data\n"
+            "     Use `--force` or `-f` to skip confirmation prompt.\n"
+            "     (WARNING: This will delete all persisted sessions, training runs, etc.)\n"
+            "  3. Restore the original configuration that matches the stored data"
+        )
+        super().__init__(message)

tuft/futures.py CHANGED Viewed

@@ -189,6 +189,24 @@ class FutureStore:
                 count += 1
         return count
+    def mark_pending_sample_futures_failed(
+        self,
+        error_message: str = "Server restarted while sample request was pending. Please retry.",
+    ) -> int:
+        """Mark all pending sample futures as failed."""
+        count = 0
+        for record in self._records.values():
+            if record.status == "pending" and record.operation_type == "sample":
+                record.status = "failed"
+                record.error = types.RequestFailedResponse(
+                    error=error_message,
+                    category=types.RequestErrorCategory.Server,
+                )
+                record.event.set()
+                self._save_future(record.request_id)
+                count += 1
+        return count
     def _store_record(self, record: FutureRecord) -> None:
         self._records[record.request_id] = record
         self._save_future(record.request_id)
@@ -327,8 +345,9 @@ class FutureStore:
             record.payload = payload
             record.status = "ready"
             record.error = None
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, self._save_future, request_id)
             record.event.set()
-            self._save_future(request_id)
             # Update metrics
             get_metrics().futures_completed.add(
@@ -352,8 +371,9 @@ class FutureStore:
                 return
             record.status = "failed"
             record.error = failure
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, self._save_future, request_id)
             record.event.set()
-            self._save_future(request_id)
             # Update metrics
             get_metrics().futures_completed.add(

tuft/loss_fn/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from typing import Callable, Dict, Tuple
+from tinker.lib.chunked_fwdbwd_helpers import REDUCE_MAP
 from torch import Tensor
 from typing_extensions import TypeAlias
@@ -7,6 +8,7 @@ from ..exceptions import (
     LossFunctionInputShapeMismatchException,
     LossFunctionMissingInputException,
     LossFunctionNotFoundException,
+    LossFunctionUnknownMetricReductionException,
 )
@@ -46,3 +48,34 @@ def _check_loss_fn_inputs(
         shapes = [loss_fn_inputs[key].shape for key in required_keys]
         if not all(shape == shapes[0] for shape in shapes):
             raise LossFunctionInputShapeMismatchException(shapes)
+def metrics_reduction(
+    metric_list: list[dict[str, float]],
+    weights: list[float],
+) -> dict[str, float]:
+    """Aggregate metrics from multiple batches.
+    Modified from tinker.lib.chunked_fwdbwd_helpers._metrics_reduction
+    """
+    if not metric_list:
+        return {}
+    keys = metric_list[0].keys()
+    result = {}
+    for key in keys:
+        _, reduction = key.split(":")
+        if reduction not in REDUCE_MAP:
+            raise LossFunctionUnknownMetricReductionException(reduction)
+        if not all(key in m for m in metric_list):
+            continue
+        reduce_fn = REDUCE_MAP[reduction]
+        values = [m[key] for m in metric_list]
+        if reduction in ["mean", "slack"]:
+            result[key] = reduce_fn(values, weights)
+        elif reduction in ["unique"]:
+            result[key] = values[0]
+            result.update({f"{key}_{i + 1}": v for i, v in enumerate(values[1:])})
+        else:
+            result[key] = reduce_fn(values)
+    return result

tuft/persistence/__init__.py CHANGED Viewed

@@ -3,30 +3,38 @@
 from __future__ import annotations
 from .redis_store import (
-    DEFAULT_FUTURE_TTL_SECONDS,
+    ConfigCheckField,
     PersistenceConfig,
     PersistenceMode,
     RedisPipeline,
     RedisStore,
     delete_record,
+    flush_all_data,
+    get_current_namespace,
     get_redis_store,
     is_persistence_enabled,
     load_record,
+    save_config_signature,
     save_record,
     save_records_atomic,
+    validate_config_signature,
 )
 __all__ = [
-    "DEFAULT_FUTURE_TTL_SECONDS",
+    "ConfigCheckField",
     "PersistenceConfig",
     "PersistenceMode",
     "RedisPipeline",
     "RedisStore",
     "delete_record",
+    "flush_all_data",
+    "get_current_namespace",
     "get_redis_store",
     "is_persistence_enabled",
     "load_record",
+    "save_config_signature",
     "save_record",
     "save_records_atomic",
+    "validate_config_signature",
 ]

tuft 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

tuft 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl