mlxsmith 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlxsmith/__init__.py +2 -0
- mlxsmith/accel/__init__.py +10 -0
- mlxsmith/accel/base.py +17 -0
- mlxsmith/accel/none.py +13 -0
- mlxsmith/accel/zmlx_backend.py +42 -0
- mlxsmith/adapters.py +46 -0
- mlxsmith/api/__init__.py +48 -0
- mlxsmith/api/handlers.py +1217 -0
- mlxsmith/api/schemas.py +436 -0
- mlxsmith/auth.py +88 -0
- mlxsmith/bench.py +102 -0
- mlxsmith/cli.py +950 -0
- mlxsmith/config.py +543 -0
- mlxsmith/config_models.py +261 -0
- mlxsmith/data.py +493 -0
- mlxsmith/envs/__init__.py +33 -0
- mlxsmith/envs/system.py +388 -0
- mlxsmith/envs/token_env.py +191 -0
- mlxsmith/eval.py +112 -0
- mlxsmith/infer.py +140 -0
- mlxsmith/llm/__init__.py +16 -0
- mlxsmith/llm/backend.py +126 -0
- mlxsmith/llm/interface.py +212 -0
- mlxsmith/llm/mlx_lm_backend.py +509 -0
- mlxsmith/llm/mock_backend.py +228 -0
- mlxsmith/llm/registry.py +12 -0
- mlxsmith/models.py +257 -0
- mlxsmith/orchestrator/__init__.py +25 -0
- mlxsmith/orchestrator/daemon.py +454 -0
- mlxsmith/orchestrator/inference_worker.py +496 -0
- mlxsmith/orchestrator/queue.py +355 -0
- mlxsmith/orchestrator/trainer_worker.py +437 -0
- mlxsmith/rlm/__init__.py +8 -0
- mlxsmith/rlm/corpus.py +74 -0
- mlxsmith/rlm/gating.py +90 -0
- mlxsmith/rlm/generate.py +249 -0
- mlxsmith/rlm/history.py +12 -0
- mlxsmith/rlm/inference.py +150 -0
- mlxsmith/rlm/loop.py +1297 -0
- mlxsmith/rlm/mutate.py +82 -0
- mlxsmith/rlm/trainer.py +73 -0
- mlxsmith/rlm/weights.py +263 -0
- mlxsmith/runs.py +44 -0
- mlxsmith/sdk/__init__.py +392 -0
- mlxsmith/sdk/future.py +486 -0
- mlxsmith/sdk/losses.py +262 -0
- mlxsmith/sdk/sampling_client.py +729 -0
- mlxsmith/sdk/training_client.py +676 -0
- mlxsmith/server.py +376 -0
- mlxsmith/train/__init__.py +0 -0
- mlxsmith/train/distill.py +279 -0
- mlxsmith/train/lora.py +280 -0
- mlxsmith/train/pref.py +180 -0
- mlxsmith/train/rft.py +458 -0
- mlxsmith/train/sft.py +151 -0
- mlxsmith/util.py +174 -0
- mlxsmith/verifiers/__init__.py +3 -0
- mlxsmith/verifiers/compose.py +109 -0
- mlxsmith/verifiers/docker_verifier.py +111 -0
- mlxsmith/verifiers/jsonschema.py +54 -0
- mlxsmith/verifiers/pytest_verifier.py +82 -0
- mlxsmith/verifiers/regex.py +15 -0
- mlxsmith/verifiers/types.py +10 -0
- mlxsmith-0.1.0.dist-info/METADATA +163 -0
- mlxsmith-0.1.0.dist-info/RECORD +69 -0
- mlxsmith-0.1.0.dist-info/WHEEL +5 -0
- mlxsmith-0.1.0.dist-info/entry_points.txt +2 -0
- mlxsmith-0.1.0.dist-info/licenses/LICENSE +21 -0
- mlxsmith-0.1.0.dist-info/top_level.txt +1 -0
mlxsmith/api/schemas.py
ADDED
@@ -0,0 +1,436 @@
"""Pydantic models for API request/response validation.

OpenAPI 3.1 compatible schemas for MLXSmith API.
"""

from __future__ import annotations

from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field


# =============================================================================
# Common Schemas
# =============================================================================

class ErrorResponse(BaseModel):
    """Error response schema."""
    error: str = Field(..., description="Error message")
    code: Optional[str] = Field(None, description="Error code")
    details: Optional[Dict[str, Any]] = Field(None, description="Additional error details")


class HealthResponse(BaseModel):
    """Health check response."""
    ok: bool = Field(..., description="Service health status")
    version: Optional[str] = Field(None, description="API version")
    model: Optional[str] = Field(None, description="Currently loaded model")


# =============================================================================
# Chat Completions (OpenAI-compatible)
# =============================================================================

class ChatMessage(BaseModel):
    """A single chat message."""
    role: Literal["system", "user", "assistant", "tool"] = Field(
        ..., description="Role of the message sender"
    )
    content: str = Field(..., description="Message content")
    name: Optional[str] = Field(None, description="Optional name for the sender")
    tool_calls: Optional[List[Dict[str, Any]]] = Field(None, description="Tool calls (if any)")


class ChatRequest(BaseModel):
    """OpenAI-compatible chat completion request."""
    model: Optional[str] = Field(
        None, description="Model identifier (optional, uses default if not provided)"
    )
    messages: List[ChatMessage] = Field(
        ..., description="List of chat messages", min_length=1
    )
    max_tokens: int = Field(
        256, description="Maximum tokens to generate", ge=1, le=8192
    )
    temperature: float = Field(
        0.7, description="Sampling temperature", ge=0.0, le=2.0
    )
    top_p: float = Field(
        1.0, description="Nucleus sampling parameter", ge=0.0, le=1.0
    )
    top_k: Optional[int] = Field(
        None, description="Top-k sampling parameter", ge=1
    )
    stream: Optional[bool] = Field(
        False, description="Enable streaming response via SSE"
    )
    stop: Optional[Union[str, List[str]]] = Field(
        None, description="Stop sequences"
    )
    seed: Optional[int] = Field(None, description="Random seed for reproducibility")
    presence_penalty: Optional[float] = Field(
        0.0, description="Presence penalty", ge=-2.0, le=2.0
    )
    frequency_penalty: Optional[float] = Field(
        0.0, description="Frequency penalty", ge=-2.0, le=2.0
    )
    logprobs: Optional[bool] = Field(
        False, description="Return logprobs of output tokens"
    )
    top_logprobs: Optional[int] = Field(
        None, description="Number of top logprobs to return per token", ge=0, le=20
    )


class LogprobsContent(BaseModel):
    """Logprob information for a token."""
    token: str = Field(..., description="The token string")
    logprob: float = Field(..., description="The log probability of the token")
    bytes: Optional[List[int]] = Field(None, description="Bytes representation of token")
    top_logprobs: Optional[List[Dict[str, float]]] = Field(
        None, description="Top logprobs for this position"
    )


class ChoiceLogprobs(BaseModel):
    """Logprobs for a completion choice."""
    content: Optional[List[LogprobsContent]] = Field(
        None, description="Logprobs for each token in the completion"
    )


class UsageInfo(BaseModel):
    """Token usage information."""
    prompt_tokens: int = Field(..., description="Number of tokens in the prompt")
    completion_tokens: int = Field(..., description="Number of tokens in the completion")
    total_tokens: int = Field(..., description="Total number of tokens")


class Choice(BaseModel):
    """A single completion choice."""
    index: int = Field(..., description="Index of the choice")
    message: ChatMessage = Field(..., description="The generated message")
    finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = Field(
        None, description="Reason for completion finish"
    )
    logprobs: Optional[ChoiceLogprobs] = Field(None, description="Logprobs for this choice")


class ChatResponse(BaseModel):
    """OpenAI-compatible chat completion response."""
    id: str = Field(..., description="Unique identifier for the completion")
    object: Literal["chat.completion"] = Field("chat.completion")
    created: int = Field(..., description="Unix timestamp of creation")
    model: str = Field(..., description="Model used for the completion")
    choices: List[Choice] = Field(..., description="List of completion choices")
    usage: UsageInfo = Field(..., description="Token usage information")


class DeltaMessage(BaseModel):
    """Delta message for streaming responses."""
    role: Optional[Literal["assistant"]] = Field(None)
    content: Optional[str] = Field(None, description="Incremental content")


class StreamChoice(BaseModel):
    """A streaming completion choice."""
    index: int = Field(..., description="Index of the choice")
    delta: DeltaMessage = Field(..., description="Incremental message delta")
    finish_reason: Optional[Literal["stop", "length"]] = Field(None)
    logprobs: Optional[ChoiceLogprobs] = Field(None, description="Logprobs for this chunk")


class ChatCompletionChunk(BaseModel):
    """Streaming chat completion chunk (SSE)."""
    id: str = Field(..., description="Unique identifier")
    object: Literal["chat.completion.chunk"] = Field("chat.completion.chunk")
    created: int = Field(..., description="Unix timestamp")
    model: str = Field(..., description="Model used")
    choices: List[StreamChoice] = Field(..., description="List of choices")


# =============================================================================
# Internal Rollout (for RLM training)
# =============================================================================

class RolloutRequest(BaseModel):
    """Internal rollout request with detailed output options."""
    prompt: str = Field(..., description="Input prompt text", min_length=1)
    max_tokens: int = Field(256, description="Maximum tokens to generate", ge=1)
    temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
    top_p: float = Field(1.0, description="Nucleus sampling parameter", ge=0.0, le=1.0)
    top_k: Optional[int] = Field(None, description="Top-k sampling", ge=1)
    seed: Optional[int] = Field(None, description="Random seed")
    include_tokens: bool = Field(True, description="Include token IDs in response")
    include_logprobs: bool = Field(True, description="Include per-token logprobs")
    include_top_k_logprobs: Optional[int] = Field(
        None, description="Number of top logprobs per token to include", ge=0, le=20
    )
    include_prompt_logprobs: bool = Field(
        False, description="Include per-token logprobs for prompt tokens"
    )
    include_prompt_top_k_logprobs: Optional[int] = Field(
        None,
        description="Number of top logprobs per prompt token to include",
        ge=0,
        le=20,
    )
    include_text: bool = Field(True, description="Include generated text")


class RolloutResponse(BaseModel):
    """Internal rollout response with tokens and logprobs."""
    id: str = Field(..., description="Unique rollout identifier")
    created: int = Field(..., description="Unix timestamp")
    model: str = Field(..., description="Model used")
    prompt_len: int = Field(..., description="Length of prompt in tokens")
    token_ids: Optional[List[int]] = Field(None, description="Generated token IDs")
    logprobs: Optional[List[float]] = Field(None, description="Per-token log probabilities")
    top_k_logprobs: Optional[List[Dict[str, float]]] = Field(
        None, description="Top-k logprobs per token"
    )
    prompt_logprobs: Optional[List[float]] = Field(
        None,
        description="Per-token log probabilities for prompt tokens (excluding first token)",
    )
    prompt_top_k_logprobs: Optional[List[Dict[str, float]]] = Field(
        None, description="Top-k logprobs per prompt token"
    )
    completion: Optional[str] = Field(None, description="Generated text (if requested)")


# =============================================================================
# Training Endpoints
# =============================================================================

class ForwardBackwardRequest(BaseModel):
    """Request for forward/backward pass."""
    prompts: List[str] = Field(..., description="List of prompts", min_length=1)
    responses: Optional[List[str]] = Field(None, description="List of responses (for SFT)")
    rejected_responses: Optional[List[str]] = Field(
        None, description="List of rejected responses (for preference training)"
    )
    loss_type: Literal["sft", "dpo", "orpo", "ppo", "custom"] = Field(
        "sft", description="Type of loss to compute"
    )
    train_on_prompt: bool = Field(False, description="Compute loss on prompt tokens")
    max_seq_len: Optional[int] = Field(None, description="Maximum sequence length")
    extra: Optional[Dict[str, Any]] = Field(None, description="Additional loss parameters")


class ForwardBackwardResponse(BaseModel):
    """Response from forward/backward pass."""
    loss: float = Field(..., description="Computed loss value")
    has_grads: bool = Field(..., description="Whether gradients were computed")
    batch_size: int = Field(..., description="Batch size processed")
    metrics: Optional[Dict[str, float]] = Field(None, description="Additional metrics")


class OptimStepRequest(BaseModel):
    """Request for optimizer step."""
    learning_rate: Optional[float] = Field(None, description="Override learning rate")
    grad_clip: Optional[float] = Field(None, description="Gradient clipping threshold")


class OptimStepResponse(BaseModel):
    """Response from optimizer step."""
    step: int = Field(..., description="Current training step")
    learning_rate: float = Field(..., description="Learning rate used")
    grad_norm: Optional[float] = Field(None, description="Gradient norm")
    success: bool = Field(True, description="Whether step succeeded")


class SaveStateRequest(BaseModel):
    """Request to save training state."""
    path: str = Field(..., description="Path to save checkpoint")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata to save")


class SaveStateResponse(BaseModel):
    """Response from save state operation."""
    path: str = Field(..., description="Path where checkpoint was saved")
    success: bool = Field(..., description="Whether save succeeded")
    message: str = Field(..., description="Status message")


class LoadStateRequest(BaseModel):
    """Request to load training state."""
    path: str = Field(..., description="Path to checkpoint to load")


class LoadStateResponse(BaseModel):
    """Response from load state operation."""
    path: str = Field(..., description="Path from which checkpoint was loaded")
    success: bool = Field(..., description="Whether load succeeded")
    message: str = Field(..., description="Status message")
    step: Optional[int] = Field(None, description="Training step from checkpoint")


class GetWeightsResponse(BaseModel):
    """Response for get weights operation."""
    weights: Dict[str, Any] = Field(..., description="Model weights (may be partial/shape info)")
    success: bool = Field(..., description="Whether operation succeeded")
    message: str = Field(..., description="Status message")


class SetWeightsRequest(BaseModel):
    """Request to set model weights."""
    weights: Dict[str, Any] = Field(..., description="Model weights to set")


class SetWeightsResponse(BaseModel):
    """Response from set weights operation."""
    success: bool = Field(..., description="Whether operation succeeded")
    message: str = Field(..., description="Status message")
    num_tensors: int = Field(..., description="Number of weight tensors set")


# =============================================================================
# Adapter Management
# =============================================================================

class AdapterReloadRequest(BaseModel):
    """Request to reload adapter weights."""
    adapter_path: Optional[str] = Field(
        None, description="Path to adapter directory (relative or absolute)"
    )
    reload_base: bool = Field(
        False, description="Reload the base model before applying adapter"
    )


class AdapterReloadResponse(BaseModel):
    """Response after adapter reload."""
    ok: bool = Field(..., description="Whether reload was successful")
    base_model: str = Field(..., description="Base model identifier")
    adapter_path: Optional[str] = Field(None, description="Currently loaded adapter path")
    message: Optional[str] = Field(None, description="Status message")


# =============================================================================
# RLM State and History
# =============================================================================

class RLMTrainingMetrics(BaseModel):
    """RLM training metrics."""
    loss: Optional[float] = Field(None, description="Training loss")
    reward_mean: Optional[float] = Field(None, description="Mean reward")
    reward_std: Optional[float] = Field(None, description="Reward standard deviation")
    kl_div: Optional[float] = Field(None, description="KL divergence from reference")
    learning_rate: Optional[float] = Field(None, description="Current learning rate")


class RLMState(BaseModel):
    """Current RLM training state."""
    status: Literal["idle", "running", "paused", "completed", "error"] = Field(
        ..., description="Current training status"
    )
    iteration: Optional[int] = Field(None, description="Current training iteration")
    total_iterations: Optional[int] = Field(None, description="Total planned iterations")
    metrics: Optional[RLMTrainingMetrics] = Field(None, description="Current metrics")
    started_at: Optional[int] = Field(None, description="Training start timestamp")
    updated_at: Optional[int] = Field(None, description="Last update timestamp")
    error_message: Optional[str] = Field(None, description="Error message if status is error")


class RLMHistoryEntry(BaseModel):
    """Single RLM training history entry."""
    iteration: int = Field(..., description="Training iteration number")
    timestamp: int = Field(..., description="Unix timestamp")
    adapter_score: Optional[float] = Field(None, description="Adapter evaluation score")
    base_score: Optional[float] = Field(None, description="Base model score")
    improvement: Optional[float] = Field(None, description="Relative improvement")
    metrics: Optional[Dict[str, Any]] = Field(None, description="Additional metrics")


# =============================================================================
# Model Management
# =============================================================================

class ModelInfo(BaseModel):
    """Information about a cached model."""
    id: str = Field(..., description="Model identifier")
    path: str = Field(..., description="Local path to the model")
    size_bytes: Optional[int] = Field(None, description="Model size in bytes")
    format: Literal["mlx", "hf", "gguf"] = Field(..., description="Model format")
    has_adapter: bool = Field(False, description="Whether model has adapter weights")
    adapter_path: Optional[str] = Field(None, description="Path to adapter if present")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional model metadata")
    downloaded_at: Optional[int] = Field(None, description="Download timestamp")


class ModelsListResponse(BaseModel):
    """Response for listing cached models."""
    models: List[ModelInfo] = Field(..., description="List of cached models")
    total: int = Field(..., description="Total number of models")
    cache_dir: str = Field(..., description="Current cache directory")


class ModelPullRequest(BaseModel):
    """Request to pull a model from HuggingFace."""
    model_id: str = Field(..., description="HuggingFace model identifier", min_length=1)
    convert: bool = Field(True, description="Convert to MLX format")
    quantize: bool = Field(False, description="Quantize during conversion")
    q_bits: Optional[int] = Field(4, description="Quantization bits", ge=1, le=8)
    q_group_size: Optional[int] = Field(64, description="Quantization group size")
    trust_remote_code: bool = Field(False, description="Trust remote code in model")

    class Config:
        json_schema_extra = {
            "example": {
                "model_id": "mlx-community/Llama-3.2-1B-Instruct-4bit",
                "convert": True,
                "quantize": False,
            }
        }


class ModelPullStatus(BaseModel):
    """Status of model pull operation."""
    status: Literal["pending", "downloading", "converting", "completed", "error"] = Field(
        ..., description="Current pull status"
    )
    progress: Optional[float] = Field(None, description="Progress percentage (0-100)", ge=0, le=100)
    message: Optional[str] = Field(None, description="Status message")
    downloaded_bytes: Optional[int] = Field(None, description="Bytes downloaded so far")
    total_bytes: Optional[int] = Field(None, description="Total bytes to download")


class ModelPullResponse(BaseModel):
    """Response for model pull request."""
    ok: bool = Field(..., description="Whether pull was initiated successfully")
    model_id: str = Field(..., description="Model identifier")
    local_path: Optional[str] = Field(None, description="Local path where model will be stored")
    status: ModelPullStatus = Field(..., description="Current pull status")
    message: Optional[str] = Field(None, description="Status message")


# =============================================================================
# HuggingFace Token Management
# =============================================================================

class HFTokenRequest(BaseModel):
    """Request to store HuggingFace token."""
    token: str = Field(
        ...,
        description="HuggingFace API token",
        min_length=1,
        json_schema_extra={"format": "password"}
    )
    persist: bool = Field(
        True, description="Persist token to disk (encrypted if possible)"
    )
    validate_token: bool = Field(
        True, description="Validate token before storing"
    )


class HFTokenResponse(BaseModel):
    """Response after storing HF token."""
    ok: bool = Field(..., description="Whether token was stored successfully")
    validated: bool = Field(..., description="Whether token was validated")
    username: Optional[str] = Field(None, description="HF username if validated")
    message: str = Field(..., description="Status message")
    storage_method: Literal["keyring", "file", "memory"] = Field(
        ..., description="How the token is stored"
    )
mlxsmith/auth.py
ADDED
@@ -0,0 +1,88 @@
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import os
from huggingface_hub import HfApi, get_token as hf_get_token, logout as hf_logout
from huggingface_hub import constants as hf_constants


@dataclass
class AuthStatus:
    token_present: bool
    token_hint: Optional[str] = None
    user: Optional[str] = None
    warnings: list[str] = field(default_factory=list)


def _mask_token(token: str) -> str:
    if not token:
        return ""
    if len(token) <= 8:
        return "***"
    return f"{token[:4]}...{token[-4:]}"


def _token_path() -> Path:
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        return Path(hf_home) / "token"
    return Path(getattr(hf_constants, "HF_TOKEN_PATH", Path(hf_constants.HF_HOME) / "token"))


def load_token() -> Optional[str]:
    try:
        token = hf_get_token()
        if token:
            return token
    except Exception:
        pass
    path = _token_path()
    if path.exists():
        return path.read_text(encoding="utf-8").strip()
    return None


def save_token(token: str) -> None:
    path = _token_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(token, encoding="utf-8")


def delete_token() -> bool:
    removed = False
    try:
        hf_logout()
        removed = True
    except Exception:
        pass
    path = _token_path()
    if path.exists():
        path.unlink()
        removed = True
    return removed


def get_status(validate: bool = False) -> AuthStatus:
    token = load_token()
    if not token:
        return AuthStatus(token_present=False)
    status = AuthStatus(token_present=True, token_hint=_mask_token(token))
    if validate:
        try:
            info = HfApi().whoami(token=token)
            status.user = info.get("name") or info.get("fullname") or info.get("email")
        except Exception as exc:
            status.warnings.append(f"Token validation failed: {exc}")
    return status


def login(token: str, validate: bool = True) -> AuthStatus:
    save_token(token)
    return get_status(validate=validate)


def logout() -> bool:
    return delete_token()
mlxsmith/bench.py
ADDED
@@ -0,0 +1,102 @@
from __future__ import annotations

import json
import time
from pathlib import Path

from .config import ProjectConfig
from .models import resolve_model_spec
from .util import ensure_dir, now_ts
from .llm.registry import get_llm_backend
from .accel import get_backend


def run_bench(
    project_root: Path,
    cfg: ProjectConfig,
    model_id_or_path: str,
    accel: str,
    *,
    prompt: str,
    max_tokens: int,
    reps: int,
    mode: str = "inference",
    steps: int = 5,
) -> Path:
    out_dir = ensure_dir(project_root / "bench")
    out_path = out_dir / f"bench_{now_ts()}.json"

    accel_backend = get_backend(accel)
    accel_backend.patch()

    llm = get_llm_backend(cfg.model.backend)
    base_model, adapter_path, _meta = resolve_model_spec(project_root, model_id_or_path, cfg)
    llm.load(
        base_model,
        max_seq_len=cfg.model.max_seq_len,
        dtype=cfg.model.dtype,
        trust_remote_code=cfg.model.trust_remote_code,
    )
    if adapter_path:
        llm.apply_adapter(str(adapter_path))

    results = []
    mode = (mode or "inference").lower()

    if mode == "trainer":
        opt, _params = llm.optimizer_and_params(lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
        prompt_ids = llm.encode(prompt)
        ids = llm.encode(prompt + " " + "x" * max_tokens)
        for i in range(max(1, reps)):
            t0 = time.time()
            for _ in range(max(1, steps)):
                def loss_fn(_model):
                    return llm.sft_loss(ids, train_on_prompt=cfg.train.train_on_prompt, prompt_len=len(prompt_ids))

                _loss, grads = llm.value_and_grad(loss_fn)
                if grads is not None:
                    llm.apply_grads(opt, grads)
            elapsed = max(time.time() - t0, 1e-6)
            results.append({"rep": i, "steps": steps, "time_s": elapsed, "steps_per_s": steps / elapsed})
    elif mode == "end_to_end":
        opt, _params = llm.optimizer_and_params(lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
        for i in range(max(1, reps)):
            t0 = time.time()
            gen = llm.generate(prompt, max_new_tokens=max_tokens, temperature=0.0)

            def loss_fn(_model):
                return llm.rl_loss(gen.token_ids, prompt_len=gen.prompt_len, advantage=1.0)

            _loss, grads = llm.value_and_grad(loss_fn)
            if grads is not None:
                llm.apply_grads(opt, grads)
            elapsed = max(time.time() - t0, 1e-6)
            gen_tokens = max(0, len(gen.token_ids) - gen.prompt_len)
            results.append({"rep": i, "tokens": gen_tokens, "time_s": elapsed, "tps": gen_tokens / elapsed})
    else:
        for i in range(max(1, reps)):
            t0 = time.time()
            gen = llm.generate(prompt, max_new_tokens=max_tokens, temperature=0.0)
            elapsed = max(time.time() - t0, 1e-6)
            gen_tokens = max(0, len(gen.token_ids) - gen.prompt_len)
            results.append({"rep": i, "tokens": gen_tokens, "time_s": elapsed, "tps": gen_tokens / elapsed})

    if mode == "trainer":
        avg_metric = sum(r["steps_per_s"] for r in results) / max(1, len(results))
        metric_name = "avg_steps_per_s"
    else:
        avg_metric = sum(r["tps"] for r in results) / max(1, len(results))
        metric_name = "avg_tps"

    summary = {
        "model": base_model,
        "adapter": str(adapter_path) if adapter_path else None,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "reps": reps,
        "mode": mode,
        "steps": steps if mode == "trainer" else None,
        "results": results,
        metric_name: avg_metric,
        "accel": accel_backend.name,
    }
    out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    return out_path