xax-0.3.14-py3-none-any.whl → xax-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +9 -3
- xax/task/mixins/__init__.py +2 -1
- xax/task/mixins/supervised.py +368 -0
- xax/task/mixins/train.py +36 -345
- xax/task/task.py +26 -2
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/METADATA +1 -1
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/RECORD +11 -10
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/WHEEL +0 -0
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/entry_points.txt +0 -0
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {xax-0.3.14.dist-info → xax-0.4.0.dist-info}/top_level.txt +0 -0
xax/__init__.py
CHANGED
@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
 
-__version__ = "0.3.14"
+__version__ = "0.4.0"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -94,10 +94,13 @@ __all__ = [
     "DataloaderConfig",
     "GPUStatsOptions",
     "StepContext",
+    "InitParams",
     "ValidStepTimer",
     "Script",
     "ScriptConfig",
     "Config",
+    "SupervisedConfig",
+    "SupervisedTask",
     "Task",
     "collate",
     "collate_non_null",
@@ -291,10 +294,13 @@ NAME_MAP: dict[str, str] = {
     "DataloaderConfig": "task.mixins.data_loader",
     "GPUStatsOptions": "task.mixins.gpu_stats",
     "StepContext": "task.mixins.step_wrapper",
+    "InitParams": "task.mixins.train",
     "ValidStepTimer": "task.mixins.train",
     "Script": "task.script",
     "ScriptConfig": "task.script",
     "Config": "task.task",
+    "SupervisedConfig": "task.task",
+    "SupervisedTask": "task.task",
     "Task": "task.task",
     "collate": "utils.data.collate",
     "collate_non_null": "utils.data.collate",
@@ -488,9 +494,9 @@ if IMPORT_ALL or TYPE_CHECKING:
     from xax.task.mixins.data_loader import DataloaderConfig
     from xax.task.mixins.gpu_stats import GPUStatsOptions
     from xax.task.mixins.step_wrapper import StepContext
-    from xax.task.mixins.train import Batch, Output, ValidStepTimer
+    from xax.task.mixins.train import Batch, InitParams, Output, ValidStepTimer
     from xax.task.script import Script, ScriptConfig
-    from xax.task.task import Config, Task
+    from xax.task.task import Config, SupervisedConfig, SupervisedTask, Task
     from xax.utils.data.collate import CollateMode, collate, collate_non_null
     from xax.utils.debugging import (
         breakpoint_if_nonfinite,
xax/task/mixins/__init__.py
CHANGED
@@ -10,4 +10,5 @@ from xax.task.mixins.logger import LoggerConfig, LoggerMixin
 from xax.task.mixins.process import ProcessConfig, ProcessMixin
 from xax.task.mixins.runnable import RunnableConfig, RunnableMixin
 from xax.task.mixins.step_wrapper import StepContextConfig, StepContextMixin
-from xax.task.mixins.
+from xax.task.mixins.supervised import SupervisedConfig, SupervisedMixin
+from xax.task.mixins.train import InitParams, TrainConfig, TrainMixin
xax/task/mixins/supervised.py
ADDED
@@ -0,0 +1,368 @@
+"""Defines a mixin for running the training loop."""
+
+import bdb
+import contextlib
+import itertools
+import logging
+import signal
+import sys
+import textwrap
+import traceback
+from abc import ABC
+from dataclasses import dataclass
+from threading import Thread
+from typing import (
+    Generator,
+    Generic,
+    Iterator,
+    Sequence,
+    TypeVar,
+)
+
+import equinox as eqx
+import jax
+import jax.numpy as jnp
+import optax
+from jaxtyping import Array, PRNGKeyArray, PyTree
+
+from xax.core.conf import field
+from xax.core.state import State
+from xax.nn.parallel import is_master
+from xax.task.mixins.train import Batch, InitParams, Output, TrainConfig, TrainMixin
+from xax.utils.experiments import (
+    ContextTimer,
+    TrainingFinishedError,
+)
+from xax.utils.jax import jit as xax_jit, scan as xax_scan
+from xax.utils.logging import LOG_PING
+from xax.utils.pytree import get_pytree_param_count
+from xax.utils.text import highlight_exception_message, show_info
+from xax.utils.types.frozen_dict import FrozenDict
+
+logger = logging.getLogger(__name__)
+
+
+@jax.tree_util.register_dataclass
+@dataclass
+class SupervisedConfig(TrainConfig):
+    updates_per_step: int = field(1, help="Number of updates to perform per step")
+
+
+Config = TypeVar("Config", bound=SupervisedConfig)
+
+
+class SupervisedMixin(
+    TrainMixin[Config, InitParams],
+    Generic[Config],
+    ABC,
+):
+    def get_output(self, model: PyTree, batch: Batch, state: State) -> Output:
+        """Gets the output from the model.
+
+        By default, we assume the model is a function that takes the batch as
+        input and returns the loss. This function can be patched to do more
+        complex operations instead.
+
+        Args:
+            model: The current model.
+            batch: The current minibatch of samples.
+            state: The current training state.
+        """
+        raise NotImplementedError("`get_output` must be implemented by the subclass")
+
+    def compute_loss(self, model: PyTree, batch: Batch, output: Output, state: State) -> Array:
+        """Gets the loss for the current batch.
+
+        By default, we assume the model is a function that takes the batch as
+        input and returns the loss. This function can be patched to do more
+        complex operations instead.
+
+        Args:
+            model: The current model.
+            batch: The current minibatch of samples.
+            output: The output from the model.
+            state: The current training state.
+
+        Returns:
+            The computed loss, as a tensor.
+        """
+        if not isinstance(output, Array):
+            raise ValueError(f"When model output is not the loss, you must override `compute_loss`. Got {type(output)}")
+        return output
+
+    def compute_metrics(
+        self,
+        model: PyTree,
+        batch: Batch,
+        output: Output,
+        loss: Array,
+        state: State,
+    ) -> dict[str, Array]:
+        """Computes the metrics for the current batch.
+
+        Args:
+            model: The current model.
+            batch: The current minibatch of samples.
+            output: The output from the model.
+            loss: The loss for the current batch.
+            state: The current training state.
+
+        Returns:
+            A dictionary of metrics.
+        """
+        return {
+            "loss": loss,
+        }
+
+    @xax_jit(static_argnames=["self", "model_static"], jit_level=3)
+    def get_output_and_loss(
+        self,
+        model_arr: PyTree,
+        model_static: PyTree,
+        batch: Batch,
+        state: State,
+    ) -> tuple[Array, tuple[Output, dict[str, Array]]]:
+        model = eqx.combine(model_arr, model_static)
+        output = self.get_output(model, batch, state)
+        loss = self.compute_loss(model, batch, output, state)
+        metrics = self.compute_metrics(model, batch, output, loss, state)
+        return loss, (output, metrics)
+
+    @xax_jit(static_argnames=["self", "model_static", "optimizer"], jit_level=3)
+    def update(
+        self,
+        model_arr: PyTree,
+        model_static: PyTree,
+        optimizer: optax.GradientTransformation,
+        opt_state: optax.OptState,
+        batch: Batch,
+        state: State,
+    ) -> tuple[PyTree, optax.OptState, Output, dict[str, Array]]:
+        grad_fn = jax.grad(self.get_output_and_loss, argnums=0, has_aux=True)
+        grad_fn = xax_jit(static_argnums=[1], jit_level=3)(grad_fn)
+        grads, (output, metrics) = grad_fn(model_arr, model_static, batch, state)
+        updates, opt_state = optimizer.update(grads, opt_state, model_arr)
+        model_arr = eqx.apply_updates(model_arr, updates)
+        return model_arr, opt_state, output, metrics
+
+    @xax_jit(static_argnames=["self", "model_static", "optimizer"], jit_level=3)
+    def train_step(
+        self,
+        model_arr: PyTree,
+        model_static: PyTree,
+        optimizer: optax.GradientTransformation,
+        opt_state: optax.OptState,
+        batches: Batch,
+        state: State,
+    ) -> tuple[PyTree, optax.OptState, Output, FrozenDict[str, Array]]:
+        def update_fn(
+            carry: tuple[PyTree, optax.OptState],
+            batch: Batch,
+        ) -> tuple[tuple[PyTree, optax.OptState], tuple[Output, FrozenDict[str, Array]]]:
+            model_arr, opt_state = carry
+            model_arr, opt_state, output, metrics = self.update(
+                model_arr,
+                model_static,
+                optimizer,
+                opt_state,
+                batch,
+                state,
+            )
+            return (model_arr, opt_state), (output, FrozenDict(metrics))
+
+        (model_arr, opt_state), (output, metrics) = xax_scan(
+            update_fn,
+            (model_arr, opt_state),
+            batches,
+            jit_level=3,
+        )
+
+        # Only get the final output and metrics.
+        output = jax.tree.map(lambda x: x[-1], output)
+        metrics = jax.tree.map(lambda x: x[-1], metrics)
+
+        return model_arr, opt_state, output, metrics
+
+    @xax_jit(static_argnames=["self", "model_static"], jit_level=3)
+    def val_step(
+        self,
+        model_arr: PyTree,
+        model_static: PyTree,
+        batch: Batch,
+        state: State,
+    ) -> tuple[Output, FrozenDict[str, Array]]:
+        _, (output, metrics) = self.get_output_and_loss(model_arr, model_static, batch, state)
+        return output, FrozenDict(metrics)
+
+    def train_loop(
+        self,
+        models: Sequence[PyTree],
+        optimizers: Sequence[optax.GradientTransformation],
+        opt_states: Sequence[optax.OptState],
+        train_pf: Iterator[Batch],
+        valid_pf: Iterator[Batch],
+        state: State,
+    ) -> None:
+        if len(models) != 1 or len(optimizers) != 1 or len(opt_states) != 1:
+            raise ValueError(
+                "Vanilla training expects a single model, optimizer and optimizer state. "
+                f"Found {len(models)} models, {len(optimizers)} optimizers and {len(opt_states)} optimizer states."
+            )
+
+        model_arr, model_static = eqx.partition(models[0], self.model_partition_fn)
+        optimizer = optimizers[0]
+        opt_state = opt_states[0]
+
+        while not self.is_training_over(state):
+            valid_step = self.valid_step_timer(state)
+
+            if valid_step:
+                with ContextTimer() as timer:
+                    state = state.replace(phase="valid")
+                    valid_batch = next(valid_pf)
+                    output, metrics = self.val_step(model_arr, model_static, valid_batch, state)
+                    self.log_step(eqx.combine(model_arr, model_static), valid_batch, output, metrics, state)
+
+                state = state.replace(
+                    num_steps=state.num_steps + 1,
+                    num_samples=state.num_samples + (self.get_size_of_batch(valid_batch) or 0),
+                    elapsed_time_s=state.elapsed_time_s + timer.elapsed_time,
+                )
+
+            with ContextTimer() as timer:
+                state = self.on_step_start(state)
+                state = state.replace(phase="train")
+                train_batches = list(itertools.islice(train_pf, self.config.updates_per_step))
+                model_arr, opt_state, output, metrics = self.train_step(
+                    model_arr=model_arr,
+                    model_static=model_static,
+                    optimizer=optimizer,
+                    opt_state=opt_state,
+                    batches=jax.tree.map(lambda *xs: jnp.stack(xs, axis=0), *train_batches),
+                    state=state,
+                )
+                self.log_step(eqx.combine(model_arr, model_static), train_batches[-1], output, metrics, state)
+                state = self.on_step_end(state)
+
+            state = state.replace(
+                num_steps=state.num_steps + 1,
+                num_samples=state.num_samples + (self.get_size_of_batch(train_batches[-1]) or 0),
+                elapsed_time_s=state.elapsed_time_s + timer.elapsed_time,
+            )
+
+            if state.num_steps <= 3:
+                logger.log(LOG_PING, "Step %d took %.2f second", state.num_steps, timer.elapsed_time)
+
+            if self.should_checkpoint(state):
+                model = eqx.combine(model_arr, model_static)
+                self.save_checkpoint(models=[model], optimizers=[optimizer], opt_states=[opt_state], state=state)
+
+        # After finishing training, save the final checkpoint.
+        model = eqx.combine(model_arr, model_static)
+        self.save_checkpoint(models=[model], optimizers=[optimizer], opt_states=[opt_state], state=state)
+
+    @contextlib.contextmanager
+    def get_train_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
+        try:
+            train_iterator: Iterator[Batch] = self.get_data_iterator("train", key=key)
+            yield train_iterator
+            return
+        except NotImplementedError:
+            pass
+
+        train_ds = self.get_dataset("train")
+        train_dl = self.get_dataloader(train_ds, "train", prefetch_factor=self.config.updates_per_step + 1)
+        train_pf = self.get_prefetcher(train_dl)
+
+        try:
+            with train_pf as train_pf_ctx:
+                yield train_pf_ctx
+        finally:
+            logger.info("Closing train prefetcher")
+
+    @contextlib.contextmanager
+    def get_valid_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
+        try:
+            valid_iterator: Iterator[Batch] = self.get_data_iterator("valid", key=key)
+            yield valid_iterator
+            return
+        except NotImplementedError:
+            pass
+
+        valid_ds = self.get_dataset("valid")
+        valid_dl = self.get_dataloader(valid_ds, "valid")
+        valid_pf = self.get_prefetcher(valid_dl)
+
+        try:
+            with valid_pf as valid_pf_ctx:
+                yield valid_pf_ctx
+        finally:
+            logger.info("Closing valid prefetcher")
+
+    def run(self) -> None:
+        self.run_training()
+
+    def run_training(self) -> None:
+        """Runs the training loop.
+
+        Args:
+            model: The current model
+            task: The current task
+            optimizer: The current optimizer
+            lr_scheduler: The current learning rate scheduler
+
+        Raises:
+            ValueError: If the task is not a supervised learning task
+        """
+        with self:
+            key = self.prng_key()
+
+            self.set_loggers()
+
+            if is_master():
+                Thread(target=self.log_state, daemon=True).start()
+
+            key, model_key = jax.random.split(key)
+            init_params = InitParams(key=model_key)
+            models, optimizers, opt_states, state = self.load_initial_state(init_params, load_optimizer=True)
+            logger.info("Model size: %s", f"{get_pytree_param_count(models):,}")
+            logger.info("Optimizer size: %s", f"{get_pytree_param_count(opt_states):,}")
+
+            state = self.on_training_start(state)
+
+            def on_exit() -> None:
+                self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
+
+            # Handle user-defined interrupts during the training loop.
+            self.add_signal_handler(on_exit, signal.SIGUSR1, signal.SIGTERM)
+
+            key, tkey, vkey = jax.random.split(key, 3)
+            with self.get_train_iterator(tkey) as train_pf, self.get_valid_iterator(vkey) as valid_pf:
+                try:
+                    self.train_loop(
+                        models=models,
+                        optimizers=optimizers,
+                        opt_states=opt_states,
+                        train_pf=train_pf,
+                        valid_pf=valid_pf,
+                        state=state,
+                    )
+
+                except TrainingFinishedError:
+                    if is_master():
+                        num_steps, num_samples = int(state.num_steps), int(state.num_samples)
+                        show_info(f"Finished training after {num_steps} steps, {num_samples} samples", important=True)
+                    self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
+
+                except (KeyboardInterrupt, bdb.BdbQuit):
+                    if is_master():
+                        show_info("Interrupted training", important=True)
+
+                except BaseException:
+                    exception_tb = textwrap.indent(highlight_exception_message(traceback.format_exc()), " ")
+                    sys.stdout.write(f"Caught exception during training loop:\n\n{exception_tb}\n")
+                    sys.stdout.flush()
+                    self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
+
+                finally:
+                    state = self.on_training_end(state)
xax/task/mixins/train.py
CHANGED
@@ -1,24 +1,15 @@
 """Defines a mixin for running the training loop."""
 
-import bdb
-import contextlib
 import functools
 import itertools
 import logging
-import signal
-import sys
-import textwrap
 import time
-import traceback
 from abc import ABC, abstractmethod
 from dataclasses import asdict, dataclass, is_dataclass
 from pathlib import Path
-from threading import Thread
 from typing import (
     Any,
-    Generator,
     Generic,
-    Iterator,
     Literal,
     Mapping,
     Sequence,
@@ -30,7 +21,6 @@ from typing import (
 
 import equinox as eqx
 import jax
-import jax.numpy as jnp
 import numpy as np
 import optax
 from jaxtyping import Array, PRNGKeyArray, PyTree
@@ -38,7 +28,6 @@ from jaxtyping import Array, PRNGKeyArray, PyTree
 from xax.core.conf import field
 from xax.core.state import Phase, State
 from xax.nn.functions import set_random_seed
-from xax.nn.parallel import is_master
 from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
 from xax.task.mixins.checkpointing import (
     CheckpointingConfig,
@@ -51,19 +40,14 @@ from xax.task.mixins.logger import LoggerConfig, LoggerMixin
 from xax.task.mixins.runnable import RunnableConfig, RunnableMixin
 from xax.task.mixins.step_wrapper import StepContextConfig, StepContextMixin
 from xax.utils.experiments import (
-    ContextTimer,
     StateTimer,
-    TrainingFinishedError,
     diff_configs,
     get_diff_string,
     get_info_json,
     get_state_file_string,
     get_training_code,
 )
-from xax.utils.jax import jit as xax_jit, scan as xax_scan
 from xax.utils.logging import LOG_PING, LOG_STATUS
-from xax.utils.pytree import get_pytree_param_count
-from xax.utils.text import highlight_exception_message, show_info
 from xax.utils.types.frozen_dict import FrozenDict
 
 logger = logging.getLogger(__name__)
@@ -159,6 +143,16 @@ class ValidStepTimer:
         return False
 
 
+@jax.tree_util.register_dataclass
+@dataclass(frozen=True)
+class InitParams:
+    key: PRNGKeyArray
+
+
+# Subclasses should be able to override the init params.
+InitParamsT = TypeVar("InitParamsT", bound=InitParams)
+
+
 @jax.tree_util.register_dataclass
 @dataclass
 class TrainConfig(
@@ -175,7 +169,6 @@ class TrainConfig(
     valid_first_n_seconds: float | None = field(60.0, help="Run first validation after N seconds")
     max_steps: int | None = field(None, help="Maximum number of steps to run")
    step_kind: str = field("step", help=f"How to measure a step; one of [{', '.join(get_args(StepKind))}]")
-    updates_per_step: int = field(1, help="Number of updates to perform per step")
     random_seed: int = field(1337, help="Random seed for the task")
 
 
@@ -189,7 +182,7 @@ class TrainMixin(
     StepContextMixin[Config],
     ArtifactsMixin[Config],
     RunnableMixin[Config],
-    Generic[Config],
+    Generic[Config, InitParamsT],
     ABC,
 ):
     valid_step_timer: ValidStepTimer
@@ -309,15 +302,18 @@ class TrainMixin(
         self.write_logs(state)
 
     @abstractmethod
-    def get_model(self,
+    def get_model(self, params: InitParamsT) -> PyTree | Sequence[PyTree]:
         """Returns the Equinox model to train.
 
+        Args:
+            params: The parameters for initializing the model.
+
         Returns:
             The model to train.
         """
 
-    def _get_models(self,
-        models = self.get_model(
+    def _get_models(self, params: InitParamsT) -> list[PyTree]:
+        models = self.get_model(params)
         if isinstance(models, Sequence):
             models = list(models)
         elif isinstance(models, eqx.Module):
@@ -353,20 +349,20 @@ class TrainMixin(
     @overload
     def load_initial_state(
         self,
-
+        params: InitParamsT,
         load_optimizer: Literal[False] = False,
     ) -> tuple[PyTree, State]: ...
 
     @overload
     def load_initial_state(
         self,
-
+        params: InitParamsT,
         load_optimizer: Literal[True],
     ) -> tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State]: ...
 
     def load_initial_state(
         self,
-
+        params: InitParamsT,
         load_optimizer: bool = False,
     ) -> (
         tuple[list[PyTree], State]
@@ -376,7 +372,7 @@ class TrainMixin(
 
         if init_ckpt_path is not None:
             logger.info("Loading checkpoint from %s", init_ckpt_path)
-            model, state, config = self.load_ckpt(init_ckpt_path, part="model_state_config")
+            model, state, config = self.load_ckpt(init_ckpt_path, params, part="model_state_config")
             config_diff = get_diff_string(diff_configs(asdict(config), asdict(self.config)))
             if config_diff:
                 logger.warning("Loaded config differs from current config:\n%s", config_diff)
@@ -384,12 +380,12 @@ class TrainMixin(
             if not load_optimizer:
                 return model, state
 
-            optimizer = self.load_ckpt(init_ckpt_path, part="opt")
-            opt_state = self.load_ckpt(init_ckpt_path, part="opt_state", model=model, optimizer=optimizer)
+            optimizer = self.load_ckpt(init_ckpt_path, params, part="opt")
+            opt_state = self.load_ckpt(init_ckpt_path, params, part="opt_state", model=model, optimizer=optimizer)
             return model, optimizer, opt_state, state
 
         logger.info("Starting a new training run")
-        models = self._get_models(
+        models = self._get_models(params)
         state = State.init_state()
 
         if not load_optimizer:
@@ -405,6 +401,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["all"],
     ) -> tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State, Config]: ...
@@ -413,6 +410,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["model_state_config"],
     ) -> tuple[list[PyTree], State, Config]: ...
@@ -421,6 +419,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["model"],
     ) -> list[PyTree]: ...
@@ -429,6 +428,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["opt"],
     ) -> list[optax.GradientTransformation]: ...
@@ -437,6 +437,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["opt_state"],
         model: PyTree | None = None,
@@ -447,6 +448,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["state"],
     ) -> list[State]: ...
@@ -455,6 +457,7 @@
     def load_ckpt(
         self,
         path: Path,
+        init_params: InitParamsT,
         *,
         part: Literal["config"],
     ) -> list[Config]: ...
@@ -462,6 +465,7 @@
     def load_ckpt(
         self,
         path: str | Path,
+        init_params: InitParamsT,
         *,
         part: CheckpointPart = "all",
         model: PyTree | None = None,
@@ -477,18 +481,15 @@
     ):
         path = Path(path)
 
-        # This key isn't used for anything, it's just a required argument.
-        key = jax.random.PRNGKey(0)
-
         match part:
             case "model_state_config":
-                model_specs = eqx.filter_eval_shape(self._get_models,
+                model_specs = eqx.filter_eval_shape(self._get_models, init_params)
                 model, state, config = load_ckpt(path, part="model_state_config", model_templates=model_specs)
                 config = self.get_config(config, use_cli=False)
                 return model, state, config
 
             case "model":
-                model_specs = eqx.filter_eval_shape(self._get_models,
+                model_specs = eqx.filter_eval_shape(self._get_models, init_params)
                 return load_ckpt(path, part="model", model_templates=model_specs)
 
             case "opt":
@@ -497,7 +498,7 @@
 
             case "opt_state":
                 if model is None:
-                    model_specs = eqx.filter_eval_shape(self._get_models,
+                    model_specs = eqx.filter_eval_shape(self._get_models, init_params)
                     model = load_ckpt(path, part="model", model_templates=model_specs)
                 if optimizer is None:
                     optimizer_specs = eqx.filter_eval_shape(self._get_optimizers)
@@ -512,7 +513,7 @@
                 return self.get_config(load_ckpt(path, part="config"), use_cli=False)
 
             case "all":
-                model_specs = eqx.filter_eval_shape(self._get_models,
+                model_specs = eqx.filter_eval_shape(self._get_models, init_params)
                 model = load_ckpt(path, part="model", model_templates=model_specs)
                 optimizer_specs = eqx.filter_eval_shape(self._get_optimizers)
                 optimizer = load_ckpt(path, part="opt", optimizer_templates=optimizer_specs)
@@ -525,95 +526,6 @@
             case _:
                 raise ValueError(f"Unknown checkpoint part: {part}")
 
    [... 89 removed lines collapsed: the get_output, compute_loss, compute_metrics, get_output_and_loss, and update methods, identical to the definitions added in xax/task/mixins/supervised.py above ...]
     def get_size_of_batch(self, batch: Batch) -> int | None:
         """Gets the batch size for the current batch.
 
@@ -687,224 +599,3 @@
 
     def model_partition_fn(self, item: Any) -> bool:  # noqa: ANN401
         return eqx.is_inexact_array(item)
    [... 221 removed lines collapsed: the train_step, val_step, train_loop, get_train_iterator, get_valid_iterator, run, and run_training methods, moved to xax/task/mixins/supervised.py above; the only change in the move is that run_training now wraps the model key as InitParams(key=model_key) before calling load_initial_state ...]
xax/task/task.py
CHANGED
@@ -19,6 +19,7 @@ from xax.task.mixins import (
     DataloadersMixin,
     GPUStatsConfig,
     GPUStatsMixin,
+    InitParams,
     LoggerConfig,
     LoggerMixin,
     ProcessConfig,
@@ -27,6 +28,8 @@ from xax.task.mixins import (
     RunnableMixin,
     StepContextConfig,
     StepContextMixin,
+    SupervisedConfig as BaseSupervisedConfig,
+    SupervisedMixin as BaseSupervisedMixin,
     TrainConfig,
     TrainMixin,
 )
@@ -52,10 +55,11 @@ class Config(
 
 
 ConfigT = TypeVar("ConfigT", bound=Config)
+InitParamsT = TypeVar("InitParamsT", bound=InitParams)
 
 
 class Task(
-    TrainMixin[ConfigT],
+    TrainMixin[ConfigT, InitParamsT],
     CheckpointingMixin[ConfigT],
     CompileMixin[ConfigT],
     DataloadersMixin[ConfigT],
@@ -67,6 +71,26 @@ class Task(
     ArtifactsMixin[ConfigT],
     RunnableMixin[ConfigT],
     BaseTask[ConfigT],
-    Generic[ConfigT],
+    Generic[ConfigT, InitParamsT],
+):
+    pass
+
+
+@jax.tree_util.register_dataclass
+@dataclass
+class SupervisedConfig(
+    BaseSupervisedConfig,
+    Config,
+):
+    pass
+
+
+SupervisedConfigT = TypeVar("SupervisedConfigT", bound=SupervisedConfig)
+
+
+class SupervisedTask(
+    BaseSupervisedMixin[SupervisedConfigT],
+    Task[SupervisedConfigT, InitParams],
+    Generic[SupervisedConfigT],
 ):
     pass
{xax-0.3.14.dist-info → xax-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-xax/__init__.py,sha256=
+xax/__init__.py,sha256=50NFQGS6aOMcJQAJ4U1mLpvMRtWc8Kbgtv4zIMWodfc,17164
 xax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/requirements-dev.txt,sha256=qkscNkFzWd1S5fump-AKH53rR65v2x5FmboFdy_kKvs,128
 xax/requirements.txt,sha256=6qY-84e-sTmlfJNrSjwONQKqzAn5h8G_oGIhnhmfSr4,302
@@ -21,7 +21,7 @@ xax/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/task/base.py,sha256=i6FRJ75aqlekWkzJNRWDUEX7P514pUjLVuxjhX1GBgw,8198
 xax/task/logger.py,sha256=Bmhl4mv08Aq49ZyX6BdjPIsPJK28e8s3mVFatM4IY2Q,41060
 xax/task/script.py,sha256=bMMIJoUtpSBvPp6-7bejTrajTXvSg0794sYLKdPIToE,972
-xax/task/task.py,sha256=
+xax/task/task.py,sha256=Iy02wRUti5lDX1rfDHIgX87dGYeayJxJ9nzJzp_lMq0,1960
 xax/task/launchers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/task/launchers/base.py,sha256=8LB_r6YISKu1vq1zk3aVYmiedRr9MxE5IMRocs6unFI,731
 xax/task/launchers/cli.py,sha256=cK7Nm-3fO-W2gTxpn3FEThsT2NvneS2w0UjA1Nt-84A,1402
@@ -32,7 +32,7 @@ xax/task/loggers/json.py,sha256=6A5wL7kspsXnpPhI_vu0scgd2Z2-WLhw4gbBFm7eZMM,4377
 xax/task/loggers/state.py,sha256=0Jy0NYnY4c0qt0LvNlaTaCKOSqk5SCKln5VdyuQGnIc,1407
 xax/task/loggers/stdout.py,sha256=giKSW2R83YkgRefm3BLkE7t8Pbj5Dux4AgsdJxYIbGo,6619
 xax/task/loggers/tensorboard.py,sha256=sRyBbeBeVXDTYhPZIKIapW0JEfL9hqqzhNTeIcSd374,8883
-xax/task/mixins/__init__.py,sha256=
+xax/task/mixins/__init__.py,sha256=wYc4zfutdMyEmzCVV421gSf25ZXW9htNTSY_TW6vL_8,894
 xax/task/mixins/artifacts.py,sha256=UN26TW22ARduO6Bjs0yRu4-V6-Md9MPbXLKDnS28m44,3861
 xax/task/mixins/checkpointing.py,sha256=v50IZ7j58DWmEu-_6Zh_02R5KUVGhrMkg5n-MYM_J4c,11484
 xax/task/mixins/compile.py,sha256=PG5aF3W9v_xGiImHgUJ7gmwuQQoSQWufdpl2N_mlLX0,3922
@@ -43,7 +43,8 @@ xax/task/mixins/logger.py,sha256=6oXsJJyNUx6YT3q58FVXMZBUpMgjVkGre6BXFN20cVI,280
 xax/task/mixins/process.py,sha256=hqDEsMp_SL6ee97iq26-G0g49OcWZZaX82JD4F22eJU,1781
 xax/task/mixins/runnable.py,sha256=pcLrYc_TycZUY9zZim05Skc2FWk3IZKFnu6p3UDMonM,1966
 xax/task/mixins/step_wrapper.py,sha256=-Yu5Nft2CRw1JvZt6J_94SM1vqX8fk08IDK95Pmd2ew,1648
-xax/task/mixins/
+xax/task/mixins/supervised.py,sha256=IxAh-ywvjDNoqXtzHwv2WpVsXFOX45SZjyF3qpbN-2k,13757
+xax/task/mixins/train.py,sha256=0loO44W6vVjP5usWvN0D1TgYTJ7N3PDevR7brmw3ymQ,20493
 xax/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/debugging.py,sha256=85JYIdnzLnvXsuli-4YHei_3tE3DnX3rmDSARKW2u1M,2192
 xax/utils/experiments.py,sha256=5k5hPYSaVjzoR_nm2Q3DAHMMYi3Bcp3N3PAQbwZq7Gg,29830
@@ -60,9 +61,9 @@ xax/utils/data/collate.py,sha256=Rd9vMomr_S_zCa_Hi4dO-8ntzAfVwndIUtuXFA3iNcc,706
 xax/utils/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/types/frozen_dict.py,sha256=ebtHENhyUzSjyJTlbMaLtcckQIJ7EtgJiok_40TJZpo,4689
 xax/utils/types/hashable_array.py,sha256=l5iIcFmkYzfGeaZmcSoeFkthFASqM8xJYK3AXhZQYwc,992
-xax-0.
-xax-0.
-xax-0.
-xax-0.
-xax-0.
-xax-0.
+xax-0.4.0.dist-info/licenses/LICENSE,sha256=HCN2bImAzUOXldAZZI7JZ9PYq6OwMlDAP_PpX1HnuN0,1071
+xax-0.4.0.dist-info/METADATA,sha256=oaK0oAc0WM428EAjuwTvFFvaa0JibJl-CpPOBUBVmUY,1246
+xax-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xax-0.4.0.dist-info/entry_points.txt,sha256=uRC6rx5ce0bf-FblJaZSBMxxKFfMyoWTf8OWbBmLSe8,61
+xax-0.4.0.dist-info/top_level.txt,sha256=g4Au_r2XhvZ-lTybviH-Fh9g0zF4DAYHYxPue1-xbs8,4
+xax-0.4.0.dist-info/RECORD,,
{xax-0.3.14.dist-info → xax-0.4.0.dist-info}/WHEEL
File without changes
{xax-0.3.14.dist-info → xax-0.4.0.dist-info}/entry_points.txt
File without changes
{xax-0.3.14.dist-info → xax-0.4.0.dist-info}/licenses/LICENSE
File without changes
{xax-0.3.14.dist-info → xax-0.4.0.dist-info}/top_level.txt
File without changes