xax 0.2.13__tar.gz → 0.2.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xax-0.2.13/xax.egg-info → xax-0.2.15}/PKG-INFO +1 -1
- {xax-0.2.13 → xax-0.2.15}/xax/__init__.py +15 -5
- xax-0.2.15/xax/nn/metrics.py +92 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/checkpointing.py +81 -54
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/train.py +91 -56
- {xax-0.2.13 → xax-0.2.15}/xax/utils/pytree.py +10 -0
- {xax-0.2.13 → xax-0.2.15/xax.egg-info}/PKG-INFO +1 -1
- {xax-0.2.13 → xax-0.2.15}/xax.egg-info/SOURCES.txt +1 -1
- xax-0.2.13/xax/nn/norm.py +0 -24
- {xax-0.2.13 → xax-0.2.15}/LICENSE +0 -0
- {xax-0.2.13 → xax-0.2.15}/MANIFEST.in +0 -0
- {xax-0.2.13 → xax-0.2.15}/README.md +0 -0
- {xax-0.2.13 → xax-0.2.15}/pyproject.toml +0 -0
- {xax-0.2.13 → xax-0.2.15}/setup.cfg +0 -0
- {xax-0.2.13 → xax-0.2.15}/setup.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/core/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/core/conf.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/core/state.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/embeddings.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/equinox.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/export.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/functions.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/geom.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/losses.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/parallel.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/nn/ssm.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/py.typed +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/requirements-dev.txt +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/requirements.txt +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/base.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/launchers/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/launchers/base.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/launchers/cli.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/launchers/single_process.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/logger.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/callback.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/json.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/state.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/stdout.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/loggers/tensorboard.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/artifacts.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/compile.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/cpu_stats.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/data_loader.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/gpu_stats.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/logger.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/process.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/runnable.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/mixins/step_wrapper.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/script.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/task/task.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/data/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/data/collate.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/debugging.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/experiments.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/jax.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/jaxpr.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/logging.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/numpy.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/profile.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/tensorboard.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/text.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/types/__init__.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/types/frozen_dict.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax/utils/types/hashable_array.py +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax.egg-info/dependency_links.txt +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax.egg-info/requires.txt +0 -0
- {xax-0.2.13 → xax-0.2.15}/xax.egg-info/top_level.txt +0 -0
{xax-0.2.13 → xax-0.2.15}/xax/__init__.py

@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
 
-__version__ = "0.2.13"
+__version__ = "0.2.15"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [

@@ -51,6 +51,7 @@ __all__ = [
     "rotation_matrix_to_rotation6d",
     "cross_entropy",
     "cast_norm_type",
+    "dynamic_time_warping",
     "get_norm",
     "is_master",
     "BaseSSMBlock",

@@ -136,6 +137,7 @@ __all__ = [
     "reshuffle_pytree_independently",
     "slice_array",
     "slice_pytree",
+    "tuple_insert",
     "update_pytree",
     "TextBlock",
     "camelcase_to_snakecase",

@@ -229,8 +231,9 @@ NAME_MAP: dict[str, str] = {
     "rotation6d_to_rotation_matrix": "nn.geom",
     "rotation_matrix_to_rotation6d": "nn.geom",
     "cross_entropy": "nn.losses",
-    "cast_norm_type": "nn.norm",
-    "get_norm": "nn.norm",
+    "cast_norm_type": "nn.metrics",
+    "dynamic_time_warping": "nn.metrics",
+    "get_norm": "nn.metrics",
     "is_master": "nn.parallel",
     "BaseSSMBlock": "nn.ssm",
     "DiagSSMBlock": "nn.ssm",

@@ -315,6 +318,7 @@ NAME_MAP: dict[str, str] = {
     "reshuffle_pytree_independently": "utils.pytree",
     "slice_array": "utils.pytree",
     "slice_pytree": "utils.pytree",
+    "tuple_insert": "utils.pytree",
     "update_pytree": "utils.pytree",
     "TextBlock": "utils.text",
     "camelcase_to_snakecase": "utils.text",

@@ -345,7 +349,7 @@ NAME_MAP.update(
         "LOG_ERROR_SUMMARY": "utils.logging",
         "LOG_PING": "utils.logging",
         "LOG_STATUS": "utils.logging",
-        "NormType": "nn.norm",
+        "NormType": "nn.metrics",
         "Output": "task.mixins.output",
         "Phase": "core.state",
         "RawConfigType": "task.base",

@@ -410,7 +414,12 @@ if IMPORT_ALL or TYPE_CHECKING:
         rotation_matrix_to_rotation6d,
     )
     from xax.nn.losses import cross_entropy
-    from xax.nn.norm import NormType, cast_norm_type, get_norm
+    from xax.nn.metrics import (
+        NormType,
+        cast_norm_type,
+        dynamic_time_warping,
+        get_norm,
+    )
     from xax.nn.parallel import is_master
     from xax.nn.ssm import SSM, BaseSSMBlock, DiagSSMBlock, SSMBlock
     from xax.task.base import RawConfigType

@@ -495,6 +504,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         reshuffle_pytree_independently,
         slice_array,
         slice_pytree,
+        tuple_insert,
         update_pytree,
     )
     from xax.utils.text import (
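Both new names are registered in __all__ and NAME_MAP above, so, assuming the package's lazy-import hook keeps resolving NAME_MAP entries as in previous releases, they are reachable from the top-level namespace. A minimal sketch, not taken from the package's documentation:

import jax.numpy as jnp
import xax

# "dynamic_time_warping" resolves to xax.nn.metrics, "tuple_insert" to xax.utils.pytree.
distances = jnp.abs(jnp.arange(3.0)[:, None] - jnp.arange(4.0)[None, :])  # (N, M) with N <= M
cost, path = xax.dynamic_time_warping(distances, return_path=True)
print(xax.tuple_insert((0, 1, 2), 1, 9))  # (0, 9, 2)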
xax-0.2.15/xax/nn/metrics.py

@@ -0,0 +1,92 @@
+"""Norm and metric utilities."""
+
+from typing import Literal, cast, get_args, overload
+
+import chex
+import jax
+import jax.numpy as jnp
+from jaxtyping import Array
+
+from xax.utils.jax import jit as xax_jit
+
+NormType = Literal["l1", "l2"]
+
+
+def cast_norm_type(norm: str) -> NormType:
+    if norm not in get_args(NormType):
+        raise ValueError(f"Invalid norm: {norm}")
+    return cast(NormType, norm)
+
+
+def get_norm(x: Array, norm: NormType) -> Array:
+    match norm:
+        case "l1":
+            return jnp.abs(x)
+        case "l2":
+            return jnp.square(x)
+        case _:
+            raise ValueError(f"Invalid norm: {norm}")
+
+
+@overload
+def dynamic_time_warping(distance_matrix_nm: Array) -> Array: ...
+
+
+@overload
+def dynamic_time_warping(distance_matrix_nm: Array, return_path: Literal[True]) -> tuple[Array, Array]: ...
+
+
+@xax_jit(static_argnames=["return_path"])
+def dynamic_time_warping(distance_matrix_nm: Array, return_path: bool = False) -> Array | tuple[Array, Array]:
+    """Dynamic Time Warping.
+
+    Args:
+        distance_matrix_nm: A matrix of pairwise distances between two
+            sequences, with shape (N, M), with the condition that N <= M.
+        return_path: If set, return the minimum path, otherwise just return
+            the cost. The latter is preferred if using this function as a
+            distance metric since it avoids the backwards scan on backpointers.
+
+    Returns:
+        The cost of the minimum path from the top-left corner of the distance
+        matrix to the bottom-right corner, along with the indices of that
+        minimum path.
+    """
+    chex.assert_shape(distance_matrix_nm, (None, None))
+    n, m = distance_matrix_nm.shape
+
+    assert n <= m, f"Invalid dynamic time warping distance matrix shape: ({n}, {m})"
+
+    # Masks values which cannot be reached.
+    row_idx = jnp.arange(n)[:, None]
+    col_idx = jnp.arange(m)[None, :]
+    mask = row_idx > col_idx
+    distance_matrix_nm = jnp.where(mask, jnp.inf, distance_matrix_nm)
+
+    # Pre-pads with inf
+    distance_matrix_nm = jnp.pad(distance_matrix_nm, ((1, 0), (0, 0)), mode="constant", constant_values=jnp.inf)
+    indices = jnp.arange(n)
+
+    # Scan over remaining rows to fill cost matrix
+    def scan_fn(prev_cost: Array, cur_distances: Array) -> tuple[Array, Array]:
+        same_trans = prev_cost
+        prev_trans = jnp.pad(prev_cost[:-1], ((1, 0),), mode="constant", constant_values=jnp.inf)
+        nc = jnp.minimum(prev_trans, same_trans) + cur_distances[1:]
+        return nc, jnp.where(prev_trans < same_trans, indices - 1, indices) if return_path else nc
+
+    init_cost = distance_matrix_nm[1:, 0]
+    final_cost, back_pointers = jax.lax.scan(scan_fn, init_cost, distance_matrix_nm[:, 1:].T)
+
+    if not return_path:
+        return final_cost
+
+    # Scan the back pointers backwards to get the minimum path.
+    def scan_back_fn(carry: Array, back_pointer: Array) -> tuple[Array, Array]:
+        prev_idx = back_pointer[carry]
+        return prev_idx, carry
+
+    final_index = jnp.array(n - 1)
+    _, min_path = jax.lax.scan(scan_back_fn, final_index, back_pointers, reverse=True)
+    min_path = jnp.pad(min_path, ((1, 0)), mode="constant", constant_values=0)
+
+    return final_cost[-1], min_path
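A minimal usage sketch for the new module (not from the package's documentation; the sequences and the choice of L1 distance are illustrative). The caller precomputes the pairwise distance matrix; with return_path=True the function returns the bottom-right alignment cost plus one row index per column, while without it the cumulative costs of the final column are returned and the last entry is the full alignment cost:

import jax.numpy as jnp

from xax.nn.metrics import dynamic_time_warping, get_norm

query = jnp.array([0.0, 1.0, 2.0])                 # length N = 3
reference = jnp.array([0.0, 0.5, 1.0, 1.5, 2.0])   # length M = 5, N <= M

# Pairwise L1 distances, shape (N, M).
distance_matrix = get_norm(query[:, None] - reference[None, :], "l1")

costs = dynamic_time_warping(distance_matrix)                          # cost vector for the last column
cost, path = dynamic_time_warping(distance_matrix, return_path=True)   # scalar cost and warping path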
{xax-0.2.13 → xax-0.2.15}/xax/task/mixins/checkpointing.py

@@ -6,7 +6,7 @@ import logging
 import tarfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Generic, Literal, TypeVar, cast, overload
+from typing import Generic, Literal, Sequence, TypeVar, cast, overload
 
 import equinox as eqx
 import jax

@@ -57,10 +57,10 @@ def load_ckpt(
     path: Path,
     *,
     part: Literal["all"],
-
-
-
-) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]: ...
+    model_templates: Sequence[PyTree],
+    optimizer_templates: Sequence[optax.GradientTransformation],
+    opt_state_templates: Sequence[optax.OptState],
+) -> tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State, DictConfig]: ...
 
 
 @overload

@@ -68,20 +68,35 @@ def load_ckpt(
     path: Path,
     *,
     part: Literal["model_state_config"],
-
-) -> tuple[PyTree, State, DictConfig]: ...
+    model_templates: Sequence[PyTree],
+) -> tuple[list[PyTree], State, DictConfig]: ...
 
 
 @overload
-def load_ckpt(
+def load_ckpt(
+    path: Path,
+    *,
+    part: Literal["model"],
+    model_templates: Sequence[PyTree],
+) -> list[PyTree]: ...
 
 
 @overload
-def load_ckpt(
+def load_ckpt(
+    path: Path,
+    *,
+    part: Literal["opt"],
+    optimizer_templates: Sequence[optax.GradientTransformation],
+) -> list[optax.GradientTransformation]: ...
 
 
 @overload
-def load_ckpt(
+def load_ckpt(
+    path: Path,
+    *,
+    part: Literal["opt_state"],
+    opt_state_templates: Sequence[optax.OptState],
+) -> list[optax.OptState]: ...
 
 
 @overload

@@ -96,40 +111,49 @@ def load_ckpt(
     path: str | Path,
     *,
     part: CheckpointPart = "model",
-
-
-
+    model_templates: Sequence[PyTree] | None = None,
+    optimizer_templates: Sequence[optax.GradientTransformation] | None = None,
+    opt_state_templates: Sequence[optax.OptState] | None = None,
 ) -> (
-    tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]
-    | tuple[PyTree, State, DictConfig]
-    | PyTree
-    | optax.GradientTransformation
-    | optax.OptState
+    tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State, DictConfig]
+    | tuple[list[PyTree], State, DictConfig]
+    | list[PyTree]
+    | list[optax.GradientTransformation]
+    | list[optax.OptState]
     | State
     | DictConfig
 ):
     with tarfile.open(path, "r:gz") as tar:
 
-        def get_model() -> PyTree:
-            if
+        def get_model() -> list[PyTree]:
+            if model_templates is None:
                 raise ValueError("model_template must be provided to load model weights")
-
-
-
-
-
-
+            models: list[PyTree] = []
+            for i, model_template in enumerate(model_templates):
+                if (model := tar.extractfile(f"model_{i}")) is None:
+                    raise ValueError(f"Checkpoint does not contain a model file: {path}")
+                models.append(eqx.tree_deserialise_leaves(io.BytesIO(model.read()), model_template))
+            return models
+
+        def get_opt() -> list[optax.GradientTransformation]:
+            if optimizer_templates is None:
                 raise ValueError("optimizer_template must be provided to load optimizer")
-
-
-
-
-
-
+            opts: list[optax.GradientTransformation] = []
+            for i, optimizer_template in enumerate(optimizer_templates):
+                if (opt := tar.extractfile(f"optimizer_{i}")) is None:
+                    raise ValueError(f"Checkpoint does not contain an optimizer file: {path}")
+                opts.append(eqx.tree_deserialise_leaves(io.BytesIO(opt.read()), optimizer_template))
+            return opts
+
+        def get_opt_state() -> list[optax.OptState]:
+            if opt_state_templates is None:
                 raise ValueError("opt_state_template must be provided to load optimizer state")
-
-
-
+            opt_states: list[optax.OptState] = []
+            for i, opt_state_template in enumerate(opt_state_templates):
+                if (opt_state := tar.extractfile(f"opt_state_{i}")) is None:
+                    raise ValueError(f"Checkpoint does not contain an optimizer state file: {path}")
+                opt_states.append(eqx.tree_deserialise_leaves(io.BytesIO(opt_state.read()), opt_state_template))
+            return opt_states
 
         def get_state() -> State:
             if (state := tar.extractfile("state")) is None:

@@ -192,20 +216,20 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
 
     def save_checkpoint(
         self,
-
-
-
+        models: Sequence[PyTree] | None = None,
+        optimizers: Sequence[optax.GradientTransformation] | None = None,
+        opt_states: Sequence[optax.OptState] | None = None,
         aux_data: PyTree | None = None,
         state: State | None = None,
     ) -> Path:
         """Save a checkpoint.
 
         Args:
-
-
-
+            models: The models to save
+            optimizers: The optimizers to save
+            opt_states: The optimizer states to save
             aux_data: Additional data to save
-
+            state: The current training state
 
         Returns:
             Path to the saved checkpoint

@@ -235,22 +259,25 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
                 tar.addfile(tarinfo, buf)
 
             # Save model using Equinox
-            if
-
-
-
+            if models is not None:
+                for i, model in enumerate(models):
+                    with io.BytesIO() as buf:
+                        eqx.tree_serialise_leaves(buf, model)
+                        add_file(f"model_{i}", buf)
 
             # Save optimizer using Equinox
-            if
-
-
-
+            if optimizers is not None:
+                for i, optimizer in enumerate(optimizers):
+                    with io.BytesIO() as buf:
+                        eqx.tree_serialise_leaves(buf, optimizer)
+                        add_file(f"optimizer_{i}", buf)
 
             # Save optimizer state using Equinox
-            if
-
-
-
+            if opt_states is not None:
+                for i, opt_state in enumerate(opt_states):
+                    with io.BytesIO() as buf:
+                        eqx.tree_serialise_leaves(buf, opt_state)
+                        add_file(f"opt_state_{i}", buf)
 
             # Save aux data using Equinox.
             if aux_data is not None:
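Since each part is now written and read per index (model_0, model_1, ..., optimizer_0, ...), callers pass one template per entry and get a list back. A minimal loading sketch, not from the package's docs: the eqx.nn.Linear modules and the checkpoint path are placeholders, and the templates mirror how the train mixin builds them with eqx.filter_eval_shape:

from pathlib import Path

import equinox as eqx
import jax

from xax.task.mixins.checkpointing import load_ckpt

key_a, key_b = jax.random.split(jax.random.PRNGKey(0))

# Shape/dtype skeletons with the same structure as the checkpointed models.
model_templates = [
    eqx.filter_eval_shape(eqx.nn.Linear, 4, 32, key=key_a),
    eqx.filter_eval_shape(eqx.nn.Linear, 32, 2, key=key_b),
]

ckpt_path = Path("run_dir/checkpoints/ckpt.bin")  # placeholder path
first_model, second_model = load_ckpt(ckpt_path, part="model", model_templates=model_templates)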
{xax-0.2.13 → xax-0.2.15}/xax/task/mixins/train.py

@@ -310,23 +310,46 @@ class TrainMixin(
         self.write_logs(state)
 
     @abstractmethod
-    def get_model(self, key: PRNGKeyArray) -> PyTree:
+    def get_model(self, key: PRNGKeyArray) -> PyTree | Sequence[PyTree]:
         """Returns the Equinox model to train.
 
         Returns:
             The model to train.
         """
 
+    def _get_models(self, key: PRNGKeyArray) -> list[PyTree]:
+        models = self.get_model(key)
+        if isinstance(models, Sequence):
+            models = list(models)
+        elif isinstance(models, eqx.Module):
+            models = [models]
+        else:
+            logger.warning("Model is not a sequence or an eqx.Module, wrapping it in a list anyway")
+            models = [models]
+        return models
+
     @abstractmethod
-    def get_optimizer(self) -> optax.GradientTransformation:
+    def get_optimizer(self) -> optax.GradientTransformation | Sequence[optax.GradientTransformation]:
         """Gets the optimizer for the model.
 
         Returns:
             The optimizer to use to train the model.
         """
 
-    def
-
+    def _get_optimizers(self) -> list[optax.GradientTransformation]:
+        optimizers = self.get_optimizer()
+        if isinstance(optimizers, optax.GradientTransformation):
+            optimizers = [optimizers]
+        elif isinstance(optimizers, Sequence):
+            optimizers = list(optimizers)
+        return optimizers
+
+    def get_initial_opt_state(
+        self,
+        models: list[PyTree],
+        optimizers: list[optax.GradientTransformation],
+    ) -> list[optax.OptState]:
+        return [opt.init(eqx.filter(model, eqx.is_array)) for model, opt in zip(models, optimizers, strict=True)]
 
     @overload
     def load_initial_state(

@@ -340,13 +363,16 @@ class TrainMixin(
         self,
         key: PRNGKeyArray,
         load_optimizer: Literal[True],
-    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State]: ...
+    ) -> tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State]: ...
 
     def load_initial_state(
         self,
         key: PRNGKeyArray,
         load_optimizer: bool = False,
-    ) ->
+    ) -> (
+        tuple[list[PyTree], State]
+        | tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State]
+    ):
         init_ckpt_path = self.get_init_ckpt_path()
 
         if init_ckpt_path is not None:

@@ -364,16 +390,17 @@ class TrainMixin(
                 return model, optimizer, opt_state, state
 
         logger.info("Starting a new training run")
-
+        models = self._get_models(key)
         state = State.init_state()
 
        if not load_optimizer:
-            return
+            return models, state
 
-        optimizer
-
+        # Gets the optimizer(s) for the model.
+        optimizers = self._get_optimizers()
+        opt_states = self.get_initial_opt_state(models, optimizers)
 
-        return
+        return models, optimizers, opt_states, state
 
     @overload
     def load_ckpt(

@@ -381,7 +408,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["all"],
-    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]: ...
+    ) -> tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State, Config]: ...
 
     @overload
     def load_ckpt(

@@ -389,7 +416,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["model_state_config"],
-    ) -> tuple[PyTree, State, Config]: ...
+    ) -> tuple[list[PyTree], State, Config]: ...
 
     @overload
     def load_ckpt(

@@ -397,7 +424,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["model"],
-    ) -> PyTree: ...
+    ) -> list[PyTree]: ...
 
     @overload
     def load_ckpt(

@@ -405,7 +432,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["opt"],
-    ) -> optax.GradientTransformation: ...
+    ) -> list[optax.GradientTransformation]: ...
 
     @overload
     def load_ckpt(

@@ -415,7 +442,7 @@ class TrainMixin(
         part: Literal["opt_state"],
         model: PyTree | None = None,
         optimizer: optax.GradientTransformation | None = None,
-    ) -> optax.OptState: ...
+    ) -> list[optax.OptState]: ...
 
     @overload
     def load_ckpt(

@@ -423,7 +450,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["state"],
-    ) -> State: ...
+    ) -> list[State]: ...
 
     @overload
     def load_ckpt(

@@ -431,7 +458,7 @@ class TrainMixin(
         path: Path,
         *,
         part: Literal["config"],
-    ) -> Config: ...
+    ) -> list[Config]: ...
 
     def load_ckpt(
         self,

@@ -441,11 +468,11 @@ class TrainMixin(
         model: PyTree | None = None,
         optimizer: optax.GradientTransformation | None = None,
     ) -> (
-        tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]
-        | tuple[PyTree, State, Config]
-        | PyTree
-        | optax.GradientTransformation
-        | optax.OptState
+        tuple[list[PyTree], list[optax.GradientTransformation], list[optax.OptState], State, Config]
+        | tuple[list[PyTree], State, Config]
+        | list[PyTree]
+        | list[optax.GradientTransformation]
+        | list[optax.OptState]
         | State
         | Config
     ):

@@ -456,28 +483,28 @@ class TrainMixin(
 
         match part:
             case "model_state_config":
-
-                model, state, config = load_ckpt(path, part="model_state_config",
+                model_specs = eqx.filter_eval_shape(self._get_models, key)
+                model, state, config = load_ckpt(path, part="model_state_config", model_templates=model_specs)
                 config = self.get_config(config, use_cli=False)
                 return model, state, config
 
             case "model":
-
-                return load_ckpt(path, part="model",
+                model_specs = eqx.filter_eval_shape(self._get_models, key)
+                return load_ckpt(path, part="model", model_templates=model_specs)
 
             case "opt":
-
-                return load_ckpt(path, part="opt",
+                optimizer_specs = eqx.filter_eval_shape(self._get_optimizers)
+                return load_ckpt(path, part="opt", optimizer_templates=optimizer_specs)
 
             case "opt_state":
                 if model is None:
-
-                    model = load_ckpt(path, part="model",
+                    model_specs = eqx.filter_eval_shape(self._get_models, key)
+                    model = load_ckpt(path, part="model", model_templates=model_specs)
                 if optimizer is None:
-
-                    optimizer = load_ckpt(path, part="opt",
-
-                return load_ckpt(path, part="opt_state",
+                    optimizer_specs = eqx.filter_eval_shape(self._get_optimizers)
+                    optimizer = load_ckpt(path, part="opt", optimizer_templates=optimizer_specs)
+                opt_state_specs = eqx.filter_eval_shape(self.get_initial_opt_state, model, optimizer)
+                return load_ckpt(path, part="opt_state", opt_state_templates=opt_state_specs)
 
             case "state":
                 return load_ckpt(path, part="state")

@@ -486,12 +513,12 @@ class TrainMixin(
                 return self.get_config(load_ckpt(path, part="config"), use_cli=False)
 
             case "all":
-
-                model = load_ckpt(path, part="model",
-
-                optimizer = load_ckpt(path, part="opt",
-
-                opt_state = load_ckpt(path, part="opt_state",
+                model_specs = eqx.filter_eval_shape(self._get_models, key)
+                model = load_ckpt(path, part="model", model_templates=model_specs)
+                optimizer_specs = eqx.filter_eval_shape(self._get_optimizers)
+                optimizer = load_ckpt(path, part="opt", optimizer_templates=optimizer_specs)
+                opt_state_specs = eqx.filter_eval_shape(self.get_initial_opt_state, model, optimizer)
+                opt_state = load_ckpt(path, part="opt_state", opt_state_templates=opt_state_specs)
                 state = load_ckpt(path, part="state")
                 config = self.get_config(load_ckpt(path, part="config"), use_cli=False)
                 return model, optimizer, opt_state, state, config

@@ -718,14 +745,22 @@ class TrainMixin(
 
     def train_loop(
         self,
-
-
-
+        models: Sequence[PyTree],
+        optimizers: Sequence[optax.GradientTransformation],
+        opt_states: Sequence[optax.OptState],
         train_pf: Iterator[Batch],
         valid_pf: Iterator[Batch],
         state: State,
     ) -> None:
-
+        if len(models) != 1 or len(optimizers) != 1 or len(opt_states) != 1:
+            raise ValueError(
+                "Vanilla training expects a single model, optimizer and optimizer state. "
+                f"Found {len(models)} models, {len(optimizers)} optimizers and {len(opt_states)} optimizer states."
+            )
+
+        model_arr, model_static = eqx.partition(models[0], self.model_partition_fn)
+        optimizer = optimizers[0]
+        opt_state = opt_states[0]
 
         while not self.is_training_over(state):
             valid_step = self.valid_step_timer(state)

@@ -773,11 +808,11 @@ class TrainMixin(
 
             if self.should_checkpoint(state):
                 model = eqx.combine(model_arr, model_static)
-                self.save_checkpoint(
+                self.save_checkpoint(models=[model], optimizers=[optimizer], opt_states=[opt_state], state=state)
 
         # After finishing training, save the final checkpoint.
         model = eqx.combine(model_arr, model_static)
-        self.save_checkpoint(
+        self.save_checkpoint(models=[model], optimizers=[optimizer], opt_states=[opt_state], state=state)
 
     @contextlib.contextmanager
     def get_train_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:

@@ -841,14 +876,14 @@ class TrainMixin(
             Thread(target=self.log_state, daemon=True).start()
 
             key, model_key = jax.random.split(key)
-
-            logger.info("Model size: %s", f"{get_pytree_param_count(
-            logger.info("Optimizer size: %s", f"{get_pytree_param_count(
+            models, optimizers, opt_states, state = self.load_initial_state(model_key, load_optimizer=True)
+            logger.info("Model size: %s", f"{get_pytree_param_count(models):,}")
+            logger.info("Optimizer size: %s", f"{get_pytree_param_count(optimizers):,}")
 
             state = self.on_training_start(state)
 
             def on_exit() -> None:
-                self.save_checkpoint(
+                self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
 
             # Handle user-defined interrupts during the training loop.
             self.add_signal_handler(on_exit, signal.SIGUSR1, signal.SIGTERM)

@@ -857,9 +892,9 @@ class TrainMixin(
             with self.get_train_iterator(tkey) as train_pf, self.get_valid_iterator(vkey) as valid_pf:
                 try:
                     self.train_loop(
-
-
-
+                        models=models,
+                        optimizers=optimizers,
+                        opt_states=opt_states,
                        train_pf=train_pf,
                        valid_pf=valid_pf,
                        state=state,

@@ -869,7 +904,7 @@ class TrainMixin(
                     if is_master():
                         num_steps, num_samples = int(state.num_steps), int(state.num_samples)
                         show_info(f"Finished training after {num_steps} steps, {num_samples} samples", important=True)
-                        self.save_checkpoint(
+                        self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
 
                 except (KeyboardInterrupt, bdb.BdbQuit):
                     if is_master():

@@ -879,7 +914,7 @@ class TrainMixin(
                     exception_tb = textwrap.indent(highlight_exception_message(traceback.format_exc()), " ")
                     sys.stdout.write(f"Caught exception during training loop:\n\n{exception_tb}\n")
                     sys.stdout.flush()
-                    self.save_checkpoint(
+                    self.save_checkpoint(models=models, optimizers=optimizers, opt_states=opt_states, state=state)
 
                 finally:
                     state = self.on_training_end(state)
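The new get_initial_opt_state pairs each model with its optimizer and builds one optimizer state per pair. It reduces to the following standalone sketch (the toy eqx.nn.Linear modules and learning rates are illustrative only); note that the default train_loop above still requires exactly one entry in each list:

import equinox as eqx
import jax
import optax

key_a, key_b = jax.random.split(jax.random.PRNGKey(0))

# Two models with matching optimizers, in the same order.
models = [eqx.nn.Linear(4, 8, key=key_a), eqx.nn.Linear(8, 2, key=key_b)]
optimizers = [optax.adam(1e-3), optax.sgd(1e-2)]

# One optimizer state per (model, optimizer) pair, initialized on the array leaves only.
opt_states = [opt.init(eqx.filter(m, eqx.is_array)) for m, opt in zip(models, optimizers, strict=True)]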
{xax-0.2.13 → xax-0.2.15}/xax/utils/pytree.py

@@ -1,5 +1,7 @@
 """Utils for accessing, modifying, and otherwise manipulating pytrees."""
 
+from typing import TypeVar
+
 import chex
 import equinox as eqx
 import jax

@@ -7,6 +9,8 @@ import jax.numpy as jnp
 from jax import Array
 from jaxtyping import PRNGKeyArray, PyTree
 
+T = TypeVar("T")
+
 
 def slice_array(x: Array, start: Array, slice_length: int) -> Array:
     """Get a slice of an array along the first dimension.

@@ -243,3 +247,9 @@ def get_pytree_param_count(pytree: PyTree) -> int:
     """Calculates the total number of parameters in a PyTree."""
     leaves, _ = jax.tree.flatten(pytree)
     return sum(x.size for x in leaves if isinstance(x, jnp.ndarray) and eqx.is_inexact_array(x))
+
+
+def tuple_insert(t: tuple[T, ...], index: int, value: T) -> tuple[T, ...]:
+    mut = list(t)
+    mut[index] = value
+    return tuple(mut)
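Note that, as written, tuple_insert overwrites the element at index rather than lengthening the tuple (it assigns into the list copy instead of calling list.insert). A quick sketch of that behavior:

from xax.utils.pytree import tuple_insert

fields = ("a", "b", "c")
assert tuple_insert(fields, 1, "z") == ("a", "z", "c")  # index 1 replaced, length unchanged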
xax-0.2.13/xax/nn/norm.py
DELETED

@@ -1,24 +0,0 @@
-"""Normalization utilities."""
-
-from typing import Literal, cast, get_args
-
-import jax.numpy as jnp
-from jaxtyping import Array
-
-NormType = Literal["l1", "l2"]
-
-
-def cast_norm_type(norm: str) -> NormType:
-    if norm not in get_args(NormType):
-        raise ValueError(f"Invalid norm: {norm}")
-    return cast(NormType, norm)
-
-
-def get_norm(x: Array, norm: NormType) -> Array:
-    match norm:
-        case "l1":
-            return jnp.abs(x)
-        case "l2":
-            return jnp.square(x)
-        case _:
-            raise ValueError(f"Invalid norm: {norm}")
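For downstream code that imported from the removed module, the change is a path rename; the functions themselves are unchanged, and the top-level names registered in __all__ and NAME_MAP are unaffected:

# Before (xax 0.2.13)
from xax.nn.norm import NormType, cast_norm_type, get_norm

# After (xax 0.2.15)
from xax.nn.metrics import NormType, cast_norm_type, get_norm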