PyPI - xax - Versions diffs - 0.1.16__tar.gz → 0.2.0__tar.gz - Mend

xax 0.1.16.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)

{xax-0.1.16/xax.egg-info → xax-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xax
-Version: 0.1.16
+Version: 0.2.0
 Summary: A library for fast Jax experimentation
 Home-page: https://github.com/kscalelabs/xax
 Author: Benjamin Bolte
@@ -8,14 +8,14 @@ Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: attrs
+Requires-Dist: chex
+Requires-Dist: dpshdl
+Requires-Dist: equinox
+Requires-Dist: importlib-resources
 Requires-Dist: jax
 Requires-Dist: jaxtyping
-Requires-Dist: equinox
 Requires-Dist: optax
-Requires-Dist: dpshdl
-Requires-Dist: chex
-Requires-Dist: importlib-resources
-Requires-Dist: cloudpickle
+Requires-Dist: orbax-checkpoint
 Requires-Dist: pillow
 Requires-Dist: omegaconf
 Requires-Dist: gitpython

{xax-0.1.16 → xax-0.2.0}/pyproject.toml RENAMED Viewed

@@ -35,7 +35,6 @@ explicit_package_bases = true
 [[tool.mypy.overrides]]
 module = [
-    "cloudpickle.*",
     "optax.*",
     "setuptools.*",
     "tensorboard.*",

{xax-0.1.16 → xax-0.2.0}/xax/__init__.py RENAMED Viewed

@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
-__version__ = "0.1.16"
+__version__ = "0.2.0"
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [

{xax-0.1.16 → xax-0.2.0}/xax/core/state.py RENAMED Viewed

@@ -12,6 +12,14 @@ from xax.core.conf import field
 Phase = Literal["train", "valid"]
+def _phase_to_int(phase: Phase) -> int:
+    return {"train": 0, "valid": 1}[phase]
+def _int_to_phase(i: int) -> Phase:
+    return cast(Phase, ["train", "valid"][i])
 class StateDict(TypedDict, total=False):
     num_steps: NotRequired[int]
     num_samples: NotRequired[int]
@@ -35,7 +43,7 @@ class State:
     @property
     def phase(self) -> Phase:
-        return cast(Phase, ["train", "valid"][self._phase])
+        return _int_to_phase(self._phase)
     @classmethod
     def init_state(cls) -> "State":
@@ -74,3 +82,20 @@ class State:
                 case _:
                     raise ValueError(f"Invalid phase: {phase}")
         return State(**{**asdict(self), **kwargs, **extra_kwargs})
+    def to_dict(self) -> dict[str, int | float | str]:
+        return {
+            "num_steps": int(self.num_steps),
+            "num_samples": int(self.num_samples),
+            "num_valid_steps": int(self.num_valid_steps),
+            "num_valid_samples": int(self.num_valid_samples),
+            "start_time_s": float(self.start_time_s),
+            "elapsed_time_s": float(self.elapsed_time_s),
+            "phase": str(self.phase),
+        }
+    @classmethod
+    def from_dict(cls, d: dict[str, int | float | str]) -> "State":
+        if "phase" in d:
+            d["_phase"] = _phase_to_int(cast(Phase, d.pop("phase")))
+        return cls(**d)  # type: ignore[arg-type]

{xax-0.1.16 → xax-0.2.0}/xax/requirements.txt RENAMED Viewed

@@ -2,16 +2,16 @@
 # Core ML/JAX dependencies
 attrs
+chex
+dpshdl
+equinox
+importlib-resources
 jax
 jaxtyping
-equinox
 optax
-dpshdl
-chex
-importlib-resources
+orbax-checkpoint
 # Data processing and serialization
-cloudpickle
 pillow
 # Configuration and project management

{xax-0.1.16 → xax-0.2.0}/xax/task/base.py RENAMED Viewed

@@ -79,7 +79,7 @@ class BaseTask(Generic[Config]):
     def on_training_end(self, state: State) -> State:
         return state
-    def on_after_checkpoint_save(self, ckpt_path: Path, state: State) -> State:
+    def on_after_checkpoint_save(self, ckpt_path: Path, state: State | None) -> State | None:
         return state
     @functools.cached_property

{xax-0.1.16 → xax-0.2.0}/xax/task/logger.py RENAMED Viewed

@@ -18,11 +18,22 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Callable, Iterator, Literal, Self, Sequence, TypeVar, cast, get_args
+from typing import (
+    Any,
+    Callable,
+    Iterator,
+    Literal,
+    Self,
+    Sequence,
+    TypeVar,
+    cast,
+    get_args,
+)
 import jax
 import jax.numpy as jnp
 import numpy as np
+from jax._src.core import ClosedJaxpr
 from jaxtyping import Array
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as PILImage
@@ -194,7 +205,10 @@ def tile_images(images: list[PILImage], sep: int = 0) -> PILImage:
     return tiled
-def as_numpy(array: Array) -> np.ndarray:
+def as_numpy(array: Array | np.ndarray) -> np.ndarray:
+    """Convert a JAX array or numpy array to numpy array."""
+    if isinstance(array, np.ndarray):
+        return array
     array = jax.device_get(array)
     if jax.dtypes.issubdtype(array.dtype, jnp.floating):
         array = array.astype(jnp.float32)
@@ -205,6 +219,13 @@ def as_numpy(array: Array) -> np.ndarray:
     return np.array(array)
+def as_numpy_opt(array: Array | np.ndarray | None) -> np.ndarray | None:
+    """Convert an optional JAX array or numpy array to numpy array."""
+    if array is None:
+        return None
+    return as_numpy(array)
 @dataclass(kw_only=True)
 class LogString:
     value: str
@@ -252,6 +273,19 @@ class LogHistogram:
     bucket_counts: list[int]
+@dataclass(kw_only=True)
+class LogMesh:
+    vertices: np.ndarray
+    colors: np.ndarray | None
+    faces: np.ndarray | None
+    config_dict: dict[str, Any] | None  # noqa: ANN401
+@dataclass(kw_only=True)
+class LogGraph:
+    computation: ClosedJaxpr
 @dataclass(kw_only=True)
 class LogLine:
     state: State
@@ -261,6 +295,7 @@ class LogLine:
     strings: dict[str, dict[str, LogString]]
     images: dict[str, dict[str, LogImage]]
     videos: dict[str, dict[str, LogVideo]]
+    meshes: dict[str, dict[str, LogMesh]]
 @dataclass(kw_only=True)
@@ -533,6 +568,7 @@ class Logger:
         self.strings: dict[str, dict[str, Callable[[], LogString]]] = defaultdict(dict)
         self.images: dict[str, dict[str, Callable[[], LogImage]]] = defaultdict(dict)
         self.videos: dict[str, dict[str, Callable[[], LogVideo]]] = defaultdict(dict)
+        self.meshes: dict[str, dict[str, Callable[[], LogMesh]]] = defaultdict(dict)
         self.default_namespace = default_namespace
         self.loggers: list[LoggerImpl] = []
@@ -560,6 +596,7 @@ class Logger:
             strings={k: {kk: v() for kk, v in v.items()} for k, v in self.strings.items()},
             images={k: {kk: v() for kk, v in v.items()} for k, v in self.images.items()},
             videos={k: {kk: v() for kk, v in v.items()} for k, v in self.videos.items()},
+            meshes={k: {kk: v() for kk, v in v.items()} for k, v in self.meshes.items()},
         )
     def clear(self) -> None:
@@ -569,6 +606,7 @@ class Logger:
         self.strings.clear()
         self.images.clear()
         self.videos.clear()
+        self.meshes.clear()
     def write(self, state: State) -> None:
         """Writes the current step's logging information.
@@ -1051,6 +1089,73 @@ class Logger:
         self.videos[namespace][key] = video_future
+    def log_mesh(
+        self,
+        key: str,
+        vertices: np.ndarray | Array | Callable[[], np.ndarray | Array],
+        colors: np.ndarray | Array | None | Callable[[], np.ndarray | Array | None] = None,
+        faces: np.ndarray | Array | None | Callable[[], np.ndarray | Array | None] = None,
+        config_dict: dict[str, Any] | None = None,
+        *,
+        namespace: str | None = None,
+    ) -> None:
+        if not self.active:
+            raise RuntimeError("The logger is not active")
+        namespace = self.resolve_namespace(namespace)
+        @functools.lru_cache(maxsize=None)
+        def mesh_future() -> LogMesh:
+            with ContextTimer() as timer:
+                # Get the raw values
+                vertices_val = vertices() if callable(vertices) else vertices
+                colors_val = colors() if callable(colors) else colors
+                faces_val = faces() if callable(faces) else faces
+                # Convert to numpy arrays with proper type handling
+                vertices_np = as_numpy(vertices_val)
+                colors_np = as_numpy_opt(colors_val)
+                faces_np = as_numpy_opt(faces_val)
+                # Checks vertices shape.
+                if vertices_np.ndim == 2:
+                    vertices_np = vertices_np[None]
+                if vertices_np.shape[-1] != 3 or vertices_np.ndim != 3:
+                    raise ValueError("Vertices must have shape (N, 3) or (B, N, 3)")
+                # Checks colors shape.
+                if colors_np is not None:
+                    if colors_np.ndim == 2:
+                        colors_np = colors_np[None]
+                    if colors_np.shape[-1] != 3 or colors_np.ndim != 3:
+                        raise ValueError("Colors must have shape (N, 3) or (B, N, 3)")
+                # Checks faces shape.
+                if faces_np is not None:
+                    if faces_np.ndim == 2:
+                        faces_np = faces_np[None]
+                    if faces_np.shape[-1] != 3 or faces_np.ndim != 3:
+                        raise ValueError("Faces must have shape (N, 3) or (B, N, 3)")
+                # Ensures colors dtype is uint8.
+                if colors_np is not None:
+                    if colors_np.dtype != np.uint8:
+                        colors_np = (colors_np * 255).astype(np.uint8)
+                # Ensures faces dtype is int32.
+                if faces_np is not None:
+                    if faces_np.dtype != np.int32:
+                        faces_np = faces_np.astype(np.int32)
+            logger.debug("Mesh Key: %s, Time: %s", key, timer.elapsed_time)
+            return LogMesh(
+                vertices=vertices_np,
+                colors=colors_np,
+                faces=faces_np,
+                config_dict=config_dict,
+            )
+        self.meshes[namespace][key] = mesh_future
     def __enter__(self) -> Self:
         self.active = True
         for logger in self.loggers:

{xax-0.1.16 → xax-0.2.0}/xax/task/loggers/tensorboard.py RENAMED Viewed

@@ -70,6 +70,9 @@ class TensorboardLogger(LoggerImpl):
         self._started = True
     def worker_thread(self) -> None:
+        if os.environ.get("DISABLE_TENSORBOARD", "0") == "1":
+            return
         time.sleep(self.wait_seconds)
         port = int(os.environ.get("TENSORBOARD_PORT", DEFAULT_TENSORBOARD_PORT))
@@ -213,6 +216,19 @@ class TensorboardLogger(LoggerImpl):
                     video_value.frames,
                     fps=video_value.fps,
                     global_step=line.state.num_steps,
+                    walltime=walltime,
+                )
+        for namespace, meshes in line.meshes.items():
+            for mesh_key, mesh_value in meshes.items():
+                writer.add_mesh(
+                    f"{namespace}/{mesh_key}",
+                    vertices=mesh_value.vertices,
+                    faces=mesh_value.faces,
+                    colors=mesh_value.colors,
+                    config_dict=mesh_value.config_dict,
+                    global_step=line.state.num_steps,
+                    walltime=walltime,
                 )
         for name, contents in self.files.items():

{xax-0.1.16 → xax-0.2.0}/xax/task/mixins/checkpointing.py RENAMED Viewed

@@ -6,9 +6,9 @@ import logging
 import tarfile
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any, Callable, Generic, Literal, TypeVar, cast, overload
+from typing import Generic, Literal, TypeVar, cast, overload
-import cloudpickle
+import equinox as eqx
 import jax
 import optax
 from jaxtyping import PyTree
@@ -64,7 +64,9 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def get_init_ckpt_path(self) -> Path | None:
         if self._exp_dir is not None:
             ckpt_path = self.get_ckpt_path()
-            if ckpt_path.exists():
+            if not ckpt_path.exists():
+                logger.warning("No checkpoint found in experiment directory: %s", ckpt_path)
+            else:
                 return ckpt_path
         if self.config.load_from_ckpt_path is not None:
             ckpt_path = Path(self.config.load_from_ckpt_path)
@@ -87,41 +89,54 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
-        part: Literal["all"] = "all",
-    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]: ...
+        *,
+        part: Literal["all"],
+        model_template: PyTree,
+        optimizer_template: PyTree,
+        opt_state_template: PyTree,
+    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]: ...
     @overload
     def load_checkpoint(
         self,
         path: Path,
-        part: Literal["model_state_config"] = "model_state_config",
-    ) -> tuple[PyTree, State, DictConfig]: ...
+        *,
+        part: Literal["model_state_config"],
+        model_template: PyTree,
+    ) -> tuple[PyTree, State, Config]: ...
     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["model"],
+        model_template: PyTree,
     ) -> PyTree: ...
     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["opt"],
+        optimizer_template: PyTree,
     ) -> optax.GradientTransformation: ...
     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["opt_state"],
+        opt_state_template: PyTree,
     ) -> optax.OptState: ...
     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["state"],
     ) -> State: ...
@@ -129,48 +144,71 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["config"],
-    ) -> DictConfig: ...
+    ) -> Config: ...
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: CheckpointPart = "all",
+        model_template: PyTree | None = None,
+        optimizer_template: PyTree | None = None,
+        opt_state_template: PyTree | None = None,
     ) -> (
-        tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]
-        | tuple[PyTree, State, DictConfig]
+        tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]
+        | tuple[PyTree, State, Config]
         | PyTree
         | optax.GradientTransformation
         | optax.OptState
         | State
-        | DictConfig
+        | Config
     ):
+        """Load a checkpoint.
+        Args:
+            path: Path to the checkpoint directory
+            part: Which part of the checkpoint to load
+            model_template: Template model with correct structure but uninitialized weights
+            optimizer_template: Template optimizer with correct structure but uninitialized weights
+            opt_state_template: Template optimizer state with correct structure but uninitialized weights
+        Returns:
+            The requested checkpoint components
+        """
         with tarfile.open(path, "r:gz") as tar:
             def get_model() -> PyTree:
+                if model_template is None:
+                    raise ValueError("model_template must be provided to load model weights")
                 if (model := tar.extractfile("model")) is None:
                     raise ValueError(f"Checkpoint does not contain a model file: {path}")
-                return cloudpickle.load(model)
+                return eqx.tree_deserialise_leaves(io.BytesIO(model.read()), model_template)
             def get_opt() -> optax.GradientTransformation:
-                if (opt := tar.extractfile("opt")) is None:
-                    raise ValueError(f"Checkpoint does not contain an opt file: {path}")
-                return cloudpickle.load(opt)
+                if optimizer_template is None:
+                    raise ValueError("optimizer_template must be provided to load optimizer")
+                if (opt := tar.extractfile("optimizer")) is None:
+                    raise ValueError(f"Checkpoint does not contain an optimizer file: {path}")
+                return eqx.tree_deserialise_leaves(io.BytesIO(opt.read()), optimizer_template)
             def get_opt_state() -> optax.OptState:
+                if opt_state_template is None:
+                    raise ValueError("opt_state_template must be provided to load optimizer state")
                 if (opt_state := tar.extractfile("opt_state")) is None:
-                    raise ValueError(f"Checkpoint does not contain an opt_state file: {path}")
-                return cloudpickle.load(opt_state)
+                    raise ValueError(f"Checkpoint does not contain an optimizer state file: {path}")
+                return eqx.tree_deserialise_leaves(io.BytesIO(opt_state.read()), opt_state_template)
             def get_state() -> State:
                 if (state := tar.extractfile("state")) is None:
                     raise ValueError(f"Checkpoint does not contain a state file: {path}")
                 return State(**json.loads(state.read().decode()))
-            def get_config() -> DictConfig:
+            def get_config() -> Config:
                 if (config := tar.extractfile("config")) is None:
                     raise ValueError(f"Checkpoint does not contain a config file: {path}")
-                return cast(DictConfig, OmegaConf.load(config))
+                return self.get_config(cast(DictConfig, OmegaConf.load(config)), use_cli=False)
             match part:
                 case "model":
@@ -192,51 +230,90 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def save_checkpoint(
         self,
-        model: PyTree,
-        optimizer: optax.GradientTransformation,
-        opt_state: optax.OptState,
-        state: State,
+        model: PyTree | None = None,
+        optimizer: optax.GradientTransformation | None = None,
+        opt_state: optax.OptState | None = None,
+        aux_data: PyTree | None = None,
+        state: State | None = None,
     ) -> Path:
+        """Save a checkpoint.
+        Args:
+            model: The model to save
+            state: The current training state
+            optimizer: The optimizer to save
+            aux_data: Additional data to save
+            opt_state: The optimizer state to save
+        Returns:
+            Path to the saved checkpoint
+        """
         ckpt_path = self.get_ckpt_path(state)
         if not is_master():
             return ckpt_path
-        # Gets the path to the last checkpoint.
+        # Gets the path to the last checkpoint
         logger.info("Saving checkpoint to %s", ckpt_path)
         last_ckpt_path = self.get_ckpt_path()
         ckpt_path.parent.mkdir(exist_ok=True, parents=True)
-        # Potentially removes the last checkpoint.
+        # Potentially removes the last checkpoint
         if last_ckpt_path.exists() and self.config.only_save_most_recent:
             if (base_ckpt := last_ckpt_path.resolve()).is_file():
                 base_ckpt.unlink()
-        # Combines all temporary files into a single checkpoint TAR file.
+        # Save the checkpoint components
         with tarfile.open(ckpt_path, "w:gz") as tar:
-            def add_file(name: str, write_fn: Callable[[io.BytesIO], Any]) -> None:
+            def add_file(name: str, buf: io.BytesIO) -> None:
+                tarinfo = tarfile.TarInfo(name)
+                tarinfo.size = buf.tell()
+                buf.seek(0)
+                tar.addfile(tarinfo, buf)
+            # Save model using Equinox
+            if model is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, model)
+                    add_file("model", buf)
+            # Save optimizer using Equinox
+            if optimizer is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, optimizer)
+                    add_file("optimizer", buf)
+            # Save optimizer state using Equinox
+            if opt_state is not None:
                 with io.BytesIO() as buf:
-                    write_fn(buf)
-                    tarinfo = tarfile.TarInfo(name)
-                    tarinfo.size = buf.tell()
-                    buf.seek(0)
-                    tar.addfile(tarinfo, buf)
-            add_file("model", lambda buf: cloudpickle.dump(model, buf))
-            add_file("opt", lambda buf: cloudpickle.dump(optimizer, buf))
-            add_file("opt_state", lambda buf: cloudpickle.dump(opt_state, buf))
-            add_file("state", lambda buf: buf.write(json.dumps(asdict(state), indent=2).encode()))
-            add_file("config", lambda buf: buf.write(OmegaConf.to_yaml(self.config).encode()))
-        # Updates the symlink to the new checkpoint.
+                    eqx.tree_serialise_leaves(buf, opt_state)
+                    add_file("opt_state", buf)
+            # Save aux data using Equinox.
+            if aux_data is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, aux_data)
+                    add_file("aux_data", buf)
+            # Save state and config as JSON
+            def add_file_bytes(name: str, data: bytes) -> None:  # noqa: ANN401
+                info = tarfile.TarInfo(name=name)
+                info.size = len(data)
+                tar.addfile(info, io.BytesIO(data))
+            if state is not None:
+                add_file_bytes("state", json.dumps(asdict(state), indent=2).encode())
+            add_file_bytes("config", OmegaConf.to_yaml(self.config).encode())
+        # Updates the symlink to the new checkpoint
         last_ckpt_path.unlink(missing_ok=True)
         try:
             last_ckpt_path.symlink_to(ckpt_path.relative_to(last_ckpt_path.parent))
         except FileExistsError:
             logger.exception("Exception while trying to update %s", ckpt_path)
-        # Calls the base callback.
+        # Calls the base callback
         self.on_after_checkpoint_save(ckpt_path, state)
         return ckpt_path

{xax-0.1.16 → xax-0.2.0}/xax/task/mixins/data_loader.py RENAMED Viewed

@@ -9,6 +9,7 @@ import jax
 from dpshdl.dataloader import CollatedDataloaderItem, Dataloader
 from dpshdl.dataset import Dataset, ErrorHandlingDataset
 from dpshdl.prefetcher import Prefetcher
+from jaxtyping import PRNGKeyArray
 from omegaconf import II, MISSING
 from xax.core.conf import field, is_missing
@@ -103,7 +104,7 @@ class DataloadersMixin(ProcessMixin[Config], BaseTask[Config], Generic[Config],
             "or `get_data_iterator` to return an iterator for the given dataset."
         )
-    def get_data_iterator(self, phase: Phase) -> Iterator:
+    def get_data_iterator(self, phase: Phase, key: PRNGKeyArray) -> Iterator:
         raise NotImplementedError(
             "You must implement either the `get_dataset` method to return the dataset for the given phase, "
             "or `get_data_iterator` to return an iterator for the given dataset."

{xax-0.1.16 → xax-0.2.0}/xax/task/mixins/train.py RENAMED Viewed

@@ -11,7 +11,7 @@ import textwrap
 import time
 import traceback
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, is_dataclass
+from dataclasses import asdict, dataclass, is_dataclass
 from threading import Thread
 from typing import (
     Any,
@@ -33,7 +33,6 @@ import jax.numpy as jnp
 import numpy as np
 import optax
 from jaxtyping import Array, PRNGKeyArray, PyTree
-from omegaconf import DictConfig
 from xax.core.conf import field
 from xax.core.state import Phase, State
@@ -50,6 +49,7 @@ from xax.utils.experiments import (
     TrainingFinishedError,
     diff_configs,
     get_diff_string,
+    get_info_json,
     get_state_file_string,
     get_training_code,
 )
@@ -340,20 +340,30 @@ class TrainMixin(
         if init_ckpt_path is not None:
             logger.info("Loading checkpoint from %s", init_ckpt_path)
-            if load_optimizer:
-                model, optimizer, opt_state, state, config = self.load_checkpoint(init_ckpt_path)
-                config_diff = get_diff_string(diff_configs(config, cast(DictConfig, self.config)))
-                if config_diff:
-                    logger.warning("Loaded config differs from current config:\n%s", config_diff)
-                return model, optimizer, opt_state, state
+            model_spec = eqx.filter_eval_shape(self.get_model, key)
+            model, state, config = self.load_checkpoint(
+                init_ckpt_path,
+                part="model_state_config",
+                model_template=model_spec,
+            )
+            config_diff = get_diff_string(diff_configs(asdict(config), asdict(self.config)))
+            if config_diff:
+                logger.warning("Loaded config differs from current config:\n%s", config_diff)
-            else:
-                model, state, config = self.load_checkpoint(init_ckpt_path, "model_state_config")
-                config_diff = get_diff_string(diff_configs(config, cast(DictConfig, self.config)))
-                if config_diff:
-                    logger.warning("Loaded config differs from current config:\n%s", config_diff)
+            if not load_optimizer:
                 return model, state
+            # Loads the optimizer.
+            optimizer_spec = eqx.filter_eval_shape(self.get_optimizer)
+            optimizer = self.load_checkpoint(init_ckpt_path, part="opt", optimizer_template=optimizer_spec)
+            # Loads the optimizer state.
+            opt_state_spec = eqx.filter_eval_shape(self.get_initial_opt_state, model, optimizer)
+            opt_state = self.load_checkpoint(init_ckpt_path, part="opt_state", opt_state_template=opt_state_spec)
+            return model, optimizer, opt_state, state
+        logger.info("No checkpoint found. Initializing a new model.")
         model = self.get_model(key)
         state = State.init_state()
@@ -554,6 +564,7 @@ class TrainMixin(
         self.logger.log_file("state.txt", get_state_file_string(self))
         self.logger.log_file("training_code.py", get_training_code(self))
         self.logger.log_file("config.yaml", self.config_str(self.config, use_cli=False))
+        self.logger.log_file("info.json", get_info_json())
     def model_partition_fn(self, item: Any) -> bool:  # noqa: ANN401
         return eqx.is_inexact_array(item)
@@ -627,16 +638,16 @@ class TrainMixin(
             if self.should_checkpoint(state):
                 model = eqx.combine(model_arr, model_static)
-                self.save_checkpoint(model, optimizer, opt_state, state)
+                self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)
         # After finishing training, save the final checkpoint.
         model = eqx.combine(model_arr, model_static)
-        self.save_checkpoint(model, optimizer, opt_state, state)
+        self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)
     @contextlib.contextmanager
-    def get_train_iterator(self) -> Generator[Iterator[Batch], None, None]:
+    def get_train_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
         try:
-            train_iterator: Iterator[Batch] = self.get_data_iterator("train")
+            train_iterator: Iterator[Batch] = self.get_data_iterator("train", key=key)
             yield train_iterator
             return
         except NotImplementedError:
@@ -653,9 +664,9 @@ class TrainMixin(
             logger.info("Closing train prefetcher")
     @contextlib.contextmanager
-    def get_valid_iterator(self) -> Generator[Iterator[Batch], None, None]:
+    def get_valid_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
         try:
-            valid_iterator: Iterator[Batch] = self.get_data_iterator("valid")
+            valid_iterator: Iterator[Batch] = self.get_data_iterator("valid", key=key)
             yield valid_iterator
             return
         except NotImplementedError:
@@ -699,12 +710,13 @@ class TrainMixin(
             state = self.on_training_start(state)
             def on_exit() -> None:
-                self.save_checkpoint(model, optimizer, opt_state, state)
+                self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)
             # Handle user-defined interrupts during the training loop.
             self.add_signal_handler(on_exit, signal.SIGUSR1, signal.SIGTERM)
-            with self.get_train_iterator() as train_pf, self.get_valid_iterator() as valid_pf:
+            key, tkey, vkey = jax.random.split(key, 3)
+            with self.get_train_iterator(tkey) as train_pf, self.get_valid_iterator(vkey) as valid_pf:
                 try:
                     self.train_loop(
                         model=model,
@@ -721,7 +733,7 @@ class TrainMixin(
                             f"Finished training after {state.num_steps} steps, {state.num_samples} samples",
                             important=True,
                         )
-                    self.save_checkpoint(model, optimizer, opt_state, state)
+                    self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)
                 except (KeyboardInterrupt, bdb.BdbQuit):
                     if is_master():
@@ -731,7 +743,7 @@ class TrainMixin(
                     exception_tb = textwrap.indent(highlight_exception_message(traceback.format_exc()), "  ")
                     sys.stdout.write(f"Caught exception during training loop:\n\n{exception_tb}\n")
                     sys.stdout.flush()
-                    self.save_checkpoint(model, optimizer, opt_state, state)
+                    self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)
                 finally:
                     state = self.on_training_end(state)

{xax-0.1.16 → xax-0.2.0}/xax/utils/experiments.py RENAMED Viewed

@@ -7,6 +7,7 @@ import functools
 import hashlib
 import inspect
 import itertools
+import json
 import logging
 import math
 import os
@@ -24,7 +25,7 @@ import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Iterator, Self, TypeVar, cast
+from typing import Any, Iterator, Mapping, Self, Sequence, TypeVar, cast
 from urllib.parse import urlparse
 import git
@@ -116,9 +117,7 @@ class StateTimer:
     def log_dict(self) -> dict[str, int | float | tuple[int | float, bool]]:
         return {
-            "steps": (self.step_timer.steps, True),
             "steps/second": self.step_timer.steps_per_second,
-            "samples": (self.sample_timer.steps, True),
             "samples/second": (self.sample_timer.steps_per_second, True),
             "dt": self.iter_timer.iter_seconds,
         }
@@ -204,8 +203,8 @@ class MinGradScaleError(TrainingFinishedError):
 def diff_configs(
-    first: ListConfig | DictConfig,
-    second: ListConfig | DictConfig,
+    first: Mapping | Sequence,
+    second: Mapping | Sequence,
     prefix: str | None = None,
 ) -> tuple[list[str], list[str]]:
     """Returns the difference between two configs.
@@ -232,7 +231,7 @@ def diff_configs(
     any_config = (ListConfig, DictConfig)
-    if isinstance(first, DictConfig) and isinstance(second, DictConfig):
+    if isinstance(first, Mapping) and isinstance(second, Mapping):
         first_keys, second_keys = cast(set[str], set(first.keys())), cast(set[str], set(second.keys()))
         # Gets the new keys in each config.
@@ -242,11 +241,12 @@ def diff_configs(
         # Gets the new sub-keys in each config.
         for key in first_keys.intersection(second_keys):
             sub_prefix = key if prefix is None else f"{prefix}.{key}"
-            if OmegaConf.is_missing(first, key) or OmegaConf.is_missing(second, key):
-                if not OmegaConf.is_missing(first, key):
-                    new_first += [get_diff_string(sub_prefix, first[key])]
-                if not OmegaConf.is_missing(second, key):
-                    new_second += [get_diff_string(sub_prefix, second[key])]
+            if isinstance(first, DictConfig) and isinstance(second, DictConfig):
+                if OmegaConf.is_missing(first, key) or OmegaConf.is_missing(second, key):
+                    if not OmegaConf.is_missing(first, key):
+                        new_first += [get_diff_string(sub_prefix, first[key])]
+                    if not OmegaConf.is_missing(second, key):
+                        new_second += [get_diff_string(sub_prefix, second[key])]
             elif isinstance(first[key], any_config) and isinstance(second[key], any_config):
                 sub_new_first, sub_new_second = diff_configs(first[key], second[key], prefix=sub_prefix)
                 new_first, new_second = new_first + sub_new_first, new_second + sub_new_second
@@ -255,7 +255,7 @@ def diff_configs(
                 new_first += [get_diff_string(sub_prefix, first_val)]
                 new_second += [get_diff_string(sub_prefix, second_val)]
-    elif isinstance(first, ListConfig) and isinstance(second, ListConfig):
+    elif isinstance(first, Sequence) and isinstance(second, Sequence):
         if len(first) > len(second):
             for i in range(len(second), len(first)):
                 new_first += [get_diff_string(prefix, first[i])]
@@ -470,16 +470,33 @@ def get_command_line_string() -> str:
     return " ".join(sys.argv)
+def get_environment_variables() -> str:
+    return "\n".join([f"{key}={value}" for key, value in sorted(os.environ.items())])
 def get_state_file_string(obj: object) -> str:
     return "\n\n".join(
         [
             f"=== Command Line ===\n\n{get_command_line_string()}",
             f"=== Git State ===\n\n{get_git_state(obj)}",
             f"=== Packages ===\n\n{get_packages_with_versions()}",
+            f"=== Environment Variables ===\n\n{get_environment_variables()}",
         ]
     )
+def get_info_json() -> str:
+    return json.dumps(
+        {
+            "process_id": os.getpid(),
+            "job": {
+                "start_time": datetime.datetime.now().isoformat(),
+            },
+        },
+        indent=2,
+    )
 def get_training_code(obj: object) -> str:
     """Gets the text from the file containing the provided object.

{xax-0.1.16 → xax-0.2.0}/xax/utils/tensorboard.py RENAMED Viewed

@@ -2,11 +2,12 @@
 import functools
 import io
+import json
 import os
 import tempfile
 import time
 from pathlib import Path
-from typing import Literal, TypedDict
+from typing import Any, Literal, TypedDict
 import numpy as np
 import PIL.Image
@@ -14,9 +15,15 @@ from PIL.Image import Image as PILImage
 from tensorboard.compat.proto.config_pb2 import RunMetadata
 from tensorboard.compat.proto.event_pb2 import Event, TaggedRunMetadata
 from tensorboard.compat.proto.graph_pb2 import GraphDef
-from tensorboard.compat.proto.summary_pb2 import HistogramProto, Summary, SummaryMetadata
+from tensorboard.compat.proto.summary_pb2 import (
+    HistogramProto,
+    Summary,
+    SummaryMetadata,
+)
 from tensorboard.compat.proto.tensor_pb2 import TensorProto
 from tensorboard.compat.proto.tensor_shape_pb2 import TensorShapeProto
+from tensorboard.plugins.mesh import metadata as mesh_metadata
+from tensorboard.plugins.mesh.plugin_data_pb2 import MeshPluginData
 from tensorboard.plugins.text.plugin_data_pb2 import TextPluginData
 from tensorboard.summary.writer.event_file_writer import EventFileWriter
@@ -84,6 +91,68 @@ def make_histogram(values: np.ndarray, bins: str | np.ndarray, max_bins: int | N
     )
+def _get_json_config(config_dict: dict[str, Any] | None) -> str:
+    json_config = "{}"
+    if config_dict is not None:
+        json_config = json.dumps(config_dict, sort_keys=True)
+    return json_config
+def make_mesh_summary(
+    tag: str,
+    vertices: np.ndarray,
+    colors: np.ndarray | None,
+    faces: np.ndarray | None,
+    config_dict: dict[str, Any] | None,
+    display_name: str | None = None,
+    description: str | None = None,
+) -> Summary:
+    json_config = _get_json_config(config_dict)
+    summaries = []
+    tensors = [
+        (vertices, MeshPluginData.VERTEX),
+        (faces, MeshPluginData.FACE),
+        (colors, MeshPluginData.COLOR),
+    ]
+    # Filter out None tensors and explicitly type the list
+    valid_tensors = [(t, content_type) for t, content_type in tensors if t is not None]
+    components = mesh_metadata.get_components_bitmask([content_type for (_, content_type) in valid_tensors])
+    for tensor, content_type in valid_tensors:  # Now we know tensor is not None
+        tensor_metadata = mesh_metadata.create_summary_metadata(
+            tag,
+            display_name,
+            content_type,
+            components,
+            tensor.shape,  # Safe now since tensor is not None
+            description,
+            json_config=json_config,
+        )
+        tensor_proto = TensorProto(
+            dtype="DT_FLOAT",
+            float_val=tensor.reshape(-1).tolist(),  # Safe now since tensor is not None
+            tensor_shape=TensorShapeProto(
+                dim=[
+                    TensorShapeProto.Dim(size=tensor.shape[0]),  # Safe now since tensor is not None
+                    TensorShapeProto.Dim(size=tensor.shape[1]),
+                    TensorShapeProto.Dim(size=tensor.shape[2]),
+                ]
+            ),
+        )
+        tensor_summary = Summary.Value(
+            tag=mesh_metadata.get_instance_name(tag, content_type),
+            tensor=tensor_proto,
+            metadata=tensor_metadata,
+        )
+        summaries.append(tensor_summary)
+    return Summary(value=summaries)
 class TensorboardProtobufWriter:
     def __init__(
         self,
@@ -454,6 +523,9 @@ class TensorboardWriter:
         weighted_sum = float((bin_centers * bucket_counts).sum())
         weighted_sum_squares = float((bin_centers**2 * bucket_counts).sum())
+        # Convert bin edges to list of floats explicitly
+        bucket_limits: list[float | np.ndarray] = [float(x) for x in bin_edges[1:]]
         self.add_histogram_raw(
             tag=tag,
             min=float(bin_edges[0]),
@@ -461,12 +533,28 @@ class TensorboardWriter:
             num=int(total_counts),
             sum=weighted_sum,
             sum_squares=weighted_sum_squares,
-            bucket_limits=bin_edges[1:].tolist(),  # TensorBoard expects right bin edges
+            bucket_limits=bucket_limits,  # Now properly typed
             bucket_counts=bucket_counts.tolist(),
             global_step=global_step,
             walltime=walltime,
         )
+    def add_mesh(
+        self,
+        tag: str,
+        vertices: np.ndarray,
+        colors: np.ndarray | None,
+        faces: np.ndarray | None,
+        config_dict: dict[str, Any] | None,
+        global_step: int | None = None,
+        walltime: float | None = None,
+    ) -> None:
+        self.pb_writer.add_summary(
+            make_mesh_summary(tag, vertices, colors, faces, config_dict),
+            global_step=global_step,
+            walltime=walltime,
+        )
 class TensorboardWriterKwargs(TypedDict):
     max_queue_size: int

{xax-0.1.16 → xax-0.2.0/xax.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xax
-Version: 0.1.16
+Version: 0.2.0
 Summary: A library for fast Jax experimentation
 Home-page: https://github.com/kscalelabs/xax
 Author: Benjamin Bolte
@@ -8,14 +8,14 @@ Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: attrs
+Requires-Dist: chex
+Requires-Dist: dpshdl
+Requires-Dist: equinox
+Requires-Dist: importlib-resources
 Requires-Dist: jax
 Requires-Dist: jaxtyping
-Requires-Dist: equinox
 Requires-Dist: optax
-Requires-Dist: dpshdl
-Requires-Dist: chex
-Requires-Dist: importlib-resources
-Requires-Dist: cloudpickle
+Requires-Dist: orbax-checkpoint
 Requires-Dist: pillow
 Requires-Dist: omegaconf
 Requires-Dist: gitpython

{xax-0.1.16 → xax-0.2.0}/xax.egg-info/requires.txt RENAMED Viewed

@@ -1,12 +1,12 @@
 attrs
+chex
+dpshdl
+equinox
+importlib-resources
 jax
 jaxtyping
-equinox
 optax
-dpshdl
-chex
-importlib-resources
-cloudpickle
+orbax-checkpoint
 pillow
 omegaconf
 gitpython