xax 0.0.6-py3-none-any.whl → 0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xax/task/mixins/train.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
     TypeVar,
     cast,
     get_args,
+    overload,
 )
 
 import equinox as eqx
@@ -35,6 +36,7 @@ from omegaconf import DictConfig
 
 from xax.core.conf import field
 from xax.core.state import Phase, State
+from xax.nn.functions import set_random_seed
 from xax.nn.parallel import is_master
 from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
 from xax.task.mixins.checkpointing import CheckpointingConfig, CheckpointingMixin
@@ -115,7 +117,7 @@ class ValidStepTimer:
         if self.last_valid_time is None or self.last_valid_step is None:
             self.last_valid_time = state.elapsed_time_s
             self.last_valid_step = state.num_steps
-            return True
+            return False
 
         # Step-based validation.
         valid_every_n_steps = self.valid_every_n_steps
@@ -183,6 +185,9 @@ class TrainMixin(
     def __init__(self, config: Config) -> None:
         super().__init__(config)
 
+        # Sets the random seed whenever we instantiate a new train mixin.
+        set_random_seed(self.config.random_seed)
+
         # Timer for validation steps.
         self.valid_step_timer = ValidStepTimer(
             valid_every_n_steps=config.valid_every_n_steps,
@@ -279,31 +284,53 @@ class TrainMixin(
     def get_initial_opt_state(self, model: PyTree, optimizer: optax.GradientTransformation) -> optax.OptState:
         return optimizer.init(eqx.filter(model, eqx.is_array))
 
+    @overload
+    def load_initial_state(
+        self,
+        key: PRNGKeyArray,
+        load_optimizer: Literal[False] = False,
+    ) -> tuple[PyTree, State]: ...
+
+    @overload
     def load_initial_state(
         self,
         key: PRNGKeyArray,
-    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State]:
+        load_optimizer: Literal[True],
+    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State]: ...
+
+    def load_initial_state(
+        self,
+        key: PRNGKeyArray,
+        load_optimizer: bool = False,
+    ) -> tuple[PyTree, State] | tuple[PyTree, optax.GradientTransformation, optax.OptState, State]:
         init_ckpt_path = self.get_init_ckpt_path()
 
         if init_ckpt_path is not None:
             logger.info("Loading checkpoint from %s", init_ckpt_path)
-            with self.step_context("load_checkpoint"):
+            if load_optimizer:
                 model, optimizer, opt_state, state, config = self.load_checkpoint(init_ckpt_path)
                 config_diff = get_diff_string(diff_configs(config, cast(DictConfig, self.config)))
                 if config_diff:
                     logger.warning("Loaded config differs from current config:\n%s", config_diff)
                 return model, optimizer, opt_state, state
 
-        with self.step_context("get_model"):
-            model = self.get_model(key)
+            else:
+                model, state, config = self.load_checkpoint(init_ckpt_path, "model_state_config")
+                config_diff = get_diff_string(diff_configs(config, cast(DictConfig, self.config)))
+                if config_diff:
+                    logger.warning("Loaded config differs from current config:\n%s", config_diff)
+                return model, state
+
+        model = self.get_model(key)
+        state = State.init_state()
 
-        with self.step_context("get_optimizer"):
-            optimizer = self.get_optimizer()
+        if not load_optimizer:
+            return model, state
 
-        with self.step_context("get_initial_opt_state"):
-            opt_state = self.get_initial_opt_state(model, optimizer)
+        optimizer = self.get_optimizer()
+        opt_state = self.get_initial_opt_state(model, optimizer)
 
-        return model, optimizer, opt_state, State.init_state()
+        return model, optimizer, opt_state, state
 
     @eqx.filter_jit
     def get_output(self, model: PyTree, batch: Batch) -> Output:
@@ -424,6 +451,7 @@ class TrainMixin(
     def log_state(self) -> None:
         logger.log(LOG_STATUS, self.task_path)
         logger.log(LOG_STATUS, self.task_name)
+        logger.log(LOG_STATUS, "JAX devices: %s", jax.devices())
         self.logger.log_file("git_state.txt", get_git_state(self))
         self.logger.log_file("training_code.txt", get_training_code(self))
         self.logger.log_file("config.yaml", self.config_str(self.config, use_cli=False))
@@ -456,7 +484,8 @@ class TrainMixin(
         while not self.is_training_over(state):
             if self.valid_step_timer.is_valid_step(state):
                 valid_batch = next(valid_pf)
-                model, loss, output = self.val_step(model, valid_batch)
+                with self.step_context("model_step"):
+                    model, loss, output = self.val_step(model, valid_batch)
 
                 # Perform logging.
                 with self.step_context("write_logs"):
@@ -464,22 +493,19 @@ class TrainMixin(
                     self.log_step(model, valid_batch, output, loss, state)
                     state.num_valid_samples += 1
 
-            with self.step_context("on_step_start"):
-                state = self.on_step_start(state)
+            state = self.on_step_start(state)
 
-            with self.step_context("update_state"):
+            with self.step_context("model_step"):
                 train_batch = next(train_pf)
                 model, opt_state, loss, output = self.train_step(model, optimizer, opt_state, train_batch)
 
-            # Perform logging.
             with self.step_context("write_logs"):
                 state.phase = "train"
                 self.log_step(model, train_batch, output, loss, state)
                 state.num_steps += 1
                 state.num_samples += self.get_size_of_batch(train_batch) or 0
 
-            with self.step_context("on_step_end"):
-                state = self.on_step_end(state)
+            state = self.on_step_end(state)
 
             if self.should_checkpoint(state):
                 self.save_checkpoint(model, optimizer, opt_state, state)
@@ -496,14 +522,9 @@ class TrainMixin(
         except NotImplementedError:
             pass
 
-        with self.step_context("get_dataset"):
-            train_ds = self.get_dataset("train")
-
-        with self.step_context("get_dataloader"):
-            train_dl = self.get_dataloader(train_ds, "train")
-
-        with self.step_context("get_prefetcher"):
-            train_pf = self.get_prefetcher(train_dl)
+        train_ds = self.get_dataset("train")
+        train_dl = self.get_dataloader(train_ds, "train")
+        train_pf = self.get_prefetcher(train_dl)
 
         try:
             with train_pf as train_pf_ctx:
@@ -520,14 +541,9 @@ class TrainMixin(
         except NotImplementedError:
             pass
 
-        with self.step_context("get_dataset"):
-            valid_ds = self.get_dataset("valid")
-
-        with self.step_context("get_dataloader"):
-            valid_dl = self.get_dataloader(valid_ds, "valid")
-
-        with self.step_context("get_prefetcher"):
-            valid_pf = self.get_prefetcher(valid_dl)
+        valid_ds = self.get_dataset("valid")
+        valid_dl = self.get_dataloader(valid_ds, "valid")
+        valid_pf = self.get_prefetcher(valid_dl)
 
         try:
             with valid_pf as valid_pf_ctx:
@@ -559,7 +575,7 @@ class TrainMixin(
             Thread(target=self.log_state, daemon=True).start()
 
            key, model_key = jax.random.split(key)
-            model, optimizer, opt_state, state = self.load_initial_state(model_key)
+            model, optimizer, opt_state, state = self.load_initial_state(model_key, load_optimizer=True)
            state = self.on_training_start(state)
 
            def on_exit() -> None:
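
The overloads added to `load_initial_state` change the call-site contract: the default `load_optimizer=False` returns only `(model, state)`, while `load_optimizer=True` also constructs or restores the optimizer and its state, which is how the training entry point now calls it. A minimal sketch of the two call patterns, assuming a hypothetical `MyTask` that mixes in `TrainMixin` with a config `cfg`:

    key = jax.random.PRNGKey(0)
    task = MyTask(cfg)  # hypothetical TrainMixin subclass and config

    # Evaluation-style loading: no optimizer or optimizer state is created.
    model, state = task.load_initial_state(key)

    # Training-style loading, matching the updated call in this diff.
    model, optimizer, opt_state, state = task.load_initial_state(key, load_optimizer=True)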
xax/task/script.py CHANGED
@@ -17,8 +17,6 @@ from xax.task.mixins import (
     ProcessMixin,
     RunnableConfig,
     RunnableMixin,
-    StepContextConfig,
-    StepContextMixin,
 )
 
 
@@ -28,7 +26,6 @@ class ScriptConfig(
     GPUStatsConfig,
     ProcessConfig,
     LoggerConfig,
-    StepContextConfig,
     ArtifactsConfig,
     RunnableConfig,
     BaseConfig,
@@ -44,7 +41,6 @@ class Script(
     GPUStatsMixin[ConfigT],
     ProcessMixin[ConfigT],
     LoggerMixin[ConfigT],
-    StepContextMixin[ConfigT],
     ArtifactsMixin[ConfigT],
     RunnableMixin[ConfigT],
     BaseTask[ConfigT],
xax/utils/debugging.py ADDED
@@ -0,0 +1,49 @@
+"""Defines some useful Jax debugging utilities."""
+
+from collections import deque
+from collections.abc import Iterable, Mapping
+from typing import Any, Callable, Deque
+
+from jaxtyping import Array
+
+
+def get_named_leaves(
+    obj: Any,  # noqa: ANN401
+    is_leaf: Callable[[Any], bool] = lambda x: isinstance(x, Array),  # noqa: ANN401
+    max_depth: int = 100,
+) -> list[tuple[str, Any]]:  # noqa: ANN401
+    ret: list[tuple[str, Any]] = []
+    q: Deque[tuple[int, str, Any]] = deque()  # noqa: ANN401
+    q.append((0, "", obj))
+
+    while q:
+        depth, name, node = q.popleft()
+
+        if depth > max_depth:
+            continue
+
+        if hasattr(node, "__dict__") and isinstance(node.__dict__, Mapping):
+            for cname, cnode in node.__dict__.items():
+                gname = f"{name}.{cname}" if name else cname
+                if is_leaf(cnode):
+                    ret.append((gname, cnode))
+                else:
+                    q.append((depth + 1, gname, cnode))
+
+        elif isinstance(node, Mapping):
+            for cname, cnode in node.items():
+                gname = f"{name}.{cname}" if name else cname
+                if is_leaf(cnode):
+                    ret.append((gname, cnode))
+                else:
+                    q.append((depth + 1, gname, cnode))
+
+        elif isinstance(node, Iterable):
+            for i, cnode in enumerate(node):
+                gname = f"{name}.{i}" if name else str(i)
+                if is_leaf(cnode):
+                    ret.append((gname, cnode))
+                else:
+                    q.append((depth + 1, gname, cnode))
+
+    return ret
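
`get_named_leaves` does a breadth-first walk over an object's attributes, mappings, and iterables, and returns a dotted-path name for each array leaf it finds. A small illustrative sketch, assuming an Equinox MLP as the object being inspected:

    import equinox as eqx
    import jax

    from xax.utils.debugging import get_named_leaves

    model = eqx.nn.MLP(in_size=3, out_size=2, width_size=8, depth=2, key=jax.random.PRNGKey(0))
    for name, leaf in get_named_leaves(model):
        # Prints dotted paths such as "layers.0.weight" alongside each array's shape.
        print(name, leaf.shape)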
xax/utils/experiments.py CHANGED
@@ -23,7 +23,8 @@ import urllib.request
 import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Iterator, TypeVar, cast
+from types import TracebackType
+from typing import Any, Iterator, Self, TypeVar, cast
 from urllib.parse import urlparse
 
 import git
@@ -116,19 +117,19 @@ class StateTimer:
         logs: dict[str, dict[str, int | float]] = {}
 
         # Logs step statistics.
-        logs[" steps"] = {
+        logs[" steps"] = {
             "total": self.step_timer.steps,
             "per-second": self.step_timer.steps_per_second,
         }
 
         # Logs sample statistics.
-        logs[" samples"] = {
+        logs[" samples"] = {
             "total": self.sample_timer.steps,
             "per-second": self.sample_timer.steps_per_second,
         }
 
         # Logs full iteration statistics.
-        logs["🔧 dt"] = {
+        logs[" dt"] = {
             "iter": self.iter_timer.iter_seconds,
         }
 
@@ -147,6 +148,24 @@ class IntervalTicker:
         return False
 
 
+class ContextTimer:
+    def __init__(self) -> None:
+        self.start_time = 0.0
+        self.elapsed_time = 0.0
+
+    def __enter__(self) -> Self:
+        self.start_time = time.time()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        self.elapsed_time = time.time() - self.start_time
+
+
 def abs_path(path: str) -> str:
     return str(Path(path).resolve())
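
The new `ContextTimer` is a small context manager that records the wall-clock time between entry and exit in `elapsed_time`. A usage sketch, with `expensive_step` standing in for whatever is being timed:

    from xax.utils.experiments import ContextTimer

    with ContextTimer() as timer:
        expensive_step()  # placeholder for the code being measured

    print(f"step took {timer.elapsed_time:.3f} seconds")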
 
xax/utils/jax.py CHANGED
@@ -1,14 +1,140 @@
 """Defines some utility functions for interfacing with Jax."""
 
+import inspect
+import logging
+import os
+import time
+from functools import wraps
+from typing import Any, Callable, Iterable, ParamSpec, Sequence, TypeVar, cast
+
+import jax
 import jax.numpy as jnp
 import numpy as np
+from jax._src import sharding_impls
+from jax._src.lib import xla_client as xc
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_COMPILE_TIMEOUT = 1.0
 
 Number = int | float | np.ndarray | jnp.ndarray
 
 
+P = ParamSpec("P")  # For function parameters
+R = TypeVar("R")  # For function return type
+
+
 def as_float(value: int | float | np.ndarray | jnp.ndarray) -> float:
     if isinstance(value, (int, float)):
         return float(value)
     if isinstance(value, (np.ndarray, jnp.ndarray)):
         return float(value.item())
     raise TypeError(f"Unexpected type: {type(value)}")
+
+
+def get_hash(obj: object) -> int:
+    """Get a hash of an object.
+
+    If the object is hashable, use the hash. Otherwise, use the id.
+    """
+    if hasattr(obj, "__hash__"):
+        return hash(obj)
+    return id(obj)
+
+
+def jit(
+    in_shardings: Any = sharding_impls.UNSPECIFIED,  # noqa: ANN401
+    out_shardings: Any = sharding_impls.UNSPECIFIED,  # noqa: ANN401
+    static_argnums: int | Sequence[int] | None = None,
+    static_argnames: str | Iterable[str] | None = None,
+    donate_argnums: int | Sequence[int] | None = None,
+    donate_argnames: str | Iterable[str] | None = None,
+    keep_unused: bool = False,
+    device: xc.Device | None = None,
+    backend: str | None = None,
+    inline: bool = False,
+    abstracted_axes: Any | None = None,  # noqa: ANN401
+    compiler_options: dict[str, Any] | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    """Wrapper function that provides utility improvements over Jax's JIT.
+
+    Specifically, this function works on class methods, is toggleable, and
+    detects recompilations by matching hash values.
+
+    This is meant to be used as a decorator factory, and the decorated function
+    calls `wrapped`.
+    """
+
+    def decorator(fn: Callable[P, R]) -> Callable[P, R]:
+        class JitState:
+            compilation_count = 0
+            last_arg_dict: dict[str, int] | None = None
+
+        sig = inspect.signature(fn)
+        param_names = list(sig.parameters.keys())
+
+        jitted_fn = jax.jit(
+            fn,
+            in_shardings=in_shardings,
+            out_shardings=out_shardings,
+            static_argnums=static_argnums,
+            static_argnames=static_argnames,
+            donate_argnums=donate_argnums,
+            donate_argnames=donate_argnames,
+            keep_unused=keep_unused,
+            device=device,
+            backend=backend,
+            inline=inline,
+            abstracted_axes=abstracted_axes,
+            compiler_options=compiler_options,
+        )
+
+        @wraps(fn)
+        def wrapped(*args: P.args, **kwargs: P.kwargs) -> R:
+            if os.environ.get("DEBUG", "0") == "1":  # skipping during debug
+                return fn(*args, **kwargs)
+
+            do_profile = os.environ.get("JIT_PROFILE", "0") == "1"
+
+            if do_profile:
+                class_name = (args[0].__class__.__name__) + "." if fn.__name__ == "__call__" else ""
+                logger.info(
+                    "Currently running %s (count: %s)",
+                    f"{class_name}{fn.__name__}",
+                    JitState.compilation_count,
+                )
+
+            start_time = time.time()
+            res = jitted_fn(*args, **kwargs)
+            end_time = time.time()
+            runtime = end_time - start_time
+
+            # if this is true, if runtime is higher than COMPILE_TIMEOUT, we recompile
+            # TODO: we should probably reimplement the lower-level jitting logic to avoid this
+            if do_profile:
+                arg_dict = {}
+                for i, arg in enumerate(args):
+                    if i < len(param_names):
+                        arg_dict[param_names[i]] = get_hash(arg)
+                for k, v in kwargs.items():
+                    arg_dict[k] = get_hash(v)
+
+                logger.info("Hashing took %s seconds", runtime)
+                JitState.compilation_count += 1
+
+                if JitState.last_arg_dict is not None:
+                    all_keys = set(arg_dict.keys()) | set(JitState.last_arg_dict.keys())
+                    for k in all_keys:
+                        prev = JitState.last_arg_dict.get(k, "N/A")
+                        curr = arg_dict.get(k, "N/A")
+
+                        if prev != curr:
+                            logger.info("- Arg '%s' hash changed: %s -> %s", k, prev, curr)
+
+                JitState.last_arg_dict = arg_dict
+
+            return cast(R, res)
+
+        return wrapped
+
+    return decorator
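
The new `jit` helper is a decorator factory that forwards its options to `jax.jit`; per the source above, setting `DEBUG=1` bypasses compilation entirely and `JIT_PROFILE=1` logs argument-hash changes between calls to help spot recompilations. A minimal usage sketch with an illustrative function:

    import jax.numpy as jnp

    from xax.utils.jax import jit

    @jit()
    def scaled_sum(x: jnp.ndarray, scale: float) -> jnp.ndarray:
        # Compiled on first call, like any jax.jit-wrapped function.
        return scale * jnp.sum(x)

    print(scaled_sum(jnp.arange(8.0), 2.0))  # 56.0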
xax/utils/jaxpr.py ADDED
@@ -0,0 +1,77 @@
+"""Visualize JAXPR."""
+
+from pathlib import Path
+
+import jax
+import jax.core
+
+
+def save_jaxpr_dot(closed_jaxpr: jax.core.ClosedJaxpr, filename: str | Path) -> None:
+    """Save the JAXPR to a DOT file.
+
+    Example usage:
+
+        grad_fn_jaxpr = jax.make_jaxpr(loss_fn)(variables)
+        save_jaxpr_dot(grad_fn_jaxpr, "grad_fn_jaxpr.dot")
+
+    Then, you can visualize the JAXPR using Graphviz:
+
+        dot -Tpng grad_fn_jaxpr.dot > grad_fn_jaxpr.png
+
+    Args:
+        closed_jaxpr: The closed JAXPR to save.
+        filename: The filename to save the JAXPR to.
+    """
+    if hasattr(closed_jaxpr, "jaxpr"):
+        jaxpr = closed_jaxpr.jaxpr
+    else:
+        jaxpr = closed_jaxpr
+
+    with open(filename, "w") as f:
+        f.write("digraph Jaxpr {\n")
+
+        var_names: dict[jax.core.Var, str] = {}
+        var_count = 0
+
+        def get_var_name(var: jax.core.Var) -> str:
+            """Get a unique name for a variable."""
+            nonlocal var_names, var_count
+
+            # Handle Literal objects specially since they're not hashable
+            if isinstance(var, jax.core.Literal):
+                # Create a name based on the literal value
+                name = f"lit_{var.val}"
+                return name
+
+            # For other variables
+            if var not in var_names:
+                name = f"var_{var_count}"
+                var_names[var] = name
+                var_count += 1
+            return var_names[var]
+
+        for var in jaxpr.invars:
+            node_name = get_var_name(var)
+            f.write(f'  {node_name} [label="{node_name}\\n(input)"];\n')
+
+        eq_count = 0
+        for eq in jaxpr.eqns:
+            eq_node = f"eq{eq_count}"
+            label = f"{eq.primitive.name}"
+            f.write(f'  {eq_node} [shape=box, label="{label}"];\n')
+
+            for invar in eq.invars:
+                var_name = get_var_name(invar)
+                f.write(f"  {var_name} -> {eq_node};\n")
+
+            for outvar in eq.outvars:
+                var_name = get_var_name(outvar)
+                f.write(f"  {eq_node} -> {var_name};\n")
+
+            eq_count += 1
+
+        for var in jaxpr.outvars:
+            node_name = get_var_name(var)
+            f.write(f'  {node_name} [peripheries=2, label="{node_name}\\n(output)"];\n')
+
+        f.write("}\n")
xax/utils/profile.py ADDED
@@ -0,0 +1,61 @@
+"""Profiling utilities."""
+
+import logging
+import os
+import time
+from functools import wraps
+from typing import Callable, ParamSpec, TypeVar
+
+logger = logging.getLogger(__name__)
+
+P = ParamSpec("P")  # For function parameters
+R = TypeVar("R")  # For function return type
+
+
+def profile(fn: Callable[P, R]) -> Callable[P, R]:
+    """Profiling decorator that tracks function call count and execution time.
+
+    Activated when the PROFILE environment variable is set to "1".
+
+    Returns:
+        A decorated function with profiling capabilities.
+    """
+
+    class ProfileState:
+        call_count = 0
+        total_time = 0.0
+
+    @wraps(fn)
+    def wrapped(*args: P.args, **kwargs: P.kwargs) -> R:
+        if os.environ.get("PROFILE", "0") != "1":
+            return fn(*args, **kwargs)
+
+        start_time = time.time()
+        res = fn(*args, **kwargs)
+        end_time = time.time()
+        runtime = end_time - start_time
+
+        ProfileState.call_count += 1
+        ProfileState.total_time += runtime
+
+        # Handle class methods by showing class name
+        if fn.__name__ == "__call__" or (args and hasattr(args[0], "__class__")):
+            try:
+                class_name = args[0].__class__.__name__ + "."
+            except (IndexError, AttributeError):
+                class_name = ""
+        else:
+            class_name = ""
+
+        logger.info(
+            "%s %s - call #%s, took %s seconds, total: %s seconds",
+            class_name,
+            fn.__name__,
+            ProfileState.call_count,
+            runtime,
+            ProfileState.total_time,
+        )
+
+        return res
+
+    return wrapped
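
The `profile` decorator is a no-op unless the PROFILE environment variable is set to "1", in which case every call is logged through the module logger with its runtime and a running total. A brief sketch with an illustrative function:

    import os

    from xax.utils.profile import profile

    os.environ["PROFILE"] = "1"  # enable per-call logging

    @profile
    def slow_add(a: float, b: float) -> float:
        return a + b

    slow_add(1.0, 2.0)  # logged with call count, runtime, and cumulative time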