PyPI - xax - Versions diffs - 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl - Mend

xax 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

xax/__init__.py +121 -3
xax/nn/equinox.py +180 -0
xax/nn/export.py +147 -0
xax/nn/geom.py +101 -0
xax/nn/norm.py +23 -0
xax/requirements.txt +1 -0
xax/task/base.py +6 -0
xax/task/logger.py +97 -2
xax/task/loggers/stdout.py +2 -2
xax/task/loggers/tensorboard.py +25 -14
xax/task/mixins/artifacts.py +1 -21
xax/task/mixins/checkpointing.py +19 -5
xax/task/mixins/logger.py +28 -4
xax/task/mixins/step_wrapper.py +23 -32
xax/task/mixins/train.py +50 -34
xax/task/script.py +0 -4
xax/utils/debugging.py +49 -0
xax/utils/experiments.py +23 -4
xax/utils/jax.py +126 -0
xax/utils/jaxpr.py +77 -0
xax/utils/profile.py +61 -0
xax/utils/pytree.py +238 -0
xax/utils/tensorboard.py +177 -1
{xax-0.0.6.dist-info → xax-0.1.0.dist-info}/METADATA +23 -4
{xax-0.0.6.dist-info → xax-0.1.0.dist-info}/RECORD +28 -20
{xax-0.0.6.dist-info → xax-0.1.0.dist-info}/WHEEL +1 -1
{xax-0.0.6.dist-info → xax-0.1.0.dist-info/licenses}/LICENSE +0 -0
{xax-0.0.6.dist-info → xax-0.1.0.dist-info}/top_level.txt +0 -0

xax/nn/norm.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Normalization utilities."""
+from typing import Literal, cast, get_args
+import jax.numpy as jnp
+NormType = Literal["l1", "l2"]
+def cast_norm_type(norm: str) -> NormType:
+    if norm not in get_args(NormType):
+        raise ValueError(f"Invalid norm: {norm}")
+    return cast(NormType, norm)
+def get_norm(x: jnp.ndarray, norm: NormType) -> jnp.ndarray:
+    match norm:
+        case "l1":
+            return jnp.abs(x)
+        case "l2":
+            return jnp.square(x)
+        case _:
+            raise ValueError(f"Invalid norm: {norm}")

xax/requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 # requirements.txt
 # Core ML/JAX dependencies
+attrs
 jax
 jaxtyping
 equinox

xax/task/base.py CHANGED Viewed

@@ -81,6 +81,12 @@ class BaseTask(Generic[Config]):
     def on_training_end(self, state: State) -> State:
         return state
+    def on_after_checkpoint_save(self, ckpt_path: Path, state: State) -> State:
+        return state
+    def on_before_checkpoint_load(self, ckpt_path: Path) -> None:
+        pass
     @functools.cached_property
     def task_class_name(self) -> str:
         return self.__class__.__name__

xax/task/logger.py CHANGED Viewed

@@ -223,10 +223,29 @@ class LogVideo:
     fps: int
+@dataclass(kw_only=True)
+class LogDistribution:
+    mean: Number
+    std: Number
+@dataclass(kw_only=True)
+class LogHistogram:
+    min: Number
+    max: Number
+    num: int
+    sum: Number
+    sum_squares: Number
+    bucket_limits: list[Number]
+    bucket_counts: list[int]
 @dataclass(kw_only=True)
 class LogLine:
     state: State
     scalars: dict[str, dict[str, Number]]
+    distributions: dict[str, dict[str, LogDistribution]]
+    histograms: dict[str, dict[str, LogHistogram]]
     strings: dict[str, dict[str, str]]
     images: dict[str, dict[str, LogImage]]
     videos: dict[str, dict[str, LogVideo]]
@@ -329,9 +348,9 @@ def image_with_text(
     else:
         text = text[:max_num_lines]
     width, height = image.size
-    font: ImageFont.ImageFont = ImageFont.load_default()
+    font: ImageFont.ImageFont | ImageFont.FreeTypeFont = ImageFont.load_default()
     _, _, _, line_height = font.getbbox(text[0])
-    new_width, new_height = width, height + line_spacing + max_num_lines * (line_height + line_spacing)
+    new_width, new_height = width, int(height + line_spacing + max_num_lines * (line_height + line_spacing))
     padded_image = Image.new(image.mode, (new_width, new_height), 255)
     padded_image.paste(image, (0, 0))
     drawer = ImageDraw.Draw(padded_image)
@@ -497,6 +516,8 @@ class Logger:
     def __init__(self, default_namespace: str = DEFAULT_NAMESPACE) -> None:
         self.scalars: dict[str, dict[str, Callable[[], Number]]] = defaultdict(dict)
+        self.distributions: dict[str, dict[str, Callable[[], LogDistribution]]] = defaultdict(dict)
+        self.histograms: dict[str, dict[str, Callable[[], LogHistogram]]] = defaultdict(dict)
         self.strings: dict[str, dict[str, Callable[[], str]]] = defaultdict(dict)
         self.images: dict[str, dict[str, Callable[[], LogImage]]] = defaultdict(dict)
         self.videos: dict[str, dict[str, Callable[[], LogVideo]]] = defaultdict(dict)
@@ -522,6 +543,8 @@ class Logger:
         return LogLine(
             state=state,
             scalars={k: {kk: v() for kk, v in v.items()} for k, v in self.scalars.items()},
+            distributions={k: {kk: v() for kk, v in v.items()} for k, v in self.distributions.items()},
+            histograms={k: {kk: v() for kk, v in v.items()} for k, v in self.histograms.items()},
             strings={k: {kk: v() for kk, v in v.items()} for k, v in self.strings.items()},
             images={k: {kk: v() for kk, v in v.items()} for k, v in self.images.items()},
             videos={k: {kk: v() for kk, v in v.items()} for k, v in self.videos.items()},
@@ -529,6 +552,8 @@ class Logger:
     def clear(self) -> None:
         self.scalars.clear()
+        self.distributions.clear()
+        self.histograms.clear()
         self.strings.clear()
         self.images.clear()
         self.videos.clear()
@@ -612,6 +637,76 @@ class Logger:
         self.scalars[namespace][key] = scalar_future
+    def log_distribution(
+        self,
+        key: str,
+        value: Callable[[], tuple[Number, Number]] | tuple[Number, Number],
+        *,
+        namespace: str | None = None,
+    ) -> None:
+        """Logs a distribution value.
+        Args:
+            key: The key being logged
+            value: The distribution value being logged, a tuple of (mean, std)
+            namespace: An optional logging namespace
+        """
+        if not self.active:
+            raise RuntimeError("The logger is not active")
+        namespace = self.resolve_namespace(namespace)
+        @functools.lru_cache(maxsize=None)
+        def distribution_future() -> LogDistribution:
+            mean, std = value() if callable(value) else value
+            return LogDistribution(mean=mean, std=std)
+        self.distributions[namespace][key] = distribution_future
+    def log_histogram(
+        self,
+        key: str,
+        value: Callable[[], np.ndarray | Array] | np.ndarray | Array,
+        *,
+        bins: int = 100,
+        namespace: str | None = None,
+    ) -> None:
+        """Logs a histogram value.
+        Args:
+            key: The key being logged
+            value: The histogram value being logged
+            bins: The number of bins to use for the histogram
+            namespace: An optional logging namespace
+        """
+        if not self.active:
+            raise RuntimeError("The logger is not active")
+        namespace = self.resolve_namespace(namespace)
+        @functools.lru_cache(maxsize=None)
+        def histogram_future() -> LogHistogram:
+            values = value() if callable(value) else value
+            values = values.reshape(-1)  # Must be flat.
+            if isinstance(values, Array):
+                counts, limits = jnp.histogram(values, bins=bins)
+                counts, limits = as_numpy(counts), as_numpy(limits)
+            elif isinstance(values, np.ndarray):
+                counts, limits = np.histogram(values, bins=bins)
+            else:
+                raise ValueError(f"Unsupported histogram type: {type(values)}")
+            return LogHistogram(
+                min=float(values.min()),
+                max=float(values.max()),
+                num=int(values.size),
+                sum=float(values.sum()),
+                sum_squares=float(values.dot(values)),
+                bucket_limits=limits[1:].tolist(),
+                bucket_counts=counts.tolist(),
+            )
+        self.histograms[namespace][key] = histogram_future
     def log_string(self, key: str, value: Callable[[], str] | str, *, namespace: str | None = None) -> None:
         """Logs a string value.

xax/task/loggers/stdout.py CHANGED Viewed

@@ -33,7 +33,7 @@ class StdoutLogger(LoggerImpl):
         self,
         write_fp: TextIO = sys.stdout,
         precision: int = 4,
-        log_timers: bool = False,
+        log_timers: bool = True,
         log_perf: bool = False,
         log_optim: bool = False,
         log_fp: bool = False,
@@ -98,7 +98,7 @@ class StdoutLogger(LoggerImpl):
         def add_logs(log: dict[str, dict[str, Any]], namespace_to_lines: dict[str, dict[str, str]]) -> None:
             for namespace, values in log.items():
-                if not self.log_timers and namespace.startswith("⏰"):
+                if not self.log_timers and namespace.startswith("⌛"):
                     continue
                 if not self.log_perf and namespace.startswith("🔧"):
                     continue

xax/task/loggers/tensorboard.py CHANGED Viewed

@@ -1,11 +1,9 @@
 """Defines a Tensorboard logger backend."""
 import atexit
-import functools
 import logging
 import os
 import re
-import shutil
 import subprocess
 import threading
 import time
@@ -140,15 +138,6 @@ class TensorboardLogger(LoggerImpl):
     def __del__(self) -> None:
         self.cleanup()
-    @functools.lru_cache(None)  # Avoid clearing logs multiple times.
-    def clear_logs(self) -> None:
-        if not self.log_directory.exists():
-            return
-        if not any(child.is_dir() for child in self.log_directory.iterdir()):
-            return
-        logger.warning("Clearing TensorBoard logs")
-        shutil.rmtree(self.log_directory)
     def get_writer(self, phase: Phase) -> TensorboardWriter:
         self._start()
         return self.writers.writer(phase)
@@ -162,9 +151,6 @@ class TensorboardLogger(LoggerImpl):
         if not is_master():
             return
-        if line.state.num_steps == 0:
-            self.clear_logs()
         writer = self.get_writer(line.state.phase)
         walltime = line.state.start_time_s + line.state.elapsed_time_s
@@ -177,6 +163,31 @@ class TensorboardLogger(LoggerImpl):
                     walltime=walltime,
                 )
+        for namespace, distributions in line.distributions.items():
+            for distribution_key, distribution_value in distributions.items():
+                writer.add_gaussian_distribution(
+                    f"{namespace}/{distribution_key}",
+                    mean=float(distribution_value.mean),
+                    std=float(distribution_value.std),
+                    global_step=line.state.num_steps,
+                    walltime=walltime,
+                )
+        for namespace, histograms in line.histograms.items():
+            for histogram_key, histogram_value in histograms.items():
+                writer.add_histogram_raw(
+                    f"{namespace}/{histogram_key}",
+                    min=float(histogram_value.min),
+                    max=float(histogram_value.max),
+                    num=int(histogram_value.num),
+                    sum=float(histogram_value.sum),
+                    sum_squares=float(histogram_value.sum_squares),
+                    bucket_limits=[float(x) for x in histogram_value.bucket_limits],
+                    bucket_counts=[int(x) for x in histogram_value.bucket_counts],
+                    global_step=line.state.num_steps,
+                    walltime=walltime,
+                )
         for namespace, strings in line.strings.items():
             for string_key, string_value in strings.items():
                 writer.add_text(

xax/task/mixins/artifacts.py CHANGED Viewed

@@ -3,7 +3,6 @@
 import functools
 import inspect
 import logging
-import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Self, TypeVar
@@ -54,20 +53,6 @@ class ArtifactsMixin(BaseTask[Config]):
         self._exp_dir = Path(exp_dir).expanduser().resolve()
         return self
-    def add_lock_file(self, lock_type: str, *, exists_ok: bool = False) -> None:
-        if (lock_file := self.exp_dir / f".lock_{lock_type}").exists():
-            if not exists_ok:
-                raise RuntimeError(f"Lock file already exists at {lock_file}")
-        else:
-            with open(lock_file, "w", encoding="utf-8") as f:
-                f.write(f"PID: {os.getpid()}")
-    def remove_lock_file(self, lock_type: str, *, missing_ok: bool = False) -> None:
-        if (lock_file := self.exp_dir / f".lock_{lock_type}").exists():
-            lock_file.unlink()
-        elif not missing_ok:
-            raise RuntimeError(f"Lock file not found at {lock_file}")
     def get_exp_dir(self) -> Path:
         if self._exp_dir is not None:
             return self._exp_dir
@@ -82,13 +67,8 @@ class ArtifactsMixin(BaseTask[Config]):
         def get_exp_dir(run_id: int) -> Path:
             return self.run_dir / f"run_{run_id}"
-        def has_lock_file(exp_dir: Path, lock_type: str | None = None) -> bool:
-            if lock_type is not None:
-                return (exp_dir / f".lock_{lock_type}").exists()
-            return any(exp_dir.glob(".lock_*"))
         run_id = 0
-        while (exp_dir := get_exp_dir(run_id)).is_dir() and has_lock_file(exp_dir):
+        while (exp_dir := get_exp_dir(run_id)).is_dir():
             run_id += 1
         exp_dir.mkdir(exist_ok=True, parents=True)
         self._exp_dir = exp_dir.expanduser().resolve()

xax/task/mixins/checkpointing.py CHANGED Viewed

@@ -21,7 +21,7 @@ from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
 logger = logging.getLogger(__name__)
-CheckpointPart = Literal["model", "opt", "opt_state", "state", "config"]
+CheckpointPart = Literal["model", "opt", "opt_state", "state", "config", "model_state_config", "all"]
 def get_ckpt_path(exp_dir: Path, state: State | None = None) -> Path:
@@ -88,8 +88,16 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
+        part: Literal["all"] = "all",
     ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]: ...
+    @overload
+    def load_checkpoint(
+        self,
+        path: Path,
+        part: Literal["model_state_config"] = "model_state_config",
+    ) -> tuple[PyTree, State, DictConfig]: ...
     @overload
     def load_checkpoint(self, path: Path, part: Literal["model"]) -> PyTree: ...
@@ -108,15 +116,19 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
-        part: CheckpointPart | None = None,
+        part: CheckpointPart = "all",
     ) -> (
         tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]
+        | tuple[PyTree, State, DictConfig]
         | PyTree
         | optax.GradientTransformation
         | optax.OptState
         | State
         | DictConfig
     ):
+        # Calls the base callback.
+        self.on_before_checkpoint_load(path)
         with tarfile.open(path, "r:gz") as tar:
             def get_model() -> PyTree:
@@ -155,7 +167,9 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
                     return get_state()
                 case "config":
                     return get_config()
-                case None:
+                case "model_state_config":
+                    return get_model(), get_state(), get_config()
+                case "all":
                     return get_model(), get_opt(), get_opt_state(), get_state(), get_config()
                 case _:
                     raise ValueError(f"Invalid checkpoint part: {part}")
@@ -215,7 +229,7 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
         except FileExistsError:
             logger.exception("Exception while trying to update %s", ckpt_path)
-        # Marks directory as having artifacts which shouldn't be overwritten.
-        self.add_lock_file("ckpt", exists_ok=True)
+        # Calls the base callback.
+        self.on_after_checkpoint_save(ckpt_path, state)
         return ckpt_path

xax/task/mixins/logger.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Generic, Self, TypeVar
 import jax
+from xax.core.conf import field
 from xax.core.state import State
 from xax.task.base import BaseConfig, BaseTask
 from xax.task.logger import Logger, LoggerImpl
@@ -22,7 +23,14 @@ from xax.utils.text import is_interactive_session
 @jax.tree_util.register_dataclass
 @dataclass
 class LoggerConfig(BaseConfig):
-    pass
+    log_interval_seconds: float = field(
+        value=1.0,
+        help="The interval between successive log lines.",
+    )
+    tensorboard_log_interval_seconds: float = field(
+        value=10.0,
+        help="The interval between successive Tensorboard log lines.",
+    )
 Config = TypeVar("Config", bound=LoggerConfig)
@@ -49,11 +57,27 @@ class LoggerMixin(BaseTask[Config], Generic[Config]):
         self.logger.add_logger(*logger)
     def set_loggers(self) -> None:
-        self.add_logger(StdoutLogger() if is_interactive_session() else JsonLogger())
+        self.add_logger(
+            StdoutLogger(
+                log_interval_seconds=self.config.log_interval_seconds,
+            )
+            if is_interactive_session()
+            else JsonLogger(
+                log_interval_seconds=self.config.log_interval_seconds,
+            )
+        )
+        # If this is also an ArtifactsMixin, we should default add some
+        # additional loggers which log data to the artifacts directory.
         if isinstance(self, ArtifactsMixin):
             self.add_logger(
-                StateLogger(self.exp_dir),
-                TensorboardLogger(self.exp_dir),
+                StateLogger(
+                    run_directory=self.exp_dir,
+                ),
+                TensorboardLogger(
+                    run_directory=self.exp_dir,
+                    log_interval_seconds=self.config.tensorboard_log_interval_seconds,
+                ),
             )
     def write_logs(self, state: State) -> None:

xax/task/mixins/step_wrapper.py CHANGED Viewed

@@ -1,53 +1,39 @@
 """Defines a mixin to wrap some steps in a context manager."""
+import time
 from dataclasses import dataclass
 from types import TracebackType
-from typing import ContextManager, Literal, TypeVar
+from typing import Callable, ContextManager, TypeVar
-import equinox as eqx
 import jax
 from xax.task.base import BaseConfig, BaseTask
-StepType = Literal[
-    "backward",
-    "change_mode",
-    "clip_grads",
-    "create_optimizers",
-    "forward",
-    "get_dataloader",
-    "get_dataset",
-    "get_prefetcher",
-    "get_model",
-    "get_optimizer",
-    "get_initial_opt_state",
-    "get_update_fn",
-    "load_checkpoint",
-    "log_losses",
-    "model_to_device",
-    "on_step_end",
-    "on_step_start",
-    "save_checkpoint",
-    "step",
-    "update_state",
-    "write_logs",
-    "zero_grads",
-]
 class StepContext(ContextManager):
     """Context manager to get the current step type."""
-    CURRENT_STEP: StepType | None = None
+    CURRENT_STEP: str | None = None
-    def __init__(self, step: StepType) -> None:
+    def __init__(
+        self,
+        step: str,
+        on_context_start: Callable[[str], None],
+        on_context_end: Callable[[str, float], None],
+    ) -> None:
         self.step = step
+        self.start_time = 0.0
+        self.on_context_start = on_context_start
+        self.on_context_end = on_context_end
     def __enter__(self) -> None:
         StepContext.CURRENT_STEP = self.step
+        self.start_time = time.time()
+        self.on_context_start(self.step)
     def __exit__(self, _t: type[BaseException] | None, _e: BaseException | None, _tr: TracebackType | None) -> None:
         StepContext.CURRENT_STEP = None
+        self.on_context_end(self.step, time.time() - self.start_time)
 @jax.tree_util.register_dataclass
@@ -63,6 +49,11 @@ class StepContextMixin(BaseTask[Config]):
     def __init__(self, config: Config) -> None:
         super().__init__(config)
-    @eqx.filter_jit
-    def step_context(self, step: StepType) -> ContextManager:
-        return StepContext(step)
+    def step_context(self, step: str) -> ContextManager:
+        return StepContext(step, self.on_context_start, self.on_context_stop)
+    def on_context_start(self, step: str) -> None:
+        pass
+    def on_context_stop(self, step: str, elapsed_time: float) -> None:
+        pass

xax 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl

xax 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl