xax 0.0.3__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +122 -8
- xax/core/conf.py +9 -33
- xax/core/state.py +13 -23
- xax/nn/embeddings.py +355 -0
- xax/nn/functions.py +8 -4
- xax/requirements-dev.txt +9 -1
- xax/requirements.txt +17 -10
- xax/task/base.py +2 -6
- xax/task/logger.py +419 -412
- xax/task/loggers/callback.py +44 -0
- xax/task/loggers/state.py +5 -18
- xax/task/loggers/tensorboard.py +16 -33
- xax/task/mixins/__init__.py +3 -1
- xax/task/mixins/artifacts.py +19 -9
- xax/task/mixins/checkpointing.py +221 -0
- xax/task/mixins/compile.py +104 -0
- xax/task/mixins/cpu_stats.py +26 -15
- xax/task/mixins/data_loader.py +27 -19
- xax/task/mixins/gpu_stats.py +22 -8
- xax/task/mixins/logger.py +5 -251
- xax/task/mixins/process.py +8 -1
- xax/task/mixins/runnable.py +3 -0
- xax/task/mixins/step_wrapper.py +5 -0
- xax/task/mixins/train.py +236 -145
- xax/task/script.py +1 -1
- xax/task/task.py +13 -5
- xax/utils/data/collate.py +6 -6
- xax/utils/experiments.py +45 -1
- xax/utils/logging.py +29 -0
- xax/utils/tensorboard.py +89 -21
- xax-0.0.6.dist-info/METADATA +50 -0
- xax-0.0.6.dist-info/RECORD +52 -0
- {xax-0.0.3.dist-info → xax-0.0.6.dist-info}/WHEEL +1 -1
- xax/task/launchers/staged.py +0 -29
- xax-0.0.3.dist-info/METADATA +0 -39
- xax-0.0.3.dist-info/RECORD +0 -49
- {xax-0.0.3.dist-info → xax-0.0.6.dist-info}/LICENSE +0 -0
- {xax-0.0.3.dist-info → xax-0.0.6.dist-info}/top_level.txt +0 -0
xax/task/loggers/callback.py
ADDED
@@ -0,0 +1,44 @@
+"""Defines a logger that calls a callback function with the log line."""
+
+from typing import Callable
+
+from xax.task.logger import LogError, LogErrorSummary, LoggerImpl, LogLine, LogPing, LogStatus
+
+
+class CallbackLogger(LoggerImpl):
+    def __init__(
+        self,
+        *,
+        callback: Callable[[LogLine], None] = lambda x: None,
+        error_summary_callback: Callable[[LogErrorSummary], None] = lambda x: None,
+        error_callback: Callable[[LogError], None] = lambda x: None,
+        status_callback: Callable[[LogStatus], None] = lambda x: None,
+        ping_callback: Callable[[LogPing], None] = lambda x: None,
+        file_callback: Callable[[str, str], None] = lambda x, y: None,
+    ) -> None:
+        super().__init__()
+
+        self.callback = callback
+        self.error_summary_callback = error_summary_callback
+        self.error_callback = error_callback
+        self.status_callback = status_callback
+        self.ping_callback = ping_callback
+        self.file_callback = file_callback
+
+    def write(self, line: LogLine) -> None:
+        self.callback(line)
+
+    def write_error_summary(self, error_summary: LogErrorSummary) -> None:
+        self.error_summary_callback(error_summary)
+
+    def write_error(self, error: LogError) -> None:
+        self.error_callback(error)
+
+    def write_status(self, status: LogStatus) -> None:
+        self.status_callback(status)
+
+    def write_ping(self, ping: LogPing) -> None:
+        self.ping_callback(ping)
+
+    def log_file(self, name: str, contents: str) -> None:
+        self.file_callback(name, contents)
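The new CallbackLogger routes every log event to a user-supplied callable, which makes it convenient for tests and custom sinks. A minimal usage sketch (the callbacks and values shown here are illustrative, not part of the package):

    from xax.task.loggers.callback import CallbackLogger

    # Capture logged files in memory rather than writing them anywhere.
    captured: dict[str, str] = {}

    cb_logger = CallbackLogger(
        callback=lambda line: print("got line at step", line.state.num_steps),
        file_callback=lambda name, contents: captured.__setitem__(name, contents),
    )

    # Normally the task's LoggerMixin drives these calls; invoking log_file
    # directly just demonstrates the dispatch.
    cb_logger.log_file("config.yaml", "batch_size: 32\n")
    assert captured["config.yaml"] == "batch_size: 32\n"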
xax/task/loggers/state.py
CHANGED
@@ -3,8 +3,6 @@
 from pathlib import Path
 from typing import Literal
 
-from omegaconf import DictConfig, OmegaConf
-
 from xax.task.logger import LoggerImpl, LogLine
 
 
@@ -12,9 +10,6 @@ class StateLogger(LoggerImpl):
     def __init__(
         self,
         run_directory: str | Path,
-        git_state_name: str = "git_state.txt",
-        train_code_name: str = "train_code.py",
-        config_name: str = "config.yaml",
         flush_immediately: bool = False,
         open_mode: Literal["w", "a"] = "w",
         line_sep: str = "\n",
@@ -22,24 +17,16 @@ class StateLogger(LoggerImpl):
     ) -> None:
         super().__init__(float("inf"))
 
-        self.git_state_file = Path(run_directory).expanduser().resolve() / git_state_name
-        self.train_code_file = Path(run_directory).expanduser().resolve() / train_code_name
-        self.config_file = Path(run_directory).expanduser().resolve() / config_name
+        self.run_directory = Path(run_directory).expanduser().resolve()
+
         self.flush_immediately = flush_immediately
         self.open_mode = open_mode
         self.line_sep = line_sep
         self.remove_unicode_from_namespaces = remove_unicode_from_namespaces
 
-    def log_git_state(self, git_state: str) -> None:
-        with open(self.git_state_file, "w") as f:
-            f.write(git_state)
-
-    def log_training_code(self, training_code: str) -> None:
-        with open(self.train_code_file, "w") as f:
-            f.write(training_code)
-
-    def log_config(self, config: DictConfig) -> None:
-        OmegaConf.save(config, self.config_file)
+    def log_file(self, name: str, contents: str) -> None:
+        with open(self.run_directory / name, "w") as f:
+            f.write(contents)
 
     def write(self, line: LogLine) -> None:
         pass
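The three special-purpose methods (log_git_state, log_training_code, log_config) collapse into one generic log_file, so StateLogger no longer depends on OmegaConf and every artifact lands under run_directory by name. A sketch of the new call pattern (paths and contents are illustrative):

    from pathlib import Path

    from xax.task.loggers.state import StateLogger

    run_dir = Path("runs/example")
    run_dir.mkdir(parents=True, exist_ok=True)
    state_logger = StateLogger(run_directory=run_dir)

    # One generic entry point instead of log_git_state / log_training_code / log_config:
    state_logger.log_file("git_state.txt", "commit abc123\n")
    state_logger.log_file("config.yaml", "batch_size: 32\n")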
xax/task/loggers/tensorboard.py
CHANGED
@@ -12,10 +12,6 @@ import time
 from pathlib import Path
 from typing import TypeVar
 
-import jax
-import PIL.Image
-from omegaconf import DictConfig, OmegaConf
-
 from xax.core.state import Phase
 from xax.nn.parallel import is_master
 from xax.task.logger import LoggerImpl, LogLine
@@ -62,10 +58,7 @@ class TensorboardLogger(LoggerImpl):
 
         self.proc: subprocess.Popen | None = None
 
-        self.git_state: str | None = None
-        self.training_code: str | None = None
-        self.config: DictConfig | None = None
-
+        self.files: dict[str, str] = {}
         self.writers = TensorboardWriters(log_directory=self.log_directory, flush_seconds=flush_seconds)
         self._started = False
 
@@ -84,7 +77,7 @@ class TensorboardLogger(LoggerImpl):
         port = int(os.environ.get("TENSORBOARD_PORT", DEFAULT_TENSORBOARD_PORT))
 
         while port_is_busy(port):
-            logger.warning(…
+            logger.warning("Port %s is busy, waiting...", port)
             time.sleep(10)
 
         def make_localhost(s: str) -> str:
@@ -160,20 +153,10 @@ class TensorboardLogger(LoggerImpl):
         self._start()
         return self.writers.writer(phase)
 
-    def log_git_state(self, git_state: str) -> None:
-        if not is_master():
-            return
-        self.git_state = f"```\n{git_state}\n```"
-
-    def log_training_code(self, training_code: str) -> None:
+    def log_file(self, name: str, contents: str) -> None:
         if not is_master():
             return
-        self.training_code = f"```\n{training_code}\n```"
-
-    def log_config(self, config: DictConfig) -> None:
-        if not is_master():
-            return
-        self.config = config
+        self.files[name] = f"```\n{contents}\n```"
 
     def write(self, line: LogLine) -> None:
         if not is_master():
@@ -205,22 +188,22 @@ class TensorboardLogger(LoggerImpl):
 
         for namespace, images in line.images.items():
             for image_key, image_value in images.items():
-                image = PIL.Image.fromarray(jax.device_get(image_value.pixels))
                 writer.add_image(
                     f"{namespace}/{image_key}",
-                    image,
+                    image_value.image,
                     global_step=line.state.num_steps,
                     walltime=walltime,
                 )
 
-        [seven lines removed here; content not captured]
+        for namespace, videos in line.videos.items():
+            for video_key, video_value in videos.items():
+                writer.add_video(
+                    f"{namespace}/{video_key}",
+                    video_value.frames,
+                    fps=video_value.fps,
+                    global_step=line.state.num_steps,
+                )
 
-        [line removed here; content not captured]
-        writer.add_text(…
-        [line removed here; content not captured]
+        for name, contents in self.files.items():
+            writer.add_text(name, contents)
+        self.files.clear()
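TensorboardLogger now follows the same log_file protocol: files are buffered in a dict, wrapped in Markdown fences so TensorBoard renders them verbatim, and flushed as text exactly once on the next write. A standalone sketch of that buffering scheme (a simplified illustration, not the package code itself):

    from typing import Callable

    # Files accumulate between writes, then are emitted once and cleared.
    files: dict[str, str] = {}

    def log_file(name: str, contents: str) -> None:
        files[name] = f"```\n{contents}\n```"

    def flush(add_text: Callable[[str, str], None]) -> None:
        for name, contents in files.items():
            add_text(name, contents)
        files.clear()

    log_file("config.yaml", "lr: 3e-4")
    flush(lambda name, text: print(name, text))
    assert not files  # emitted once, then cleared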
xax/task/mixins/__init__.py
CHANGED
@@ -1,6 +1,8 @@
 """Defines a single interface for all the mixins."""
 
 from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
+from xax.task.mixins.checkpointing import CheckpointingConfig, CheckpointingMixin
+from xax.task.mixins.compile import CompileConfig, CompileMixin
 from xax.task.mixins.cpu_stats import CPUStatsConfig, CPUStatsMixin
 from xax.task.mixins.data_loader import DataloadersConfig, DataloadersMixin
 from xax.task.mixins.gpu_stats import GPUStatsConfig, GPUStatsMixin
@@ -8,4 +10,4 @@ from xax.task.mixins.logger import LoggerConfig, LoggerMixin
 from xax.task.mixins.process import ProcessConfig, ProcessMixin
 from xax.task.mixins.runnable import RunnableConfig, RunnableMixin
 from xax.task.mixins.step_wrapper import StepContextConfig, StepContextMixin
-from xax.task.mixins.train import …
+from xax.task.mixins.train import TrainConfig, TrainMixin
xax/task/mixins/artifacts.py
CHANGED
@@ -8,6 +8,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Self, TypeVar
 
+import jax
+
 from xax.core.conf import field, get_run_dir
 from xax.core.state import State
 from xax.nn.parallel import is_master
@@ -19,6 +21,7 @@ from xax.utils.text import show_info
 logger = logging.getLogger(__name__)
 
 
+@jax.tree_util.register_dataclass
 @dataclass
 class ArtifactsConfig(BaseConfig):
     exp_dir: str | None = field(None, help="The fixed experiment directory")
@@ -43,8 +46,12 @@ class ArtifactsMixin(BaseTask[Config]):
         run_dir = Path(task_file).resolve().parent
         return run_dir / self.task_name
 
-    [two lines removed here; content not captured]
+    @property
+    def exp_dir(self) -> Path:
+        return self.get_exp_dir()
+
+    def set_exp_dir(self, exp_dir: str | Path) -> Self:
+        self._exp_dir = Path(exp_dir).expanduser().resolve()
         return self
 
     def add_lock_file(self, lock_type: str, *, exists_ok: bool = False) -> None:
@@ -61,13 +68,16 @@ class ArtifactsMixin(BaseTask[Config]):
         elif not missing_ok:
             raise RuntimeError(f"Lock file not found at {lock_file}")
 
-    [two lines removed here; content not captured]
+    def get_exp_dir(self) -> Path:
+        if self._exp_dir is not None:
+            return self._exp_dir
+
         if self.config.exp_dir is not None:
             exp_dir = Path(self.config.exp_dir).expanduser().resolve()
             exp_dir.mkdir(parents=True, exist_ok=True)
-            [two lines removed here; content not captured]
+            self._exp_dir = exp_dir
+            logger.log(LOG_STATUS, self._exp_dir)
+            return self._exp_dir
 
         def get_exp_dir(run_id: int) -> Path:
             return self.run_dir / f"run_{run_id}"
@@ -81,9 +91,9 @@ class ArtifactsMixin(BaseTask[Config]):
         while (exp_dir := get_exp_dir(run_id)).is_dir() and has_lock_file(exp_dir):
             run_id += 1
         exp_dir.mkdir(exist_ok=True, parents=True)
-        [line removed here; content not captured]
-        logger.log(LOG_STATUS, …
-        return …
+        self._exp_dir = exp_dir.expanduser().resolve()
+        logger.log(LOG_STATUS, self._exp_dir)
+        return self._exp_dir
 
     @functools.lru_cache(maxsize=None)
     def stage_environment(self) -> Path | None:
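The refactor caches the resolved experiment directory on self._exp_dir and exposes it through an exp_dir property, so the directory is resolved and created once no matter how many mixins ask for it. A standalone illustration of that caching pattern (simplified; not the mixin itself):

    from pathlib import Path

    class ExpDirHolder:
        def __init__(self, configured_dir: str | None) -> None:
            self._exp_dir: Path | None = None
            self._configured_dir = configured_dir

        @property
        def exp_dir(self) -> Path:
            return self.get_exp_dir()

        def get_exp_dir(self) -> Path:
            if self._exp_dir is not None:  # already resolved on an earlier call
                return self._exp_dir
            if self._configured_dir is None:
                raise RuntimeError("no experiment directory configured")
            exp_dir = Path(self._configured_dir).expanduser().resolve()
            exp_dir.mkdir(parents=True, exist_ok=True)
            self._exp_dir = exp_dir
            return self._exp_dir

    holder = ExpDirHolder("runs/demo")
    assert holder.exp_dir == holder.exp_dir  # resolved once, then cached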
xax/task/mixins/checkpointing.py
ADDED
@@ -0,0 +1,221 @@
+"""Defines a mixin for handling model checkpointing."""
+
+import io
+import json
+import logging
+import tarfile
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any, Callable, Generic, Literal, TypeVar, cast, overload
+
+import cloudpickle
+import jax
+import optax
+from jaxtyping import PyTree
+from omegaconf import DictConfig, OmegaConf
+
+from xax.core.conf import field
+from xax.core.state import State
+from xax.nn.parallel import is_master
+from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
+
+logger = logging.getLogger(__name__)
+
+CheckpointPart = Literal["model", "opt", "opt_state", "state", "config"]
+
+
+def get_ckpt_path(exp_dir: Path, state: State | None = None) -> Path:
+    """Defines the path to the checkpoint for a given state.
+
+    Args:
+        exp_dir: The experiment directory
+        state: The current trainer state
+
+    Returns:
+        The path to the checkpoint file.
+    """
+    if state is None:
+        return exp_dir / "checkpoints" / "ckpt.bin"
+    return exp_dir / "checkpoints" / f"ckpt.{state.num_steps}.bin"
+
+
+@jax.tree_util.register_dataclass
+@dataclass
+class CheckpointingConfig(ArtifactsConfig):
+    save_every_n_steps: int | None = field(None, help="Save a checkpoint every N steps")
+    save_every_n_seconds: float | None = field(60.0 * 60.0, help="Save a checkpoint every N seconds")
+    only_save_most_recent: bool = field(True, help="Only keep the most recent checkpoint")
+    load_from_ckpt_path: str | None = field(None, help="If set, load initial model weights from this path")
+    load_ckpt_strict: bool = field(True, help="If set, only load weights for which have a matching key in the model")
+    save_tf_model: bool = field(False, help="If set, saves a Tensorflow version of the model")
+
+
+Config = TypeVar("Config", bound=CheckpointingConfig)
+
+
+class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
+    def __init__(self, config: Config) -> None:
+        super().__init__(config)
+
+        self.__last_ckpt_time = 0.0
+
+    def get_ckpt_path(self, state: State | None = None) -> Path:
+        return get_ckpt_path(self.exp_dir, state)
+
+    def get_init_ckpt_path(self) -> Path | None:
+        if self._exp_dir is not None:
+            ckpt_path = self.get_ckpt_path()
+            if ckpt_path.exists():
+                return ckpt_path
+        if self.config.load_from_ckpt_path is not None:
+            ckpt_path = Path(self.config.load_from_ckpt_path)
+            assert ckpt_path.exists(), f"Checkpoint path {ckpt_path} does not exist."
+            return ckpt_path
+        return None
+
+    def should_checkpoint(self, state: State) -> bool:
+        if self.config.save_every_n_steps is not None:
+            if state.num_steps % self.config.save_every_n_steps == 0:
+                return True
+        if self.config.save_every_n_seconds is not None:
+            last_time, cur_time = self.__last_ckpt_time, state.elapsed_time_s
+            if cur_time - last_time >= self.config.save_every_n_seconds:
+                self.__last_ckpt_time = cur_time
+                return True
+        return False
+
+    @overload
+    def load_checkpoint(
+        self,
+        path: Path,
+    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]: ...
+
+    @overload
+    def load_checkpoint(self, path: Path, part: Literal["model"]) -> PyTree: ...
+
+    @overload
+    def load_checkpoint(self, path: Path, part: Literal["opt"]) -> optax.GradientTransformation: ...
+
+    @overload
+    def load_checkpoint(self, path: Path, part: Literal["opt_state"]) -> optax.OptState: ...
+
+    @overload
+    def load_checkpoint(self, path: Path, part: Literal["state"]) -> State: ...
+
+    @overload
+    def load_checkpoint(self, path: Path, part: Literal["config"]) -> DictConfig: ...
+
+    def load_checkpoint(
+        self,
+        path: Path,
+        part: CheckpointPart | None = None,
+    ) -> (
+        tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]
+        | PyTree
+        | optax.GradientTransformation
+        | optax.OptState
+        | State
+        | DictConfig
+    ):
+        with tarfile.open(path, "r:gz") as tar:
+
+            def get_model() -> PyTree:
+                if (model := tar.extractfile("model")) is None:
+                    raise ValueError(f"Checkpoint does not contain a model file: {path}")
+                return cloudpickle.load(model)
+
+            def get_opt() -> optax.GradientTransformation:
+                if (opt := tar.extractfile("opt")) is None:
+                    raise ValueError(f"Checkpoint does not contain an opt file: {path}")
+                return cloudpickle.load(opt)
+
+            def get_opt_state() -> optax.OptState:
+                if (opt_state := tar.extractfile("opt_state")) is None:
+                    raise ValueError(f"Checkpoint does not contain an opt_state file: {path}")
+                return cloudpickle.load(opt_state)
+
+            def get_state() -> State:
+                if (state := tar.extractfile("state")) is None:
+                    raise ValueError(f"Checkpoint does not contain a state file: {path}")
+                return State(**json.loads(state.read().decode()))
+
+            def get_config() -> DictConfig:
+                if (config := tar.extractfile("config")) is None:
+                    raise ValueError(f"Checkpoint does not contain a config file: {path}")
+                return cast(DictConfig, OmegaConf.load(config))
+
+            match part:
+                case "model":
+                    return get_model()
+                case "opt":
+                    return get_opt()
+                case "opt_state":
+                    return get_opt_state()
+                case "state":
+                    return get_state()
+                case "config":
+                    return get_config()
+                case None:
+                    return get_model(), get_opt(), get_opt_state(), get_state(), get_config()
+                case _:
+                    raise ValueError(f"Invalid checkpoint part: {part}")
+
+    def save_checkpoint(
+        self,
+        model: PyTree,
+        optimizer: optax.GradientTransformation,
+        opt_state: optax.OptState,
+        state: State,
+    ) -> Path:
+        ckpt_path = self.get_ckpt_path(state)
+
+        if not is_master():
+            return ckpt_path
+
+        # Gets the path to the last checkpoint.
+        logger.info("Saving checkpoint to %s", ckpt_path)
+        last_ckpt_path = self.get_ckpt_path()
+        ckpt_path.parent.mkdir(exist_ok=True, parents=True)
+
+        # Potentially removes the last checkpoint.
+        if last_ckpt_path.exists() and self.config.only_save_most_recent:
+            if (base_ckpt := last_ckpt_path.resolve()).is_file():
+                base_ckpt.unlink()
+
+        # Combines all temporary files into a single checkpoint TAR file.
+        with tarfile.open(ckpt_path, "w:gz") as tar:
+
+            def add_file(name: str, write_fn: Callable[[io.BytesIO], Any]) -> None:
+                with io.BytesIO() as buf:
+                    write_fn(buf)
+                    tarinfo = tarfile.TarInfo(name)
+                    tarinfo.size = buf.tell()
+                    buf.seek(0)
+                    tar.addfile(tarinfo, buf)
+
+            add_file("model", lambda buf: cloudpickle.dump(model, buf))
+            add_file("opt", lambda buf: cloudpickle.dump(optimizer, buf))
+            add_file("opt_state", lambda buf: cloudpickle.dump(opt_state, buf))
+            add_file("state", lambda buf: buf.write(json.dumps(asdict(state), indent=2).encode()))
+            add_file("config", lambda buf: buf.write(OmegaConf.to_yaml(self.config).encode()))
+
+            if self.config.save_tf_model:
+                try:
+                    from jax.experimental import jax2tf
+                except ModuleNotFoundError:
+                    raise ImportError("Tensorflow is not installed. Install it with `pip install tensorflow`")
+
+                tf_model = jax2tf.convert(model)
+                add_file("model.tf", lambda buf: cloudpickle.dump(tf_model, buf))
+
+        # Updates the symlink to the new checkpoint.
+        last_ckpt_path.unlink(missing_ok=True)
+        try:
+            last_ckpt_path.symlink_to(ckpt_path.relative_to(last_ckpt_path.parent))
+        except FileExistsError:
+            logger.exception("Exception while trying to update %s", ckpt_path)
+
+        # Marks directory as having artifacts which shouldn't be overwritten.
+        self.add_lock_file("ckpt", exists_ok=True)
+
+        return ckpt_path
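Checkpoints written by save_checkpoint are ordinary gzipped TAR archives whose members are named model, opt, opt_state, state, and config, so they can be inspected without importing xax. A minimal sketch (the path is illustrative):

    import json
    import tarfile

    with tarfile.open("exp_dir/checkpoints/ckpt.bin", "r:gz") as tar:
        print(tar.getnames())  # ["model", "opt", "opt_state", "state", "config"]
        member = tar.extractfile("state")
        assert member is not None
        state = json.loads(member.read().decode())  # the trainer State, serialized as JSON
        print(state.get("num_steps"))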
xax/task/mixins/compile.py
ADDED
@@ -0,0 +1,104 @@
+"""Defines a mixin for handling JAX compilation behavior.
+
+This mixin allows control over JAX compilation settings like jit, pmap, and vmap
+behavior during initialization and training.
+"""
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generic, TypeVar
+
+import jax
+
+from xax.core.conf import field
+from xax.task.base import BaseConfig, BaseTask
+
+logger = logging.getLogger(__name__)
+
+
+@jax.tree_util.register_dataclass
+@dataclass
+class CompileOptions:
+    # JAX compilation options
+    disable_jit: bool = field(
+        value=False,
+        help="If True, disables JIT compilation",
+    )
+    enable_x64: bool = field(
+        value=False,
+        help="If True, enables 64-bit precision",
+    )
+    default_device: str | None = field(
+        value=None,
+        help="Default device to use (e.g. 'cpu', 'gpu')",
+    )
+
+    # JAX logging options
+    logging_level: str = field(
+        value="INFO",
+        help="JAX logging verbosity level",
+    )
+
+    # JAX cache options
+    cache_dir: str | None = field(
+        value=lambda: str((Path.home() / ".cache" / "jax" / "jaxcache").resolve()),
+        help="Directory for JAX compilation cache. If None, caching is disabled",
+    )
+    cache_min_size_bytes: int = field(
+        value=-1,
+        help="Minimum size in bytes for cache entries. -1 means no minimum",
+    )
+    cache_min_compile_time_secs: float = field(
+        value=0.0,
+        help="Minimum compilation time in seconds for cache entries. 0 means no minimum",
+    )
+    cache_enable_xla: str = field(
+        value="all",
+        help="Which XLA caches to enable",
+    )
+
+
+@jax.tree_util.register_dataclass
+@dataclass
+class CompileConfig(BaseConfig):
+    compile: CompileOptions = field(CompileOptions(), help="Compilation configuration")
+
+
+Config = TypeVar("Config", bound=CompileConfig)
+
+
+class CompileMixin(BaseTask[Config], Generic[Config]):
+    """Defines a task mixin for controlling JAX compilation behavior."""
+
+    def __init__(self, config: Config) -> None:
+        super().__init__(config)
+
+        cc = self.config.compile
+
+        # Set basic compilation flags
+        if cc.disable_jit:
+            logger.info("Disabling JIT compilation")
+            jax.config.update("jax_disable_jit", True)
+
+        if cc.enable_x64:
+            logger.info("Enabling 64-bit precision")
+            jax.config.update("jax_enable_x64", True)
+
+        if cc.default_device is not None:
+            logger.info("Setting default device to %s", cc.default_device)
+            jax.config.update("jax_default_device", cc.default_device)
+
+        # Set logging level
+        logger.info("Setting JAX logging level to %s", cc.logging_level)
+        jax.config.update("jax_logging_level", cc.logging_level)
+
+        # Configure compilation cache
+        if cc.cache_dir is not None:
+            logger.info("Setting JAX compilation cache directory to %s", cc.cache_dir)
+            jax.config.update("jax_compilation_cache_dir", cc.cache_dir)
+
+        logger.info("Configuring JAX compilation cache parameters")
+        jax.config.update("jax_persistent_cache_min_entry_size_bytes", cc.cache_min_size_bytes)
+        jax.config.update("jax_persistent_cache_min_compile_time_secs", cc.cache_min_compile_time_secs)
+        jax.config.update("jax_persistent_cache_enable_xla_caches", cc.cache_enable_xla)
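CompileMixin is a thin wrapper over jax.config.update: each CompileOptions field maps onto one JAX flag at task construction time. The equivalent manual calls for the cache options, using the defaults above (the cache directory shown is illustrative):

    import jax

    # What CompileMixin.__init__ does for the cache settings, spelled out:
    jax.config.update("jax_compilation_cache_dir", "/tmp/jaxcache")        # cc.cache_dir
    jax.config.update("jax_persistent_cache_min_entry_size_bytes", -1)    # cc.cache_min_size_bytes
    jax.config.update("jax_persistent_cache_min_compile_time_secs", 0.0)  # cc.cache_min_compile_time_secs
    jax.config.update("jax_persistent_cache_enable_xla_caches", "all")    # cc.cache_enable_xla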