xax 0.0.3-py3-none-any.whl → 0.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
+ """Defines a logger that calls a callback function with the log line."""
+
+ from typing import Callable
+
+ from omegaconf import DictConfig
+
+ from xax.task.logger import LogError, LogErrorSummary, LoggerImpl, LogLine, LogPing, LogStatus
+
+
+ class CallbackLogger(LoggerImpl):
+     def __init__(
+         self,
+         *,
+         callback: Callable[[LogLine], None] = lambda x: None,
+         error_summary_callback: Callable[[LogErrorSummary], None] = lambda x: None,
+         error_callback: Callable[[LogError], None] = lambda x: None,
+         status_callback: Callable[[LogStatus], None] = lambda x: None,
+         ping_callback: Callable[[LogPing], None] = lambda x: None,
+         git_state_callback: Callable[[str], None] = lambda x: None,
+         training_code_callback: Callable[[str], None] = lambda x: None,
+         config_callback: Callable[[DictConfig], None] = lambda x: None,
+     ) -> None:
+         super().__init__()
+
+         self.callback = callback
+         self.error_summary_callback = error_summary_callback
+         self.error_callback = error_callback
+         self.status_callback = status_callback
+         self.ping_callback = ping_callback
+         self.git_state_callback = git_state_callback
+         self.training_code_callback = training_code_callback
+         self.config_callback = config_callback
+
+     def write(self, line: LogLine) -> None:
+         self.callback(line)
+
+     def write_error_summary(self, error_summary: LogErrorSummary) -> None:
+         self.error_summary_callback(error_summary)
+
+     def write_error(self, error: LogError) -> None:
+         self.error_callback(error)
+
+     def write_status(self, status: LogStatus) -> None:
+         self.status_callback(status)
+
+     def write_ping(self, ping: LogPing) -> None:
+         self.ping_callback(ping)
+
+     def log_git_state(self, git_state: str) -> None:
+         self.git_state_callback(git_state)
+
+     def log_training_code(self, training_code: str) -> None:
+         self.training_code_callback(training_code)
+
+     def log_config(self, config: DictConfig) -> None:
+         self.config_callback(config)
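The new CallbackLogger is a thin LoggerImpl that forwards every log artifact (lines, errors, status and ping messages, git state, training code, and the resolved config) to user-supplied functions, each defaulting to a no-op. A minimal usage sketch, assuming the file is importable as xax.task.loggers.callback (the import path and surrounding wiring are assumptions, not shown in this diff):

    from xax.task.loggers.callback import CallbackLogger  # assumed module path

    # Forward log lines and status messages to plain print; all other callbacks stay no-ops.
    cb_logger = CallbackLogger(
        callback=lambda line: print(line),
        status_callback=lambda status: print("status:", status),
    )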
@@ -12,8 +12,6 @@ import time
  from pathlib import Path
  from typing import TypeVar

- import jax
- import PIL.Image
  from omegaconf import DictConfig, OmegaConf

  from xax.core.state import Phase
@@ -84,7 +82,7 @@ class TensorboardLogger(LoggerImpl):
          port = int(os.environ.get("TENSORBOARD_PORT", DEFAULT_TENSORBOARD_PORT))

          while port_is_busy(port):
-             logger.warning(f"Port {port} is busy, waiting...")
+             logger.warning("Port %s is busy, waiting...", port)
              time.sleep(10)

          def make_localhost(s: str) -> str:
@@ -205,10 +203,9 @@ class TensorboardLogger(LoggerImpl):

          for namespace, images in line.images.items():
              for image_key, image_value in images.items():
-                 image = PIL.Image.fromarray(jax.device_get(image_value.pixels))
                  writer.add_image(
                      f"{namespace}/{image_key}",
-                     image,
+                     image_value.image,
                      global_step=line.state.num_steps,
                      walltime=walltime,
                  )
@@ -1,6 +1,7 @@
  """Defines a single interface for all the mixins."""

  from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
+ from xax.task.mixins.checkpointing import CheckpointingConfig, CheckpointingMixin
  from xax.task.mixins.cpu_stats import CPUStatsConfig, CPUStatsMixin
  from xax.task.mixins.data_loader import DataloadersConfig, DataloadersMixin
  from xax.task.mixins.gpu_stats import GPUStatsConfig, GPUStatsMixin
@@ -8,4 +9,4 @@ from xax.task.mixins.logger import LoggerConfig, LoggerMixin
  from xax.task.mixins.process import ProcessConfig, ProcessMixin
  from xax.task.mixins.runnable import RunnableConfig, RunnableMixin
  from xax.task.mixins.step_wrapper import StepContextConfig, StepContextMixin
- from xax.task.mixins.train import Batch, Model, Output, TrainConfig, TrainMixin
+ from xax.task.mixins.train import TrainConfig, TrainMixin
@@ -47,6 +47,10 @@ class ArtifactsMixin(BaseTask[Config]):
          self._exp_dir = exp_dir
          return self

+     @property
+     def exp_dir(self) -> Path:
+         return self.get_exp_dir()
+
      def add_lock_file(self, lock_type: str, *, exists_ok: bool = False) -> None:
          if (lock_file := self.exp_dir / f".lock_{lock_type}").exists():
              if not exists_ok:
@@ -61,13 +65,16 @@ class ArtifactsMixin(BaseTask[Config]):
          elif not missing_ok:
              raise RuntimeError(f"Lock file not found at {lock_file}")

-     @functools.cached_property
-     def exp_dir(self) -> Path:
+     def get_exp_dir(self) -> Path:
+         if self._exp_dir is not None:
+             return self._exp_dir
+
          if self.config.exp_dir is not None:
              exp_dir = Path(self.config.exp_dir).expanduser().resolve()
              exp_dir.mkdir(parents=True, exist_ok=True)
-             logger.log(LOG_STATUS, self.exp_dir)
-             return exp_dir
+             self._exp_dir = exp_dir
+             logger.log(LOG_STATUS, self._exp_dir)
+             return self._exp_dir

          def get_exp_dir(run_id: int) -> Path:
              return self.run_dir / f"run_{run_id}"
@@ -81,9 +88,9 @@ class ArtifactsMixin(BaseTask[Config]):
          while (exp_dir := get_exp_dir(run_id)).is_dir() and has_lock_file(exp_dir):
              run_id += 1
          exp_dir.mkdir(exist_ok=True, parents=True)
-         exp_dir = exp_dir.expanduser().resolve()
-         logger.log(LOG_STATUS, exp_dir)
-         return exp_dir
+         self._exp_dir = exp_dir.expanduser().resolve()
+         logger.log(LOG_STATUS, self._exp_dir)
+         return self._exp_dir

      @functools.lru_cache(maxsize=None)
      def stage_environment(self) -> Path | None:
@@ -0,0 +1,209 @@
+ """Defines a mixin for handling model checkpointing."""
+
+ import io
+ import json
+ import logging
+ import tarfile
+ from dataclasses import asdict, dataclass
+ from pathlib import Path
+ from typing import Any, Callable, Generic, Literal, TypeVar, cast, overload
+
+ import cloudpickle
+ import optax
+ from jaxtyping import PyTree
+ from omegaconf import DictConfig, OmegaConf
+
+ from xax.core.conf import field
+ from xax.core.state import State
+ from xax.nn.parallel import is_master
+ from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
+
+ logger = logging.getLogger(__name__)
+
+ CheckpointPart = Literal["model", "opt", "opt_state", "state", "config"]
+
+
+ def get_ckpt_path(exp_dir: Path, state: State | None = None) -> Path:
+     """Defines the path to the checkpoint for a given state.
+
+     Args:
+         exp_dir: The experiment directory
+         state: The current trainer state
+
+     Returns:
+         The path to the checkpoint file.
+     """
+     if state is None:
+         return exp_dir / "checkpoints" / "ckpt.bin"
+     return exp_dir / "checkpoints" / f"ckpt.{state.num_steps}.bin"
+
+
+ @dataclass
+ class CheckpointingConfig(ArtifactsConfig):
+     save_every_n_steps: int | None = field(None, help="Save a checkpoint every N steps")
+     save_every_n_seconds: float | None = field(60.0 * 60.0, help="Save a checkpoint every N seconds")
+     only_save_most_recent: bool = field(True, help="Only keep the most recent checkpoint")
+     load_from_ckpt_path: str | None = field(None, help="If set, load initial model weights from this path")
+     load_ckpt_strict: bool = field(True, help="If set, only load weights for which have a matching key in the model")
+
+
+ Config = TypeVar("Config", bound=CheckpointingConfig)
+
+
+ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
+     def __init__(self, config: Config) -> None:
+         super().__init__(config)
+
+         self.__last_ckpt_time = 0.0
+
+     def get_ckpt_path(self, state: State | None = None) -> Path:
+         return get_ckpt_path(self.exp_dir, state)
+
+     def get_init_ckpt_path(self) -> Path | None:
+         if self._exp_dir is not None:
+             ckpt_path = self.get_ckpt_path()
+             if ckpt_path.exists():
+                 return ckpt_path
+         if self.config.load_from_ckpt_path is not None:
+             ckpt_path = Path(self.config.load_from_ckpt_path)
+             assert ckpt_path.exists(), f"Checkpoint path {ckpt_path} does not exist."
+             return ckpt_path
+         return None
+
+     def should_checkpoint(self, state: State) -> bool:
+         if self.config.save_every_n_steps is not None:
+             if state.num_steps % self.config.save_every_n_steps == 0:
+                 return True
+         if self.config.save_every_n_seconds is not None:
+             last_time, cur_time = self.__last_ckpt_time, state.elapsed_time_s
+             if cur_time - last_time >= self.config.save_every_n_seconds:
+                 self.__last_ckpt_time = cur_time
+                 return True
+         return False
+
+     @overload
+     def load_checkpoint(
+         self,
+         path: Path,
+     ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]: ...
+
+     @overload
+     def load_checkpoint(self, path: Path, part: Literal["model"]) -> PyTree: ...
+
+     @overload
+     def load_checkpoint(self, path: Path, part: Literal["opt"]) -> optax.GradientTransformation: ...
+
+     @overload
+     def load_checkpoint(self, path: Path, part: Literal["opt_state"]) -> optax.OptState: ...
+
+     @overload
+     def load_checkpoint(self, path: Path, part: Literal["state"]) -> State: ...
+
+     @overload
+     def load_checkpoint(self, path: Path, part: Literal["config"]) -> DictConfig: ...
+
+     def load_checkpoint(
+         self,
+         path: Path,
+         part: CheckpointPart | None = None,
+     ) -> (
+         tuple[PyTree, optax.GradientTransformation, optax.OptState, State, DictConfig]
+         | PyTree
+         | optax.GradientTransformation
+         | optax.OptState
+         | State
+         | DictConfig
+     ):
+         with tarfile.open(path, "r:gz") as tar:
+
+             def get_model() -> PyTree:
+                 if (model := tar.extractfile("model")) is None:
+                     raise ValueError(f"Checkpoint does not contain a model file: {path}")
+                 return cloudpickle.load(model)
+
+             def get_opt() -> optax.GradientTransformation:
+                 if (opt := tar.extractfile("opt")) is None:
+                     raise ValueError(f"Checkpoint does not contain an opt file: {path}")
+                 return cloudpickle.load(opt)
+
+             def get_opt_state() -> optax.OptState:
+                 if (opt_state := tar.extractfile("opt_state")) is None:
+                     raise ValueError(f"Checkpoint does not contain an opt_state file: {path}")
+                 return cloudpickle.load(opt_state)
+
+             def get_state() -> State:
+                 if (state := tar.extractfile("state")) is None:
+                     raise ValueError(f"Checkpoint does not contain a state file: {path}")
+                 return State(**json.loads(state.read().decode()))
+
+             def get_config() -> DictConfig:
+                 if (config := tar.extractfile("config")) is None:
+                     raise ValueError(f"Checkpoint does not contain a config file: {path}")
+                 return cast(DictConfig, OmegaConf.load(config))
+
+             match part:
+                 case "model":
+                     return get_model()
+                 case "opt":
+                     return get_opt()
+                 case "opt_state":
+                     return get_opt_state()
+                 case "state":
+                     return get_state()
+                 case "config":
+                     return get_config()
+                 case None:
+                     return get_model(), get_opt(), get_opt_state(), get_state(), get_config()
+                 case _:
+                     raise ValueError(f"Invalid checkpoint part: {part}")
+
+     def save_checkpoint(
+         self,
+         model: PyTree,
+         optimizer: optax.GradientTransformation,
+         opt_state: optax.OptState,
+         state: State,
+     ) -> Path:
+         ckpt_path = self.get_ckpt_path(state)
+
+         if not is_master():
+             return ckpt_path
+
+         # Gets the path to the last checkpoint.
+         logger.info("Saving checkpoint to %s", ckpt_path)
+         last_ckpt_path = self.get_ckpt_path()
+         ckpt_path.parent.mkdir(exist_ok=True, parents=True)
+
+         # Potentially removes the last checkpoint.
+         if last_ckpt_path.exists() and self.config.only_save_most_recent:
+             if (base_ckpt := last_ckpt_path.resolve()).is_file():
+                 base_ckpt.unlink()
+
+         # Combines all temporary files into a single checkpoint TAR file.
+         with tarfile.open(ckpt_path, "w:gz") as tar:
+
+             def add_file(name: str, write_fn: Callable[[io.BytesIO], Any]) -> None:
+                 with io.BytesIO() as buf:
+                     write_fn(buf)
+                     tarinfo = tarfile.TarInfo(name)
+                     tarinfo.size = buf.tell()
+                     buf.seek(0)
+                     tar.addfile(tarinfo, buf)
+
+             add_file("model", lambda buf: cloudpickle.dump(model, buf))
+             add_file("opt", lambda buf: cloudpickle.dump(optimizer, buf))
+             add_file("opt_state", lambda buf: cloudpickle.dump(opt_state, buf))
+             add_file("state", lambda buf: buf.write(json.dumps(asdict(state), indent=2).encode()))
+             add_file("config", lambda buf: buf.write(OmegaConf.to_yaml(self.config).encode()))
+
+         # Updates the symlink to the new checkpoint.
+         last_ckpt_path.unlink(missing_ok=True)
+         try:
+             last_ckpt_path.symlink_to(ckpt_path.relative_to(last_ckpt_path.parent))
+         except FileExistsError:
+             logger.exception("Exception while trying to update %s", ckpt_path)
+
+         # Marks directory as having artifacts which shouldn't be overwritten.
+         self.add_lock_file("ckpt", exists_ok=True)
+
+         return ckpt_path
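In short, the new CheckpointingMixin stores each checkpoint as a gzipped tar archive with five members (model, opt, opt_state, state, config) under exp_dir/checkpoints/, and keeps ckpt.bin as a symlink to the most recent archive. A hedged sketch of how a training loop might drive it (task, model, optimizer, opt_state, and state are placeholders, not defined in this diff):

    # Save whenever the step/interval policy in the config says so.
    if task.should_checkpoint(state):
        ckpt_path = task.save_checkpoint(model, optimizer, opt_state, state)

    # Restore all five parts at once...
    model, optimizer, opt_state, state, config = task.load_checkpoint(ckpt_path)
    # ...or just one part via the overloads.
    model_only = task.load_checkpoint(ckpt_path, "model")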
@@ -237,15 +237,15 @@ class CPUStatsMixin(ProcessMixin[Config], LoggerMixin[Config], Generic[Config]):
          stats = monitor.get_if_set() if self.config.cpu_stats.only_log_once else monitor.get()

          if stats is not None:
-             self.log_scalar("child_procs", stats.num_child_procs, namespace="🔧 cpu")
-             self.log_scalar("percent", stats.cpu_percent, namespace="🔧 cpu")
-             self.log_scalar("child_percent", stats.child_cpu_percent, namespace="🔧 cpu")
-             self.log_scalar("percent", stats.mem_percent, namespace="🔧 mem")
-             self.log_scalar("shared", stats.mem_shared, namespace="🔧 mem")
-             self.log_scalar("child_percent", stats.child_mem_percent, namespace="🔧 mem")
-             self.log_scalar("rss/cur", stats.mem_rss, namespace="🔧 mem")
-             self.log_scalar("rss/total", stats.mem_rss_total, namespace="🔧 mem")
-             self.log_scalar("vms/cur", stats.mem_vms, namespace="🔧 mem")
-             self.log_scalar("vms/total", stats.mem_vms_total, namespace="🔧 mem")
+             self.logger.log_scalar("child_procs", stats.num_child_procs, namespace="🔧 cpu")
+             self.logger.log_scalar("percent", stats.cpu_percent, namespace="🔧 cpu")
+             self.logger.log_scalar("child_percent", stats.child_cpu_percent, namespace="🔧 cpu")
+             self.logger.log_scalar("percent", stats.mem_percent, namespace="🔧 mem")
+             self.logger.log_scalar("shared", stats.mem_shared, namespace="🔧 mem")
+             self.logger.log_scalar("child_percent", stats.child_mem_percent, namespace="🔧 mem")
+             self.logger.log_scalar("rss/cur", stats.mem_rss, namespace="🔧 mem")
+             self.logger.log_scalar("rss/total", stats.mem_rss_total, namespace="🔧 mem")
+             self.logger.log_scalar("vms/cur", stats.mem_vms, namespace="🔧 mem")
+             self.logger.log_scalar("vms/total", stats.mem_vms_total, namespace="🔧 mem")

          return state
@@ -38,7 +38,6 @@ class DataloaderErrorConfig:

  @dataclass
  class DataloaderConfig:
-     batch_size: int = field(MISSING, help="Size of each batch")
      num_workers: int | None = field(MISSING, help="Number of workers for loading samples")
      prefetch_factor: int = field(2, help="Number of items to pre-fetch on each worker")
      error: DataloaderErrorConfig = field(DataloaderErrorConfig(), help="Dataloader error configuration")
@@ -49,11 +48,11 @@ class DataloadersConfig(ProcessConfig, BaseConfig):
      batch_size: int = field(MISSING, help="Size of each batch")
      raise_dataloader_errors: bool = field(False, help="If set, raise dataloader errors inside the worker processes")
      train_dl: DataloaderConfig = field(
-         DataloaderConfig(batch_size=II("batch_size")),
+         DataloaderConfig(num_workers=II("mlfab.num_workers:-1")),
          help="Train dataloader config",
      )
      valid_dl: DataloaderConfig = field(
-         DataloaderConfig(batch_size=II("batch_size"), num_workers=1),
+         DataloaderConfig(num_workers=1),
          help="Valid dataloader config",
      )
      debug_dataloader: bool = field(False, help="Debug dataloaders")
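With this change, batch_size lives only on the top-level DataloadersConfig; the per-dataloader batch_size field and its II("batch_size") interpolation are gone, and, as the later hunks show, the dataloaders read self.config.batch_size directly. A hedged sketch of the resulting config shape, with illustrative values only and other fields omitted:

    cfg = DataloadersConfig(
        batch_size=32,  # set once, shared by the train and valid dataloaders
        train_dl=DataloaderConfig(num_workers=4),
        valid_dl=DataloaderConfig(num_workers=1),
    )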
@@ -64,9 +63,7 @@ Config = TypeVar("Config", bound=DataloadersConfig)

  class DataloadersMixin(ProcessMixin[Config], BaseTask[Config], Generic[Config], ABC):
      def __init__(self, config: Config) -> None:
-         if is_missing(config, "batch_size") and (
-             is_missing(config.train_dl, "batch_size") or is_missing(config.valid_dl, "batch_size")
-         ):
+         if is_missing(config, "batch_size"):
              config.batch_size = self.get_batch_size()

          super().__init__(config)
@@ -120,10 +117,10 @@ class DataloadersMixin(ProcessMixin[Config], BaseTask[Config], Generic[Config],

          return Dataloader(
              dataset=dataset,
-             batch_size=cfg.batch_size,
+             batch_size=self.config.batch_size,
              num_workers=0 if debugging else cfg.num_workers,
              prefetch_factor=cfg.prefetch_factor,
-             ctx=self.multiprocessing_context,
+             mp_manager=self.multiprocessing_manager,
              dataloader_worker_init_fn=self.dataloader_worker_init_fn,
              collate_worker_init_fn=self.collate_worker_init_fn,
              item_callback=self.dataloader_item_callback,
@@ -135,7 +132,7 @@ class DataloadersMixin(ProcessMixin[Config], BaseTask[Config], Generic[Config],

      @classmethod
      def to_device_fn(cls, sample: T) -> T:
-         return recursive_apply(sample, jax.device_put)
+         return recursive_apply(sample, jax.device_put, include_numpy=True)

      @classmethod
      def dataloader_worker_init_fn(cls, worker_id: int, num_workers: int) -> None:
@@ -250,8 +250,8 @@ class GPUStatsMixin(ProcessMixin[Config], LoggerMixin[Config], Generic[Config]):
          for gpu_stat in stats.values():
              if gpu_stat is None:
                  continue
-             self.log_scalar(f"mem/{gpu_stat.index}", gpu_stat.memory_used, namespace="🔧 gpu")
-             self.log_scalar(f"temp/{gpu_stat.index}", gpu_stat.temperature, namespace="🔧 gpu")
-             self.log_scalar(f"util/{gpu_stat.index}", gpu_stat.utilization, namespace="🔧 gpu")
+             self.logger.log_scalar(f"mem/{gpu_stat.index}", gpu_stat.memory_used, namespace="🔧 gpu")
+             self.logger.log_scalar(f"temp/{gpu_stat.index}", gpu_stat.temperature, namespace="🔧 gpu")
+             self.logger.log_scalar(f"util/{gpu_stat.index}", gpu_stat.utilization, namespace="🔧 gpu")

          return state