xax 0.1.15__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +1 -1
- xax/core/state.py +26 -1
- xax/requirements.txt +5 -5
- xax/task/base.py +1 -1
- xax/task/logger.py +149 -12
- xax/task/loggers/json.py +12 -4
- xax/task/loggers/stdout.py +21 -16
- xax/task/loggers/tensorboard.py +18 -2
- xax/task/mixins/checkpointing.py +118 -41
- xax/task/mixins/cpu_stats.py +10 -10
- xax/task/mixins/data_loader.py +2 -1
- xax/task/mixins/gpu_stats.py +3 -3
- xax/task/mixins/train.py +59 -29
- xax/utils/experiments.py +34 -30
- xax/utils/tensorboard.py +91 -3
- {xax-0.1.15.dist-info → xax-0.2.0.dist-info}/METADATA +6 -6
- {xax-0.1.15.dist-info → xax-0.2.0.dist-info}/RECORD +20 -20
- {xax-0.1.15.dist-info → xax-0.2.0.dist-info}/WHEEL +0 -0
- {xax-0.1.15.dist-info → xax-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {xax-0.1.15.dist-info → xax-0.2.0.dist-info}/top_level.txt +0 -0
xax/task/mixins/checkpointing.py
CHANGED

@@ -6,9 +6,9 @@ import logging
 import tarfile
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import
+from typing import Generic, Literal, TypeVar, cast, overload

-import
+import equinox as eqx
 import jax
 import optax
 from jaxtyping import PyTree
@@ -64,7 +64,9 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def get_init_ckpt_path(self) -> Path | None:
         if self._exp_dir is not None:
             ckpt_path = self.get_ckpt_path()
-            if ckpt_path.exists():
+            if not ckpt_path.exists():
+                logger.warning("No checkpoint found in experiment directory: %s", ckpt_path)
+            else:
                 return ckpt_path
         if self.config.load_from_ckpt_path is not None:
             ckpt_path = Path(self.config.load_from_ckpt_path)
@@ -87,41 +89,54 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
-
-
+        *,
+        part: Literal["all"],
+        model_template: PyTree,
+        optimizer_template: PyTree,
+        opt_state_template: PyTree,
+    ) -> tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]: ...

     @overload
     def load_checkpoint(
         self,
         path: Path,
-
-
+        *,
+        part: Literal["model_state_config"],
+        model_template: PyTree,
+    ) -> tuple[PyTree, State, Config]: ...

     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["model"],
+        model_template: PyTree,
     ) -> PyTree: ...

     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["opt"],
+        optimizer_template: PyTree,
     ) -> optax.GradientTransformation: ...

     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["opt_state"],
+        opt_state_template: PyTree,
     ) -> optax.OptState: ...

     @overload
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["state"],
     ) -> State: ...

@@ -129,48 +144,71 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):
     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: Literal["config"],
-    ) ->
+    ) -> Config: ...

     def load_checkpoint(
         self,
         path: Path,
+        *,
         part: CheckpointPart = "all",
+        model_template: PyTree | None = None,
+        optimizer_template: PyTree | None = None,
+        opt_state_template: PyTree | None = None,
     ) -> (
-        tuple[PyTree, optax.GradientTransformation, optax.OptState, State,
-        | tuple[PyTree, State,
+        tuple[PyTree, optax.GradientTransformation, optax.OptState, State, Config]
+        | tuple[PyTree, State, Config]
         | PyTree
         | optax.GradientTransformation
         | optax.OptState
         | State
-        |
+        | Config
     ):
+        """Load a checkpoint.
+
+        Args:
+            path: Path to the checkpoint directory
+            part: Which part of the checkpoint to load
+            model_template: Template model with correct structure but uninitialized weights
+            optimizer_template: Template optimizer with correct structure but uninitialized weights
+            opt_state_template: Template optimizer state with correct structure but uninitialized weights
+
+        Returns:
+            The requested checkpoint components
+        """
         with tarfile.open(path, "r:gz") as tar:

             def get_model() -> PyTree:
+                if model_template is None:
+                    raise ValueError("model_template must be provided to load model weights")
                 if (model := tar.extractfile("model")) is None:
                     raise ValueError(f"Checkpoint does not contain a model file: {path}")
-                return
+                return eqx.tree_deserialise_leaves(io.BytesIO(model.read()), model_template)

             def get_opt() -> optax.GradientTransformation:
-                if
-                    raise ValueError(
-
+                if optimizer_template is None:
+                    raise ValueError("optimizer_template must be provided to load optimizer")
+                if (opt := tar.extractfile("optimizer")) is None:
+                    raise ValueError(f"Checkpoint does not contain an optimizer file: {path}")
+                return eqx.tree_deserialise_leaves(io.BytesIO(opt.read()), optimizer_template)

             def get_opt_state() -> optax.OptState:
+                if opt_state_template is None:
+                    raise ValueError("opt_state_template must be provided to load optimizer state")
                 if (opt_state := tar.extractfile("opt_state")) is None:
-                    raise ValueError(f"Checkpoint does not contain an
-                return
+                    raise ValueError(f"Checkpoint does not contain an optimizer state file: {path}")
+                return eqx.tree_deserialise_leaves(io.BytesIO(opt_state.read()), opt_state_template)

             def get_state() -> State:
                 if (state := tar.extractfile("state")) is None:
                     raise ValueError(f"Checkpoint does not contain a state file: {path}")
                 return State(**json.loads(state.read().decode()))

-            def get_config() ->
+            def get_config() -> Config:
                 if (config := tar.extractfile("config")) is None:
                     raise ValueError(f"Checkpoint does not contain a config file: {path}")
-                return cast(DictConfig, OmegaConf.load(config))
+                return self.get_config(cast(DictConfig, OmegaConf.load(config)), use_cli=False)

             match part:
                 case "model":
@@ -192,51 +230,90 @@ class CheckpointingMixin(ArtifactsMixin[Config], Generic[Config]):

     def save_checkpoint(
         self,
-        model: PyTree,
-        optimizer: optax.GradientTransformation,
-        opt_state: optax.OptState,
-
+        model: PyTree | None = None,
+        optimizer: optax.GradientTransformation | None = None,
+        opt_state: optax.OptState | None = None,
+        aux_data: PyTree | None = None,
+        state: State | None = None,
     ) -> Path:
+        """Save a checkpoint.
+
+        Args:
+            model: The model to save
+            state: The current training state
+            optimizer: The optimizer to save
+            aux_data: Additional data to save
+            opt_state: The optimizer state to save
+
+        Returns:
+            Path to the saved checkpoint
+        """
         ckpt_path = self.get_ckpt_path(state)

         if not is_master():
             return ckpt_path

-        # Gets the path to the last checkpoint
+        # Gets the path to the last checkpoint
         logger.info("Saving checkpoint to %s", ckpt_path)
         last_ckpt_path = self.get_ckpt_path()
         ckpt_path.parent.mkdir(exist_ok=True, parents=True)

-        # Potentially removes the last checkpoint
+        # Potentially removes the last checkpoint
         if last_ckpt_path.exists() and self.config.only_save_most_recent:
             if (base_ckpt := last_ckpt_path.resolve()).is_file():
                 base_ckpt.unlink()

-        #
+        # Save the checkpoint components
         with tarfile.open(ckpt_path, "w:gz") as tar:

-            def add_file(name: str,
+            def add_file(name: str, buf: io.BytesIO) -> None:
+                tarinfo = tarfile.TarInfo(name)
+                tarinfo.size = buf.tell()
+                buf.seek(0)
+                tar.addfile(tarinfo, buf)
+
+            # Save model using Equinox
+            if model is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, model)
+                    add_file("model", buf)
+
+            # Save optimizer using Equinox
+            if optimizer is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, optimizer)
+                    add_file("optimizer", buf)
+
+            # Save optimizer state using Equinox
+            if opt_state is not None:
                 with io.BytesIO() as buf:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    eqx.tree_serialise_leaves(buf, opt_state)
+                    add_file("opt_state", buf)
+
+            # Save aux data using Equinox.
+            if aux_data is not None:
+                with io.BytesIO() as buf:
+                    eqx.tree_serialise_leaves(buf, aux_data)
+                    add_file("aux_data", buf)
+
+            # Save state and config as JSON
+            def add_file_bytes(name: str, data: bytes) -> None:  # noqa: ANN401
+                info = tarfile.TarInfo(name=name)
+                info.size = len(data)
+                tar.addfile(info, io.BytesIO(data))
+
+            if state is not None:
+                add_file_bytes("state", json.dumps(asdict(state), indent=2).encode())
+            add_file_bytes("config", OmegaConf.to_yaml(self.config).encode())
+
+        # Updates the symlink to the new checkpoint
         last_ckpt_path.unlink(missing_ok=True)
         try:
             last_ckpt_path.symlink_to(ckpt_path.relative_to(last_ckpt_path.parent))
         except FileExistsError:
             logger.exception("Exception while trying to update %s", ckpt_path)

-        # Calls the base callback
+        # Calls the base callback
         self.on_after_checkpoint_save(ckpt_path, state)

         return ckpt_path
xax/task/mixins/cpu_stats.py
CHANGED

@@ -248,15 +248,15 @@ class CPUStatsMixin(ProcessMixin[Config], LoggerMixin[Config], Generic[Config]):
         stats = monitor.get_if_set() if self.config.cpu_stats.only_log_once else monitor.get()

         if stats is not None:
-            self.logger.log_scalar("child_procs", stats.num_child_procs, namespace="🔧 cpu")
-            self.logger.log_scalar("percent", stats.cpu_percent, namespace="🔧 cpu")
-            self.logger.log_scalar("child_percent", stats.child_cpu_percent, namespace="🔧 cpu")
-            self.logger.log_scalar("percent", stats.mem_percent, namespace="🔧 mem")
-            self.logger.log_scalar("shared", stats.mem_shared, namespace="🔧 mem")
-            self.logger.log_scalar("child_percent", stats.child_mem_percent, namespace="🔧 mem")
-            self.logger.log_scalar("rss/cur", stats.mem_rss, namespace="🔧 mem")
-            self.logger.log_scalar("rss/total", stats.mem_rss_total, namespace="🔧 mem")
-            self.logger.log_scalar("vms/cur", stats.mem_vms, namespace="🔧 mem")
-            self.logger.log_scalar("vms/total", stats.mem_vms_total, namespace="🔧 mem")
+            self.logger.log_scalar("child_procs", stats.num_child_procs, namespace="🔧 cpu", secondary=True)
+            self.logger.log_scalar("percent", stats.cpu_percent, namespace="🔧 cpu", secondary=True)
+            self.logger.log_scalar("child_percent", stats.child_cpu_percent, namespace="🔧 cpu", secondary=True)
+            self.logger.log_scalar("percent", stats.mem_percent, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("shared", stats.mem_shared, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("child_percent", stats.child_mem_percent, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("rss/cur", stats.mem_rss, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("rss/total", stats.mem_rss_total, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("vms/cur", stats.mem_vms, namespace="🔧 mem", secondary=True)
+            self.logger.log_scalar("vms/total", stats.mem_vms_total, namespace="🔧 mem", secondary=True)

         return state
xax/task/mixins/data_loader.py
CHANGED

@@ -9,6 +9,7 @@ import jax
 from dpshdl.dataloader import CollatedDataloaderItem, Dataloader
 from dpshdl.dataset import Dataset, ErrorHandlingDataset
 from dpshdl.prefetcher import Prefetcher
+from jaxtyping import PRNGKeyArray
 from omegaconf import II, MISSING

 from xax.core.conf import field, is_missing
@@ -103,7 +104,7 @@ class DataloadersMixin(ProcessMixin[Config], BaseTask[Config], Generic[Config],
             "or `get_data_iterator` to return an iterator for the given dataset."
         )

-    def get_data_iterator(self, phase: Phase) -> Iterator:
+    def get_data_iterator(self, phase: Phase, key: PRNGKeyArray) -> Iterator:
         raise NotImplementedError(
             "You must implement either the `get_dataset` method to return the dataset for the given phase, "
             "or `get_data_iterator` to return an iterator for the given dataset."
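
Because `get_data_iterator` now receives a PRNG key, custom iterators can derive shuffling or augmentation randomness from the task's key instead of global state. A minimal sketch of an override under the new signature; `MyTask`, its base class, and the `sample_batch` helper are hypothetical:

```python
from typing import Iterator

import jax
from jaxtyping import PRNGKeyArray

from xax.core.state import Phase


class MyTask(MyDataloaderTaskBase):  # hypothetical task built on DataloadersMixin
    def get_data_iterator(self, phase: Phase, key: PRNGKeyArray) -> Iterator:
        while True:
            key, subkey = jax.random.split(key)
            # sample_batch is a hypothetical helper that draws one batch for the phase.
            yield self.sample_batch(phase, subkey)
```

The train mixin below splits its own key into separate train and validation keys before opening these iterators, so the two streams stay independently reproducible.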
xax/task/mixins/gpu_stats.py
CHANGED

@@ -264,8 +264,8 @@ class GPUStatsMixin(ProcessMixin[Config], LoggerMixin[Config], Generic[Config]):
         for gpu_stat in stats.values():
             if gpu_stat is None:
                 continue
-            self.logger.log_scalar(f"mem/{gpu_stat.index}", gpu_stat.memory_used, namespace="🔧 gpu")
-            self.logger.log_scalar(f"temp/{gpu_stat.index}", gpu_stat.temperature, namespace="🔧 gpu")
-            self.logger.log_scalar(f"util/{gpu_stat.index}", gpu_stat.utilization, namespace="🔧 gpu")
+            self.logger.log_scalar(f"mem/{gpu_stat.index}", gpu_stat.memory_used, namespace="🔧 gpu", secondary=True)
+            self.logger.log_scalar(f"temp/{gpu_stat.index}", gpu_stat.temperature, namespace="🔧 gpu", secondary=True)
+            self.logger.log_scalar(f"util/{gpu_stat.index}", gpu_stat.utilization, namespace="🔧 gpu", secondary=True)

         return state
xax/task/mixins/train.py
CHANGED

@@ -11,7 +11,7 @@ import textwrap
 import time
 import traceback
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, is_dataclass
+from dataclasses import asdict, dataclass, is_dataclass
 from threading import Thread
 from typing import (
     Any,
@@ -33,7 +33,6 @@ import jax.numpy as jnp
 import numpy as np
 import optax
 from jaxtyping import Array, PRNGKeyArray, PyTree
-from omegaconf import DictConfig

 from xax.core.conf import field
 from xax.core.state import Phase, State
@@ -50,6 +49,7 @@ from xax.utils.experiments import (
     TrainingFinishedError,
     diff_configs,
     get_diff_string,
+    get_info_json,
     get_state_file_string,
     get_training_code,
 )
@@ -218,7 +218,12 @@ class TrainMixin(
         return state.replace(elapsed_time_s=time.time() - state.start_time_s)

     def log_train_step(
-        self,
+        self,
+        model: PyTree,
+        batch: Batch,
+        output: Output,
+        metrics: FrozenDict[str, Array],
+        state: State,
     ) -> None:
         """Override this function to do logging during the training phase.

@@ -234,7 +239,12 @@ class TrainMixin(
         """

     def log_valid_step(
-        self,
+        self,
+        model: PyTree,
+        batch: Batch,
+        output: Output,
+        metrics: FrozenDict[str, Array],
+        state: State,
     ) -> None:
         """Override this function to do logging during the validation phase.

@@ -252,12 +262,20 @@ class TrainMixin(
     def log_state_timers(self, state: State) -> None:
         timer = self.state_timers[state.phase]
         timer.step(state)
-        for
-
-
+        for k, v in timer.log_dict().items():
+            if isinstance(v, tuple):
+                v, secondary = v
+            else:
+                secondary = False
+            self.logger.log_scalar(k, v, namespace="⌛ timers", secondary=secondary)

     def log_step(
-        self,
+        self,
+        model: PyTree,
+        batch: Batch,
+        output: Output,
+        metrics: FrozenDict[str, Array],
+        state: State,
     ) -> None:
         phase = state.phase

@@ -322,20 +340,30 @@ class TrainMixin(

         if init_ckpt_path is not None:
             logger.info("Loading checkpoint from %s", init_ckpt_path)
-
-
-
-
-
-
+            model_spec = eqx.filter_eval_shape(self.get_model, key)
+            model, state, config = self.load_checkpoint(
+                init_ckpt_path,
+                part="model_state_config",
+                model_template=model_spec,
+            )
+            config_diff = get_diff_string(diff_configs(asdict(config), asdict(self.config)))
+            if config_diff:
+                logger.warning("Loaded config differs from current config:\n%s", config_diff)

-
-            model, state, config = self.load_checkpoint(init_ckpt_path, "model_state_config")
-            config_diff = get_diff_string(diff_configs(config, cast(DictConfig, self.config)))
-            if config_diff:
-                logger.warning("Loaded config differs from current config:\n%s", config_diff)
+            if not load_optimizer:
                 return model, state

+            # Loads the optimizer.
+            optimizer_spec = eqx.filter_eval_shape(self.get_optimizer)
+            optimizer = self.load_checkpoint(init_ckpt_path, part="opt", optimizer_template=optimizer_spec)
+
+            # Loads the optimizer state.
+            opt_state_spec = eqx.filter_eval_shape(self.get_initial_opt_state, model, optimizer)
+            opt_state = self.load_checkpoint(init_ckpt_path, part="opt_state", opt_state_template=opt_state_spec)
+
+            return model, optimizer, opt_state, state
+
+        logger.info("No checkpoint found. Initializing a new model.")
         model = self.get_model(key)
         state = State.init_state()

@@ -536,6 +564,7 @@ class TrainMixin(
         self.logger.log_file("state.txt", get_state_file_string(self))
         self.logger.log_file("training_code.py", get_training_code(self))
         self.logger.log_file("config.yaml", self.config_str(self.config, use_cli=False))
+        self.logger.log_file("info.json", get_info_json())

     def model_partition_fn(self, item: Any) -> bool:  # noqa: ANN401
         return eqx.is_inexact_array(item)
@@ -609,16 +638,16 @@ class TrainMixin(

         if self.should_checkpoint(state):
             model = eqx.combine(model_arr, model_static)
-            self.save_checkpoint(model, optimizer, opt_state, state)
+            self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)

         # After finishing training, save the final checkpoint.
         model = eqx.combine(model_arr, model_static)
-        self.save_checkpoint(model, optimizer, opt_state, state)
+        self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)

     @contextlib.contextmanager
-    def get_train_iterator(self) -> Generator[Iterator[Batch], None, None]:
+    def get_train_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
         try:
-            train_iterator: Iterator[Batch] = self.get_data_iterator("train")
+            train_iterator: Iterator[Batch] = self.get_data_iterator("train", key=key)
             yield train_iterator
             return
         except NotImplementedError:
@@ -635,9 +664,9 @@ class TrainMixin(
             logger.info("Closing train prefetcher")

     @contextlib.contextmanager
-    def get_valid_iterator(self) -> Generator[Iterator[Batch], None, None]:
+    def get_valid_iterator(self, key: PRNGKeyArray) -> Generator[Iterator[Batch], None, None]:
         try:
-            valid_iterator: Iterator[Batch] = self.get_data_iterator("valid")
+            valid_iterator: Iterator[Batch] = self.get_data_iterator("valid", key=key)
             yield valid_iterator
             return
         except NotImplementedError:
@@ -681,12 +710,13 @@ class TrainMixin(
         state = self.on_training_start(state)

         def on_exit() -> None:
-            self.save_checkpoint(model, optimizer, opt_state, state)
+            self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)

         # Handle user-defined interrupts during the training loop.
         self.add_signal_handler(on_exit, signal.SIGUSR1, signal.SIGTERM)

-
+        key, tkey, vkey = jax.random.split(key, 3)
+        with self.get_train_iterator(tkey) as train_pf, self.get_valid_iterator(vkey) as valid_pf:
             try:
                 self.train_loop(
                     model=model,
@@ -703,7 +733,7 @@ class TrainMixin(
                         f"Finished training after {state.num_steps} steps, {state.num_samples} samples",
                         important=True,
                     )
-                self.save_checkpoint(model, optimizer, opt_state, state)
+                self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)

             except (KeyboardInterrupt, bdb.BdbQuit):
                 if is_master():
@@ -713,7 +743,7 @@ class TrainMixin(
                 exception_tb = textwrap.indent(highlight_exception_message(traceback.format_exc()), " ")
                 sys.stdout.write(f"Caught exception during training loop:\n\n{exception_tb}\n")
                 sys.stdout.flush()
-                self.save_checkpoint(model, optimizer, opt_state, state)
+                self.save_checkpoint(model=model, optimizer=optimizer, opt_state=opt_state, state=state)

             finally:
                 state = self.on_training_end(state)
xax/utils/experiments.py
CHANGED

@@ -7,6 +7,7 @@ import functools
 import hashlib
 import inspect
 import itertools
+import json
 import logging
 import math
 import os
@@ -24,7 +25,7 @@ import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Iterator, Self, TypeVar, cast
+from typing import Any, Iterator, Mapping, Self, Sequence, TypeVar, cast
 from urllib.parse import urlparse

 import git
@@ -114,28 +115,13 @@ class StateTimer:
         self.sample_timer.step(state.num_samples if state.phase == "train" else state.num_valid_samples, cur_time)
         self.iter_timer.step(cur_time)

-    def log_dict(self) -> dict[str,
-
-
-
-
-            "total": self.step_timer.steps,
-            "per-second": self.step_timer.steps_per_second,
-        }
-
-        # Logs sample statistics.
-        logs["⌛ samples"] = {
-            "total": self.sample_timer.steps,
-            "per-second": self.sample_timer.steps_per_second,
+    def log_dict(self) -> dict[str, int | float | tuple[int | float, bool]]:
+        return {
+            "steps/second": self.step_timer.steps_per_second,
+            "samples/second": (self.sample_timer.steps_per_second, True),
+            "dt": self.iter_timer.iter_seconds,
         }

-        # Logs full iteration statistics.
-        logs["⌛ dt"] = {
-            "iter": self.iter_timer.iter_seconds,
-        }
-
-        return logs
-

 class IntervalTicker:
     def __init__(self, interval: float) -> None:
@@ -217,8 +203,8 @@ class MinGradScaleError(TrainingFinishedError):


 def diff_configs(
-    first:
-    second:
+    first: Mapping | Sequence,
+    second: Mapping | Sequence,
     prefix: str | None = None,
 ) -> tuple[list[str], list[str]]:
     """Returns the difference between two configs.
@@ -245,7 +231,7 @@ def diff_configs(

     any_config = (ListConfig, DictConfig)

-    if isinstance(first,
+    if isinstance(first, Mapping) and isinstance(second, Mapping):
         first_keys, second_keys = cast(set[str], set(first.keys())), cast(set[str], set(second.keys()))

         # Gets the new keys in each config.
@@ -255,11 +241,12 @@ def diff_configs(
         # Gets the new sub-keys in each config.
         for key in first_keys.intersection(second_keys):
             sub_prefix = key if prefix is None else f"{prefix}.{key}"
-            if
-                if
-
-
-
+            if isinstance(first, DictConfig) and isinstance(second, DictConfig):
+                if OmegaConf.is_missing(first, key) or OmegaConf.is_missing(second, key):
+                    if not OmegaConf.is_missing(first, key):
+                        new_first += [get_diff_string(sub_prefix, first[key])]
+                    if not OmegaConf.is_missing(second, key):
+                        new_second += [get_diff_string(sub_prefix, second[key])]
             elif isinstance(first[key], any_config) and isinstance(second[key], any_config):
                 sub_new_first, sub_new_second = diff_configs(first[key], second[key], prefix=sub_prefix)
                 new_first, new_second = new_first + sub_new_first, new_second + sub_new_second
@@ -268,7 +255,7 @@ def diff_configs(
                 new_first += [get_diff_string(sub_prefix, first_val)]
                 new_second += [get_diff_string(sub_prefix, second_val)]

-    elif isinstance(first,
+    elif isinstance(first, Sequence) and isinstance(second, Sequence):
         if len(first) > len(second):
             for i in range(len(second), len(first)):
                 new_first += [get_diff_string(prefix, first[i])]
@@ -483,16 +470,33 @@ def get_command_line_string() -> str:
     return " ".join(sys.argv)


+def get_environment_variables() -> str:
+    return "\n".join([f"{key}={value}" for key, value in sorted(os.environ.items())])
+
+
 def get_state_file_string(obj: object) -> str:
     return "\n\n".join(
         [
             f"=== Command Line ===\n\n{get_command_line_string()}",
             f"=== Git State ===\n\n{get_git_state(obj)}",
             f"=== Packages ===\n\n{get_packages_with_versions()}",
+            f"=== Environment Variables ===\n\n{get_environment_variables()}",
         ]
     )


+def get_info_json() -> str:
+    return json.dumps(
+        {
+            "process_id": os.getpid(),
+            "job": {
+                "start_time": datetime.datetime.now().isoformat(),
+            },
+        },
+        indent=2,
+    )
+
+
 def get_training_code(obj: object) -> str:
     """Gets the text from the file containing the provided object.

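
Since `diff_configs` now accepts plain `Mapping`/`Sequence` values, callers can diff ordinary dicts, which is what lets `train.py` above compare dataclass configs via `asdict` when resuming. A small sketch with a hypothetical config dataclass:

```python
from dataclasses import asdict, dataclass

from xax.utils.experiments import diff_configs, get_diff_string


@dataclass
class Config:  # hypothetical config; any dataclass turned into a dict works
    learning_rate: float = 1e-3
    batch_size: int = 32


saved, current = Config(), Config(learning_rate=3e-4)
config_diff = get_diff_string(diff_configs(asdict(saved), asdict(current)))
if config_diff:
    print(config_diff)  # shows the differing learning_rate entries
```

Relatedly, `StateTimer.log_dict` is now a flat mapping whose values are either a number or a `(number, secondary)` tuple; the train mixin unpacks the tuple and forwards `secondary=True` to `log_scalar`, the same flag the CPU and GPU stats mixins now set on their system metrics.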