ezmsg-learn 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ from typing import Optional
+
+ import torch
+
+
+ class RNNModel(torch.nn.Module):
+     """
+     Recurrent neural network supporting GRU, LSTM, and vanilla RNN (tanh/relu).
+
+     Attributes:
+         input_size (int): Number of input features per time step.
+         hidden_size (int): Number of hidden units in the RNN cell.
+         num_layers (int, optional): Number of RNN layers. Default is 1.
+         output_size (int | dict[str, int], optional): Number of output features or classes if single-head output, or a
+             dictionary mapping head names to output sizes if multi-head output. Default is 2 (single head).
+         dropout (float, optional): Dropout rate applied after the input and the RNN output. Default is 0.3.
+         rnn_type (str, optional): Type of RNN cell to use: 'GRU', 'LSTM', 'RNN-Tanh', 'RNN-ReLU'. Default is 'GRU'.
+
+     Returns:
+         dict[str, torch.Tensor]: Dictionary of decoded predictions mapping head names to tensors of shape
+             (batch, seq_len, output_size). If single-head output, the key is "output".
+     """
+
+     def __init__(
+         self,
+         input_size: int,
+         hidden_size: int,
+         num_layers: int = 1,
+         output_size: int | dict[str, int] = 2,
+         dropout: float = 0.3,
+         rnn_type: str = "GRU",
+     ):
+         super().__init__()
+         self.linear_embeddings = torch.nn.Linear(input_size, input_size)
+         self.dropout_input = torch.nn.Dropout(dropout)
+
+         rnn_klass_str = rnn_type.upper().split("-")[0]
+         if rnn_klass_str not in ["GRU", "LSTM", "RNN"]:
+             raise ValueError(f"Unrecognized rnn_type: {rnn_type}")
+         rnn_klass = {"GRU": torch.nn.GRU, "LSTM": torch.nn.LSTM, "RNN": torch.nn.RNN}[
+             rnn_klass_str
+         ]
+         rnn_kwargs = {}
+         if rnn_klass_str == "RNN":
+             rnn_kwargs["nonlinearity"] = rnn_type.lower().split("-")[-1]
+         self.rnn = rnn_klass(
+             input_size,
+             hidden_size,
+             num_layers,
+             batch_first=True,
+             dropout=dropout if num_layers > 1 else 0.0,
+             **rnn_kwargs,
+         )
+         self.rnn_type = rnn_klass_str
+
+         self.output_dropout = torch.nn.Dropout(dropout)
+         if isinstance(output_size, int):
+             output_size = {"output": output_size}
+         self.heads = torch.nn.ModuleDict(
+             {
+                 name: torch.nn.Linear(hidden_size, size)
+                 for name, size in output_size.items()
+             }
+         )
+
+     @classmethod
+     def infer_config_from_state_dict(
+         cls, state_dict: dict, rnn_type: str = "GRU"
+     ) -> dict[str, int | float]:
+         """
+         Infer the model's constructor parameters from a saved state dict. This method is specific to each model
+         class.
+
+         Args:
+             state_dict: The state dict of the model.
+             rnn_type: The type of RNN used in the model (e.g., 'GRU', 'LSTM', 'RNN-Tanh', 'RNN-ReLU').
+
+         Returns:
+             A dictionary of model parameters obtained from the state dict.
+         """
+         # Infer output sizes from heads.<name>.bias (shape: [output_size])
+         output_size = {
+             key.split(".")[1]: param.shape[0]
+             for key, param in state_dict.items()
+             if key.startswith("heads.") and key.endswith(".bias")
+         }
+
+         return {
+             # Infer input_size from linear_embeddings.weight (shape: [input_size, input_size])
+             "input_size": state_dict["linear_embeddings.weight"].shape[1],
+             # Infer hidden_size from rnn.weight_ih_l0 (shape: [hidden_size * num_gates, input_size])
+             "hidden_size": state_dict["rnn.weight_ih_l0"].shape[0]
+             // cls._get_gate_count(rnn_type),
+             # Infer num_layers by counting rnn layers in state_dict (e.g., weight_ih_l<k>)
+             "num_layers": sum(1 for key in state_dict if "rnn.weight_ih_l" in key),
+             "output_size": output_size,
+         }
+
+     @staticmethod
+     def _get_gate_count(rnn_type: str) -> int:
+         if rnn_type.upper() == "GRU":
+             return 3
+         elif rnn_type.upper() == "LSTM":
+             return 4
+         elif rnn_type.upper().startswith("RNN"):
+             return 1
+         else:
+             raise ValueError(f"Unsupported rnn_type: {rnn_type}")
+
+     def init_hidden(
+         self, batch_size: int, device: torch.device
+     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+         """
+         Initialize the hidden state for the RNN.
+
+         Args:
+             batch_size (int): Size of the batch.
+             device (torch.device): Device to place the hidden state on (e.g., 'cpu' or 'cuda').
+
+         Returns:
+             torch.Tensor | tuple[torch.Tensor, torch.Tensor]: Initial hidden state for the RNN.
+                 For LSTM, returns a tuple of (h_n, c_n) where h_n is the hidden state and c_n is the cell state.
+                 For GRU or vanilla RNN, returns just h_n.
+         """
+         shape = (self.rnn.num_layers, batch_size, self.rnn.hidden_size)
+         if self.rnn_type == "LSTM":
+             return (
+                 torch.zeros(shape, device=device),
+                 torch.zeros(shape, device=device),
+             )
+         else:
+             return torch.zeros(shape, device=device)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         input_lens: Optional[torch.Tensor] = None,
+         hx: Optional[torch.Tensor | tuple[torch.Tensor, torch.Tensor]] = None,
+     ) -> tuple[dict[str, torch.Tensor], torch.Tensor | tuple]:
+         """
+         Forward pass through the RNN model.
+
+         Args:
+             x (torch.Tensor): Input tensor of shape (batch, seq_len, input_size).
+             input_lens (Optional[torch.Tensor]): Optional tensor of lengths for each sequence in the batch.
+                 If provided, sequences will be packed before passing through the RNN.
+             hx (Optional[torch.Tensor | tuple[torch.Tensor, torch.Tensor]]): Optional initial hidden state for the RNN.
+
+         Returns:
+             tuple[dict[str, torch.Tensor], torch.Tensor | tuple]: A tuple of (predictions, hidden state):
+                 a dictionary mapping head names to output tensors of shape (batch, seq_len, output_size),
+                 and the final hidden state, which is (h_n, c_n) for LSTM or just h_n for GRU and vanilla RNN.
+         """
+         x = self.linear_embeddings(x)
+         x = self.dropout_input(x)
+         total_length = x.shape[1]
+         if input_lens is not None:
+             # pack_padded_sequence expects sequence lengths on the CPU.
+             x = torch.nn.utils.rnn.pack_padded_sequence(
+                 x, input_lens, batch_first=True, enforce_sorted=False
+             )
+         x_out, hx_out = self.rnn(x, hx)
+         if input_lens is not None:
+             # Restore the padded (batch, seq_len, hidden_size) layout.
+             x_out, _ = torch.nn.utils.rnn.pad_packed_sequence(
+                 x_out, batch_first=True, total_length=total_length
+             )
+         x_out = self.output_dropout(x_out)
+         return {name: head(x_out) for name, head in self.heads.items()}, hx_out
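A minimal usage sketch of the class above (shapes are illustrative; `RNNModel` and its defaults come from the code in this hunk):

    import torch

    model = RNNModel(input_size=16, hidden_size=32, rnn_type="GRU")
    x = torch.randn(4, 100, 16)  # (batch, seq_len, input_size)
    hx = model.init_hidden(batch_size=4, device=x.device)
    outputs, hx = model(x, hx=hx)  # hx can be carried across calls for streaming
    assert outputs["output"].shape == (4, 100, 2)  # default single head, output_size=2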
@@ -0,0 +1,175 @@
+ from typing import Optional
+
+ import torch
+
+
+ class TransformerModel(torch.nn.Module):
+     """
+     Transformer-based encoder (optional decoder) neural network.
+
+     If `decoder_layers > 0`, the model includes a Transformer decoder. In this case, the `tgt` argument should be
+     provided: during training, it is typically the ground-truth target sequence (i.e. teacher forcing); during
+     inference, it can be constructed autoregressively from previous predictions. If `tgt` is omitted, decoding
+     starts from a learned start token.
+
+     Attributes:
+         input_size (int): Number of input features per time step.
+         hidden_size (int): Dimensionality of the transformer model.
+         encoder_layers (int, optional): Number of transformer encoder layers. Default is 1.
+         decoder_layers (int, optional): Number of transformer decoder layers. Default is 0.
+         output_size (int | dict[str, int], optional): Number of output features or classes if single-head output, or a
+             dictionary mapping head names to output sizes if multi-head output. Default is 2 (single head).
+         dropout (float, optional): Dropout rate applied after input and transformer output. Default is 0.3.
+         attention_heads (int, optional): Number of attention heads in the transformer. Default is 4.
+         max_seq_len (int, optional): Maximum sequence length for positional embeddings. Default is 512.
+         autoregressive_head (str | None, optional): When `output_size` is a dict, the name of the head whose output
+             size is used for the decoder start token and the output-to-hidden projection. Default is None, which
+             falls back to the first head.
+
+     Returns:
+         dict[str, torch.Tensor]: Dictionary of decoded predictions mapping head names to tensors of shape
+             (batch, seq_len, output_size). If single-head output, the key is "output".
+     """
+
+     def __init__(
+         self,
+         input_size: int,
+         hidden_size: int,
+         encoder_layers: int = 1,
+         decoder_layers: int = 0,
+         output_size: int | dict[str, int] = 2,
+         dropout: float = 0.3,
+         attention_heads: int = 4,
+         max_seq_len: int = 512,
+         autoregressive_head: str | None = None,
+     ):
+         super().__init__()
+
+         self.decoder_layers = decoder_layers
+         self.hidden_size = hidden_size
+
+         # Size the decoder start token after the autoregressive head
+         # (or the first head if no autoregressive head is named).
+         if isinstance(output_size, int):
+             autoregressive_size = output_size
+         else:
+             autoregressive_size = list(output_size.values())[0]
+             autoregressive_size = output_size.get(
+                 autoregressive_head, autoregressive_size
+             )
+         self.start_token = torch.nn.Parameter(torch.zeros(1, 1, autoregressive_size))
+         self.output_to_hidden = torch.nn.Linear(autoregressive_size, hidden_size)
+
+         self.input_proj = torch.nn.Linear(input_size, hidden_size)
+         self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_size)
+         self.dropout = torch.nn.Dropout(dropout)
+
+         self.encoder = torch.nn.TransformerEncoder(
+             torch.nn.TransformerEncoderLayer(
+                 d_model=hidden_size,
+                 nhead=attention_heads,
+                 dim_feedforward=hidden_size * 4,
+                 dropout=dropout,
+                 batch_first=True,
+             ),
+             num_layers=encoder_layers,
+         )
+
+         self.decoder = None
+         if decoder_layers > 0:
+             self.decoder = torch.nn.TransformerDecoder(
+                 torch.nn.TransformerDecoderLayer(
+                     d_model=hidden_size,
+                     nhead=attention_heads,
+                     dim_feedforward=hidden_size * 4,
+                     dropout=dropout,
+                     batch_first=True,
+                 ),
+                 num_layers=decoder_layers,
+             )
+
+         if isinstance(output_size, int):
+             output_size = {"output": output_size}
+         self.heads = torch.nn.ModuleDict(
+             {
+                 name: torch.nn.Linear(hidden_size, out_dim)
+                 for name, out_dim in output_size.items()
+             }
+         )
+
+
95
+ @classmethod
96
+ def infer_config_from_state_dict(cls, state_dict: dict) -> dict[str, int | float]:
97
+ # Infer output size from heads.<name>.bias (shape: [output_size])
98
+ output_size = {
99
+ key.split(".")[1]: param.shape[0]
100
+ for key, param in state_dict.items()
101
+ if key.startswith("heads.") and key.endswith(".bias")
102
+ }
103
+
104
+ return {
105
+ # Infer input_size from input_proj.weight (shape: [hidden_size, input_size])
106
+ "input_size": state_dict["input_proj.weight"].shape[1],
107
+ # Infer hidden_size from input_proj.weight (shape: [hidden_size, input_size])
108
+ "hidden_size": state_dict["input_proj.weight"].shape[0],
109
+ "output_size": output_size,
110
+ # Infer encoder_layers from transformer layers in state_dict
111
+ "encoder_layers": len(
112
+ [k for k in state_dict if k.startswith("encoder.layers")]
113
+ ),
114
+ # Infer decoder_layers from transformer decoder layers in state_dict
115
+ "decoder_layers": len(
116
+ {k.split(".")[2] for k in state_dict if k.startswith("decoder.layers")}
117
+ )
118
+ if any(k.startswith("decoder.layers") for k in state_dict)
119
+ else 0,
120
+ }
+
+     def forward(
+         self,
+         src: torch.Tensor,
+         tgt: Optional[torch.Tensor] = None,
+         src_mask: Optional[torch.Tensor] = None,
+         tgt_mask: Optional[torch.Tensor] = None,
+         start_pos: int = 0,
+     ) -> dict[str, torch.Tensor]:
+         """
+         Forward pass through the transformer model.
+
+         Args:
+             src (torch.Tensor): Input tensor of shape (batch, seq_len, input_size).
+             tgt (Optional[torch.Tensor]): Target tensor for the decoder, of shape (batch, seq_len,
+                 autoregressive_size). Used when `decoder_layers > 0`. In training, this can be the ground-truth
+                 target sequence (i.e. teacher forcing). During inference, this is constructed autoregressively;
+                 if omitted, the learned start token is used instead.
+             src_mask (Optional[torch.Tensor]): Optional attention mask for the encoder input. Should be broadcastable
+                 to shape (batch, seq_len, seq_len) or (seq_len, seq_len).
+             tgt_mask (Optional[torch.Tensor]): Optional attention mask for the decoder input. Used to enforce causal
+                 decoding (i.e. autoregressive generation) during training or inference.
+             start_pos (int): Starting offset for positional embeddings. Used for streaming inference to maintain
+                 correct positional indices. Default is 0.
+
+         Returns:
+             dict[str, torch.Tensor]: Dictionary of output tensors, one per output head, each with shape
+                 (batch, seq_len, output_size).
+         """
+         B, T, _ = src.shape
+         device = src.device
+
+         x = self.input_proj(src)
+         pos_ids = torch.arange(start_pos, start_pos + T, device=device).expand(B, T)
+         x = x + self.pos_embedding(pos_ids)
+         x = self.dropout(x)
+
+         memory = self.encoder(x, mask=src_mask)
+
+         if self.decoder is not None:
+             if tgt is None:
+                 tgt = self.start_token.expand(B, -1, -1).to(device)
+             tgt_proj = self.output_to_hidden(tgt)
+             tgt_pos_ids = torch.arange(tgt.shape[1], device=device).expand(
+                 B, tgt.shape[1]
+             )
+             tgt_proj = tgt_proj + self.pos_embedding(tgt_pos_ids)
+             tgt_proj = self.dropout(tgt_proj)
+             out = self.decoder(
+                 tgt_proj,
+                 memory,
+                 tgt_mask=tgt_mask,
+                 memory_mask=src_mask,
+             )
+         else:
+             out = memory
+
+         return {name: head(out) for name, head in self.heads.items()}
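A hedged usage sketch for the encoder-decoder configuration above (sizes are illustrative; the causal-mask helper is a standard torch utility, not part of this package):

    import torch

    model = TransformerModel(
        input_size=16,
        hidden_size=64,
        encoder_layers=2,
        decoder_layers=2,
        output_size={"kinematics": 2},
        autoregressive_head="kinematics",
    )
    src = torch.randn(4, 50, 16)  # (batch, seq_len, input_size)
    tgt = torch.randn(4, 50, 2)   # teacher-forced targets, sized to the autoregressive head
    tgt_mask = torch.nn.Transformer.generate_square_subsequent_mask(50)  # causal mask
    outs = model(src, tgt=tgt, tgt_mask=tgt_mask)
    assert outs["kinematics"].shape == (4, 50, 2)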
@@ -0,0 +1 @@
+ # Use of this module is deprecated. Please use `ezmsg.learn.model` or `ezmsg.learn.process` instead.
@@ -0,0 +1,6 @@
+ from ..model.mlp_old import MLP as MLP
+ from ..process.mlp_old import (
+     MLPSettings as MLPSettings,
+     MLPState as MLPState,
+     MLPProcessor as MLPProcessor,
+ )
@@ -0,0 +1,157 @@
+ from dataclasses import field
+
+ import numpy as np
+ import pandas as pd
+ import river.optim
+ import river.linear_model
+ import sklearn.base
+ import ezmsg.core as ez
+ from ezmsg.sigproc.sampler import SampleMessage
+ from ezmsg.sigproc.base import (
+     processor_state,
+     BaseAdaptiveTransformer,
+     BaseAdaptiveTransformerUnit,
+ )
+ from ezmsg.util.messages.axisarray import AxisArray, replace
+
+ from ..util import AdaptiveLinearRegressor, RegressorType, get_regressor
+
+
+ class AdaptiveLinearRegressorSettings(ez.Settings):
+     model_type: AdaptiveLinearRegressor = AdaptiveLinearRegressor.LINEAR
+     settings_path: str | None = None
+     model_kwargs: dict = field(default_factory=dict)
+
+
+ @processor_state
+ class AdaptiveLinearRegressorState:
+     template: AxisArray | None = None
+     model: river.linear_model.base.GLM | sklearn.base.RegressorMixin | None = None
+
+
+ class AdaptiveLinearRegressorTransformer(
+     BaseAdaptiveTransformer[
+         AdaptiveLinearRegressorSettings,
+         AxisArray,
+         AxisArray,
+         AdaptiveLinearRegressorState,
+     ]
+ ):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.settings = replace(
+             self.settings, model_type=AdaptiveLinearRegressor(self.settings.model_type)
+         )
+         b_river = self.settings.model_type in [
+             AdaptiveLinearRegressor.LINEAR,
+             AdaptiveLinearRegressor.LOGISTIC,
+         ]
+         if b_river:
+             self.settings.model_kwargs["l2"] = self.settings.model_kwargs.get("l2", 0.0)
+             if "learn_rate" in self.settings.model_kwargs:
+                 self.settings.model_kwargs["optimizer"] = river.optim.SGD(
+                     self.settings.model_kwargs.pop("learn_rate")
+                 )
+
+         if self.settings.settings_path is not None:
+             # Load a pickled model from file.
+             import pickle
+
+             with open(self.settings.settings_path, "rb") as f:
+                 self.state.model = pickle.load(f)
+
+             if b_river:
+                 # Override the loaded model's hyperparameters with any explicitly provided kwargs.
+                 self.state.model.l2 = self.settings.model_kwargs["l2"]
+                 if "optimizer" in self.settings.model_kwargs:
+                     self.state.model.optimizer = self.settings.model_kwargs["optimizer"]
+             else:
+                 ez.logger.warning("TODO: Override sklearn model with kwargs")
+         else:
+             # Build the model from scratch.
+             regressor_klass = get_regressor(
+                 RegressorType.ADAPTIVE, self.settings.model_type
+             )
+             self.state.model = regressor_klass(**self.settings.model_kwargs)
+
+     def _hash_message(self, message: AxisArray) -> int:
+         # So far, nothing to reset, so the hash can be constant.
+         return -1
+
+     def _reset_state(self, message: AxisArray) -> None:
+         # So far, there is nothing to reset:
+         # .model is initialized in __init__;
+         # .template is updated in partial_fit.
+         pass
+
+     def partial_fit(self, message: SampleMessage) -> None:
+         if np.any(np.isnan(message.sample.data)):
+             return
+
+         if self.settings.model_type in [
+             AdaptiveLinearRegressor.LINEAR,
+             AdaptiveLinearRegressor.LOGISTIC,
+         ]:
+             # river's mini-batch API takes a DataFrame of features
+             # (one column per channel) and a Series target.
+             x = pd.DataFrame.from_dict(
+                 {
+                     k: v
+                     for k, v in zip(
+                         message.sample.axes["ch"].data, message.sample.data.T
+                     )
+                 }
+             )
+             y = pd.Series(
+                 data=message.trigger.value.data[:, 0],
+                 name=message.trigger.value.axes["ch"].data[0],
+             )
+             self.state.model.learn_many(x, y)
+         else:
+             X = message.sample.data
+             if message.sample.get_axis_idx("time") != 0:
+                 X = np.moveaxis(X, message.sample.get_axis_idx("time"), 0)
+             self.state.model.partial_fit(X, message.trigger.value.data)
+
+         self.state.template = replace(
+             message.trigger.value,
+             data=np.empty_like(message.trigger.value.data),
+             key=message.trigger.value.key + "_pred",
+         )
+
+     def _process(self, message: AxisArray) -> AxisArray | None:
+         if self.state.template is None:
+             # Not fitted yet; emit an empty placeholder.
+             return AxisArray(np.array([]), dims=[""])
+
+         # Skip prediction (implicitly returning None) when the input contains NaNs.
+         if not np.any(np.isnan(message.data)):
+             if self.settings.model_type in [
+                 AdaptiveLinearRegressor.LINEAR,
+                 AdaptiveLinearRegressor.LOGISTIC,
+             ]:
+                 # Convert message.data to something appropriate for river.
+                 x = pd.DataFrame.from_dict(
+                     {k: v for k, v in zip(message.axes["ch"].data, message.data.T)}
+                 )
+                 preds = self.state.model.predict_many(x).values
+             else:
+                 preds = self.state.model.predict(message.data)
+             return replace(
+                 self.state.template,
+                 data=preds.reshape((len(preds), -1)),
+                 axes={
+                     **self.state.template.axes,
+                     "time": replace(
+                         message.axes["time"],
+                         offset=message.axes["time"].offset,
+                     ),
+                 },
+             )
+
+
+ class AdaptiveLinearRegressorUnit(
+     BaseAdaptiveTransformerUnit[
+         AdaptiveLinearRegressorSettings,
+         AxisArray,
+         AxisArray,
+         AdaptiveLinearRegressorTransformer,
+     ]
+ ):
+     SETTINGS = AdaptiveLinearRegressorSettings
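For context, a standalone sketch of the river mini-batch API that `partial_fit` and `_process` rely on for the LINEAR/LOGISTIC model types (synthetic data; assumes river and pandas are installed):

    import numpy as np
    import pandas as pd
    import river.linear_model
    import river.optim

    model = river.linear_model.LinearRegression(
        optimizer=river.optim.SGD(0.01),  # corresponds to the "learn_rate" kwarg above
        l2=0.0,
    )
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(8, 3)), columns=["ch0", "ch1", "ch2"])
    y = pd.Series(2.0 * X["ch0"] - X["ch1"], name="target")
    model.learn_many(X, y)         # incremental fit on one mini-batch
    preds = model.predict_many(X)  # pd.Series, consumed via .values in _process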
@@ -0,0 +1,173 @@
+ import inspect
+ import json
+ from pathlib import Path
+ import typing
+
+ import ezmsg.core as ez
+ import torch
+
+
+ class ModelInitMixin:
+     """
+     Mixin class to support model initialization from:
+     1. Setting parameters
+     2. Config file
+     3. Checkpoint file
+     """
+
+     @staticmethod
+     def _merge_config(model_kwargs: dict, config: dict) -> None:
+         """
+         Mutate the model_kwargs dictionary with the config parameters.
+
+         Args:
+             model_kwargs: Original, to-be-mutated model kwargs.
+             config: Dictionary of configuration parameters used for the update.
+
+         Returns:
+             None, because model_kwargs is mutated in place.
+         """
+         if "model_params" in config:
+             config = config["model_params"]
+         # Update model_kwargs with config parameters.
+         for key, value in config.items():
+             if key in model_kwargs:
+                 if model_kwargs[key] != value:
+                     ez.logger.warning(
+                         f"Config parameter {key} ({value}) differs from settings ({model_kwargs[key]})."
+                     )
+             else:
+                 ez.logger.warning(f"Config parameter {key} is not in model_kwargs.")
+             model_kwargs[key] = value
+
+     def _filter_model_kwargs(self, model_class, kwargs: dict) -> dict:
+         valid_params = inspect.signature(model_class.__init__).parameters
+         filtered_out = set(kwargs.keys()) - {k for k in valid_params if k != "self"}
+         if filtered_out:
+             ez.logger.warning(
+                 f"Ignoring unexpected model parameters not accepted by {model_class.__name__} constructor: {sorted(filtered_out)}"
+             )
+         # Keep all valid parameters, including None values, so checkpoint-inferred values can overwrite them.
+         return {k: v for k, v in kwargs.items() if k in valid_params and k != "self"}
+
+     def _init_model(
+         self,
+         model_class,
+         params: dict[str, typing.Any] | None = None,
+         config_path: str | None = None,
+         checkpoint_path: str | None = None,
+         device: str = "cpu",
+         state_dict_prefix: str | None = None,
+         weights_only: bool | None = None,
+     ) -> torch.nn.Module:
+         """
+         Initialize a model from setting parameters, an optional config file, and an optional checkpoint.
+
+         Args:
+             model_class: The class of the model to be initialized.
+             params: A dictionary of setting parameters to be used for model initialization.
+             config_path: Path to a JSON config file to update model parameters.
+             checkpoint_path: Path to a checkpoint file to load model weights and possibly config.
+             device: Device to load the checkpoint onto and move the model to. Default is "cpu".
+             state_dict_prefix: If given, only state-dict keys with this prefix are loaded, with the prefix stripped.
+             weights_only: Passed through to `torch.load` when reading the checkpoint.
+
+         Returns:
+             The initialized model, with the resolved config and (if available) loaded weights.
+         """
+         # Model parameters are taken from multiple sources, in ascending priority:
+         # 1. Setting parameters
+         # 2. Config file, if provided
+         # 3. "config" entry in the checkpoint file, if a checkpoint is provided and a config is present
+         # 4. Sizes of weights in the checkpoint file, if provided
+
+         # Get configs from setting params.
+         model_kwargs = params or {}
+         state_dict = None
+
+         # If a config file is provided, use it to update kwargs (with warnings).
+         if config_path:
+             config_path = Path(config_path)
+             if not config_path.exists():
+                 ez.logger.error(f"Config path {config_path} does not exist.")
+                 raise FileNotFoundError(f"Config path {config_path} does not exist.")
+             try:
+                 with open(config_path, "r") as f:
+                     config = json.load(f)
+                 self._merge_config(model_kwargs, config)
+             except Exception as e:
+                 raise RuntimeError(
+                     f"Failed to load config from {config_path}: {str(e)}"
+                 ) from e
+
+         # If a checkpoint file is provided, load it.
+         if checkpoint_path:
+             checkpoint_path = Path(checkpoint_path)
+             if not checkpoint_path.exists():
+                 ez.logger.error(f"Checkpoint path {checkpoint_path} does not exist.")
+                 raise FileNotFoundError(
+                     f"Checkpoint path {checkpoint_path} does not exist."
+                 )
+             try:
+                 checkpoint = torch.load(
+                     checkpoint_path, map_location=device, weights_only=weights_only
+                 )
+
+                 if "config" in checkpoint:
+                     config = checkpoint["config"]
+                     self._merge_config(model_kwargs, config)
+
+                 # Load the model weights and infer the config.
+                 state_dict = checkpoint
+                 if "model_state_dict" in checkpoint:
+                     state_dict = checkpoint["model_state_dict"]
+                 elif "state_dict" in checkpoint:
+                     # This is for backward compatibility with older checkpoints
+                     # that used "state_dict" instead of "model_state_dict".
+                     state_dict = checkpoint["state_dict"]
+                 infer_config = getattr(
+                     model_class,
+                     "infer_config_from_state_dict",
+                     # Default to an empty dict if the model class does not define
+                     # this method; accept and ignore any extra kwargs.
+                     lambda _state_dict, **_kwargs: {},
+                 )
+                 infer_kwargs = (
+                     {"rnn_type": model_kwargs["rnn_type"]}
+                     if "rnn_type" in model_kwargs
+                     else {}
+                 )
+                 self._merge_config(
+                     model_kwargs,
+                     infer_config(state_dict, **infer_kwargs),
+                 )
+
+             except Exception as e:
+                 raise RuntimeError(
+                     f"Failed to load checkpoint from {checkpoint_path}: {str(e)}"
+                 ) from e
+
+         # Filter model_kwargs to only include valid parameters for the model class.
+         filtered_kwargs = self._filter_model_kwargs(model_class, model_kwargs)
+
+         # Remove None values from filtered_kwargs to avoid passing them to the model constructor.
+         # This should only happen for parameters that weren't inferred from the checkpoint.
+         final_kwargs = {k: v for k, v in filtered_kwargs.items() if v is not None}
+
+         # Create the model with the final kwargs.
+         model = model_class(**final_kwargs)
+
+         # Finally, load the weights.
+         if state_dict:
+             if state_dict_prefix:
+                 # If a prefix is provided, keep only matching keys and strip the prefix.
+                 state_dict = {
+                     k[len(state_dict_prefix) :]: v
+                     for k, v in state_dict.items()
+                     if k.startswith(state_dict_prefix)
+                 }
+             # Load the model weights.
+             missing, unexpected = model.load_state_dict(
+                 state_dict, strict=False, assign=True
+             )
+             if missing or unexpected:
+                 ez.logger.warning(
+                     f"Partial load: missing keys: {missing}, unexpected keys: {unexpected}"
+                 )
+
+         model.to(device)
+         return model