amica-python 0.1.0__py3-none-any.whl
- amica/__init__.py +5 -0
- amica/_batching.py +194 -0
- amica/_newton.py +77 -0
- amica/_sklearn_interface.py +387 -0
- amica/_types.py +44 -0
- amica/conftest.py +30 -0
- amica/constants.py +47 -0
- amica/core.py +1165 -0
- amica/datasets.py +15 -0
- amica/kernels.py +1308 -0
- amica/linalg.py +349 -0
- amica/state.py +385 -0
- amica/tests/test_amica.py +497 -0
- amica/utils/__init__.py +36 -0
- amica/utils/_logging.py +64 -0
- amica/utils/_progress.py +34 -0
- amica/utils/_verbose.py +14 -0
- amica/utils/fetch.py +274 -0
- amica/utils/fortran.py +387 -0
- amica/utils/imports.py +46 -0
- amica/utils/mne.py +74 -0
- amica/utils/parallel.py +72 -0
- amica/utils/simulation.py +36 -0
- amica/utils/tests/test_fetch.py +9 -0
- amica/utils/tests/test_fortran.py +47 -0
- amica/utils/tests/test_imports.py +0 -0
- amica/utils/tests/test_logger.py +29 -0
- amica/utils/tests/test_mne.py +27 -0
- amica_python-0.1.0.dist-info/METADATA +196 -0
- amica_python-0.1.0.dist-info/RECORD +33 -0
- amica_python-0.1.0.dist-info/WHEEL +5 -0
- amica_python-0.1.0.dist-info/licenses/LICENSE +25 -0
- amica_python-0.1.0.dist-info/top_level.txt +1 -0
amica/core.py
ADDED
@@ -0,0 +1,1165 @@

"""Module containing the amica function entry point."""

import time

import torch
from numpy.testing import assert_allclose

from amica._types import (
    DataTensor2D,
)
from amica.constants import (
    doscaling,
    epsdble,
    invsigmax,
    invsigmin,
    lratefact,
    maxdecs,
    maxincs,
    maxrho,
    mineig,
    minlog,
    minlrate,
    minrho,
    outstep,
    rholratefact,
    share_comps,
    share_iter,
    share_start,
    use_grad_norm,
    use_min_dll,
)
from amica.kernels import (
    accumulate_alpha_stats,
    accumulate_beta_stats,
    accumulate_c_stats,
    accumulate_kappa_stats,
    accumulate_lambda_stats,
    accumulate_mu_stats,
    accumulate_rho_stats,
    accumulate_sigma2_stats,
    compute_mixture_responsibilities,
    compute_model_loglikelihood_per_sample,
    compute_model_responsibilities,
    compute_preactivations,
    compute_scaled_scores,
    compute_source_densities,
    compute_source_scores,
    compute_total_loglikelihood_per_sample,
    compute_weighted_responsibilities,
    precompute_weighted_scores,
)
from amica.linalg import (
    compute_sign_log_determinant,
    get_initial_model_log_likelihood,
    get_unmixing_matrices,
    pre_whiten,
)
from amica.state import (
    AmicaAccumulators,
    AmicaConfig,
    AmicaState,
    IterationMetrics,
    get_initial_state,
    initialize_accumulators,
)

from ._batching import BatchLoader, choose_batch_size
from ._newton import compute_newton_terms
from .utils._logging import log, set_log_level
from .utils._progress import make_progress_bar
from .utils._verbose import _validate_verbose


def fit_amica(
    X,
    *,
    whiten="zca",
    mean_center=True,
    n_components=None,
    device="cpu",
    n_mixtures=3,
    max_iter=500,
    tol=1e-7,
    lrate=0.05,
    rholrate=0.05,
    pdftype=0,
    do_newton=True,
    newt_start=50,
    newtrate=1.0,
    newt_ramp=10,
    batch_size=None,
    w_init=None,
    sbeta_init=None,
    mu_init=None,
    do_reject=False,
    random_state=None,
    verbose=1,
):
    """Perform Adaptive Mixture Independent Component Analysis (AMICA).

    Implements the AMICA algorithm as described in :footcite:t:`palmer2012` and
    :footcite:t:`palmer2008`, and originally implemented in :footcite:t:`amica`.

    Parameters
    ----------
    X : array-like, shape (``n_samples``, ``n_features``)
        Training data, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.
    n_components : int, optional
        Number of components to extract. If ``None`` (default), set to
        ``n_features``. Note that the number of components may be reduced
        during whitening if the data are rank-deficient.
    n_mixtures : int, optional, default=3
        Number of mixture components to use in the Gaussian Mixture Model (GMM)
        for each component's source density.
    batch_size : int, optional
        Batch size for processing data in chunks along the samples axis. If
        ``None``, the batch size is chosen automatically to keep peak memory
        under ~1.5 GB, and a warning is issued if the chosen batch size falls
        below ~8k samples. If the input data are small enough to process in one
        shot, no batching is used. To enforce no batching, override this memory
        cap by setting ``batch_size`` explicitly, e.g. to ``X.shape[0]`` to
        process all samples at once, but note that this may lead to high memory
        usage for large datasets.
    device : str, optional
        Device to run the computations on. Can be either ``'cpu'`` or
        ``'cuda'`` for GPU acceleration. Note that using ``'cuda'`` requires a
        compatible NVIDIA GPU and the appropriate CUDA drivers installed.
    whiten : str {"zca", "pca", "variance"}
        Whitening method to apply to the data before fitting AMICA. Options are:

        - "zca": Zero-phase component analysis (ZCA) whitening.
        - "pca": Principal component analysis (PCA) whitening.
        - "variance": Only variance normalization of the features (no sphering).
    mean_center : bool, optional
        If ``True``, X is mean corrected.
    max_iter : int, optional
        Maximum number of iterations to perform. Default is ``500``.
    tol : float, default=1e-7
        Convergence tolerance. It is applied both to the per-iteration
        log-likelihood improvement and to the norm of the weight gradient.
    random_state : int or None, optional (default=None)
        Used to perform a random initialization when ``w_init`` is not
        provided. If int, ``random_state`` is the seed used by the random
        number generator during whitening, and is used to set the seed during
        optimization initialization.
    w_init : array-like, shape (``n_components``, ``n_components``), optional
        Initial weight matrix. If None, weights are initialized randomly. This
        is meant to be used for testing and debugging purposes only.
    sbeta_init : array-like, shape (``n_components``, ``n_mixtures``), optional
        Initial scales (sbeta) for the mixture components. If None, scales are
        initialized randomly. This is meant to be used for testing and
        debugging purposes only.
    mu_init : array-like, shape (``n_components``, ``n_mixtures``), optional
        Initial locations (mu) for the mixture components. If None, locations
        are initialized randomly. This is meant to be used for testing and
        debugging purposes only.
    lrate : float, default=0.05
        Initial learning rate for the natural gradient.
    rholrate : float, default=0.05
        Initial learning rate for the shape parameters.
    pdftype : int, default=0
        Type of source density model to use. Currently only ``0`` is supported,
        which corresponds to the Gaussian Mixture Model (GMM) density.
    do_newton : bool, default=True
        If ``True``, the optimization method will switch from Stochastic
        Gradient Descent (SGD) to Newton updates after ``newt_start``
        iterations. If ``False``, only SGD updates are used.
    newt_start : int, default=50
        Number of iterations before switching to Newton updates if
        ``do_newton`` is ``True``.
    newtrate : float, default=1.0
        Learning rate for Newton iterations.
    newt_ramp : int, default=10
        Ramp-up horizon for the learning rate; each iteration the learning
        rate grows by at most ``1 / newt_ramp`` toward its maximum.
    do_reject : bool, default=False
        If ``True``, reject samples by log likelihood. Not yet implemented;
        raises ``NotImplementedError``.
    verbose : int, default=1
        Output mode during optimization:

        - ``0``: silent
        - ``1``: progress bar
        - ``2``: per-iteration FORTRAN-style logs

    Returns
    -------
    results : dict
        Dictionary containing the following entries:

        - mean : array, shape (``n_features``,) | ``None``
          The mean over features. If ``mean_center=False``, this is ``None``.
        - S : array, shape (``n_components``, ``n_features``)
          The sphering (whitening) matrix applied to the data.
        - W : array, shape (``n_components``, ``n_components``)
          The unmixing matrix.
        - A : array, shape (``n_components``, ``n_components``)
          The mixing matrix in the space of sphered data. To get the mixing
          matrix in the original data space, use ``np.linalg.pinv(S) @ A``.
        - LL : array, shape (``max_iter``,)
          The log-likelihood values at each iteration. If the algorithm
          converged before reaching ``max_iter``, the remaining entries will
          be zero.
        - gm : array, shape (1,)
          The Gaussian mixture model weights. Since only one model is
          supported, this will be of shape (1,).
        - mu : array, shape (``n_components``, ``n_mixtures``)
          The location parameters for the mixture components, i.e. the means
          of the mixture components.
        - rho : array, shape (``n_components``, ``n_mixtures``)
          The shape parameters for the mixture components.
        - sbeta : array, shape (``n_components``, ``n_mixtures``)
          The scale (precision) parameters for the mixture components.
        - alpha : array, shape (``n_components``, ``n_mixtures``)
          The mixture weights for the mixture components.
        - c : array, shape (``n_components``,)
          The model bias terms.

    Notes
    -----
    In Fortran AMICA, ``alpha``, ``sbeta``, ``mu``, and ``rho`` are of shape
    (``n_mixtures``, ``n_components``) (transposed compared to here).

    References
    ----------
    .. footbibliography::

    """
    verbose = _validate_verbose(verbose)
    set_log_level("INFO" if verbose == 2 else "ERROR")

    if batch_size is None:
        batch_size = choose_batch_size(
            N=X.shape[0],
            n_comps=n_components if n_components is not None else X.shape[1],
            n_mix=n_mixtures,
        )
    # Step 1: Create config and state objects (new dataclass approach)
    config = AmicaConfig(
        n_features=X.shape[1],  # Number of channels
        n_components=n_components if n_components is not None else X.shape[1],
        n_models=1,
        n_mixtures=n_mixtures,
        max_iter=max_iter,
        batch_size=batch_size,
        device=torch.device(device),
        pdftype=pdftype,
        tol=tol,
        lrate=lrate,
        rholrate=rholrate,
        do_newton=do_newton,
        newt_start=newt_start,
        newtrate=newtrate,
        newt_ramp=newt_ramp,
        do_reject=do_reject,
        verbose=verbose,
    )

    # Step 2: Create initial state (this will eventually replace manual initialization)
    torch.set_default_dtype(config.dtype)  # TODO: Make this less global
    state = get_initial_state(config)

    # Init
    if config.do_reject:
        raise NotImplementedError(
            "Sample rejection by log likelihood is not yet supported."
        )  # pragma: no cover
    dataseg = X.copy()

    # Whitening
    do_sphere = whiten in {"zca", "pca"}
    do_approx_sphere = whiten == "zca"
    do_mean = bool(mean_center)
    dataseg, whitening_matrix, sldet, whitening_inverse, mean = pre_whiten(
        X=dataseg,
        n_components=n_components,
        mineig=mineig,
        do_mean=do_mean,
        do_sphere=do_sphere,
        do_approx_sphere=do_approx_sphere,
        inplace=True,
    )
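
    # Illustrative aside (not in the original source): per the Returns section
    # of ``fit_amica``, ``whitening_matrix`` is the sphering matrix ``S``
    # applied to the (optionally mean-corrected) data. Assuming the row-major
    # (samples, features) layout used throughout, the whitened data correspond
    # roughly to
    #
    #     dataseg ≈ (X - mean) @ whitening_matrix.T
    #
    # with ``whitening_inverse`` mapping back to the original feature space.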

    # Run AMICA
    state_dict, LL = solve(
        X=dataseg,
        config=config,
        state=state,
        sldet=sldet,
        random_state=random_state,
        initial_weights=w_init,
        initial_scales=sbeta_init,
        initial_locations=mu_init,
    )

    return dict(
        S=whitening_matrix,
        mean=mean,
        gm=state_dict["gm"],
        mu=state_dict["mu"],
        rho=state_dict["rho"],
        sbeta=state_dict["sbeta"],
        W=state_dict["W"],
        A=state_dict["A"],
        c=state_dict["c"],
        alpha=state_dict["alpha"],
        LL=LL,
    )
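

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of calling ``fit_amica`` as documented above. The unmixing
# step assumes ``mean_center=True`` and follows the docstring's description of
# ``S`` (sphering) and ``W`` (unmixing in sphered space), ignoring the small
# bias term ``c``:
#
#     import numpy as np
#     from amica.core import fit_amica
#
#     rng = np.random.default_rng(0)
#     X = rng.standard_normal((20_000, 4))  # (n_samples, n_features)
#     results = fit_amica(X, max_iter=100, random_state=0, verbose=0)
#     W, S, mean = results["W"], results["S"], results["mean"]
#     sources = (X - mean) @ S.T @ W.T  # (n_samples, n_components)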


def solve(
    X,
    *,
    config,
    state,
    sldet,
    random_state=None,
    initial_weights=None,
    initial_scales=None,
    initial_locations=None,
):
    """Run the AMICA algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Matrix containing the samples to be unmixed. X has to be centered.
    config : AmicaConfig
        Configuration object for the fit.
    state : AmicaState
        Initial model state, as returned by ``get_initial_state``.
    sldet : tensor
        Log determinant of the whitening matrix, as returned by ``pre_whiten``.
    random_state : int or None, optional
        Seed for the random initialization of the parameters.
    initial_weights : array-like, shape (n_components, n_components), optional
        Initial weight matrix. If None, weights are initialized randomly. This
        is meant to be used for testing and debugging purposes only.
    initial_scales : array-like, shape (n_components, n_mixtures), optional
        Initial scales (sbeta) for the mixture components. If None, scales are
        initialized randomly. This is meant to be used for testing and
        debugging purposes only.
    initial_locations : array-like, shape (n_components, n_mixtures), optional
        Initial locations (mu) for the mixture components. If None, locations
        are initialized randomly. This is meant to be used for testing and
        debugging purposes only.
    """
    # No-copy (if on CPU)
    X: DataTensor2D = torch.as_tensor(X, dtype=config.dtype, device=config.device)
    rng = torch.Generator()
    if random_state is not None:
        rng.manual_seed(random_state)
    # The API will use n_components but under the hood we'll match the Fortran naming
    # TODO: Maybe rename n_components to num_comps in the config dataclass?
    num_comps = config.n_components
    num_mix = config.n_mixtures
    # !-------------------- ALLOCATE VARIABLES ---------------------

    # !------------------- INITIALIZE VARIABLES ----------------------
    # print *, myrank+1, ': Initializing variables ...'; call flush(6);
    # if (seg_rank == 0) then

    assert_allclose(state.gm.sum(), 1.0)
    # load_alpha:
    state.alpha[:, :num_mix] = 1.0 / num_mix
    # load_mu:
    mu_values = torch.arange(num_mix) - (num_mix - 1) / 2
    state.mu[:, :] = mu_values[None, :]
    if initial_locations is None:
        initial_locations = torch.rand(num_comps, num_mix, generator=rng)
    else:
        assert initial_locations.shape == (num_comps, num_mix)
        initial_locations = torch.as_tensor(initial_locations, dtype=torch.float64)
    state.mu = state.mu + 0.05 * (1.0 - 2.0 * initial_locations)
    # load_beta:
    if initial_scales is None:
        initial_scales = torch.rand(num_comps, num_mix, generator=rng)
    else:
        assert initial_scales.shape == (num_comps, num_mix)
        initial_scales = torch.as_tensor(initial_scales, dtype=torch.float64)
    state.sbeta = 1.0 + 0.1 * (0.5 - initial_scales)
    # load_c:
    state.c.fill_(0.0)

    # load_A:
    if initial_weights is None:
        initial_weights = torch.rand(num_comps, num_comps, generator=rng)
    else:
        assert initial_weights.shape == (num_comps, num_comps)
        initial_weights = torch.as_tensor(initial_weights, dtype=torch.float64)

    state.A[:, :] = 0.01 * (0.5 - initial_weights)
    idx = torch.arange(num_comps)
    state.A[idx, idx] = 1.0
    Anrmk = torch.linalg.norm(state.A[:, :], dim=0)
    state.A[:, :] /= Anrmk
    # end load_A

    W, wc = get_unmixing_matrices(
        c=state.c,
        A=state.A,
        W=state.W,
    )
    assert W.dtype == torch.float64
    state.W = W.clone()
    del W  # safeguard against accidental use of W instead of state.W

    # !-------------------- Determine optimal block size -------------------
    log(f"1: block size = {config.batch_size}", level="info", color=None)

    # !XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX main loop XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
    log(
        "Solving. (please be patient, this may take a while)...",
        level="info",
        color="blue",
        weight="bold",
    )
    with torch.no_grad():
        state, LL = optimize(
            X=X,
            sldet=sldet.item(),
            wc=wc,
            config=config,
            state=state,
        )
    # Convert Tensors to numpy arrays for output
    state_dict = state.to_numpy()
    LL = LL.cpu().numpy()
    return state_dict, LL


def optimize(
    *,
    X: DataTensor2D,
    sldet: float,
    wc: torch.Tensor,
    config: AmicaConfig,
    state: AmicaState,
):
    """Optimize the learnable parameters."""
    # Just set all convergence criteria to the user-specified tol
    min_dll = config.tol
    min_nd = config.tol

    # These variables can be updated in the loop
    leave = False
    do_newton = config.do_newton
    numdecs = 0  # number of consecutive iterations where LL decreased from previous
    numincs = 0  # number of consecutive iterations where LL increased by less than tol
    metrics = IterationMetrics(
        iter=1,
        lrate=config.lrate,
        rholrate=config.rholrate,
        lrate0=config.lrate,  # updates slower than lrate
        rholrate0=config.rholrate,  # updates slower than rholrate
        newtrate=config.newtrate,
    )

    # Initialize accumulators container
    accumulators = initialize_accumulators(config)
    if config.device.type != "cpu":
        state.to_device(device=config.device)
        wc = wc.to(device=config.device)
    # We allocate these separately.
    Dsum = torch.tensor(0.0, dtype=torch.float64, device=config.device)
    Dsign = torch.tensor(1.0, dtype=torch.float64, device=config.device)
    # per-sample log likelihood
    loglik = torch.zeros((X.shape[0],), dtype=torch.float64, device=config.device)
    # likelihood history
    LL = torch.zeros(max(1, config.max_iter), dtype=torch.float64, device=config.device)

    c_start = time.time()
    c1 = time.time()
    progress = None
    task_id = None
    if config.verbose == 1:
        progress, task_id = make_progress_bar(
            total=config.max_iter,
            lrate=metrics.lrate,
        )
    try:
        return _main_loop(
            X=X,
            sldet=sldet,
            wc=wc,
            config=config,
            state=state,
            do_newton=do_newton,
            leave=leave,
            numdecs=numdecs,
            numincs=numincs,
            metrics=metrics,
            accumulators=accumulators,
            Dsum=Dsum,
            Dsign=Dsign,
            loglik=loglik,
            LL=LL,
            c_start=c_start,
            c1=c1,
            progress=progress,
            task_id=task_id,
            min_dll=min_dll,
            min_nd=min_nd,
        )
    finally:
        if progress is not None:
            progress.stop()


def _main_loop(
    *,
    X: DataTensor2D,
    sldet: float,
    wc: torch.Tensor,
    config: AmicaConfig,
    state: AmicaState,
    do_newton: bool,
    leave: bool,
    numdecs: int,
    numincs: int,
    metrics: IterationMetrics,
    accumulators: AmicaAccumulators,
    Dsum: torch.Tensor,
    Dsign: torch.Tensor,
    loglik: torch.Tensor,
    LL: torch.Tensor,
    c_start: float,
    c1: float,
    progress,
    task_id,
    min_dll: float,
    min_nd: float,
):
    """Run the AMICA optimization loop and return updated state and LL history."""
    while metrics.iter <= config.max_iter:
        accumulators.reset()
        loglik.fill_(0.0)
        doing_newton = do_newton and (metrics.iter >= config.newt_start)
        # !----- get determinants
        # The Fortran code computed log|det(W)| indirectly via QR factorization.
        # We use slogdet on the original unmixing matrix to get sign and log|det|.
        _, Dsum = compute_sign_log_determinant(
            unmixing_matrix=state.W,
            minlog=minlog,
        )

        if config.do_reject:
            raise NotImplementedError()  # pragma: no cover
        # !--------- loop over the blocks ----------
        '''
        # In Fortran, the OMP parallel region would start before the lines below.
        # !$OMP PARALLEL DEFAULT(SHARED) &
        # ...
        # !print *, myrank+1, thrdnum+1, ': Inside openmp code ... '; call flush(6)
        '''

        # -- 0. Baseline terms for per-sample model log-likelihood --
        initial = get_initial_model_log_likelihood(
            unmixing_logdet=Dsum,
            whitening_logdet=sldet,
            model_weight=state.gm[0],
        )

        # =============================== Subsection ===================================
        # === Begin chunk loop ===
        # ==============================================================================
        batch_loader = BatchLoader(X, axis=0, batch_size=config.batch_size)
        for batch_idx, (data_batch, batch_indices) in enumerate(batch_loader):

            # ======================================================================
            # Expectation Step (E-step)
            # ======================================================================

            # 1. --- Compute source pre-activations
            # !--- get b
            if state.W.device.type != data_batch.device.type:
                raise ValueError(
                    f"Mismatch between state.W device ({state.W.device}) "
                    f"and data_batch device ({data_batch.device})"
                )
            b = compute_preactivations(
                X=data_batch,
                unmixing_matrix=state.W,
                bias=wc,
                do_reject=config.do_reject,
                n_weights=config.n_components,
            )
            # 2. --- Source densities, and per-sample mixture log-densities (logits)
            y, z = compute_source_densities(
                pdftype=config.pdftype,
                b=b,
                sbeta=state.sbeta,
                mu=state.mu,
                alpha=state.alpha,
                rho=state.rho,
            )
            z0 = z  # log densities (alias for clarity with Fortran code)

            # 3. --- Aggregate mixture logits into per-sample model log likelihoods
            modloglik = torch.full(
                size=(data_batch.shape[0], 1),
                fill_value=initial,
                dtype=config.dtype,
                device=config.device,
            )
            compute_model_loglikelihood_per_sample(
                log_densities=z0,
                out_modloglik=modloglik[:, 0],
            )

            # 4. --- Responsibilities within each component ---
            # !--- get normalized z
            z = compute_mixture_responsibilities(log_densities=z0, inplace=True)
            z0 = None
            del z0  # guard against use of stale name; z owns that memory

            # 5. --- Across-model responsibilities and total log-likelihood ---
            loglik[batch_indices] = compute_total_loglikelihood_per_sample(
                modloglik=modloglik,
                out_loglik=loglik[batch_indices],
            )

            if config.do_reject:
                raise NotImplementedError()  # pragma: no cover
            else:
                # 6. --- Responsibilities for each model ---
                v = compute_model_responsibilities(
                    modloglik=modloglik,
                    out=modloglik,  # reuse modloglik memory
                )
                modloglik = None
                del modloglik  # guard; v owns that memory now
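
            # Illustrative aside (not in the original source): steps 3-6 are the
            # standard log-sum-exp / softmax identities. Assuming ``z0`` holds
            # per-(sample, component, mixture) log densities with the mixture
            # axis last, the kernels compute, in effect:
            #
            #     modloglik += z0.logsumexp(dim=-1).sum(dim=-2)  # per-sample model LL
            #     z = torch.softmax(z0, dim=-1)                  # mixture responsibilities
            #
            # i.e. responsibilities are a softmax over the mixture axis of the
            # per-sample log densities.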

            # ================================ M-STEP ==================================
            # === Maximization-step: Parameter accumulators ===
            # - Update parameters based on current responsibilities
            # - Update unmixing matrices with gradient ascent and Newton-Raphson
            # ==========================================================================

            # !--- get g, u, ufp
            # --------------------------FORTRAN CODE-------------------------
            # vsum = sum( v(bstrt:bstp,h) )
            # dgm_numer_tmp(h) = dgm_numer_tmp(h) + vsum
            # ---------------------------------------------------------------
            model_resps = v[:, 0]  # select responsibilities for this model
            vsum = model_resps.sum()

            # NOTE: u is a view of z, so changes to u will affect z (and vice versa)
            u = compute_weighted_responsibilities(
                mixture_responsibilities=z,
                model_responsibilities=model_resps,
                single_model=True,
            )
            z = None
            del z  # guard against use of stale name; u owns that memory now
            usum = u.sum(dim=0)  # shape: (nw, num_mix)

            fp = compute_source_scores(
                pdftype=config.pdftype,
                y=y,
                rho=state.rho,
            )

            # For SGD, fp only exists to get ufp. Let's overwrite it to save memory.
            ufp = precompute_weighted_scores(
                weighted_responsibilities=u,
                scores=fp,
                out_ufp=fp if not doing_newton else None,
            )
            if not doing_newton:
                fp = None
                del fp  # end of life; ufp owns that memory now

            g = compute_scaled_scores(
                weighted_scores=ufp,
                scales=state.sbeta,
            )

            # --- Stochastic Gradient Descent accumulators ---
            # gm (model weights)
            accumulators.dgm_numer[0] += vsum
            # c (bias)
            accumulate_c_stats(
                X=data_batch,
                model_responsibilities=model_resps,
                vsum=vsum,
                n_weights=config.n_components,
                out_numer=accumulators.dc_numer,
                out_denom=accumulators.dc_denom,
            )
            # Alpha (mixture weights)
            accumulate_alpha_stats(
                usum=usum,
                vsum=vsum,
                out_numer=accumulators.dalpha_numer,
                out_denom=accumulators.dalpha_denom,
            )
            # Mu (location)
            accumulate_mu_stats(
                ufp=ufp,
                y=y,
                sbeta=state.sbeta,
                rho=state.rho,
                out_numer=accumulators.dmu_numer,
                out_denom=accumulators.dmu_denom,
            )
            # Beta (scale/precision)
            accumulate_beta_stats(
                usum=usum,
                rho=state.rho,
                ufp=ufp,
                y=y,
                out_numer=accumulators.dbeta_numer,
                out_denom=accumulators.dbeta_denom,
            )
            # Rho (shape parameter of generalized Gaussian)
            accumulate_rho_stats(
                y=y,
                rho=state.rho,
                u=u,
                usum=usum,
                epsdble=epsdble,
                out_numer=accumulators.drho_numer,
                out_denom=accumulators.drho_denom,
            )
            # --- Newton-Raphson accumulators ---
            if do_newton and metrics.iter >= config.newt_start:
                # NOTE: Fortran computes dsigma_* for all iters, but it's unnecessary
                # Sigma^2 accumulators (noise variance)
                accumulate_sigma2_stats(
                    model_responsibilities=model_resps,
                    source_estimates=b,
                    vsum=vsum,
                    out_numer=accumulators.newton.dsigma2_numer,
                    out_denom=accumulators.newton.dsigma2_denom,
                )
                # Kappa accumulators (curvature terms for A)
                accumulate_kappa_stats(
                    ufp=ufp,
                    fp=fp,
                    sbeta=state.sbeta,
                    usum=usum,
                    out_numer=accumulators.newton.dkappa_numer,
                    out_denom=accumulators.newton.dkappa_denom,
                )
                # Lambda accumulators (nonlinearity shape parameter)
                accumulate_lambda_stats(
                    fp=fp,
                    y=y,
                    u=u,
                    usum=usum,
                    out_numer=accumulators.newton.dlambda_numer,
                    out_denom=accumulators.newton.dlambda_denom,
                )
                # (dbar)Alpha accumulators
                accumulators.newton.dbaralpha_numer[:, :] += usum
                accumulators.newton.dbaralpha_denom[:, :] += vsum
            # end if (do_newton and iteration >= newt_start)

            # if (print_debug .and. (blk == 1) .and. (thrdnum == 0)) then
            # if update_A:
            # --------------------------FORTRAN CODE--------------------------------
            # call DSCAL(nw*nw,dble(0.0),Wtmp2(:,:,thrdnum+1),1)
            # call DGEMM('T','N',nw,nw,tblksize,dble(1.0),g(bstrt:bstp,:),...
            #            dble(1.0),Wtmp2(:,:,thrdnum+1),nw)
            # call DAXPY(nw*nw,dble(1.0),Wtmp2(:,:,thrdnum+1),1,dWtmp(:,:,h),1)
            # ----------------------------------------------------------------------
            accumulators.dA[:, :] += torch.matmul(g.T, b)
        # end do (blk)

        # In Fortran, the OMP parallel region is closed here
        # !$OMP END PARALLEL

        # End of these lifetimes
        del b, g, u, ufp, usum, vsum, v, model_resps, y
        if doing_newton:
            del fp  # already deleted if not doing_newton

        likelihood, ndtmpsum = accum_updates_and_likelihood(
            X=X,
            config=config,
            accumulators=accumulators,
            state=state,
            total_LL=loglik.sum(),
            iteration=metrics.iter,
        )
        metrics.loglik = likelihood
        metrics.ndtmpsum = ndtmpsum
        # return accumulators, metrics

        # ==============================================================================
        ndtmpsum = metrics.ndtmpsum
        LL[metrics.iter - 1] = metrics.loglik

        # !----- display log likelihood of data
        # if (seg_rank == 0) then
        c2 = time.time()
        t0 = c2 - c1
        # if (mod(iter,outstep) == 0) then

        if progress is not None and task_id is not None:
            progress.update(
                task_id,
                completed=metrics.iter,
                ll=f"{float(LL[metrics.iter - 1]):.4f}",
                nd=f"{float(ndtmpsum):.4f}",
                lrate=f"{metrics.lrate:.5f}",
            )

        if config.verbose == 2 and (metrics.iter % outstep) == 0:
            report = (
                f"Iteration {metrics.iter}, "
                f"lrate = {metrics.lrate:.5f}, "
                f"LL = {LL[metrics.iter - 1]:.7f}, "
                f"nd = {ndtmpsum:.7f}, D = {float(Dsum):.5f} "
                f"took {t0:.2f} seconds"
            )
            log(msg=report, level="info", color=None)
        c1 = time.time()

        # !----- check whether likelihood is increasing
        # if (seg_rank == 0) then
        # ! if we get a NaN early, try to reinitialize and start over a few times
        if torch.isnan(LL[metrics.iter - 1]):
            raise RuntimeError(f"Log Likelihood is NaN at iteration {metrics.iter}")
        # end if
        if metrics.iter > 1:
            if LL[metrics.iter - 1] < LL[metrics.iter - 2]:
                log("Likelihood decreasing!", level="warning", color="yellow")
                if (metrics.lrate < minlrate) or (ndtmpsum <= min_nd):
                    leave = True
                    log(
                        "minimum change threshold met, exiting loop",
                        level="info",
                        color="green",
                        weight="bold",
                    )
                else:
                    metrics.lrate *= lratefact
                    metrics.rholrate *= rholratefact
                    numdecs += 1
                    if numdecs >= maxdecs:
                        metrics.lrate0 *= lratefact
                        if metrics.iter > config.newt_start:
                            metrics.rholrate0 *= rholratefact
                        if config.do_newton and metrics.iter > config.newt_start:
                            log(
                                "Reducing maximum Newton lrate",
                                level="info",
                                color="blue",
                            )
                            metrics.newtrate *= lratefact
                        numdecs = 0
                    # end if (numdecs >= maxdecs)
                # end if (lrate vs minlrate)
            # end if LL
            if use_min_dll:
                if (LL[metrics.iter - 1] - LL[metrics.iter - 2]) < min_dll:
                    numincs += 1
                    if numincs > maxincs:
                        leave = True
                        log(
                            "Exiting because likelihood increasing by less than "
                            f"{min_dll} for more than {maxincs} iterations ...",
                            level="info",
                            color="green",
                            weight="bold",
                        )
                else:
                    numincs = 0
            else:
                raise NotImplementedError()  # pragma: no cover
            if use_grad_norm:
                if ndtmpsum < min_nd:
                    leave = True
                    log(
                        "Exiting because norm of weight gradient less than "
                        f"{min_nd:.12f}",
                        level="info",
                        color="green",
                        weight="bold",
                    )
        # end if (iter > 1)
        if config.do_newton and (metrics.iter == config.newt_start):
            log("Starting Newton ... setting numdecs to 0", level="info", color="blue")
            numdecs = 0
        # call MPI_BCAST(leave,1,MPI_LOGICAL,0,seg_comm,ierr)
        # call MPI_BCAST(startover,1,MPI_LOGICAL,0,seg_comm,ierr)
        if leave:
            c_end = time.time()
            log(f"Finished in {c_end - c_start:.2f} seconds", level="info")
            return state, LL
        # else:
        # !----- do accumulators: gm, alpha, mu, sbeta, rho, W
        # the updated lrate & rholrate for the next iteration
        metrics.lrate, metrics.rholrate, state, wc = update_params(
            X=X,
            iteration=metrics.iter,
            config=config,
            state=state,
            accumulators=accumulators,
            lrate=metrics.lrate,
            rholrate=metrics.rholrate,
            lrate0=metrics.lrate0,
            rholrate0=metrics.rholrate0,
            wc=wc,
            newtrate=metrics.newtrate,
        )

        # !----- reject data
        if config.do_reject:
            raise NotImplementedError()  # pragma: no cover

        metrics.iter += 1
        # end if/else
    # end while
    log(
        "Maximum number of iterations reached before convergence."
        " Consider increasing max_iter or relaxing tol.",
        level="warning",
        color="yellow",
        weight="bold",
    )
    c_end = time.time()
    log(f"Finished in {c_end - c_start:.2f} seconds", level="info")
    return state, LL


def accum_updates_and_likelihood(
    *,
    X,
    config,
    accumulators,
    state,
    total_LL,  # this is LLtmp in Fortran
    iteration,
):
    """Use the accumulated arrays to update the log likelihood and ndtmpsum."""
    # !--- add to the cumulative dtmps
    # ...
    # --------------------------FORTRAN CODE-------------------------
    # call MPI_REDUCE(dgm_numer_tmp,dgm_numer,num_models,MPI_DOUBLE_PRECISION,MPI_S...
    # ...
    # if update_A:
    # call MPI_REDUCE(dWtmp,dA,nw*nw*num_models,MPI_DOUBLE_PRECISION,MPI_SUM,0,seg_co...
    nw = config.n_components
    Wtmp_working = torch.zeros(
        (config.n_components, config.n_components),
        dtype=config.dtype, device=config.device,
    )
    # if (seg_rank == 0) then
    if config.do_newton and iteration >= config.newt_start:
        newton_terms = compute_newton_terms(
            accumulators=accumulators, config=config, mu=state.mu
        )

        sigma2 = newton_terms["sigma2"]
        kappa = newton_terms["kappa"]
        lambda_ = newton_terms["lambda_"]
        # if (print_debug) then
    # end if (do_newton .and. iter >= newt_start)

    # --------------------------FORTRAN CODE-------------------------
    # if (print_debug) then
    #     print *, 'dA ', h, ' = '; call flush(6)
    # call DSCAL(nw*nw,dble(-1.0)/dgm_numer(h),dA(:,:,h),1)
    # dA(i,i,h) = dA(i,i,h) + dble(1.0)
    # ---------------------------------------------------------------
    if config.do_reject:
        raise NotImplementedError()  # pragma: no cover
    else:
        accumulators.dA[:, :] *= -1.0 / accumulators.dgm_numer[0]

    # basically the same as np.fill_diagonal where the fill value is diag + 1.0
    diag = accumulators.dA.diagonal()
    idx = torch.arange(nw)
    accumulators.dA[idx, idx] = diag + 1.0
    # if (print_debug) then

    if config.do_newton and iteration >= config.newt_start:
        # --------------------------FORTRAN CODE-------------------------
        # do i = 1,nw ... do k = 1,nw
        #     if (i == k) then
        #         Wtmp(i,i) = dA(i,i,h) / lambda(i,h)
        #     else
        #         sk1 = sigma2(i,h) * kappa(k,h)
        #         sk2 = sigma2(k,h) * kappa(i,h)
        # ---------------------------------------------------------------
        # on-diagonal elements
        diag = accumulators.dA.diagonal()
        fill_values = diag / lambda_
        idx = torch.arange(Wtmp_working.shape[0])
        Wtmp_working[idx, idx] = fill_values

        # off-diagonal elements
        i_indices, k_indices = torch.meshgrid(
            torch.arange(config.n_components, device=config.device),
            torch.arange(config.n_components, device=config.device),
            indexing='ij',
        )
        off_diag_mask = i_indices != k_indices
        sk1 = sigma2[i_indices] * kappa[k_indices]
        sk2 = sigma2[k_indices] * kappa[i_indices]
        positive_mask = sk1 * sk2 > 0.0
        if torch.any(~positive_mask):
            raise RuntimeError(
                "Non-positive definite Hessian encountered in Newton update. "
                f"Iteration {iteration}. Try setting do_newton to False."
            )
        condition_mask = positive_mask & off_diag_mask
        if torch.any(condition_mask):
            # Wtmp(i,k) = (sk1*dA(i,k,h) - dA(k,i,h)) / (sk1*sk2 - dble(1.0))
            numerator = (
                sk1
                * accumulators.dA[i_indices, k_indices]
                - accumulators.dA[k_indices, i_indices]
            )
            denominator = sk1 * sk2 - 1.0
            Wtmp_working[condition_mask] = (numerator / denominator)[condition_mask]
        # end if (i == k)
        # end do (k)
        # end do (i)
    # end if (do_newton .and. iter >= newt_start)
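
    # Illustrative aside (not in the original source): the off-diagonal update
    # above is Cramer's rule on the 2x2 Hessian block coupling the (i, k) and
    # (k, i) entries. With x = Wtmp[i, k] and y = Wtmp[k, i], the vectorized
    # code solves, in effect,
    #
    #     sk2 * x + y = dA[i, k]
    #     x + sk1 * y = dA[k, i]
    #
    # whose solution is x = (sk1 * dA[i, k] - dA[k, i]) / (sk1 * sk2 - 1).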
    if (not config.do_newton) or (iteration < config.newt_start):
        # Wtmp = dA(:,:,h)
        assert Wtmp_working.shape == accumulators.dA.shape == (nw, nw)
        Wtmp_working = accumulators.dA.clone()
    assert Wtmp_working.shape == (nw, nw)
    # --------------------------FORTRAN CODE-------------------------
    # call DSCAL(nw*nw,dble(0.0),dA(:,:,h),1)
    # call DGEMM('N','N',nw,nw,nw,dble(1.0),A(:,comp_list(:,h)),nw,Wtmp,nw,dble...
    # ---------------------------------------------------------------
    accumulators.dA[:, :] = 0.0
    accumulators.dA[:, :] += torch.matmul(state.A, Wtmp_working)

    zeta = torch.zeros(config.n_components, dtype=config.dtype, device=config.device)
    # --------------------------FORTRAN CODE-------------------------
    # dAk(:,comp_list(i,h)) = dAk(:,comp_list(i,h)) + gm(h)*dA(:,i,h)
    # zeta(comp_list(i,h)) = zeta(comp_list(i,h)) + gm(h)
    # ---------------------------------------------------------------
    source_columns = state.gm[0] * accumulators.dA
    accumulators.dAK[:, :] += source_columns
    zeta[:] += state.gm[0]

    # --------------------------FORTRAN CODE-------------------------
    # dAk(:,k) = dAk(:,k) / zeta(k)
    # nd(iter,:) = sum(dAk*dAk,1)
    # ndtmpsum = sqrt(sum(nd(iter,:),mask=comp_used) / (nw*count(comp_used)))
    # ---------------------------------------------------------------
    accumulators.dAK[:, :] /= zeta  # broadcasting division
    # nd is (num_iters, num_comps) in Fortran, but we only store the current iteration
    nd = torch.sum(accumulators.dAK * accumulators.dAK, dim=0)
    assert nd.shape == (config.n_components,)

    # comp_used should be a vector of True.
    # In Fortran, comp_used was based on component availability,
    # unless identify_shared_comps was run. I have no plans to implement that.
    comp_used = torch.ones(config.n_components, dtype=torch.bool)
    assert isinstance(comp_used, torch.Tensor)
    assert comp_used.shape == (config.n_components,)
    assert comp_used.dtype == torch.bool
    ndtmpsum = torch.sqrt(torch.sum(nd) / (nw * torch.count_nonzero(comp_used)))
    # end if (update_A)
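
    # Illustrative aside (not in the original source): ``ndtmpsum`` is the RMS
    # of the entries of the natural-gradient matrix dAK. For example, with
    # nw = 2 and per-column sums of squares nd = [0.02, 0.06]:
    #
    #     ndtmpsum = sqrt((0.02 + 0.06) / (2 * 2)) = sqrt(0.02) ≈ 0.1414
    #
    # The main loop compares this against ``min_nd`` (i.e. ``tol``) to decide
    # convergence.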

    # if (seg_rank == 0) then
    if config.do_reject:
        raise NotImplementedError()  # pragma: no cover
    else:
        # LL(iter) = LLtmp2 / dble(all_blks*nw)
        # XXX: In the Fortran code LLtmp2 is the summed LLtmps across processes.
        likelihood = total_LL / (X.shape[0] * nw)
    return (likelihood, ndtmpsum)


def update_params(
    *,
    X,
    iteration,
    config,
    state,
    accumulators,
    lrate,
    rholrate,
    lrate0,
    rholrate0,
    newtrate,
    wc,
):
    """Update the learnable ICA parameters and learning rates."""
    # if (seg_rank == 0) then
    # if update_gm:
    if config.do_reject:
        raise NotImplementedError()  # pragma: no cover
        # gm = dgm_numer / dble(numgoodsum)
    else:
        state.gm[:] = accumulators.dgm_numer / X.shape[0]
    # end if (update_gm)

    # if update_alpha:
    # assert alpha.shape == (num_comps, num_mix)
    state.alpha[:, :] = accumulators.dalpha_numer / accumulators.dalpha_denom
    if torch.any(~torch.isfinite(state.alpha)):
        raise RuntimeError("Non-finite alpha encountered during update.")

    # if update_c:
    # assert c.shape == (nw, num_models)
    state.c[:] = accumulators.dc_numer / accumulators.dc_denom
    if torch.any(~torch.isfinite(state.c)):
        raise RuntimeError("Non-finite c encountered during update.")

    # === Section: Apply Parameter accumulators & Rescale ===
    # Apply accumulated statistics to update parameters, then rescale and refresh W/wc.
    # !print *, 'updating A ...'; call flush(6)
    if iteration < share_start or (iteration % share_iter > 5):
        if config.do_newton and (iteration >= config.newt_start):
            # lrate = min( newtrate, lrate + min(dble(1.0)/dble(newt_ramp),lrate) )
            # rholrate = rholrate0
            # call DAXPY(nw*num_comps,dble(-1.0)*lrate,dAk,1,A,1)
            lrate = min(newtrate, lrate + min(1.0 / config.newt_ramp, lrate))
            rholrate = rholrate0
            state.A -= lrate * accumulators.dAK
        else:
            lrate = min(lrate0, lrate + min(1 / config.newt_ramp, lrate))
            rholrate = rholrate0
            # call DAXPY(nw*num_comps,dble(-1.0)*lrate,dAk,1,A,1)
            state.A -= lrate * accumulators.dAK
        # end if do_newton
    # end if (update_A)

    # if update_mu:
    state.mu += accumulators.dmu_numer / accumulators.dmu_denom
    if torch.any(~torch.isfinite(state.mu)):
        raise RuntimeError("Non-finite mu encountered during update.")

    # if update_beta:
    state.sbeta *= torch.sqrt(accumulators.dbeta_numer / accumulators.dbeta_denom)
    sbetatmp = torch.minimum(torch.tensor(invsigmax), state.sbeta)
    state.sbeta = torch.maximum(torch.tensor(invsigmin), sbetatmp)
    if torch.any(~torch.isfinite(state.sbeta)):
        raise RuntimeError("Non-finite sbeta encountered during update.")

    state.rho += (
        rholrate
        * (
            1.0
            - (state.rho / torch.special.psi(1.0 + 1.0 / state.rho))
            * accumulators.drho_numer
            / accumulators.drho_denom
        )
    )
    rhotmp = torch.minimum(torch.tensor(maxrho), state.rho)  # shape (num_comps, num_mix)
    assert rhotmp.shape == (config.n_components, config.n_mixtures)
    state.rho = torch.maximum(torch.tensor(minrho), rhotmp)

    # !--- rescale
    # !print *, 'rescaling A ...'; call flush(6)
    if doscaling:
        # Calculate the L2 norm of each column of A, then use it to normalize
        # that column and scale the corresponding columns in mu and sbeta, but
        # only if the norm is positive.
        Anrmk = torch.linalg.norm(state.A, dim=0)
        positive_mask = Anrmk > 0
        if positive_mask.all():
            state.A[:, positive_mask] /= Anrmk[positive_mask]
            state.mu[positive_mask, :] *= Anrmk[positive_mask, None]
            state.sbeta[positive_mask, :] /= Anrmk[positive_mask, None]
        else:
            raise NotImplementedError()  # pragma: no cover
    # end if (doscaling)
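
    # Illustrative aside (not in the original source): this rescaling leaves
    # the modeled source densities unchanged. If a column of A is divided by
    # its norm n, the corresponding source estimate s is scaled by n;
    # compensating with mu -> n * mu and sbeta -> sbeta / n keeps the
    # standardized activation invariant (assuming y = sbeta * (s - mu)):
    #
    #     (sbeta / n) * (n * s - n * mu) == sbeta * (s - mu)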

    if share_comps:
        raise NotImplementedError()  # pragma: no cover

    state.W, wc = get_unmixing_matrices(
        c=state.c,
        A=state.A,
        W=state.W,
    )
    # if (print_debug) then
    # call MPI_BCAST(gm,num_models,MPI_DOUBLE_PRECISION,0,seg_comm,ierr)
    # ...
    return lrate, rholrate, state, wc