maradoner 0.10__tar.gz → 0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maradoner might be problematic.

Files changed (25)
  1. {maradoner-0.10 → maradoner-0.11}/PKG-INFO +1 -1
  2. {maradoner-0.10 → maradoner-0.11}/maradoner/__init__.py +1 -1
  3. {maradoner-0.10 → maradoner-0.11}/maradoner/create.py +2 -0
  4. {maradoner-0.10 → maradoner-0.11}/maradoner/dataset_filter.py +1 -0
  5. {maradoner-0.10 → maradoner-0.11}/maradoner/export.py +45 -18
  6. {maradoner-0.10 → maradoner-0.11}/maradoner/fit.py +153 -41
  7. {maradoner-0.10 → maradoner-0.11}/maradoner/main.py +2 -2
  8. {maradoner-0.10 → maradoner-0.11}/maradoner/mara/export.py +5 -6
  9. {maradoner-0.10 → maradoner-0.11}/maradoner/mara/fit.py +50 -33
  10. {maradoner-0.10 → maradoner-0.11}/maradoner/mara/main.py +18 -13
  11. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/PKG-INFO +1 -1
  12. {maradoner-0.10 → maradoner-0.11}/README.md +0 -0
  13. {maradoner-0.10 → maradoner-0.11}/maradoner/mara/__init__.py +0 -0
  14. {maradoner-0.10 → maradoner-0.11}/maradoner/mara.py +0 -0
  15. {maradoner-0.10 → maradoner-0.11}/maradoner/meta_optimizer.py +0 -0
  16. {maradoner-0.10 → maradoner-0.11}/maradoner/select.py +0 -0
  17. {maradoner-0.10 → maradoner-0.11}/maradoner/synthetic_data.py +0 -0
  18. {maradoner-0.10 → maradoner-0.11}/maradoner/utils.py +0 -0
  19. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/SOURCES.txt +0 -0
  20. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/dependency_links.txt +0 -0
  21. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/entry_points.txt +0 -0
  22. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/requires.txt +0 -0
  23. {maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/top_level.txt +0 -0
  24. {maradoner-0.10 → maradoner-0.11}/setup.cfg +0 -0
  25. {maradoner-0.10 → maradoner-0.11}/setup.py +0 -0
{maradoner-0.10 → maradoner-0.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: maradoner
-Version: 0.10
+Version: 0.11
 Summary: Variance-adjusted estimation of motif activities.
 Home-page: https://github.com/autosome-ru/nemara
 Author: Georgy Meshcheryakov
{maradoner-0.10 → maradoner-0.11}/maradoner/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-__version__ = '0.10'
+__version__ = '0.11'
 import importlib
 
 
{maradoner-0.10 → maradoner-0.11}/maradoner/create.py
@@ -88,6 +88,7 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
                      f'{len(loading_matrix_transformations)}.')
 
     logger_print('Filtering promoters of low expression...', verbose)
+    print('aaaaa', len(promoter_expression))
     inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename)
     promoter_expression = promoter_expression.loc[inds]
     proms = promoter_expression.index
@@ -115,6 +116,7 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
         motif_expression = None
     loading_matrices = pd.concat(loading_matrices, axis=1)
     if motif_names is not None:
+        motif_names = list(set(motif_names) & set(loading_matrices.columns))
         loading_matrices = loading_matrices[motif_names]
     proms = list(promoter_expression.index)
     sample_names = list(promoter_expression.columns)
{maradoner-0.10 → maradoner-0.11}/maradoner/dataset_filter.py
@@ -105,5 +105,6 @@ def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, fit_plot_filename=None,
     inds[:k] = False
     # print(inds)
     # inds[:] = 1
+    print(x[inds].mean(), x[~inds].mean())
     inds = inds[inds_inv]
     return inds, ws
{maradoner-0.10 → maradoner-0.11}/maradoner/export.py
@@ -159,7 +159,7 @@ def export_fov(fovs: tuple[FOVResult], folder: str,
 
 
 def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
-                    B: np.ndarray, corr_stat=False):
+                    B: np.ndarray, corr_stat=False, map_cov=False):
     precs = list()
     istds = list()
     covs = list()
@@ -170,20 +170,28 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
170
170
  # mot = np.delete(mot, activities.filtered_motifs)
171
171
  # ind = mot * nu < cov.diagonal() + 1e-9
172
172
  # bad_inds[ind] = True
173
-
174
- for cov, U, nu in zip(activities.cov(), activities.U.T, fit.motif_variance.group):
175
- mot = fit.motif_variance.motif
176
- mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
173
+ # mot = fit.motif_variance.motif
174
+ # mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
175
+ if map_cov:
176
+ # fit.motif_variance.m
177
+ BTB = B.T @ B
178
+ BTB_s = BTB * fit.motif_variance.motif ** 0.5
179
+ BTB_s = BTB_s @ BTB_s.T
180
+ for cov, U, sigma, n, nu in zip(activities.cov(), activities.U.T,
181
+ activities._cov[-2],
182
+ fit.error_variance.variance, fit.motif_variance.group):
177
183
  # cov = cov[~bad_inds, ~bad_inds]
178
- cov = cov[..., ~bad_inds]
179
- cov = cov[~bad_inds]
184
+ # cov = cov[..., ~bad_inds]
185
+ # cov = cov[~bad_inds]
186
+ if map_cov:
187
+ D = BTB_s * nu + np.identity(len(BTB)) * sigma
188
+ cov = cov @ D @ cov.T * n / sigma ** 2
180
189
  covs.append(cov)
181
- U = U[~bad_inds]
190
+ # U = U[~bad_inds]
182
191
  # prec = np.linalg.inv(np.diag(mot * nu) - cov)
183
192
  prec = np.linalg.inv(cov)
184
193
  mean += prec @ U
185
194
  precs.append(prec)
186
- print(bad_inds.sum())
187
195
  total_prec = sum(precs)
188
196
  total_cov = np.linalg.inv(total_prec)
189
197
  mean = total_cov @ mean
@@ -210,9 +218,7 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
 def export_results(project_name: str, output_folder: str,
                    std_mode: Standardization,
                    anova_mode: ANOVAType=ANOVAType.positive,
-                   compute_corrected_pvalues=False,
-                   corrected_numerical=False,
-                   corrected_num_samples=1e5,
+                   weighted_zscore=False,
                    alpha=0.05,
                    n_jobs=6):
 
@@ -324,12 +330,11 @@ def export_results(project_name: str, output_folder: str,
         pval = calc_z_test(anova_ass)
 
         fdrs = multitest.multipletests(pval, alpha=0.05, method='fdr_bh')[1]
-        lrt = 2 * fit.motif_variance.logratios
-        lrt_pvalues = chi2.sf(lrt, 1)
-        lrt_fdr = multitest.multipletests(lrt_pvalues, alpha=0.05, method='fdr_bh')[1]
-        anova_ass = DF(np.array([anova_ass, pval, fdrs, lrt, lrt_pvalues, lrt_fdr]).T, index=motif_names_filtered,
-                       columns=['stat', 'p-value', 'FDR',
-                                'logratio', 'lrt_p-value', 'lrt_FDR'])
+        # lrt = 2 * fit.motif_variance.logratios
+        # lrt_pvalues = chi2.sf(lrt, 1)
+        # lrt_fdr = multitest.multipletests(lrt_pvalues, alpha=0.05, method='fdr_bh')[1]
+        anova_ass = DF(np.array([anova_ass, pval, fdrs]).T, index=motif_names_filtered,
+                       columns=['stat', 'p-value', 'FDR'])
         anova_ass.to_csv(os.path.join(folder, 'anova.tsv'), sep='\t')
 
         sign = motif_mean.flatten() / motif_mean_std
@@ -347,6 +352,28 @@ def export_results(project_name: str, output_folder: str,
                        index=motif_names)
     sign_ass.to_csv(os.path.join(folder, 'sign.tsv'), sep='\t')
 
+    folder = os.path.join(output_folder, 'activities')
+    os.makedirs(folder, exist_ok=True)
+    U = list()
+    stds = list()
+    for u, cov in zip(act.U.T, act.cov()):
+        std = cov.diagonal() ** 0.5
+        u = u / std
+        U.append(u)
+        stds.append(std)
+    U = np.array(U).T
+    DF(U, index=motif_names_filtered, columns=group_names).to_csv(os.path.join(folder, 'activity.tsv'), sep='\t')
+    U = U ** 2
+    if weighted_zscore:
+        U_total = U.sum(axis=1, keepdims=True) / (1 / np.array(stds).T ** 2).sum(axis=1, keepdims=True)
+    else:
+        U_total = U.mean(axis=1, keepdims=True)
+
+    U = np.hstack((U_total, U)) ** 0.5
+    DF(U, index=motif_names_filtered,
+       columns=['overall'] + list(group_names)).to_csv(os.path.join(folder, 'z_score.tsv'), sep='\t')
+    DF(act.U_raw, index=motif_names_filtered, columns=data.sample_names).to_csv(os.path.join(folder, 'activity_raw.tsv'), sep='\t')
+
     if os.path.isfile(f'{project_name}.fov.{fmt}'):
         with open(f'{project_name}.fov.{fmt}', 'rb') as f:
             fov = dill.load(f)
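
Note on the new activities export: each group's posterior mean activity is divided by its posterior standard deviation, and the per-group z-scores are combined into an 'overall' column by root mean square, with an optional precision-weighted variant. A minimal sketch of the aggregation on made-up arrays (illustrative, not package code):

    import numpy as np

    # Toy per-group posterior means (motifs x groups) and posterior stds.
    U = np.array([[1.0, 2.0], [0.5, -1.5]])
    stds = np.array([[0.5, 1.0], [0.25, 0.75]])

    Z2 = (U / stds) ** 2                        # squared per-group z-scores
    # Default: overall z-score is the root mean square across groups.
    rms = Z2.mean(axis=1, keepdims=True) ** 0.5
    # weighted_zscore=True: normalize the summed squares by the total
    # precision (sum of 1/std^2) instead of the group count.
    weighted = (Z2.sum(axis=1, keepdims=True)
                / (1 / stds ** 2).sum(axis=1, keepdims=True)) ** 0.5
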
{maradoner-0.10 → maradoner-0.11}/maradoner/fit.py
@@ -1,6 +1,7 @@
 import numpy as np
 import jax.numpy as jnp
 import jax
+import scipy.linalg.lapack as lapack
 from sklearn.cluster import KMeans
 from sklearn.decomposition import NMF
 from dataclasses import dataclass
@@ -27,7 +28,67 @@ class LowrankDecomposition:
     Q: np.ndarray
     S: np.ndarray
     V: np.ndarray
-    null_Q: np.ndarray
+
+    def null_space_transform(self, Y: np.ndarray) -> np.ndarray:
+        """
+        Compute V^T Y where V is the orthogonal complement to Q, using Householder
+        transformations via LAPACK's dormqr. Ensures inputs are compatible.
+
+        Parameters:
+            Q (ndarray): p x r semi-orthogonal matrix where Q^T Q = I_r, r <= p.
+                Should be a standard float array (e.g., float64).
+            Y (ndarray): p x n matrix. Will be converted to float64 if necessary.
+
+        Returns:
+            VT_Y (ndarray): (p - r) x n matrix representing V^T Y (float64).
+        """
+        Y = np.array(Y, order='F', copy=True)
+        Q = np.array(self.Q).astype(np.float64, copy=False)
+
+        p, r = Q.shape
+
+        if r > p:
+            raise ValueError(f"Number of columns r ({r}) cannot exceed number of rows p ({p}) in Q.")
+
+        # 1. Compute QR factorization of Q
+        # Need a copy of Q because 'raw' QR might modify it slightly in some versions/backends,
+        # even though documentation often says it doesn't. Using overwrite_a=True below is safer.
+        Q_copy = np.array(Q, order='F', dtype=np.float64)  # Fortran order often preferred by LAPACK
+        qr_a, tau, work_qr, info_qr = lapack.dgeqrf(Q_copy, overwrite_a=True)
+        if info_qr != 0:
+            raise RuntimeError(f"LAPACK dgeqrf failed with info = {info_qr}")
+        # qr_a now contains R in upper triangle and reflectors below diagonal (overwritten Q_copy)
+
+        # 2. Prepare matrix Z (to be modified by dormqr)
+
+        # 3. Apply Q_full^T to Z using dormqr
+        # Workspace query
+        # try:
+        lwork = -1
+        # Use Z's shape here for the query, pass dummy Z
+        _, work_query, _ = lapack.dormqr('L', 'T', qr_a, tau, np.empty_like(Y), lwork=lwork, overwrite_c=True)
+        optimal_lwork = int(work_query[0].real)
+        lwork = max(1, optimal_lwork)
+
+
+        # Actual application
+        q_mult_y, work_actual, info_ormqr = lapack.dormqr('L', 'T', qr_a, tau, Y,
+                                                          lwork=lwork, overwrite_c=True)
+
+        if info_ormqr != 0:
+            # Add more debug info if it fails
+            print("--- Debug Info Before dormqr Failure ---")
+            print(f"Q shape: {Q.shape}, dtype: {Q.dtype}")
+            print(f"qr_a shape: {qr_a.shape}, dtype: {qr_a.dtype}, order: {'F' if qr_a.flags.f_contiguous else 'C'}")
+            print(f"tau shape: {tau.shape}, dtype: {tau.dtype}")
+            print(f"Y shape: {Y.shape}, dtype: {Y.dtype}, order: {'F' if Y.flags.f_contiguous else 'C'}")
+            print(f"lwork: {lwork}")
+            print("--- End Debug Info ---")
+            raise RuntimeError(f"LAPACK dormqr failed with info = {info_ormqr}")
+
+        VT_Y = q_mult_y[r:, :]
+        return VT_Y
+    #null_Q: np.ndarray
 
 @dataclass
 class TransformedData:
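
Dropping the stored null_Q in favor of this Householder-based transform avoids materializing the p x (p - r) null-space basis. Applying the full orthogonal factor Q_full^T from dgeqrf/dormqr and keeping the last p - r rows yields V^T Y for some orthonormal basis V of the complement of Q's column space; V is only determined up to rotation and sign, but norm-based quantities used downstream (variances of the projected data) are unchanged. A NumPy-only sketch of the identity being relied on (illustrative; not the LAPACK code path):

    import numpy as np

    rng = np.random.default_rng(0)
    p, r, n = 8, 3, 5
    Q, _ = np.linalg.qr(rng.normal(size=(p, r)))   # semi-orthogonal: Q.T @ Q = I_r
    Y = rng.normal(size=(p, n))

    # Householder route: QR of Q, apply the full Q_full^T, keep the last p - r rows.
    Q_full, _ = np.linalg.qr(Q, mode='complete')
    VT_Y = (Q_full.T @ Y)[r:]

    # The basis is defined only up to rotation/sign, but the projection agrees:
    P_null = np.eye(p) - Q @ Q.T
    assert np.allclose(VT_Y.T @ VT_Y, Y.T @ P_null @ Y)
    # Column norms (hence the variances used downstream) are preserved too.
    assert np.allclose((VT_Y ** 2).sum(axis=0), (P_null @ Y * Y).sum(axis=0))
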
@@ -52,7 +113,6 @@ class MotifVarianceEstimates:
     fixed_group: int
     loglik: float
     loglik_start: float
-    logratios: np.ndarray
 
 @dataclass(frozen=True)
 class MotifMeanEstimates:
@@ -87,9 +147,60 @@ def ones_nullspace(n: int):
         res[i - 1, i] = 1 / norm
     return res
 
+def ones_nullspace_transform(x):
+    n, m = x.shape
+    if n <= 1:
+        return np.zeros((0, m), dtype=x.dtype)
+
+    Y = np.zeros((n - 1, m), dtype=float)
+    current_sum = x[0, :].astype(float)
+
+    for r in range(n - 1):
+        i = r + 1
+        sqrt_i_i_plus_1 = np.sqrt(i * (i + 1))
+
+        # Coefficients for row r of Y (which uses row i-1 = r of H)
+        coeff1 = -1.0 / sqrt_i_i_plus_1
+        coeff2 = np.sqrt(i / (i + 1))
+        Y[r, :] = coeff1 * current_sum + coeff2 * x[r + 1, :]
+
+        # Update current_sum for the next iteration (to become sum_{k=0}^{r+1} X[k,:])
+        if r < n - 2:  # Avoid adding beyond X's bounds on the last iteration
+            current_sum += x[r + 1, :]
+    return Y
+
+def ones_nullspace_transform_transpose(X: np.ndarray) -> np.ndarray:
+    n, m = X.shape
+    n = n + 1
+
+    if n == 1:
+        output_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else float
+        return np.zeros((1, m), dtype=output_dtype)
+
+    output_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else float
+    Y = np.zeros((n, m), dtype=output_dtype)
+
+    current_suffix_sum = np.zeros(m, dtype=output_dtype)
+
+    for k in range(n - 2, -1, -1):
+        i = k + 1.0
+
+        sqrt_term_i_ip1 = np.sqrt(i * (i + 1.0))
+        coeff_pos = i / sqrt_term_i_ip1
+        coeff_neg = -1.0 / sqrt_term_i_ip1
+
+
+        Y[k + 1, :] = coeff_pos * X[k, :] + current_suffix_sum
+
+        current_suffix_sum += coeff_neg * X[k, :]
+
+    Y[0, :] = current_suffix_sum
+
+    return Y
+
 def lowrank_decomposition(X: np.ndarray, rel_eps=1e-12) -> LowrankDecomposition:
     svd = jnp.linalg.svd
-    q, s, v = [np.array(t) for t in svd(X)]
+    q, s, v = [np.array(t) for t in svd(X, full_matrices=False)]
     max_sv = max(s)
     n = len(s)
     for r in range(n):
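
Both loops apply the Helmert contrast (the basis that ones_nullspace returns) matrix-free via a running prefix/suffix sum, turning an O(n^2 m) multiplication into O(nm) without building the (n - 1) x n matrix. A self-contained sketch of the forward direction checked against an explicit Helmert matrix (re-derived here, not imported from the package):

    import numpy as np

    def helmert_rows(n):
        # Explicit (n-1) x n Helmert basis of the null space of the all-ones
        # vector: row r is (-1, ..., -1, r+1, 0, ..., 0) / sqrt((r+1)(r+2)).
        H = np.zeros((n - 1, n))
        for r in range(n - 1):
            i = r + 1
            H[r, :i] = -1.0 / np.sqrt(i * (i + 1))
            H[r, i] = i / np.sqrt(i * (i + 1))
        return H

    def helmert_apply(x):
        # Matrix-free H @ x with a running prefix sum, as in
        # ones_nullspace_transform above.
        n, m = x.shape
        y = np.zeros((n - 1, m))
        s = x[0].astype(float).copy()
        for r in range(n - 1):
            i = r + 1
            y[r] = (-s + i * x[r + 1]) / np.sqrt(i * (i + 1))
            s += x[r + 1]
        return y

    rng = np.random.default_rng(1)
    X = rng.normal(size=(6, 4))
    H = helmert_rows(6)
    assert np.allclose(H @ H.T, np.eye(5))        # orthonormal rows
    assert np.allclose(H.sum(axis=1), 0)          # orthogonal to the ones vector
    assert np.allclose(helmert_apply(X), H @ X)   # O(nm) loop matches the matmul
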
@@ -98,10 +209,9 @@ def lowrank_decomposition(X: np.ndarray, rel_eps=1e-12) -> LowrankDecomposition:
             break
         r += 1
     s = s[:r]
-    null_q = q[:, r:]
     q = q[:, :r]
     v = v[:r]
-    return LowrankDecomposition(q, s, v, null_q)
+    return LowrankDecomposition(q, s, v)
 
 def transform_data(data, std_y=False, std_b=False, helmert=True) -> TransformedData:
     try:
@@ -115,9 +225,11 @@ def transform_data(data, std_y=False, std_b=False, helmert=True) -> TransformedD
     if std_b:
         B /= B.std(axis=0, keepdims=True)
     if helmert:
-        F_p = ones_nullspace(len(Y))
-        Y = F_p @ Y
-        B = F_p @ B
+        # F_p = ones_nullspace(len(Y))
+        # Y = F_p @ Y
+        # B = F_p @ B
+        Y = ones_nullspace_transform(Y)
+        B = ones_nullspace_transform(B)
     group_inds_inv = list()
     d = dict()
     for i, items in enumerate(group_inds):
@@ -346,9 +458,24 @@ def loglik_motifs_fim(x: jnp.ndarray, BTB: jnp.ndarray,
     return FIM
 
 
+def calc_error_variance_fim(data: TransformedData, error_variance: jnp.ndarray):
+    d = 1 / jnp.array(error_variance).at[data.group_inds_inv].get()
+    d = d / d.sum() ** 0.5
+    D_product_inv = jnp.outer(-d, d)
+    D_product_inv = jnp.fill_diagonal(D_product_inv,
+                                      D_product_inv.diagonal() + d * d.sum(),
+                                      inplace=False)
+    fim = D_product_inv * D_product_inv.T / 2
+    group_inds = data.group_inds
+    group_loadings = np.zeros((len(d), len(group_inds)), dtype=int)
+    for i, indices in enumerate(group_inds):
+        group_loadings[indices, i] = 1
+    group_loadings = jnp.array(group_loadings)
+    return group_loadings.T @ fim @ group_loadings
+
 def estimate_error_variance(data: TransformedData, B_decomposition: LowrankDecomposition,
                             verbose=False) -> ErrorVarianceEstimates:
-    Y = B_decomposition.null_Q.T @ data.Y
+    Y = B_decomposition.null_space_transform(data.Y)
     d0 = jnp.array([np.var(Y[:, inds]) for inds in data.group_inds])
 
     fun = partial(loglik_error, Qn_Y=Y, group_inds_inv=data.group_inds_inv)
@@ -362,7 +489,8 @@ def estimate_error_variance(data: TransformedData, B_decom
         print('-' * 15)
         print(res)
         print('-' * 15)
-    fim = jax.jacrev(grad)(res.x)
+
+    fim = calc_error_variance_fim(data, res.x)
    return ErrorVarianceEstimates(np.array(res.x), np.array(fim),
                                  loglik_start=res.start_loglik,
                                  loglik=res.fun)
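
The observed information was previously formed by reverse-over-reverse autodiff (jax.jacrev of the gradient); calc_error_variance_fim replaces that with a closed-form expression. A toy comparison of the two routes on a hypothetical, separable negative log-likelihood (illustrative only; this is not the model used in the package):

    import jax
    import jax.numpy as jnp

    # Toy objective in two variance parameters, chosen so the analytic
    # Hessian is easy to write down: f(x) = sum(log x) + sum(1/x).
    def negloglik(x):
        return jnp.sum(jnp.log(x)) + jnp.sum(1.0 / x)

    grad = jax.grad(negloglik)
    x0 = jnp.array([0.5, 2.0])
    hessian_autodiff = jax.jacrev(grad)(x0)   # the old jacrev-of-grad route
    # Analytic Hessian of the separable toy objective: diag(-1/x^2 + 2/x^3).
    hessian_analytic = jnp.diag(-1.0 / x0 ** 2 + 2.0 / x0 ** 3)
    assert jnp.allclose(hessian_autodiff, hessian_analytic)
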
@@ -374,13 +502,16 @@ def estimate_promoter_mean(data: TransformedData,
 
     D = error_variance.variance[data.group_inds_inv]
     Y = jnp.array(data.Y)
-    F_p = jnp.array(ones_nullspace(len(Y) + 1))
-    Q_N = jnp.array(B_decomposition.null_Q)
+    # F_p = jnp.array(ones_nullspace(len(Y) + 1))
+    # Q_N = jnp.array(B_decomposition.null_Q)
+    Q_C = jnp.array(B_decomposition.Q)
     w = (1 / D).sum()
     mean = Y @ (1 / D.reshape(-1, 1))
-    mean = Q_N.T @ mean
-    mean = Q_N @ mean
-    mean = F_p.T @ mean
+    mean = mean - Q_C @ (Q_C.T @ mean)
+    # mean = Q_N.T @ mean
+    # mean = Q_N @ mean
+    # mean = F_p.T @ mean
+    mean = ones_nullspace_transform_transpose(mean)
     mean = mean / w
     return PromoterMeanEstimates(mean)
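
The rewritten projection uses the identity Q_N Q_N^T = I - Q Q^T, so only the kept factor Q is needed and null_Q can be dropped entirely. A quick NumPy check of that identity:

    import numpy as np

    rng = np.random.default_rng(2)
    p, r = 7, 3
    Q_full, _ = np.linalg.qr(rng.normal(size=(p, p)))
    Q_C, Q_N = Q_full[:, :r], Q_full[:, r:]   # column space and its complement

    v = rng.normal(size=(p, 1))
    old = Q_N @ (Q_N.T @ v)           # old route: explicit null-space projector
    new = v - Q_C @ (Q_C.T @ v)       # new route: subtract the column-space part
    assert np.allclose(old, new)      # Q_N Q_N^T == I - Q_C Q_C^T
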
 
@@ -437,12 +568,14 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
     eig = jnp.linalg.eigh(f)[0].min()
     if eig < 0:
         eig = list()
-        epsilons = [1e-15, 1e-12, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
+        epsilons = [1e-23, 1e-15, 1e-12, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
         for eps in epsilons:
             x = res.x.copy()
             x = x.at[:len(BTB)].set(jnp.clip(x.at[:len(BTB)].get(), eps, float('inf')))
             f = fim(x)
             eig.append(jnp.linalg.eigh(f)[0].min())
+            if eig[-1] > 0:
+                break
         i = np.argmax(eig)
         eps = epsilons[i]
         x = res.x.copy()
@@ -450,31 +583,9 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
         fim = fim(x)
     else:
         fim = f
-    print('FIM', eig)
-    logliks = list()
-    from tqdm import tqdm
-    for i in tqdm(list(range(len(BTB)))):
-        x = res.x.copy()
-        x = x.at[i].set(0)
-        subfun = partial(fun, _motif_zero=i)
-        subgrad = partial(grad, _motif_zero=i)
-        opt = MetaOptimizer(subfun, subgrad, num_steps_momentum=5, skip_init=False)
-        logliks.append(opt.optimize(x).fun)
-    logliks = np.array(logliks) - float(res.fun)
-    # fim_naive = partial(loglik_motifs_fim_naive, B=data.B, D=D,
-    #                     group_inds_inv=data.group_inds_inv, group_inds=data.group_inds,
-    #                     G_fix_ind=j, G_fix_val=fix)
-    # fim_naive = fim_naive(res.x)
-    # print('FIM')
-    # print(fim)
-    # print('Naive')
-    # print(fim_naive)
-    # print(np.abs(fim - fim_naive) / np.abs(fim_naive))
-    # fim = fim_naive
-    # fim = (fim, fim_naive)
     return MotifVarianceEstimates(motif=np.array(Sigma), group=np.array(G), fim=np.array(fim),
                                   fixed_group=j, loglik_start=res.start_loglik,
-                                  loglik=res.fun, logratios=logliks)
+                                  loglik=res.fun)
 
 def estimate_motif_mean(data: TransformedData, B_decomposition: LowrankDecomposition,
                         error_variance: ErrorVarianceEstimates,
@@ -494,8 +605,9 @@ def estimate_motif_mean(data: TransformedData, B_decomposi
 
     BTB = B_decomposition.V.T * B_decomposition.S ** 2 @ B_decomposition.V
     A = jnp.sqrt(Sigma).reshape(-1, 1) * BTB
-    Fp = ones_nullspace(len(data.Y) + 1)
-    Y_tilde = (data.Y - Fp @ mu_p.reshape(-1, 1)) / d
+    # Fp = ones_nullspace(len(data.Y) + 1)
+    # Y_tilde = (data.Y - Fp @ mu_p.reshape(-1, 1)) / d
+    Y_tilde = (data.Y - ones_nullspace_transform(mu_p.reshape(-1, 1))) / d
     Y_hat = jnp.sqrt(Sigma).reshape(-1,1) * data.B.T @ Y_tilde * g / d
     D_B, Q_B = jnp.linalg.eigh(jnp.sqrt(Sigma).reshape(-1, 1) * BTB * jnp.sqrt(Sigma))
     At_QB = A.T @ Q_B
{maradoner-0.10 → maradoner-0.11}/maradoner/main.py
@@ -264,14 +264,14 @@ def _export(name: str = Argument(..., help='Project name.'),
             output_folder: Path = Argument(..., help='Output folder.'),
             std_mode: Standardization = Option(Standardization.full, help='Whether to standardize activities with plain variances or also decorrelate them.'),
             anova_mode: ANOVAType = Option(ANOVAType.positive, help='If negative, look for non-variative motifs'),
-            corrected_pvalues: bool = Option(False, help='Compute MVN-based FDR correction.'),
+            weighted_zscore: bool = Option(False, help='Reciprocal variance weighted Z-scores'),
             alpha: float = Option(0.05, help='FDR alpha.')):
    t0 = time()
    p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
    p.add_task(description="Exporting results...", total=None)
    p.start()
    export_results(name, output_folder, std_mode=std_mode, anova_mode=anova_mode, alpha=alpha,
-                   compute_corrected_pvalues=corrected_pvalues)
+                   weighted_zscore=weighted_zscore)
    p.stop()
    dt = time() - t0
    rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
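
A hypothetical programmatic call with the new keyword, for orientation only (the import locations of the enums are assumptions, not shown in this diff):

    # Hypothetical usage sketch of the revised export_results signature.
    from maradoner.export import export_results                 # per this diff
    from maradoner.export import Standardization, ANOVAType     # assumed import site

    export_results('my_project', 'out',
                   std_mode=Standardization.full,
                   anova_mode=ANOVAType.positive,
                   weighted_zscore=True,   # new in 0.11: precision-weighted overall z-score
                   alpha=0.05)
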
{maradoner-0.10 → maradoner-0.11}/maradoner/mara/export.py
@@ -62,6 +62,8 @@ def export_results(project_name: str, output_folder: str):
 
     U = act.U
     U_var = act.variance
+
+    U = U / U_var ** 0.5
 
     # U_grouped = list()
     # U_var_grouped = list()
@@ -74,15 +76,13 @@ def export_results(project_name: str, output_folder: str):
     os.makedirs(output_folder, exist_ok=True)
     DF(np.array([error_variance, motif_variance]).T, index=sample_names,
        columns=['sigma', 'tau']).to_csv(os.path.join(output_folder, 'params.tsv'), sep='\t')
-    act = U / U_var ** 0.5
-    U_total = act.sum(axis=1, keepdims=True) / (1 / U_var ** 0.5).sum(axis=1, keepdims=True)
-    act = np.hstack((U_total, act))
+    U_total = U.mean(axis=1, keepdims=True)  # / (1 / U_var ** 0.5).sum(axis=1, keepdims=True)
+    act = np.hstack((U_total, U))
     DF(act, index=motif_names,
        columns=['overall'] + list(sample_names)).to_csv(os.path.join(output_folder, 'activities.tsv'),
                                                         sep='\t')
 
-    z = U / U_var ** 0.5
-    z = z ** 2
+    z = U ** 2
     U_total = z.mean(axis=1, keepdims=True)  # / (1 / U_var ** 0.5).sum(axis=1, keepdims=True)
     z = np.hstack((U_total, z))
     z = z ** 0.5
@@ -90,5 +90,4 @@ def export_results(project_name: str, output_folder: str):
        columns=['overall'] + list(sample_names)).to_csv(os.path.join(output_folder, 'z_scores.tsv'),
                                                         sep='\t')
 
-
 
{maradoner-0.10 → maradoner-0.11}/maradoner/mara/fit.py
@@ -44,6 +44,7 @@ class MotifVarianceEstimates:
 class FitResult:
     error_variance: ErrorVarianceEstimates
     motif_variance: MotifVarianceEstimates
+    B_decomposition: LowrankDecomposition
     group_names: list
     clustering: np.ndarray = None
     clustered_B: np.ndarray = None
@@ -70,7 +71,8 @@ def transform_data(data, std_y=False, std_b=False, helmert=True) -> TransformedD
 
 def estimate_error_variance(data: TransformedData,
                             B_decomposition: LowrankDecomposition) -> ErrorVarianceEstimates:
-    Y = B_decomposition.null_Q.T @ data.Y
+    # Y = B_decomposition.null_Q.T @ data.Y
+    Y = B_decomposition.null_space_transform(data.Y)
     variance = (Y ** 2).mean(axis=0)
     return ErrorVarianceEstimates(variance)
 
@@ -79,7 +81,7 @@ def calc_tau(tau: float, error_variance: np.ndarray, mode: TauMode):
     if mode == mode.mara:
         taus = tau * np.ones_like(error_variance)
     else:
-        taus = tau / (error_variance + tau)
+        taus = tau / error_variance
     return taus
 
 def loglik_tau(tau: float, Sigma: np.ndarray, Y_hat: np.ndarray,
@@ -88,10 +90,10 @@ def loglik_tau(tau: float, Sigma: np.ndarray, Y_hat: np.ndarray,
     logdet = 0
     taus = calc_tau(tau, error_variance, mode)
     for sigma, tau, y in zip(error_variance, taus, Y_hat.T):
-        S = tau * Sigma + sigma
-        vec += (y ** 2 * S).sum()
+        S = tau / sigma * Sigma + 1
+        vec += (y ** 2 / S).sum() * (tau / sigma ** 2)
         logdet += S.sum()
-    return vec + logdet
+    return -vec + logdet
 
 def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
                             error_variance: ErrorVarianceEstimates,
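
For reference, under the implied Gaussian model \(\hat{y} \sim \mathcal{N}(0,\, \tau \Sigma + \sigma I)\) with diagonal \(\Sigma\), the exact negative log marginal likelihood that such a criterion targets is, up to constants,

    -2 \log L(\tau) \;=\; \sum_{j}\sum_{i}\left[\log\big(\tau_j \Sigma_i + \sigma_j\big) + \frac{\hat{Y}_{ij}^2}{\tau_j \Sigma_i + \sigma_j}\right] + \mathrm{const},
    \qquad \tau_j \Sigma_i + \sigma_j \;=\; \sigma_j\big((\tau_j/\sigma_j)\,\Sigma_i + 1\big).

The second identity is what grounds the rescaled factor S = (tau/sigma) * Sigma + 1 in the revised loglik_tau; the function itself remains a fast surrogate rather than the exact expression above.
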
@@ -106,7 +108,7 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
     Y_hat = Q.T @ data.B.T @ data.Y
     fun = partial(loglik_tau, Sigma=Sigma, Y_hat=Y_hat, error_variance=error_variance.variance,
                   mode=mode)
-    tau = calc_tau(minimize_scalar(fun, bounds=(0.0, 5.0)).x, error_variance.variance, mode)
+    tau = calc_tau(minimize_scalar(fun, bounds=(0.0, error_variance.variance.max() * 10)).x, error_variance.variance, mode)
     return MotifVarianceEstimates(tau)
 
 
@@ -118,18 +120,11 @@ class ActivitiesPrediction:
 
 
 def predict_activities(data: TransformedData, fit: FitResult,
-                       gpu_decomposition=False, gpu=False, verbose=True) -> ActivitiesPrediction:
+                       gpu=False, verbose=True) -> ActivitiesPrediction:
     U = list()
     variance = list()
-    if gpu_decomposition:
-        device = jax.devices()
-    else:
-        device = jax.devices('cpu')
-    device = next(iter(device))
 
-    logger_print('Computing low-rank decompositions of the loading matrix...', verbose)
-    with jax.default_device(device):
-        B_decomposition = lowrank_decomposition(data.B)
+    B_decomposition = fit.B_decomposition
     if gpu:
         device = jax.devices()
     else:
@@ -200,7 +195,7 @@ def fit(project: str, tau_mode: TauMode, tau_estimation: TauEstimation,
 
 
     res = FitResult(error_variance=error_variance, motif_variance=motif_variance,
-                    clustering=clustering,
+                    clustering=clustering, B_decomposition=B_decomposition,
                     group_names=group_names, promoter_inds_to_drop=promoter_inds_to_drop)
     if dump:
         with openers[fmt](f'{project}.old.fit.{fmt}', 'wb') as f:
@@ -257,10 +252,10 @@ def _cor(a, b, axis=1):
     return numerator / denominator
 
 def calculate_fov(project: str, gpu: bool,
-                  stat_type: GOFStat, x64=True,
+                  stat_type: GOFStat, keep_motifs: str, x64=True,
                   verbose=True, dump=True):
     def calc_fov(data: TransformedData, fit: FitResult,
-                 activities: ActivitiesPrediction) -> tuple[FOVResult]:
+                 activities: ActivitiesPrediction, keep_motifs=None) -> tuple[FOVResult]:
         def sub(Y, effects) -> FOVResult:
             if stat_type == stat_type.fov:
                 Y1 = Y - effects
@@ -277,17 +272,33 @@ def calculate_fov(project: str, gpu: bool,
             sample = _cor(Y, effects, axis=0)
             return FOVResult(total, prom, sample)
         data = transform_data(data)
-        B = data.B
+        B = data.B if activities.clustering is None else activities.clustering[0]
         Y = data.Y
         U = activities.U
-        if activities.clustering is not None:
-            d = activities.clustering[0] @ U
-        else:
-            d = B @ U
+        if keep_motifs is not None:
+            B = B[:, keep_motifs]
+            U = U[keep_motifs]
+        d = B @ U
         stat_0 = sub(Y, d)
         return stat_0,
     data = read_init(project)
     fmt = data.fmt
+    motif_names = data.motif_names
+    if keep_motifs:
+        import datatable as dt
+        df = dt.fread(keep_motifs).to_pandas().groupby('status')
+        keep_motifs = list()
+        for name, motifs in df:
+            inds = list()
+            for mot in motifs.iloc[:, 0]:
+                try:
+                    i = motif_names.index(mot)
+                    inds.append(i)
+                except ValueError:
+                    print(f'Motif {mot} not found in the project.')
+            keep_motifs.append((name, np.array(inds, dtype=int)))
+    else:
+        keep_motifs = [(None, None)]
     with openers[fmt](f'{project}.old.fit.{fmt}', 'rb') as f:
         fit = dill.load(f)
     with openers[fmt](f'{project}.old.predict.{fmt}', 'rb') as f:
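
The parsing above expects a delimited table with the motif identifier in the first column and a 'status' column to group by, matching the CLI help ("Table with 2 columns: motif and status"). A hypothetical keep-motifs file (tab-separated; motif IDs made up):

    motif	status
    MA0139.1	curated
    MA0079.5	curated
    MA0060.3	background

Each status group is then evaluated as a separate motif subset, and motifs absent from the project are reported and skipped.
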
@@ -303,17 +314,23 @@ def calculate_fov(project: str, gpu: bool,
     else:
         device = jax.devices('cpu')
     device = next(iter(device))
-    with jax.default_device(device):
-
-        if data_test is not None:
-            test_FOV = calc_fov(data=data_test, fit=fit, activities=activities)
-        train_FOV = calc_fov(data=data, fit=fit, activities=activities)
-        if data_test is None:
-            test_FOV = None
-        res = TestResult(train_FOV, test_FOV, grouped=False)
+    results = list()
+    for status_name, motifs in keep_motifs:
+        if status_name:
+            status_name = f'{status_name} ({len(motifs)})'
+            print(status_name)
+        with jax.default_device(device):
+
+            if data_test is not None:
+                test_FOV = calc_fov(data=data_test, fit=fit, activities=activities, keep_motifs=motifs)
+            train_FOV = calc_fov(data=data, fit=fit, activities=activities, keep_motifs=motifs)
+            if data_test is None:
+                test_FOV = None
+            res = TestResult(train_FOV, test_FOV, grouped=False)
+            results.append((status_name, res))
     with openers[fmt](f'{project}.old.fov.{fmt}', 'wb') as f:
-        dill.dump(res, f)
-    return res
+        dill.dump(results, f)
+    return results
 
 
{maradoner-0.10 → maradoner-0.11}/maradoner/mara/main.py
@@ -51,6 +51,7 @@ def _fit(name: str = Argument(..., help='Project name.'),
 @app_old.command('gof', help='Estimate GOFs given test/train data split. Provides test info only if [orange]test-chromosomes[/orange] is not None in [cyan]fit[/cyan].')
 def _gof(name: str = Argument(..., help='Project name.'),
          # use_groups: bool = Option(False, help='Compute statistic for sammples aggragated across groups.'),
+         keep_motifs: Path = Option(None, help='Table with 2 columns: motif and status'),
          stat_type: GOFStat = Option(GOFStat.fov, help='Statistic type to compute'),
         gpu: bool = Option(False, help='Use GPU if available for most of computations.'),
         x64: bool = Option(True, help='Use high precision algebra.')):
@@ -62,21 +63,25 @@ def _gof(name: str = Argument(..., help='Project name.'),
     p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
     p.add_task(description="Calculating FOVs...", total=None)
     p.start()
-    res = calculate_fov(name, stat_type=stat_type, gpu=gpu, x64=x64)
-    if stat_type == GOFStat.corr:
-        title = 'Pearson correlation'
-    else:
-        title = 'Fraction of variance explained'
-    t = Table('Set', 'stat',
-              title=title)
-    row = [f'{t.total:.6f}' for t in res.train]
-    t.add_row('train', *row)
-    if res.test is not None:
-        row = [f'{t.total:.6f}' for t in res.test]
-        t.add_row('test', *row)
+    res = calculate_fov(name, stat_type=stat_type, keep_motifs=keep_motifs, gpu=gpu, x64=x64)
+    for name, res in res:
+        print(name)
+        if stat_type == GOFStat.corr:
+            title = 'Pearson correlation'
+        else:
+            title = 'Fraction of variance explained'
+        if name:
+            title = f'({name}) {title}'
+        t = Table('Set', 'stat',
+                  title=title)
+        row = [f'{t.total:.6f}' for t in res.train]
+        t.add_row('train', *row)
+        if res.test is not None:
+            row = [f'{t.total:.6f}' for t in res.test]
+            t.add_row('test', *row)
+        rprint(t)
     p.stop()
     dt = time() - t0
-    rprint(t)
     rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
 
 @app_old.command('predict', help='Estimate deviations of motif activities from their means.')
{maradoner-0.10 → maradoner-0.11}/maradoner.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: maradoner
-Version: 0.10
+Version: 0.11
 Summary: Variance-adjusted estimation of motif activities.
 Home-page: https://github.com/autosome-ru/nemara
 Author: Georgy Meshcheryakov