maradoner 0.11.tar.gz → 0.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maradoner might be problematic.

Files changed (26)
  1. {maradoner-0.11 → maradoner-0.13}/PKG-INFO +4 -11
  2. {maradoner-0.11 → maradoner-0.13}/maradoner/__init__.py +3 -1
  3. {maradoner-0.11 → maradoner-0.13}/maradoner/create.py +3 -3
  4. {maradoner-0.11 → maradoner-0.13}/maradoner/dataset_filter.py +38 -2
  5. {maradoner-0.11 → maradoner-0.13}/maradoner/export.py +65 -14
  6. {maradoner-0.11 → maradoner-0.13}/maradoner/fit.py +15 -16
  7. maradoner-0.13/maradoner/grn.py +177 -0
  8. {maradoner-0.11 → maradoner-0.13}/maradoner/main.py +61 -6
  9. {maradoner-0.11 → maradoner-0.13}/maradoner/mara/fit.py +29 -16
  10. {maradoner-0.11 → maradoner-0.13}/maradoner/mara/main.py +2 -1
  11. {maradoner-0.11 → maradoner-0.13}/maradoner/utils.py +1 -1
  12. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/PKG-INFO +4 -11
  13. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/SOURCES.txt +1 -0
  14. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/requires.txt +2 -0
  15. {maradoner-0.11 → maradoner-0.13}/README.md +0 -0
  16. {maradoner-0.11 → maradoner-0.13}/maradoner/mara/__init__.py +0 -0
  17. {maradoner-0.11 → maradoner-0.13}/maradoner/mara/export.py +0 -0
  18. {maradoner-0.11 → maradoner-0.13}/maradoner/mara.py +0 -0
  19. {maradoner-0.11 → maradoner-0.13}/maradoner/meta_optimizer.py +0 -0
  20. {maradoner-0.11 → maradoner-0.13}/maradoner/select.py +0 -0
  21. {maradoner-0.11 → maradoner-0.13}/maradoner/synthetic_data.py +0 -0
  22. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/dependency_links.txt +0 -0
  23. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/entry_points.txt +0 -0
  24. {maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/top_level.txt +0 -0
  25. {maradoner-0.11 → maradoner-0.13}/setup.cfg +0 -0
  26. {maradoner-0.11 → maradoner-0.13}/setup.py +0 -0

{maradoner-0.11 → maradoner-0.13}/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.1
  Name: maradoner
- Version: 0.11
+ Version: 0.13
  Summary: Variance-adjusted estimation of motif activities.
  Home-page: https://github.com/autosome-ru/nemara
  Author: Georgy Meshcheryakov
@@ -25,15 +25,8 @@ Requires-Dist: statsmodels>=0.14
  Requires-Dist: datatable>=1.0.0
  Requires-Dist: dill>=0.3.9
  Requires-Dist: rich>=12.6.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
+ Requires-Dist: tqdm>=4.0
+ Requires-Dist: scikit-learn>=1.6


  **MARADONER**

{maradoner-0.11 → maradoner-0.13}/maradoner/__init__.py

@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- __version__ = '0.11'
+ __version__ = '0.13'
  import importlib


@@ -16,6 +16,8 @@ __min_reqs__ = [
      'datatable>=1.0.0',
      'dill>=0.3.9',
      'rich>=12.6.0',
+     'tqdm>=4.0',
+     'scikit-learn>=1.6'
  ]

  def versiontuple(v):

{maradoner-0.11 → maradoner-0.13}/maradoner/create.py

@@ -37,7 +37,7 @@ def transform_loadings(df, mode: str, zero_cutoff=1e-9, prom_inds=None):

  def create_project(project_name: str, promoter_expression_filename: str, loading_matrix_filenames: list[str],
                     motif_expression_filenames=None, loading_matrix_transformations=None, sample_groups=None, motif_postfixes=None,
-                    promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None,
+                    promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None, promoter_filter_max=True,
                     motif_names_filename=None, compression='raw', dump=True, verbose=True):
      if not os.path.isfile(promoter_expression_filename):
          raise FileNotFoundError(f'Promoter expression file {promoter_expression_filename} not found.')
@@ -88,8 +88,8 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
                           f'{len(loading_matrix_transformations)}.')

      logger_print('Filtering promoters of low expression...', verbose)
-     print('aaaaa', len(promoter_expression))
-     inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename)
+     inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename,
+                                   max_mode=promoter_filter_max)
      promoter_expression = promoter_expression.loc[inds]
      proms = promoter_expression.index
      loading_matrices = [transform_loadings(df, mode, prom_inds=inds) for df, mode in zip(loading_matrices, loading_matrix_transformations)]

{maradoner-0.11 → maradoner-0.13}/maradoner/dataset_filter.py

@@ -6,7 +6,19 @@ import pandas as pd
  import numpy as np
  from scipy.optimize import minimize
  from functools import partial
+ from sklearn.mixture import GaussianMixture

+ def compute_leftmost_probability(Y):
+     Y = Y.reshape(-1, 1)
+     gmm = GaussianMixture(n_components=2, random_state=0)
+     gmm.fit(Y)
+
+     means = gmm.means_.flatten()
+     leftmost_component_index = np.argmin(means)
+     probas = gmm.predict_proba(Y)
+     leftmost_probs = probas[:, leftmost_component_index]
+
+     return leftmost_probs, gmm

  def normax_logpdf(x: jnp.ndarray, mu: float, sigma: float, n: int):
      x = (x - mu) / sigma
@@ -39,9 +51,33 @@ def loglik(params: jnp.ndarray, x: jnp.ndarray, n: int):
      w = params[-1]
      return -logmixture(x, mu, sigma, w, n).sum()

- def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, fit_plot_filename=None, plot_dpi=200):
+ def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, max_mode=True,
+                   fit_plot_filename=None, plot_dpi=200):
      expression = (expression - expression.mean()) / expression.std()
-
+     if not max_mode:
+         expression = expression.mean(axis=1).values
+         probs, gmm = compute_leftmost_probability(expression)
+         inds = probs < (1 - cutoff)
+         if fit_plot_filename:
+             import matplotlib.pyplot as plt
+             from matplotlib.collections import LineCollection
+             import seaborn as sns
+             x = np.array(sorted(expression))
+             pdf = np.exp(gmm.score_samples(expression[:, None]))
+             points = np.array([x, pdf]).T.reshape(-1, 1, 2)
+             segments = np.concatenate([points[:-1], points[1:]], axis=1)
+             plt.figure(dpi=plot_dpi)
+             sns.histplot(expression, stat='density', color='grey')
+             lc = LineCollection(segments, cmap='winter')
+             lc.set_array(probs)
+             lc.set_linewidth(3)
+             line = plt.gca().add_collection(lc)
+             plt.colorbar(line)
+             plt.xlabel('Standardized expression')
+             plt.tight_layout()
+             plt.savefig(fit_plot_filename)
+         return inds, probs
+
      expression_max = expression.max(axis=1).values

      mu = [-1.0, 0.0]

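Note: the new non-max filtering path fits a two-component Gaussian mixture to the averaged standardized expression and keeps promoters whose posterior probability of the low-expression (leftmost-mean) component stays below 1 - cutoff. A minimal standalone sketch of that logic on synthetic data (illustrative only, not the package's API):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Two modes: a low-expression component and a high-expression one.
expr = np.concatenate([rng.normal(-1.5, 0.4, 300), rng.normal(0.5, 0.8, 700)])

gmm = GaussianMixture(n_components=2, random_state=0).fit(expr.reshape(-1, 1))
leftmost = np.argmin(gmm.means_.flatten())
probs = gmm.predict_proba(expr.reshape(-1, 1))[:, leftmost]

cutoff = 0.95
keep = probs < (1 - cutoff)  # same inequality as the non-max branch of filter_lowexp
print(f'kept {keep.sum()} of {len(expr)} promoters')
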
{maradoner-0.11 → maradoner-0.13}/maradoner/export.py

@@ -2,8 +2,9 @@
  # -*- coding: utf-8 -*-
  from pandas import DataFrame as DF
  # add dot
- from .utils import read_init, openers
+ from .utils import read_init, openers, ProjectData
  from .fit import FOVResult, ActivitiesPrediction, FitResult
+ from .grn import grn
  from scipy.stats import norm, chi2, multivariate_normal, Covariance
  from scipy.linalg import eigh, lapack, cholesky, solve
  from statsmodels.stats import multitest
@@ -80,7 +81,9 @@ class Information():
          try:
              x = chol_inv(x)
          except:
-             print('alarm')
+             print('Failed to compute inverse using Cholesky decomposition. ')
+             print('This can be a sign of a numerical errors during parameters estimation.')
+             print('Will use pseudo-inverse now. The minimal and maximal eigenvalues are:')
              # print(x.diagonal().min())
              assert np.allclose(x, x.T), x - x.T
              x = np.linalg.eigh(x)
@@ -155,7 +158,8 @@ def export_fov(fovs: tuple[FOVResult], folder: str,
      samples = [fov_null.sample[:, None], fov_means.sample[:, None], fov_motif_means.sample[:, None]]
      samples = np.concatenate(samples, axis=-1)
      DF(samples, index=sample_names, columns=cols).to_csv(os.path.join(folder, 'samples.tsv'), sep='\t')
-
+
+


  def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
@@ -172,12 +176,17 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
      # bad_inds[ind] = True
      # mot = fit.motif_variance.motif
      # mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
+     motif_variance = fit.motif_variance.motif
+     if activities.filtered_motifs is not None:
+         motif_variance = np.delete(motif_variance, activities.filtered_motifs)
+         B = np.delete(B, activities.filtered_motifs, axis=1)
+     U = activities.U
      if map_cov:
          # fit.motif_variance.m
          BTB = B.T @ B
-         BTB_s = BTB * fit.motif_variance.motif ** 0.5
+         BTB_s = BTB * motif_variance ** 0.5
          BTB_s = BTB_s @ BTB_s.T
-     for cov, U, sigma, n, nu in zip(activities.cov(), activities.U.T,
+     for cov, U, sigma, n, nu in zip(activities.cov(), U.T,
                                      activities._cov[-2],
                                      fit.error_variance.variance, fit.motif_variance.group):
          # cov = cov[~bad_inds, ~bad_inds]
@@ -189,11 +198,11 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
          covs.append(cov)
          # U = U[~bad_inds]
          # prec = np.linalg.inv(np.diag(mot * nu) - cov)
-         prec = np.linalg.inv(cov)
+         prec = np.linalg.pinv(cov, hermitian=True)
          mean += prec @ U
          precs.append(prec)
      total_prec = sum(precs)
-     total_cov = np.linalg.inv(total_prec)
+     total_cov = np.linalg.pinv(total_prec, hermitian=True)
      mean = total_cov @ mean
      stats = activities.U[~bad_inds] - mean.reshape(-1, 1)
      # if corr_stat:
@@ -211,9 +220,6 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
      fdr = multitest.multipletests(pvalues, alpha=0.05, method='fdr_by')[1]
      return stats, pvalues, fdr, bad_inds

-
-
-

  def export_results(project_name: str, output_folder: str,
                     std_mode: Standardization,
@@ -249,7 +255,7 @@ def export_results(project_name: str, output_folder: str,
          motif_names_filtered = motif_names

      os.makedirs(output_folder, exist_ok=True)
-
+     # grn(data, act, fit, os.path.join(output_folder, 'grn'))
      error_variance = fit.error_variance.variance
      error_variance_fim = Information(fit.error_variance.fim)
      error_variance_stat, error_variance_std = error_variance_fim.standardize(error_variance,
@@ -278,9 +284,12 @@ def export_results(project_name: str, output_folder: str,

      folder = os.path.join(output_folder, 'params')
      os.makedirs(folder, exist_ok=True)
+     if os.path.isfile(f'{project_name}.promvar.{fmt}'):
+         with openers[fmt](f'{project_name}.promvar.{fmt}', 'rb') as f:
+             promvar: np.ndarray = dill.load(f)
+         DF(promvar, index=prom_names, columns=group_names).to_csv(os.path.join(folder, 'promoter_variances.tsv'), sep='\t')
      if excluded_motif_group is not None:
          motif_group_variance_std = np.insert(motif_group_variance_std, excluded_motif_group, np.nan)
-     print(error_variance.shape, error_variance_std.shape, motif_group_variance.shape, motif_group_variance_std.shape)
      DF(np.array([error_variance, error_variance_std, motif_group_variance, motif_group_variance_std]).T,
         index=group_names,
         columns=['sigma', 'sigma_std', 'nu', 'nu_std']).to_csv(os.path.join(folder, 'group_variances.tsv'),
@@ -400,6 +409,48 @@
                 sample_names=sample_names)


+ def export_loadings_product(project_name: str, output_folder: str,
+                             use_hdf: bool = True, intercepts: bool = True,
+                             tsv_truncation=4):

-     # return {'z-test': z_test, 'anova': anova, 'off_test': off_test,
-     #         'anova_ass': anova_ass, 'sign_ass': sign_ass}
+
+     data = read_init(project_name)
+     fmt = data.fmt
+     motif_names = data.motif_names
+     prom_names = data.promoter_names
+     # del data
+     with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+         fit: FitResult = dill.load(f)
+     if fit.promoter_inds_to_drop:
+         prom_names = np.delete(prom_names, fit.promoter_inds_to_drop)
+     group_names = fit.group_names
+     with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+         act: ActivitiesPrediction = dill.load(f)
+
+     output_folder = os.path.join(output_folder, 'loadings-product')
+     os.makedirs(output_folder, exist_ok=True)
+
+     U = act.U
+     B = data.B
+     mu = fit.motif_mean.mean
+
+     if act.filtered_motifs is not None:
+         motif_names = np.delete(motif_names, act.filtered_motifs)
+         B = np.delete(B, act.filtered_motifs, axis=1)
+         mu = np.delete(mu, act.filtered_motifs)
+     BM = B * mu
+     for name, U in zip(group_names, U.T):
+         effect = B * U
+         if intercepts:
+             effect += BM
+         if use_hdf:
+             effect = effect.astype(np.half)
+             filename = os.path.join(output_folder, f'{name}.hdf')
+             DF(data=effect, index=prom_names, columns=motif_names).to_hdf(filename, key='lrt', mode='w', complevel=4)
+         else:
+             filename = os.path.join(output_folder, f'{name}.tsv')
+             DF(data=effect, index=prom_names, columns=motif_names).to_csv(filename, sep='\t',
+                                                                           float_format=f'%.{tsv_truncation}f')

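Note: each per-group loadings-product table produced by export_loadings_product is the elementwise product B * U (plus B * motif means when intercepts are enabled), stored as float16 under the HDF key 'lrt'. A minimal sketch of reading one back, assuming pytables is installed; the path and group name below are hypothetical:

import pandas as pd

effect = pd.read_hdf('output/loadings-product/groupA.hdf', key='lrt')
print(effect.shape)         # (n_promoters, n_motifs), float16
print(effect.iloc[:5, :5])  # promoter-by-motif effects for this group
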
{maradoner-0.11 → maradoner-0.13}/maradoner/fit.py

@@ -198,7 +198,7 @@ def ones_nullspace_transform_transpose(X: np.ndarray) -> np.ndarray:

      return Y

- def lowrank_decomposition(X: np.ndarray, rel_eps=1e-12) -> LowrankDecomposition:
+ def lowrank_decomposition(X: np.ndarray, rel_eps=1e-15) -> LowrankDecomposition:
      svd = jnp.linalg.svd
      q, s, v = [np.array(t) for t in svd(X, full_matrices=False)]
      max_sv = max(s)
@@ -449,12 +449,6 @@ def loglik_motifs_fim(x: jnp.ndarray, BTB: jnp.ndarray,
      FIM_tau_nu = jnp.delete(FIM_tau_nu, G_fix_ind, axis=1)
      FIM = jnp.block([[FIM_tau, FIM_tau_nu],
                       [FIM_tau_nu.T, FIM_nu]])
-     t = FIM[:len(Sigma), :len(Sigma)]
-     t = jnp.linalg.eigh(t)[0]
-     print('FIM_tau', np.min(t), np.max(t), np.min(np.abs(t)))
-     t = FIM[len(Sigma):, len(Sigma):]
-     t = jnp.linalg.eigh(t)[0]
-     print('FIM_nu', np.min(t), np.max(t), np.min(np.abs(t)))
      return FIM


@@ -483,7 +477,7 @@ def estimate_error_variance(data: TransformedData, B_decomposition: LowrankDecom
                              group_inds=data.group_inds)
      fun = jax.jit(fun)
      grad = jax.jit(grad)
-     opt = MetaOptimizer(fun, grad, num_steps_momentum=10)
+     opt = MetaOptimizer(fun, grad, num_steps_momentum=15)
      res = opt.optimize(d0)
      if verbose:
          print('-' * 15)
@@ -539,9 +533,7 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
                              G_fix_ind=j, G_fix_val=fix)
      fun = jax.jit(fun)
      grad = jax.jit(grad)
-     opt = MetaOptimizer(fun, grad, num_steps_momentum=80,
-                         # scaling_set=(slice(len(BTB)), slice(len(BTB), None))
-                         )
+     opt = MetaOptimizer(fun, grad, num_steps_momentum=50)
      try:
          res = opt.optimize(x0)
      except ValueError as E:
@@ -566,14 +558,17 @@
                  G_fix_ind=j, G_fix_val=fix)
      f = fim(res.x)
      eig = jnp.linalg.eigh(f)[0].min()
+     print('FIM min eig', eig)
      if eig < 0:
          eig = list()
-         epsilons = [1e-23, 1e-15, 1e-12, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
+         epsilons = [1e-23, 1e-18, 1e-15, 1e-12, 1e-9, 1e-8,
+                     1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
          for eps in epsilons:
              x = res.x.copy()
              x = x.at[:len(BTB)].set(jnp.clip(x.at[:len(BTB)].get(), eps, float('inf')))
              f = fim(x)
              eig.append(jnp.linalg.eigh(f)[0].min())
+             print(eps, eig[-1])
              if eig[-1] > 0:
                  break
          i = np.argmax(eig)
@@ -870,8 +865,12 @@ def fit(project: str, clustering: ClusteringMode,
      data.B, clustering = cluster_data(data.B, mode=clustering,
                                        num_clusters=num_clusters)
      if test_chromosomes:
-         test_chromosomes = tuple([c + '_' for c in test_chromosomes])
-         promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names) if p.startswith(test_chromosomes)]
+         import re
+         pattern = re.compile(r'chr([0-9XYM]+|\d+)')
+
+         test_chromosomes = set(test_chromosomes)
+         promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names)
+                                  if pattern.search(p).group() in test_chromosomes]
          data.Y = np.delete(data.Y, promoter_inds_to_drop, axis=0)
          data.B = np.delete(data.B, promoter_inds_to_drop, axis=0)
      else:
@@ -942,12 +941,12 @@ def split_data(data: ProjectData, inds: list) -> tuple[ProjectData, ProjectData]
      data_d = ProjectData(Y=Y_d, B=B_d, K=data.K, weights=data.weights,
                           group_inds=data.group_inds, group_names=data.group_names,
                           motif_names=data.motif_names, promoter_names=promoter_names_d,
-                          motif_postfixes=data.motif_postfixes,
+                          motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                           fmt=data.fmt)
      data = ProjectData(Y=Y, B=B, K=data.K, weights=data.weights,
                         group_inds=data.group_inds, group_names=data.group_names,
                         motif_names=data.motif_names, promoter_names=promoter_names,
-                        motif_postfixes=data.motif_postfixes,
+                        motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                         fmt=data.fmt)
      return data_d, data
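
Note: test-chromosome matching changed from prefix matching on 'chr<name>_' to extracting the chromosome token with a regex and testing set membership, which also tolerates separators other than '_'. A small contrast sketch (promoter names here are hypothetical):

import re

names = ['chr1:1000-2000', 'chr19_777_fwd', 'chrX_42_fwd']
test_chromosomes = {'chr1'}

# 0.11 behaviour: prefix match on 'chr1_' misses 'chr1:1000-2000'.
old = [n for n in names if n.startswith(tuple(c + '_' for c in test_chromosomes))]

# 0.13 behaviour: extract the chromosome token, then test set membership.
pattern = re.compile(r'chr([0-9XYM]+|\d+)')
new = [n for n in names if pattern.search(n).group() in test_chromosomes]

print(old)  # []
print(new)  # ['chr1:1000-2000']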
 
maradoner-0.13/maradoner/grn.py (new file)

@@ -0,0 +1,177 @@
+ # -*- coding: utf-8 -*-
+ import numpy as np
+ import jax.numpy as jnp
+ import jax
+ from .utils import read_init, openers, ProjectData
+ from .fit import FOVResult, ActivitiesPrediction, FitResult
+ from scipy.optimize import minimize_scalar, minimize
+ import os
+ import dill
+ from pandas import DataFrame as DF
+ from scipy.stats import norm
+ from functools import partial
+ from tqdm import tqdm
+
+
+ def estimate_promoter_prior_variance(data: ProjectData, activities: ActivitiesPrediction,
+                                      fit: FitResult, top=0.90, eps=1e-6):
+     B = data.B
+     Y = data.Y
+     group_inds = data.group_inds
+     Y = Y - fit.promoter_mean.mean.reshape(-1, 1) - fit.sample_mean.mean.reshape(1, -1)
+     Y = Y - B @ fit.motif_mean.mean.reshape(-1, 1)
+     Y = np.concatenate([Y[:, inds].mean(axis=1, keepdims=True) - B @ U.reshape(-1, 1)
+                         for inds, U in zip(group_inds, activities.U.T)],
+                        axis=1)
+
+     var = (Y ** 2).mean(axis=1)
+     var = var[var > eps]
+     inds = np.argsort(var)
+     inds = inds[:int(len(inds) * top)]
+     return np.var(var[inds])
+
+ def estimate_promoter_variance(project_name: str, prior_top=0.90):
+
+     def fun(sigma, y: jnp.ndarray, b: jnp.ndarray, s: int,
+             prior_mean: float, prior_var: float):
+         if jnp.iterable(sigma):
+             sigma = sigma[0]
+         theta = prior_var / prior_mean
+         alpha = prior_var / theta ** 2
+         penalty = sigma / theta - (alpha - 1) * jnp.log(sigma)
+         return y / (b + sigma) + s * jnp.log(b + sigma) + penalty
+     data = read_init(project_name)
+     fmt = data.fmt
+     with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+         fit: FitResult = dill.load(f)
+     with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+         activities: ActivitiesPrediction = dill.load(f)
+     B = data.B
+     Y = data.Y
+     group_inds = data.group_inds
+     prior_var = estimate_promoter_prior_variance(data, activities, fit,
+                                                  top=prior_top)
+     print('Piror standard deviation:', prior_var ** 0.5)
+     prior_means = fit.error_variance.variance
+
+     Y = Y - fit.promoter_mean.mean.reshape(-1, 1) - fit.sample_mean.mean.reshape(1, -1)
+     Y = Y - B @ fit.motif_mean.mean.reshape(-1, 1)
+     Y = Y ** 2
+     B_hat = B ** 2 * fit.motif_variance.motif
+     B_hat = B_hat.sum(axis=1)
+     var = list()
+     for inds, prior_mean, nu in tqdm(list(zip(group_inds, prior_means, fit.motif_variance.group))):
+         Yt = Y[:, inds].sum(axis=1)
+         s = len(inds)
+         f_ = jax.jit(partial(fun, prior_mean=prior_mean, prior_var=prior_var, s=s))
+         g_ = jax.jit(jax.grad(f_))
+         var_g = list()
+         for y, b in zip(Yt, B_hat * nu):
+             res = minimize(partial(f_, b=b, y=y), x0=jnp.array([prior_mean]),
+                            method='SLSQP', bounds=[(0, None)],
+                            jac=partial(g_, b=b, y=y))
+             var_g.append(res.x[0] ** 2)
+         var.append(var_g)
+     var = np.array(var, dtype=float).T
+     with openers[fmt](f'{project_name}.promvar.{fmt}', 'wb') as f:
+         dill.dump(var, f)
+     return var
+
+
+ def grn(project_name: str, output: str, use_hdf=False, save_stat=True,
+         prior_h1=1/100):
+     data = read_init(project_name)
+     fmt = data.fmt
+     with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+         fit: FitResult = dill.load(f)
+     with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+         activities: ActivitiesPrediction = dill.load(f)
+
+     dtype = np.float32
+     B = data.B.astype(dtype)
+     Y = data.Y.astype(dtype)
+     group_inds = data.group_inds
+     group_names = data.group_names
+     nus = fit.motif_variance.group.astype(dtype)
+     motif_names = data.motif_names
+     prom_names = data.promoter_names
+     U = activities.U_raw.astype(dtype)
+     motif_mean = fit.motif_mean.mean.flatten().astype(dtype)
+     motif_variance = fit.motif_variance.motif.astype(dtype)
+     promoter_mean = fit.promoter_mean.mean.astype(dtype)
+     sample_mean = fit.sample_mean.mean.astype(dtype)
+
+     try:
+         with openers[fmt](f'{project_name}.promvar.{fmt}', 'rb') as f:
+             promvar: np.ndarray = dill.load(f)
+     except FileNotFoundError:
+         print('WARNING')
+         print('It seems that promoter variances were not estimated prior to running GRN.')
+         print('All promoter-wise variances will be assumed to be equal to the average error variance.')
+         print('Consider estimating promoter-wise variances before running GRN in the future.')
+         promvar = np.zeros((len(B), len(group_names)))
+         for i, sigma in enumerate(fit.error_variance.variance):
+             promvar[:, i] = sigma
+
+     Y = Y - promoter_mean.reshape(-1, 1) - sample_mean.reshape(1, -1)
+     Y = Y - B @ motif_mean.reshape(-1, 1)
+
+     if activities.filtered_motifs is not None:
+         motif_names = np.delete(motif_names, activities.filtered_motifs)
+         B = np.delete(B, activities.filtered_motifs, axis=1)
+         motif_mean = np.delete(motif_mean, activities.filtered_motifs)
+         motif_variance = np.delete(motif_variance, activities.filtered_motifs)
+
+     BM = B * motif_mean
+     BM = BM[..., None]
+     # BU = BU[..., None]
+     B_hat = B ** 2 * motif_variance
+     B_hat = B_hat.sum(axis=1, keepdims=True) - B_hat
+     B_pow = B ** 2
+
+     folder_stat = os.path.join(output, 'lr')
+     folder_belief = os.path.join(output, 'belief')
+     if save_stat:
+         os.makedirs(folder_stat, exist_ok=True)
+     os.makedirs(folder_belief, exist_ok=True)
+     for sigma, nu, name, inds in zip(promvar.T[..., None], nus, group_names, group_inds):
+         # if name != 'anconeus':
+         #     continue
+         print(name)
+         var = (B_hat * nu + sigma)
+         Y_ = Y[:, inds][..., None, :] + BM
+         # theta = U[:, inds][..., None, :] + BM
+         theta = B[..., None] * U[:, inds] + BM
+         loglr = 2 * B * (Y_ * theta).sum(axis=-1) - B_pow * (theta ** 2).sum(axis=-1)
+         del Y_
+         del theta
+         loglr = loglr / (2 * var)
+         del var
+         lr = np.exp(loglr)
+         belief = lr * prior_h1 / ((1 - prior_h1) + lr * prior_h1)
+         inds = sigma.flatten() > 1e-3
+         lr = lr[inds]
+         belief = belief[inds]
+         belief = belief.astype(np.half)
+
+         proms = list(np.array(prom_names)[inds])
+
+         # pvalue = n.sf(lr) * (theta > 0) + n.cdf(lr) * (theta <= 0)
+         if use_hdf:
+             if save_stat:
+                 lr = lr.astype(np.half)
+                 filename = os.path.join(folder_stat, f'{name}.hdf')
+                 DF(data=lr, index=proms, columns=motif_names).to_hdf(filename, key='zscore', mode='w', complevel=4)
+             filename = os.path.join(folder_belief, f'{name}.hdf')
+             DF(data=belief, index=proms, columns=motif_names).to_hdf(filename, key='lrt', mode='w', complevel=4)
+         else:
+             if save_stat:
+                 lr = lr.astype(np.half)
+                 filename = os.path.join(folder_stat, f'{name}.tsv')
+                 DF(data=lr, index=proms, columns=motif_names).to_csv(filename, sep='\t',
+                                                                      float_format='%.3f')
+             filename = os.path.join(folder_belief, f'{name}.tsv')
+             DF(data=belief, index=proms, columns=motif_names).to_csv(filename, sep='\t',
+                                                                      float_format='%.3f')

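Note: the belief written by grn() is a two-hypothesis posterior computed from the likelihood ratio lr and the prior prior_h1, i.e. belief = lr * prior_h1 / ((1 - prior_h1) + lr * prior_h1). A quick numeric check of the formula (values are illustrative):

import numpy as np

prior_h1 = 1 / 10
lr = np.array([0.5, 1.0, 10.0, 100.0])
belief = lr * prior_h1 / ((1 - prior_h1) + lr * prior_h1)
print(belief.round(3))  # [0.053 0.1   0.526 0.917]
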
{maradoner-0.11 → maradoner-0.13}/maradoner/main.py

@@ -13,10 +13,11 @@ from rich.table import Table
  from .create import create_project
  from pathlib import Path
  from .fit import fit, ClusteringMode, calculate_fov, predict, GOFStat, GOFStatMode
+ from .grn import estimate_promoter_variance, grn
  from .synthetic_data import generate_dataset
  from time import time
  from dill import __version__ as dill_version
- from .export import export_results, Standardization, ANOVAType
+ from .export import export_results, export_loadings_product, Standardization, ANOVAType
  from . import __version__ as project_version
  from .select import select_motifs_single
  import json
@@ -105,7 +106,7 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
                  'name[/cyan].'),
              expression: Path = Argument(..., help='A path to the promoter expression table. Expression values are assumed to be in a log-scale.'),
              loading: List[Path] = Argument(..., help='A list (if applicable, separated by space) of filenames containing loading matrices. '),
-             loading_transform: List[LoadingTransform] = Option([LoadingTransform.none], '--loading-transform', '-t',
+             loading_transform: List[LoadingTransform] = Option([LoadingTransform.esf], '--loading-transform', '-t',
                  help='A type of transformation to apply to loading '
                       'matrices. [orange]ecdf[/orange] substitutes values in the table with empricical CDF,'
                       ' [orange]esf[/orange] with negative logarithm of the empirical survival function.'),
@@ -115,6 +116,8 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
                  ' contain. If a text file, each line must start with a group name followed by space-separated sample names.'),
              filter_lowexp_w: float = Option(0.9, help='Truncation boundary for filtering out low-expressed promoters. The closer [orange]w[/orange]'
                  ' to 1, the more promoters will be left in the dataset.'),
+             filter_max_mode: bool = Option(True, help='Use max-mode of filtering. Max-mode keeps promoters that are active at least for some samples.'
+                 ' If disabled, filtration using GMM on the averages will be ran instead.'),
              filter_plot: Path = Option(None, help='Expression plot with a fitted mixture that is used for filtering.'),
              loading_postfix: List[str] = Option(None, '--loading-postfix', '-p',
                  help='String postfixes will be appeneded to the motifs from each of the supplied loading matrices'),
@@ -133,7 +136,8 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
      r = create_project(name, expression, loading_matrix_filenames=loading, motif_expression_filenames=motif_expression,
                         loading_matrix_transformations=loading_transform, sample_groups=sample_groups,
                         promoter_filter_lowexp_cutoff=filter_lowexp_w,
-                        promoter_filter_plot_filename=filter_plot,
+                        promoter_filter_plot_filename=filter_plot,
+                        promoter_filter_max=filter_max_mode,
                         compression=compression,
                         motif_postfixes=loading_postfix,
                         motif_names_filename=motif_filename,
@@ -208,7 +212,7 @@ def _gof(name: str = Argument(..., help='Project name.'),

  @app.command('predict', help='Estimate deviations of motif activities from their means.')
  def _predict(name: str = Argument(..., help='Project name.'),
-              filter_motifs: bool = Option(False, help='Do not predict deviations from motifs whose variance is low.'),
+              filter_motifs: bool = Option(True, help='Do not predict deviations from motifs whose variance is low.'),
               filter_order: int = Option(7, help='Motif variance is considered low if it is [orange]filter-order[/orange] orders of magnitude smaller that a median motif variance.'),
               tau_search: bool = Option(False, help='Search for tau multiplier using CV'),
               cv_repeats: int = Option(3, help='CV repeats in [orange]RepeatedKFold[/orange]'),
@@ -265,7 +269,11 @@ def _export(name: str = Argument(..., help='Project name.'),
              std_mode: Standardization = Option(Standardization.full, help='Whether to standardize activities with plain variances or also decorrelate them.'),
              anova_mode: ANOVAType = Option(ANOVAType.positive, help='If negative, look for non-variative motifs'),
              weighted_zscore: bool = Option(False, help='Reciprocal variance weighted Z-scores'),
-             alpha: float = Option(0.05, help='FDR alpha.')):
+             alpha: float = Option(0.05, help='FDR alpha.'),
+             loadings_product: bool = Option(False, help='Export loading matrix-acitvity 3D tensor. This will produce num_of_groups tabular files.'),
+             lp_hdf: bool = Option(True, help='Each loadings-product table will be stored in hdf format (occupies much less space than plain tsv) using float16 precision.'),
+             lp_intercepts: bool = Option(True, help='Include motif means in the 3D tensor.'),
+             lp_tsv_truncation: int = Option(4, help='Number of digits after a floating point to truncate. Decreases the output size of a tabular if [orange]lp-hdf[/orange] is disabled.')):
      t0 = time()
      p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
      p.add_task(description="Exporting results...", total=None)
@@ -273,8 +281,17 @@ def _export(name: str = Argument(..., help='Project name.'),
      export_results(name, output_folder, std_mode=std_mode, anova_mode=anova_mode, alpha=alpha,
                     weighted_zscore=weighted_zscore)
      p.stop()
+
+     if loadings_product:
+         p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+         p.add_task(description="Exporting results...", total=None)
+         p.start()
+         export_loadings_product(name, output_folder, use_hdf=lp_hdf, intercepts=lp_intercepts)
+         p.stop()
+
      dt = time() - t0
      rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+


  __select_motif_doc = 'Selects best motif variants when the project was created from multiple loading matrices, each with an unique postfix.'\
@@ -287,13 +304,51 @@ def _select_motifs(name: str = Argument(..., help='Project name'),
                     filename: Path = Argument(..., help='Filename where a list of best motif variants will be stored')):
      t0 = time()
      p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
-     p.add_task(description="Exporting results...", total=None)
+     p.add_task(description="Selecting motifs...", total=None)
      p.start()
      select_motifs_single(name, filename)
      p.stop()
      dt = time() - t0
      rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')

+
+ __grn_doc = 'Tests each promoter against each motif per each group. Some people call it GRN.'
+ @app.command('grn',
+              help=__select_motif_doc)
+ def _grn(name: str = Argument(..., help='Project name'),
+          folder: Path = Argument(..., help='Output folder where results will be stored. In total, expect number_of_groups tables of size'
+                                  ' comparable to the expression file size.'),
+          hdf: bool = Option(True, help='Use HDF format instead of tar.gz files. Typically eats much less space'),
+          stat: bool = Option(True, help='Save statistics alongside probabilities.'),
+          prior_h1: float = Option(1/10, help='Prior belief on the expected fraction of motifs active per promoter.')):
+     t0 = time()
+     p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+     p.add_task(description="Building GRN...", total=None)
+     p.start()
+     grn(name, output=folder, use_hdf=hdf, save_stat=stat, prior_h1=prior_h1)
+     p.stop()
+     dt = time() - t0
+     rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+
+ __estimate_promvar_doc = 'Estimates each promoter variance for each group using empirical Bayesian shrinkage.'\
+     ' A necessary step before computing GRN.'
+ @app.command('estimate-promoter-variance',
+              help=__estimate_promvar_doc)
+ def _estimate_promoter_variance(name: str = Argument(..., help='Project name'),
+                                 prior_top: float = Option(0.90,
+                                     help='The fraction from the bottom as ranked by sample'
+                                          ' variance of promoters to be used for estimating global group-wise variance.'
+                                          ' Higher values result in higher prior variance and weaken the prior.')):
+     t0 = time()
+     p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+     p.add_task(description="Estimating each promoter's variance...", total=None)
+     p.start()
+     estimate_promoter_variance(name, prior_top=prior_top)
+     p.stop()
+     dt = time() - t0
+     rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+
  def main():
      check_packages()
      app()

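Note: per the new command docs, estimate-promoter-variance is a necessary step before grn; grn itself falls back to group-average error variances when the promvar file is missing. Equivalent programmatic calls, as a sketch ('myproj' is a hypothetical project name created beforehand):

from maradoner.grn import estimate_promoter_variance, grn

estimate_promoter_variance('myproj', prior_top=0.90)  # writes myproj.promvar.<fmt>
grn('myproj', output='grn_out', use_hdf=True, save_stat=True, prior_h1=1/10)
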
{maradoner-0.11 → maradoner-0.13}/maradoner/mara/fit.py

@@ -51,7 +51,7 @@ class FitResult:
      promoter_inds_to_drop: list = None


- def transform_data(data, std_y=False, std_b=False, helmert=True) -> TransformedData:
+ def transform_data(data, std_y=False, std_b=False) -> TransformedData:
      Y = data.Y - (data.Y.mean(axis=0, keepdims=True) + data.Y.mean(axis=1, keepdims=True) - data.Y.mean())
      B = data.B - data.B.mean(axis=0, keepdims=True)
      group_inds_inv = list()
@@ -159,8 +159,12 @@ def fit(project: str, tau_mode: TauMode, tau_estimation: TauEstimation,
      data.B, clustering = cluster_data(data.B, mode=clustering,
                                        num_clusters=num_clusters)
      if test_chromosomes:
-         test_chromosomes = tuple([c + '_' for c in test_chromosomes])
-         promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names) if p.startswith(test_chromosomes)]
+         import re
+         pattern = re.compile(r'chr([0-9XYM]+|\d+)')
+
+         test_chromosomes = set(test_chromosomes)
+         promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names)
+                                  if pattern.search(p).group() in test_chromosomes]
          data.Y = np.delete(data.Y, promoter_inds_to_drop, axis=0)
          data.B = np.delete(data.B, promoter_inds_to_drop, axis=0)
      else:
@@ -214,12 +218,12 @@ def split_data(data: ProjectData, inds: list) -> tuple[ProjectData, ProjectData]
      data_d = ProjectData(Y=Y_d, B=B_d, K=data.K, weights=data.weights,
                           group_inds=data.group_inds, group_names=data.group_names,
                           motif_names=data.motif_names, promoter_names=promoter_names_d,
-                          motif_postfixes=data.motif_postfixes,
+                          motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                           fmt=data.fmt)
      data = ProjectData(Y=Y, B=B, K=data.K, weights=data.weights,
                         group_inds=data.group_inds, group_names=data.group_names,
                         motif_names=data.motif_names, promoter_names=promoter_names,
-                        motif_postfixes=data.motif_postfixes,
+                        motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                         fmt=data.fmt)
      return data_d, data

@@ -255,7 +259,7 @@ def calculate_fov(project: str, gpu: bool,
                    stat_type: GOFStat, keep_motifs: str, x64=True,
                    verbose=True, dump=True):
      def calc_fov(data: TransformedData, fit: FitResult,
-                  activities: ActivitiesPrediction, keep_motifs=None) -> tuple[FOVResult]:
+                  activities: ActivitiesPrediction, keep_motifs=None, Bs=None) -> tuple[FOVResult]:
          def sub(Y, effects) -> FOVResult:
              if stat_type == stat_type.fov:
                  Y1 = Y - effects
@@ -271,10 +275,16 @@
              prom = _cor(Y, effects, axis=1)
              sample = _cor(Y, effects, axis=0)
              return FOVResult(total, prom, sample)
-         data = transform_data(data)
-         B = data.B if activities.clustering is None else activities.clustering[0]
-         Y = data.Y
-         U = activities.U
+         if Bs is None:
+             data = transform_data(data)
+             B = data.B if activities.clustering is None else activities.clustering[0]
+             Y = data.Y
+             U = activities.U
+         else:
+             B = data.B
+             Y = data.Y
+             B = np.hstack((B, np.ones((len(B), 1))))
+             U = np.linalg.pinv(np.hstack((Bs[0], np.ones((len(Bs[0]), 1))))) @ Bs[1]
          if keep_motifs is not None:
              B = B[:, keep_motifs]
              U = U[keep_motifs]
@@ -306,9 +316,9 @@
      data, data_test = split_data(data, fit.promoter_inds_to_drop)
      if x64:
          jax.config.update("jax_enable_x64", True)
-     data = transform_data(data, helmert=False)
-     if data_test is not None:
-         data_test = transform_data(data_test, helmert=False)
+     # data = transform_data(data, helmert=False)
+     # if data_test is not None:
+     #     data_test = transform_data(data_test, helmert=False)
      if gpu:
          device = jax.devices()
      else:
@@ -318,12 +328,15 @@
      for status_name, motifs in keep_motifs:
          if status_name:
              status_name = f'{status_name} ({len(motifs)})'
-             print(status_name)
          with jax.default_device(device):

              if data_test is not None:
-                 test_FOV = calc_fov(data=data_test, fit=fit, activities=activities, keep_motifs=motifs)
-                 train_FOV = calc_fov(data=data, fit=fit, activities=activities, keep_motifs=motifs)
+                 test_FOV = calc_fov(data=data_test, fit=fit, activities=activities, keep_motifs=motifs,
+                                     Bs=(data.B, data.Y))
+                 train_FOV = calc_fov(data=data, fit=fit, activities=activities, keep_motifs=motifs,
+                                      Bs=(data.B, data.Y))
              if data_test is None:
                  test_FOV = None
              res = TestResult(train_FOV, test_FOV, grouped=False)
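
Note: when Bs=(data.B, data.Y) is supplied, calc_fov now re-derives activities by ordinary least squares with an appended intercept column instead of reusing the posterior U. A shape-level sketch of that baseline on random data (illustrative only):

import numpy as np

rng = np.random.default_rng(1)
B_train, Y_train = rng.normal(size=(50, 4)), rng.normal(size=(50, 3))
B_test = rng.normal(size=(20, 4))

ones = np.ones((len(B_train), 1))
U = np.linalg.pinv(np.hstack((B_train, ones))) @ Y_train      # (5, 3) coefficients
effects = np.hstack((B_test, np.ones((len(B_test), 1)))) @ U  # (20, 3) predictions
print(effects.shape)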

{maradoner-0.11 → maradoner-0.13}/maradoner/mara/main.py

@@ -65,7 +65,8 @@ def _gof(name: str = Argument(..., help='Project name.'),
      p.start()
      res = calculate_fov(name, stat_type=stat_type, keep_motifs=keep_motifs, gpu=gpu, x64=x64)
      for name, res in res:
-         print(name)
+         if name:
+             print(name)
          if stat_type == GOFStat.corr:
              title = 'Pearson correlation'
          else:

{maradoner-0.11 → maradoner-0.13}/maradoner/utils.py

@@ -75,6 +75,7 @@ class ProjectData:
      motif_postfixes: list
      fmt: str

+
  def read_init(project_name: str) -> ProjectData:
      if type(project_name) is str:
          filename, fmt = get_init_file(project_name)
@@ -86,7 +87,6 @@ def read_init(project_name: str) -> ProjectData:
      group_inds = list()
      for name in group_names:
          group_inds.append(np.array(init['groups'][name]))
-
      r = ProjectData(
          Y=init['expression'],
          B=init['loadings'],

{maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.1
  Name: maradoner
- Version: 0.11
+ Version: 0.13
  Summary: Variance-adjusted estimation of motif activities.
  Home-page: https://github.com/autosome-ru/nemara
  Author: Georgy Meshcheryakov
@@ -25,15 +25,8 @@ Requires-Dist: statsmodels>=0.14
  Requires-Dist: datatable>=1.0.0
  Requires-Dist: dill>=0.3.9
  Requires-Dist: rich>=12.6.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
+ Requires-Dist: tqdm>=4.0
+ Requires-Dist: scikit-learn>=1.6


  **MARADONER**

{maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/SOURCES.txt

@@ -5,6 +5,7 @@ maradoner/create.py
  maradoner/dataset_filter.py
  maradoner/export.py
  maradoner/fit.py
+ maradoner/grn.py
  maradoner/main.py
  maradoner/mara.py
  maradoner/meta_optimizer.py

{maradoner-0.11 → maradoner-0.13}/maradoner.egg-info/requires.txt

@@ -10,3 +10,5 @@ statsmodels>=0.14
  datatable>=1.0.0
  dill>=0.3.9
  rich>=12.6.0
+ tqdm>=4.0
+ scikit-learn>=1.6