maradoner 0.11.tar.gz → 0.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of maradoner as possibly problematic (details on the registry page).
- {maradoner-0.11 → maradoner-0.12}/PKG-INFO +4 -11
- {maradoner-0.11 → maradoner-0.12}/maradoner/__init__.py +3 -1
- {maradoner-0.11 → maradoner-0.12}/maradoner/create.py +3 -3
- {maradoner-0.11 → maradoner-0.12}/maradoner/dataset_filter.py +38 -2
- {maradoner-0.11 → maradoner-0.12}/maradoner/export.py +65 -14
- {maradoner-0.11 → maradoner-0.12}/maradoner/fit.py +15 -16
- maradoner-0.12/maradoner/grn.py +177 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/main.py +60 -5
- {maradoner-0.11 → maradoner-0.12}/maradoner/mara/fit.py +29 -16
- {maradoner-0.11 → maradoner-0.12}/maradoner/mara/main.py +2 -1
- {maradoner-0.11 → maradoner-0.12}/maradoner/utils.py +1 -1
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/PKG-INFO +4 -11
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/SOURCES.txt +1 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/requires.txt +2 -0
- {maradoner-0.11 → maradoner-0.12}/README.md +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/mara/__init__.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/mara/export.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/mara.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/meta_optimizer.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/select.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner/synthetic_data.py +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/dependency_links.txt +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/entry_points.txt +0 -0
- {maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/top_level.txt +0 -0
- {maradoner-0.11 → maradoner-0.12}/setup.cfg +0 -0
- {maradoner-0.11 → maradoner-0.12}/setup.py +0 -0
{maradoner-0.11 → maradoner-0.12}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: maradoner
-Version: 0.11
+Version: 0.12
 Summary: Variance-adjusted estimation of motif activities.
 Home-page: https://github.com/autosome-ru/nemara
 Author: Georgy Meshcheryakov
@@ -25,15 +25,8 @@ Requires-Dist: statsmodels>=0.14
 Requires-Dist: datatable>=1.0.0
 Requires-Dist: dill>=0.3.9
 Requires-Dist: rich>=12.6.0
-
-
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
+Requires-Dist: tqdm>=4.0
+Requires-Dist: scikit-learn>=1.6
 
 
 **MARADONER**
{maradoner-0.11 → maradoner-0.12}/maradoner/__init__.py

@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-__version__ = '0.11'
+__version__ = '0.12'
 import importlib
 
 
@@ -16,6 +16,8 @@ __min_reqs__ = [
     'datatable>=1.0.0',
     'dill>=0.3.9',
     'rich>=12.6.0',
+    'tqdm>=4.0',
+    'scikit-learn>=1.6'
 ]
 
 def versiontuple(v):
{maradoner-0.11 → maradoner-0.12}/maradoner/create.py

@@ -37,7 +37,7 @@ def transform_loadings(df, mode: str, zero_cutoff=1e-9, prom_inds=None):
 
 def create_project(project_name: str, promoter_expression_filename: str, loading_matrix_filenames: list[str],
                    motif_expression_filenames=None, loading_matrix_transformations=None, sample_groups=None, motif_postfixes=None,
-                   promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None,
+                   promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None, promoter_filter_max=True,
                    motif_names_filename=None, compression='raw', dump=True, verbose=True):
     if not os.path.isfile(promoter_expression_filename):
         raise FileNotFoundError(f'Promoter expression file {promoter_expression_filename} not found.')
@@ -88,8 +88,8 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
                          f'{len(loading_matrix_transformations)}.')
 
     logger_print('Filtering promoters of low expression...', verbose)
-
-
+    inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename,
+                                  max_mode=promoter_filter_max)
     promoter_expression = promoter_expression.loc[inds]
     proms = promoter_expression.index
     loading_matrices = [transform_loadings(df, mode, prom_inds=inds) for df, mode in zip(loading_matrices, loading_matrix_transformations)]
{maradoner-0.11 → maradoner-0.12}/maradoner/dataset_filter.py

@@ -6,7 +6,19 @@ import pandas as pd
 import numpy as np
 from scipy.optimize import minimize
 from functools import partial
+from sklearn.mixture import GaussianMixture
 
+def compute_leftmost_probability(Y):
+    Y = Y.reshape(-1, 1)
+    gmm = GaussianMixture(n_components=2, random_state=0)
+    gmm.fit(Y)
+
+    means = gmm.means_.flatten()
+    leftmost_component_index = np.argmin(means)
+    probas = gmm.predict_proba(Y)
+    leftmost_probs = probas[:, leftmost_component_index]
+
+    return leftmost_probs, gmm
 
 def normax_logpdf(x: jnp.ndarray, mu: float, sigma: float, n: int):
     x = (x - mu) / sigma
@@ -39,9 +51,33 @@ def loglik(params: jnp.ndarray, x: jnp.ndarray, n: int):
     w = params[-1]
     return -logmixture(x, mu, sigma, w, n).sum()
 
-def filter_lowexp(expression: pd.DataFrame, cutoff=0.95,
+def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, max_mode=True,
+                  fit_plot_filename=None, plot_dpi=200):
     expression = (expression - expression.mean()) / expression.std()
-
+    if not max_mode:
+        expression = expression.mean(axis=1).values
+        probs, gmm = compute_leftmost_probability(expression)
+        inds = probs < (1 - cutoff)
+        if fit_plot_filename:
+            import matplotlib.pyplot as plt
+            from matplotlib.collections import LineCollection
+            import seaborn as sns
+            x = np.array(sorted(expression))
+            pdf = np.exp(gmm.score_samples(expression[:, None]))
+            points = np.array([x, pdf]).T.reshape(-1, 1, 2)
+            segments = np.concatenate([points[:-1], points[1:]], axis=1)
+            plt.figure(dpi=plot_dpi)
+            sns.histplot(expression, stat='density', color='grey')
+            lc = LineCollection(segments, cmap='winter')
+            lc.set_array(probs)
+            lc.set_linewidth(3)
+            line = plt.gca().add_collection(lc)
+            plt.colorbar(line)
+            plt.xlabel('Standardized expression')
+            plt.tight_layout()
+            plt.savefig(fit_plot_filename)
+        return inds, probs
+
     expression_max = expression.max(axis=1).values
 
     mu = [-1.0, 0.0]
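The non-max filtering path added above is worth a gloss: a two-component Gaussian mixture is fitted to the standardized mean expression, and a promoter is kept only while its posterior probability of sitting in the lower-mean ("silent") component stays below 1 - cutoff. A minimal self-contained sketch on synthetic data (not part of the package):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# 300 "silent" promoters around -2, 700 expressed ones around 1 (standardized scale).
expr = np.concatenate([rng.normal(-2.0, 0.5, 300), rng.normal(1.0, 1.0, 700)])

gmm = GaussianMixture(n_components=2, random_state=0).fit(expr.reshape(-1, 1))
leftmost = np.argmin(gmm.means_.flatten())            # lower-mean component
p_low = gmm.predict_proba(expr.reshape(-1, 1))[:, leftmost]

cutoff = 0.95
keep = p_low < (1 - cutoff)    # the same rule as filter_lowexp(max_mode=False)
print(f'kept {keep.sum()} of {len(expr)} promoters')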
{maradoner-0.11 → maradoner-0.12}/maradoner/export.py

@@ -2,8 +2,9 @@
 # -*- coding: utf-8 -*-
 from pandas import DataFrame as DF
 # add dot
-from .utils import read_init, openers
+from .utils import read_init, openers, ProjectData
 from .fit import FOVResult, ActivitiesPrediction, FitResult
+from .grn import grn
 from scipy.stats import norm, chi2, multivariate_normal, Covariance
 from scipy.linalg import eigh, lapack, cholesky, solve
 from statsmodels.stats import multitest
@@ -80,7 +81,9 @@ class Information():
         try:
             x = chol_inv(x)
         except:
-            print('
+            print('Failed to compute inverse using Cholesky decomposition. ')
+            print('This can be a sign of a numerical errors during parameters estimation.')
+            print('Will use pseudo-inverse now. The minimal and maximal eigenvalues are:')
             # print(x.diagonal().min())
             assert np.allclose(x, x.T), x - x.T
             x = np.linalg.eigh(x)
@@ -155,7 +158,8 @@ def export_fov(fovs: tuple[FOVResult], folder: str,
     samples = [fov_null.sample[:, None], fov_means.sample[:, None], fov_motif_means.sample[:, None]]
     samples = np.concatenate(samples, axis=-1)
     DF(samples, index=sample_names, columns=cols).to_csv(os.path.join(folder, 'samples.tsv'), sep='\t')
-
+
+
 
 
 def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
@@ -172,12 +176,17 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
     # bad_inds[ind] = True
     # mot = fit.motif_variance.motif
     # mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
+    motif_variance = fit.motif_variance.motif
+    if activities.filtered_motifs is not None:
+        motif_variance = np.delete(motif_variance, activities.filtered_motifs)
+        B = np.delete(B, activities.filtered_motifs, axis=1)
+    U = activities.U
     if map_cov:
         # fit.motif_variance.m
         BTB = B.T @ B
-        BTB_s = BTB *
+        BTB_s = BTB * motif_variance ** 0.5
         BTB_s = BTB_s @ BTB_s.T
-    for cov, U, sigma, n, nu in zip(activities.cov(),
+    for cov, U, sigma, n, nu in zip(activities.cov(), U.T,
                                     activities._cov[-2],
                                     fit.error_variance.variance, fit.motif_variance.group):
         # cov = cov[~bad_inds, ~bad_inds]
@@ -189,11 +198,11 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
         covs.append(cov)
         # U = U[~bad_inds]
         # prec = np.linalg.inv(np.diag(mot * nu) - cov)
-        prec = np.linalg.
+        prec = np.linalg.pinv(cov, hermitian=True)
         mean += prec @ U
         precs.append(prec)
     total_prec = sum(precs)
-    total_cov = np.linalg.
+    total_cov = np.linalg.pinv(total_prec, hermitian=True)
     mean = total_cov @ mean
     stats = activities.U[~bad_inds] - mean.reshape(-1, 1)
     # if corr_stat:
@@ -211,9 +220,6 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
     fdr = multitest.multipletests(pvalues, alpha=0.05, method='fdr_by')[1]
     return stats, pvalues, fdr, bad_inds
 
-
-
-
 
 def export_results(project_name: str, output_folder: str,
                    std_mode: Standardization,
@@ -249,7 +255,7 @@ def export_results(project_name: str, output_folder: str,
         motif_names_filtered = motif_names
 
     os.makedirs(output_folder, exist_ok=True)
-
+    # grn(data, act, fit, os.path.join(output_folder, 'grn'))
     error_variance = fit.error_variance.variance
     error_variance_fim = Information(fit.error_variance.fim)
     error_variance_stat, error_variance_std = error_variance_fim.standardize(error_variance,
@@ -278,9 +284,12 @@ def export_results(project_name: str, output_folder: str,
 
     folder = os.path.join(output_folder, 'params')
     os.makedirs(folder, exist_ok=True)
+    if os.path.isfile(f'{project_name}.promvar.{fmt}'):
+        with openers[fmt](f'{project_name}.promvar.{fmt}', 'rb') as f:
+            promvar: np.ndarray = dill.load(f)
+        DF(promvar, index=prom_names, columns=group_names).to_csv(os.path.join(folder, 'promoter_variances.tsv'), sep='\t')
     if excluded_motif_group is not None:
         motif_group_variance_std = np.insert(motif_group_variance_std, excluded_motif_group, np.nan)
-    print(error_variance.shape, error_variance_std.shape, motif_group_variance.shape, motif_group_variance_std.shape)
     DF(np.array([error_variance, error_variance_std, motif_group_variance, motif_group_variance_std]).T,
        index=group_names,
        columns=['sigma', 'sigma_std', 'nu', 'nu_std']).to_csv(os.path.join(folder, 'group_variances.tsv'),
@@ -400,6 +409,48 @@ def export_results(project_name: str, output_folder: str,
                sample_names=sample_names)
 
 
+def export_loadings_product(project_name: str, output_folder: str,
+                            use_hdf: bool = True, intercepts: bool = True,
+                            tsv_truncation=4):
 
-
-
+    data = read_init(project_name)
+    fmt = data.fmt
+    motif_names = data.motif_names
+    prom_names = data.promoter_names
+    # del data
+    with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+        fit: FitResult = dill.load(f)
+    if fit.promoter_inds_to_drop:
+        prom_names = np.delete(prom_names, fit.promoter_inds_to_drop)
+    group_names = fit.group_names
+    with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+        act: ActivitiesPrediction = dill.load(f)
+
+    output_folder = os.path.join(output_folder, 'loadings-product')
+    os.makedirs(output_folder, exist_ok=True)
+
+    U = act.U
+    B = data.B
+    mu = fit.motif_mean.mean
+
+    if act.filtered_motifs is not None:
+        motif_names = np.delete(motif_names, act.filtered_motifs)
+        B = np.delete(B, act.filtered_motifs, axis=1)
+        mu = np.delete(mu, act.filtered_motifs)
+    BM = B * mu
+    for name, U in zip(group_names, U.T):
+        effect = B * U
+        if intercepts:
+            effect += BM
+        if use_hdf:
+            effect = effect.astype(np.half)
+            filename = os.path.join(output_folder, f'{name}.hdf')
+            DF(data=effect, index=prom_names, columns=motif_names).to_hdf(filename, key='lrt', mode='w', complevel=4)
+        else:
+            filename = os.path.join(output_folder, f'{name}.tsv')
+            DF(data=effect, index=prom_names, columns=motif_names).to_csv(filename, sep='\t',
+                                                                          float_format=f'%.{tsv_truncation}f')
+
+
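The exported "loadings product" reduces to a single broadcasted identity: for each sample group, effect[p, m] = B[p, m] * (U[m] + mu[m]) when intercepts are enabled (and B * U alone otherwise), written out as one promoters-by-motifs table per group. A toy illustration (shapes and names are made up, not the package API):

import numpy as np

P, M = 5, 3                    # promoters, motifs
B = np.random.rand(P, M)       # loading matrix
mu = np.random.randn(M)        # motif means (the intercepts)
U_group = np.random.randn(M)   # activities of one sample group

effect = B * U_group           # B * U, broadcast along promoters
effect += B * mu               # add intercepts, as with intercepts=True
# effect[p, m] == B[p, m] * (U_group[m] + mu[m]); one such P-by-M table per group.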
{maradoner-0.11 → maradoner-0.12}/maradoner/fit.py

@@ -198,7 +198,7 @@ def ones_nullspace_transform_transpose(X: np.ndarray) -> np.ndarray:
 
     return Y
 
-def lowrank_decomposition(X: np.ndarray, rel_eps=1e-
+def lowrank_decomposition(X: np.ndarray, rel_eps=1e-15) -> LowrankDecomposition:
     svd = jnp.linalg.svd
     q, s, v = [np.array(t) for t in svd(X, full_matrices=False)]
     max_sv = max(s)
@@ -449,12 +449,6 @@ def loglik_motifs_fim(x: jnp.ndarray, BTB: jnp.ndarray,
     FIM_tau_nu = jnp.delete(FIM_tau_nu, G_fix_ind, axis=1)
     FIM = jnp.block([[FIM_tau, FIM_tau_nu],
                      [FIM_tau_nu.T, FIM_nu]])
-    t = FIM[:len(Sigma), :len(Sigma)]
-    t = jnp.linalg.eigh(t)[0]
-    print('FIM_tau', np.min(t), np.max(t), np.min(np.abs(t)))
-    t = FIM[len(Sigma):, len(Sigma):]
-    t = jnp.linalg.eigh(t)[0]
-    print('FIM_nu', np.min(t), np.max(t), np.min(np.abs(t)))
     return FIM
 
 
@@ -483,7 +477,7 @@ def estimate_error_variance(data: TransformedData, B_decomposition: LowrankDecom
                             group_inds=data.group_inds)
     fun = jax.jit(fun)
     grad = jax.jit(grad)
-    opt = MetaOptimizer(fun, grad, num_steps_momentum=
+    opt = MetaOptimizer(fun, grad, num_steps_momentum=15)
     res = opt.optimize(d0)
     if verbose:
         print('-' * 15)
@@ -539,9 +533,7 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
                             G_fix_ind=j, G_fix_val=fix)
     fun = jax.jit(fun)
     grad = jax.jit(grad)
-    opt = MetaOptimizer(fun, grad, num_steps_momentum=
-                        # scaling_set=(slice(len(BTB)), slice(len(BTB), None))
-                        )
+    opt = MetaOptimizer(fun, grad, num_steps_momentum=50)
     try:
         res = opt.optimize(x0)
     except ValueError as E:
@@ -566,14 +558,17 @@ def estimate_motif_variance(data: TransformedData, B_decomposition: LowrankDecom
                    G_fix_ind=j, G_fix_val=fix)
     f = fim(res.x)
     eig = jnp.linalg.eigh(f)[0].min()
+    print('FIM min eig', eig)
     if eig < 0:
         eig = list()
-        epsilons = [1e-23, 1e-15, 1e-12, 1e-9, 1e-8,
+        epsilons = [1e-23, 1e-18, 1e-15, 1e-12, 1e-9, 1e-8,
+                    1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
        for eps in epsilons:
            x = res.x.copy()
            x = x.at[:len(BTB)].set(jnp.clip(x.at[:len(BTB)].get(), eps, float('inf')))
            f = fim(x)
            eig.append(jnp.linalg.eigh(f)[0].min())
+            print(eps, eig[-1])
            if eig[-1] > 0:
                break
        i = np.argmax(eig)
@@ -870,8 +865,12 @@ def fit(project: str, clustering: ClusteringMode,
         data.B, clustering = cluster_data(data.B, mode=clustering,
                                           num_clusters=num_clusters)
     if test_chromosomes:
-
-
+        import re
+        pattern = re.compile(r'chr([0-9XYM]+|\d+)')
+
+        test_chromosomes = set(test_chromosomes)
+        promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names)
+                                 if pattern.search(p).group() in test_chromosomes]
         data.Y = np.delete(data.Y, promoter_inds_to_drop, axis=0)
         data.B = np.delete(data.B, promoter_inds_to_drop, axis=0)
     else:
@@ -942,12 +941,12 @@ def split_data(data: ProjectData, inds: list) -> tuple[ProjectData, ProjectData]
     data_d = ProjectData(Y=Y_d, B=B_d, K=data.K, weights=data.weights,
                          group_inds=data.group_inds, group_names=data.group_names,
                          motif_names=data.motif_names, promoter_names=promoter_names_d,
-                         motif_postfixes=data.motif_postfixes,
+                         motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                          fmt=data.fmt)
     data = ProjectData(Y=Y, B=B, K=data.K, weights=data.weights,
                        group_inds=data.group_inds, group_names=data.group_names,
                        motif_names=data.motif_names, promoter_names=promoter_names,
-                       motif_postfixes=data.motif_postfixes,
+                       motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                        fmt=data.fmt)
     return data_d, data
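The test-chromosome branch above (mirrored in maradoner/mara/fit.py further down) replaces previously blank lines with a regex scan over promoter names; a promoter is dropped when its chr token falls in the held-out set. A short sketch (the promoter-name format is an assumption inferred from the regex, e.g. 'chr1:100-200'):

import re

pattern = re.compile(r'chr([0-9XYM]+|\d+)')
promoter_names = ['chr1:100-200', 'chrX:500-600', 'chr2:700-800']  # hypothetical names
test_chromosomes = {'chrX'}

drop = [i for i, p in enumerate(promoter_names)
        if pattern.search(p).group() in test_chromosomes]
print(drop)  # [1] -- only the chrX promoter is held out

As in the package code, pattern.search(p) assumes every promoter name contains a chr token; a name without one would raise AttributeError.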
maradoner-0.12/maradoner/grn.py (new file)

@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import jax.numpy as jnp
+import jax
+from .utils import read_init, openers, ProjectData
+from .fit import FOVResult, ActivitiesPrediction, FitResult
+from scipy.optimize import minimize_scalar, minimize
+import os
+import dill
+from pandas import DataFrame as DF
+from scipy.stats import norm
+from functools import partial
+from tqdm import tqdm
+
+
+def estimate_promoter_prior_variance(data: ProjectData, activities: ActivitiesPrediction,
+                                     fit: FitResult, top=0.90, eps=1e-6):
+    B = data.B
+    Y = data.Y
+    group_inds = data.group_inds
+    Y = Y - fit.promoter_mean.mean.reshape(-1, 1) - fit.sample_mean.mean.reshape(1, -1)
+    Y = Y - B @ fit.motif_mean.mean.reshape(-1, 1)
+    Y = np.concatenate([Y[:, inds].mean(axis=1, keepdims=True) - B @ U.reshape(-1, 1)
+                        for inds, U in zip(group_inds, activities.U.T)],
+                       axis=1)
+
+    var = (Y**2).mean(axis=1)
+    var = var[var > eps]
+    inds = np.argsort(var)
+    inds = inds[:int(len(inds) * top)]
+    return np.var(var[inds])
+
+def estimate_promoter_variance(project_name: str, prior_top=0.90):
+
+    def fun(sigma, y: jnp.ndarray, b: jnp.ndarray, s: int,
+            prior_mean: float, prior_var: float):
+        if jnp.iterable(sigma):
+            sigma = sigma[0]
+        theta = prior_var / prior_mean
+        alpha = prior_var / theta ** 2
+        penalty = sigma / theta - (alpha - 1) * jnp.log(sigma)
+        return y / (b + sigma) + s * jnp.log(b + sigma) + penalty
+    data = read_init(project_name)
+    fmt = data.fmt
+    with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+        fit: FitResult = dill.load(f)
+    with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+        activities: ActivitiesPrediction = dill.load(f)
+    B = data.B
+    Y = data.Y
+    group_inds = data.group_inds
+    prior_var = estimate_promoter_prior_variance(data, activities, fit,
+                                                 top=prior_top)
+    print('Piror standard deviation:', prior_var ** 0.5)
+    prior_means = fit.error_variance.variance
+
+    Y = Y - fit.promoter_mean.mean.reshape(-1, 1) - fit.sample_mean.mean.reshape(1, -1)
+    Y = Y - B @ fit.motif_mean.mean.reshape(-1, 1)
+    Y = Y ** 2
+    B_hat = B ** 2 * fit.motif_variance.motif
+    B_hat = B_hat.sum(axis=1)
+    var = list()
+    for inds, prior_mean, nu in tqdm(list(zip(group_inds, prior_means, fit.motif_variance.group))):
+        Yt = Y[:, inds].sum(axis=1)
+        s = len(inds)
+        f_ = jax.jit(partial(fun, prior_mean=prior_mean, prior_var=prior_var, s=s))
+        g_ = jax.jit(jax.grad(f_))
+        var_g = list()
+        for y, b in zip(Yt, B_hat * nu):
+            res = minimize(partial(f_, b=b, y=y), x0=jnp.array([prior_mean]),
+                           method='SLSQP', bounds=[(0, None)],
+                           jac=partial(g_, b=b, y=y))
+            var_g.append(res.x[0] ** 2)
+        var.append(var_g)
+    var = np.array(var, dtype=float).T
+    with openers[fmt](f'{project_name}.promvar.{fmt}', 'wb') as f:
+        dill.dump(var, f)
+    return var
+
+
+def grn(project_name: str, output: str, use_hdf=False, save_stat=True,
+        prior_h1=1/100):
+    data = read_init(project_name)
+    fmt = data.fmt
+    with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
+        fit: FitResult = dill.load(f)
+    with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
+        activities: ActivitiesPrediction = dill.load(f)
+
+    dtype = np.float32
+    B = data.B.astype(dtype)
+    Y = data.Y.astype(dtype)
+    group_inds = data.group_inds
+    group_names = data.group_names
+    nus = fit.motif_variance.group.astype(dtype)
+    motif_names = data.motif_names
+    prom_names = data.promoter_names
+    U = activities.U_raw.astype(dtype)
+    motif_mean = fit.motif_mean.mean.flatten().astype(dtype)
+    motif_variance = fit.motif_variance.motif.astype(dtype)
+    promoter_mean = fit.promoter_mean.mean.astype(dtype)
+    sample_mean = fit.sample_mean.mean.astype(dtype)
+
+    try:
+        with openers[fmt](f'{project_name}.promvar.{fmt}', 'rb') as f:
+            promvar: np.ndarray = dill.load(f)
+    except FileNotFoundError:
+        print('WARNING')
+        print('It seems that promoter variances were not estimated prior to running GRN.')
+        print('All promoter-wise variances will be assumed to be equal to the average error variance.')
+        print('Consider estimating promoter-wise variances before running GRN in the future.')
+        promvar = np.zeros((len(B), len(group_names)))
+        for i, sigma in enumerate(fit.error_variance.variance):
+            promvar[:, i] = sigma
+
+    Y = Y - promoter_mean.reshape(-1, 1) - sample_mean.reshape(1, -1)
+    Y = Y - B @ motif_mean.reshape(-1, 1)
+
+    if activities.filtered_motifs is not None:
+        motif_names = np.delete(motif_names, activities.filtered_motifs)
+        B = np.delete(B, activities.filtered_motifs, axis=1)
+        motif_mean = np.delete(motif_mean, activities.filtered_motifs)
+        motif_variance = np.delete(motif_variance, activities.filtered_motifs)
+
+    BM = B * motif_mean
+    BM = BM[..., None]
+    # BU = BU[..., None]
+    B_hat = B ** 2 * motif_variance
+    B_hat = B_hat.sum(axis=1, keepdims=True) - B_hat
+    B_pow = B ** 2
+
+    folder_stat = os.path.join(output, 'lr')
+    folder_belief = os.path.join(output, 'belief')
+    if save_stat:
+        os.makedirs(folder_stat, exist_ok=True)
+    os.makedirs(folder_belief, exist_ok=True)
+    for sigma, nu, name, inds in zip(promvar.T[..., None], nus, group_names, group_inds):
+        # if name != 'anconeus':
+        #     continue
+        print(name)
+        var = (B_hat * nu + sigma)
+        Y_ = Y[:, inds][..., None, :] + BM
+        # theta = U[:, inds][..., None, :] + BM
+        theta = B[..., None] * U[:, inds] + BM
+        loglr = 2 * B * (Y_ * theta).sum(axis=-1) - B_pow * (theta ** 2).sum(axis=-1)
+        del Y_
+        del theta
+        loglr = loglr / (2 * var)
+        del var
+        lr = np.exp(loglr)
+        belief = lr * prior_h1 / ((1 - prior_h1) + lr * prior_h1)
+        inds = sigma.flatten() > 1e-3
+        lr = lr[inds]
+        belief = belief[inds]
+        belief = belief.astype(np.half)
+
+        proms = list(np.array(prom_names)[inds])
+
+        # pvalue = n.sf(lr) * (theta > 0) + n.cdf(lr) * (theta <= 0)
+        if use_hdf:
+            if save_stat:
+                lr = lr.astype(np.half)
+                filename = os.path.join(folder_stat, f'{name}.hdf')
+                DF(data=lr, index=proms, columns=motif_names).to_hdf(filename, key='zscore', mode='w', complevel=4)
+            filename = os.path.join(folder_belief, f'{name}.hdf')
+            DF(data=belief, index=proms, columns=motif_names).to_hdf(filename, key='lrt', mode='w', complevel=4)
+        else:
+            if save_stat:
+                lr = lr.astype(np.half)
+                filename = os.path.join(folder_stat, f'{name}.tsv')
+                DF(data=lr, index=proms, columns=motif_names).to_csv(filename, sep='\t',
+                                                                     float_format='%.3f')
+            filename = os.path.join(folder_belief, f'{name}.tsv')
+            DF(data=belief, index=proms, columns=motif_names).to_csv(filename, sep='\t',
+                                                                     float_format='%.3f')
+
+
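Two additions in grn.py benefit from a gloss. In estimate_promoter_variance, the penalty inside fun is the negative log-kernel of a Gamma prior: with theta = prior_var / prior_mean and alpha = prior_mean**2 / prior_var (the code's alpha = prior_var / theta**2 is the same quantity), the prior has mean prior_mean and variance prior_var, so each per-promoter variance is shrunk toward its group's error variance. In grn() itself, the likelihood ratio is turned into a posterior "belief" by Bayes' rule with prior P(H1) = prior_h1; a one-function sketch of that conversion:

import numpy as np

def belief_from_lr(lr: np.ndarray, prior_h1: float = 1 / 100) -> np.ndarray:
    """P(H1 | data) given LR = P(data | H1) / P(data | H0) and prior P(H1) = prior_h1."""
    return lr * prior_h1 / ((1 - prior_h1) + lr * prior_h1)

print(belief_from_lr(np.array([0.1, 1.0, 10.0, 1000.0])))
# LR = 1 returns the prior (0.01) untouched; LR = 1000 lifts it to about 0.91.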
{maradoner-0.11 → maradoner-0.12}/maradoner/main.py

@@ -13,10 +13,11 @@ from rich.table import Table
 from .create import create_project
 from pathlib import Path
 from .fit import fit, ClusteringMode, calculate_fov, predict, GOFStat, GOFStatMode
+from .grn import estimate_promoter_variance, grn
 from .synthetic_data import generate_dataset
 from time import time
 from dill import __version__ as dill_version
-from .export import export_results, Standardization, ANOVAType
+from .export import export_results, export_loadings_product, Standardization, ANOVAType
 from . import __version__ as project_version
 from .select import select_motifs_single
 import json
@@ -105,7 +106,7 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
             'name[/cyan].'),
            expression: Path = Argument(..., help='A path to the promoter expression table. Expression values are assumed to be in a log-scale.'),
            loading: List[Path] = Argument(..., help='A list (if applicable, separated by space) of filenames containing loading matrices. '),
-            loading_transform: List[LoadingTransform] = Option([LoadingTransform.
+            loading_transform: List[LoadingTransform] = Option([LoadingTransform.esf], '--loading-transform', '-t',
                                                                help='A type of transformation to apply to loading '
                                                                'matrices. [orange]ecdf[/orange] substitutes values in the table with empricical CDF,'
                                                                ' [orange]esf[/orange] with negative logarithm of the empirical survival function.'),
@@ -115,6 +116,8 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
             ' contain. If a text file, each line must start with a group name followed by space-separated sample names.'),
            filter_lowexp_w: float = Option(0.9, help='Truncation boundary for filtering out low-expressed promoters. The closer [orange]w[/orange]'
                                            ' to 1, the more promoters will be left in the dataset.'),
+            filter_max_mode: bool = Option(True, help='Use max-mode of filtering. Max-mode keeps promoters that are active at least for some samples.'
+                                           ' If disabled, filtration using GMM on the averages will be ran instead.'),
            filter_plot: Path = Option(None, help='Expression plot with a fitted mixture that is used for filtering.'),
            loading_postfix: List[str] = Option(None, '--loading-postfix', '-p',
                                                help='String postfixes will be appeneded to the motifs from each of the supplied loading matrices'),
@@ -133,7 +136,8 @@ def _create(name: str = Argument(..., help='Project name. [bold]MARADONER[/bold]
     r = create_project(name, expression, loading_matrix_filenames=loading, motif_expression_filenames=motif_expression,
                        loading_matrix_transformations=loading_transform, sample_groups=sample_groups,
                        promoter_filter_lowexp_cutoff=filter_lowexp_w,
-                       promoter_filter_plot_filename=filter_plot,
+                       promoter_filter_plot_filename=filter_plot,
+                       promoter_filter_max=filter_max_mode,
                        compression=compression,
                        motif_postfixes=loading_postfix,
                        motif_names_filename=motif_filename,
@@ -265,7 +269,11 @@ def _export(name: str = Argument(..., help='Project name.'),
            std_mode: Standardization = Option(Standardization.full, help='Whether to standardize activities with plain variances or also decorrelate them.'),
            anova_mode: ANOVAType = Option(ANOVAType.positive, help='If negative, look for non-variative motifs'),
            weighted_zscore: bool = Option(False, help='Reciprocal variance weighted Z-scores'),
-            alpha: float = Option(0.05, help='FDR alpha.')
+            alpha: float = Option(0.05, help='FDR alpha.'),
+            loadings_product: bool = Option(False, help='Export loading matrix-acitvity 3D tensor. This will produce num_of_groups tabular files.'),
+            lp_hdf: bool = Option(True, help='Each loadings-product table will be stored in hdf format (occupies much less space than plain tsv) using float16 precision.'),
+            lp_intercepts: bool = Option(True, help='Include motif means in the 3D tensor.'),
+            lp_tsv_truncation: int = Option(4, help='Number of digits after a floating point to truncate. Decreases the output size of a tabular if [orange]lp-hdf[/orange] is disabled.')):
     t0 = time()
     p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
     p.add_task(description="Exporting results...", total=None)
@@ -273,8 +281,17 @@ def _export(name: str = Argument(..., help='Project name.'),
     export_results(name, output_folder, std_mode=std_mode, anova_mode=anova_mode, alpha=alpha,
                    weighted_zscore=weighted_zscore)
     p.stop()
+
+    if loadings_product:
+        p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+        p.add_task(description="Exporting results...", total=None)
+        p.start()
+        export_loadings_product(name, output_folder, use_hdf=lp_hdf, intercepts=lp_intercepts)
+        p.stop()
+
     dt = time() - t0
     rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+
 
 
 __select_motif_doc = 'Selects best motif variants when the project was created from multiple loading matrices, each with an unique postfix.'\
@@ -287,13 +304,51 @@ def _select_motifs(name: str = Argument(..., help='Project name'),
                   filename: Path = Argument(..., help='Filename where a list of best motif variants will be stored')):
     t0 = time()
     p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
-    p.add_task(description="
+    p.add_task(description="Selecting motifs...", total=None)
     p.start()
     select_motifs_single(name, filename)
     p.stop()
     dt = time() - t0
     rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
 
+
+__grn_doc = 'Tests each promoter against each motif per each group. Some people call it GRN.'
+@app.command('grn',
+             help=__select_motif_doc)
+def _grn(name: str = Argument(..., help='Project name'),
+         folder: Path = Argument(..., help='Output folder where results will be stored. In total, expect number_of_groups tables of size'
+                                 ' comparable to the expression file size.'),
+         hdf: bool = Option(True, help='Use HDF format instead of tar.gz files. Typically eats much less space'),
+         stat: bool = Option(True, help='Save statistics alongside probabilities.'),
+         prior_h1: float = Option(1/10, help='Prior belief on the expected fraction of motifs active per promoter.')):
+    t0 = time()
+    p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+    p.add_task(description="Building GRN...", total=None)
+    p.start()
+    grn(name, output=folder, use_hdf=hdf, save_stat=stat, prior_h1=prior_h1)
+    p.stop()
+    dt = time() - t0
+    rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+
+__estimate_promvar_doc = 'Estimates each promoter variance for each group using empirical Bayesian shrinkage.'\
+                         ' A necessary step before computing GRN.'
+@app.command('estimate-promoter-variance',
+             help=__estimate_promvar_doc)
+def _estimate_promoter_variance(name: str = Argument(..., help='Project name'),
+                                prior_top: float = Option(0.90,
+                                                          help='The fraction from the bottom as ranked by sample'
+                                                          ' variance of promoters to be used for estimating global group-wise variance.'
+                                                          ' Higher values result in higher prior variance and weaken the prior.'
+                                                          )):
+    t0 = time()
+    p = Progress(SpinnerColumn(speed=0.5), TextColumn("[progress.description]{task.description}"), transient=True)
+    p.add_task(description="Estimating each promoter's variance...", total=None)
+    p.start()
+    estimate_promoter_variance(name, prior_top=prior_top)
+    p.stop()
+    dt = time() - t0
+    rprint(f'[green][bold]✔️[/bold] Done![/green]\t time: {dt:.2f} s.')
+
 def main():
     check_packages()
     app()
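Taken together, the new subcommands extend the pipeline after fit/predict: estimate promoter-wise variances first, then build the GRN, and optionally export the loadings-product tensor. A hypothetical session (the maradoner executable name and the kebab-case option spellings are assumptions read off the Typer declarations above, not captured tool output):

maradoner estimate-promoter-variance myproject --prior-top 0.9
maradoner grn myproject grn_output --prior-h1 0.1
maradoner export myproject results --loadings-product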
{maradoner-0.11 → maradoner-0.12}/maradoner/mara/fit.py

@@ -51,7 +51,7 @@ class FitResult:
     promoter_inds_to_drop: list = None
 
 
-def transform_data(data, std_y=False, std_b=False
+def transform_data(data, std_y=False, std_b=False) -> TransformedData:
     Y = data.Y - (data.Y.mean(axis=0, keepdims=True) + data.Y.mean(axis=1, keepdims=True) - data.Y.mean())
     B = data.B - data.B.mean(axis=0, keepdims=True)
     group_inds_inv = list()
@@ -159,8 +159,12 @@ def fit(project: str, tau_mode: TauMode, tau_estimation: TauEstimation,
         data.B, clustering = cluster_data(data.B, mode=clustering,
                                           num_clusters=num_clusters)
     if test_chromosomes:
-
-
+        import re
+        pattern = re.compile(r'chr([0-9XYM]+|\d+)')
+
+        test_chromosomes = set(test_chromosomes)
+        promoter_inds_to_drop = [i for i, p in enumerate(data.promoter_names)
+                                 if pattern.search(p).group() in test_chromosomes]
         data.Y = np.delete(data.Y, promoter_inds_to_drop, axis=0)
         data.B = np.delete(data.B, promoter_inds_to_drop, axis=0)
     else:
@@ -214,12 +218,12 @@ def split_data(data: ProjectData, inds: list) -> tuple[ProjectData, ProjectData]
     data_d = ProjectData(Y=Y_d, B=B_d, K=data.K, weights=data.weights,
                          group_inds=data.group_inds, group_names=data.group_names,
                          motif_names=data.motif_names, promoter_names=promoter_names_d,
-                         motif_postfixes=data.motif_postfixes,
+                         motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                          fmt=data.fmt)
     data = ProjectData(Y=Y, B=B, K=data.K, weights=data.weights,
                        group_inds=data.group_inds, group_names=data.group_names,
                        motif_names=data.motif_names, promoter_names=promoter_names,
-                       motif_postfixes=data.motif_postfixes,
+                       motif_postfixes=data.motif_postfixes, sample_names=data.sample_names,
                        fmt=data.fmt)
     return data_d, data
@@ -255,7 +259,7 @@ def calculate_fov(project: str, gpu: bool,
                   stat_type: GOFStat, keep_motifs: str, x64=True,
                   verbose=True, dump=True):
     def calc_fov(data: TransformedData, fit: FitResult,
-                 activities: ActivitiesPrediction, keep_motifs=None) -> tuple[FOVResult]:
+                 activities: ActivitiesPrediction, keep_motifs=None, Bs=None) -> tuple[FOVResult]:
         def sub(Y, effects) -> FOVResult:
             if stat_type == stat_type.fov:
                 Y1 = Y - effects
@@ -271,10 +275,16 @@ def calculate_fov(project: str, gpu: bool,
             prom = _cor(Y, effects, axis=1)
             sample = _cor(Y, effects, axis=0)
             return FOVResult(total, prom, sample)
-
-
-
-
+        if Bs is None:
+            data = transform_data(data)
+            B = data.B if activities.clustering is None else activities.clustering[0]
+            Y = data.Y
+            U = activities.U
+        else:
+            B = data.B
+            Y = data.Y
+            B = np.hstack((B, np.ones((len(B), 1))))
+            U = np.linalg.pinv(np.hstack((Bs[0], np.ones((len(Bs[0]), 1))))) @ Bs[1]
         if keep_motifs is not None:
             B = B[:, keep_motifs]
             U = U[keep_motifs]
@@ -306,9 +316,9 @@ def calculate_fov(project: str, gpu: bool,
         data, data_test = split_data(data, fit.promoter_inds_to_drop)
     if x64:
         jax.config.update("jax_enable_x64", True)
-    data = transform_data(data, helmert=False)
-    if data_test is not None:
-
+    # data = transform_data(data, helmert=False)
+    # if data_test is not None:
+    #     data_test = transform_data(data_test, helmert=False)
     if gpu:
         device = jax.devices()
     else:
@@ -318,12 +328,15 @@ def calculate_fov(project: str, gpu: bool,
     for status_name, motifs in keep_motifs:
         if status_name:
             status_name = f'{status_name} ({len(motifs)})'
-            print(status_name)
         with jax.default_device(device):
 
             if data_test is not None:
-                test_FOV = calc_fov(data=data_test, fit=fit, activities=activities, keep_motifs=motifs
-
+                test_FOV = calc_fov(data=data_test, fit=fit, activities=activities, keep_motifs=motifs,
+                                    Bs=(data.B, data.Y)
+                                    )
+            train_FOV = calc_fov(data=data, fit=fit, activities=activities, keep_motifs=motifs,
+                                 Bs=(data.B, data.Y)
+                                 )
             if data_test is None:
                 test_FOV = None
             res = TestResult(train_FOV, test_FOV, grouped=False)
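The new Bs branch of calc_fov scores held-out promoters by refitting activities with ordinary least squares against the training split: an intercept column is appended to the loadings and the minimum-norm solution is taken through the Moore-Penrose pseudo-inverse, U = pinv([B | 1]) @ Y. A self-contained toy sketch of that step (shapes are illustrative only):

import numpy as np

P, M, S = 200, 10, 4                             # promoters, motifs, samples
B = np.random.rand(P, M)                         # training loadings
U_true = np.random.randn(M + 1, S)               # activities plus an intercept row
B_aug = np.hstack((B, np.ones((P, 1))))          # intercept-augmented loadings
Y = B_aug @ U_true + 0.01 * np.random.randn(P, S)

U_hat = np.linalg.pinv(B_aug) @ Y                # minimum-norm least-squares fit
print(np.abs(U_hat - U_true).max())              # small: activities are recovered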
{maradoner-0.11 → maradoner-0.12}/maradoner/mara/main.py

@@ -65,7 +65,8 @@ def _gof(name: str = Argument(..., help='Project name.'),
     p.start()
     res = calculate_fov(name, stat_type=stat_type, keep_motifs=keep_motifs, gpu=gpu, x64=x64)
     for name, res in res:
-
+        if name:
+            print(name)
         if stat_type == GOFStat.corr:
             title = 'Pearson correlation'
         else:
{maradoner-0.11 → maradoner-0.12}/maradoner/utils.py

@@ -75,6 +75,7 @@ class ProjectData:
     motif_postfixes: list
     fmt: str
 
+
 def read_init(project_name: str) -> ProjectData:
     if type(project_name) is str:
         filename, fmt = get_init_file(project_name)
@@ -86,7 +87,6 @@ def read_init(project_name: str) -> ProjectData:
     group_inds = list()
     for name in group_names:
         group_inds.append(np.array(init['groups'][name]))
-
     r = ProjectData(
         Y=init['expression'],
         B=init['loadings'],
{maradoner-0.11 → maradoner-0.12}/maradoner.egg-info/PKG-INFO

(Identical to the {maradoner-0.11 → maradoner-0.12}/PKG-INFO diff above: Metadata-Version 2.1, Version 0.12, Dynamic fields dropped, Requires-Dist tqdm>=4.0 and scikit-learn>=1.6 added.)