maradoner 0.10__tar.gz → 0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maradoner might be problematic. Click here for more details.

Files changed (26)
  1. {maradoner-0.10 → maradoner-0.12}/PKG-INFO +4 -11
  2. {maradoner-0.10 → maradoner-0.12}/maradoner/__init__.py +3 -1
  3. {maradoner-0.10 → maradoner-0.12}/maradoner/create.py +4 -2
  4. {maradoner-0.10 → maradoner-0.12}/maradoner/dataset_filter.py +39 -2
  5. {maradoner-0.10 → maradoner-0.12}/maradoner/export.py +108 -30
  6. {maradoner-0.10 → maradoner-0.12}/maradoner/fit.py +167 -56
  7. maradoner-0.12/maradoner/grn.py +177 -0
  8. {maradoner-0.10 → maradoner-0.12}/maradoner/main.py +62 -7
  9. {maradoner-0.10 → maradoner-0.12}/maradoner/mara/export.py +5 -6
  10. {maradoner-0.10 → maradoner-0.12}/maradoner/mara/fit.py +73 -43
  11. {maradoner-0.10 → maradoner-0.12}/maradoner/mara/main.py +19 -13
  12. {maradoner-0.10 → maradoner-0.12}/maradoner/utils.py +1 -1
  13. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/PKG-INFO +4 -11
  14. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/SOURCES.txt +1 -0
  15. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/requires.txt +2 -0
  16. {maradoner-0.10 → maradoner-0.12}/README.md +0 -0
  17. {maradoner-0.10 → maradoner-0.12}/maradoner/mara/__init__.py +0 -0
  18. {maradoner-0.10 → maradoner-0.12}/maradoner/mara.py +0 -0
  19. {maradoner-0.10 → maradoner-0.12}/maradoner/meta_optimizer.py +0 -0
  20. {maradoner-0.10 → maradoner-0.12}/maradoner/select.py +0 -0
  21. {maradoner-0.10 → maradoner-0.12}/maradoner/synthetic_data.py +0 -0
  22. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/dependency_links.txt +0 -0
  23. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/entry_points.txt +0 -0
  24. {maradoner-0.10 → maradoner-0.12}/maradoner.egg-info/top_level.txt +0 -0
  25. {maradoner-0.10 → maradoner-0.12}/setup.cfg +0 -0
  26. {maradoner-0.10 → maradoner-0.12}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.1
2
2
  Name: maradoner
3
- Version: 0.10
3
+ Version: 0.12
4
4
  Summary: Variance-adjusted estimation of motif activities.
5
5
  Home-page: https://github.com/autosome-ru/nemara
6
6
  Author: Georgy Meshcheryakov
@@ -25,15 +25,8 @@ Requires-Dist: statsmodels>=0.14
25
25
  Requires-Dist: datatable>=1.0.0
26
26
  Requires-Dist: dill>=0.3.9
27
27
  Requires-Dist: rich>=12.6.0
28
- Dynamic: author
29
- Dynamic: author-email
30
- Dynamic: classifier
31
- Dynamic: description
32
- Dynamic: description-content-type
33
- Dynamic: home-page
34
- Dynamic: requires-dist
35
- Dynamic: requires-python
36
- Dynamic: summary
28
+ Requires-Dist: tqdm>=4.0
29
+ Requires-Dist: scikit-learn>=1.6
37
30
 
38
31
 
39
32
  **MARADONER**
@@ -1,5 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
- __version__ = '0.10'
2
+ __version__ = '0.12'
3
3
  import importlib
4
4
 
5
5
 
@@ -16,6 +16,8 @@ __min_reqs__ = [
16
16
  'datatable>=1.0.0' ,
17
17
  'dill>=0.3.9',
18
18
  'rich>=12.6.0',
19
+ 'tqdm>=4.0',
20
+ 'scikit-learn>=1.6'
19
21
  ]
20
22
 
21
23
  def versiontuple(v):
@@ -37,7 +37,7 @@ def transform_loadings(df, mode: str, zero_cutoff=1e-9, prom_inds=None):
37
37
 
38
38
  def create_project(project_name: str, promoter_expression_filename: str, loading_matrix_filenames: list[str],
39
39
  motif_expression_filenames=None, loading_matrix_transformations=None, sample_groups=None, motif_postfixes=None,
40
- promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None,
40
+ promoter_filter_lowexp_cutoff=0.95, promoter_filter_plot_filename=None, promoter_filter_max=True,
41
41
  motif_names_filename=None, compression='raw', dump=True, verbose=True):
42
42
  if not os.path.isfile(promoter_expression_filename):
43
43
  raise FileNotFoundError(f'Promoter expression file {promoter_expression_filename} not found.')
@@ -88,7 +88,8 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
88
88
  f'{len(loading_matrix_transformations)}.')
89
89
 
90
90
  logger_print('Filtering promoters of low expression...', verbose)
91
- inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename)
91
+ inds, weights = filter_lowexp(promoter_expression, cutoff=promoter_filter_lowexp_cutoff, fit_plot_filename=promoter_filter_plot_filename,
92
+ max_mode=promoter_filter_max)
92
93
  promoter_expression = promoter_expression.loc[inds]
93
94
  proms = promoter_expression.index
94
95
  loading_matrices = [transform_loadings(df, mode, prom_inds=inds) for df, mode in zip(loading_matrices, loading_matrix_transformations)]
@@ -115,6 +116,7 @@ def create_project(project_name: str, promoter_expression_filename: str, loading
115
116
  motif_expression = None
116
117
  loading_matrices = pd.concat(loading_matrices, axis=1)
117
118
  if motif_names is not None:
119
+ motif_names = list(set(motif_names) & set(loading_matrices.columns))
118
120
  loading_matrices = loading_matrices[motif_names]
119
121
  proms = list(promoter_expression.index)
120
122
  sample_names = list(promoter_expression.columns)
@@ -6,7 +6,19 @@ import pandas as pd
6
6
  import numpy as np
7
7
  from scipy.optimize import minimize
8
8
  from functools import partial
9
+ from sklearn.mixture import GaussianMixture
9
10
 
11
+ def compute_leftmost_probability(Y):
12
+ Y = Y.reshape(-1, 1)
13
+ gmm = GaussianMixture(n_components=2, random_state=0)
14
+ gmm.fit(Y)
15
+
16
+ means = gmm.means_.flatten()
17
+ leftmost_component_index = np.argmin(means)
18
+ probas = gmm.predict_proba(Y)
19
+ leftmost_probs = probas[:, leftmost_component_index]
20
+
21
+ return leftmost_probs, gmm
10
22
 
11
23
  def normax_logpdf(x: jnp.ndarray, mu: float, sigma: float, n: int):
12
24
  x = (x - mu) / sigma
@@ -39,9 +51,33 @@ def loglik(params: jnp.ndarray, x: jnp.ndarray, n: int):
39
51
  w = params[-1]
40
52
  return -logmixture(x, mu, sigma, w, n).sum()
41
53
 
42
- def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, fit_plot_filename=None, plot_dpi=200):
54
+ def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, max_mode=True,
55
+ fit_plot_filename=None, plot_dpi=200):
43
56
  expression = (expression - expression.mean()) / expression.std()
44
-
57
+ if not max_mode:
58
+ expression = expression.mean(axis=1).values
59
+ probs, gmm = compute_leftmost_probability(expression)
60
+ inds = probs < (1-cutoff)
61
+ if fit_plot_filename:
62
+ import matplotlib.pyplot as plt
63
+ from matplotlib.collections import LineCollection
64
+ import seaborn as sns
65
+ x = np.array(sorted(expression))
66
+ pdf = np.exp(gmm.score_samples(expression[:, None]))
67
+ points = np.array([x, pdf]).T.reshape(-1, 1, 2)
68
+ segments = np.concatenate([points[:-1], points[1:]], axis=1)
69
+ plt.figure(dpi=plot_dpi, )
70
+ sns.histplot(expression, stat='density', color='grey')
71
+ lc = LineCollection(segments, cmap='winter')
72
+ lc.set_array(probs)
73
+ lc.set_linewidth(3)
74
+ line = plt.gca().add_collection(lc)
75
+ plt.colorbar(line)
76
+ plt.xlabel('Standardized expression')
77
+ plt.tight_layout()
78
+ plt.savefig(fit_plot_filename)
79
+ return inds, probs
80
+
45
81
  expression_max = expression.max(axis=1).values
46
82
 
47
83
  mu = [-1.0, 0.0]
@@ -105,5 +141,6 @@ def filter_lowexp(expression: pd.DataFrame, cutoff=0.95, fit_plot_filename=None,
105
141
  inds[:k] = False
106
142
  # print(inds)
107
143
  # inds[:] = 1
144
+ print(x[inds].mean(), x[~inds].mean())
108
145
  inds = inds[inds_inv]
109
146
  return inds, ws
@@ -2,8 +2,9 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  from pandas import DataFrame as DF
4
4
  # add dot
5
- from .utils import read_init, openers
5
+ from .utils import read_init, openers, ProjectData
6
6
  from .fit import FOVResult, ActivitiesPrediction, FitResult
7
+ from .grn import grn
7
8
  from scipy.stats import norm, chi2, multivariate_normal, Covariance
8
9
  from scipy.linalg import eigh, lapack, cholesky, solve
9
10
  from statsmodels.stats import multitest
@@ -80,7 +81,9 @@ class Information():
80
81
  try:
81
82
  x = chol_inv(x)
82
83
  except:
83
- print('alarm')
84
+ print('Failed to compute inverse using Cholesky decomposition. ')
85
+ print('This can be a sign of a numerical errors during parameters estimation.')
86
+ print('Will use pseudo-inverse now. The minimal and maximal eigenvalues are:')
84
87
  # print(x.diagonal().min())
85
88
  assert np.allclose(x, x.T), x - x.T
86
89
  x = np.linalg.eigh(x)
@@ -155,11 +158,12 @@ def export_fov(fovs: tuple[FOVResult], folder: str,
155
158
  samples = [fov_null.sample[:, None], fov_means.sample[:, None], fov_motif_means.sample[:, None]]
156
159
  samples = np.concatenate(samples, axis=-1)
157
160
  DF(samples, index=sample_names, columns=cols).to_csv(os.path.join(folder, 'samples.tsv'), sep='\t')
158
-
161
+
162
+
159
163
 
160
164
 
161
165
  def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
162
- B: np.ndarray, corr_stat=False):
166
+ B: np.ndarray, corr_stat=False, map_cov=False):
163
167
  precs = list()
164
168
  istds = list()
165
169
  covs = list()
@@ -170,22 +174,35 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
170
174
  # mot = np.delete(mot, activities.filtered_motifs)
171
175
  # ind = mot * nu < cov.diagonal() + 1e-9
172
176
  # bad_inds[ind] = True
173
-
174
- for cov, U, nu in zip(activities.cov(), activities.U.T, fit.motif_variance.group):
175
- mot = fit.motif_variance.motif
176
- mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
177
+ # mot = fit.motif_variance.motif
178
+ # mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
179
+ motif_variance = fit.motif_variance.motif
180
+ if activities.filtered_motifs is not None:
181
+ motif_variance = np.delete(motif_variance, activities.filtered_motifs)
182
+ B = np.delete(B, activities.filtered_motifs, axis=1)
183
+ U = activities.U
184
+ if map_cov:
185
+ # fit.motif_variance.m
186
+ BTB = B.T @ B
187
+ BTB_s = BTB * motif_variance ** 0.5
188
+ BTB_s = BTB_s @ BTB_s.T
189
+ for cov, U, sigma, n, nu in zip(activities.cov(), U.T,
190
+ activities._cov[-2],
191
+ fit.error_variance.variance, fit.motif_variance.group):
177
192
  # cov = cov[~bad_inds, ~bad_inds]
178
- cov = cov[..., ~bad_inds]
179
- cov = cov[~bad_inds]
193
+ # cov = cov[..., ~bad_inds]
194
+ # cov = cov[~bad_inds]
195
+ if map_cov:
196
+ D = BTB_s * nu + np.identity(len(BTB)) * sigma
197
+ cov = cov @ D @ cov.T * n / sigma ** 2
180
198
  covs.append(cov)
181
- U = U[~bad_inds]
199
+ # U = U[~bad_inds]
182
200
  # prec = np.linalg.inv(np.diag(mot * nu) - cov)
183
- prec = np.linalg.inv(cov)
201
+ prec = np.linalg.pinv(cov, hermitian=True)
184
202
  mean += prec @ U
185
203
  precs.append(prec)
186
- print(bad_inds.sum())
187
204
  total_prec = sum(precs)
188
- total_cov = np.linalg.inv(total_prec)
205
+ total_cov = np.linalg.pinv(total_prec, hermitian=True)
189
206
  mean = total_cov @ mean
190
207
  stats = activities.U[~bad_inds] - mean.reshape(-1, 1)
191
208
  # if corr_stat:
@@ -203,16 +220,11 @@ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
203
220
  fdr = multitest.multipletests(pvalues, alpha=0.05, method='fdr_by')[1]
204
221
  return stats, pvalues, fdr, bad_inds
205
222
 
206
-
207
-
208
-
209
223
 
210
224
  def export_results(project_name: str, output_folder: str,
211
225
  std_mode: Standardization,
212
226
  anova_mode: ANOVAType=ANOVAType.positive,
213
- compute_corrected_pvalues=False,
214
- corrected_numerical=False,
215
- corrected_num_samples=1e5,
227
+ weighted_zscore=False,
216
228
  alpha=0.05,
217
229
  n_jobs=6):
218
230
 
@@ -243,7 +255,7 @@ def export_results(project_name: str, output_folder: str,
243
255
  motif_names_filtered = motif_names
244
256
 
245
257
  os.makedirs(output_folder, exist_ok=True)
246
-
258
+ # grn(data, act, fit, os.path.join(output_folder, 'grn'))
247
259
  error_variance = fit.error_variance.variance
248
260
  error_variance_fim = Information(fit.error_variance.fim)
249
261
  error_variance_stat, error_variance_std = error_variance_fim.standardize(error_variance,
@@ -272,9 +284,12 @@ def export_results(project_name: str, output_folder: str,
272
284
 
273
285
  folder = os.path.join(output_folder, 'params')
274
286
  os.makedirs(folder, exist_ok=True)
287
+ if os.path.isfile(f'{project_name}.promvar.{fmt}'):
288
+ with openers[fmt](f'{project_name}.promvar.{fmt}', 'rb') as f:
289
+ promvar: np.ndarray = dill.load(f)
290
+ DF(promvar, index=prom_names, columns=group_names).to_csv(os.path.join(folder, 'promoter_variances.tsv'), sep='\t')
275
291
  if excluded_motif_group is not None:
276
292
  motif_group_variance_std = np.insert(motif_group_variance_std, excluded_motif_group, np.nan)
277
- print(error_variance.shape, error_variance_std.shape, motif_group_variance.shape, motif_group_variance_std.shape)
278
293
  DF(np.array([error_variance, error_variance_std, motif_group_variance, motif_group_variance_std]).T,
279
294
  index=group_names,
280
295
  columns=['sigma', 'sigma_std', 'nu', 'nu_std']).to_csv(os.path.join(folder, 'group_variances.tsv'),
@@ -324,12 +339,11 @@ def export_results(project_name: str, output_folder: str,
324
339
  pval = calc_z_test(anova_ass)
325
340
 
326
341
  fdrs = multitest.multipletests(pval, alpha=0.05, method='fdr_bh')[1]
327
- lrt = 2 * fit.motif_variance.logratios
328
- lrt_pvalues = chi2.sf(lrt, 1)
329
- lrt_fdr = multitest.multipletests(lrt_pvalues, alpha=0.05, method='fdr_bh')[1]
330
- anova_ass = DF(np.array([anova_ass, pval, fdrs, lrt, lrt_pvalues, lrt_fdr]).T, index=motif_names_filtered,
331
- columns=['stat', 'p-value', 'FDR',
332
- 'logratio', 'lrt_p-value', 'lrt_FDR'])
342
+ # lrt = 2 * fit.motif_variance.logratios
343
+ # lrt_pvalues = chi2.sf(lrt, 1)
344
+ # lrt_fdr = multitest.multipletests(lrt_pvalues, alpha=0.05, method='fdr_bh')[1]
345
+ anova_ass = DF(np.array([anova_ass, pval, fdrs]).T, index=motif_names_filtered,
346
+ columns=['stat', 'p-value', 'FDR'])
333
347
  anova_ass.to_csv(os.path.join(folder, 'anova.tsv'), sep='\t')
334
348
 
335
349
  sign = motif_mean.flatten() / motif_mean_std
@@ -347,6 +361,28 @@ def export_results(project_name: str, output_folder: str,
347
361
  index=motif_names)
348
362
  sign_ass.to_csv(os.path.join(folder, 'sign.tsv'), sep='\t')
349
363
 
364
+ folder = os.path.join(output_folder, 'activities')
365
+ os.makedirs(folder, exist_ok=True)
366
+ U = list()
367
+ stds = list()
368
+ for u, cov in zip(act.U.T, act.cov()):
369
+ std = cov.diagonal() ** 0.5
370
+ u = u / std
371
+ U.append(u)
372
+ stds.append(std)
373
+ U = np.array(U).T
374
+ DF(U, index=motif_names_filtered, columns=group_names).to_csv(os.path.join(folder, 'activity.tsv'), sep='\t')
375
+ U = U ** 2
376
+ if weighted_zscore:
377
+ U_total = U.sum(axis=1, keepdims=True) / (1 / np.array(stds).T ** 2).sum(axis=1, keepdims=True)
378
+ else:
379
+ U_total = U.mean(axis=1, keepdims=True)
380
+
381
+ U = np.hstack((U_total, U)) ** 0.5
382
+ DF(U, index=motif_names_filtered,
383
+ columns=['overall'] + list(group_names)).to_csv(os.path.join(folder, 'z_score.tsv'), sep='\t')
384
+ DF(act.U_raw, index=motif_names_filtered, columns=data.sample_names).to_csv(os.path.join(folder, 'activity_raw.tsv'), sep='\t')
385
+
350
386
  if os.path.isfile(f'{project_name}.fov.{fmt}'):
351
387
  with open(f'{project_name}.fov.{fmt}', 'rb') as f:
352
388
  fov = dill.load(f)
@@ -373,6 +409,48 @@ def export_results(project_name: str, output_folder: str,
373
409
  sample_names=sample_names)
374
410
 
375
411
 
412
+ def export_loadings_product(project_name: str, output_folder: str,
413
+ use_hdf: bool = True, intercepts: bool = True,
414
+ tsv_truncation=4):
376
415
 
377
- # return {'z-test': z_test, 'anova': anova, 'off_test': off_test,
378
- # 'anova_ass': anova_ass, 'sign_ass': sign_ass}
416
+
417
+ data = read_init(project_name)
418
+ fmt = data.fmt
419
+ motif_names = data.motif_names
420
+ prom_names = data.promoter_names
421
+ # del data
422
+ with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
423
+ fit: FitResult = dill.load(f)
424
+ if fit.promoter_inds_to_drop:
425
+ prom_names = np.delete(prom_names, fit.promoter_inds_to_drop)
426
+ group_names = fit.group_names
427
+ with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
428
+ act: ActivitiesPrediction = dill.load(f)
429
+
430
+ output_folder = os.path.join(output_folder, 'loadings-product')
431
+ os.makedirs(output_folder, exist_ok=True)
432
+
433
+ U = act.U
434
+ B = data.B
435
+ mu = fit.motif_mean.mean
436
+
437
+ if act.filtered_motifs is not None:
438
+ motif_names = np.delete(motif_names, act.filtered_motifs)
439
+ B = np.delete(B, act.filtered_motifs, axis=1)
440
+ mu = np.delete(mu, act.filtered_motifs)
441
+ BM = B * mu
442
+ for name, U in zip(group_names, U.T):
443
+ effect = B * U
444
+ if intercepts:
445
+ effect += BM
446
+ if use_hdf:
447
+ effect = effect.astype(np.half)
448
+ filename = os.path.join(output_folder, f'{name}.hdf')
449
+ DF(data=effect, index=prom_names, columns=motif_names).to_hdf(filename, key='lrt', mode='w', complevel=4)
450
+ else:
451
+ filename = os.path.join(output_folder, f'{name}.tsv')
452
+ DF(data=effect, index=prom_names, columns=motif_names).to_csv(filename, sep='\t',
453
+ float_format=f'%.{tsv_truncation}f')
454
+
455
+
456
+