maradoner 0.9__tar.gz → 0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maradoner might be problematic. Click here for more details.

Files changed (25)
  1. {maradoner-0.9 → maradoner-0.10}/PKG-INFO +1 -1
  2. {maradoner-0.9 → maradoner-0.10}/maradoner/__init__.py +1 -1
  3. {maradoner-0.9 → maradoner-0.10}/maradoner/export.py +91 -103
  4. {maradoner-0.9 → maradoner-0.10}/maradoner/fit.py +150 -59
  5. {maradoner-0.9 → maradoner-0.10}/maradoner/main.py +4 -1
  6. maradoner-0.10/maradoner/mara/__init__.py +2 -0
  7. maradoner-0.10/maradoner/mara/export.py +94 -0
  8. maradoner-0.10/maradoner/mara/fit.py +319 -0
  9. maradoner-0.10/maradoner/mara/main.py +110 -0
  10. {maradoner-0.9 → maradoner-0.10}/maradoner/meta_optimizer.py +17 -15
  11. {maradoner-0.9 → maradoner-0.10}/maradoner/utils.py +2 -0
  12. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/PKG-INFO +1 -1
  13. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/SOURCES.txt +5 -1
  14. {maradoner-0.9 → maradoner-0.10}/README.md +0 -0
  15. {maradoner-0.9 → maradoner-0.10}/maradoner/create.py +0 -0
  16. {maradoner-0.9 → maradoner-0.10}/maradoner/dataset_filter.py +0 -0
  17. {maradoner-0.9 → maradoner-0.10}/maradoner/mara.py +0 -0
  18. {maradoner-0.9 → maradoner-0.10}/maradoner/select.py +0 -0
  19. {maradoner-0.9 → maradoner-0.10}/maradoner/synthetic_data.py +0 -0
  20. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/dependency_links.txt +0 -0
  21. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/entry_points.txt +0 -0
  22. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/requires.txt +0 -0
  23. {maradoner-0.9 → maradoner-0.10}/maradoner.egg-info/top_level.txt +0 -0
  24. {maradoner-0.9 → maradoner-0.10}/setup.cfg +0 -0
  25. {maradoner-0.9 → maradoner-0.10}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: maradoner
3
- Version: 0.9
3
+ Version: 0.10
4
4
  Summary: Variance-adjusted estimation of motif activities.
5
5
  Home-page: https://github.com/autosome-ru/nemara
6
6
  Author: Georgy Meshcheryakov
@@ -1,5 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
- __version__ = '0.9'
2
+ __version__ = '0.10'
3
3
  import importlib
4
4
 
5
5
 
@@ -3,7 +3,7 @@
3
3
  from pandas import DataFrame as DF
4
4
  # add dot
5
5
  from .utils import read_init, openers
6
- from .fit import FOVResult
6
+ from .fit import FOVResult, ActivitiesPrediction, FitResult
7
7
  from scipy.stats import norm, chi2, multivariate_normal, Covariance
8
8
  from scipy.linalg import eigh, lapack, cholesky, solve
9
9
  from statsmodels.stats import multitest
@@ -13,6 +13,8 @@ from tqdm import tqdm
13
13
  import multiprocessing as mp
14
14
  from functools import partial
15
15
  from scipy.integrate import quad
16
+ import math
17
+ import time
16
18
  import dill
17
19
  import os
18
20
 
@@ -58,7 +60,11 @@ def chol_inv(x: np.array):
58
60
  class Information():
59
61
  eps = 1e-10
60
62
 
61
- def __init__(self, fim: np.ndarray, slc=None, use_preconditioner=False):
63
+ def __init__(self, fim: np.ndarray, slc=None, use_preconditioner=False, filter_items=None):
64
+ self.filter_items = filter_items
65
+ if filter_items is not None:
66
+ fim = np.delete(fim, filter_items, axis=0)
67
+ fim = np.delete(fim, filter_items, axis=1)
62
68
  self.square_root_inv = self._square_root_inv(fim, slc, corr=True)
63
69
  precond = 1 / fim.diagonal() ** 0.5
64
70
  if not use_preconditioner:
@@ -69,11 +75,17 @@ class Information():
69
75
  self.slice = slice(None, None) if slc is None else slc
70
76
 
71
77
  def _inv(self, x: np.ndarray):
78
+ x = np.array(x)
79
+ # t = np.linalg.eigh(x)
72
80
  try:
73
81
  x = chol_inv(x)
74
82
  except:
75
83
  print('alarm')
84
+ # print(x.diagonal().min())
85
+ assert np.allclose(x, x.T), x - x.T
76
86
  x = np.linalg.eigh(x)
87
+ print(x[0].min(), x[0].max())
88
+ # x = np.linalg.pinv(x, hermitian=True)
77
89
  x = x[1] * (1/np.clip(x[0], self.eps, float('inf'))) @ x[1].T
78
90
  return x
79
91
 
@@ -94,6 +106,8 @@ class Information():
94
106
  def standardize(self, x: np.ndarray,
95
107
  mode: Standardization=Standardization.std,
96
108
  return_std=True):
109
+ if self.filter_items is not None:
110
+ x = np.delete(x, self.filter_items)
97
111
  x = x / self.precond[self.slice]
98
112
  cov = self._inv(self.fim)
99
113
  cov = cov[self.slice, self.slice]
@@ -127,59 +141,6 @@ class Information():
127
141
 
128
142
 
129
143
 
130
- def _corrected_numerical(x, mvn, n: int):
131
- x = np.abs(x)
132
- return 1.0 - mvn.cdf(np.repeat(x, n), lower_limit=-x)
133
-
134
- def _corrected_sampled(x, information: Information, num_samples: int, m: int,
135
- num_repeats=1):
136
- x = np.abs(x)
137
- c = 0
138
- n = 0
139
- for _ in range(num_repeats):
140
- t = np.abs(information.cholesky_transform(norm.rvs(size=(m, num_samples))))
141
- c += np.any(t > x, axis=0).sum()
142
- n += num_samples
143
- return c / n
144
-
145
- def corrected_z_test(stat: np.ndarray, information: Information,
146
- numerical: bool, num_samples: int,
147
- n_jobs: int) -> np.ndarray:
148
- if numerical:
149
- raise NotImplementedError
150
-
151
- num_samples = int(num_samples)
152
- f = partial(_corrected_sampled, information=information, num_samples=num_samples,
153
- m=len(stat), num_repeats=1)
154
-
155
- if n_jobs > 1:
156
- with mp.Pool(n_jobs) as p:
157
- corrected = np.array(list(p.map(f , stat)))
158
- else:
159
- corrected = np.array(list(map(f, stat)))
160
- return corrected
161
-
162
-
163
- def weird_test(mu, shift=0, eps=1e-12, std=None):
164
- if std is None:
165
- std = np.ones_like(mu)
166
-
167
- def log_integrand(u, mu, mu_k, std, std_k):
168
- return norm.logpdf(u, loc=mu_k, scale=std_k) + norm.logcdf((u - mu) / std_k).sum()
169
-
170
- def integrand(u, mu, mu_k, std, std_k):
171
- return np.exp(log_integrand(u, mu, mu_k, std, std_k) + shift)
172
-
173
- argmax = np.zeros_like(mu, dtype=float)
174
- for k in tqdm(list(range(len(mu)))):
175
- argmax[k] = quad(lambda x: integrand(x, np.delete(mu, k), mu[k], np.delete(std, k), std[k]),
176
- -np.inf, np.inf, epsabs=eps, epsrel=eps)[0]
177
- result = np.zeros_like(argmax)
178
- inds = np.arange(len(result), dtype=int)
179
- return argmax
180
- for k in range(len(mu)):
181
- result[k] = argmax[np.delete(inds, k)].sum()
182
- return result * np.exp(-shift)
183
144
 
184
145
  def export_fov(fovs: tuple[FOVResult], folder: str,
185
146
  promoter_names: list[str], sample_names: list[str]):
@@ -195,6 +156,55 @@ def export_fov(fovs: tuple[FOVResult], folder: str,
195
156
  samples = np.concatenate(samples, axis=-1)
196
157
  DF(samples, index=sample_names, columns=cols).to_csv(os.path.join(folder, 'samples.tsv'), sep='\t')
197
158
 
159
+
160
+
161
+ def posterior_anova(activities: ActivitiesPrediction, fit: FitResult,
162
+ B: np.ndarray, corr_stat=False):
163
+ precs = list()
164
+ istds = list()
165
+ covs = list()
166
+ mean = 0.0
167
+ bad_inds = np.zeros(activities.U.shape[0], dtype=bool)
168
+ # for cov, U, nu in zip(activities.cov(), activities.U.T, fit.motif_variance.group):
169
+ # mot = fit.motif_variance.motif
170
+ # mot = np.delete(mot, activities.filtered_motifs)
171
+ # ind = mot * nu < cov.diagonal() + 1e-9
172
+ # bad_inds[ind] = True
173
+
174
+ for cov, U, nu in zip(activities.cov(), activities.U.T, fit.motif_variance.group):
175
+ mot = fit.motif_variance.motif
176
+ mot = np.delete(mot, activities.filtered_motifs)[~bad_inds]
177
+ # cov = cov[~bad_inds, ~bad_inds]
178
+ cov = cov[..., ~bad_inds]
179
+ cov = cov[~bad_inds]
180
+ covs.append(cov)
181
+ U = U[~bad_inds]
182
+ # prec = np.linalg.inv(np.diag(mot * nu) - cov)
183
+ prec = np.linalg.inv(cov)
184
+ mean += prec @ U
185
+ precs.append(prec)
186
+ print(bad_inds.sum())
187
+ total_prec = sum(precs)
188
+ total_cov = np.linalg.inv(total_prec)
189
+ mean = total_cov @ mean
190
+ stats = activities.U[~bad_inds] - mean.reshape(-1, 1)
191
+ # if corr_stat:
192
+ # istd = 1 / total_cov.diagonal() ** 0.5
193
+ # total_cor = istd.reshape(-1, 1) * total_cov * istd
194
+ # stats = total_cor @ stats
195
+ # total_cov = total_cor @ total_cov @ total_cor
196
+ # stats = (1 / total_cov.diagonal().reshape(-1, 1)) ** 0.5 * stats
197
+ istds = [1 / c.diagonal() ** 0.5 for c in covs]
198
+ istds = np.array(istds).T
199
+ stats = stats * istds
200
+ stats = stats ** 2
201
+ stats = stats.sum(axis=-1)
202
+ pvalues = chi2.sf(stats, len(precs) - 1)
203
+ fdr = multitest.multipletests(pvalues, alpha=0.05, method='fdr_by')[1]
204
+ return stats, pvalues, fdr, bad_inds
205
+
206
+
207
+
198
208
 
199
209
 
200
210
  def export_results(project_name: str, output_folder: str,
@@ -221,12 +231,12 @@ def export_results(project_name: str, output_folder: str,
221
231
  prom_names = data.promoter_names
222
232
  # del data
223
233
  with openers[fmt](f'{project_name}.fit.{fmt}', 'rb') as f:
224
- fit = dill.load(f)
234
+ fit: FitResult = dill.load(f)
225
235
  if fit.promoter_inds_to_drop:
226
236
  prom_names = np.delete(prom_names, fit.promoter_inds_to_drop)
227
237
  group_names = fit.group_names
228
238
  with openers[fmt](f'{project_name}.predict.{fmt}', 'rb') as f:
229
- act = dill.load(f)
239
+ act: ActivitiesPrediction = dill.load(f)
230
240
  if act.filtered_motifs is not None:
231
241
  motif_names_filtered = np.delete(motif_names, act.filtered_motifs)
232
242
  else:
@@ -240,13 +250,13 @@ def export_results(project_name: str, output_folder: str,
240
250
  mode=Standardization.std)
241
251
 
242
252
  motif_variance = fit.motif_variance.motif
243
- motif_variance_fim = Information(fit.motif_variance.fim, slice(None, len(motif_names)))
253
+ motif_variance_fim = Information(fit.motif_variance.fim, slice(None, len(motif_names_filtered)),
254
+ filter_items=act.filtered_motifs)
244
255
  motif_variance_stat, motif_variance_std = motif_variance_fim.standardize(motif_variance,
245
256
  mode=Standardization.std)
246
257
 
247
258
  motif_group_variance = fit.motif_variance.group
248
259
  excluded_motif_group = fit.motif_variance.fixed_group
249
-
250
260
  motif_group_variance_fim = Information(fit.motif_variance.fim, slice(len(motif_names), None))
251
261
  motif_group_variance_std = motif_group_variance_fim.covariance().diagonal() ** 0.5
252
262
 
@@ -254,7 +264,7 @@ def export_results(project_name: str, output_folder: str,
254
264
  motif_mean = fit.motif_mean.mean.flatten()
255
265
  motif_mean_fim = Information(fit.motif_mean.fim)
256
266
  motif_mean_stat, motif_mean_std = motif_mean_fim.standardize(motif_mean,
257
- mode=Standardization.std)
267
+ mode=Standardization.std)
258
268
 
259
269
  promoter_mean = fit.promoter_mean.mean.flatten()
260
270
  # del fit
@@ -264,6 +274,7 @@ def export_results(project_name: str, output_folder: str,
264
274
  os.makedirs(folder, exist_ok=True)
265
275
  if excluded_motif_group is not None:
266
276
  motif_group_variance_std = np.insert(motif_group_variance_std, excluded_motif_group, np.nan)
277
+ print(error_variance.shape, error_variance_std.shape, motif_group_variance.shape, motif_group_variance_std.shape)
267
278
  DF(np.array([error_variance, error_variance_std, motif_group_variance, motif_group_variance_std]).T,
268
279
  index=group_names,
269
280
  columns=['sigma', 'sigma_std', 'nu', 'nu_std']).to_csv(os.path.join(folder, 'group_variances.tsv'),
@@ -284,67 +295,41 @@ def export_results(project_name: str, output_folder: str,
284
295
  sep='\t')
285
296
  DF(motif_mean_fim.correlation(), index=motif_names, columns=motif_names).to_csv(os.path.join(folder, 'motif_means.tsv'),
286
297
  sep='\t')
287
- DF(motif_variance_fim.correlation(), index=motif_names, columns=motif_names).to_csv(os.path.join(folder, 'motif_variances.tsv'),
298
+ DF(motif_variance_fim.correlation(), index=motif_names_filtered, columns=motif_names_filtered).to_csv(os.path.join(folder, 'motif_variances.tsv'),
288
299
  sep='\t')
289
300
  _group_names = group_names
290
301
  if excluded_motif_group is not None:
291
302
  _group_names = np.delete(_group_names, excluded_motif_group)
292
303
  DF(motif_group_variance_fim.correlation(), index=_group_names, columns=_group_names).to_csv(os.path.join(folder, 'motif_group_variances.tsv'),
293
304
  sep='\t')
294
- # DF(motif_cor_cross, index=motif_names, columns=_group_names).to_csv(os.path.join(folder, 'motif_cross.tsv'),
295
- # sep='\t')
305
+
296
306
  DF(error_variance_fim.correlation(), index=group_names, columns=group_names).to_csv(os.path.join(folder, 'error_variances.tsv'),
297
307
  sep='\t')
298
308
 
299
309
 
300
310
  folder = output_folder
301
- U_raw, U_decor, stds = act.U, act.U_decor, act.stds
302
311
 
303
- if std_mode == Standardization.full:
304
- U = U_decor
305
- else:
306
- U = U_raw / stds
307
- folder = os.path.join(output_folder, 'activities')
308
- os.makedirs(folder, exist_ok=True)
309
- DF(U_raw, index=motif_names_filtered, columns=group_names).to_csv(os.path.join(folder, 'activity_raw.tsv'), sep='\t')
310
- DF(U, index=motif_names_filtered, columns=group_names).to_csv(os.path.join(folder, 'activity.tsv'), sep='\t')
311
- DF(stds, index=motif_names_filtered, columns=group_names).to_csv(os.path.join(folder, 'activity_stds.tsv'), sep='\t')
312
-
313
312
  folder = os.path.join(output_folder, 'tests', 'prediction_based')
314
313
  os.makedirs(folder, exist_ok=True)
315
- z_test = 2 * norm.sf(np.abs(U))#calc_z_test(U)
316
- z_test_fdr = [multitest.multipletests(z_test[:, i], alpha=alpha, method='fdr_bh')[1] for i in range(z_test.shape[1])]
317
- z_test_fdr = np.array(z_test_fdr).T
318
- z_test = DF(z_test, index=motif_names_filtered, columns=group_names)
319
- z_test.to_csv(os.path.join(folder, 'z_test.tsv'), sep='\t')
320
- z_test = DF(z_test_fdr, index=motif_names_filtered, columns=group_names)
321
- z_test.to_csv(os.path.join(folder, 'z_test_fdr.tsv'), sep='\t')
322
- stat = (U ** 2).sum(axis=1)
323
- anova = chi2.sf(stat, df=U.shape[1])
324
- fdrs = multitest.multipletests(anova, alpha=0.05, method='fdr_bh')[1]
325
- anova = DF([stat, anova, fdrs], columns=motif_names_filtered, index=['stat', 'p-value', 'FDR']).T
314
+
315
+ stat, pvalue, fdr, bad_inds = posterior_anova(act, fit, B=data.B)
316
+ motif_names_filtered = np.array(motif_names_filtered)[~bad_inds]
317
+ anova = DF([stat, pvalue, fdr], columns=motif_names_filtered, index=['stat', 'p-value', 'FDR']).T
326
318
  anova.to_csv(os.path.join(folder, 'anova.tsv'), sep='\t')
327
- stat = (U ** 2).min(axis=1)
328
- off_test = -np.expm1(U.shape[1]*chi2.logsf(stat, df=1))
329
- fdrs = multitest.multipletests(off_test, alpha=0.05, method='fdr_bh')[1]
330
- off_test = DF([stat, off_test, fdrs], columns=motif_names_filtered, index=['stat', 'p-value', 'FDR']).T
331
- off_test.to_csv(os.path.join(folder, 'off_test.tsv'), sep='\t')
332
-
319
+
333
320
  folder = os.path.join(output_folder, 'tests', 'asymptotics_based')
334
321
  os.makedirs(folder, exist_ok=True)
335
322
 
336
323
  anova_ass = motif_variance_stat
337
324
  pval = calc_z_test(anova_ass)
338
- # anova_ass = motif_variance_stat * motif_variance_std
339
- # pval = weird_test(anova_ass, std=motif_variance_std)
325
+
340
326
  fdrs = multitest.multipletests(pval, alpha=0.05, method='fdr_bh')[1]
341
- if compute_corrected_pvalues:
342
- corrected_pval = corrected_z_test(anova_ass, motif_variance_fim, numerical=corrected_numerical,
343
- num_samples=corrected_num_samples,
344
- n_jobs=n_jobs)
345
- anova_ass = DF(np.array([anova_ass, pval, fdrs, corrected_pval]).T, index=motif_names, columns=['stat', 'p-value', 'FDR', 'corrected-p-value'])
346
- else:
347
- anova_ass = DF(np.array([anova_ass, pval, fdrs]).T, index=motif_names, columns=['stat', 'p-value', 'FDR'])
327
+ lrt = 2 * fit.motif_variance.logratios
328
+ lrt_pvalues = chi2.sf(lrt, 1)
329
+ lrt_fdr = multitest.multipletests(lrt_pvalues, alpha=0.05, method='fdr_bh')[1]
330
+ anova_ass = DF(np.array([anova_ass, pval, fdrs, lrt, lrt_pvalues, lrt_fdr]).T, index=motif_names_filtered,
331
+ columns=['stat', 'p-value', 'FDR',
332
+ 'logratio', 'lrt_p-value', 'lrt_FDR'])
348
333
  anova_ass.to_csv(os.path.join(folder, 'anova.tsv'), sep='\t')
349
334
 
350
335
  sign = motif_mean.flatten() / motif_mean_std
@@ -376,7 +361,10 @@ def export_results(project_name: str, output_folder: str,
376
361
  name = data.group_names[i]
377
362
  for k, j in enumerate(inds):
378
363
  sample_names[j] = f'{name}_{k+1}'
379
- promoter_names_train = np.delete(data.promoter_names, fit.promoter_inds_to_drop)
364
+ if fit.promoter_inds_to_drop:
365
+ promoter_names_train = np.delete(data.promoter_names, fit.promoter_inds_to_drop)
366
+ else:
367
+ promoter_names_train = data.promoter_names
380
368
  export_fov(train, os.path.join(folder, 'train'), promoter_names=promoter_names_train,
381
369
  sample_names=sample_names)
382
370
  if test is not None:
@@ -386,5 +374,5 @@ def export_results(project_name: str, output_folder: str,
386
374
 
387
375
 
388
376
 
389
- return {'z-test': z_test, 'anova': anova, 'off_test': off_test,
390
- 'anova_ass': anova_ass, 'sign_ass': sign_ass}
377
+ # return {'z-test': z_test, 'anova': anova, 'off_test': off_test,
378
+ # 'anova_ass': anova_ass, 'sign_ass': sign_ass}