pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53)
  1. pertpy/__init__.py +3 -2
  2. pertpy/data/__init__.py +5 -1
  3. pertpy/data/_dataloader.py +2 -4
  4. pertpy/data/_datasets.py +203 -92
  5. pertpy/metadata/__init__.py +4 -0
  6. pertpy/metadata/_cell_line.py +826 -0
  7. pertpy/metadata/_compound.py +129 -0
  8. pertpy/metadata/_drug.py +242 -0
  9. pertpy/metadata/_look_up.py +582 -0
  10. pertpy/metadata/_metadata.py +73 -0
  11. pertpy/metadata/_moa.py +129 -0
  12. pertpy/plot/__init__.py +1 -9
  13. pertpy/plot/_augur.py +53 -116
  14. pertpy/plot/_coda.py +277 -677
  15. pertpy/plot/_guide_rna.py +17 -35
  16. pertpy/plot/_milopy.py +59 -134
  17. pertpy/plot/_mixscape.py +152 -391
  18. pertpy/preprocessing/_guide_rna.py +88 -4
  19. pertpy/tools/__init__.py +8 -13
  20. pertpy/tools/_augur.py +315 -17
  21. pertpy/tools/_cinemaot.py +143 -4
  22. pertpy/tools/_coda/_base_coda.py +1210 -65
  23. pertpy/tools/_coda/_sccoda.py +50 -21
  24. pertpy/tools/_coda/_tasccoda.py +27 -19
  25. pertpy/tools/_dialogue.py +164 -56
  26. pertpy/tools/_differential_gene_expression.py +240 -14
  27. pertpy/tools/_distances/_distance_tests.py +8 -8
  28. pertpy/tools/_distances/_distances.py +184 -34
  29. pertpy/tools/_enrichment.py +465 -0
  30. pertpy/tools/_milo.py +345 -11
  31. pertpy/tools/_mixscape.py +668 -50
  32. pertpy/tools/_perturbation_space/_clustering.py +5 -1
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
  35. pertpy/tools/_perturbation_space/_simple.py +51 -10
  36. pertpy/tools/_scgen/__init__.py +1 -1
  37. pertpy/tools/_scgen/_scgen.py +701 -0
  38. pertpy/tools/_scgen/_utils.py +1 -3
  39. pertpy/tools/decoupler_LICENSE +674 -0
  40. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
  41. pertpy-0.7.0.dist-info/RECORD +53 -0
  42. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
  43. pertpy/plot/_cinemaot.py +0 -81
  44. pertpy/plot/_dialogue.py +0 -91
  45. pertpy/plot/_scgen.py +0 -337
  46. pertpy/tools/_metadata/__init__.py +0 -0
  47. pertpy/tools/_metadata/_cell_line.py +0 -613
  48. pertpy/tools/_metadata/_look_up.py +0 -342
  49. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  50. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  51. pertpy-0.6.0.dist-info/RECORD +0 -50
  52. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  53. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -3,24 +3,25 @@ from __future__ import annotations
3
3
  from abc import ABC, abstractmethod
4
4
  from typing import TYPE_CHECKING
5
5
 
6
+ import numba
6
7
  import numpy as np
7
8
  import pandas as pd
8
9
  from ott.geometry.geometry import Geometry
9
10
  from ott.geometry.pointcloud import PointCloud
10
11
  from ott.problems.linear.linear_problem import LinearProblem
11
12
  from ott.solvers.linear.sinkhorn import Sinkhorn
13
+ from pandas import Series
12
14
  from rich.progress import track
13
15
  from scipy.sparse import issparse
14
16
  from scipy.spatial.distance import cosine
15
17
  from scipy.special import gammaln
16
- from scipy.stats import kendalltau, pearsonr, spearmanr
18
+ from scipy.stats import kendalltau, kstest, pearsonr, spearmanr
19
+ from sklearn.linear_model import LogisticRegression
17
20
  from sklearn.metrics import pairwise_distances, r2_score
18
21
  from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel
19
22
  from statsmodels.discrete.discrete_model import NegativeBinomialP
20
23
 
21
24
  if TYPE_CHECKING:
22
- from collections.abc import Iterable
23
-
24
25
  from anndata import AnnData
25
26
 
26
27
 
@@ -30,6 +31,7 @@ class Distance:
30
31
  The distance metric can be specified by the user. This class also provides a
31
32
  method to compute the pairwise distances between all groups of cells.
32
33
  Currently available metrics:
34
+
33
35
  - "edistance": Energy distance (Default metric).
34
36
  In essence, it is twice the mean pairwise distance between cells of two
35
37
  groups minus the mean pairwise distance between cells within each group
@@ -55,8 +57,6 @@ class Distance:
55
57
  Coefficient of determination distance between the means of cells from two groups.
56
58
  - "mean_pairwise": Mean pairwise distance.
57
59
  Mean of the pairwise euclidean distances between cells of two groups.
58
- - "mean_pairwise": Mean pairwise distance.
59
- Mean of the pairwise euclidean distances between cells of two groups.
60
60
  - "mmd": Maximum mean discrepancy
61
61
  Maximum mean discrepancy between the cells of two groups.
62
62
  Here, uses linear, rbf, and quadratic polynomial MMD. For theory on MMD in single-cell applications, see
@@ -66,14 +66,20 @@ class Distance:
66
66
  OTT-JAX implementation of the Sinkhorn algorithm to compute the distance.
67
67
  For more information on the optimal transport solver, see
68
68
  `Cuturi et al. (2013) <https://proceedings.neurips.cc/paper/2013/file/af21d0c97db2e27e13572cbf59eb343d-Paper.pdf>`__.
69
- - "kl_divergence": Kullback–Leibler divergence distance.
69
+ - "sym_kldiv": symmetrized Kullback–Leibler divergence distance.
70
70
  Kullback–Leibler divergence of the gaussian distributions between cells of two groups.
71
- Here we fit a gaussian distribution over each group of cells and then calculate the KL divergence
71
+ Here we fit a gaussian distribution over one group of cells and then calculate the KL divergence on the other, and vice versa.
72
72
  - "t_test": t-test statistic.
73
73
  T-test statistic measure between cells of two groups.
74
+ - "ks_test": Kolmogorov-Smirnov test statistic.
75
+ Kolmogorov-Smirnov test statistic measure between cells of two groups.
74
76
  - "nb_ll": log-likelihood over negative binomial
75
77
  Average of log-likelihoods of samples of the secondary group after fitting a negative binomial distribution
76
78
  over the samples of the first group.
79
+ - "classifier_proba": probability of a binary classifier
80
+ Average of the classification probability of the perturbation for a binary classifier.
81
+ - "classifier_cp": classifier class projection
82
+ Average of 1 - (predicted probability of the class) over the cells of the selected group.
77
83
 
78
84
  Attributes:
79
85
  metric: Name of distance metric.
@@ -137,12 +143,18 @@ class Distance:
137
143
  metric_fct = MMD()
138
144
  elif metric == "wasserstein":
139
145
  metric_fct = WassersteinDistance()
140
- elif metric == "kl_divergence":
141
- metric_fct = KLDivergence()
146
+ elif metric == "sym_kldiv":
147
+ metric_fct = SymmetricKLDivergence()
142
148
  elif metric == "t_test":
143
149
  metric_fct = TTestDistance()
150
+ elif metric == "ks_test":
151
+ metric_fct = KSTestDistance()
144
152
  elif metric == "nb_ll":
145
153
  metric_fct = NBLL()
154
+ elif metric == "classifier_proba":
155
+ metric_fct = ClassifierProbaDistance()
156
+ elif metric == "classifier_cp":
157
+ metric_fct = ClassifierClassProjection()
146
158
  else:
147
159
  raise ValueError(f"Metric {metric} not recognized.")
148
160
  self.metric_fct = metric_fct
@@ -280,7 +292,7 @@ class Distance:
280
292
  n_jobs: int = -1,
281
293
  **kwargs,
282
294
  ) -> pd.DataFrame:
283
- """Get pairwise distances between groups of cells.
295
+ """Get distances between one selected cell group and the remaining other cell groups.
284
296
 
285
297
  Args:
286
298
  adata: Annotated data matrix.
@@ -301,6 +313,11 @@ class Distance:
301
313
  >>> Distance = pt.tools.Distance(metric="edistance")
302
314
  >>> pairwise_df = Distance.onesided_distances(adata, groupby="perturbation", selected_group="control")
303
315
  """
316
+ if self.metric == "classifier_cp":
317
+ return self.metric_fct.onesided_distances( # type: ignore
318
+ adata, groupby, selected_group, groups, show_progressbar, n_jobs, **kwargs
319
+ )
320
+
304
321
  groups = adata.obs[groupby].unique() if groups is None else groups
305
322
  grouping = adata.obs[groupby].copy()
306
323
  df = pd.Series(index=groups, dtype=float)
@@ -329,7 +346,10 @@ class Distance:
329
346
  dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
330
347
  df.loc[group_x] = dist
331
348
  else:
332
- embedding = adata.obsm[self.obsm_key].copy()
349
+ if self.layer_key:
350
+ embedding = adata.layers[self.layer_key]
351
+ else:
352
+ embedding = adata.obsm[self.obsm_key].copy()
333
353
  for group_x in fct(groups):
334
354
  cells_x = embedding[grouping == group_x].copy()
335
355
  group_y = selected_group
@@ -337,7 +357,7 @@ class Distance:
337
357
  dist = 0.0
338
358
  else:
339
359
  cells_y = embedding[grouping == group_y].copy()
340
- dist = self.metric_fct(cells_x, cells_y, **kwargs)
360
+ dist = self(cells_x, cells_y, **kwargs)
341
361
  df.loc[group_x] = dist
342
362
  df.index.name = groupby
343
363
  df.name = f"{self.metric} to {selected_group}"
@@ -471,11 +491,8 @@ class WassersteinDistance(AbstractDistance):
471
491
  return self.solve_ot_problem(geom, **kwargs)
472
492
 
473
493
  def solve_ot_problem(self, geom: Geometry, **kwargs):
474
- # Define a linear problem with that cost structure.
475
494
  ot_prob = LinearProblem(geom)
476
- # Create a Sinkhorn solver
477
495
  solver = Sinkhorn()
478
- # Solve OT problem
479
496
  ot = solver(ot_prob, **kwargs)
480
497
  return ot.reg_ot_cost.item()
481
498
 
@@ -502,7 +519,7 @@ class MeanSquaredDistance(AbstractDistance):
502
519
  self.accepts_precomputed = False
503
520
 
504
521
  def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
505
- return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 0.5
522
+ return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
506
523
 
507
524
  def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
508
525
  raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
@@ -516,7 +533,7 @@ class MeanAbsoluteDistance(AbstractDistance):
516
533
  self.accepts_precomputed = False
517
534
 
518
535
  def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
519
- return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
536
+ return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
520
537
 
521
538
  def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
522
539
  raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
@@ -614,11 +631,12 @@ class R2ScoreDistance(AbstractDistance):
614
631
  raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
615
632
 
616
633
 
617
- class KLDivergence(AbstractDistance):
618
- """Average of KL divergence between gene distributions of two groups
634
+ class SymmetricKLDivergence(AbstractDistance):
635
+ """Average of symmetric KL divergence between gene distributions of two groups
619
636
 
620
637
  Assuming a Gaussian distribution for each gene in each group, calculates
621
- the KL divergence between them and averages over all genes
638
+ the KL divergence between them and averages over all genes. Repeats this in both directions (A to B and B to A) to get a symmetrized distance.
639
+ See https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence#Symmetrised_divergence.
622
640
 
623
641
  """
624
642
 
@@ -632,11 +650,12 @@ class KLDivergence(AbstractDistance):
632
650
  x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
633
651
  y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
634
652
  kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
635
- kl_all.append(kl)
653
+ klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
654
+ kl_all.append(kl + klr)
636
655
  return sum(kl_all) / len(kl_all)
637
656
 
638
657
  def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
639
- raise NotImplementedError("KLDivergence cannot be called on a pairwise distance matrix.")
658
+ raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
640
659
 
641
660
 
642
661
  class TTestDistance(AbstractDistance):
@@ -663,6 +682,23 @@ class TTestDistance(AbstractDistance):
663
682
  raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
664
683
 
665
684
 
685
+ class KSTestDistance(AbstractDistance):
686
+ """Average of two-sided KS test statistic between two groups"""
687
+
688
+ def __init__(self) -> None:
689
+ super().__init__()
690
+ self.accepts_precomputed = False
691
+
692
+ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
693
+ stats = []
694
+ for i in range(X.shape[1]):
695
+ stats.append(abs(kstest(X[:, i], Y[:, i])[0]))
696
+ return sum(stats) / len(stats)
697
+
698
+ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
699
+ raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
700
+
701
+
666
702
  class NBLL(AbstractDistance):
667
703
  """
668
704
  Average of Log likelihood (scalar) of group B cells
@@ -683,16 +719,12 @@ class NBLL(AbstractDistance):
683
719
  if not _is_count_matrix(matrix=X) or not _is_count_matrix(matrix=Y):
684
720
  raise ValueError("NBLL distance only works for raw counts.")
685
721
 
686
- nlls = []
687
- for i in range(X.shape[1]):
688
- x, y = X[:, i], Y[:, i]
689
- nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
690
- mu = np.repeat(np.exp(nb_params[0]), y.shape[0])
691
- theta = np.repeat(1 / nb_params[1], y.shape[0])
692
- if mu[0] == np.nan or theta[0] == np.nan:
693
- raise ValueError("Could not fit a negative binomial distribution to the input data")
694
- # calculate the nll of y
695
- eps = np.repeat(epsilon, y.shape[0])
722
+ @numba.jit(forceobj=True)
723
+ def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
724
+ mu = np.exp(nb_params[0])
725
+ theta = 1 / nb_params[1]
726
+ eps = epsilon
727
+
696
728
  log_theta_mu_eps = np.log(theta + mu + eps)
697
729
  nll = (
698
730
  theta * (np.log(theta + eps) - log_theta_mu_eps)
@@ -701,9 +733,127 @@ class NBLL(AbstractDistance):
701
733
  - gammaln(theta)
702
734
  - gammaln(y + 1)
703
735
  )
704
- nlls.append(nll.mean())
736
+ return nll.mean()
737
+
738
+ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
739
+ try:
740
+ nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
741
+ return _compute_nll(y, nb_params, epsilon)
742
+ except np.linalg.linalg.LinAlgError:
743
+ if x.mean() < 10 and y.mean() < 10:
744
+ return 0.0
745
+ else:
746
+ return np.nan # Use NaN to indicate skipped genes
747
+
748
+ nlls = []
749
+ genes_skipped = 0
750
+
751
+ for i in range(X.shape[1]):
752
+ nll = _process_gene(X[:, i], Y[:, i], epsilon)
753
+ if np.isnan(nll):
754
+ genes_skipped += 1
755
+ else:
756
+ nlls.append(nll)
705
757
 
706
- return -sum(nlls) / len(nlls)
758
+ if genes_skipped > X.shape[1] / 2:
759
+ raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
760
+
761
+ return -np.sum(nlls) / len(nlls)
707
762
 
708
763
  def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
709
764
  raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
765
+
766
+
767
+ def _sample(X, frac=None, n=None):
768
+ """Returns subsample of cells in format (train, test)."""
769
+ if frac and n:
770
+ raise ValueError("Cannot pass both frac and n.")
771
+ if frac:
772
+ n_cells = max(1, int(X.shape[0] * frac))
773
+ elif n:
774
+ n_cells = n
775
+ else:
776
+ raise ValueError("Must pass either `frac` or `n`.")
777
+
778
+ rng = np.random.default_rng()
779
+ sampled_indices = rng.choice(X.shape[0], n_cells, replace=False)
780
+ remaining_indices = np.setdiff1d(np.arange(X.shape[0]), sampled_indices)
781
+ return X[remaining_indices, :], X[sampled_indices, :]
782
+
783
+
784
+ class ClassifierProbaDistance(AbstractDistance):
785
+ """Average of classification probabilities of a binary classifier.
786
+
787
+ Assumes the first condition is control and the second is perturbed.
788
+ Always holds out 20% of the perturbed condition.
789
+ """
790
+
791
+ def __init__(self) -> None:
792
+ super().__init__()
793
+ self.accepts_precomputed = False
794
+
795
+ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
796
+ Y_train, Y_test = _sample(Y, frac=0.2)
797
+ label = ["c"] * X.shape[0] + ["p"] * Y_train.shape[0]
798
+ train = np.concatenate([X, Y_train])
799
+
800
+ reg = LogisticRegression()
801
+ reg.fit(train, label)
802
+ test_labels = reg.predict_proba(Y_test)
803
+ return np.mean(test_labels[:, 1])
804
+
805
+ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
806
+ raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
807
+
808
+
809
+ class ClassifierClassProjection(AbstractDistance):
810
+ """Average of 1-(classification probability of control).
811
+
812
+ Warning: unlike all other distances, this must also take a list of categorical labels the same length as X.
813
+ """
814
+
815
+ def __init__(self) -> None:
816
+ super().__init__()
817
+ self.accepts_precomputed = False
818
+
819
+ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
820
+ raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
821
+
822
+ def onesided_distances(
823
+ self,
824
+ adata: AnnData,
825
+ groupby: str,
826
+ selected_group: str | None = None,
827
+ groups: list[str] | None = None,
828
+ show_progressbar: bool = True,
829
+ n_jobs: int = -1,
830
+ **kwargs,
831
+ ) -> Series:
832
+ """Unlike the parent function, all groups except the selected group are factored into the classifier.
833
+
834
+ Similar to the parent function, the returned dataframe contains only the specified groups.
835
+ """
836
+ groups = adata.obs[groupby].unique() if groups is None else groups
837
+
838
+ X = adata[adata.obs[groupby] != selected_group].X
839
+ labels = adata[adata.obs[groupby] != selected_group].obs[groupby].values
840
+ Y = adata[adata.obs[groupby] == selected_group].X
841
+
842
+ reg = LogisticRegression()
843
+ reg.fit(X, labels)
844
+ test_probas = reg.predict_proba(Y)
845
+
846
+ df = pd.Series(index=groups, dtype=float)
847
+ for group in groups:
848
+ if group == selected_group:
849
+ df.loc[group] = 0
850
+ else:
851
+ class_idx = list(reg.classes_).index(group)
852
+ df.loc[group] = 1 - np.mean(test_probas[:, class_idx])
853
+ df.index.name = groupby
854
+ df.name = f"classifier_cp to {selected_group}"
855
+
856
+ return df
857
+
858
+ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
859
+ raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")