pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

Files changed (53)
  1. pertpy/__init__.py +3 -2
  2. pertpy/data/__init__.py +5 -1
  3. pertpy/data/_dataloader.py +2 -4
  4. pertpy/data/_datasets.py +203 -92
  5. pertpy/metadata/__init__.py +4 -0
  6. pertpy/metadata/_cell_line.py +826 -0
  7. pertpy/metadata/_compound.py +129 -0
  8. pertpy/metadata/_drug.py +242 -0
  9. pertpy/metadata/_look_up.py +582 -0
  10. pertpy/metadata/_metadata.py +73 -0
  11. pertpy/metadata/_moa.py +129 -0
  12. pertpy/plot/__init__.py +1 -9
  13. pertpy/plot/_augur.py +53 -116
  14. pertpy/plot/_coda.py +277 -677
  15. pertpy/plot/_guide_rna.py +17 -35
  16. pertpy/plot/_milopy.py +59 -134
  17. pertpy/plot/_mixscape.py +152 -391
  18. pertpy/preprocessing/_guide_rna.py +88 -4
  19. pertpy/tools/__init__.py +8 -13
  20. pertpy/tools/_augur.py +315 -17
  21. pertpy/tools/_cinemaot.py +143 -4
  22. pertpy/tools/_coda/_base_coda.py +1210 -65
  23. pertpy/tools/_coda/_sccoda.py +50 -21
  24. pertpy/tools/_coda/_tasccoda.py +27 -19
  25. pertpy/tools/_dialogue.py +164 -56
  26. pertpy/tools/_differential_gene_expression.py +240 -14
  27. pertpy/tools/_distances/_distance_tests.py +8 -8
  28. pertpy/tools/_distances/_distances.py +184 -34
  29. pertpy/tools/_enrichment.py +465 -0
  30. pertpy/tools/_milo.py +345 -11
  31. pertpy/tools/_mixscape.py +668 -50
  32. pertpy/tools/_perturbation_space/_clustering.py +5 -1
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
  35. pertpy/tools/_perturbation_space/_simple.py +51 -10
  36. pertpy/tools/_scgen/__init__.py +1 -1
  37. pertpy/tools/_scgen/_scgen.py +701 -0
  38. pertpy/tools/_scgen/_utils.py +1 -3
  39. pertpy/tools/decoupler_LICENSE +674 -0
  40. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
  41. pertpy-0.7.0.dist-info/RECORD +53 -0
  42. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
  43. pertpy/plot/_cinemaot.py +0 -81
  44. pertpy/plot/_dialogue.py +0 -91
  45. pertpy/plot/_scgen.py +0 -337
  46. pertpy/tools/_metadata/__init__.py +0 -0
  47. pertpy/tools/_metadata/_cell_line.py +0 -613
  48. pertpy/tools/_metadata/_look_up.py +0 -342
  49. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  50. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  51. pertpy-0.6.0.dist-info/RECORD +0 -50
  52. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  53. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,16 +5,18 @@ from typing import TYPE_CHECKING, Literal
 import decoupler as dc
 import numpy as np
 import numpy.typing as npt
+import pandas as pd
+from scipy.stats import kendalltau, pearsonr, spearmanr
+from statsmodels.stats.multitest import fdrcorrection
 
 if TYPE_CHECKING:
-    import pandas as pd
     from anndata import AnnData
 
 
 class DifferentialGeneExpression:
     """Support for differential gene expression for scverse."""
 
-    def pseudobulk(
+    def get_pseudobulk(
         self,
         adata: AnnData,
         sample_col: str,
@@ -22,28 +24,44 @@ class DifferentialGeneExpression:
         obs: pd.DataFrame = None,
         layer: str = None,
         use_raw: bool = False,
-        min_prop: float = 0.2,
+        mode: str = "sum",
+        min_cells: int = 10,
         min_counts: int = 1000,
-        min_samples: int = 2,
         dtype: npt.DTypeLike = np.float32,
+        skip_checks: bool = False,
     ) -> AnnData:
-        """Generate Pseudobulk for DE analysis.
+        """Summarize expression profiles across cells per sample and group.
 
-        Wraps decoupler's get_pseudobulk function.
+        Generates summarized expression profiles across cells per sample (e.g. sample id) and group (e.g. cell type) based on the metadata found in `.obs`.
+        To ensure a minimum level of quality control, this function removes samples with too few cells (min_cells) or too few summed gene counts (min_counts);
+        genes can be filtered further with `filter_by_expr` or `filter_by_prop`.
+
+        By default this function expects raw integer counts as input and sums them per sample and group (mode='sum'), but other modes are available.
+
+        This function also produces quality control metrics to assess whether some samples should be filtered out.
+        The number of cells that belong to each sample is stored in `.obs['psbulk_n_cells']`,
+        the total sum of counts per sample in `.obs['psbulk_counts']`, and the proportion of cells that express a given gene in `.layers['psbulk_props']`.
+
+        Wraps decoupler's `get_pseudobulk` function.
         See: https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.get_pseudobulk.html#decoupler.get_pseudobulk
-        for more details
+        for more details.
 
         Args:
             adata: Input AnnData object.
             sample_col: Column of obs from which to extract the sample names.
             groups_col: Column of obs from which to extract the group names.
-            obs: If provided, metadata dataframe.
+            obs: If provided, metadata DataFrame.
             layer: If provided, which layer to use.
-            use_raw: Use raw attribute of adata if present.
-            min_prop: Minimum proportion of cells with non-zero values.
-            min_counts: Minimum number of cells per sample.
-            min_samples: Minimum number of samples per feature.
+            use_raw: Use the raw attribute of the AnnData object if present.
+            mode: How to aggregate the pseudobulk.
+                Available options are 'sum', 'mean' or 'median'. Also accepts callback functions for custom aggregations.
+                Additionally, a dictionary of different callback functions can be provided, each one stored in a different resulting `.layer`.
+                In this case, the result of the first callback function of the dictionary is stored in `.X` by default.
+            min_cells: Filter to remove samples by a minimum number of cells in a sample-group pair.
+            min_counts: Filter to remove samples by a minimum number of summed counts in a sample-group pair.
             dtype: Type of float used.
+            skip_checks: Whether to skip input checks.
+                Set to True when working with positive and negative data, or when counts are not integers.
 
         Returns:
             A new AnnData object with unnormalized pseudobulk profiles per sample and group.
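
A minimal usage sketch of the signature above, assuming the class is exposed as `pt.tl.DifferentialGeneExpression` and that the input AnnData carries raw integer counts; the `.obs` column names `"sample"` and `"cell_type"` and the input file are placeholders:

```python
import pertpy as pt
import scanpy as sc

# Hypothetical input: a single-cell AnnData with raw integer counts in .X
# and per-cell metadata in .obs (column names below are placeholders).
adata = sc.read_h5ad("my_dataset.h5ad")

dge = pt.tl.DifferentialGeneExpression()

# Sum counts per sample/cell-type pair; sample-group pairs with fewer than
# 10 cells or fewer than 1000 summed counts are removed.
pdata = dge.get_pseudobulk(
    adata,
    sample_col="sample",
    groups_col="cell_type",
    mode="sum",
    min_cells=10,
    min_counts=1000,
)

# Inspect the QC metrics described in the docstring above.
print(pdata.obs[["psbulk_n_cells", "psbulk_counts"]].head())
```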
@@ -55,14 +73,222 @@
             obs=obs,
             layer=layer,
             use_raw=use_raw,
-            min_prop=min_prop,
+            mode=mode,
             min_counts=min_counts,
-            min_smpls=min_samples,
             dtype=dtype,
+            min_cells=min_cells,
+            skip_checks=skip_checks,
         )
 
         return pseudobulk_adata
 
+    def filter_by_expr(
+        self,
+        adata: AnnData,
+        obs: pd.DataFrame = None,
+        group: str | None = None,
+        lib_size: int | float | None = None,
+        min_count: int = 10,
+        min_total_count: int = 15,
+        large_n: int = 10,
+        min_prop: float = 0.7,
+    ) -> AnnData:
+        """Filter AnnData to keep the genes whose counts are large enough to be retained in a statistical analysis.
+
+        Wraps decoupler's `filter_by_expr` function.
+        See https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
+        for more details.
+
+        Args:
+            adata: AnnData obtained after running `get_pseudobulk`.
+            obs: Metadata DataFrame, only needed if `adata` is not an `AnnData`.
+            group: Name of the `.obs` column to group by. If None, assumes all samples belong to one group.
+            lib_size: Library size. Defaults to the sum of reads per sample if None.
+            min_count: Minimum count required per gene for at least some samples.
+            min_total_count: Minimum total count required per gene across all samples.
+            large_n: Number of samples per group considered to be "large".
+            min_prop: Minimum proportion of samples in the smallest group that express the gene.
+
+        Returns:
+            AnnData with only the genes that are to be kept.
+        """
+        genes = dc.filter_by_expr(
+            adata=adata,
+            obs=obs,
+            group=group,
+            lib_size=lib_size,
+            min_count=min_count,
+            min_total_count=min_total_count,
+            large_n=large_n,
+            min_prop=min_prop,
+        )
+        filtered_adata = adata[:, genes].copy()
+
+        return filtered_adata
+
+    def filter_by_prop(self, adata: AnnData, min_prop: float = 0.2, min_samples: int = 2) -> AnnData:
+        """Determine which genes are expressed in a sufficient proportion of cells across samples.
+
+        This function selects genes that are sufficiently expressed across cells in each sample,
+        and requires that this condition is met across a minimum number of samples.
+
+        Args:
+            adata: AnnData obtained after running `get_pseudobulk`. It requires `.layers['psbulk_props']`.
+            min_prop: Minimum proportion of cells that express a gene in a sample.
+            min_samples: Minimum number of samples in which the proportion of expressing cells is at least `min_prop`.
+
+        Returns:
+            AnnData with only the genes that are to be kept.
+        """
+        genes = dc.filter_by_prop(adata=adata, min_prop=min_prop, min_smpls=min_samples)
+        filtered_adata = adata[:, genes].copy()
+
+        return filtered_adata
+
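
Continuing the sketch from above, the two filters can be chained on the pseudobulk object; `"condition"` is an assumed `.obs` column:

```python
# edgeR-style filter: keep genes with sufficiently large counts,
# evaluated within the groups of the assumed "condition" column.
pdata = dge.filter_by_expr(pdata, group="condition", min_count=10, min_total_count=15)

# Proportion filter: keep genes expressed in at least 20% of cells in at
# least 2 samples, based on .layers["psbulk_props"] from get_pseudobulk.
pdata = dge.filter_by_prop(pdata, min_prop=0.2, min_samples=2)
```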
+    def calculate_correlation(
+        self,
+        de_res_1: pd.DataFrame,
+        de_res_2: pd.DataFrame,
+        method: Literal["spearman", "pearson", "kendall-tau"] = "spearman",
+    ) -> pd.DataFrame:
+        """Calculate the correlation coefficient of the 'pvals_adj' and 'logfoldchanges' columns of two DE results.
+
+        Args:
+            de_res_1: A DataFrame with DE result columns.
+            de_res_2: Another DataFrame with the same DE result columns.
+            method: The correlation method to apply. One of `spearman`, `pearson`, `kendall-tau`.
+                Defaults to `spearman`.
+
+        Returns:
+            A DataFrame with the correlation coefficients for 'pvals_adj' and 'logfoldchanges'.
+        """
+        columns_of_interest = ["pvals_adj", "logfoldchanges"]
+        correlation_data = {}
+        for col in columns_of_interest:
+            match method:
+                case "spearman":
+                    correlation, _ = spearmanr(de_res_1[col], de_res_2[col])
+                case "pearson":
+                    correlation, _ = pearsonr(de_res_1[col], de_res_2[col])
+                case "kendall-tau":
+                    correlation, _ = kendalltau(de_res_1[col], de_res_2[col])
+                case _:
+                    raise ValueError("Unknown correlation method.")
+            correlation_data[col] = correlation
+
+        return pd.DataFrame([correlation_data], columns=columns_of_interest)
+
+    def calculate_jaccard_index(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame, threshold: float = 0.05) -> float:
+        """Calculate the Jaccard index for the sets of significant genes/features based on a p-value threshold.
+
+        Args:
+            de_res_1: A DataFrame with DE result columns, including 'pvals'.
+            de_res_2: Another DataFrame with the same DE result columns.
+            threshold: P-value threshold for determining significance. Defaults to 0.05.
+
+        Returns:
+            The Jaccard index.
+        """
+        significant_set_1 = set(de_res_1[de_res_1["pvals"] <= threshold].index)
+        significant_set_2 = set(de_res_2[de_res_2["pvals"] <= threshold].index)
+
+        intersection = significant_set_1.intersection(significant_set_2)
+        union = significant_set_1.union(significant_set_2)
+
+        return len(intersection) / len(union) if union else 0
+
+    def calculate_cohens_d(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame) -> float:
+        """Calculate Cohen's d for the logfoldchanges of two DE results.
+
+        Args:
+            de_res_1: A DataFrame with DE result columns, including 'logfoldchanges'.
+            de_res_2: Another DataFrame with the same DE result columns.
+
+        Returns:
+            Cohen's d, the standardized difference between the mean log fold changes of the two results.
+        """
+        means_1 = de_res_1["logfoldchanges"].mean()
+        means_2 = de_res_2["logfoldchanges"].mean()
+        sd_1 = de_res_1["logfoldchanges"].std()
+        sd_2 = de_res_2["logfoldchanges"].std()
+
+        pooled_sd = np.sqrt((sd_1**2 + sd_2**2) / 2)
+        cohens_d = (means_1 - means_2) / pooled_sd
+
+        return cohens_d
+
+ def de_res_to_anndata(
221
+ self,
222
+ adata: AnnData,
223
+ de_res: pd.DataFrame,
224
+ *,
225
+ groupby: str,
226
+ gene_id_col: str = "gene_symbols",
227
+ score_col: str = "scores",
228
+ pval_col: str = "pvals",
229
+ pval_adj_col: str | None = "pvals_adj",
230
+ lfc_col: str = "logfoldchanges",
231
+ key_added: str = "rank_genes_groups",
232
+ ) -> None:
233
+ """Add tabular differential expression result to AnnData as if it was produced by `scanpy.tl.rank_genes_groups`.
234
+
235
+ Args:
236
+ adata:
237
+ Annotated data matrix
238
+ de_res:
239
+ Tablular de result
240
+ groupby:
241
+ Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
242
+ gene_id_col:
243
+ Column in `de_res` that holds the gene identifiers
244
+ score_col:
245
+ Column in `de_res` that holds the score (results will be ordered by score).
246
+ pval_col:
247
+ Column in `de_res` that holds the unadjusted pvalue
248
+ pval_adj_col:
249
+ Column in `de_res` that holds the adjusted pvalue.
250
+ If not specified, the unadjusted pvalues will be FDR-adjusted.
251
+ lfc_col:
252
+ Column in `de_res` that holds the log fold change
253
+ key_added:
254
+ Key under which the results will be stored in `adata.uns`
255
+ """
256
+ if groupby not in adata.obs.columns or groupby not in de_res.columns:
257
+ raise ValueError("groupby column must exist in both adata and de_res.")
258
+ res_dict = {
259
+ "params": {
260
+ "groupby": groupby,
261
+ "reference": "rest",
262
+ "method": "other",
263
+ "use_raw": True,
264
+ "layer": None,
265
+ "corr_method": "other",
266
+ },
267
+ "names": [],
268
+ "scores": [],
269
+ "pvals": [],
270
+ "pvals_adj": [],
271
+ "logfoldchanges": [],
272
+ }
273
+ df_groupby = de_res.groupby(groupby)
274
+ for _, tmp_df in df_groupby:
275
+ tmp_df = tmp_df.sort_values(score_col, ascending=False)
276
+ res_dict["names"].append(tmp_df[gene_id_col].values) # type: ignore
277
+ res_dict["scores"].append(tmp_df[score_col].values) # type: ignore
278
+ res_dict["pvals"].append(tmp_df[pval_col].values) # type: ignore
279
+ if pval_adj_col is not None:
280
+ res_dict["pvals_adj"].append(tmp_df[pval_adj_col].values) # type: ignore
281
+ else:
282
+ res_dict["pvals_adj"].append(fdrcorrection(tmp_df[pval_col].values)[1]) # type: ignore
283
+ res_dict["logfoldchanges"].append(tmp_df[lfc_col].values) # type: ignore
284
+
285
+ for key in ["names", "scores", "pvals", "pvals_adj", "logfoldchanges"]:
286
+ res_dict[key] = pd.DataFrame(
287
+ np.vstack(res_dict[key]).T,
288
+ columns=list(df_groupby.groups.keys()),
289
+ ).to_records(index=False, column_dtypes="O")
290
+ adata.uns[key_added] = res_dict
291
+
66
292
  def de_analysis(
67
293
  self,
68
294
  adata: AnnData,
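
`de_res_to_anndata` makes externally computed results consumable by scanpy's `rank_genes_groups` tooling. A sketch with a hypothetical per-group result table, reusing `adata` and `dge` from the earlier sketch; the `"cell_type"` column is assumed to exist in `adata.obs`:

```python
import pandas as pd

# Hypothetical tabular DE result: one row per (group, gene) pair, with
# the same genes present in every group.
de_res = pd.DataFrame(
    {
        "cell_type": ["B", "B", "T", "T"],
        "gene_symbols": ["GENE1", "GENE2", "GENE1", "GENE2"],
        "scores": [4.2, -1.1, 0.5, 3.3],
        "pvals": [0.001, 0.30, 0.60, 0.004],
        "pvals_adj": [0.004, 0.50, 0.80, 0.010],
        "logfoldchanges": [2.0, -0.4, 0.1, 1.7],
    }
)

# Stores the result under adata.uns["rank_genes_groups"], so standard
# scanpy helpers that read that slot can consume it afterwards.
dge.de_res_to_anndata(adata, de_res, groupby="cell_type")
```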
@@ -37,8 +37,8 @@ class DistanceTest:
     Examples:
         >>> import pertpy as pt
         >>> adata = pt.dt.distance_example_data()
-        >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
-        >>> tab = distance_test(adata, groupby='perturbation', contrast='control')
+        >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
+        >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
     """
 
     def __init__(
@@ -100,8 +100,8 @@ class DistanceTest:
         Examples:
             >>> import pertpy as pt
             >>> adata = pt.dt.distance_example_data()
-            >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
-            >>> tab = distance_test(adata, groupby='perturbation', contrast='control')
+            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
+            >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
         """
         if self.distance.metric_fct.accepts_precomputed:
             # Much faster if the metric can be called on the precomputed
@@ -134,8 +134,8 @@ class DistanceTest:
         Examples:
             >>> import pertpy as pt
             >>> adata = pt.dt.distance_example_data()
-            >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
-            >>> test_results = distance_test.test_xy(adata, groupby='perturbation', contrast='control')
+            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
+            >>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
         """
         groups = adata.obs[groupby].unique()
         if contrast not in groups:
@@ -226,8 +226,8 @@ class DistanceTest:
         Examples:
             >>> import pertpy as pt
             >>> adata = pt.dt.distance_example_data()
-            >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
-            >>> test_results = distance_test.test_precomputed(adata, groupby='perturbation', contrast='control')
+            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
+            >>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
         """
         if not self.distance.metric_fct.accepts_precomputed:
             raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")