pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. pertpy/__init__.py +3 -2
  2. pertpy/data/__init__.py +5 -1
  3. pertpy/data/_dataloader.py +2 -4
  4. pertpy/data/_datasets.py +203 -92
  5. pertpy/metadata/__init__.py +4 -0
  6. pertpy/metadata/_cell_line.py +826 -0
  7. pertpy/metadata/_compound.py +129 -0
  8. pertpy/metadata/_drug.py +242 -0
  9. pertpy/metadata/_look_up.py +582 -0
  10. pertpy/metadata/_metadata.py +73 -0
  11. pertpy/metadata/_moa.py +129 -0
  12. pertpy/plot/__init__.py +1 -9
  13. pertpy/plot/_augur.py +53 -116
  14. pertpy/plot/_coda.py +277 -677
  15. pertpy/plot/_guide_rna.py +17 -35
  16. pertpy/plot/_milopy.py +59 -134
  17. pertpy/plot/_mixscape.py +152 -391
  18. pertpy/preprocessing/_guide_rna.py +88 -4
  19. pertpy/tools/__init__.py +8 -13
  20. pertpy/tools/_augur.py +315 -17
  21. pertpy/tools/_cinemaot.py +143 -4
  22. pertpy/tools/_coda/_base_coda.py +1210 -65
  23. pertpy/tools/_coda/_sccoda.py +50 -21
  24. pertpy/tools/_coda/_tasccoda.py +27 -19
  25. pertpy/tools/_dialogue.py +164 -56
  26. pertpy/tools/_differential_gene_expression.py +240 -14
  27. pertpy/tools/_distances/_distance_tests.py +8 -8
  28. pertpy/tools/_distances/_distances.py +184 -34
  29. pertpy/tools/_enrichment.py +465 -0
  30. pertpy/tools/_milo.py +345 -11
  31. pertpy/tools/_mixscape.py +668 -50
  32. pertpy/tools/_perturbation_space/_clustering.py +5 -1
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
  35. pertpy/tools/_perturbation_space/_simple.py +51 -10
  36. pertpy/tools/_scgen/__init__.py +1 -1
  37. pertpy/tools/_scgen/_scgen.py +701 -0
  38. pertpy/tools/_scgen/_utils.py +1 -3
  39. pertpy/tools/decoupler_LICENSE +674 -0
  40. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
  41. pertpy-0.7.0.dist-info/RECORD +53 -0
  42. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
  43. pertpy/plot/_cinemaot.py +0 -81
  44. pertpy/plot/_dialogue.py +0 -91
  45. pertpy/plot/_scgen.py +0 -337
  46. pertpy/tools/_metadata/__init__.py +0 -0
  47. pertpy/tools/_metadata/_cell_line.py +0 -613
  48. pertpy/tools/_metadata/_look_up.py +0 -342
  49. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  50. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  51. pertpy-0.6.0.dist-info/RECORD +0 -50
  52. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  53. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,16 +5,18 @@ from typing import TYPE_CHECKING, Literal
5
5
  import decoupler as dc
6
6
  import numpy as np
7
7
  import numpy.typing as npt
8
+ import pandas as pd
9
+ from scipy.stats import kendalltau, pearsonr, spearmanr
10
+ from statsmodels.stats.multitest import fdrcorrection
8
11
 
9
12
  if TYPE_CHECKING:
10
- import pandas as pd
11
13
  from anndata import AnnData
12
14
 
13
15
 
14
16
  class DifferentialGeneExpression:
15
17
  """Support for differential gene expression for scverse."""
16
18
 
17
- def pseudobulk(
19
+ def get_pseudobulk(
18
20
  self,
19
21
  adata: AnnData,
20
22
  sample_col: str,
@@ -22,28 +24,44 @@ class DifferentialGeneExpression:
22
24
  obs: pd.DataFrame = None,
23
25
  layer: str = None,
24
26
  use_raw: bool = False,
25
- min_prop: float = 0.2,
27
+ mode: str = "sum",
28
+ min_cells=10,
26
29
  min_counts: int = 1000,
27
- min_samples: int = 2,
28
30
  dtype: npt.DTypeLike = np.float32,
31
+ skip_checks: bool = False,
29
32
  ) -> AnnData:
30
- """Generate Pseudobulk for DE analysis.
33
+ """Summarizes expression profiles across cells per sample and group.
31
34
 
32
- Wraps decoupler's get_pseudobulk function.
35
+ Generates summarized expression profiles across cells per sample (e.g. sample id) and group (e.g. cell type) based on the metadata found in .obs.
36
+ To ensure a minimum quality control, this function removes genes that are not expressed enough across cells (min_prop) or samples (min_smpls),
37
+ and samples with not enough cells (min_cells) or gene counts (min_counts).
38
+
39
+ By default this function expects raw integer counts as input and sums them per sample and group (mode='sum'), but other modes are available.
40
+
41
+ This function produces some quality control metrics to assess whether it is necessary to filter some samples.
42
+ The number of cells that belong to each sample is stored in `.obs['psbulk_n_cells']`,
43
+ the total sum of counts per sample in `.obs['psbulk_counts']`, and the proportion of cells that express a given gene in `.layers['psbulk_props']`.
44
+
45
+ Wraps decoupler's `get_pseudobulk` function.
33
46
  See: https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.get_pseudobulk.html#decoupler.get_pseudobulk
34
- for more details
47
+ for more details.
35
48
 
36
49
  Args:
37
50
  adata: Input AnnData object.
38
51
  sample_col: Column of obs where to extract the samples names.
39
52
  groups_col: Column of obs where to extract the groups names.
40
- obs: If provided, metadata dataframe.
53
+ obs: If provided, metadata DataFrame.
41
54
  layer: If provided, which layer to use.
42
- use_raw: Use raw attribute of adata if present.
43
- min_prop: Minimum proportion of cells with non-zero values.
44
- min_counts: Minimum number of cells per sample.
45
- min_samples: Minimum number of samples per feature.
55
+ use_raw: Use raw attribute of the AnnData object if present.
56
+ mode: How to perform the pseudobulk.
57
+ Available options are 'sum', 'mean' or 'median'. Also accepts callback functions to perform custom aggregations.
58
+ Additionally, it is also possible to provide a dictionary of different callback functions, each one stored in a different resulting `.layer`.
59
+ In this case, the result of the first callback function of the dictionary is stored in .X by default.
60
+ min_cells: Filter to remove samples by a minimum number of cells in a sample-group pair.
61
+ min_counts: Filter to remove samples by a minimum number of summed counts in a sample-group pair.
46
62
  dtype: Type of float used.
63
+ skip_checks: Whether to skip input checks.
64
+ Set to True when working with positive and negative data, or when counts are not integers.
47
65
 
48
66
  Returns:
49
67
  Returns new AnnData object with unormalized pseudobulk profiles per sample and group.
@@ -55,14 +73,222 @@ class DifferentialGeneExpression:
55
73
  obs=obs,
56
74
  layer=layer,
57
75
  use_raw=use_raw,
58
- min_prop=min_prop,
76
+ mode=mode,
59
77
  min_counts=min_counts,
60
- min_smpls=min_samples,
61
78
  dtype=dtype,
79
+ min_cells=min_cells,
80
+ skip_checks=skip_checks,
62
81
  )
63
82
 
64
83
  return pseudobulk_adata
65
84
 
85
def filter_by_expr(
    self,
    adata: AnnData,
    obs: pd.DataFrame = None,
    group: str | None = None,
    lib_size: int | float | None = None,
    min_count: int = 10,
    min_total_count: int = 15,
    large_n: int = 10,
    min_prop: float = 0.7,
) -> AnnData:
    """Keep only the genes whose counts are large enough for a statistical analysis.

    Delegates the gene selection to decoupler's `filter_by_expr` and returns a copy of
    `adata` subset to the selected genes.
    See https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
    for more details.

    Args:
        adata: AnnData obtained after running `get_pseudobulk`.
        obs: Metadata dataframe, only needed if `adata` is not an `AnnData`.
        group: Name of the `.obs` column to group by. If None, assumes all samples belong to one group.
        lib_size: Library size. Defaults to the sum of reads per sample if None.
        min_count: Minimum count required per gene for at least some samples.
        min_total_count: Minimum total count required per gene across all samples.
        large_n: Number of samples per group considered to be "large".
        min_prop: Minimum proportion of samples in the smallest group that express the gene.

    Returns:
        AnnData with only the genes that are to be kept.
    """
    # Numeric thresholds are forwarded to decoupler unchanged.
    thresholds = {
        "min_count": min_count,
        "min_total_count": min_total_count,
        "large_n": large_n,
        "min_prop": min_prop,
    }
    kept_genes = dc.filter_by_expr(adata=adata, obs=obs, group=group, lib_size=lib_size, **thresholds)

    # Copy so the result is an independent object rather than a view of the input.
    return adata[:, kept_genes].copy()
128
+
129
def filter_by_prop(self, adata: AnnData, min_prop: float = 0.2, min_samples: int = 2) -> AnnData:
    """Keep only the genes expressed in a sufficient proportion of cells across samples.

    A gene is retained when the proportion of cells expressing it reaches `min_prop`
    in at least `min_samples` samples. The selection itself is delegated to decoupler's
    `filter_by_prop`.

    Args:
        adata: AnnData obtained after running `get_pseudobulk`. It requires `.layers['psbulk_props']`.
        min_prop: Minimum proportion of cells that express a gene in a sample.
        min_samples: Minimum number of samples with bigger or equal proportion of cells with expression than `min_prop`.

    Returns:
        AnnData with only the genes that are to be kept.
    """
    # decoupler names the sample threshold `min_smpls`.
    kept_genes = dc.filter_by_prop(adata=adata, min_prop=min_prop, min_smpls=min_samples)

    # Copy so the result is an independent object rather than a view of the input.
    return adata[:, kept_genes].copy()
147
+
148
def calculate_correlation(
    self,
    de_res_1: pd.DataFrame,
    de_res_2: pd.DataFrame,
    method: Literal["spearman", "pearson", "kendall-tau"] = "spearman",
) -> pd.DataFrame:
    """Correlate the 'pvals_adj' and 'logfoldchanges' columns of two DE results.

    The two columns are compared element by element in the order in which the rows
    appear in each DataFrame.

    Args:
        de_res_1: A DataFrame with DE result columns.
        de_res_2: Another DataFrame with the same DE result columns.
        method: The correlation method to apply. One of `spearman`, `pearson`, `kendall-tau`.
                Defaults to `spearman`.

    Returns:
        A single-row DataFrame holding the correlation coefficient of the chosen method
        for 'pvals_adj' and 'logfoldchanges'.

    Raises:
        ValueError: If `method` is not one of the supported correlation methods.
    """
    # Resolve the scipy routine once, before touching any column.
    if method == "spearman":
        correlate = spearmanr
    elif method == "pearson":
        correlate = pearsonr
    elif method == "kendall-tau":
        correlate = kendalltau
    else:
        raise ValueError("Unknown correlation method.")

    compared_columns = ["pvals_adj", "logfoldchanges"]
    coefficients = {}
    for column in compared_columns:
        # Each scipy routine returns (statistic, p-value); only the statistic is kept.
        statistic, _ = correlate(de_res_1[column], de_res_2[column])
        coefficients[column] = statistic

    return pd.DataFrame([coefficients], columns=compared_columns)
180
+
181
def calculate_jaccard_index(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame, threshold: float = 0.05) -> float:
    """Calculate the Jaccard index of the significant genes/features of two DE results.

    A feature counts as significant when its value in the 'pvals' column is less than
    or equal to `threshold`. Features are identified by the DataFrame index.

    Args:
        de_res_1: A DataFrame with DE result columns, including 'pvals'.
        de_res_2: Another DataFrame with the same DE result columns.
        threshold: A threshold for determining significant expression (default is 0.05).

    Returns:
        The Jaccard index (size of the intersection divided by size of the union).
        Returns 0.0 when neither result contains a significant feature.
    """
    significant_1 = set(de_res_1.loc[de_res_1["pvals"] <= threshold].index)
    significant_2 = set(de_res_2.loc[de_res_2["pvals"] <= threshold].index)

    combined = significant_1 | significant_2
    if not combined:
        # Avoid 0/0 and keep the return type a float, as annotated.
        return 0.0

    return len(significant_1 & significant_2) / len(combined)
199
+
200
def calculate_cohens_d(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame) -> float:
    """Calculate Cohen's D between the log fold changes of two DE results.

    The 'logfoldchanges' column of each result is reduced to its mean and standard
    deviation (pandas' default `.std()`), so the outcome is one effect size for the
    whole pair of results, not one value per gene.

    Args:
        de_res_1: A DataFrame with DE result columns, including 'logfoldchanges'.
        de_res_2: Another DataFrame with the same DE result columns.

    Returns:
        A single scalar: the difference of the two means divided by the pooled
        standard deviation `sqrt((sd_1**2 + sd_2**2) / 2)`.
        No guard is applied when the pooled standard deviation is zero.
    """
    lfc_1 = de_res_1["logfoldchanges"]
    lfc_2 = de_res_2["logfoldchanges"]

    # Pooled SD as the root mean square of the two standard deviations.
    pooled_sd = np.sqrt((lfc_1.std() ** 2 + lfc_2.std() ** 2) / 2)

    return (lfc_1.mean() - lfc_2.mean()) / pooled_sd
219
+
220
def de_res_to_anndata(
    self,
    adata: AnnData,
    de_res: pd.DataFrame,
    *,
    groupby: str,
    gene_id_col: str = "gene_symbols",
    score_col: str = "scores",
    pval_col: str = "pvals",
    pval_adj_col: str | None = "pvals_adj",
    lfc_col: str = "logfoldchanges",
    key_added: str = "rank_genes_groups",
) -> None:
    """Add tabular differential expression result to AnnData as if it was produced by `scanpy.tl.rank_genes_groups`.

    The result is written to `adata.uns[key_added]`; nothing is returned. Within each
    group the rows are ordered by descending score. Every group must contribute the
    same number of rows because the per-group values are stacked into one record
    array per statistic.

    Args:
        adata:
            Annotated data matrix
        de_res:
            Tabular de result
        groupby:
            Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
        gene_id_col:
            Column in `de_res` that holds the gene identifiers
        score_col:
            Column in `de_res` that holds the score (results will be ordered by score).
        pval_col:
            Column in `de_res` that holds the unadjusted pvalue
        pval_adj_col:
            Column in `de_res` that holds the adjusted pvalue.
            If None, the unadjusted pvalues will be FDR-adjusted per group.
        lfc_col:
            Column in `de_res` that holds the log fold change
        key_added:
            Key under which the results will be stored in `adata.uns`

    Raises:
        ValueError: If `groupby` is missing from `adata.obs` or `de_res`, if `de_res`
            yields no group, or if the groups do not all have the same number of rows.
    """
    if groupby not in adata.obs.columns or groupby not in de_res.columns:
        raise ValueError("groupby column must exist in both adata and de_res.")

    stat_keys = ("names", "scores", "pvals", "pvals_adj", "logfoldchanges")
    per_group: dict[str, list] = {key: [] for key in stat_keys}
    group_labels = []

    # `observed=True` skips unobserved categories of a categorical column; labels are
    # collected while iterating so they always line up with the stacked values.
    for label, group_df in de_res.groupby(groupby, observed=True):
        ranked = group_df.sort_values(score_col, ascending=False)
        group_labels.append(label)
        per_group["names"].append(ranked[gene_id_col].values)
        per_group["scores"].append(ranked[score_col].values)
        per_group["pvals"].append(ranked[pval_col].values)
        if pval_adj_col is not None:
            per_group["pvals_adj"].append(ranked[pval_adj_col].values)
        else:
            # fdrcorrection returns (rejected, corrected p-values); keep the latter.
            per_group["pvals_adj"].append(fdrcorrection(ranked[pval_col].values)[1])
        per_group["logfoldchanges"].append(ranked[lfc_col].values)

    # Stacking needs at least one group and a rectangular layout; fail with a clear message.
    if len({len(values) for values in per_group["names"]}) != 1:
        raise ValueError("de_res must contain at least one group and all groups must have the same number of rows.")

    res_dict = {
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "other",
            "use_raw": True,
            "layer": None,
            "corr_method": "other",
        },
    }
    for key in stat_keys:
        # One field per group, one row per rank position, stored with object dtype.
        res_dict[key] = pd.DataFrame(
            np.vstack(per_group[key]).T,
            columns=group_labels,
        ).to_records(index=False, column_dtypes="O")
    adata.uns[key_added] = res_dict
291
+
66
292
  def de_analysis(
67
293
  self,
68
294
  adata: AnnData,
@@ -37,8 +37,8 @@ class DistanceTest:
37
37
  Examples:
38
38
  >>> import pertpy as pt
39
39
  >>> adata = pt.dt.distance_example_data()
40
- >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
41
- >>> tab = distance_test(adata, groupby='perturbation', contrast='control')
40
+ >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
41
+ >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
42
42
  """
43
43
 
44
44
  def __init__(
@@ -100,8 +100,8 @@ class DistanceTest:
100
100
  Examples:
101
101
  >>> import pertpy as pt
102
102
  >>> adata = pt.dt.distance_example_data()
103
- >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
104
- >>> tab = distance_test(adata, groupby='perturbation', contrast='control')
103
+ >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
104
+ >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
105
105
  """
106
106
  if self.distance.metric_fct.accepts_precomputed:
107
107
  # Much faster if the metric can be called on the precomputed
@@ -134,8 +134,8 @@ class DistanceTest:
134
134
  Examples:
135
135
  >>> import pertpy as pt
136
136
  >>> adata = pt.dt.distance_example_data()
137
- >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
138
- >>> test_results = distance_test.test_xy(adata, groupby='perturbation', contrast='control')
137
+ >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
138
+ >>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
139
139
  """
140
140
  groups = adata.obs[groupby].unique()
141
141
  if contrast not in groups:
@@ -226,8 +226,8 @@ class DistanceTest:
226
226
  Examples:
227
227
  >>> import pertpy as pt
228
228
  >>> adata = pt.dt.distance_example_data()
229
- >>> distance_test = pt.tl.DistanceTest('edistance', n_perms=1000)
230
- >>> test_results = distance_test.test_precomputed(adata, groupby='perturbation', contrast='control')
229
+ >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
230
+ >>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
231
231
  """
232
232
  if not self.distance.metric_fct.accepts_precomputed:
233
233
  raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")