sclab 0.1.7__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. sclab/__init__.py +3 -1
  2. sclab/_io.py +83 -12
  3. sclab/_methods_registry.py +65 -0
  4. sclab/_sclab.py +241 -21
  5. sclab/dataset/_dataset.py +4 -6
  6. sclab/dataset/processor/_processor.py +41 -19
  7. sclab/dataset/processor/_results_panel.py +94 -0
  8. sclab/dataset/processor/step/_processor_step_base.py +12 -6
  9. sclab/examples/processor_steps/__init__.py +8 -0
  10. sclab/examples/processor_steps/_cluster.py +2 -2
  11. sclab/examples/processor_steps/_differential_expression.py +329 -0
  12. sclab/examples/processor_steps/_doublet_detection.py +68 -0
  13. sclab/examples/processor_steps/_gene_expression.py +125 -0
  14. sclab/examples/processor_steps/_integration.py +116 -0
  15. sclab/examples/processor_steps/_neighbors.py +26 -6
  16. sclab/examples/processor_steps/_pca.py +13 -8
  17. sclab/examples/processor_steps/_preprocess.py +52 -25
  18. sclab/examples/processor_steps/_qc.py +24 -8
  19. sclab/examples/processor_steps/_umap.py +2 -2
  20. sclab/gui/__init__.py +0 -0
  21. sclab/gui/components/__init__.py +7 -0
  22. sclab/gui/components/_guided_pseudotime.py +482 -0
  23. sclab/gui/components/_transfer_metadata.py +186 -0
  24. sclab/methods/__init__.py +50 -0
  25. sclab/preprocess/__init__.py +26 -0
  26. sclab/preprocess/_cca.py +176 -0
  27. sclab/preprocess/_cca_integrate.py +109 -0
  28. sclab/preprocess/_filter_obs.py +42 -0
  29. sclab/preprocess/_harmony.py +421 -0
  30. sclab/preprocess/_harmony_integrate.py +53 -0
  31. sclab/preprocess/_normalize_weighted.py +65 -0
  32. sclab/preprocess/_pca.py +51 -0
  33. sclab/preprocess/_preprocess.py +155 -0
  34. sclab/preprocess/_qc.py +38 -0
  35. sclab/preprocess/_rpca.py +116 -0
  36. sclab/preprocess/_subset.py +208 -0
  37. sclab/preprocess/_transfer_metadata.py +196 -0
  38. sclab/preprocess/_transform.py +82 -0
  39. sclab/preprocess/_utils.py +96 -0
  40. sclab/scanpy/__init__.py +0 -0
  41. sclab/scanpy/_compat.py +92 -0
  42. sclab/scanpy/_settings.py +526 -0
  43. sclab/scanpy/logging.py +290 -0
  44. sclab/scanpy/plotting/__init__.py +0 -0
  45. sclab/scanpy/plotting/_rcmod.py +73 -0
  46. sclab/scanpy/plotting/palettes.py +221 -0
  47. sclab/scanpy/readwrite.py +1108 -0
  48. sclab/tools/__init__.py +0 -0
  49. sclab/tools/cellflow/__init__.py +0 -0
  50. sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
  51. sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
  52. sclab/tools/cellflow/pseudotime/__init__.py +0 -0
  53. sclab/tools/cellflow/pseudotime/_pseudotime.py +336 -0
  54. sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
  55. sclab/tools/cellflow/utils/__init__.py +0 -0
  56. sclab/tools/cellflow/utils/density_nd.py +215 -0
  57. sclab/tools/cellflow/utils/interpolate.py +334 -0
  58. sclab/tools/cellflow/utils/periodic_genes.py +106 -0
  59. sclab/tools/cellflow/utils/smoothen.py +124 -0
  60. sclab/tools/cellflow/utils/times.py +55 -0
  61. sclab/tools/differential_expression/__init__.py +7 -0
  62. sclab/tools/differential_expression/_pseudobulk_edger.py +309 -0
  63. sclab/tools/differential_expression/_pseudobulk_helpers.py +290 -0
  64. sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
  65. sclab/tools/doublet_detection/__init__.py +5 -0
  66. sclab/tools/doublet_detection/_scrublet.py +64 -0
  67. sclab/tools/embedding/__init__.py +0 -0
  68. sclab/tools/imputation/__init__.py +0 -0
  69. sclab/tools/imputation/_alra.py +135 -0
  70. sclab/tools/labeling/__init__.py +6 -0
  71. sclab/tools/labeling/sctype.py +233 -0
  72. sclab/tools/utils/__init__.py +5 -0
  73. sclab/tools/utils/_aggregate_and_filter.py +290 -0
  74. sclab/utils/__init__.py +5 -0
  75. sclab/utils/_write_excel.py +510 -0
  76. {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/METADATA +29 -12
  77. sclab-0.3.4.dist-info/RECORD +93 -0
  78. {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/WHEEL +1 -1
  79. sclab-0.3.4.dist-info/licenses/LICENSE +29 -0
  80. sclab-0.1.7.dist-info/RECORD +0 -30
@@ -0,0 +1,5 @@
1
+ from ._aggregate_and_filter import aggregate_and_filter
2
+
3
# Names exported by `from <package> import *`.
__all__ = ["aggregate_and_filter"]
@@ -0,0 +1,290 @@
1
+ import random
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from anndata import AnnData
6
+ from numpy import ndarray
7
+ from scipy.sparse import csr_matrix, issparse
8
+
9
+
10
+ # code inspired by
11
+ # https://www.sc-best-practices.org/conditions/differential_gene_expression.html
12
def aggregate_and_filter(
    adata: AnnData,
    group_key: str = "batch",
    cell_identity_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 3,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = False,
    use_cells: dict[str, list[str]] | None = None,
    make_stats: bool = True,
    make_dummies: bool = True,
) -> AnnData:
    """
    Collapse cells into per-group replica samples by summing their values.

    The input is copied (and optionally restricted with ``use_cells``), cells
    are grouped by ``group_key`` and, when given, ``cell_identity_key``, and
    every retained group is divided into replicas. Each replica becomes one
    observation whose values are the column-wise sum of its cells.

    Parameters
    ----------
    adata : AnnData
        Dataset to aggregate. It is not modified; a copy is processed.
    group_key : str, optional
        ``obs`` column to group cells by. Defaults to 'batch'.
    cell_identity_key : str, optional
        Second ``obs`` column to group by, in addition to ``group_key``.
        Defaults to None.
    layer : str, optional
        Layer to aggregate; None or 'X' selects ``adata.X``. Defaults to None.
    replicas_per_group : int, optional
        Number of replicas created for each group. Defaults to 3.
    min_cells_per_group : int, optional
        Size threshold handed to ``_get_groups_to_drop``; groups it reports
        are skipped. Defaults to 30.
    bootstrap_sampling : bool, optional
        If True, replicas are drawn with replacement instead of splitting the
        shuffled cells. Defaults to False.
    use_cells : dict[str, list[str]], optional
        Mapping of ``obs`` column to allowed values; only matching cells are
        used. Defaults to None.
    make_stats : bool, optional
        Whether to fill ``var`` with per-group expression statistics.
        Defaults to True.
    make_dummies : bool, optional
        Whether to join per-category label columns for ``group_key`` to
        ``obs``. Defaults to True.

    Returns
    -------
    AnnData
        One observation per replica, named ``<group labels>_rep<k>``. ``obs``
        holds the grouping labels, ``replica`` (categorical) and
        ``replica_size`` (int).
    """
    adata = _prepare_dataset(adata, use_cells)

    by_identity = cell_identity_key is not None
    grouping_keys = [group_key, cell_identity_key] if by_identity else [group_key]

    groups_to_drop = _get_groups_to_drop(adata, grouping_keys, min_cells_per_group)

    # Keep the categorical dtypes so the aggregated metadata can be cast back.
    _prepare_categorical_column(adata, group_key)
    group_dtype = adata.obs[group_key].dtype
    if by_identity:
        _prepare_categorical_column(adata, cell_identity_key)
        cell_identity_dtype = adata.obs[cell_identity_key].dtype

    var_dataframe = (
        _create_var_dataframe(adata, layer, grouping_keys, groups_to_drop)
        if make_stats
        else pd.DataFrame(index=adata.var_names)
    )

    summed_values = {}
    replica_metadata = {}
    grouped = adata.obs.groupby(grouping_keys, observed=True).groups
    for label, member_names in grouped.items():
        key = label if isinstance(label, tuple) else (label,)
        if not _including(key, groups_to_drop):
            continue

        sample_id = "_".join(key)
        # Pair each grouping column with the label this group has for it.
        group_metadata = dict(zip(grouping_keys, key))

        adata_group = adata[member_names]
        replicas = _get_replica_idxs(
            adata_group, replicas_per_group, bootstrap_sampling
        )
        for replica_number, replica_names in enumerate(replicas, start=1):
            replica_sample_id = f"{sample_id}_rep{replica_number}"
            X = _get_layer(adata_group[replica_names], layer)

            summed_values[replica_sample_id] = np.array(X.sum(axis=0)).flatten()
            replica_metadata[replica_sample_id] = {
                **group_metadata,
                "replica": str(replica_number),
                "replica_size": len(replica_names),
            }

    # The dicts are keyed by sample, so transpose to get one row per sample.
    data = pd.DataFrame(summed_values).T
    meta = pd.DataFrame(replica_metadata).T
    meta["replica"] = meta["replica"].astype("category")
    meta["replica_size"] = meta["replica_size"].astype(int)
    meta[group_key] = meta[group_key].astype(group_dtype)
    if by_identity:
        meta[cell_identity_key] = meta[cell_identity_key].astype(cell_identity_dtype)

    aggr_adata = AnnData(data.values, obs=meta, var=var_dataframe)

    if make_dummies:
        _join_dummies(aggr_adata, group_key)

    return aggr_adata
129
+
130
+
131
def _prepare_dataset(
    adata: AnnData,
    use_cells: dict[str, list[str]] | None,
) -> AnnData:
    """
    Return a copy of ``adata``, optionally restricted to selected cells.

    ``use_cells`` maps an ``obs`` column name to the values to keep. The
    filters are applied one after another, so a cell must satisfy all of them.
    With ``use_cells=None`` the whole dataset is copied.
    """
    filters = use_cells.items() if use_cells is not None else ()
    for obs_column, allowed_values in filters:
        keep_mask = adata.obs[obs_column].isin(allowed_values)
        adata = adata[keep_mask]

    return adata.copy()
140
+
141
+
142
+ def _get_groups_to_drop(
143
+ adata: AnnData,
144
+ grouping_keys: str | list[str],
145
+ min_cells_per_group: int,
146
+ ):
147
+ group_sizes = adata.obs.groupby(grouping_keys, observed=True).size()
148
+ groups_to_drop = group_sizes[group_sizes < min_cells_per_group].index.to_list()
149
+
150
+ if len(groups_to_drop) > 0:
151
+ print("Dropping the following samples:")
152
+
153
+ groups_to_drop = groups_to_drop + [
154
+ (g,) for g in groups_to_drop if not isinstance(g, tuple)
155
+ ]
156
+
157
+ return groups_to_drop
158
+
159
+
160
def _prepare_categorical_column(adata: AnnData, column: str) -> None:
    """Convert ``adata.obs[column]`` to categorical in place unless it already is."""
    values = adata.obs[column]
    if isinstance(values.dtype, pd.CategoricalDtype):
        return

    adata.obs[column] = values.astype("category")
163
+
164
+
165
def _create_var_dataframe(
    adata: AnnData,
    layer: str,
    grouping_keys: list[str],
    groups_to_drop: list[str],
):
    """
    Tabulate per-variable expression statistics for every retained group.

    For each group accepted by ``_including``, the group's cells are compared
    with all remaining cells (labelled ``not<group>``). Three statistics are
    stored for both sides: the fraction of cells with a value above zero
    (``pct_expr_*``), the number of such cells (``num_expr_*``) and the summed
    values (``tot_expr_*``).

    Returns a DataFrame indexed by ``adata.var_names``.
    """
    stats = pd.DataFrame(
        index=adata.var_names,
        columns=_get_var_dataframe_columns(adata, grouping_keys, groups_to_drop),
        dtype=float,
    )

    grouped = adata.obs.groupby(grouping_keys, observed=True).groups
    for label, member_names in grouped.items():
        key = label if isinstance(label, tuple) else (label,)
        if not _including(key, groups_to_drop):
            continue

        sample_id = "_".join(key)
        rest_id = f"not{sample_id}"

        # Split the dataset into this group's cells and everything else.
        inside = _get_layer(adata[member_names], layer, dense=True)
        is_member = adata.obs_names.isin(member_names)
        outside = _get_layer(adata[~is_member], layer, dense=True)

        inside_expressed = inside > 0
        outside_expressed = outside > 0

        stats[f"pct_expr_{sample_id}"] = inside_expressed.mean(axis=0)
        stats[f"pct_expr_{rest_id}"] = outside_expressed.mean(axis=0)
        stats[f"num_expr_{sample_id}"] = inside_expressed.sum(axis=0)
        stats[f"num_expr_{rest_id}"] = outside_expressed.sum(axis=0)
        stats[f"tot_expr_{sample_id}"] = inside.sum(axis=0)
        stats[f"tot_expr_{rest_id}"] = outside.sum(axis=0)

    return stats
199
+
200
+
201
def _get_var_dataframe_columns(
    adata: AnnData, grouping_keys: list[str], groups_to_drop: list[str]
) -> list[str]:
    """
    Build the statistics column names for every retained group.

    Each group accepted by ``_including`` contributes six names: for the
    prefixes ``pct_expr``, ``num_expr`` and ``tot_expr`` in that order, one
    for the group itself followed by one for its complement (``not<group>``).
    """
    stat_prefixes = ("pct_expr", "num_expr", "tot_expr")
    columns: list[str] = []

    for label in adata.obs.groupby(grouping_keys, observed=True).groups:
        key = label if isinstance(label, tuple) else (label,)
        if not _including(key, groups_to_drop):
            continue

        sample_id = "_".join(key)
        rest_id = f"not{sample_id}"
        for prefix in stat_prefixes:
            columns.append(f"{prefix}_{sample_id}")
            columns.append(f"{prefix}_{rest_id}")

    return columns
229
+
230
+
231
+ def _including(group: tuple | str, groups_to_drop: list[str]) -> bool:
232
+ match group:
233
+ case (gid, cid):
234
+ if isinstance(cid, float) and np.isnan(cid):
235
+ return False
236
+
237
+ case (gid,) | gid:
238
+ ...
239
+
240
+ if gid in groups_to_drop:
241
+ return False
242
+
243
+ return True
244
+
245
+
246
def _get_replica_idxs(
    adata_group: AnnData,
    replicas_per_group: int,
    bootstrap_sampling: bool,
):
    """
    Choose the observation names that make up each replica of a group.

    With ``bootstrap_sampling`` every replica draws ``adata_group.n_obs``
    names with replacement, and a 2-D array with one row per replica is
    returned. Otherwise the names are shuffled and split into
    ``replicas_per_group`` parts of near-equal size, returned as a list of
    arrays.
    """
    cell_names = list(adata_group.obs_names)

    if not bootstrap_sampling:
        random.shuffle(cell_names)
        return np.array_split(np.array(cell_names), replicas_per_group)

    n_cells = adata_group.n_obs
    return np.array(
        [
            np.random.choice(cell_names, size=n_cells, replace=True)
            for _ in range(replicas_per_group)
        ]
    )
266
+
267
+
268
+ def _get_layer(adata: AnnData, layer: str | None, dense: bool = False):
269
+ X: ndarray | csr_matrix
270
+
271
+ if layer is None or layer == "X":
272
+ X = adata.X
273
+ else:
274
+ X = adata.layers[layer]
275
+
276
+ if dense:
277
+ if issparse(X):
278
+ X = np.asarray(X.todense())
279
+ else:
280
+ X = np.asarray(X)
281
+
282
+ return X
283
+
284
+
285
def _join_dummies(aggr_adata: AnnData, group_key: str) -> None:
    """
    Add one label column per category of ``obs[group_key]``.

    For every category ``c`` a column named ``<group_key>_<c>`` is joined to
    ``aggr_adata.obs``. Its value is ``c`` for observations that belong to the
    category and ``"not" + c`` for all others. ``obs[group_key]`` must be
    categorical.
    """
    groups = aggr_adata.obs[group_key]

    # Cast to bool explicitly so the mapping does not depend on the dtype
    # that get_dummies happens to produce (bool or integer indicators).
    is_member = pd.get_dummies(groups, prefix=group_key).astype(bool)
    prefixes = is_member.apply(lambda col: col.map({True: "", False: "not"}))

    # Column-wise: append each category name to its own indicator column.
    labels = prefixes + groups.cat.categories

    aggr_adata.obs = aggr_adata.obs.join(labels)
@@ -0,0 +1,5 @@
1
+ from ._write_excel import write_excel
2
+
3
# Names exported by `from <package> import *`.
__all__ = ["write_excel"]