biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +4 -0
  3. biopipen/core/filters.py +1 -1
  4. biopipen/core/testing.py +2 -1
  5. biopipen/ns/cellranger.py +33 -3
  6. biopipen/ns/regulatory.py +4 -0
  7. biopipen/ns/scrna.py +548 -98
  8. biopipen/ns/scrna_metabolic_landscape.py +4 -0
  9. biopipen/ns/tcr.py +256 -16
  10. biopipen/ns/web.py +5 -0
  11. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +9 -9
  12. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +9 -8
  13. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +9 -9
  14. biopipen/reports/tcr/ClonalStats.svelte +1 -0
  15. biopipen/scripts/cellranger/CellRangerCount.py +55 -11
  16. biopipen/scripts/cellranger/CellRangerVdj.py +54 -8
  17. biopipen/scripts/regulatory/MotifAffinityTest.R +21 -5
  18. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +9 -2
  19. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +15 -6
  20. biopipen/scripts/regulatory/VariantMotifPlot.R +1 -1
  21. biopipen/scripts/regulatory/motifs-common.R +3 -2
  22. biopipen/scripts/scrna/AnnData2Seurat.R +2 -1
  23. biopipen/scripts/scrna/CellCellCommunication.py +26 -14
  24. biopipen/scripts/scrna/CellCellCommunicationPlots.R +23 -4
  25. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  26. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +27 -36
  27. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +42 -26
  28. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +11 -13
  29. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +5 -8
  30. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +5 -8
  31. biopipen/scripts/scrna/CellTypeAnnotation.R +26 -3
  32. biopipen/scripts/scrna/MQuad.py +25 -0
  33. biopipen/scripts/scrna/MarkersFinder.R +128 -30
  34. biopipen/scripts/scrna/ModuleScoreCalculator.R +9 -1
  35. biopipen/scripts/scrna/PseudoBulkDEG.R +113 -27
  36. biopipen/scripts/scrna/ScFGSEA.R +23 -26
  37. biopipen/scripts/scrna/ScVelo.py +20 -8
  38. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
  39. biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -1
  40. biopipen/scripts/scrna/SeuratClustering.R +5 -1
  41. biopipen/scripts/scrna/SeuratMap2Ref.R +1 -2
  42. biopipen/scripts/scrna/SeuratPreparing.R +19 -11
  43. biopipen/scripts/scrna/SeuratSubClustering.R +1 -1
  44. biopipen/scripts/scrna/Slingshot.R +2 -4
  45. biopipen/scripts/scrna/TopExpressingGenes.R +1 -4
  46. biopipen/scripts/scrna/celltypist-wrapper.py +140 -4
  47. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  48. biopipen/scripts/scrna/seurat_anndata_conversion.py +18 -1
  49. biopipen/scripts/tcr/{TCRClustering.R → CDR3Clustering.R} +63 -23
  50. biopipen/scripts/tcr/ClonalStats.R +76 -35
  51. biopipen/utils/misc.py +104 -9
  52. {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/METADATA +5 -2
  53. {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/RECORD +55 -53
  54. {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  55. biopipen/utils/common_docstrs.py +0 -103
  56. {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +0 -0
biopipen/scripts/scrna/celltypist-wrapper.py
@@ -1,4 +1,10 @@
  from argparse import ArgumentParser
+ from typing import Union
+ import numpy as np
+ import pandas as pd
+ import scanpy as sc
+ import celltypist
+ from celltypist.classifier import logger, AnnData, Model, Classifier
 
  parser = ArgumentParser(description="Run CellTypist")
  parser.add_argument(
@@ -18,9 +24,139 @@ parser.add_argument(
  )
 
 
+ def classifier_init(
+     self, filename="", model="", transpose=False, gene_file=None, cell_file=None
+ ):
+     """CellTypist checks whether adata is log1p-normalized to 10000 counts per
+     cell; otherwise it falls back to the raw data if available. However, in
+     some cases the raw data has invalid feature names (var_names), which
+     causes errors. Here we check whether the raw feature names are valid by
+     intersecting them with the model features; if not, we use adata.X instead
+     of adata.raw.X.
+     """
+     if isinstance(model, str):
+         model = Model.load(model)
+     self.model = model
+     if not filename:
+         logger.warn("📭 No input file provided to the classifier")
+         return
+     if isinstance(filename, str):
+         self.filename = filename
+         logger.info(f"📁 Input file is '{self.filename}'")
+         logger.info("⏳ Loading data")
+     if isinstance(filename, str) and filename.endswith(
+         (".csv", ".txt", ".tsv", ".tab", ".mtx", ".mtx.gz")
+     ):
+         self.adata = sc.read(self.filename)
+         if transpose:
+             self.adata = self.adata.transpose()
+         if self.filename.endswith((".mtx", ".mtx.gz")):
+             if (gene_file is None) or (cell_file is None):
+                 raise FileNotFoundError(
+                     "🛑 Missing `gene_file` and/or `cell_file`. Please provide both "
+                     "arguments together with the input mtx file"
+                 )
+             genes_mtx = pd.read_csv(gene_file, header=None)[0].values
+             cells_mtx = pd.read_csv(cell_file, header=None)[0].values
+             if len(genes_mtx) != self.adata.n_vars:
+                 raise ValueError(
+                     f"🛑 The number of genes in {gene_file} does not match the number "
+                     f"of genes in {self.filename}"
+                 )
+             if len(cells_mtx) != self.adata.n_obs:
+                 raise ValueError(
+                     f"🛑 The number of cells in {cell_file} does not match the number "
+                     f"of cells in {self.filename}"
+                 )
+             self.adata.var_names = genes_mtx
+             self.adata.obs_names = cells_mtx
+         if not float(self.adata.X[:1000].max()).is_integer():
+             logger.warn(
+                 "⚠️ Warning: the input file seems not a raw count matrix. The "
+                 "prediction result may not be accurate"
+             )
+         if (
+             (self.adata.n_vars >= 100000)
+             or (len(self.adata.var_names[0]) >= 30)
+             or (
+                 len(
+                     self.adata.obs_names.intersection(
+                         ["GAPDH", "ACTB", "CALM1", "PTPRC", "MALAT1"]
+                     )
+                 )
+                 >= 1
+             )
+         ):
+             logger.warn(
+                 "⚠️ The input matrix is detected to be a gene-by-cell matrix, will "
+                 "transpose it"
+             )
+             self.adata = self.adata.transpose()
+         self.adata.var_names_make_unique()
+         sc.pp.normalize_total(self.adata, target_sum=1e4)
+         sc.pp.log1p(self.adata)
+         self.indata = self.adata.X
+         self.indata_genes = self.adata.var_names
+         self.indata_names = self.adata.obs_names
+     elif isinstance(filename, AnnData) or (
+         isinstance(filename, str) and filename.endswith(".h5ad")
+     ):
+         self.adata = sc.read(filename) if isinstance(filename, str) else filename
+         self.adata.var_names_make_unique()
+         # When to use raw.X?
+         # 1. if adata.raw exists
+         # 2. if adata.raw.var_names has an intersection with the model genes
+         # 3. if adata.X is not in the expected range
+         use_raw = self.adata.raw and (
+             self.adata.X[:1000].min() < 0 or self.adata.X[:1000].max() > 9.22
+         ) and np.isin(
+             self.adata.raw.var_names, self.model.classifier.features
+         ).sum() > 0
+
+         if use_raw:
+             if not self.adata.raw:
+                 raise ValueError(
+                     "🛑 Invalid expression matrix in `.X`, expect log1p normalized "
+                     "expression to 10000 counts per cell"
+                 )
+             elif (self.adata.raw.X[:1000].min() < 0) or (
+                 self.adata.raw.X[:1000].max() > 9.22
+             ):
+                 raise ValueError(
+                     "🛑 Invalid expression matrix in both `.X` and `.raw.X`, expect "
+                     "log1p normalized expression to 10000 counts per cell"
+                 )
+             else:
+                 logger.info(
+                     "👀 Invalid expression matrix in `.X`, expect log1p normalized "
+                     "expression to 10000 counts per cell; will use `.raw.X` instead"
+                 )
+                 self.indata = self.adata.raw.X
+                 self.indata_genes = self.adata.raw.var_names
+                 self.indata_names = self.adata.raw.obs_names
+         else:
+             self.indata = self.adata.X
+             self.indata_genes = self.adata.var_names
+             self.indata_names = self.adata.obs_names
+         if np.abs(np.expm1(self.indata[0]).sum() - 10000) > 1:
+             logger.warn(
+                 "⚠️ Warning: invalid expression matrix, expect ALL genes and log1p "
+                 "normalized expression to 10000 counts per cell. The prediction result "
+                 "may not be accurate"
+             )
+     else:
+         raise ValueError(
+             "🛑 Invalid input. Supported types: .csv, .txt, .tsv, .tab, .mtx, .mtx.gz "
+             "and .h5ad, or AnnData loaded in memory"
+         )
+
+     logger.info(
+         f"🔬 Input data has {self.indata.shape[0]} cells and {len(self.indata_genes)} "
+         "genes"
+     )
+
+
  if __name__ == "__main__":
-     import scanpy as sc
-     import celltypist
+     Classifier.__init__ = classifier_init  # type: ignore
 
      args = parser.parse_args()
      adata = sc.read_h5ad(args.input)
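An aside on the 9.22 bound that recurs in the checks above: CellTypist expects `.X` to be log1p-normalized to 10,000 counts per cell, so the largest value a valid cell can hold is log1p(10000). A one-liner to confirm (illustrative, not part of the diff):

```python
import numpy as np

# log1p of the largest possible per-cell value after normalize_total(target_sum=1e4)
print(np.log1p(10_000))  # 9.2104..., hence 9.22 as the upper bound with a little slack
```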
@@ -29,8 +165,8 @@ if __name__ == "__main__":
          raise ValueError(
              f"Over clustering column '{over_clustering}' not found in AnnData object."
          )
-     if 'neighbors' in adata.uns and 'params' in adata.uns['neighbors']:
-         adata.uns['neighbors']['params'].setdefault('n_neighbors', 15)
+     if "neighbors" in adata.uns and "params" in adata.uns["neighbors"]:
+         adata.uns["neighbors"]["params"].setdefault("n_neighbors", 15)
 
      annotated = celltypist.annotate(
          adata,
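How the patch takes effect: the wrapper reassigns `Classifier.__init__` before calling `celltypist.annotate`, which constructs a `Classifier` internally. A minimal sketch of that flow inside the wrapper, using the `classifier_init` defined above (the model name and flags here are illustrative assumptions, not taken from the diff):

```python
import scanpy as sc
import celltypist
from celltypist.classifier import Classifier

# Apply the monkey-patch before any Classifier is constructed
Classifier.__init__ = classifier_init  # type: ignore

adata = sc.read_h5ad("input.h5ad")  # illustrative path
# annotate() builds a Classifier internally, so the patched __init__
# now decides between adata.X and adata.raw.X
annotated = celltypist.annotate(
    adata, model="Immune_All_Low.pkl", majority_voting=True  # assumed model/flags
)
result = annotated.to_adata()
```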
biopipen/scripts/scrna/scvelo_paga.py
@@ -0,0 +1,313 @@
+ """This file is used to patch scvelo's paga to fix
+ https://github.com/theislab/scvelo/issues/1241
+
+ This is from pull request
+ https://github.com/theislab/scvelo/pull/1308
+ which has not been merged yet as of 2025-11-07.
+ """
+
+ import numpy as np
+ import pandas as pd
+ from scipy.sparse import csr_matrix
+
+ from scanpy.tools._paga import PAGA
+ import scvelo
+
+ # This is adapted from https://github.com/theislab/paga
+ from scvelo import logging as logg
+ from scvelo import settings
+ from scvelo.tools.rank_velocity_genes import velocity_clusters
+ from scvelo.tools.utils import strings_to_categoricals
+ from scvelo.tools.velocity_graph import vals_to_csr
+ from scvelo.tools.velocity_pseudotime import velocity_pseudotime
+
+
+ # TODO: Finish docstrings
+ def get_igraph_from_adjacency(adjacency, directed=None):
+     """Get igraph graph from adjacency matrix."""
+     import igraph as ig
+
+     sources, targets = adjacency.nonzero()
+     weights = adjacency[sources, targets]
+     if isinstance(weights, np.matrix):
+         weights = weights.A1
+     g = ig.Graph(directed=directed)
+     g.add_vertices(adjacency.shape[0])  # this adds adjacency.shape[0] vertices
+     g.add_edges(list(zip(sources, targets)))
+     g.es["weight"] = weights
+     if g.vcount() != adjacency.shape[0]:
+         logg.warn(
+             f"The constructed graph has only {g.vcount()} nodes. "
+             "Your adjacency matrix contained redundant nodes."
+         )
+     return g
+
+
+ # TODO: Add docstrings
+ def get_sparse_from_igraph(graph, weight_attr=None):
+     """TODO."""
+     edges = graph.get_edgelist()
+     if weight_attr is None:
+         weights = [1] * len(edges)
+     else:
+         weights = graph.es[weight_attr]
+     if not graph.is_directed():
+         edges.extend([(v, u) for u, v in edges])
+         weights.extend(weights)
+     shape = graph.vcount()
+     shape = (shape, shape)
+     if len(edges) > 0:
+         rows, cols = zip(*edges)
+         return csr_matrix((weights, (rows, cols)), shape=shape)
+     else:
+         return csr_matrix(shape)
+
+
+ # TODO: Finish docstrings
+ def set_row_csr(csr, rows, value=0):
+     """Set all nonzero elements to the given value. Useful to set to 0 mostly."""
+     for row in rows:
+         start = csr.indptr[row]
+         end = csr.indptr[row + 1]
+         csr.data[start:end] = value
+     if value == 0:
+         csr.eliminate_zeros()
+
+
+ # TODO: Add docstrings
+ class PAGA_tree(PAGA):
+     """TODO."""
+
+     def __init__(
+         self,
+         adata,
+         groups=None,
+         vkey=None,
+         use_time_prior=None,
+         root_key=None,
+         end_key=None,
+         threshold_root_end_prior=None,
+         minimum_spanning_tree=None,
+     ):
+         super().__init__(adata=adata, groups=groups, model="v1.2")
+         self.groups = groups
+         self.vkey = vkey
+         self.use_time_prior = use_time_prior
+         self.root_key = root_key
+         self.end_key = end_key
+         self.threshold_root_end_prior = threshold_root_end_prior
+         if self.threshold_root_end_prior is None:
+             self.threshold_root_end_prior = 0.9
+         self.minimum_spanning_tree = minimum_spanning_tree
+
+     # TODO: Add docstrings
+     def compute_transitions(self):
+         """TODO."""
+         try:
+             import igraph
+         except ImportError:
+             raise ImportError("To run paga, you need to install igraph (`pip install igraph`)")
+         vkey = f"{self.vkey}_graph"
+         if vkey not in self._adata.uns:
+             raise ValueError(
+                 "The passed AnnData needs to have an `uns` annotation "
+                 "with key 'velocity_graph' - a sparse matrix from RNA velocity."
+             )
+         if self._adata.uns[vkey].shape != (self._adata.n_obs, self._adata.n_obs):
+             raise ValueError(
+                 f"The passed 'velocity_graph' has shape {self._adata.uns[vkey].shape} "
+                 f"but should have shape {(self._adata.n_obs, self._adata.n_obs)}"
+             )
+
+         clusters = self._adata.obs[self.groups]
+         cats = clusters.cat.categories
+         vgraph = self._adata.uns[vkey] > 0.1
+         time_prior = self.use_time_prior
+
+         if isinstance(time_prior, str) and time_prior in self._adata.obs.keys():
+             vpt = self._adata.obs[time_prior].values
+             vpt_mean = self._adata.obs.groupby(self.groups)[time_prior].mean()
+             vpt_means = np.array([vpt_mean[cat] for cat in clusters])
+             rows, cols, vals = [], [], []
+             for i in range(vgraph.shape[0]):
+                 indices = vgraph[i].indices
+                 idx_bool = vpt[i] < vpt[indices]
+                 idx_bool &= vpt_means[indices] > vpt_means[i] - 0.1
+                 cols.extend(indices[idx_bool])
+                 vals.extend(vgraph[i].data[idx_bool])
+                 rows.extend([i] * np.sum(idx_bool))
+             vgraph = vals_to_csr(vals, rows, cols, shape=vgraph.shape)
+
+         lb = self.threshold_root_end_prior  # cells to be considered as terminal states
+         if isinstance(self.end_key, str) and self.end_key in self._adata.obs.keys():
+             set_row_csr(vgraph, rows=np.where(self._adata.obs[self.end_key] > lb)[0])
+         if isinstance(self.root_key, str) and self.root_key in self._adata.obs.keys():
+             vgraph[:, np.where(self._adata.obs[self.root_key] > lb)[0]] = 0
+         vgraph.eliminate_zeros()
+
+         membership = self._adata.obs[self.groups].cat.codes.values
+         g = get_igraph_from_adjacency(vgraph, directed=True)
+         vc = igraph.VertexClustering(g, membership=membership)
+         cg_full = vc.cluster_graph(combine_edges="sum")
+         transitions = get_sparse_from_igraph(cg_full, weight_attr="weight")
+         transitions = transitions - transitions.T
+         transitions_conf = transitions.copy()
+         transitions = transitions.tocoo()
+         total_n = self._neighbors.n_neighbors * np.array(vc.sizes())
+         for i, j, v in zip(transitions.row, transitions.col, transitions.data):
+             reference = np.sqrt(total_n[i] * total_n[j])
+             transitions_conf[i, j] = 0 if v < 0 else v / reference
+         transitions_conf.eliminate_zeros()
+
+         # remove non-confident direct paths if a more confident indirect path is found.
+         T = transitions_conf.toarray()
+         threshold = max(np.nanmin(np.nanmax(T / (T > 0), axis=0)) - 1e-6, 0.01)
+         T *= T > threshold
+         for i in range(len(T)):
+             idx = T[i] > 0
+             if np.any(idx):
+                 indirect = np.clip(T[idx], None, T[i][idx][:, None]).max(0)
+                 T[i, T[i] < indirect] = 0
+
+         if self.minimum_spanning_tree:
+             T_tmp = T.copy()
+             T_num = T > 0
+             T_sum = np.sum(T_num, 0)
+             T_max = np.max(T_tmp)
+             for i in range(len(T_tmp)):
+                 if T_sum[i] == 1:
+                     T_tmp[np.where(T_num[:, i])[0][0], i] = T_max
+             from scipy.sparse.csgraph import minimum_spanning_tree
+
+             T_tmp = np.abs(minimum_spanning_tree(-T_tmp).toarray()) > 0
+             T = T_tmp * T
+
+         transitions_conf = csr_matrix(T)
+         self.transitions_confidence = transitions_conf.T
+
+         # set threshold for minimal spanning tree.
+         df = pd.DataFrame(T, index=cats, columns=cats)
+         self.threshold = np.nanmin(np.nanmax(df.values / (df.values > 0), axis=0))
+         self.threshold = max(self.threshold - 1e-6, 0.01)
+
+
+ def paga(
+     adata,
+     groups=None,
+     vkey="velocity",
+     use_time_prior=True,
+     root_key=None,
+     end_key=None,
+     threshold_root_end_prior=None,
+     minimum_spanning_tree=True,
+     copy=False,
+ ):
+     """PAGA graph with velocity-directed edges.
+
+     Mapping out the coarse-grained connectivity structures of complex manifolds
+     :cite:p:`Wolf19`. By quantifying the connectivity of partitions (groups,
+     clusters) of the single-cell graph, partition-based graph abstraction (PAGA)
+     generates a much simpler abstracted graph (*PAGA graph*) of partitions, in
+     which edge weights represent confidence in the presence of connections.
+
+     Parameters
+     ----------
+     adata : :class:`~anndata.AnnData`
+         An annotated data matrix.
+     groups : key for categorical in `adata.obs`, optional (default: 'louvain')
+         You can pass your predefined groups by choosing any categorical
+         annotation of observations (`adata.obs`).
+     vkey : `str` or `None` (default: `None`)
+         Key for annotations of observations/cells or variables/genes.
+     use_time_prior : `str` or bool, optional (default: True)
+         Obs key for pseudo-time values.
+         If True, 'velocity_pseudotime' is used if available.
+     root_key : `str` or bool, optional (default: None)
+         Obs key for root states.
+     end_key : `str` or bool, optional (default: None)
+         Obs key for end states.
+     threshold_root_end_prior : `float` (default: 0.9)
+         Threshold for root and final states priors, to be in the range of [0,1].
+         Values above the threshold will be considered as terminal and included as prior.
+     minimum_spanning_tree : bool, optional (default: True)
+         Whether to prune the tree such that a path from A-to-B
+         is removed if another more confident path exists.
+     copy : `bool`, optional (default: `False`)
+         Copy `adata` before computation and return a copy.
+         Otherwise, perform computation inplace and return `None`.
+
+     Returns
+     -------
+     connectivities: `.uns`
+         The full adjacency matrix of the abstracted graph, weights correspond to
+         confidence in the connectivities of partitions.
+     connectivities_tree: `.uns`
+         The adjacency matrix of the tree-like subgraph that best explains the topology.
+     transitions_confidence: `.uns`
+         The adjacency matrix of the abstracted directed graph, weights correspond to
+         confidence in the transitions between partitions.
+     """
+     if "neighbors" not in adata.uns:
+         raise ValueError(
+             "You need to run `pp.neighbors` first to compute a neighborhood graph."
+         )
+
+     adata = adata.copy() if copy else adata
+     strings_to_categoricals(adata)
+
+     if groups is None:
+         groups = (
+             "clusters"
+             if "clusters" in adata.obs.keys()
+             else "louvain"
+             if "louvain" in adata.obs.keys()
+             else None
+         )
+     elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys():
+         velocity_clusters(adata)
+     if use_time_prior and not isinstance(use_time_prior, str):
+         use_time_prior = "velocity_pseudotime"
+         if use_time_prior not in adata.obs.keys():
+             velocity_pseudotime(adata, vkey=vkey, root_key=root_key, end_key=end_key)
+
+     priors = [p for p in [use_time_prior, root_key, end_key] if p in adata.obs.keys()]
+     logg.info(
+         "running PAGA",
+         f"using priors: {priors}" if len(priors) > 0 else "",
+         r=True,
+     )
+     paga = PAGA_tree(
+         adata,
+         groups,
+         vkey=vkey,
+         use_time_prior=use_time_prior,
+         root_key=root_key,
+         end_key=end_key,
+         threshold_root_end_prior=threshold_root_end_prior,
+         minimum_spanning_tree=minimum_spanning_tree,
+     )
+
+     if "paga" not in adata.uns:
+         adata.uns["paga"] = {}
+
+     paga.compute_connectivities()
+     adata.uns["paga"]["connectivities"] = paga.connectivities
+     adata.uns["paga"]["connectivities_tree"] = paga.connectivities_tree
+     adata.uns[f"{groups}_sizes"] = np.array(paga.ns)
+
+     paga.compute_transitions()
+     adata.uns["paga"]["transitions_confidence"] = paga.transitions_confidence
+     adata.uns["paga"]["threshold"] = paga.threshold
+     adata.uns["paga"]["groups"] = groups
+
+     logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
+     logg.hint(
+         "added\n"
+         "    'paga/connectivities', connectivities adjacency (adata.uns)\n"
+         "    'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
+         "    'paga/transitions_confidence', velocity transitions (adata.uns)"
+     )
+
+     return adata if copy else None
+
+
+ scvelo.tl.paga = paga
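Because the module's final statement rebinds `scvelo.tl.paga`, importing it anywhere before calling `paga` activates the fix. A hedged sketch of the intended use (the import name follows the script's filename; the dataset and preprocessing calls are the standard scvelo workflow, not shown in the diff):

```python
import scvelo as scv
import scvelo_paga  # noqa: F401 -- importing applies the monkey-patch

adata = scv.datasets.pancreas()  # example dataset shipped with scvelo
scv.pp.filter_and_normalize(adata)
scv.pp.moments(adata)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)  # paga requires uns["velocity_graph"]
scv.tl.paga(adata, groups="clusters")  # now dispatches to the patched PAGA_tree
```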
@@ -2,6 +2,7 @@
biopipen/scripts/scrna/seurat_anndata_conversion.py
 
  Need R and R packages Seurat, SeuratDisk and biopipen.utils.R installed.
  """
+ from __future__ import annotations
 
 
  def convert_seurat_to_anndata(
@@ -10,7 +11,8 @@ def convert_seurat_to_anndata(
      assay=None,
      subset=None,
      rscript="Rscript",
- ):
+     return_ident_col=False,
+ ) -> None | str:
      """Convert Seurat object to AnnData format.
 
      Args:
@@ -43,6 +45,21 @@ def convert_seurat_to_anndata(
      cmd = [rscript, temp_script_path]
      run_command(cmd, fg=True)
 
+     if return_ident_col:
+         ident_col_script = f"""
+         library(biopipen.utils)
+
+         obj <- read_obj("{input_file}")
+         cat(GetIdentityColumn(obj))
+         """
+         with NamedTemporaryFile(suffix=".R", delete=False) as temp_script:
+             temp_script.write(ident_col_script.encode('utf-8'))
+             temp_script_path = temp_script.name
+
+         cmd = [rscript, temp_script_path]
+         ident_col = run_command(cmd, stdout="RETURN").strip()
+         return ident_col
+
 
  def convert_anndata_to_seurat(
      input_file,
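A hedged sketch of the new `return_ident_col` flag added above. Only `input_file` (referenced inside the new block), `assay`, `subset`, `rscript`, and `return_ident_col` are visible in the hunks; the output-path parameter name used here is an assumption:

```python
# outfile= is an assumed parameter name; the hunks do not show the
# leading parameters of convert_seurat_to_anndata.
ident_col = convert_seurat_to_anndata(
    input_file="obj.rds",   # Seurat object on disk (illustrative path)
    outfile="obj.h5ad",     # assumed name for the converted AnnData path
    rscript="Rscript",
    return_ident_col=True,  # new flag: also report the identity column
)
print(ident_col)  # e.g. "seurat_clusters"; the function returns None otherwise
```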
biopipen/scripts/tcr/CDR3Clustering.R
@@ -13,6 +13,7 @@ python <- {{envs.python | r}}
  within_sample <- {{envs.within_sample | r}}
  args <- {{envs.args | r}}
  chain <- {{envs.chain | r}}
+ type <- {{envs.type | r}}
 
  setwd(outdir)
 
@@ -22,7 +23,36 @@ log$info("Reading input file ...")
  obj <- read_obj(screpfile)
  is_seurat <- inherits(obj, "Seurat")
 
- get_cdr3aa_df = function() {
+
+ get_type <- function() {
+     if (!is_seurat) {
+         for (sample in names(obj)) {
+             for (gene in obj[[sample]]$CTgene) {
+                 if (grepl("^TRB", gene) || grepl("^TRG", gene) || grepl("^TRA", gene) || grepl("^TRD", gene)) {
+                     return("TCR")
+                 } else if (grepl("^IGH", gene) || grepl("^IGK", gene) || grepl("^IGL", gene)) {
+                     return("BCR")
+                 }
+             }
+         }
+     } else {
+         for (gene in obj@meta.data$CTgene) {
+             if (grepl("^TRB", gene) || grepl("^TRG", gene) || grepl("^TRA", gene) || grepl("^TRD", gene)) {
+                 return("TCR")
+             } else if (grepl("^IGH", gene) || grepl("^IGK", gene) || grepl("^IGL", gene)) {
+                 return("BCR")
+             }
+         }
+     }
+     stop("Cannot determine the type of the data (TCR or BCR). Please set envs.type to 'TCR' or 'BCR'.")
+ }
+
+ if (type == "auto") {
+     type <- get_type()
+     log$info("Auto-detected data type: {type}")
+ }
+
+ get_cdr3aa_df <- function() {
      if (!is_seurat) {
          out <- NULL
          for (sample in names(obj)) {
@@ -32,10 +62,12 @@ get_cdr3aa_df = function() {
              )
              if (chain == "both") {
                  df$CDR3.aa <- obj[[sample]]$CTaa
-             } else if (chain == "alpha") {
+             } else if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
                  df$CDR3.aa <- obj[[sample]]$cdr3_aa1
-             } else if (chain == "beta") {
+             } else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
                  df$CDR3.aa <- obj[[sample]]$cdr3_aa2
+             } else {
+                 stop(paste("Unknown chain:", chain, "for", type))
              }
              out <- rbind(out, df)
          }
@@ -47,11 +79,13 @@ get_cdr3aa_df = function() {
          if (chain == "both") {
              out$CDR3.aa <- out$CTaa
          } else {
-             out <- separate(out, CTaa, into = c("alpha.aa", "beta.aa"), sep = "_")
-             if (chain == "alpha") {
-                 out$CDR3.aa <- out$alpha.aa
-             } else if (chain == "beta") {
-                 out$CDR3.aa <- out$beta.aa
+             out <- separate(out, CTaa, into = c("first", "second"), sep = "_")
+             if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
+                 out$CDR3.aa <- out$first
+             } else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
+                 out$CDR3.aa <- out$second
+             } else {
+                 stop(paste("Unknown chain:", chain, "for", type))
              }
          }
      } else {
@@ -132,21 +166,24 @@ output.clusters_df.to_csv(clustcr_dir + "/clusters.txt", sep="\t", index=False)
 
  clean_clustcr_output = function(clustcr_outfile) {
      clustcr_out = read.delim2(clustcr_outfile, header=TRUE, row.names = NULL)
-     colnames(clustcr_out) = c("CDR3.aa", "TCR_Cluster")
+     colnames(clustcr_out) = c("CDR3.aa", "CDR3_Cluster")
      out = left_join(cdr3aa_df, distinct(clustcr_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
          mutate(
-             TCR_Cluster = if_else(
-                 is.na(TCR_Cluster),
+             CDR3_Cluster = if_else(
+                 is.na(CDR3_Cluster),
                  paste0("S_", row_number()),
-                 paste0("M_", as.character(TCR_Cluster))
+                 paste0("M_", as.character(CDR3_Cluster))
             )
         )
 
     if (within_sample) {
-         out <- mutate(out, TCR_Cluster = paste0(Sample, ".", TCR_Cluster))
+         out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
     }
 
-     left_join(cdr3aa_df, out, by = "CDR3.aa")
+     # This join would result in more rows than dplyr can handle
+     # left_join(cdr3aa_df, out, by = "CDR3.aa")
+     out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
+     cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
  }
 
  run_clustcr = function() {
@@ -208,25 +245,28 @@ prepare_input = function() {
 
  clean_giana_output = function(giana_outfile) {
      # generate an output file with columns:
-     # CDR3.aa, TCR_Cluster, V.name, Sample
+     # CDR3.aa, CDR3_Cluster, V.name, Sample
      # If sequence doesn't exist in the input file,
      # then a unique cluster id is assigned to it.
      giana_out = read.delim2(giana_outfile, header=FALSE, comment.char = "#", row.names = NULL)[, 1:2, drop=FALSE]
-     colnames(giana_out) = c("CDR3.aa", "TCR_Cluster")
+     colnames(giana_out) = c("CDR3.aa", "CDR3_Cluster")
      out = left_join(cdr3aa_df, distinct(giana_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
          mutate(
-             TCR_Cluster = if_else(
-                 is.na(TCR_Cluster),
+             CDR3_Cluster = if_else(
+                 is.na(CDR3_Cluster),
                  paste0("S_", row_number()),
-                 paste0("M_", as.character(TCR_Cluster))
+                 paste0("M_", as.character(CDR3_Cluster))
             )
         )
 
     if (within_sample) {
-         out <- mutate(out, TCR_Cluster = paste0(Sample, ".", TCR_Cluster))
+         out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
     }
 
-     left_join(cdr3aa_df, out, by = "CDR3.aa")
+     # This join would result in more rows than dplyr can handle
+     # left_join(cdr3aa_df, out, by = "CDR3.aa")
+     out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
+     cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
  }
 
  run_giana = function() {
@@ -276,12 +316,12 @@ attach_to_obj = function(obj, out) {
      rownames(out) <- out$Barcode
      if (is_seurat) {
          # Attach results to Seurat object
-         obj@meta.data$TCR_Cluster <- out[rownames(obj@meta.data), "TCR_Cluster"]
+         obj@meta.data$CDR3_Cluster <- out[rownames(obj@meta.data), "CDR3_Cluster"]
      } else {
          # Attach results to the list of data frames
          for (sample in names(obj)) {
              sout <- filter(out, Sample == sample)
-             obj[[sample]]$TCR_Cluster <- sout[obj[[sample]]$barcode, "TCR_Cluster"]
+             obj[[sample]]$CDR3_Cluster <- sout[obj[[sample]]$barcode, "CDR3_Cluster"]
          }
      }
      obj
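A note on the `left_join` replacement that appears twice above: joining `cdr3aa_df` back to `out` on `CDR3.aa` is many-to-many (the same CDR3 sequence occurs on many barcodes), so the row count can explode, whereas `match()` picks the first matching row per input row. The same idea expressed in pandas, as an illustrative aside rather than package code:

```python
import pandas as pd

cdr3aa_df = pd.DataFrame(
    {"CDR3.aa": ["CASSL", "CASSL", "CASRG"], "Barcode": ["b1", "b2", "b3"]}
)
clusters = pd.DataFrame(
    {"CDR3.aa": ["CASSL", "CASRG"], "CDR3_Cluster": ["M_1", "M_2"]}
)

# merge() on duplicated keys multiplies rows; reindexing by the key keeps
# exactly one row per input row, mirroring out[match(...), ] in R.
aligned = (
    clusters.drop_duplicates("CDR3.aa")
    .set_index("CDR3.aa")
    .reindex(cdr3aa_df["CDR3.aa"])
    .reset_index(drop=True)
)
print(pd.concat([cdr3aa_df, aligned], axis=1))
```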