biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +4 -0
- biopipen/core/filters.py +1 -1
- biopipen/core/testing.py +2 -1
- biopipen/ns/cellranger.py +33 -3
- biopipen/ns/regulatory.py +4 -0
- biopipen/ns/scrna.py +548 -98
- biopipen/ns/scrna_metabolic_landscape.py +4 -0
- biopipen/ns/tcr.py +256 -16
- biopipen/ns/web.py +5 -0
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +9 -9
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +9 -8
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +9 -9
- biopipen/reports/tcr/ClonalStats.svelte +1 -0
- biopipen/scripts/cellranger/CellRangerCount.py +55 -11
- biopipen/scripts/cellranger/CellRangerVdj.py +54 -8
- biopipen/scripts/regulatory/MotifAffinityTest.R +21 -5
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +9 -2
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +15 -6
- biopipen/scripts/regulatory/VariantMotifPlot.R +1 -1
- biopipen/scripts/regulatory/motifs-common.R +3 -2
- biopipen/scripts/scrna/AnnData2Seurat.R +2 -1
- biopipen/scripts/scrna/CellCellCommunication.py +26 -14
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +23 -4
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +27 -36
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +42 -26
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +11 -13
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +5 -8
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +5 -8
- biopipen/scripts/scrna/CellTypeAnnotation.R +26 -3
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +128 -30
- biopipen/scripts/scrna/ModuleScoreCalculator.R +9 -1
- biopipen/scripts/scrna/PseudoBulkDEG.R +113 -27
- biopipen/scripts/scrna/ScFGSEA.R +23 -26
- biopipen/scripts/scrna/ScVelo.py +20 -8
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -1
- biopipen/scripts/scrna/SeuratClustering.R +5 -1
- biopipen/scripts/scrna/SeuratMap2Ref.R +1 -2
- biopipen/scripts/scrna/SeuratPreparing.R +19 -11
- biopipen/scripts/scrna/SeuratSubClustering.R +1 -1
- biopipen/scripts/scrna/Slingshot.R +2 -4
- biopipen/scripts/scrna/TopExpressingGenes.R +1 -4
- biopipen/scripts/scrna/celltypist-wrapper.py +140 -4
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +18 -1
- biopipen/scripts/tcr/{TCRClustering.R → CDR3Clustering.R} +63 -23
- biopipen/scripts/tcr/ClonalStats.R +76 -35
- biopipen/utils/misc.py +104 -9
- {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/METADATA +5 -2
- {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/RECORD +55 -53
- {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- biopipen/utils/common_docstrs.py +0 -103
- {biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
from argparse import ArgumentParser
|
|
2
|
+
from typing import Union
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import scanpy as sc
|
|
6
|
+
import celltypist
|
|
7
|
+
from celltypist.classifier import logger, AnnData, Model, Classifier
|
|
2
8
|
|
|
3
9
|
parser = ArgumentParser(description="Run CellTypist")
|
|
4
10
|
parser.add_argument(
|
|
@@ -18,9 +24,139 @@ parser.add_argument(
|
|
|
18
24
|
)
|
|
19
25
|
|
|
20
26
|
|
|
27
|
+
def classifier_init(
    self, filename="", model="", transpose=False, gene_file=None, cell_file=None
):
    """Initialize a celltypist ``Classifier`` (installed as its ``__init__``).

    Celltypist checks whether ``adata.X`` is in the range of log1p normalized
    data (10000 counts per cell) and otherwise uses ``adata.raw`` if available.
    In some cases the raw data has invalid feature names (``var_names``), which
    causes errors. Here ``adata.raw`` is only used when its feature names
    intersect with the model features; otherwise ``adata.X`` is used.

    Args:
        filename: Path to a ``.csv``/``.txt``/``.tsv``/``.tab``/``.mtx``/
            ``.mtx.gz``/``.h5ad`` file, or an ``AnnData`` object in memory.
            A falsy value only stores the model and returns.
        model: A loaded model, or a string passed to ``Model.load``.
        transpose: Transpose the matrix after reading (text/mtx input only).
        gene_file: Gene names for mtx input, first column, no header.
        cell_file: Cell names for mtx input, first column, no header.

    Raises:
        FileNotFoundError: mtx input without both ``gene_file`` and
            ``cell_file``.
        ValueError: gene/cell name counts do not match the mtx matrix, both
            ``.X`` and a usable ``.raw.X`` are out of range, or the input
            type is not supported.
    """
    if isinstance(model, str):
        model = Model.load(model)
    self.model = model
    if not filename:
        logger.warn("📭 No input file provided to the classifier")
        return
    if isinstance(filename, str):
        self.filename = filename
        logger.info(f"📁 Input file is '{self.filename}'")
        logger.info("⏳ Loading data")
    if isinstance(filename, str) and filename.endswith(
        (".csv", ".txt", ".tsv", ".tab", ".mtx", ".mtx.gz")
    ):
        self.adata = sc.read(self.filename)
        if transpose:
            self.adata = self.adata.transpose()
        if self.filename.endswith((".mtx", ".mtx.gz")):
            if (gene_file is None) or (cell_file is None):
                raise FileNotFoundError(
                    "🛑 Missing `gene_file` and/or `cell_file`. Please provide both "
                    "arguments together with the input mtx file"
                )
            genes_mtx = pd.read_csv(gene_file, header=None)[0].values
            cells_mtx = pd.read_csv(cell_file, header=None)[0].values
            if len(genes_mtx) != self.adata.n_vars:
                raise ValueError(
                    f"🛑 The number of genes in {gene_file} does not match the number "
                    f"of genes in {self.filename}"
                )
            if len(cells_mtx) != self.adata.n_obs:
                raise ValueError(
                    f"🛑 The number of cells in {cell_file} does not match the number "
                    f"of cells in {self.filename}"
                )
            self.adata.var_names = genes_mtx
            self.adata.obs_names = cells_mtx
        # Only the first 1000 rows are inspected; a non-integer maximum
        # suggests the matrix does not hold raw counts.
        if not float(self.adata.X[:1000].max()).is_integer():
            logger.warn(
                "⚠️ Warning: the input file seems not a raw count matrix. The "
                "prediction result may not be accurate"
            )
        # Heuristics for a gene-by-cell layout: very many columns, very long
        # column names, or well-known gene symbols among the row names.
        if (
            (self.adata.n_vars >= 100000)
            or (len(self.adata.var_names[0]) >= 30)
            or (
                len(
                    self.adata.obs_names.intersection(
                        ["GAPDH", "ACTB", "CALM1", "PTPRC", "MALAT1"]
                    )
                )
                >= 1
            )
        ):
            logger.warn(
                "⚠️ The input matrix is detected to be a gene-by-cell matrix, will "
                "transpose it"
            )
            self.adata = self.adata.transpose()
        self.adata.var_names_make_unique()
        sc.pp.normalize_total(self.adata, target_sum=1e4)
        sc.pp.log1p(self.adata)
        self.indata = self.adata.X
        self.indata_genes = self.adata.var_names
        self.indata_names = self.adata.obs_names
    elif isinstance(filename, AnnData) or (
        isinstance(filename, str) and filename.endswith(".h5ad")
    ):
        self.adata = sc.read(filename) if isinstance(filename, str) else filename
        self.adata.var_names_make_unique()

        # 9.22 is just above log1p(10000) ~= 9.21, the largest value a
        # log1p-normalized (10000 counts per cell) matrix can hold.
        x_head = self.adata.X[:1000]
        x_out_of_range = bool(x_head.min() < 0 or x_head.max() > 9.22)

        # Use raw.X only when all three hold (checked lazily, in this order):
        #   1. adata.raw exists
        #   2. adata.X is not in the expected range
        #   3. adata.raw.var_names intersects with the model genes
        use_raw = (
            bool(self.adata.raw)
            and x_out_of_range
            and bool(
                np.isin(
                    self.adata.raw.var_names, self.model.classifier.features
                ).any()
            )
        )

        if use_raw:
            raw_head = self.adata.raw.X[:1000]
            if (raw_head.min() < 0) or (raw_head.max() > 9.22):
                raise ValueError(
                    "🛑 Invalid expression matrix in both `.X` and `.raw.X`, expect "
                    "log1p normalized expression to 10000 counts per cell"
                )
            logger.info(
                "👀 Invalid expression matrix in `.X`, expect log1p normalized "
                "expression to 10000 counts per cell; will use `.raw.X` instead"
            )
            self.indata = self.adata.raw.X
            self.indata_genes = self.adata.raw.var_names
            self.indata_names = self.adata.raw.obs_names
        else:
            if x_out_of_range:
                # `.raw` is missing or shares no features with the model, so
                # `.X` is the only candidate left; say so instead of
                # continuing silently.
                logger.warn(
                    "⚠️ Expression matrix in `.X` is not in the expected range of "
                    "log1p normalized expression to 10000 counts per cell and no "
                    "usable `.raw` is available; will use `.X` as is"
                )
            self.indata = self.adata.X
            self.indata_genes = self.adata.var_names
            self.indata_names = self.adata.obs_names
        # Sanity check on the first cell only: undoing log1p should sum to
        # about 10000.
        if np.abs(np.expm1(self.indata[0]).sum() - 10000) > 1:
            logger.warn(
                "⚠️ Warning: invalid expression matrix, expect ALL genes and log1p "
                "normalized expression to 10000 counts per cell. The prediction result "
                "may not be accurate"
            )
    else:
        raise ValueError(
            "🛑 Invalid input. Supported types: .csv, .txt, .tsv, .tab, .mtx, .mtx.gz "
            "and .h5ad, or AnnData loaded in memory"
        )

    logger.info(
        f"🔬 Input data has {self.indata.shape[0]} cells and {len(self.indata_genes)} "
        "genes"
    )
|
|
156
|
+
|
|
157
|
+
|
|
21
158
|
if __name__ == "__main__":
|
|
22
|
-
|
|
23
|
-
import celltypist
|
|
159
|
+
Classifier.__init__ = classifier_init # type: ignore
|
|
24
160
|
|
|
25
161
|
args = parser.parse_args()
|
|
26
162
|
adata = sc.read_h5ad(args.input)
|
|
@@ -29,8 +165,8 @@ if __name__ == "__main__":
|
|
|
29
165
|
raise ValueError(
|
|
30
166
|
f"Over clustering column '{over_clustering}' not found in AnnData object."
|
|
31
167
|
)
|
|
32
|
-
if
|
|
33
|
-
adata.uns[
|
|
168
|
+
if "neighbors" in adata.uns and "params" in adata.uns["neighbors"]:
|
|
169
|
+
adata.uns["neighbors"]["params"].setdefault("n_neighbors", 15)
|
|
34
170
|
|
|
35
171
|
annotated = celltypist.annotate(
|
|
36
172
|
adata,
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""This file is used to patch scvelo's paga to fix
|
|
2
|
+
https://github.com/theislab/scvelo/issues/1241
|
|
3
|
+
|
|
4
|
+
This is from pull request
|
|
5
|
+
https://github.com/theislab/scvelo/pull/1308
|
|
6
|
+
which has not been merged yet as of 2025-11-07.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from scipy.sparse import csr_matrix
|
|
12
|
+
|
|
13
|
+
from scanpy.tools._paga import PAGA
|
|
14
|
+
import scvelo
|
|
15
|
+
|
|
16
|
+
# This is adapted from https://github.com/theislab/paga
|
|
17
|
+
from scvelo import logging as logg
|
|
18
|
+
from scvelo import settings
|
|
19
|
+
from scvelo.tools.rank_velocity_genes import velocity_clusters
|
|
20
|
+
from scvelo.tools.utils import strings_to_categoricals
|
|
21
|
+
from scvelo.tools.velocity_graph import vals_to_csr
|
|
22
|
+
from scvelo.tools.velocity_pseudotime import velocity_pseudotime
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# TODO: Finish docstrings
|
|
26
|
+
def get_igraph_from_adjacency(adjacency, directed=None):
    """Build an igraph graph from an adjacency matrix.

    One vertex is created per row of ``adjacency`` and one edge per nonzero
    entry; the entry's value is stored in the ``"weight"`` edge attribute.
    """
    import igraph as ig

    row_idx, col_idx = adjacency.nonzero()
    edge_weights = adjacency[row_idx, col_idx]
    if isinstance(edge_weights, np.matrix):
        # Fancy indexing may hand back an np.matrix; flatten it to 1-D.
        edge_weights = edge_weights.A1

    n_nodes = adjacency.shape[0]
    graph = ig.Graph(directed=directed)
    graph.add_vertices(n_nodes)  # this adds adjacency.shape[0] vertices
    graph.add_edges(list(zip(row_idx, col_idx)))
    graph.es["weight"] = edge_weights

    if graph.vcount() != n_nodes:
        logg.warn(
            f"The constructed graph has only {graph.vcount()} nodes. "
            "Your adjacency matrix contained redundant nodes."
        )
    return graph
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# TODO: Add docstrings
|
|
47
|
+
def get_sparse_from_igraph(graph, weight_attr=None):
    """Convert an igraph graph into a square CSR adjacency matrix.

    Every edge gets weight 1 unless ``weight_attr`` names an edge attribute
    to read the weights from. For an undirected graph each edge is mirrored
    so both ``(u, v)`` and ``(v, u)`` are written. A graph without edges
    yields an empty matrix of shape ``(vcount, vcount)``.
    """
    edge_list = graph.get_edgelist()
    edge_weights = (
        [1] * len(edge_list) if weight_attr is None else graph.es[weight_attr]
    )

    if not graph.is_directed():
        edge_list.extend([(dst, src) for src, dst in edge_list])
        edge_weights.extend(edge_weights)

    n_nodes = graph.vcount()
    dims = (n_nodes, n_nodes)
    if not edge_list:
        return csr_matrix(dims)

    src_idx, dst_idx = zip(*edge_list)
    return csr_matrix((edge_weights, (src_idx, dst_idx)), shape=dims)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# TODO: Finish docstrings
|
|
67
|
+
def set_row_csr(csr, rows, value=0):
    """Overwrite the stored entries of the given rows of a CSR matrix in place.

    Only entries already stored in ``csr`` are touched. When ``value`` is 0
    the resulting explicit zeros are dropped from the matrix afterwards.
    """
    row_ptr = csr.indptr
    for r in rows:
        # Stored entries of row r live in data[row_ptr[r]:row_ptr[r + 1]].
        csr.data[row_ptr[r]:row_ptr[r + 1]] = value
    if value == 0:
        csr.eliminate_zeros()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# TODO: Add docstrings
|
|
78
|
+
class PAGA_tree(PAGA):
    """PAGA with velocity-directed transitions between groups.

    Builds on scanpy's ``PAGA`` (constructed with ``model="v1.2"``) and adds
    :meth:`compute_transitions`, which aggregates the cell-level velocity
    graph in ``adata.uns[f"{vkey}_graph"]`` into a group-by-group matrix of
    transition confidences.
    """

    def __init__(
        self,
        adata,
        groups=None,
        vkey=None,
        use_time_prior=None,
        root_key=None,
        end_key=None,
        threshold_root_end_prior=None,
        minimum_spanning_tree=None,
    ):
        """Store the settings read later by :meth:`compute_transitions`.

        ``threshold_root_end_prior`` falls back to 0.9 when ``None``.
        """
        super().__init__(adata=adata, groups=groups, model="v1.2")
        self.groups = groups
        self.vkey = vkey
        self.use_time_prior = use_time_prior
        self.root_key = root_key
        self.end_key = end_key
        self.threshold_root_end_prior = threshold_root_end_prior
        if self.threshold_root_end_prior is None:
            self.threshold_root_end_prior = 0.9
        self.minimum_spanning_tree = minimum_spanning_tree

    @staticmethod
    def _edge_threshold(conf):
        """Return the smallest per-column maximum of the positive entries.

        The result is lowered by 1e-6 and floored at 0.01. Dividing by the
        boolean mask turns non-positive entries into NaN (0/0) so that the
        nan-aware reductions skip them; the floating-point warnings this
        triggers are expected and therefore silenced.
        """
        with np.errstate(divide="ignore", invalid="ignore"):
            positive_only = conf / (conf > 0)
        return max(np.nanmin(np.nanmax(positive_only, axis=0)) - 1e-6, 0.01)

    def compute_transitions(self):
        """Compute group-level transition confidences from the velocity graph.

        Sets ``self.transitions_confidence`` (sparse, transposed so that it
        is indexed ``[target, source]``) and ``self.threshold``.

        Raises:
            ImportError: if ``igraph`` is not installed.
            ValueError: if the velocity graph is missing from ``adata.uns``
                or is not of shape ``(n_obs, n_obs)``.
        """
        try:
            import igraph
        except ImportError as err:
            raise ImportError(
                "To run paga, you need to install `pip install igraph`"
            ) from err
        vkey = f"{self.vkey}_graph"
        if vkey not in self._adata.uns:
            raise ValueError(
                "The passed AnnData needs to have an `uns` annotation "
                f"with key '{vkey}' - a sparse matrix from RNA velocity."
            )
        if self._adata.uns[vkey].shape != (self._adata.n_obs, self._adata.n_obs):
            raise ValueError(
                f"The passed '{vkey}' has shape {self._adata.uns[vkey].shape} "
                f"but should have shape {(self._adata.n_obs, self._adata.n_obs)}"
            )

        clusters = self._adata.obs[self.groups]
        cats = clusters.cat.categories
        # Keep only velocity-graph entries above 0.1 (boolean graph).
        vgraph = self._adata.uns[vkey] > 0.1
        time_prior = self.use_time_prior

        if isinstance(time_prior, str) and time_prior in self._adata.obs.keys():
            # Keep an edge i -> j only if j lies later in pseudotime than i
            # and j's group mean is not more than 0.1 below i's group mean.
            vpt = self._adata.obs[time_prior].values
            vpt_mean = self._adata.obs.groupby(self.groups)[time_prior].mean()
            vpt_means = np.array([vpt_mean[cat] for cat in clusters])
            rows, cols, vals = [], [], []
            for i in range(vgraph.shape[0]):
                indices = vgraph[i].indices
                idx_bool = vpt[i] < vpt[indices]
                idx_bool &= vpt_means[indices] > vpt_means[i] - 0.1
                cols.extend(indices[idx_bool])
                vals.extend(vgraph[i].data[idx_bool])
                rows.extend([i] * np.sum(idx_bool))
            vgraph = vals_to_csr(vals, rows, cols, shape=vgraph.shape)

        lb = self.threshold_root_end_prior  # cells to be consider as terminal states
        if isinstance(self.end_key, str) and self.end_key in self._adata.obs.keys():
            # End-state cells get no outgoing edges.
            set_row_csr(vgraph, rows=np.where(self._adata.obs[self.end_key] > lb)[0])
        if isinstance(self.root_key, str) and self.root_key in self._adata.obs.keys():
            # Root-state cells get no incoming edges.
            vgraph[:, np.where(self._adata.obs[self.root_key] > lb)[0]] = 0
            vgraph.eliminate_zeros()

        # Collapse the cell graph into a group graph, summing edge weights.
        membership = self._adata.obs[self.groups].cat.codes.values
        g = get_igraph_from_adjacency(vgraph, directed=True)
        vc = igraph.VertexClustering(g, membership=membership)
        cg_full = vc.cluster_graph(combine_edges="sum")
        transitions = get_sparse_from_igraph(cg_full, weight_attr="weight")
        # Net flow between groups: negative entries mean the reverse
        # direction dominates and are zeroed below.
        transitions = transitions - transitions.T
        transitions_conf = transitions.copy()
        transitions = transitions.tocoo()
        total_n = self._neighbors.n_neighbors * np.array(vc.sizes())
        for i, j, v in zip(transitions.row, transitions.col, transitions.data):
            reference = np.sqrt(total_n[i] * total_n[j])
            transitions_conf[i, j] = 0 if v < 0 else v / reference
        transitions_conf.eliminate_zeros()

        # remove non-confident direct paths if more confident indirect path is found.
        T = transitions_conf.toarray()
        threshold = self._edge_threshold(T)
        T *= T > threshold
        for i in range(len(T)):
            idx = T[i] > 0
            if np.any(idx):
                indirect = np.clip(T[idx], None, T[i][idx][:, None]).max(0)
                T[i, T[i] < indirect] = 0

        if self.minimum_spanning_tree:
            T_tmp = T.copy()
            T_num = T > 0
            T_sum = np.sum(T_num, 0)
            T_max = np.max(T_tmp)
            # A column's only incoming edge is boosted to the global maximum
            # before the spanning tree is computed on the negated weights.
            for i in range(len(T_tmp)):
                if T_sum[i] == 1:
                    T_tmp[np.where(T_num[:, i])[0][0], i] = T_max
            from scipy.sparse.csgraph import minimum_spanning_tree

            T_tmp = np.abs(minimum_spanning_tree(-T_tmp).toarray()) > 0
            T = T_tmp * T

        transitions_conf = csr_matrix(T)
        self.transitions_confidence = transitions_conf.T

        # set threshold for minimal spanning tree.
        df = pd.DataFrame(T, index=cats, columns=cats)
        self.threshold = self._edge_threshold(df.values)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def paga(
    adata,
    groups=None,
    vkey="velocity",
    use_time_prior=True,
    root_key=None,
    end_key=None,
    threshold_root_end_prior=None,
    minimum_spanning_tree=True,
    copy=False,
):
    """Compute a PAGA graph with velocity-directed edges.

    Partition-based graph abstraction (PAGA, :cite:p:`Wolf19`) quantifies the
    connectivity of partitions (groups, clusters) of the single-cell graph
    and yields a much simpler abstracted graph in which edge weights
    represent confidence in the presence of connections.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An annotated data matrix; ``adata.uns['neighbors']`` must exist.
    groups : key for categorical in `adata.obs`, optional (default: None)
        Grouping to abstract. If None, 'clusters' is used when present in
        `adata.obs`, otherwise 'louvain' when present.
        If 'velocity_clusters' is requested but absent, it is computed.
    vkey : `str` (default: 'velocity')
        Velocity key; the graph is read from ``adata.uns[f'{vkey}_graph']``.
    use_time_prior : `str` or bool, optional (default: True)
        Obs key for pseudo-time values.
        Any truthy non-string selects 'velocity_pseudotime', which is
        computed if it is not yet in `adata.obs`.
    root_key : `str` or bool, optional (default: None)
        Obs key for root states.
    end_key : `str` or bool, optional (default: None)
        Obs key for end states.
    threshold_root_end_prior : `float`, optional (default: None)
        Threshold for root and final state priors; None means 0.9.
        Values above the threshold are treated as terminal.
    minimum_spanning_tree : bool, optional (default: True)
        Whether to prune the transitions to a spanning tree.
    copy : `bool`, optional (default: `False`)
        Work on and return a copy of `adata`.
        Otherwise compute in place and return `None`.

    Returns
    -------
    `adata.uns['paga']` receives 'connectivities', 'connectivities_tree',
    'transitions_confidence', 'threshold' and 'groups';
    `adata.uns[f'{groups}_sizes']` receives the group sizes.
    """
    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )

    if copy:
        adata = adata.copy()
    strings_to_categoricals(adata)

    if groups is None:
        # Pick the first well-known grouping present; stay None otherwise.
        if "clusters" in adata.obs.keys():
            groups = "clusters"
        elif "louvain" in adata.obs.keys():
            groups = "louvain"
    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys():
        velocity_clusters(adata)

    if use_time_prior and not isinstance(use_time_prior, str):
        use_time_prior = "velocity_pseudotime"
        if use_time_prior not in adata.obs.keys():
            velocity_pseudotime(adata, vkey=vkey, root_key=root_key, end_key=end_key)

    # Report only the priors that are actually available in adata.obs.
    priors = []
    for candidate in (use_time_prior, root_key, end_key):
        if candidate in adata.obs.keys():
            priors.append(candidate)
    prior_note = f"using priors: {priors}" if len(priors) > 0 else ""
    logg.info("running PAGA", prior_note, r=True)

    tree = PAGA_tree(
        adata,
        groups,
        vkey=vkey,
        use_time_prior=use_time_prior,
        root_key=root_key,
        end_key=end_key,
        threshold_root_end_prior=threshold_root_end_prior,
        minimum_spanning_tree=minimum_spanning_tree,
    )

    if "paga" not in adata.uns:
        adata.uns["paga"] = {}

    # Undirected connectivities first ...
    tree.compute_connectivities()
    adata.uns["paga"]["connectivities"] = tree.connectivities
    adata.uns["paga"]["connectivities_tree"] = tree.connectivities_tree
    adata.uns[f"{groups}_sizes"] = np.array(tree.ns)

    # ... then the velocity-directed transitions.
    tree.compute_transitions()
    adata.uns["paga"]["transitions_confidence"] = tree.transitions_confidence
    adata.uns["paga"]["threshold"] = tree.threshold
    adata.uns["paga"]["groups"] = groups

    line_end = " " if settings.verbosity > 2 else "\n"
    logg.info(" finished", time=True, end=line_end)
    summary = (
        "added\n" + " 'paga/connectivities', connectivities adjacency (adata.uns)\n"
        " 'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
        " 'paga/transitions_confidence', velocity transitions (adata.uns)"
    )
    logg.hint(summary)

    if copy:
        return adata
    return None
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
scvelo.tl.paga = paga
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Need R and R packages Seurat, SeuratDisk and biopipen.utils.R installed.
|
|
4
4
|
"""
|
|
5
|
+
from __future__ import annotations
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
def convert_seurat_to_anndata(
|
|
@@ -10,7 +11,8 @@ def convert_seurat_to_anndata(
|
|
|
10
11
|
assay=None,
|
|
11
12
|
subset=None,
|
|
12
13
|
rscript="Rscript",
|
|
13
|
-
|
|
14
|
+
return_ident_col=False,
|
|
15
|
+
) -> None | str:
|
|
14
16
|
"""Convert Seurat object to AnnData format.
|
|
15
17
|
|
|
16
18
|
Args:
|
|
@@ -43,6 +45,21 @@ def convert_seurat_to_anndata(
|
|
|
43
45
|
cmd = [rscript, temp_script_path]
|
|
44
46
|
run_command(cmd, fg=True)
|
|
45
47
|
|
|
48
|
+
if return_ident_col:
|
|
49
|
+
ident_col_script = f"""
|
|
50
|
+
library(biopipen.utils)
|
|
51
|
+
|
|
52
|
+
obj <- read_obj("{input_file}")
|
|
53
|
+
cat(GetIdentityColumn(obj))
|
|
54
|
+
"""
|
|
55
|
+
with NamedTemporaryFile(suffix=".R", delete=False) as temp_script:
|
|
56
|
+
temp_script.write(ident_col_script.encode('utf-8'))
|
|
57
|
+
temp_script_path = temp_script.name
|
|
58
|
+
|
|
59
|
+
cmd = [rscript, temp_script_path]
|
|
60
|
+
ident_col = run_command(cmd, stdout="RETURN").strip()
|
|
61
|
+
return ident_col
|
|
62
|
+
|
|
46
63
|
|
|
47
64
|
def convert_anndata_to_seurat(
|
|
48
65
|
input_file,
|
|
@@ -13,6 +13,7 @@ python <- {{envs.python | r}}
|
|
|
13
13
|
within_sample <- {{envs.within_sample | r}}
|
|
14
14
|
args <- {{envs.args | r}}
|
|
15
15
|
chain <- {{envs.chain | r}}
|
|
16
|
+
type <- {{envs.type | r}}
|
|
16
17
|
|
|
17
18
|
setwd(outdir)
|
|
18
19
|
|
|
@@ -22,7 +23,36 @@ log$info("Reading input file ...")
|
|
|
22
23
|
obj <- read_obj(screpfile)
|
|
23
24
|
is_seurat <- inherits(obj, "Seurat")
|
|
24
25
|
|
|
25
|
-
|
|
26
|
+
|
|
27
|
+
# Guess whether the loaded object holds TCR or BCR data from the prefixes of
# its CTgene values. The first value starting with TRA/TRB/TRG/TRD gives
# "TCR", the first starting with IGH/IGK/IGL gives "BCR". Stops with an error
# when no value matches.
get_type <- function() {
    # Type implied by the first recognizable gene string, or NULL if none
    first_match <- function(genes) {
        for (gene in genes) {
            if (grepl("^TR[BGAD]", gene)) {
                return("TCR")
            }
            if (grepl("^IG[HKL]", gene)) {
                return("BCR")
            }
        }
        NULL
    }

    if (is_seurat) {
        found <- first_match(obj@meta.data$CTgene)
        if (!is.null(found)) {
            return(found)
        }
    } else {
        # List input: scan the samples in order and stop at the first hit
        for (sample in names(obj)) {
            found <- first_match(obj[[sample]]$CTgene)
            if (!is.null(found)) {
                return(found)
            }
        }
    }
    stop("Cannot determine the type of the data (TCR or BCR). Please set envs.type to 'TCR' or 'BCR'.")
}
|
|
49
|
+
|
|
50
|
+
if (type == "auto") {
|
|
51
|
+
type <- get_type()
|
|
52
|
+
log$info("Auto-detected data type: {type}")
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
get_cdr3aa_df <- function() {
|
|
26
56
|
if (!is_seurat) {
|
|
27
57
|
out <- NULL
|
|
28
58
|
for (sample in names(obj)) {
|
|
@@ -32,10 +62,12 @@ get_cdr3aa_df = function() {
|
|
|
32
62
|
)
|
|
33
63
|
if (chain == "both") {
|
|
34
64
|
df$CDR3.aa <- obj[[sample]]$CTaa
|
|
35
|
-
} else if (chain == "
|
|
65
|
+
} else if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
|
|
36
66
|
df$CDR3.aa <- obj[[sample]]$cdr3_aa1
|
|
37
|
-
} else if (chain == "
|
|
67
|
+
} else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
|
|
38
68
|
df$CDR3.aa <- obj[[sample]]$cdr3_aa2
|
|
69
|
+
} else {
|
|
70
|
+
stop(paste("Unknown chain:", chain, "for", type))
|
|
39
71
|
}
|
|
40
72
|
out <- rbind(out, df)
|
|
41
73
|
}
|
|
@@ -47,11 +79,13 @@ get_cdr3aa_df = function() {
|
|
|
47
79
|
if (chain == "both") {
|
|
48
80
|
out$CDR3.aa <- out$CTaa
|
|
49
81
|
} else {
|
|
50
|
-
out <- separate(out, CTaa, into = c("
|
|
51
|
-
if (chain == "
|
|
52
|
-
out$CDR3.aa <- out$
|
|
53
|
-
} else if (chain == "
|
|
54
|
-
out$CDR3.aa <- out$
|
|
82
|
+
out <- separate(out, CTaa, into = c("first", "second"), sep = "_")
|
|
83
|
+
if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
|
|
84
|
+
out$CDR3.aa <- out$first
|
|
85
|
+
} else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
|
|
86
|
+
out$CDR3.aa <- out$second
|
|
87
|
+
} else {
|
|
88
|
+
stop(paste("Unknown chain:", chain, "for", type))
|
|
55
89
|
}
|
|
56
90
|
}
|
|
57
91
|
} else {
|
|
@@ -132,21 +166,24 @@ output.clusters_df.to_csv(clustcr_dir + "/clusters.txt", sep="\t", index=False)
|
|
|
132
166
|
|
|
133
167
|
clean_clustcr_output = function(clustcr_outfile) {
|
|
134
168
|
clustcr_out = read.delim2(clustcr_outfile, header=TRUE, row.names = NULL)
|
|
135
|
-
colnames(clustcr_out) = c("CDR3.aa", "
|
|
169
|
+
colnames(clustcr_out) = c("CDR3.aa", "CDR3_Cluster")
|
|
136
170
|
out = left_join(cdr3aa_df, distinct(clustcr_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
|
|
137
171
|
mutate(
|
|
138
|
-
|
|
139
|
-
is.na(
|
|
172
|
+
CDR3_Cluster = if_else(
|
|
173
|
+
is.na(CDR3_Cluster),
|
|
140
174
|
paste0("S_", row_number()),
|
|
141
|
-
paste0("M_", as.character(
|
|
175
|
+
paste0("M_", as.character(CDR3_Cluster))
|
|
142
176
|
)
|
|
143
177
|
)
|
|
144
178
|
|
|
145
179
|
if (within_sample) {
|
|
146
|
-
out <- mutate(out,
|
|
180
|
+
out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
|
|
147
181
|
}
|
|
148
182
|
|
|
149
|
-
|
|
183
|
+
# This join would result in more rows than dplyr can handle
|
|
184
|
+
# left_join(cdr3aa_df, out, by = "CDR3.aa")
|
|
185
|
+
out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
|
|
186
|
+
cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
|
|
150
187
|
}
|
|
151
188
|
|
|
152
189
|
run_clustcr = function() {
|
|
@@ -208,25 +245,28 @@ prepare_input = function() {
|
|
|
208
245
|
|
|
209
246
|
clean_giana_output = function(giana_outfile) {
|
|
210
247
|
# generate an output file with columns:
|
|
211
|
-
# CDR3.aa,
|
|
248
|
+
# CDR3.aa, CDR3_Cluster, V.name, Sample
|
|
212
249
|
# If sequence doesn't exist in the input file,
|
|
213
250
|
# Then a unique cluster id is assigned to it.
|
|
214
251
|
giana_out = read.delim2(giana_outfile, header=FALSE, comment.char = "#", row.names = NULL)[, 1:2, drop=FALSE]
|
|
215
|
-
colnames(giana_out) = c("CDR3.aa", "
|
|
252
|
+
colnames(giana_out) = c("CDR3.aa", "CDR3_Cluster")
|
|
216
253
|
out = left_join(cdr3aa_df, distinct(giana_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
|
|
217
254
|
mutate(
|
|
218
|
-
|
|
219
|
-
is.na(
|
|
255
|
+
CDR3_Cluster = if_else(
|
|
256
|
+
is.na(CDR3_Cluster),
|
|
220
257
|
paste0("S_", row_number()),
|
|
221
|
-
paste0("M_", as.character(
|
|
258
|
+
paste0("M_", as.character(CDR3_Cluster))
|
|
222
259
|
)
|
|
223
260
|
)
|
|
224
261
|
|
|
225
262
|
if (within_sample) {
|
|
226
|
-
out <- mutate(out,
|
|
263
|
+
out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
|
|
227
264
|
}
|
|
228
265
|
|
|
229
|
-
|
|
266
|
+
# This join would result in more rows than dplyr can handle
|
|
267
|
+
# left_join(cdr3aa_df, out, by = "CDR3.aa")
|
|
268
|
+
out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
|
|
269
|
+
cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
|
|
230
270
|
}
|
|
231
271
|
|
|
232
272
|
run_giana = function() {
|
|
@@ -276,12 +316,12 @@ attach_to_obj = function(obj, out) {
|
|
|
276
316
|
rownames(out) <- out$Barcode
|
|
277
317
|
if (is_seurat) {
|
|
278
318
|
# Attach results to Seurat object
|
|
279
|
-
obj@meta.data$
|
|
319
|
+
obj@meta.data$CDR3_Cluster <- out[rownames(obj@meta.data), "CDR3_Cluster"]
|
|
280
320
|
} else {
|
|
281
321
|
# Attach results to the list of data frames
|
|
282
322
|
for (sample in names(obj)) {
|
|
283
323
|
sout <- filter(out, Sample == sample)
|
|
284
|
-
obj[[sample]]$
|
|
324
|
+
obj[[sample]]$CDR3_Cluster <- sout[obj[[sample]]$barcode, "CDR3_Cluster"]
|
|
285
325
|
}
|
|
286
326
|
}
|
|
287
327
|
obj
|