biopipen 0.17.6__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/filters.py +36 -23
- biopipen/ns/delim.py +1 -1
- biopipen/ns/scrna.py +132 -49
- biopipen/ns/tcr.py +62 -0
- biopipen/reports/scrna/MarkersFinder.svelte +30 -8
- biopipen/reports/scrna/SeuratClusterStats.svelte +64 -109
- biopipen/reports/tcr/TESSA.svelte +43 -0
- biopipen/scripts/delim/SampleInfo.R +18 -15
- biopipen/scripts/scrna/MarkersFinder.R +58 -2
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +40 -0
- biopipen/scripts/scrna/SeuratClusterStats-features.R +236 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +105 -0
- biopipen/scripts/scrna/SeuratClusterStats.R +7 -521
- biopipen/scripts/scrna/SeuratClustering.R +20 -1
- biopipen/scripts/tcr/ImmunarchLoading.R +1 -1
- biopipen/scripts/tcr/TESSA.R +198 -0
- biopipen/scripts/tcr/TESSA_source/Atchley_factors.csv +21 -0
- biopipen/scripts/tcr/TESSA_source/BriseisEncoder.py +168 -0
- biopipen/scripts/tcr/TESSA_source/MCMC_control.R +71 -0
- biopipen/scripts/tcr/TESSA_source/TrainedEncoder.h5 +0 -0
- biopipen/scripts/tcr/TESSA_source/fixed_b.csv +31 -0
- biopipen/scripts/tcr/TESSA_source/initialization.R +120 -0
- biopipen/scripts/tcr/TESSA_source/post_analysis.R +124 -0
- biopipen/scripts/tcr/TESSA_source/real_data.R +67 -0
- biopipen/scripts/tcr/TESSA_source/update.R +195 -0
- biopipen/scripts/tcr/TESSA_source/utility.R +18 -0
- {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/METADATA +8 -8
- {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/RECORD +31 -16
- {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/WHEEL +0 -0
- {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# TESSA pipeline script (templated: the {{ ... }} placeholders are filled in
# before the script is run).
#
# Steps:
#   1. Build the TCR table (contig_id, cdr3, [v_gene, j_gene], sample) from the
#      immunarch object and the expression matrix from either a Seurat RDS
#      file or a tab-delimited (optionally gzipped) text matrix.
#   2. Run the BriseisEncoder python script on the TCR table.
#   3. Run the TESSA model (run_tessa() from TESSA_source/real_data.R).
#   4. Save the cluster assignment (into the Seurat object or a text table)
#      and draw the post-analysis plots.
source("{{biopipen_dir}}/utils/misc.R")

library(glue)
library(dplyr)
library(tidyr)
library(immunarch)
library(Seurat)
library(ggplot2)
library(ggprism)

immfile <- {{in.immdata | r}}
exprfile <- {{in.srtobj | r}}
outfile <- {{out.outfile | r}}
python <- {{envs.python | r}}
within_sample <- {{envs.within_sample | r}}
assay <- {{envs.assay | r}}
predefined_b <- {{envs.predefined_b | r}}
max_iter <- {{envs.max_iter | int}}
tessa_srcdir <- "{{biopipen_dir}}/scripts/tcr/TESSA_source"

outdir <- dirname(outfile)
result_dir <- file.path(outdir, "result")
tessa_dir <- file.path(outdir, "tessa")
if (!dir.exists(result_dir)) dir.create(result_dir)
if (!dir.exists(tessa_dir)) dir.create(tessa_dir)

### Start preparing input files for TESSA
# Prepare input files
print("Preparing TCR input file ...")
immdata <- readRDS(immfile)

# V/J information is only forwarded when the first sample carries both columns
has_VJ <- "V.name" %in% colnames(immdata$data[[1]]) &&
    "J.name" %in% colnames(immdata$data[[1]])

# Merge all samples
tcrdata <- do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
    # Clones  Proportion   CDR3.aa                       Barcode
    # 5      4 0.008583691 CAVRDTGNTPLVF;CASSEYSNQPQHF   GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
    # 6      4 0.008583691 CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
    if (has_VJ) {
        cldata <- immdata$data[[i]][, c("Barcode", "CDR3.aa", "V.name", "J.name")]
    } else {
        cldata <- immdata$data[[i]][, c("Barcode", "CDR3.aa")]
    }
    # # A tibble: 4 × 5
    #   Sample                  Patient     Timepoint Tissue
    #   <chr>                   <chr>       <chr>     <chr>
    # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline  PB
    mdata <- as.list(immdata$meta[i, , drop = FALSE])
    # Expose every metadata column of this sample as a local variable so the
    # glue() template below (and `Sample`) can refer to them by name
    for (mname in names(mdata)) {
        assign(mname, mdata[[mname]])
    }

    cldata %>%
        separate_rows(Barcode, sep = ";") %>%
        # Just in case there are duplicated barcodes
        distinct(Barcode, .keep_all = TRUE) %>%
        mutate(Barcode = glue("{{envs.prefix}}{Barcode}"), sample = Sample)
}))
if (has_VJ) {
    tcrdata <- tcrdata %>% dplyr::mutate(
        # Strip a trailing "-<digits>" suffix from the gene names
        v_gene = sub("-\\d+$", "", V.name),
        j_gene = sub("-\\d+$", "", J.name)
    ) %>% dplyr::select(
        contig_id = Barcode,
        cdr3 = CDR3.aa,
        v_gene,
        j_gene,
        sample
    )
} else {
    tcrdata <- tcrdata %>% dplyr::select(
        contig_id = Barcode,
        cdr3 = CDR3.aa,
        sample
    )
}


print("Preparing expression input file ...")
is_seurat <- endsWith(tolower(exprfile), ".rds")
is_gz <- endsWith(tolower(exprfile), ".gz")

if (is_seurat) {
    sobj <- readRDS(exprfile)
    expr <- GetAssayData(sobj, slot = "data", assay = assay)
} else if (is_gz) {
    # check.names = FALSE: keep the cell barcodes in the header verbatim;
    # the default would rewrite e.g. "AAAC-1" to "AAAC.1" and no barcode
    # would match the TCR data any more
    expr <- read.table(
        gzfile(exprfile),
        sep = "\t", header = TRUE, row.names = 1, check.names = FALSE
    )
} else {
    expr <- read.table(
        exprfile,
        sep = "\t", header = TRUE, row.names = 1, check.names = FALSE
    )
}

cell_ids <- intersect(tcrdata$contig_id, colnames(expr))
# Warning about unused cells
unused_tcr_cells <- setdiff(tcrdata$contig_id, cell_ids)
unused_expr_cells <- setdiff(colnames(expr), cell_ids)
if (length(unused_tcr_cells) > 0) {
    warning(glue("{length(unused_tcr_cells)}/{nrow(tcrdata)} TCR cells are not used."), immediate. = TRUE)
}
if (length(unused_expr_cells) > 0) {
    warning(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."), immediate. = TRUE)
}
if (length(cell_ids) == 0) {
    stop("No common cells between TCR and expression data. Are you using the correct prefix?")
}
tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop = FALSE]
# Subset first, then densify: converting the full matrix up front allocates
# a dense genes x all-cells matrix only to throw most of it away.
# as.character() drops the "glue" class so the column index is a plain
# character vector.
used_cells <- as.character(tcrdata$contig_id)
expr <- as.matrix(expr[, used_cells, drop = FALSE])

# Write input files
print("Writing input files ...")
write.table(tcrdata, file.path(tessa_dir, "tcrdata.txt"), sep = ",", quote = FALSE, row.names = FALSE)
write.table(expr, file.path(tessa_dir, "exprdata.txt"), sep = ",", quote = FALSE, row.names = TRUE, col.names = TRUE)

### End preparing input files for TESSA

### Start running TESSA
print("Running TESSA ...")

# The original TESSA uses a python wrapper to run the encoder and tessa model;
# here we run those two steps directly

print("- Running encoder ...")
# File paths are shell-quoted so directories containing spaces or shell
# metacharacters do not break the command. `python` is left as given because
# it may legitimately be a command with arguments.
cmd_encoder <- paste(
    python,
    shQuote(file.path(tessa_srcdir, "BriseisEncoder.py")),
    "-tcr",
    shQuote(file.path(tessa_dir, "tcrdata.txt")),
    "-model",
    shQuote(file.path(tessa_srcdir, "TrainedEncoder.h5")),
    "-embeding_vectors",
    shQuote(file.path(tessa_srcdir, "Atchley_factors.csv")),
    "-output_TCR",
    shQuote(file.path(tessa_dir, "tcr_encoded.txt")),
    "-output_log",
    shQuote(file.path(tessa_dir, "tcr_encoder.log"))
)
if (has_VJ) {
    cmd_encoder <- paste(
        cmd_encoder,
        "-output_VJ",
        shQuote(file.path(tessa_dir, "tcr_vj.txt"))
    )
}
print(paste("- ", cmd_encoder))

rc <- system(cmd_encoder)
if (rc != 0) {
    stop("Error: Failed to run encoder.")
}

print("- Running TESSA model ...")
source(file.path(tessa_srcdir, "real_data.R"))

tessa <- run_tessa(
    tessa_srcdir,
    file.path(tessa_dir, "exprdata.txt"),
    file.path(tessa_dir, "tcr_encoded.txt"),
    file.path(tessa_dir, "tcrdata.txt"),
    result_dir,
    within_sample,
    (if (!predefined_b) NULL else file.path(tessa_srcdir, "fixed_b.csv")),
    max_iter = max_iter
)

# Save TESSA results
print("Saving TESSA results ...")
if (is_seurat) {
    cells <- rownames(sobj@meta.data)
    sobj@meta.data <- sobj@meta.data %>%
        mutate(
            # Cells without a TESSA assignment get NA via match()
            TESSA_Cluster = tessa$meta[
                match(cells, tessa$meta$barcode),
                "cluster_number"
            ]
        ) %>%
        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
    # Re-attach the cell names saved above after the dplyr pipeline
    rownames(sobj@meta.data) <- cells
    saveRDS(sobj, outfile)
} else {
    out <- tessa$meta %>%
        dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
    write.table(out, outfile, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
}

# Post analysis
print("Post analysis ...")
plot_tessa(tessa, result_dir)
plot_Tessa_clusters(tessa, result_dir)

p <- tessa$meta %>%
    dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
    add_count(TESSA_Cluster, name = "TESSA_Cluster_Size") %>%
    ggplot(aes(x = TESSA_Cluster_Size)) +
    geom_histogram(binwidth = 1) +
    theme_prism()

png(file.path(result_dir, "Cluster_size_dist.png"), width = 8, height = 8, units = "in", res = 100)
print(p)
dev.off()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Amino acid,Factor I,Factor II,Factor III,Factor IV,Factor V
|
|
2
|
+
A,-0.591,-1.302,-0.733,1.57,-0.146
|
|
3
|
+
C,-1.343,0.465,-0.862,-1.02,-0.255
|
|
4
|
+
D,1.05,0.302,-3.656,-0.259,-3.242
|
|
5
|
+
E,1.357,-1.453,1.477,0.113,-0.837
|
|
6
|
+
F,-1.006,-0.59,1.891,-0.397,0.412
|
|
7
|
+
G,-0.384,1.652,1.33,1.045,2.064
|
|
8
|
+
H,0.336,-0.417,-1.673,-1.474,-0.078
|
|
9
|
+
I,-1.239,-0.547,2.131,0.393,0.816
|
|
10
|
+
K,1.831,-0.561,0.533,-0.277,1.648
|
|
11
|
+
L,-1.019,-0.987,-1.505,1.266,-0.912
|
|
12
|
+
M,-0.663,-1.524,2.219,-1.005,1.212
|
|
13
|
+
N,0.945,0.828,1.299,-0.169,0.933
|
|
14
|
+
P,0.189,2.081,-1.628,0.421,-1.392
|
|
15
|
+
Q,0.931,-0.179,-3.005,-0.503,-1.853
|
|
16
|
+
R,1.538,-0.055,1.502,0.44,2.897
|
|
17
|
+
S,-0.228,1.399,-4.76,0.67,-2.647
|
|
18
|
+
T,-0.032,0.326,2.213,0.908,1.313
|
|
19
|
+
V,-1.337,-0.279,-0.544,1.242,-1.262
|
|
20
|
+
W,-0.595,0.009,0.672,-2.128,-0.184
|
|
21
|
+
Y,0.26,0.83,3.097,-0.838,1.512
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
##########################################################################################################################
|
|
3
|
+
# BriseisEncoder 1.0.0: A deep learning encoder for CDR3 sequences in the TCR space
|
|
4
|
+
##########################################################################################################################
|
|
5
|
+
# BriseisEncoder is capable of transforming CDR3 peptide sequences from productive
|
|
6
|
+
# TCR-beta chains into 15-digit informative numerical vectors.
|
|
7
|
+
# 01312019 Ze Zhang <Ze.Zhang@UTsouthwestern.edu>
|
|
8
|
+
##########################################################################################################################
|
|
9
|
+
# Dependencies:
|
|
10
|
+
# Python 3.6.4 (preferred)
|
|
11
|
+
# numpy (>=1.15.4), pandas (>=0.23.4), keras (>=2.2.4)
|
|
12
|
+
##########################################################################################################################
|
|
13
|
+
# Parameters:
|
|
14
|
+
# tcr_dir: a .csv file to input the CDR3 sequences, containing at least 2 columns named 'contig_id' and 'cdr3'. Two optional
|
|
15
|
+
# columns 'v_gene' and 'j_gene' denote the V gene and J gene subgroups (TRBV1-30 and TRBJ1/2) of TRB recombinants.
|
|
16
|
+
# 'contig_id' should contain non-repeated ID strings, and 'cdr3' should contain valid TRB CDR3 peptide sequences.
|
|
17
|
+
# model_dir: a .h5 file containing trained encoding models from CDR3 seqs called from bulk-tumor RNA-seq data.
|
|
18
|
+
# aa_dict_dir: a .csv file storing Atchley's amino acid embedding, details in https://www.ncbi.nlm.nih.gov/pubmed/15851683.
|
|
19
|
+
# output_encodedTCR_dir: a .csv file dir to store encoded CDR3 peptide seqs.
|
|
20
|
+
# output_log_dir: a plain text log-file to record any errors or warnings from this script.
|
|
21
|
+
# output_encodedVJ_dir: a .csv file dir to store one-hot encoded TRBV/TRBJ genes.
|
|
22
|
+
##########################################################################################################################
|
|
23
|
+
# Example:
|
|
24
|
+
# python3 BriseisEncoder.py -tcr TestCDR3.csv -model Trained_encoder.h5 -embeding_vectors Atchley_factors.csv \
|
|
25
|
+
# -output_TCR test.csv -output_VJ testVJ.csv -output_log test.log
|
|
26
|
+
##########################################################################################################################
|
|
27
|
+
""" # noqa: E501
|
|
28
|
+
|
|
29
|
+
# Import dependencies
|
|
30
|
+
import sys
|
|
31
|
+
|
|
32
|
+
# sys.path.append('/home2/s421955/.conda/envs/keras_test/site-packages')
|
|
33
|
+
import numpy as np
|
|
34
|
+
import pandas as pd
|
|
35
|
+
import os
|
|
36
|
+
import csv
|
|
37
|
+
from keras.models import load_model
|
|
38
|
+
from keras.models import Model
|
|
39
|
+
|
|
40
|
+
# Read data
# Every option is looked up by its flag in sys.argv; the value is the token
# that follows the flag. A missing required flag raises ValueError.
args = sys.argv
tcr_dir, model_dir, aa_dict_dir, output_encodedTCR_dir = (
    args[args.index(flag) + 1]
    for flag in ("-tcr", "-model", "-embeding_vectors", "-output_TCR")
)
# -output_VJ is optional; the name stays undefined when the flag is absent
if "-output_VJ" in args:
    output_encodedVJ_dir = args[args.index("-output_VJ") + 1]
output_log_dir = args[args.index("-output_log") + 1]
# Number of residue positions each CDR3 sequence is padded/truncated to
encode_dim = 80
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Define functions
|
|
53
|
+
def preprocess(filedir):
    """Load the TCR table and keep only the columns the encoder uses.

    Returns a DataFrame with 'contig_id' and 'cdr3', plus 'v_gene' and/or
    'j_gene' when the input has them, re-indexed 0..n-1. Returns 0 (after
    printing a message) when ``filedir`` does not exist. A message is printed
    when the input contains missing values; the rows are kept.
    """
    print("Processing: " + filedir)
    if not os.path.exists(filedir):
        print("Invalid file path: " + filedir)
        return 0
    raw = pd.read_csv(filedir, header=0)
    if raw.isnull().values.any():
        print("Input data contains NAs.")
    # Required columns first, then whichever optional gene columns are present
    wanted = ["contig_id", "cdr3"]
    wanted += [col for col in ("v_gene", "j_gene") if col in raw.columns]
    cleaned = raw[wanted].copy()
    cleaned.index = range(0, raw.shape[0])
    return cleaned
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def aamapping(peptideSeq, aa_dict, encode_dim):
    """Map a peptide sequence to a stack of per-residue factor vectors.

    Each residue is replaced by ``aa_dict[residue]``; residues missing from
    ``aa_dict`` are reported and encoded as five zeros. Sequences longer than
    ``encode_dim`` are reported and truncated, shorter ones are padded with
    rows of five zeros up to ``encode_dim`` rows.
    """
    if len(peptideSeq) > encode_dim:
        print("Length: " + str(len(peptideSeq)) + " over bound!")
        peptideSeq = peptideSeq[0:encode_dim]
    rows = []
    for residue in peptideSeq:
        if residue in aa_dict:
            rows.append(aa_dict[residue])
        else:
            print("Not proper aaSeqs: " + peptideSeq)
            rows.append(np.zeros(5, dtype="float64"))
    # Zero-pad up to the fixed number of positions
    n_pad = encode_dim - len(peptideSeq)
    rows.extend(np.zeros(5, dtype="float64") for _ in range(n_pad))
    return np.asarray(rows)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def datasetMap(dataset, aa_dict, encode_dim):
    """Encode every CDR3 sequence of ``dataset`` with ``aamapping``.

    Returns a dict keyed by contig id. When a contig id has already been
    seen, the entry is stored under ``"<id>_<row number>"`` instead so that
    no row is dropped.
    """
    encoded = dict()
    for row in range(len(dataset["cdr3"])):
        contig = dataset["contig_id"][row]
        if contig in encoded:
            # Disambiguate a repeated id with the row number
            contig = contig + "_" + str(row)
        encoded[contig] = aamapping(dataset["cdr3"][row], aa_dict, encode_dim)
    return encoded
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def embedVJ(genelist, maplist):
    """One-hot encode genes against ``maplist``.

    Returns an array with one row per gene and one column per entry of
    ``maplist``. A gene that is not in ``maplist`` is reported and encoded
    as an all-zero row.
    """
    width = len(maplist)
    encoded = []
    for gene in genelist:
        one_hot = np.zeros(width)
        try:
            one_hot[maplist.index(gene)] = 1
        except ValueError:
            print("Gene out of bound!" + str(gene))
        encoded.append(one_hot)
    return np.asarray(encoded)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Main functions, data preprocess
# All print() output of this section goes to the log file. stdout is restored
# and the log is closed even when a step fails, so the log is flushed and the
# interpreter is not left with a closed stdout.
log_file = open(output_log_dir, "w")
original_stdout = sys.stdout
sys.stdout = log_file
try:
    print("Mission loading.")
    tcr = preprocess(tcr_dir)
    # preprocess() returns 0 when the input file does not exist; fail here
    # with a clear error instead of an opaque TypeError further down
    if not isinstance(tcr, pd.DataFrame):
        raise FileNotFoundError("TCR input file not found: " + tcr_dir)
    # Amino-acid letter -> numeric factor vector, read from the embedding CSV
    aa_dict = dict()
    with open(aa_dict_dir, "r") as aa:
        aa_reader = csv.reader(aa)
        next(aa_reader, None)  # skip the header row
        for rows in aa_reader:
            aa_name = rows[0]
            aa_factor = rows[1:]
            aa_dict[aa_name] = np.asarray(aa_factor, dtype="float")
    TCR_dict = datasetMap(tcr, aa_dict, encode_dim)
    TCR_contigs = np.stack(list(TCR_dict.values()))
    TCR_contigs = TCR_contigs.reshape(-1, encode_dim, 5, 1)
    # Model prediction
    TCRencoder = load_model(model_dir)
    # The encoding is taken from an intermediate layer of the trained model
    encoder = Model(TCRencoder.input, TCRencoder.layers[-12].output)
    encoded_mat = encoder.predict(TCR_contigs)
    encoded_mat = pd.DataFrame(encoded_mat, index=tcr["contig_id"])
    encoded_mat.to_csv(output_encodedTCR_dir, sep=",")
    has_vj_columns = "v_gene" in tcr.columns or "j_gene" in tcr.columns
    if has_vj_columns and "-output_VJ" not in args:
        # No output path was supplied: skip instead of failing with NameError
        print("V/J genes found but no -output_VJ path was given; skipped.")
    elif has_vj_columns:
        # Columns 0-29 are TRBV1..TRBV30, columns 30-31 are TRBJ1/TRBJ2
        maplist = ["TRBV" + str(i) for i in range(1, 31)]
        maplist.append("TRBJ1")
        maplist.append("TRBJ2")
        if "v_gene" in tcr.columns:
            v_map = embedVJ(tcr["v_gene"], maplist)[:, 0:30]
        else:
            print("V genes are missing!")
            v_map = np.zeros((tcr.shape[0], 30), dtype="float64")
        if "j_gene" in tcr.columns:
            j_map = embedVJ(tcr["j_gene"], maplist)[:, 30:32]
        else:
            print("J genes are missing!")
            j_map = np.zeros((tcr.shape[0], 2), dtype="float64")
        VJ_map = np.concatenate((v_map, j_map), axis=1)
        VJ_map = pd.DataFrame(
            data=VJ_map, index=tcr["contig_id"], columns=maplist
        )
        VJ_map.to_csv(output_encodedVJ_dir, sep=",")
    print("Mission Accomplished.\n")
finally:
    sys.stdout = original_stdout
    log_file.close()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Run the TESSA MCMC sampler.
#
# Arguments:
#   e            expression matrix; its column names are used as cell barcodes
#                by initialize()
#   cdr3         CDR3 sequence of every cell (used as the group ID)
#   t            encoded TCR matrix, one column per cell
#   hyper_priors list providing lambda, xi, g, tau, u, v and
#                initialize_cluster_factor
#   max_iter     number of MCMC iterations
#   sample_id    optional sample label per cell; must have one entry per
#                column of `t`
#   save         optional output directory (created when missing). When NULL,
#                nothing is written to disk.
#   b            optional preset b vector; when given, update_b() is told that
#                b is preset
#   seed_num     seed passed on to initialize()
#
# Returns a list with b, meta, master_dist_e, a, ak, sigma, dt, de, t,
# meta_dedup, phi and K. When `save` is given, the list is also stored as
# <save>/tessa_final.RData and the cell-level meta as <save>/result_meta.csv.
Tessa <- function(e, cdr3, t, hyper_priors, max_iter, sample_id = NULL,
                  save = NULL, b = NULL, seed_num = 123) {
  # initialization
  cat('\nInitialization\n')
  # Warnings are promoted to errors while sampling; the previous setting is
  # restored on every exit path, including errors
  old_opts <- options(warn = 2)
  on.exit(options(old_opts), add = TRUE)
  if (!is.null(save) && !file.exists(save)) {
    dir.create(save)
  }
  if (!is.null(sample_id) && length(sample_id) != ncol(t)) {
    stop("Unmatched sample IDs!")
  }
  preset_b <- !is.null(b)
  print(seed_num)
  initialized <- initialize(t, cdr3, e, hyper_priors, sample_id, b,
                            seed_num = seed_num)
  t <- initialized$t
  meta <- initialized$meta
  a <- initialized$a
  b <- initialized$b
  sigma <- initialized$sigma
  K <- initialized$K
  ak <- initialized$ak
  phi <- initialized$phi
  t0 <- initialized$t0
  master_dist_e <- initialized$master_dist_e
  de <- initialized$de
  dt <- initialized$dt

  lambda <- hyper_priors$lambda
  xi <- hyper_priors$xi
  g <- hyper_priors$g
  tau <- hyper_priors$tau
  u <- hyper_priors$u
  v <- hyper_priors$v

  # some intermediate variables for computational efficiencies
  # meta: cell level; meta_dedup: group level
  meta_dedup <- meta[!duplicated(meta$group_ID), ]
  rownames(meta_dedup) <- meta_dedup$group_ID
  meta_dedup <- meta_dedup[colnames(t), ]
  updated_recent <- c() # for recording acceptance rates of b
  mean_t <- rowMeans(t)

  # MCMC
  for (iter in seq_len(max_iter)) {
    # report
    cat(paste('\nIteration round:', iter, "\n"))
    cat(paste("  # clusters:", K, "\n"))
    cluster_rate <- 1 - sum(sapply(dt, function(x) length(x) == 1)) /
      length(unlist(dt))
    cat(paste("  Clustering rate:", round(cluster_rate, digits = 3), "\n"))

    # Dirichlet process
    results <- DP(meta_dedup, meta, t0, dt, de, ak, phi, t, lambda, g, sigma,
                  b, master_dist_e, K, a, xi, mean_t, sample_id)
    t0 <- results$t0
    phi <- results$phi
    de <- results$de
    dt <- results$dt
    ak <- results$ak
    K <- results$K
    meta_dedup$cluster_number <- results$cluster_number

    # other parameters
    t0 <- update_t0(t, meta_dedup, K, lambda, b, phi, t0)
    ak <- update_ak(K, dt, de, sigma, a, g, phi, ak)
    regression_loss <- vapply(
      seq_len(K),
      function(k) sum((de[[k]] - ak[k] * dt[[k]])^2),
      numeric(1)
    )
    sigma <- update_sigma(u, v, K, phi, g, ak, a, de, dt, regression_loss)
    a <- update_a(tau, K, g, sigma, ak)
    b_result <- update_b(u, v, phi, t, t0, meta_dedup, K, dt, de, ak, sigma,
                         regression_loss, b, preset_b)
    b <- b_result$b
    dt <- b_result$dt
    # Keep only the 200 most recent accept/reject flags
    updated_recent <- c(b_result$updated, updated_recent)
    if (length(updated_recent) > 200) {
      updated_recent <- updated_recent[1:200]
    }
    cat(paste("  Recent b acceptance rate:",
              round(sum(updated_recent) / length(updated_recent), digits = 2),
              "\n"))
  }

  # wrap up
  # Back to the caller's warning level before writing the results
  options(old_opts)
  meta$cluster_number <- meta_dedup[meta$group_ID, "cluster_number"]
  tessa_results <- list(b = b, meta = meta, master_dist_e = master_dist_e,
                        a = a, ak = ak, sigma = sigma, dt = dt, de = de,
                        t = t, meta_dedup = meta_dedup, phi = phi, K = K)
  # Only write when an output directory was requested; with save = NULL the
  # pasted path would be "/tessa_final.RData" in the filesystem root
  if (!is.null(save)) {
    save(tessa_results, file = paste0(save, "/tessa_final.RData"))
    write.csv(meta, file = paste0(save, "/result_meta.csv"),
              quote = FALSE, row.names = FALSE)
  }
  return(tessa_results)
}
|
|
Binary file
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
b
|
|
2
|
+
0.104086935938378
|
|
3
|
+
0.108780692900858
|
|
4
|
+
0.0570899431292494
|
|
5
|
+
0.0223191901369765
|
|
6
|
+
0.0625734040149305
|
|
7
|
+
0.0724461005365275
|
|
8
|
+
0.0570765486616299
|
|
9
|
+
0.0735009762613172
|
|
10
|
+
0.0493248771695967
|
|
11
|
+
0.0644907300641909
|
|
12
|
+
0.0505856876380108
|
|
13
|
+
0.0362054942161984
|
|
14
|
+
0.0430213209688376
|
|
15
|
+
0.0338085552633352
|
|
16
|
+
0.0577835864812993
|
|
17
|
+
0.0360238874526061
|
|
18
|
+
0.0497979998974232
|
|
19
|
+
0.0501341494361042
|
|
20
|
+
0.0974383828919103
|
|
21
|
+
0.0950533850505816
|
|
22
|
+
0.0230444436289012
|
|
23
|
+
0.0830927914655238
|
|
24
|
+
0.0632701497792147
|
|
25
|
+
0.0341922249498399
|
|
26
|
+
0.0188572387690941
|
|
27
|
+
0.0220431669369226
|
|
28
|
+
0.0353984720393545
|
|
29
|
+
0.0327647439111161
|
|
30
|
+
0.0298263126422095
|
|
31
|
+
0.0244369519533193
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Find the center of a cluster: the column of `t_cluster` with the smallest
# b-weighted squared distance to the row-wise mean of all columns.
# Returns the column name of that column (NULL when columns are unnamed).
find_center <- function(t_cluster, b) {
  centroid <- rowMeans(t_cluster)
  weighted_sq_dist <- colSums((t_cluster - centroid)^2 / b)
  names(which.min(weighted_sq_dist))[1]
}
|
|
6
|
+
|
|
7
|
+
# Initialize the assignment of clusters. A good initialization will save a
# lot of MCMC cycles.
#
# Arguments:
#   meta            cell-level data frame with a `group_ID` column naming
#                   columns of `t`
#   t               encoded TCR matrix with one named column per group
#   factor          larger factor, fewer clusters (roughly n / factor clusters)
#   sample_id_dedup NULL, or a vector of sample labels named by group ID; when
#                   given, clustering is done separately within each sample
#
# Returns a character vector with one entry per row of `meta`: the name of
# the center column (see find_center()) of the cluster the cell falls into.
initialize_cluster <- function(meta, t, factor = 4, sample_id_dedup) {
  # Cut the columns of `mat` into `k` hierarchical clusters. hclust() needs
  # at least two observations, so fewer than two columns form one cluster;
  # `k` is clamped to the valid range 1..ncol(mat).
  cut_columns <- function(mat, k) {
    n <- ncol(mat)
    if (n < 2) {
      return(rep(1L, n))
    }
    k <- min(max(k, 1), n)
    cutree(hclust(dist(base::t(mat))), k = k)
  }

  # drop = FALSE keeps a matrix even when a single column is selected
  t0 <- t[, meta$group_ID, drop = FALSE]
  if (is.null(sample_id_dedup)) {
    cluster_n <- cut_columns(t0, round(ncol(t0) / factor))
  } else {
    sample_id <- sample_id_dedup[meta$group_ID]
    samples <- unique(sample_id)
    cluster_n <- rep(0, ncol(t0))
    names(cluster_n) <- meta$group_ID
    for (sample_id_tmp in samples) {
      in_sample <- sample_id == sample_id_tmp
      t0_tmp <- t0[, in_sample, drop = FALSE]
      k0 <- round(ncol(t0_tmp) / (factor * length(samples)))
      # Offset by the largest number used so far so that cluster numbers
      # stay unique across samples
      cluster_n[in_sample] <- cut_columns(t0_tmp, k0) + max(cluster_n)
    }
  }
  cluster <- rep("", length(cluster_n))
  b_dummy <- rep(1, nrow(t))

  for (i in unique(cluster_n)) {
    members <- cluster_n == i
    cluster[members] <- find_center(t0[, members, drop = FALSE], b_dummy)
  }
  cluster
}
|
|
36
|
+
|
|
37
|
+
# Build the starting state of the sampler.
#
# Deduplicates the encoded TCR matrix `t` by CDR3 (combined with the sample
# label when `sample_id` is given), assigns every cell to an initial cluster
# and derives starting values for all model variables.
#
# Returns a list with t, meta, a, b, sigma, K, ak, phi, t0, master_dist_e,
# de and dt.
initialize <- function(t, cdr3, e, hyper_priors, sample_id, b, seed_num = 123) {
  # setting up
  set.seed(as.numeric(seed_num))
  initialize_cluster_factor <- hyper_priors$initialize_cluster_factor

  # preprocess t
  if (is.null(sample_id)) {
    sample_id_dedup <- NULL
  } else {
    # Identical CDR3s from different samples are kept as separate groups
    cdr3 <- paste(cdr3, sample_id, sep = ';')
    sample_id_dedup <- sample_id[!duplicated(cdr3)]
    names(sample_id_dedup) <- cdr3[!duplicated(cdr3)]
  }
  first_seen <- !duplicated(cdr3)
  t <- t[, first_seen, drop = FALSE]
  colnames(t) <- cdr3[first_seen]

  # meta object
  # group ID is just the CDR3 sequence, cluster ID is the CDR3 sequence of
  # the centers
  meta <- data.frame(
    barcode = colnames(e),
    group_ID = cdr3,
    cluster_number = NA,
    stringsAsFactors = FALSE
  )
  meta$cluster_number <- initialize_cluster(
    meta, t, initialize_cluster_factor, sample_id_dedup
  )

  ## initialize random/placeholder variables
  # the initialization of b is tricky
  # note this is not sampling from its distribution, but it is ok for
  # initialization
  if (is.null(b)) {
    b <- apply(t, 1, var) / 10
  }
  K <- length(unique(meta$cluster_number))

  # phi: number of distinct groups per cluster
  groups_per_cluster <- aggregate(
    meta$group_ID,
    by = list(meta$cluster_number),
    function(x) length(unique(x))
  )
  phi <- groups_per_cluster[, 2]
  names(phi) <- groups_per_cluster[, 1]

  # t0: per-cluster mean of the cells' encoded TCRs
  # this is not exactly "right", but not wrong, either
  cluster_means <- aggregate(
    t(t[, meta$group_ID]),
    by = list(meta$cluster_number),
    mean
  )
  t0 <- lapply(
    seq_len(nrow(cluster_means)),
    function(i) as.numeric(unlist(cluster_means[i, -1]))
  )
  names(t0) <- cluster_means$Group.1
  t0 <- t0[names(phi)]

  # de, dt
  # Cell-by-cell expression distances, averaged to group level first along
  # the rows and then along the columns
  cell_dist <- as.matrix(dist(t(e)))
  row_avg <- aggregate(cell_dist, by = list(meta$group_ID), mean)
  rownames(row_avg) <- row_avg[, 1]
  row_avg <- as.matrix(row_avg[, -1])
  group_avg <- aggregate(t(row_avg), by = list(meta$group_ID), mean)
  rownames(group_avg) <- group_avg[, 1]
  master_dist_e <- as.matrix(group_avg[, -1])
  master_dist_e <- master_dist_e[colnames(t), colnames(t)]

  dt <- de <- list()
  coefs <- c()

  for (k in seq_len(K)) {
    center <- names(phi)[k]
    group <- unique(meta$group_ID[meta$cluster_number == center])
    de[[center]] <- named_c(NULL, master_dist_e[center, group], group)
    dt[[center]] <- named_c(
      NULL,
      colSums((t[, group, drop = FALSE] - t[, center])^2 / b / 2),
      group
    )
    # for estimating good values of a and ak
    coefs <- c(coefs, coef(lm(de[[center]] ~ dt[[center]]))[2])
  }

  # sigma, this is also tricky
  sigma <- mean(
    vapply(seq_len(K), function(k) sd(de[[k]]), numeric(1)),
    na.rm = TRUE
  ) * 5
  if (is.na(sigma)) {
    sigma <- 1
  }

  # a, ak
  # note this is not sampling from its distribution
  coefs <- coefs[which(coefs > 0)]
  coef_mean <- mean(coefs)
  coef_sd <- sd(coefs)
  a <- if (is.na(coef_mean)) 1 else coef_mean
  ak <- rnorm(K, a, if (is.na(coef_sd)) 1 else coef_sd)
  names(ak) <- names(phi)

  # return
  return(list(
    t = t, meta = meta, a = a, b = b, sigma = sigma, K = K,
    ak = ak, phi = phi, t0 = t0, master_dist_e = master_dist_e,
    de = de, dt = dt
  ))
}
|