biopipen 0.17.6__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of biopipen has been flagged as possibly problematic.

Files changed (31)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/filters.py +36 -23
  3. biopipen/ns/delim.py +1 -1
  4. biopipen/ns/scrna.py +132 -49
  5. biopipen/ns/tcr.py +62 -0
  6. biopipen/reports/scrna/MarkersFinder.svelte +30 -8
  7. biopipen/reports/scrna/SeuratClusterStats.svelte +64 -109
  8. biopipen/reports/tcr/TESSA.svelte +43 -0
  9. biopipen/scripts/delim/SampleInfo.R +18 -15
  10. biopipen/scripts/scrna/MarkersFinder.R +58 -2
  11. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +40 -0
  12. biopipen/scripts/scrna/SeuratClusterStats-features.R +236 -0
  13. biopipen/scripts/scrna/SeuratClusterStats-stats.R +105 -0
  14. biopipen/scripts/scrna/SeuratClusterStats.R +7 -521
  15. biopipen/scripts/scrna/SeuratClustering.R +20 -1
  16. biopipen/scripts/tcr/ImmunarchLoading.R +1 -1
  17. biopipen/scripts/tcr/TESSA.R +198 -0
  18. biopipen/scripts/tcr/TESSA_source/Atchley_factors.csv +21 -0
  19. biopipen/scripts/tcr/TESSA_source/BriseisEncoder.py +168 -0
  20. biopipen/scripts/tcr/TESSA_source/MCMC_control.R +71 -0
  21. biopipen/scripts/tcr/TESSA_source/TrainedEncoder.h5 +0 -0
  22. biopipen/scripts/tcr/TESSA_source/fixed_b.csv +31 -0
  23. biopipen/scripts/tcr/TESSA_source/initialization.R +120 -0
  24. biopipen/scripts/tcr/TESSA_source/post_analysis.R +124 -0
  25. biopipen/scripts/tcr/TESSA_source/real_data.R +67 -0
  26. biopipen/scripts/tcr/TESSA_source/update.R +195 -0
  27. biopipen/scripts/tcr/TESSA_source/utility.R +18 -0
  28. {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/METADATA +8 -8
  29. {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/RECORD +31 -16
  30. {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/WHEEL +0 -0
  31. {biopipen-0.17.6.dist-info → biopipen-0.18.0.dist-info}/entry_points.txt +0 -0
biopipen/scripts/tcr/TESSA.R
@@ -0,0 +1,198 @@
+ source("{{biopipen_dir}}/utils/misc.R")
+
+ library(glue)
+ library(dplyr)
+ library(tidyr)
+ library(immunarch)
+ library(Seurat)
+ library(ggplot2)
+ library(ggprism)
+
+ immfile <- {{in.immdata | r}}
+ exprfile <- {{in.srtobj | r}}
+ outfile <- {{out.outfile | r}}
+ python <- {{envs.python | r}}
+ within_sample <- {{envs.within_sample | r}}
+ assay <- {{envs.assay | r}}
+ predefined_b <- {{envs.predefined_b | r}}
+ max_iter <- {{envs.max_iter | int}}
+ tessa_srcdir <- "{{biopipen_dir}}/scripts/tcr/TESSA_source"
+
+ outdir <- dirname(outfile)
+ result_dir <- file.path(outdir, "result")
+ tessa_dir <- file.path(outdir, "tessa")
+ if (!dir.exists(result_dir)) dir.create(result_dir)
+ if (!dir.exists(tessa_dir)) dir.create(tessa_dir)
+
+ ### Start preparing input files for TESSA
+ # Prepare input files
+ print("Preparing TCR input file ...")
+ immdata <- readRDS(immfile)
+
+ has_VJ <- "V.name" %in% colnames(immdata$data[[1]]) && "J.name" %in% colnames(immdata$data[[1]])
+ # Merge all samples
+ tcrdata <- do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
+     #   Clones  Proportion  CDR3.aa                       Barcode
+     # 5      4 0.008583691  CAVRDTGNTPLVF;CASSEYSNQPQHF   GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
+     # 6      4 0.008583691  CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
+     if (has_VJ) {
+         cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa", "V.name", "J.name")]
+     } else {
+         cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa")]
+     }
+     # # A tibble: 4 × 5
+     #   Sample                  Patient     Timepoint Tissue
+     #   <chr>                   <chr>       <chr>     <chr>
+     # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline  PB
+     mdata = as.list(immdata$meta[i, , drop=FALSE])
+     for (mname in names(mdata)) {
+         assign(mname, mdata[[mname]])
+     }
+
+     cldata %>%
+         separate_rows(Barcode, sep=";") %>%
+         # Just in case there are duplicated barcodes
+         distinct(Barcode, .keep_all = TRUE) %>%
+         mutate(Barcode = glue("{{envs.prefix}}{Barcode}"), sample = Sample)
+ }))
+ if (has_VJ) {
+     tcrdata <- tcrdata %>% dplyr::mutate(
+         v_gene = sub("-\\d+$", "", V.name),
+         j_gene = sub("-\\d+$", "", J.name)
+     ) %>% dplyr::select(
+         contig_id = Barcode,
+         cdr3 = CDR3.aa,
+         v_gene,
+         j_gene,
+         sample
+     )
+ } else {
+     tcrdata <- tcrdata %>% dplyr::select(
+         contig_id = Barcode,
+         cdr3 = CDR3.aa,
+         sample
+     )
+ }
+
+
+ print("Preparing expression input file ...")
+ is_seurat <- endsWith(tolower(exprfile), ".rds")
+ is_gz <- endsWith(tolower(exprfile), ".gz")
+
+ if (is_seurat) {
+     sobj <- readRDS(exprfile)
+     expr <- GetAssayData(sobj, slot = "data", assay = assay)
+ } else if (is_gz) {
+     expr <- read.table(gzfile(exprfile), sep="\t", header=TRUE, row.names=1)
+ } else {
+     expr <- read.table(exprfile, sep="\t", header=TRUE, row.names=1)
+ }
+
+ cell_ids <- intersect(tcrdata$contig_id, colnames(expr))
+ # Warn about unused cells
+ unused_tcr_cells <- setdiff(tcrdata$contig_id, cell_ids)
+ unused_expr_cells <- setdiff(colnames(expr), cell_ids)
+ if (length(unused_tcr_cells) > 0) {
+     warning(glue("{length(unused_tcr_cells)}/{nrow(tcrdata)} TCR cells are not used."), immediate. = TRUE)
+ }
+ if (length(unused_expr_cells) > 0) {
+     warning(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."), immediate. = TRUE)
+ }
+ if (length(cell_ids) == 0) {
+     stop("No common cells between TCR and expression data. Are you using the correct prefix?")
+ }
+ tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop=FALSE]
+ expr <- as.matrix(expr)[, tcrdata$contig_id, drop=FALSE]
+
+ # Write input files
+ print("Writing input files ...")
+ write.table(tcrdata, file.path(tessa_dir, "tcrdata.txt"), sep=",", quote=FALSE, row.names=FALSE)
+ write.table(expr, file.path(tessa_dir, "exprdata.txt"), sep=",", quote=FALSE, row.names=TRUE, col.names=TRUE)
+
+ ### End preparing input files for TESSA
+
+ ### Start running TESSA
+ print("Running TESSA ...")
+
+ # The original TESSA uses a python wrapper to run the encoder and the tessa model;
+ # here we run those two steps directly
+
+ print("- Running encoder ...")
+ cmd_encoder <- paste(
+     python,
+     file.path(tessa_srcdir, "BriseisEncoder.py"),
+     "-tcr",
+     file.path(tessa_dir, "tcrdata.txt"),
+     "-model",
+     file.path(tessa_srcdir, "TrainedEncoder.h5"),
+     "-embeding_vectors",
+     file.path(tessa_srcdir, "Atchley_factors.csv"),
+     "-output_TCR",
+     file.path(tessa_dir, "tcr_encoded.txt"),
+     "-output_log",
+     file.path(tessa_dir, "tcr_encoder.log")
+ )
+ if (has_VJ) {
+     cmd_encoder <- paste(
+         cmd_encoder,
+         "-output_VJ",
+         file.path(tessa_dir, "tcr_vj.txt")
+     )
+ }
+ print(paste("- ", cmd_encoder))
+
+ rc <- system(cmd_encoder)
+ if (rc != 0) {
+     stop("Error: Failed to run encoder.")
+ }
+
+ print("- Running TESSA model ...")
+ source(file.path(tessa_srcdir, "real_data.R"))
+
+ tessa <- run_tessa(
+     tessa_srcdir,
+     file.path(tessa_dir, "exprdata.txt"),
+     file.path(tessa_dir, "tcr_encoded.txt"),
+     file.path(tessa_dir, "tcrdata.txt"),
+     result_dir,
+     within_sample,
+     (if (!predefined_b) NULL else file.path(tessa_srcdir, "fixed_b.csv")),
+     max_iter = max_iter
+ )
+
+ # Save TESSA results
+ print("Saving TESSA results ...")
+ if (is_seurat) {
+     cells <- rownames(sobj@meta.data)
+     sobj@meta.data <- sobj@meta.data %>%
+         mutate(
+             TESSA_Cluster = tessa$meta[
+                 match(cells, tessa$meta$barcode),
+                 "cluster_number"
+             ]
+         ) %>%
+         add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
+     rownames(sobj@meta.data) <- cells
+     saveRDS(sobj, outfile)
+ } else {
+     out <- tessa$meta %>%
+         dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
+         add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
+     write.table(out, outfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+ }
+
+ # Post analysis
+ print("Post analysis ...")
+ plot_tessa(tessa, result_dir)
+ plot_Tessa_clusters(tessa, result_dir)
+
+ p <- tessa$meta %>%
+     dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
+     add_count(TESSA_Cluster, name = "TESSA_Cluster_Size") %>%
+     ggplot(aes(x = TESSA_Cluster_Size)) +
+     geom_histogram(binwidth = 1) +
+     theme_prism()
+
+ png(file.path(result_dir, "Cluster_size_dist.png"), width=8, height=8, units="in", res=100)
+ print(p)
+ dev.off()
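
The script above writes its two TESSA inputs as comma-separated files whose cell barcodes must agree. The following is a minimal sketch (not part of the package) of the tcrdata.txt layout it produces; the barcodes, gene calls and output path are made up for illustration, while the columns and the write.table() arguments mirror the script.

# Minimal sketch of the tcrdata.txt layout written above; barcodes, CDR3s and
# gene calls are made up for illustration.
toy_tcrdata <- data.frame(
    contig_id = c("Sample1_AAACCTGAGAAACCAT-1", "Sample1_AAACCTGAGAAGGCCT-1"),
    cdr3 = c("CASSEYSNQPQHF", "CASRPEDLRGQPQHF"),
    v_gene = c("TRBV6", "TRBV28"),
    j_gene = c("TRBJ1", "TRBJ2"),
    sample = c("Sample1", "Sample1")
)
# Same arguments as in the script: comma-separated, unquoted, no row names
write.table(toy_tcrdata, "tcrdata.txt", sep = ",", quote = FALSE, row.names = FALSE)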
biopipen/scripts/tcr/TESSA_source/Atchley_factors.csv
@@ -0,0 +1,21 @@
+ Amino acid,Factor I,Factor II,Factor III,Factor IV,Factor V
+ A,-0.591,-1.302,-0.733,1.57,-0.146
+ C,-1.343,0.465,-0.862,-1.02,-0.255
+ D,1.05,0.302,-3.656,-0.259,-3.242
+ E,1.357,-1.453,1.477,0.113,-0.837
+ F,-1.006,-0.59,1.891,-0.397,0.412
+ G,-0.384,1.652,1.33,1.045,2.064
+ H,0.336,-0.417,-1.673,-1.474,-0.078
+ I,-1.239,-0.547,2.131,0.393,0.816
+ K,1.831,-0.561,0.533,-0.277,1.648
+ L,-1.019,-0.987,-1.505,1.266,-0.912
+ M,-0.663,-1.524,2.219,-1.005,1.212
+ N,0.945,0.828,1.299,-0.169,0.933
+ P,0.189,2.081,-1.628,0.421,-1.392
+ Q,0.931,-0.179,-3.005,-0.503,-1.853
+ R,1.538,-0.055,1.502,0.44,2.897
+ S,-0.228,1.399,-4.76,0.67,-2.647
+ T,-0.032,0.326,2.213,0.908,1.313
+ V,-1.337,-0.279,-0.544,1.242,-1.262
+ W,-0.595,0.009,0.672,-2.128,-0.184
+ Y,0.26,0.83,3.097,-0.838,1.512
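
This table is Atchley's five-factor embedding of the 20 amino acids; the encoder below treats it as a simple lookup from residue to a 5-dimensional vector. A minimal R sketch of loading it (assuming the file sits in the working directory), analogous to the aa_dict built in BriseisEncoder.py:

# Load the Atchley factors as a lookup matrix: one row per amino acid,
# five factor columns (check.names = FALSE keeps the "Amino acid" header).
atchley <- read.csv("Atchley_factors.csv", check.names = FALSE)
aa_mat <- as.matrix(atchley[, -1])
rownames(aa_mat) <- atchley[["Amino acid"]]
aa_mat["A", ]  # Factor I..V for alanine: -0.591 -1.302 -0.733 1.57 -0.146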
biopipen/scripts/tcr/TESSA_source/BriseisEncoder.py
@@ -0,0 +1,168 @@
+ """
+ ##########################################################################################################################
+ # BriseisEncoder 1.0.0: A deep learning encoder for CDR3 sequences in the TCR space
+ ##########################################################################################################################
+ # BriseisEncoder is capable of transforming CDR3 peptide sequences from productive
+ # TCR-beta chains into 15-digit informative numerical vectors.
+ # 01312019 Ze Zhang <Ze.Zhang@UTsouthwestern.edu>
+ ##########################################################################################################################
+ # Dependencies:
+ # Python 3.6.4 (preferred)
+ # numpy (>=1.15.4), pandas (>=0.23.4), keras (>=2.2.4)
+ ##########################################################################################################################
+ # Parameters:
+ # tcr_dir: a .csv file with the input CDR3 sequences, containing at least 2 columns named 'contig_id' and 'cdr3'. Two optional
+ #     columns 'v_gene' and 'j_gene' denote the V gene and J gene subgroups (TRBV1-30 and TRBJ1/2) of TRB recombinants.
+ #     'contig_id' should contain non-repeated ID strings, and 'cdr3' should contain valid TRB CDR3 peptide sequences.
+ # model_dir: a .h5 file containing trained encoding models from CDR3 seqs called from bulk-tumor RNA-seq data.
+ # aa_dict_dir: a .csv file storing Atchley's amino acid embedding, details in https://www.ncbi.nlm.nih.gov/pubmed/15851683.
+ # output_encodedTCR_dir: a .csv file path to store encoded CDR3 peptide seqs.
+ # output_log_dir: a plain text log-file to record any errors or warnings from this script.
+ # output_encodedVJ_dir: a .csv file path to store one-hot encoded TRBV/TRBJ genes.
+ ##########################################################################################################################
+ # Example:
+ # python3 BriseisEncoder.py -tcr TestCDR3.csv -model Trained_encoder.h5 -embeding_vectors Atchley_factors.csv \
+ #     -output_TCR test.csv -output_VJ testVJ.csv -output_log test.log
+ ##########################################################################################################################
+ """  # noqa: E501
+
+ # Import dependencies
+ import sys
+
+ # sys.path.append('/home2/s421955/.conda/envs/keras_test/site-packages')
+ import numpy as np
+ import pandas as pd
+ import os
+ import csv
+ from keras.models import load_model
+ from keras.models import Model
+
+ # Read data
+ args = sys.argv
+ tcr_dir = args[args.index("-tcr") + 1]
+ model_dir = args[args.index("-model") + 1]
+ aa_dict_dir = args[args.index("-embeding_vectors") + 1]
+ output_encodedTCR_dir = args[args.index("-output_TCR") + 1]
+ if "-output_VJ" in args:
+     output_encodedVJ_dir = args[args.index("-output_VJ") + 1]
+ output_log_dir = args[args.index("-output_log") + 1]
+ encode_dim = 80
+
+
+ # Define functions
+ def preprocess(filedir):
+     # Preprocess TCR files
+     print("Processing: " + filedir)
+     if not os.path.exists(filedir):
+         print("Invalid file path: " + filedir)
+         return 0
+     dataset = pd.read_csv(filedir, header=0)
+     if dataset.isnull().values.any():
+         print("Input data contains NAs.")
+         # dataset = dataset.dropna()
+     data_new = pd.DataFrame(
+         {
+             "contig_id": dataset["contig_id"],
+             "cdr3": dataset["cdr3"],
+             "v_gene": None,
+             "j_gene": None,
+         }
+     )
+     if "v_gene" in dataset.columns:
+         data_new["v_gene"] = dataset["v_gene"]
+     else:
+         data_new = data_new.drop(columns="v_gene")
+     if "j_gene" in dataset.columns:
+         data_new["j_gene"] = dataset["j_gene"]
+     else:
+         data_new = data_new.drop(columns="j_gene")
+     data_new.index = range(0, dataset.shape[0])
+     return data_new
+
+
+ def aamapping(peptideSeq, aa_dict, encode_dim):
+     # Transform aa seqs to Atchley's factors.
+     peptideArray = []
+     if len(peptideSeq) > encode_dim:
+         print("Length: " + str(len(peptideSeq)) + " over bound!")
+         peptideSeq = peptideSeq[0:encode_dim]
+     for aa_single in peptideSeq:
+         try:
+             peptideArray.append(aa_dict[aa_single])
+         except KeyError:
+             print("Not proper aaSeqs: " + peptideSeq)
+             peptideArray.append(np.zeros(5, dtype="float64"))
+     for i in range(0, encode_dim - len(peptideSeq)):
+         peptideArray.append(np.zeros(5, dtype="float64"))
+     return np.asarray(peptideArray)
+
+
+ def datasetMap(dataset, aa_dict, encode_dim):
+     # Wrapper of aamapping
+     TCR_dict = dict()
+     for i in range(0, len(dataset["cdr3"])):
+         TCR_key = dataset["contig_id"][i]
+         if TCR_key in TCR_dict.keys():
+             TCR_key = TCR_key + "_" + str(i)
+         TCR_dictarray = aamapping(dataset["cdr3"][i], aa_dict, encode_dim)
+         TCR_dict[TCR_key] = TCR_dictarray
+     return TCR_dict
+
+
+ def embedVJ(genelist, maplist):
+     # Embed V/J genes as one-hot vectors over maplist
+     VJ_array = []
+     for gene in genelist:
+         ind = np.zeros(len(maplist))
+         try:
+             find = maplist.index(gene)
+             ind[find] = 1
+             VJ_array.append(ind)
+         except ValueError:
+             print("Gene out of bound!" + str(gene))
+             VJ_array.append(ind)
+             next
+     return np.asarray(VJ_array)
+
+
+ # Main functions, data preprocess
+ log_file = open(output_log_dir, "w")
+ sys.stdout = log_file
+ print("Mission loading.")
+ tcr = preprocess(tcr_dir)
+ aa_dict = dict()
+ with open(aa_dict_dir, "r") as aa:
+     aa_reader = csv.reader(aa)
+     next(aa_reader, None)
+     for rows in aa_reader:
+         aa_name = rows[0]
+         aa_factor = rows[1 : len(rows)]
+         aa_dict[aa_name] = np.asarray(aa_factor, dtype="float")
+ TCR_dict = datasetMap(tcr, aa_dict, encode_dim)
+ TCR_contigs = np.stack(list(TCR_dict.values()))
+ TCR_contigs = TCR_contigs.reshape(-1, encode_dim, 5, 1)
+ # Model prediction
+ TCRencoder = load_model(model_dir)
+ encoder = Model(TCRencoder.input, TCRencoder.layers[-12].output)
+ encoded_mat = encoder.predict(TCR_contigs)
+ encoded_mat = pd.DataFrame(encoded_mat, index=tcr["contig_id"])
+ encoded_mat.to_csv(output_encodedTCR_dir, sep=",")
+ if "v_gene" in tcr.columns or "j_gene" in tcr.columns:
+     maplist = ["TRBV" + str(i) for i in range(1, 31)]
+     maplist.append("TRBJ1")
+     maplist.append("TRBJ2")
+     if "v_gene" in tcr.columns:
+         v_map = embedVJ(tcr["v_gene"], maplist)[:, 0:30]
+     else:
+         print("V genes are missing!")
+         v_map = np.zeros((tcr.shape[0], 30), dtype="float64")
+     if "j_gene" in tcr.columns:
+         j_map = embedVJ(tcr["j_gene"], maplist)[:, 30:32]
+     else:
+         print("J genes are missing!")
+         j_map = np.zeros((tcr.shape[0], 2), dtype="float64")
+     VJ_map = np.concatenate((v_map, j_map), axis=1)
+     VJ_map = pd.DataFrame(data=VJ_map, index=tcr["contig_id"], columns=maplist)
+     VJ_map.to_csv(output_encodedVJ_dir, sep=",")
+ print("Mission Accomplished.\n")
+ log_file.close()
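
The encoder writes its embeddings to the file passed as -output_TCR (tcr_encoded.txt in TESSA.R above), a CSV indexed by contig_id with one numeric column per embedding dimension. A minimal R sketch (paths assumed relative to the tessa directory) of sanity-checking that output before it is handed to run_tessa():

# Read the encoder output back and check it lines up with the TCR table.
encoded <- read.csv("tcr_encoded.txt", row.names = 1, check.names = FALSE)
tcrs <- read.csv("tcrdata.txt")
dim(encoded)                                  # cells x embedding dimensions
all(rownames(encoded) %in% tcrs$contig_id)    # barcodes should match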
biopipen/scripts/tcr/TESSA_source/MCMC_control.R
@@ -0,0 +1,71 @@
+ Tessa<-function(e,cdr3,t,hyper_priors,max_iter,sample_id=NULL,save=NULL,b=NULL,seed_num=123)
+ {
+   # initialization
+   cat('\nInitialization\n')
+   options(warn=2)
+   if ((!is.null(save)) && (!file.exists(save))) {dir.create(save)}
+   if (!is.null(sample_id)){
+     if(length(sample_id)!=ncol(t)){
+       # sample_id must have one entry per cell
+       stop('Unmatched sample IDs!')
+     }
+   }
+   if(!is.null(b)){
+     preset_b=TRUE
+   }else{
+     preset_b=FALSE
+   }
+   print(seed_num)
+   initialized=initialize(t,cdr3,e,hyper_priors,sample_id,b,seed_num=seed_num)
+   t=initialized$t;meta=initialized$meta;a=initialized$a;b=initialized$b;sigma=initialized$sigma;
+   K=initialized$K;ak=initialized$ak;phi=initialized$phi;t0=initialized$t0;
+   master_dist_e=initialized$master_dist_e;de=initialized$de;dt=initialized$dt
+
+   lambda=hyper_priors$lambda;xi=hyper_priors$xi;g=hyper_priors$g
+   tau=hyper_priors$tau;u=hyper_priors$u;v=hyper_priors$v;
+   initialize_cluster_factor=hyper_priors$initialize_cluster_factor
+
+   # some intermediate variables for computational efficiency
+   meta_dedup=meta[!duplicated(meta$group_ID),] # meta: cell level; meta_dedup: group level
+   rownames(meta_dedup)=meta_dedup$group_ID
+   meta_dedup=meta_dedup[colnames(t),]
+   updated_recent=c() # for recording acceptance rates of b
+   mean_t=rowMeans(t)
+
+   # MCMC
+   for (iter in 1:max_iter)
+   {
+     # report
+     cat(paste('\nIteration round:',iter,"\n"))
+     cat(paste(" # clusters:",K,"\n"))
+     cluster_rate=1-sum(sapply(dt,function(x) length(x)==1))/length(unlist(dt))
+     cat(paste(" Clustering rate:",round(cluster_rate,d=3),"\n"))
+
+     # Dirichlet process
+     results=DP(meta_dedup,meta,t0,dt,de,ak,phi,t,lambda,g,sigma,b,master_dist_e,K,
+                a,xi,mean_t,sample_id)
+     t0=results$t0;phi=results$phi;de=results$de;dt=results$dt;ak=results$ak;
+     c=results$c;K=results$K;meta_dedup$cluster_number=results$cluster_number
+
+     # other parameters
+     t0=update_t0(t,meta_dedup,K,lambda,b,phi,t0)
+     ak=update_ak(K,dt,de,sigma,a,g,phi,ak)
+     regression_loss=sapply(1:K,function(k) sum((de[[k]]-ak[k]*dt[[k]])^2))
+     sigma=update_sigma(u,v,K,phi,g,ak,a,de,dt,regression_loss)
+     a=update_a(tau,K,g,sigma,ak)
+     b_result=update_b(u,v,phi,t,t0,meta_dedup,K,dt,de,ak,sigma,regression_loss,b,preset_b)
+     b=b_result$b;dt=b_result$dt
+     updated_recent=c(b_result$updated,updated_recent)
+     if (length(updated_recent)>200) {updated_recent=updated_recent[1:200]}
+     cat(paste(" Recent b acceptance rate:",round(sum(updated_recent)/length(updated_recent),d=2),"\n"))
+   }
+
+   # wrap up
+   options(warn=0)
+   meta$cluster_number=meta_dedup[meta$group_ID,"cluster_number"]
+   tessa_results=list(b=b,meta=meta,master_dist_e=master_dist_e,a=a,ak=ak,sigma=sigma,dt=dt,de=de,
+                      t=t,meta_dedup=meta_dedup,phi=phi,K=K)
+   save(tessa_results,file=paste(save,"/tessa_final.RData",sep=""))
+   write.csv(meta,file=paste(save,"/result_meta.csv",sep=""),quote=F,row.names=FALSE)
+   return(tessa_results)
+ }
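
Tessa() is not called directly by TESSA.R (that goes through run_tessa() in real_data.R, shown elsewhere in this release), but its signature above makes the expected inputs clear: an expression matrix and an encoded-TCR matrix with one column per cell, the per-cell CDR3 sequences, and a list of hyper-priors with the fields read in initialization.R. A minimal sketch with hypothetical objects and placeholder hyper-prior values (not the pipeline defaults):

# Hypothetical inputs: exprdata (genes x cells, barcodes as column names),
# tcr_encoded (encoding dimensions x cells), and cdr3 (one sequence per cell).
hyper_priors <- list(
  lambda = 1, xi = 1, g = 1, tau = 1, u = 1, v = 1,   # placeholder values
  initialize_cluster_factor = 4
)
res <- Tessa(
  e = exprdata,
  cdr3 = cdr3,
  t = tcr_encoded,
  hyper_priors = hyper_priors,
  max_iter = 1000,
  save = "result"    # directory for tessa_final.RData / result_meta.csv
)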
biopipen/scripts/tcr/TESSA_source/fixed_b.csv
@@ -0,0 +1,31 @@
+ b
+ 0.104086935938378
+ 0.108780692900858
+ 0.0570899431292494
+ 0.0223191901369765
+ 0.0625734040149305
+ 0.0724461005365275
+ 0.0570765486616299
+ 0.0735009762613172
+ 0.0493248771695967
+ 0.0644907300641909
+ 0.0505856876380108
+ 0.0362054942161984
+ 0.0430213209688376
+ 0.0338085552633352
+ 0.0577835864812993
+ 0.0360238874526061
+ 0.0497979998974232
+ 0.0501341494361042
+ 0.0974383828919103
+ 0.0950533850505816
+ 0.0230444436289012
+ 0.0830927914655238
+ 0.0632701497792147
+ 0.0341922249498399
+ 0.0188572387690941
+ 0.0220431669369226
+ 0.0353984720393545
+ 0.0327647439111161
+ 0.0298263126422095
+ 0.0244369519533193
biopipen/scripts/tcr/TESSA_source/initialization.R
@@ -0,0 +1,120 @@
+ # find center of each cluster
+ find_center<-function(t_cluster,b)
+ {
+   names(which.min(colSums((t_cluster-rowMeans(t_cluster))^2/b)))[1]
+ }
+
+ # to initialize the assignment of clusters. A good initialization will save a lot of MCMC cycles
+ initialize_cluster<-function(meta,t,factor=4,sample_id_dedup) # larger factor, smaller number of clusters
+ {
+   t0=t[,meta$group_ID]
+   if(is.null(sample_id_dedup)){
+     k0=round(dim(t0)[2]/factor)
+     if(k0<1){k0=1}
+     cluster_n=cutree(hclust(dist(t(t0))),k=k0)
+   }else{
+     sample_id=sample_id_dedup[meta$group_ID]
+     cluster_n=rep(0,ncol(t0))
+     names(cluster_n)=meta$group_ID
+     for(s in 1:length(unique(sample_id))){
+       sample_id_tmp=unique(sample_id)[s]
+       t0_tmp=t0[,sample_id==sample_id_tmp]
+       k0=round(dim(t0_tmp)[2]/(factor*length(unique(sample_id))))
+       if(k0<1){k0=1}
+       cluster_n_tmp=cutree(hclust(dist(t(t0_tmp))),k=k0)
+       cluster_n_tmp=cluster_n_tmp+max(cluster_n)
+       cluster_n[sample_id==sample_id_tmp]=cluster_n_tmp
+     }
+   }
+   cluster=rep("",length(cluster_n))
+   b_dummy=rep(1,dim(t)[1])
+
+   for (i in unique(cluster_n))
+   {cluster[cluster_n==i]=find_center(t0[,cluster_n==i,drop=F],b_dummy)}
+   cluster
+ }
+
+ # initialization
+ initialize<-function(t,cdr3,e,hyper_priors,sample_id,b,seed_num=123)
+ {
+   # setting up
+   set.seed(as.numeric(seed_num))
+   lambda=hyper_priors$lambda;xi=hyper_priors$xi;g=hyper_priors$g
+   tau=hyper_priors$tau;u=hyper_priors$u;v=hyper_priors$v;
+   initialize_cluster_factor=hyper_priors$initialize_cluster_factor
+
+   # preprocess t
+   if(!is.null(sample_id)){
+     cdr3=paste(cdr3,sample_id,sep=';')
+     sample_id_dedup=sample_id[!duplicated(cdr3)]
+     names(sample_id_dedup)=cdr3[!duplicated(cdr3)]
+   }else{
+     sample_id_dedup=NULL
+   }
+   t=t[,!duplicated(cdr3),drop=F]
+   colnames(t)=cdr3[!duplicated(cdr3)]
+
+   # meta object
+   # group ID is just the CDR3 sequence, cluster ID is the CDR3 sequence of the centers
+   meta=data.frame(barcode=colnames(e),group_ID=cdr3,cluster_number=NA,stringsAsFactors = F)
+   meta$cluster_number=initialize_cluster(meta,t,initialize_cluster_factor,sample_id_dedup)
+
+   ## initialize random/placeholder variables
+   # simple ones
+   # the initialization of b is tricky
+   # note this is not sampling from its distribution, but it is ok for initialization
+   if(is.null(b)){
+     b=apply(t,1,var)/10
+   }
+   #b[]=mean(b)
+   K=length(unique(meta$cluster_number))
+
+   # phi
+   phi0=aggregate(meta$group_ID,by=list(meta$cluster_number),function(x) length(unique(x)))
+   phi=phi0[,2]
+   names(phi)=phi0[,1]
+
+   # t0
+   # this is not exactly "right", but not wrong, either
+   t00=aggregate(t(t[,meta$group_ID]),by=list(meta$cluster_number),mean)
+   t0=sapply(1:dim(t00)[1],function(i) as.numeric(unlist(t00[i,-1])),simplify=F)
+   names(t0)=t00$Group.1
+   t0=t0[names(phi)]
+
+   # de, dt
+   tmp=as.matrix(dist(t(e)))
+   tmp=aggregate(tmp,by=list(meta$group_ID),mean)
+   rownames(tmp)=tmp[,1]
+   tmp=as.matrix(tmp[,-1])
+   tmp=aggregate(t(tmp),by=list(meta$group_ID),mean)
+   rownames(tmp)=tmp[,1]
+   master_dist_e=as.matrix(tmp[,-1])
+   master_dist_e=master_dist_e[colnames(t),colnames(t)]
+
+   dt=de=list()
+   coefs=c()
+
+   for(k in 1:K)
+   {
+     c=names(phi)[k]
+     group=unique(meta$group_ID[meta$cluster_number==c])
+     de[[c]]=named_c(NULL,master_dist_e[names(phi)[k],group],group)
+     dt[[c]]=named_c(NULL,colSums((t[,group,drop=F]-t[,c])^2/b/2),group)
+     coefs=c(coefs,coef(lm(de[[c]]~dt[[c]]))[2]) # for estimating good values of a and ak
+   }
+
+   # sigma, this is also tricky
+   sigma=mean(sapply(1:K,function(k) sd(de[[k]])),na.rm=T)*5
+   if (is.na(sigma)) {sigma=1}
+
+   # a, ak
+   # note this is not sampling from its distribution
+   coefs=coefs[which(coefs>0)]
+   a=ifelse(is.na(mean(coefs)),1,mean(coefs))
+   ak=rnorm(K,a,ifelse(is.na(sd(coefs)),1,sd(coefs)))
+   names(ak)=names(phi)
+
+   # return
+   return(list(t=t,meta=meta,a=a,b=b,sigma=sigma,K=K,
+               ak=ak,phi=phi,t0=t0,master_dist_e=master_dist_e,de=de,dt=dt))
+ }
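
The seeding strategy in initialize_cluster() above is: group the encoded TCRs by hierarchical clustering, then label each group by the member closest to the group mean, which becomes the group's representative CDR3. A toy illustration of that center-picking step on random data (not package code):

# Toy data: 30-dimensional encodings for 12 made-up CDR3s
set.seed(1)
t_toy <- matrix(rnorm(30 * 12), nrow = 30,
                dimnames = list(NULL, paste0("cdr3_", 1:12)))
cl <- cutree(hclust(dist(t(t_toy))), k = 3)   # seed 3 clusters, as in initialize_cluster()
b_dummy <- rep(1, nrow(t_toy))
centers <- sapply(unique(cl), function(i) {
  x <- t_toy[, cl == i, drop = FALSE]
  # same criterion as find_center(): member closest to the cluster mean
  names(which.min(colSums((x - rowMeans(x))^2 / b_dummy)))[1]
})
centers  # one representative column name per cluster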