partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.2.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
- partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- packages/ham/bcrham +0 -0
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
bin/lonr.r
ADDED
@@ -0,0 +1,1020 @@
|
|
1
|
+
# paper: http://dx.doi.org/10.1093/nar/gkv1198
|
2
|
+
# docs from supplementary info:
|
3
|
+
|
4
|
+
# issues
|
5
|
+
# - can't pass in your own tree -- have to let it run either phylip dnapars or phylip neighbor. Related things:
|
6
|
+
# - it assumes the internal nodes (that phylip adds) have phylip-convention names, i.e. str(some_integer), and if they're not, it breaks
|
7
|
+
# - if you pass it (i.e. if you observe) internal nodes, phylip shunts them to zero-lengh branches hanging off of newly-inferred internal nodes. So you then have to try to figure out which nodes/leaves you can collapse
|
8
|
+
# - it counts offspring all the way down the tree, i.e. ancestors get credit for fitness improvements in their offspring
|
9
|
+
# - this is "handled" by the "affected by descendents" flag, but that doesn't solve the problem, it just alerts you to lineages where there's more likely to be a problem
|
10
|
+
# - it would be much better to use an lbi-style weighting that decreases contributions as they get further away
|
11
|
+
# - ignores pairs of sibling edges in which both edges have mutations, i.e. it can throw out a large fraction of mutations in cases where most branches have mutations
|
12
|
+
# - ignores branches with no siblings
|
13
|
+
# - ignores branches with multiple mutations
|
14
|
+
# notes
|
15
|
+
# - number of offspring for a node is set as the number of edges in the entire subtree below that node
|
16
|
+
|
17
|
+
## Computing LONR scores in lineage trees
|
18
|
+
|
19
|
+
## General
|
20
|
+
## The program is written in R.
|
21
|
+
## This analysis is divided into two parts:
|
22
|
+
## First, a lineage tree is built with sequences provided in and aligned FASTA format.
|
23
|
+
## There is an option to provide an outgroup sequence in order to detect more accurately the root sequence.
|
24
|
+
## For less than 100 sequences, the Maximum parsimony method is used (dnapars Phylip program).
|
25
|
+
## Otherwise, the Neighbor-joining method is used (neighbor Phylip program).
|
26
|
+
## The Neighbor-joining method uses a distance matrix as input, which is computed using the dnadist Phylip program.
|
27
|
+
## The internal sequences are reconstructed using the Fitch algorithm.
|
28
|
+
## The tree is then divided into subtrees in order to ignore mutations occurring in very distant sequences.
|
29
|
+
## Thus the user can specify the cut-off (in nucleotides) to cut long branches.
|
30
|
+
## The default is 10 mutations.
|
31
|
+
## The following LONR analysis is then performed for each subtree separately.
|
32
|
+
## Second, mutations are detected between each pair of father and son sequences (at the nucleotide level).
|
33
|
+
## The LONR score is calculated, for each mutation, as the log of the ratio between the sub-tree size of the son in which the mutation occurred and the sub-tree size of the son in which no mutation occurred at this position.
|
34
|
+
|
35
|
+
## How to run
|
36
|
+
## This program uses the Phylip-3.695 package. It is expecting it to be in the same folder as this script. The path to the program can be modified in the beginning of the script.
|
37
|
+
## The package Biostrings and seqinr need to be installed.
|
38
|
+
|
39
|
+
## Main function - compute.LONR()
|
40
|
+
|
41
|
+
## Input
|
42
|
+
## 1) in.dir – aligned FASTA file input directory
|
43
|
+
## 2) out.dir – output directory for tree files and LONR results
|
44
|
+
## 3) file – FASTA file name
|
45
|
+
## 4) outgroup (optional) – outgroup sequence name. If
|
46
|
+
## 5) cutoff - branches with more mutations than specified nu this cutoff will be trimmed, resulting in several subtrees (default - 10 )
|
47
|
+
|
48
|
+
## Output
|
49
|
+
## The following directories are created in out.dir:
|
50
|
+
## Tree directory –
|
51
|
+
## 1) filename.fasta -modified FASTA file if gaps were removed.
|
52
|
+
## 2) filename.dis – (only for neighbor joining trees) contains distance matrix created by the dnadist program in the Phylip package.
|
53
|
+
## 3) filename.phy – original sequences in alignment format.
|
54
|
+
## 4) filename_edges.tab – tab-delimited file containing the tree edges, their weights and the distance in nucleotides between each two nodes.
|
55
|
+
## 5) filename_names.tab – tab-delimited file matching between original sequence names and temporary names.
|
56
|
+
## 6) filename_out.txt – Phylip output tree file
|
57
|
+
## 7) filename_tree.txt – Phylip output tree file in Newick format
|
58
|
+
|
59
|
+
## LONR directory –
|
60
|
+
## 1) filename_lonr.csv – comma-separated file containing lonr results as followed:
|
61
|
+
## i. mutation – mutated nucleotides (e.g. AC means A C)
|
62
|
+
## ii. LONR – log(size of mutated sub-tree/size of un-mutated sub-tree)
|
63
|
+
## iii. mutation.type – (S) Silent or (Replacement)
|
64
|
+
## iv. position – in nucleotides, according to output FASTA file (see above)
|
65
|
+
## v. father – sequence name from which occurred mutation
|
66
|
+
## vi. son – sequence name to which occurred mutation
|
67
|
+
## vii. flag – True if LONR score an internal node is affected by mutations occurring in its descendants
|
68
|
+
## Clarifications
|
69
|
+
## 1) The input sequences must already be aligned. If there are gaps, the consensus sequence of all the sequences is computed, and positions containing gaps are removed from all the sequences. The output FASTA file is created after this step.
|
70
|
+
## 2) The dnapars program may create trees which are not completely binary. Thus, internal nodes which have more than two children are fixed by created an identical new child, which will receive the extra children.
|
71
|
+
## This also happens is case no outgroup is provided and the root has three children (both in dnapars and neighbor programs).
|
72
|
+
## 3) If the nucleotide sequence lengths are not a multiple of three and mutations occurred in the last nucleotides, these mutations are ignored since they cannot be typed (not a full codon).
|
73
|
+
|
74
|
+
# imports
|
75
|
+
suppressPackageStartupMessages(require(seqinr, quietly=TRUE, warn.conflicts=FALSE))
|
76
|
+
suppressPackageStartupMessages(require(Biostrings, quietly=TRUE, warn.conflicts=FALSE))
|
77
|
+
|
78
|
+
MIN.SEQ <- 3
|
79
|
+
MAX.SEQ <- 7000
|
80
|
+
|
81
|
+
# ----------------------------------------------------------------------------------------
|
82
|
+
# empty ones are all set by the calling python script after importing this code
|
83
|
+
G.phy.infname = 'inseqs.phy'
|
84
|
+
G.dnadist.fname = 'dnadist.dis'
|
85
|
+
G.phy.outfname = ''
|
86
|
+
G.phy.treefname = ''
|
87
|
+
G.outseqs.fname = ''
|
88
|
+
G.edgefname = ''
|
89
|
+
G.names.fname = ''
|
90
|
+
G.lonrfname = ''
|
91
|
+
|
92
|
+
# Create fasta file from a data.frame of sequences and headers
|
93
|
+
# out.dir - output directory
|
94
|
+
# nameSeq.df - data.frame of headers and sequences
|
95
|
+
write.FASTA <- function(out.dir, nameSeq.df) {
|
96
|
+
sequences <- nameSeq.df$seq
|
97
|
+
names(sequences) <- nameSeq.df$head
|
98
|
+
writeXStringSet(DNAStringSet(sequences), file=paste0(out.dir, G.outseqs.fname), width=1000)
|
99
|
+
}
|
100
|
+
|
101
|
+
# Change sequence names
|
102
|
+
# fasta.df - data frame of sequences and headers
|
103
|
+
# outgroup - outgroup sequence name
|
104
|
+
# Returns: data frame of sequences and headers with extra column of new headers
|
105
|
+
change.names <- function(fasta.df, outgroup = NULL){
|
106
|
+
|
107
|
+
if (!is.null(outgroup)){
|
108
|
+
outgroup.ind <- which(fasta.df$head == outgroup)
|
109
|
+
n.seq <- nrow(fasta.df)
|
110
|
+
# change sequence names to 'L_#'
|
111
|
+
ind <- 1:n.seq
|
112
|
+
ind <- ind[-outgroup.ind]
|
113
|
+
|
114
|
+
# original method, fails if outgroup is the last sequence in the file (it complains that the first line only adds n.seq-1 entries, whereas the rest of the data frame has n.seq entries [or something like that]):
|
115
|
+
## fasta.df$head2[ind] <- sapply(1:(n.seq-1), function(x) { paste0('L', x)})
|
116
|
+
## fasta.df$head2[outgroup.ind] <- fasta.df$head[outgroup.ind]
|
117
|
+
|
118
|
+
# this is weird and complicated so that it duplicates the original method ^
|
119
|
+
fasta.df$head2[1:n.seq] <- sapply(1:n.seq, function(x) { if(x == outgroup.ind) return(fasta.df$head[outgroup.ind]); if(x > outgroup.ind) x = x-1; return(paste0('L', x))})
|
120
|
+
}else{
|
121
|
+
n.seq <- nrow(fasta.df)
|
122
|
+
# change sequence names to 'L_#'
|
123
|
+
ind <- 1:n.seq
|
124
|
+
fasta.df$head2[ind] <- sapply(1:n.seq, function(x) { paste0('L', x)})
|
125
|
+
}
|
126
|
+
return(fasta.df)
|
127
|
+
}
|
128
|
+
|
129
|
+
# Create phylip input alignment file
|
130
|
+
# Arguments: fasta.df - data frame of sequences and headers
|
131
|
+
fasta2phylip <- function(fasta.df, workdir){
|
132
|
+
|
133
|
+
phy.df <- rbind(data.frame(head=sprintf('%-9s', nrow(fasta.df)), seq=nchar(fasta.df$seq[1]), stringsAsFactors=F),
|
134
|
+
data.frame(head=sprintf('%-9s', fasta.df$head2), seq=fasta.df$seq, stringsAsFactors=F))
|
135
|
+
|
136
|
+
write.table(phy.df, file=paste0(workdir, G.phy.infname), quote=F, sep=' ', col.names=F, row.names=F)
|
137
|
+
}
|
138
|
+
|
139
|
+
# Run dnapars (maximum parsimony)
|
140
|
+
# outgroup.ind - outgroup index in file (optional)
|
141
|
+
run.dnapars <- function(workdir, outgroup.ind){
|
142
|
+
print(' building trees with maximum parsimony')
|
143
|
+
|
144
|
+
curr.dir <- getwd()
|
145
|
+
setwd(workdir)
|
146
|
+
|
147
|
+
# options from dnapars program
|
148
|
+
# index of outgroup - last in data frame
|
149
|
+
if (length(outgroup.ind) != 0)
|
150
|
+
pars.options <- c(G.phy.infname, 'O', outgroup.ind, 'V', '1', '5', '.', '2', 'Y')
|
151
|
+
else
|
152
|
+
pars.options <- c(G.phy.infname, 'V', '1', '5', '.', '2', 'Y')
|
153
|
+
|
154
|
+
# run dnapars
|
155
|
+
system2('phylip', args='dnapars', input=pars.options, stdout=NULL) # not sure why this was just calling 'dnapars'? my phylip install doesn't seem to have put dnapars in my path, but 'phylip dnapars' seems to work ok
|
156
|
+
|
157
|
+
# move .phy file and output tree files
|
158
|
+
file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.phy.outfname))
|
159
|
+
file.rename(from=paste0(workdir, 'outtree'), to=paste0(workdir, G.phy.treefname))
|
160
|
+
|
161
|
+
setwd(curr.dir)
|
162
|
+
}
|
163
|
+
|
164
|
+
# Order sequence from root to leaves
|
165
|
+
#
|
166
|
+
# Arguments: root - root sequence name
|
167
|
+
# nameSeq.df - data frame with headers and sequences
|
168
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
169
|
+
# outgroup - outgroup sequence name (optional)
|
170
|
+
#
|
171
|
+
# Returns: nameSeq.df - data frame with headers and sequences, ordered by distance from root
|
172
|
+
order.nameSeq <- function(root, nameSeq.df, edge.df, outgroup){
|
173
|
+
|
174
|
+
n.seq <- nrow(nameSeq.df)
|
175
|
+
nameSeq.df2 <- nameSeq.df[which(nameSeq.df[,'head']==root),] # root
|
176
|
+
for(i in 1:n.seq){ #1 - root
|
177
|
+
sons <- edge.df[which(edge.df[,'from']==nameSeq.df2[i,'head']),2]
|
178
|
+
if(length(sons)>0){
|
179
|
+
outgroup.ind <- which(sons==outgroup)
|
180
|
+
if(length(outgroup.ind)!=0) { sons <- sons[-outgroup.ind] }
|
181
|
+
for(j in 1:length(sons))
|
182
|
+
nameSeq.df2 <- rbind(nameSeq.df2, nameSeq.df[which(nameSeq.df[,'head']==sons[j]),])
|
183
|
+
}
|
184
|
+
}
|
185
|
+
if (!is.null(outgroup)) # last - outgroup
|
186
|
+
nameSeq.df2 <- rbind(nameSeq.df[which(nameSeq.df[,'head']==outgroup),], nameSeq.df2) # add outgroup first
|
187
|
+
row.names(nameSeq.df2)<-NULL
|
188
|
+
|
189
|
+
return(nameSeq.df2)
|
190
|
+
}
|
191
|
+
|
192
|
+
|
193
|
+
# Parse dnapars output tree file, get internal sequences and tree structure
|
194
|
+
#
|
195
|
+
# Returns: list of:
|
196
|
+
# nameSeq.df - data frame with headers and sequences, ordered by distance from root
|
197
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
198
|
+
parse.dnapars <- function(workdir, outgroup = NULL){
|
199
|
+
|
200
|
+
# read output tree file
|
201
|
+
out.tree <- scan(paste0(workdir, G.phy.outfname), what='character',sep='\n',
|
202
|
+
blank.lines.skip=F, strip.white=F)
|
203
|
+
# check if tree was build
|
204
|
+
if (any(grepl('-1 trees in all found', out.tree))) { return(NULL) }
|
205
|
+
|
206
|
+
# get internal sequences
|
207
|
+
seq.start <- min(grep('From\\s+To\\s+Any Steps\\?\\s+State at upper node', out.tree, perl=T, fixed=F))
|
208
|
+
seq.empty <- grep('^\\s*$', out.tree[seq.start:length(out.tree)], perl=T, fixed=F)
|
209
|
+
seq.len <- seq.empty[min(which(seq.empty[-1] == (seq.empty[-length(seq.empty)] + 1)))]
|
210
|
+
seq.block <- paste(out.tree[(seq.start + 2):(seq.start + seq.len - 2)], collapse='\n')
|
211
|
+
seq.df <- read.table(textConnection(seq.block), as.is=T, fill=T, blank.lines.skip=T, row.names = NULL, header=F, stringsAsFactors=F)
|
212
|
+
|
213
|
+
# fix root lines and remove empty rows
|
214
|
+
fix.row <- which(seq.df[,3]!="yes" & seq.df[,3]!="no" & seq.df[,3]!="maybe")
|
215
|
+
if (!is.null(outgroup))
|
216
|
+
seq.df[fix.row, ] <- cbind(seq.df[fix.row, 1], seq.df[fix.row, 2],'no', seq.df[fix.row, 3:6], stringsAsFactors=F)
|
217
|
+
else
|
218
|
+
seq.df[fix.row, ] <- cbind(seq.df[fix.row, 1], seq.df[fix.row, 1],'no', seq.df[fix.row, 2:5], stringsAsFactors=F)
|
219
|
+
|
220
|
+
# save full sequences as a data frame
|
221
|
+
names <- unique(seq.df[, 2])
|
222
|
+
seq <- sapply(names, function(x) { paste(t(as.matrix(seq.df[seq.df[, 2] == x, -c(1:3)])), collapse='') })
|
223
|
+
nameSeq.df <- data.frame(head=names, seq=seq, stringsAsFactors=F, row.names=NULL)
|
224
|
+
|
225
|
+
# get tree structure
|
226
|
+
edge.start <- min(grep('between\\s+and\\s+length', out.tree, perl=T, fixed=F))
|
227
|
+
edge.len <- min(grep('^\\s*$', out.tree[edge.start:length(out.tree)], perl=T, fixed=F))
|
228
|
+
edge.block <- paste(out.tree[(edge.start + 2):(edge.start + edge.len - 2)], collapse='\n')
|
229
|
+
edge.df <- read.table(textConnection(edge.block), col.names=c('from', 'to', 'weight'), as.is=T, stringsAsFactors=F)
|
230
|
+
|
231
|
+
# order sequences by distance from root
|
232
|
+
root <- unique(edge.df$from)[!(unique(edge.df$from) %in% edge.df$to)]
|
233
|
+
|
234
|
+
#root <- seq.df[which(seq.df[,1]=='root')[1],2]
|
235
|
+
nameSeq.df <- order.nameSeq(root, nameSeq.df, edge.df, outgroup)
|
236
|
+
|
237
|
+
return(list(nameSeq.df, edge.df))
|
238
|
+
}
|
239
|
+
|
240
|
+
# Fix ambiguous nucleotides in internal sequences, according to IUPAC code
|
241
|
+
#
|
242
|
+
# Arguments: nameSeq.df - data frame with headers and sequences, order by distance from root
|
243
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
244
|
+
# outgroup - outgroup sequence name (optional)
|
245
|
+
#
|
246
|
+
# Returns: - nameSeq.df - data frame with fixed sequences
|
247
|
+
fix.internal <- function(nameSeq.df, edge.df, outgroup = NULL){
|
248
|
+
|
249
|
+
nucleotides <- c('A', 'C','G','T','-')
|
250
|
+
|
251
|
+
# from leaves to root (except outgroup)
|
252
|
+
# fix only internal nodes!
|
253
|
+
if (!is.null(outgroup))
|
254
|
+
intern.seq <- rev(setdiff(2:nrow(nameSeq.df),grep('^L', nameSeq.df[,'head'], perl=T, fixed=F)))
|
255
|
+
else
|
256
|
+
intern.seq <- rev(setdiff(1:nrow(nameSeq.df),grep('^L', nameSeq.df[,'head'], perl=T, fixed=F)))
|
257
|
+
|
258
|
+
for(i in intern.seq){
|
259
|
+
# get ambiguous nucleotides
|
260
|
+
curr.seq <- nameSeq.df[i,'seq']
|
261
|
+
amb.pos <- setdiff(1:nchar(curr.seq),unlist(lapply(nucleotides, function(x) {gregexpr(pattern =x,curr.seq)})))
|
262
|
+
if(length(amb.pos) > 0){
|
263
|
+
# get son sequences
|
264
|
+
sons.seq <- sapply(edge.df[which(edge.df[,'from']==nameSeq.df[i,'head']),'to'], function(x) {nameSeq.df[which(nameSeq.df[,'head']==x),'seq']})
|
265
|
+
# do not fix outgroup sequence
|
266
|
+
if (!is.null(outgroup)){
|
267
|
+
outgroup.ind <- which(names(sons.seq)==outgroup)
|
268
|
+
if(length(outgroup.ind)!=0)
|
269
|
+
sons.seq <- sons.seq[-outgroup.ind]
|
270
|
+
}
|
271
|
+
|
272
|
+
for(j in amb.pos){
|
273
|
+
amb.nuc <- substr(curr.seq,j,j)
|
274
|
+
sons.nuc <- substr(sons.seq,j,j)
|
275
|
+
# if deletion
|
276
|
+
if(amb.nuc == 'O')
|
277
|
+
curr.seq <- paste0(substr(curr.seq,1, j-1), '-', substr(curr.seq,j+1, nchar(curr.seq)))
|
278
|
+
else{
|
279
|
+
# check if at least one of the sons has one match for ambiguous letter
|
280
|
+
if(is.element(amb.nuc, c('?', 'X')))
|
281
|
+
matches <- which(is.element(sons.nuc, nucleotides))
|
282
|
+
else
|
283
|
+
matches <- which(is.element(sons.nuc, toupper(amb(amb.nuc, forceToLower = TRUE))))
|
284
|
+
if(length(matches)==1) # if only one son had a match
|
285
|
+
curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[matches], substr(curr.seq,j+1, nchar(curr.seq)))
|
286
|
+
else if(length(matches)==0){ # choose randomly from son nucleotides
|
287
|
+
rand <- sample(1:length(sons.nuc), 1, replace=T)
|
288
|
+
curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[rand], substr(curr.seq,j+1, nchar(curr.seq)))
|
289
|
+
}else{ # choose randomly from son nucleotides that matched
|
290
|
+
rand <- sample(1:length(matches), 1, replace=T)
|
291
|
+
curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[matches[rand]], substr(curr.seq,j+1, nchar(curr.seq)))
|
292
|
+
}
|
293
|
+
}
|
294
|
+
}
|
295
|
+
nameSeq.df[i,'seq'] <- curr.seq
|
296
|
+
}
|
297
|
+
}
|
298
|
+
|
299
|
+
return(nameSeq.df)
|
300
|
+
}
|
301
|
+
|
302
|
+
# Compute distance matrix (Kimura model) for neighbor joining using dnadist (Phylip package)
|
303
|
+
#
|
304
|
+
# Arguments:
|
305
|
+
# workdir - phylip directory
|
306
|
+
run.dnadist <- function(workdir){
|
307
|
+
|
308
|
+
# options from dnadist program
|
309
|
+
dnadist.options <- c(G.phy.infname, 'D', '2', 'Y')
|
310
|
+
|
311
|
+
# run dnadist
|
312
|
+
system2('phylip', args='dnadist', input=dnadist.options,, stdout=NULL)
|
313
|
+
|
314
|
+
# move .phy file and output tree files
|
315
|
+
file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.dnadist.fname))
|
316
|
+
}
|
317
|
+
|
318
|
+
|
319
|
+
# Run neighbor (Neighbor joining)
|
320
|
+
# outgroup.ind - outgroup index in file (optional)
|
321
|
+
run.neighbor <- function(workdir, outgroup.ind){
|
322
|
+
print(' building trees with neighbor joining')
|
323
|
+
curr.dir <- getwd()
|
324
|
+
setwd(workdir)
|
325
|
+
|
326
|
+
# run dnadist to create distance matrix
|
327
|
+
run.dnadist(workdir)
|
328
|
+
|
329
|
+
# options for neighbor program
|
330
|
+
# index of outgroup - last data frame
|
331
|
+
if (length(outgroup.ind) != 0 )
|
332
|
+
neigh.options <- c(G.dnadist.fname, 'O', outgroup.ind, '2', 'Y')
|
333
|
+
else
|
334
|
+
neigh.options <- c(G.dnadist.fname, '2', 'Y')
|
335
|
+
|
336
|
+
# run neighbor
|
337
|
+
system2('phylip', args='neighbor', input=neigh.options, stdout=NULL)
|
338
|
+
|
339
|
+
# move .phy, .dis and output tree files
|
340
|
+
file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.phy.outfname))
|
341
|
+
file.rename(from=paste0(workdir, 'outtree'), to=paste0(workdir, G.phy.treefname))
|
342
|
+
file.remove(paste0(workdir, G.dnadist.fname))
|
343
|
+
|
344
|
+
setwd(curr.dir)
|
345
|
+
}
|
346
|
+
|
347
|
+
# Parse neighbor output tree file, get internal sequences and tree structure
|
348
|
+
# fasta.df - data frame of input sequences and headers
|
349
|
+
# outgroup - outgroup sequence name (optional)
|
350
|
+
# Returns: - list of:
|
351
|
+
# nameSeq.df - data frame with headers and sequences
|
352
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
353
|
+
parse.neighbor <- function(workdir, fasta.df, outgroup = NULL){
|
354
|
+
|
355
|
+
# read output tree file
|
356
|
+
out.tree <- scan(paste0(workdir, G.phy.outfname), what='character',sep='\n',
|
357
|
+
blank.lines.skip=F, strip.white=F)
|
358
|
+
|
359
|
+
# check if tree was build
|
360
|
+
if (any(grepl('-1 trees in all found', out.tree))) { return(NULL) }
|
361
|
+
|
362
|
+
# get tree structure
|
363
|
+
edge.start <- min(grep('Between\\s+And\\s+Length', out.tree, perl=T, fixed=F))
|
364
|
+
edge.len <- min(grep('^\\s*$', out.tree[edge.start:length(out.tree)], perl=T, fixed=F))
|
365
|
+
edge.block <- paste(out.tree[(edge.start + 2):(edge.start + edge.len - 2)], collapse='\n')
|
366
|
+
edge.df <- read.table(textConnection(edge.block), col.names=c('from', 'to', 'weight'), as.is=T)
|
367
|
+
|
368
|
+
# create nameSeq.df from input sequence only (for now)
|
369
|
+
nameSeq.df <- data.frame(head = fasta.df[,'head2'], seq = fasta.df[,'seq'], stringsAsFactors=F)
|
370
|
+
|
371
|
+
return(list(nameSeq.df, edge.df))
|
372
|
+
}
|
373
|
+
|
374
|
+
# Traverse tree in postorder (step 2 in Fitch's algorithm)
|
375
|
+
#
|
376
|
+
# Arguments: father/ sons - father and its 2 son names
|
377
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
378
|
+
# nameSeq.list - list of sequences with headers
|
379
|
+
#
|
380
|
+
# Returns: modified nameSeq.list - list of sequences with headers
|
381
|
+
traverse.up <- function(father, sons, edge.df, nameSeq.list){
|
382
|
+
|
383
|
+
sons.seq <- sapply(sons,function(x) NULL)
|
384
|
+
|
385
|
+
# check if each son sequence a leaf or is already reconstructed
|
386
|
+
for(i in sons){
|
387
|
+
if(!is.element(i, names(nameSeq.list)))
|
388
|
+
nameSeq.list <- traverse.up(i, edge.df[edge.df[,'from']==i,'to'], edge.df, nameSeq.list)
|
389
|
+
sons.seq[i] <- nameSeq.list[i]
|
390
|
+
}
|
391
|
+
seq.len <- length(sons.seq[[1]])
|
392
|
+
|
393
|
+
father.seq <- character(seq.len)
|
394
|
+
for(i in 1:seq.len){ # for each position in sequence
|
395
|
+
curr.nuc <- sapply(sons.seq, function(x) x[[i]])
|
396
|
+
nuc <- Reduce(intersect, strsplit(curr.nuc,""))
|
397
|
+
if(length(nuc)>0) # save intersection
|
398
|
+
father.seq[i] <- paste(nuc, collapse='')
|
399
|
+
else # save union
|
400
|
+
father.seq[i] <- paste( Reduce(union, strsplit(curr.nuc,'')),collapse="")
|
401
|
+
}
|
402
|
+
nameSeq.list[[father]]<-father.seq # save modified sequence
|
403
|
+
|
404
|
+
return(nameSeq.list)
|
405
|
+
}
|
406
|
+
|
407
|
+
# Recursively traverse tree in preorder (step 1 in Fitch's algorithm)
|
408
|
+
#
|
409
|
+
# Arguments: father/ son - father and son name
|
410
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
411
|
+
# nameSeq.list - list of sequences with headers
|
412
|
+
# outgroup - outgroup sequence name (optional)
|
413
|
+
#
|
414
|
+
# Returns: modified nameSeq.list - list of sequences with headers
|
415
|
+
traverse.down <- function(father, son, edge.df, nameSeq.list, outgroup = NULL){
|
416
|
+
|
417
|
+
son.seq <- nameSeq.list[[son]]
|
418
|
+
seq.len <- length(son.seq)
|
419
|
+
|
420
|
+
if(is.null(father)){ # root
|
421
|
+
for(i in 1:seq.len){
|
422
|
+
if(nchar(son.seq[i])>1){ # more than 1 option in intersection - choose randomly
|
423
|
+
#rand <- floor(runif(1, min=1, max=nchar(son.seq[i])+1))
|
424
|
+
rand <- sample(1:nchar(son.seq[i]), 1, replace=T)
|
425
|
+
son.seq[i] <- substr(son.seq[i],rand, rand)
|
426
|
+
}
|
427
|
+
}
|
428
|
+
}else{
|
429
|
+
father.seq <- nameSeq.list[[father]]
|
430
|
+
for(i in 1:seq.len){
|
431
|
+
if(nchar(son.seq[i])>1){
|
432
|
+
nuc <- Reduce(intersect, strsplit(c(father.seq[i],son.seq[i]),''))
|
433
|
+
# if only one nucleotide in intersection - keep it
|
434
|
+
if(length(nuc)==0){ # no intersection - choose randomly from son's options
|
435
|
+
rand <- sample(1:nchar(son.seq[i]), 1, replace=T)
|
436
|
+
son.seq[i] <- substr(son.seq[i],rand, rand)
|
437
|
+
}else # if intersection is not empty, but son has more than 1 option
|
438
|
+
son.seq[i] <- nuc
|
439
|
+
}
|
440
|
+
}
|
441
|
+
}
|
442
|
+
nameSeq.list[[son]] <- son.seq # save modified sequence
|
443
|
+
|
444
|
+
# recursive call for each son's sons
|
445
|
+
sons <- edge.df[edge.df[,'from']==son,'to']
|
446
|
+
if(length(sons) > 0){
|
447
|
+
# remove outgroup from root's children
|
448
|
+
if (!is.null(outgroup)){
|
449
|
+
outgroup.ind <- which(sons == outgroup)
|
450
|
+
if(length(outgroup.ind)!=0) { sons <- sons[-outgroup.ind] }
|
451
|
+
}
|
452
|
+
nameSeq.list <- traverse.down(son, sons[1], edge.df, nameSeq.list, outgroup)
|
453
|
+
nameSeq.list <- traverse.down(son, sons[2], edge.df, nameSeq.list, outgroup)
|
454
|
+
}
|
455
|
+
return(nameSeq.list)
|
456
|
+
}
|
457
|
+
|
458
|
+
# Reconstruct internal sequences with Fitch algorithm (for neighbor trees)
|
459
|
+
#
|
460
|
+
# Arguments: nameSeq.df - data frame with headers and input sequences only
|
461
|
+
# edge.df - data frame with columns - parent(from), child(to), distance(weight)
|
462
|
+
# outgroup - outgroup sequence name (optional)
|
463
|
+
#
|
464
|
+
# Returns: nameSeq.df - data frame with headers and all sequences, ordered by distance from root
|
465
|
+
get.internal <- function(nameSeq.df, edge.df, outgroup = NULL){
|
466
|
+
|
467
|
+
# find root
|
468
|
+
root <- as.character(unique(edge.df[!(edge.df[, 1] %in% edge.df[, 'to']), 1]))
|
469
|
+
root.sons <- edge.df[edge.df[,'from']==root,'to']
|
470
|
+
|
471
|
+
# remove outgroup from root's children
|
472
|
+
if (!is.null(outgroup)){
|
473
|
+
outgroup.ind <- which(root.sons == outgroup)
|
474
|
+
if(length(outgroup.ind)!=0) { root.sons <- root.sons[-outgroup.ind] }
|
475
|
+
}
|
476
|
+
|
477
|
+
# convert sequences to lists
|
478
|
+
nameSeq.list <- sapply(nameSeq.df[,'seq'], function(x) {strsplit(x, "", fixed=FALSE)})
|
479
|
+
names(nameSeq.list) <- nameSeq.df[,'head']
|
480
|
+
|
481
|
+
# Step 1 - preorder on tree - get intersection, otherwise - get union
|
482
|
+
nameSeq.list <- traverse.up(root, root.sons, edge.df, nameSeq.list)
|
483
|
+
|
484
|
+
# Step 2 - postorder on tree - get intersection, otherwise choose randomly
|
485
|
+
nameSeq.list <- traverse.down(NULL, root, edge.df, nameSeq.list, outgroup)
|
486
|
+
|
487
|
+
# convert sequence list to dataframe
|
488
|
+
nameSeq.list2 <-lapply(nameSeq.list,function(x) {paste(x, collapse="")})
|
489
|
+
nameSeq.df <-data.frame(head = names(nameSeq.list2), stringsAsFactors=F)
|
490
|
+
nameSeq.df$seq <- unlist(nameSeq.list2)
|
491
|
+
|
492
|
+
# order sequences in nameSeq.df by distance from root
|
493
|
+
nameSeq.df <- order.nameSeq(root, nameSeq.df, edge.df, outgroup)
|
494
|
+
|
495
|
+
return(nameSeq.df)
|
496
|
+
}
|
497
|
+
|
498
|
+
# Match old input sequence names with new in final output sequence data frame
|
499
|
+
#
|
500
|
+
# Arguments: nameSeq.df - data frame with headers and all sequences
|
501
|
+
# fasta.df - data frame with new and old headers and input sequences only
|
502
|
+
#
|
503
|
+
# Returns: nameSeq.df - data frame with headers and sequences, with old and new names
|
504
|
+
match.names <- function(fasta.df, nameSeq.df){
|
505
|
+
|
506
|
+
nameSeq.df$head2 <- rep("-", nrow(nameSeq.df))
|
507
|
+
for(i in 1:nrow(fasta.df))
|
508
|
+
nameSeq.df[which(fasta.df[i,'head2']==nameSeq.df[,'head']),'head2'] <- fasta.df[i,'head']
|
509
|
+
|
510
|
+
return(nameSeq.df)
|
511
|
+
}
|
512
|
+
|
513
|
+
# Compute edge lengths (distance in nucleotides)
|
514
|
+
#
|
515
|
+
# Arguments: nameSeq.df - data frame with headers and sequences
|
516
|
+
# edge.df - data frame with columns - parent(from), child(to), edge weight (weight)
|
517
|
+
#
|
518
|
+
# Returns: edge.df - data frame with columns - parent(from), child(to), edge weight (weight) , edge length (distance in nt)
|
519
|
+
compute.edge <- function(nameSeq.df, edge.df){
|
520
|
+
|
521
|
+
n.edge <- nrow(edge.df)
|
522
|
+
edge.df$distance <- rep(0, n.edge)
|
523
|
+
for(i in 1:n.edge){
|
524
|
+
from.seq <- unlist(strsplit(nameSeq.df[nameSeq.df[,'head']==as.character(edge.df[i,'from']),'seq'],''))
|
525
|
+
to.seq <- unlist(strsplit(nameSeq.df[nameSeq.df[,'head']==as.character(edge.df[i,'to']),'seq'],''))
|
526
|
+
edge.df[i,'distance'] <- length(which(from.seq!=to.seq))
|
527
|
+
}
|
528
|
+
|
529
|
+
return(edge.df)
|
530
|
+
}
|
531
|
+
|
532
|
+
# Convert parsimony tree into binary tree
|
533
|
+
#
|
534
|
+
# Arguments: nameSeq.df - data frame with headers and sequences
|
535
|
+
# edge.df - data frame with columns - parent(from), child(to), edge weight (weight)
|
536
|
+
# outgroup name (default - NULL)
|
537
|
+
#
|
538
|
+
# Returns: modified edge.df and nameSeq.df
|
539
|
+
convert.to.binary <- function(nameSeq.df, edge.df, outgroup = NULL){
|
540
|
+
|
541
|
+
# find nodes with more than 2 sons
|
542
|
+
son.count <- table(edge.df$from)
|
543
|
+
nodes <- names(son.count)[son.count == 3]
|
544
|
+
|
545
|
+
# get name of last internal node created by program
|
546
|
+
ind <- max(unique(edge.df$from)) + 1
|
547
|
+
|
548
|
+
for (i in nodes){
|
549
|
+
sons <- edge.df[edge.df$from==i, 'to']
|
550
|
+
if (!is.null(outgroup)){
|
551
|
+
if (outgroup %in% sons) # if outgroup is one of the root's sons - ok
|
552
|
+
next
|
553
|
+
}
|
554
|
+
|
555
|
+
# create new node named with current ind and attach 2 sons
|
556
|
+
edge.df[edge.df$to==sons[1], 'from'] <- ind
|
557
|
+
edge.df[edge.df$to==sons[2], 'from'] <- ind
|
558
|
+
# add new node has son of current node
|
559
|
+
edge.df <- rbind(edge.df, data.frame(from = i, to = ind, weight=0))
|
560
|
+
# add new node sequence - same as current node
|
561
|
+
nameSeq.df <- rbind(nameSeq.df, data.frame(head=ind, seq=nameSeq.df[nameSeq.df$head==i, 'seq']))
|
562
|
+
ind <- ind + 1
|
563
|
+
}
|
564
|
+
|
565
|
+
return(list(nameSeq.df, edge.df))
|
566
|
+
|
567
|
+
}
|
568
|
+
|
569
|
+
# Build phylogenetic tree with maximum parsimony or neighbor joining
|
570
|
+
# Arguments:
|
571
|
+
# fasta.df - data frame with input sequences and headers
|
572
|
+
# outgroup - outgroup sequence name (optional)
|
573
|
+
# Returns: nameSeq.list - list of sequences with headers
|
574
|
+
# edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
|
575
|
+
build.trees <- function(method, fasta.df, workdir, outgroup=NULL, existing.edgefile=NULL, existing.node.seqfile=NULL){
|
576
|
+
|
577
|
+
dir.create(workdir, recursive = T, showWarnings = FALSE)
|
578
|
+
|
579
|
+
# change input sequence name to shorter ones
|
580
|
+
fasta.df <- change.names(fasta.df, outgroup) # CHECK CP NUM!!!!!!!
|
581
|
+
|
582
|
+
# convert sequences to alignment format for phylip programs
|
583
|
+
fasta2phylip(fasta.df, workdir)
|
584
|
+
|
585
|
+
# find outgroup index in file
|
586
|
+
outgroup.ind <- which(fasta.df$head == outgroup)
|
587
|
+
|
588
|
+
if(!is.null(existing.edgefile)) {
|
589
|
+
stop('doesn\'t work yet (and probably a waste of time to fix it -- reimplement this stupid shit in python)')
|
590
|
+
edge.df <- read.csv(file=existing.edgefile, header=TRUE, sep=',', colClasses=c('character', 'character', 'numeric'))
|
591
|
+
nameSeq.df <- read.csv(file=existing.node.seqfile, header=TRUE, sep=',', colClasses=c('character', 'character'))
|
592
|
+
for(irow in 1:length(nameSeq.df$head)) {
|
593
|
+
input.name <- nameSeq.df$head[[irow]]
|
594
|
+
for(jrow in 1:length(fasta.df$head)) {
|
595
|
+
if(fasta.df$head[[jrow]] == input.name) { # see if this sequence is in the df with name translations (i.e. it's an input/leaf sequence)
|
596
|
+
new.name <- fasta.df$head2[[jrow]]
|
597
|
+
nameSeq.df$head[[irow]] <- new.name # and if it is, replace the input name with the new name
|
598
|
+
if(toString(nameSeq.df$seq[[irow]]) != toupper(toString(fasta.df$seq[[jrow]]))) # make sure the sequences are the same (I don't really understand the need for toString(), but it prints some extra "Level" stuff if I don't have it, so...)
|
599
|
+
stop(paste0('sequences don\'t match for ', input.name, ':\n', toString(nameSeq.df$seq[[irow]]), '\n', toupper(toString(fasta.df$seq[[jrow]]))))
|
600
|
+
for(krow in 1:length(edge.df$from)) { # also have to change the names in the edge df
|
601
|
+
if(edge.df$from[[krow]] == input.name) # this doesn't actually happen, since they only rename the leaves
|
602
|
+
edge.df$from[[krow]] <- new.name
|
603
|
+
if(edge.df$to[[krow]] == input.name)
|
604
|
+
edge.df$to[[krow]] <- new.name
|
605
|
+
}
|
606
|
+
break
|
607
|
+
}
|
608
|
+
}
|
609
|
+
}
|
610
|
+
## # we need to convert the internal node names to integers here, since convert.to.binary() tries sorts them, which fails if they're strings. But this doesn't freaking work, because the class of the column is set the 'character' (above). fuck you, R
|
611
|
+
## print(nameSeq.df$head)
|
612
|
+
## ftmp <- function(x) { if(substr(x, 1, 1) == 'L') return(x); if(x == outgroup) return(x); return(strtoi(x)); }
|
613
|
+
## edge.df$from <- sapply(edge.df$from, ftmp)
|
614
|
+
## edge.df$to <- sapply(edge.df$to, ftmp)
|
615
|
+
## nameSeq.df$head <- sapply(nameSeq.df$head, ftmp)
|
616
|
+
## print(nameSeq.df$head)
|
617
|
+
## stop('x')
|
618
|
+
} else if(method == 'dnapars') { # maximum parsimony
|
619
|
+
run.dnapars(workdir, outgroup.ind)
|
620
|
+
|
621
|
+
# parse dnapars output
|
622
|
+
tmp <- parse.dnapars(workdir, outgroup)
|
623
|
+
if(is.null(tmp)) { return(F) } # if tree building failed
|
624
|
+
nameSeq.df <- tmp[[1]]
|
625
|
+
edge.df <- tmp[[2]]
|
626
|
+
nameSeq.df <- fix.internal(nameSeq.df, edge.df, outgroup)
|
627
|
+
} else if(method == 'neighbor') { # neighbor joining
|
628
|
+
run.neighbor(workdir, outgroup.ind)
|
629
|
+
|
630
|
+
# parse neighbor output
|
631
|
+
tmp <- parse.neighbor(workdir, fasta.df, outgroup)
|
632
|
+
if(is.null(tmp)) { return(F) } # if tree building failed
|
633
|
+
nameSeq.df <- tmp[[1]]
|
634
|
+
edge.df <- tmp[[2]]
|
635
|
+
nameSeq.df <- get.internal(nameSeq.df, edge.df, outgroup)
|
636
|
+
} else {
|
637
|
+
stop(paste0('unhandled method: ', method))
|
638
|
+
}
|
639
|
+
|
640
|
+
# convert tree to binary
|
641
|
+
tmp <- convert.to.binary(nameSeq.df, edge.df, outgroup)
|
642
|
+
nameSeq.df <- tmp[[1]]
|
643
|
+
edge.df <- tmp[[2]]
|
644
|
+
|
645
|
+
# compute edge lengths
|
646
|
+
edge.df <- compute.edge(nameSeq.df, edge.df)
|
647
|
+
|
648
|
+
# retrieve old names for input sequences
|
649
|
+
nameSeq.df <- match.names(fasta.df, nameSeq.df)
|
650
|
+
|
651
|
+
# save output files
|
652
|
+
# save sequence as FASTA file
|
653
|
+
write.FASTA(workdir, nameSeq.df)
|
654
|
+
# save Fome/To/distances table (edges)
|
655
|
+
write.table(edge.df, file=paste0(workdir, G.edgefname), quote=F, sep='\t', col.names=T, row.names=F)
|
656
|
+
# save old and new sequence names
|
657
|
+
write.table(nameSeq.df[,c('head','head2')], file=paste0(workdir, G.names.fname), quote=F, sep='\t', col.names=T, row.names=F)
|
658
|
+
|
659
|
+
return(list(nameSeq.df, edge.df))
|
660
|
+
}
|
661
|
+
|
662
|
+
# Calculate subtree sizes for each internal node (recursive function)
|
663
|
+
#
|
664
|
+
# Arguments: node - current node for which the number of offsprings
|
665
|
+
# n.offspring - data frame with size of keft and right subtrees for each sequence
|
666
|
+
# edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
|
667
|
+
#
|
668
|
+
# Returns: updated n.offspring
|
669
|
+
get.number.offsprings <- function(node, n.offspring, edge.df){
|
670
|
+
|
671
|
+
# post order on tree
|
672
|
+
sons <- edge.df[edge.df[, 'from'] == node, 2]
|
673
|
+
|
674
|
+
# if node is a leaf - set number of offsprings to zero
|
675
|
+
if (length(sons) == 0)
|
676
|
+
return (n.offspring)
|
677
|
+
|
678
|
+
# sort sons alphabetically - so that the first is always the left son
|
679
|
+
sons <- sort(sons)
|
680
|
+
|
681
|
+
node.ind <- which(n.offspring$node == node)
|
682
|
+
|
683
|
+
# left subtree
|
684
|
+
n.offspring <- get.number.offsprings(sons[1], n.offspring, edge.df)
|
685
|
+
n.offspring[node.ind, 'left'] <- n.offspring[n.offspring$node == sons[1], 'left'] +
|
686
|
+
n.offspring[n.offspring$node == sons[1], 'right'] + 1
|
687
|
+
|
688
|
+
if (length(sons) == 1 ) # the other son was cut
|
689
|
+
n.offspring[node.ind, 'right'] <- 0
|
690
|
+
else{
|
691
|
+
# right subtree
|
692
|
+
n.offspring <- get.number.offsprings(sons[2], n.offspring, edge.df)
|
693
|
+
n.offspring[node.ind, 'right'] <- n.offspring[n.offspring$node == sons[2], 'left'] +
|
694
|
+
n.offspring[n.offspring$node == sons[2], 'right'] + 1
|
695
|
+
}
|
696
|
+
|
697
|
+
return (n.offspring)
|
698
|
+
}
|
699
|
+
|
700
|
+
# Analyze mutations between a pair of father-son and compute LONR scores for each mutation
|
701
|
+
#
|
702
|
+
# Arguments: n.offspring - data frame with size of left and right subtrees for each sequence
|
703
|
+
# father - father name
|
704
|
+
# father.char - father nucleotide sequence split into characters
|
705
|
+
# father.aa.char - father amino acid sequence split into characters
|
706
|
+
# son - son name
|
707
|
+
# son.side - left or right son of father (alphabetically)
|
708
|
+
# son.char - son nucleotide sequence split into characters
|
709
|
+
# son.aa.char - son amino acid sequence split into characters
|
710
|
+
# mut.pos - mutation positions (nucleotides)
|
711
|
+
# mutations - mutation table
|
712
|
+
# mutations.ind - index of last mutation inserted in mutations table
|
713
|
+
#
|
714
|
+
# Returns: list of
|
715
|
+
# - updated mutations and mutations.ind
|
716
|
+
analyze.mutations <- function(n.offspring, father, father.char, father.aa.char, son, son.side, son.char, son.aa.char, mut.pos, mutations, mutations.ind){
|
717
|
+
|
718
|
+
flag <- F
|
719
|
+
father.ind <- which(n.offspring$node==father)
|
720
|
+
|
721
|
+
# if at least one son was trimmed - do not analyze mutations
|
722
|
+
if (n.offspring[father.ind,'left']==0 | n.offspring[father.ind,'right']==0)
|
723
|
+
return(list(mutations, mutations.ind))
|
724
|
+
|
725
|
+
# compute LONR score
|
726
|
+
# log of subtree size where mutation occurred divided by subtree size where no mutation occurred
|
727
|
+
if (son.side == 'left')
|
728
|
+
lonr <- log(n.offspring[father.ind,'left']/n.offspring[father.ind,'right'])
|
729
|
+
else
|
730
|
+
lonr <- log(n.offspring[father.ind,'right']/n.offspring[father.ind,'left'])
|
731
|
+
|
732
|
+
# get son's subtree sizes
|
733
|
+
son.left <- n.offspring[n.offspring$node==son, 'left']
|
734
|
+
son.right <- n.offspring[n.offspring$node==son, 'right']
|
735
|
+
if (son.left != 0 & son.right != 0){
|
736
|
+
if (son.left > 2*son.right | 2*son.left < son.right){
|
737
|
+
if (son.side == 'left')
|
738
|
+
new.lonr <- log((2*min(son.left, son.right))/n.offspring[father.ind,'right'])
|
739
|
+
else
|
740
|
+
new.lonr <- log((2*min(son.left, son.right))/n.offspring[father.ind,'left'])
|
741
|
+
if (lonr/new.lonr < 0) # sign changed - add flag
|
742
|
+
flag = T
|
743
|
+
}
|
744
|
+
}
|
745
|
+
|
746
|
+
for (j in mut.pos){
|
747
|
+
# mutation (nt)
|
748
|
+
mutations[mutations.ind,'mutation'] <- paste0(father.char[j],son.char[j])
|
749
|
+
# LONR
|
750
|
+
mutations[mutations.ind,'LONR'] <- lonr
|
751
|
+
# mutation type
|
752
|
+
# if mutation occurred in last nucleotides which are not a full codon - do not analyse
|
753
|
+
if( length(father.aa.char) >= ceiling(j/3) ){
|
754
|
+
if (father.aa.char[ceiling(j/3)] != son.aa.char[ceiling(j/3)])
|
755
|
+
mutations[mutations.ind,'mutation.type'] <- 'R'
|
756
|
+
else
|
757
|
+
mutations[mutations.ind,'mutation.type'] <- 'S'
|
758
|
+
}else
|
759
|
+
next
|
760
|
+
# position (nt)
|
761
|
+
mutations[mutations.ind,'position'] <- j
|
762
|
+
|
763
|
+
# save father and son names
|
764
|
+
mutations[mutations.ind,'father'] <- father
|
765
|
+
mutations[mutations.ind,'son'] <- son
|
766
|
+
mutations[mutations.ind,'flag'] <- flag
|
767
|
+
|
768
|
+
mutations.ind <- mutations.ind + 1
|
769
|
+
|
770
|
+
# increase table size if needed
|
771
|
+
if (mutations.ind > nrow(mutations))
|
772
|
+
mutations <- rbind(mutations, data.frame(mutation = rep('', 1000), LONR = 0, mutation.type = '', position=0, father='', son='', flag=F, stringsAsFactors=F))
|
773
|
+
}
|
774
|
+
|
775
|
+
return(list(mutations, mutations.ind))
|
776
|
+
}
|
777
|
+
|
778
|
+
|
779
|
+
# Get mutations between each father-son pair (recursively)
|
780
|
+
#
|
781
|
+
# Arguments: node - current node for which the number of offsprings
|
782
|
+
# edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
|
783
|
+
# n.offspring - data frame with size of left and right subtrees for each sequence
|
784
|
+
# mutations - mutation table
|
785
|
+
# mutations.ind - index of last muation inserted in mutations table
|
786
|
+
#
|
787
|
+
# Returns: updated n.offspring
|
788
|
+
get.mutations <- function(node, edge.df, nameSeq.df, n.offspring, mutations, mutations.ind){
|
789
|
+
|
790
|
+
sons <- edge.df[edge.df[, 1] == node, 2]
|
791
|
+
|
792
|
+
# if node is a leaf -
|
793
|
+
if (length(sons) == 0)
|
794
|
+
return (list(mutations, mutations.ind))
|
795
|
+
|
796
|
+
sons <- sort(sons)
|
797
|
+
|
798
|
+
# get father sequence, translate and split into characters
|
799
|
+
father.seq <- nameSeq.df[nameSeq.df$head==node, 'seq']
|
800
|
+
father.char <- s2c(father.seq)
|
801
|
+
father.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(father.seq, 1, nchar(father.seq)-(nchar(father.seq)%%3))), "")),
|
802
|
+
numcode = 1, NAstring = "X", ambiguous = FALSE)
|
803
|
+
|
804
|
+
# get son sequences, translate and split into characters
|
805
|
+
son1.seq <- nameSeq.df[nameSeq.df$head==sons[1], 'seq']
|
806
|
+
son1.char <- s2c(son1.seq)
|
807
|
+
son1.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(son1.seq, 1, nchar(son1.seq)-(nchar(son1.seq)%%3))), "")),
|
808
|
+
numcode = 1, NAstring = "X", ambiguous = FALSE)
|
809
|
+
# get mutation positions
|
810
|
+
mut.pos.son1 <- which(father.char!=son1.char)
|
811
|
+
|
812
|
+
if (length(sons) > 1){
|
813
|
+
# get son sequences, translate and split into characters
|
814
|
+
son2.seq <- nameSeq.df[nameSeq.df$head==sons[2], 'seq']
|
815
|
+
son2.char <- s2c(son2.seq)
|
816
|
+
son2.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(son2.seq, 1, nchar(son2.seq)-(nchar(son2.seq)%%3))), "")),
|
817
|
+
numcode = 1, NAstring = "X", ambiguous = FALSE)
|
818
|
+
# get mutation positions
|
819
|
+
mut.pos.son2 <- which(father.char!=son2.char)
|
820
|
+
# do not analyze mutation in both sons
|
821
|
+
mut.pos.son1 <- setdiff(mut.pos.son1,intersect(mut.pos.son1, mut.pos.son2))
|
822
|
+
mut.pos.son2 <- setdiff(mut.pos.son2,intersect(mut.pos.son1, mut.pos.son2))
|
823
|
+
}
|
824
|
+
# analyze mutation in first son
|
825
|
+
res<- analyze.mutations(n.offspring, node, father.char, father.aa.char, sons[1], 'left', son1.char, son1.aa.char, mut.pos.son1, mutations, mutations.ind)
|
826
|
+
mutations <- res[[1]]
|
827
|
+
mutations.ind <- res[[2]]
|
828
|
+
# recursive call with sons
|
829
|
+
res <- get.mutations(sons[1], edge.df, nameSeq.df, n.offspring, mutations, mutations.ind)
|
830
|
+
mutations <- res[[1]]
|
831
|
+
mutations.ind <- res[[2]]
|
832
|
+
|
833
|
+
if (length(sons) > 1){
|
834
|
+
# analyze mutation in second son
|
835
|
+
res<- analyze.mutations(n.offspring, node, father.char, father.aa.char, sons[2], 'right', son2.char, son2.aa.char, mut.pos.son2, mutations, mutations.ind)
|
836
|
+
mutations <- res[[1]]
|
837
|
+
mutations.ind <- res[[2]]
|
838
|
+
res <- get.mutations(sons[2], edge.df, nameSeq.df, n.offspring, mutations, mutations.ind)
|
839
|
+
mutations <- res[[1]]
|
840
|
+
mutations.ind <- res[[2]]
|
841
|
+
}
|
842
|
+
return (list(mutations, mutations.ind))
|
843
|
+
|
844
|
+
}
|
845
|
+
|
846
|
+
# Calculate subtree sizes for each internal node, find mutations and compute LONR scores
|
847
|
+
#
|
848
|
+
# Arguments:
|
849
|
+
# nameSeq.df - data frame of sequences and headers
|
850
|
+
# edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
|
851
|
+
# outgroup - outgroup sequence name (optional)
|
852
|
+
#
|
853
|
+
# Returns: mutations - mutation table
|
854
|
+
compute.sub.trees <- function(nameSeq.df, edge.df, outgroup = NULL){
|
855
|
+
|
856
|
+
# remove outgroup if exists
|
857
|
+
if (!is.null(outgroup)) {
|
858
|
+
nameSeq.df <- nameSeq.df[-which(nameSeq.df$head == outgroup), ] # remove from sequence list
|
859
|
+
edge.df <- edge.df[-which(edge.df[, 'to'] == outgroup), ] # remove from edge table
|
860
|
+
}
|
861
|
+
n.seq <- length(nameSeq.df)
|
862
|
+
|
863
|
+
# find all subbtree roots
|
864
|
+
roots <- as.character(unique(edge.df[!(edge.df[, 1] %in% edge.df[, 'to']), 1]))
|
865
|
+
treesIDs <- unique(nameSeq.df$treeID)
|
866
|
+
all.mutations <- data.frame()
|
867
|
+
for (i in treesIDs){
|
868
|
+
sub.tree <- nameSeq.df[nameSeq.df$treeID==i,]
|
869
|
+
curr.root <- as.character(roots[roots%in%sub.tree$head])
|
870
|
+
|
871
|
+
# calculate the size of left and right subtrees for each node
|
872
|
+
n.offspring <- data.frame(node = sub.tree$head, left = 0, right = 0, stringsAsFactors=F)
|
873
|
+
n.offspring <- get.number.offsprings(curr.root, n.offspring, edge.df)
|
874
|
+
|
875
|
+
# get mutations between each father-son pair
|
876
|
+
mutations <- data.frame(mutation = rep('', 1000), LONR = 0, mutation.type = '', position=0, father='', son='', flag=F, stringsAsFactors=F)
|
877
|
+
res <- get.mutations(curr.root, edge.df, sub.tree, n.offspring, mutations, 1)
|
878
|
+
|
879
|
+
# remove end of table if not used
|
880
|
+
mutations <- res[[1]]
|
881
|
+
ind <- res[[2]]
|
882
|
+
if (nrow(mutations) > ind )
|
883
|
+
mutations <- mutations[-(ind:nrow(mutations)), ]
|
884
|
+
|
885
|
+
all.mutations <- rbind(all.mutations, mutations)
|
886
|
+
}
|
887
|
+
return(all.mutations)
|
888
|
+
|
889
|
+
}
|
890
|
+
|
891
|
+
# Compute consensus sequence and remove positions with gaps
|
892
|
+
remove.gaps <- function(infile){
|
893
|
+
|
894
|
+
# read FASTA file in alignment object
|
895
|
+
aligned.seq <-read.alignment(infile, format='fasta', forceToLower = F)
|
896
|
+
|
897
|
+
# comnpute consensus sequence
|
898
|
+
consensus.seq <- consensus(aligned.seq, method = "majority")
|
899
|
+
|
900
|
+
# remove columns containing gaps in consensus
|
901
|
+
gapped.pos <- rev(which(consensus.seq == '-'))
|
902
|
+
for (pos in gapped.pos)
|
903
|
+
aligned.seq[['seq']] <- paste0(substr(aligned.seq[['seq']], 1, pos-1), substr(aligned.seq[['seq']], pos+1, nchar(aligned.seq[['seq']])[1]))
|
904
|
+
|
905
|
+
# convert sequences and headers into list
|
906
|
+
fasta.df <- data.frame(head=aligned.seq[['nam']], seq = aligned.seq[['seq']], stringsAsFactors = F)
|
907
|
+
|
908
|
+
return (fasta.df)
|
909
|
+
}
|
910
|
+
|
911
|
+
# Dived trees into subtrees by cutting branches with more mutations than specified by cutoff
|
912
|
+
#
|
913
|
+
# Arguments: nameSeq.df - data frame with headers and sequences
|
914
|
+
# edge.df - data frame with columns - parent(from), child(to), edge weight (weight), edge length (distance in nt)
|
915
|
+
# cutoff - number of mutation (default - 10)
|
916
|
+
#
|
917
|
+
# Returns: modified nameSeq.df - new column treeID, specifying sub tree
|
918
|
+
# modified edge.df - without long branches
|
919
|
+
cut.trees <- function(nameSeq.df,edge.df,cutoff){
|
920
|
+
|
921
|
+
# add column for tree ID
|
922
|
+
nameSeq.df$treeID <- -1
|
923
|
+
|
924
|
+
# Cut edges longer than threshold
|
925
|
+
trim.edge.df <- edge.df[edge.df$distance<=cutoff,]
|
926
|
+
|
927
|
+
# Get roots of all subtrees
|
928
|
+
sub.roots <- setdiff(trim.edge.df$from, trim.edge.df$to)
|
929
|
+
tree.id = 1
|
930
|
+
# For each root, get subtree edges and nodes
|
931
|
+
trees <- list()
|
932
|
+
for(root in sub.roots) {
|
933
|
+
at.bottom <- F
|
934
|
+
nodes <- nameSeq.df[nameSeq.df$head==root,]
|
935
|
+
# Trace down tree until at all leaves
|
936
|
+
while(at.bottom != T) {
|
937
|
+
# Add children of all nodes in tree thus far
|
938
|
+
new.nodes <- unique(rbind(nodes, nameSeq.df[nameSeq.df$head %in% trim.edge.df$to[trim.edge.df$from %in% nodes$head],]))
|
939
|
+
|
940
|
+
# If no children are to be added (tree is complete)
|
941
|
+
if(nrow(nodes) == nrow(new.nodes)) {
|
942
|
+
nodes <- new.nodes
|
943
|
+
# delete internal sequences
|
944
|
+
#nodes <- nodes[nodes$head %in% nodes$head[grep('^L', nodes$head, perl=T, fixed=F)],]
|
945
|
+
if(nrow(nodes)!=0){
|
946
|
+
nameSeq.df[nameSeq.df$head %in% nodes$head,'treeID'] <- tree.id
|
947
|
+
tree.id <- tree.id + 1
|
948
|
+
}
|
949
|
+
at.bottom <- T
|
950
|
+
}else{
|
951
|
+
nodes <- new.nodes
|
952
|
+
}
|
953
|
+
}
|
954
|
+
}
|
955
|
+
sing <- which(nameSeq.df$treeID==-1)
|
956
|
+
if (length(sing) > 0 )
|
957
|
+
nameSeq.df <- nameSeq.df[-sing,]
|
958
|
+
|
959
|
+
return(list(nameSeq.df, trim.edge.df))
|
960
|
+
}
|
961
|
+
|
962
|
+
# make sure the dirs have trailing slashes (all the path manipulation assumes they do)
|
963
|
+
check.dirs <- function(dirname) {
|
964
|
+
if(nchar(dirname) == 0)
|
965
|
+
stop('unexpected zero length dir name')
|
966
|
+
|
967
|
+
last.char = substr(dirname, nchar(dirname), nchar(dirname) + 1)
|
968
|
+
if(last.char != '/') {
|
969
|
+
print(paste0('note: directory names must have trailing slashes, adding to ', dirname))
|
970
|
+
dirname = paste0(dirname, '/')
|
971
|
+
}
|
972
|
+
|
973
|
+
return(dirname)
|
974
|
+
}
|
975
|
+
|
976
|
+
# MAIN function - Builds lineage tree, find mutations within the tree and compute LONR scores
|
977
|
+
#
|
978
|
+
# Arguments:
|
979
|
+
# method - dnapars or neighbor
|
980
|
+
# infile - input fasta file
|
981
|
+
# workdir - temporary working directory
|
982
|
+
# outgroup - outgroup sequence name (optional)
|
983
|
+
compute.LONR <- function(method, infile, workdir, outgroup=NULL, existing.edgefile=NULL, existing.node.seqfile=NULL, cutoff=10){
|
984
|
+
workdir = check.dirs(workdir)
|
985
|
+
|
986
|
+
# remove gaps in consensus
|
987
|
+
fasta.df <- remove.gaps(infile)
|
988
|
+
|
989
|
+
# fail on small or huge files
|
990
|
+
n.seq <- nrow(fasta.df)
|
991
|
+
if(n.seq < MIN.SEQ)
|
992
|
+
stop('Not enough sequences to make tree with Phylip')
|
993
|
+
if(n.seq > MAX.SEQ)
|
994
|
+
stop('Too many sequences to make tree with Phylip')
|
995
|
+
|
996
|
+
#------------------------------------
|
997
|
+
# PART I - Build lineage tree
|
998
|
+
#------------------------------------
|
999
|
+
res <- build.trees(method, fasta.df, workdir, outgroup, existing.edgefile, existing.node.seqfile)
|
1000
|
+
|
1001
|
+
if (is.null(res))
|
1002
|
+
return(F)
|
1003
|
+
nameSeq.df <- res[[1]]
|
1004
|
+
edge.df <- res[[2]]
|
1005
|
+
|
1006
|
+
#------------------------------------
|
1007
|
+
# PART II - Compute LONR scores
|
1008
|
+
#------------------------------------
|
1009
|
+
res <- cut.trees(nameSeq.df,edge.df,cutoff)
|
1010
|
+
nameSeq.df <- res[[1]]
|
1011
|
+
edge.df <- res[[2]]
|
1012
|
+
|
1013
|
+
|
1014
|
+
LONR.table <- compute.sub.trees(nameSeq.df, edge.df, outgroup)
|
1015
|
+
|
1016
|
+
# write lonr output to csv
|
1017
|
+
write.table(LONR.table, file=paste0(workdir, G.lonrfname), quote=F, sep=',', col.names=T, row.names=F)
|
1018
|
+
|
1019
|
+
file.remove(paste0(workdir, G.phy.infname))
|
1020
|
+
}
|