partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.1.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
  54. partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. partis/main.py +0 -59
  61. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  62. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
  65. {partis → python}/__init__.py +0 -0
  66. {partis → python}/alleleclusterer.py +0 -0
  67. {partis → python}/allelefinder.py +0 -0
  68. {partis → python}/alleleremover.py +0 -0
  69. {partis → python}/annotationclustering.py +0 -0
  70. {partis → python}/baseutils.py +0 -0
  71. {partis → python}/cache/__init__.py +0 -0
  72. {partis → python}/cache/cached_uncertainties.py +0 -0
  73. {partis → python}/clusterpath.py +0 -0
  74. {partis → python}/coar.py +0 -0
  75. {partis → python}/corrcounter.py +0 -0
  76. {partis → python}/datautils.py +0 -0
  77. {partis → python}/event.py +0 -0
  78. {partis → python}/fraction_uncertainty.py +0 -0
  79. {partis → python}/gex.py +0 -0
  80. {partis → python}/glomerator.py +0 -0
  81. {partis → python}/hist.py +0 -0
  82. {partis → python}/hmmwriter.py +0 -0
  83. {partis → python}/hutils.py +0 -0
  84. {partis → python}/indelutils.py +0 -0
  85. {partis → python}/lbplotting.py +0 -0
  86. {partis → python}/mds.py +0 -0
  87. {partis → python}/mutefreqer.py +0 -0
  88. {partis → python}/paircluster.py +0 -0
  89. {partis → python}/parametercounter.py +0 -0
  90. {partis → python}/paramutils.py +0 -0
  91. {partis → python}/partitiondriver.py +0 -0
  92. {partis → python}/partitionplotter.py +0 -0
  93. {partis → python}/performanceplotter.py +0 -0
  94. {partis → python}/plotconfig.py +0 -0
  95. {partis → python}/processargs.py +0 -0
  96. {partis → python}/prutils.py +0 -0
  97. {partis → python}/recombinator.py +0 -0
  98. {partis → python}/scanplot.py +0 -0
  99. {partis → python}/seqfileopener.py +0 -0
  100. {partis → python}/treegenerator.py +0 -0
  101. {partis → python}/viterbicluster.py +0 -0
  102. {partis → python}/vrc01.py +0 -0
  103. {partis → python}/waterer.py +0 -0
bin/lonr.r ADDED
@@ -0,0 +1,1020 @@
1
+ # paper: http://dx.doi.org/10.1093/nar/gkv1198
2
+ # docs from supplementary info:
3
+
4
+ # issues
5
+ # - can't pass in your own tree -- have to let it run either phylip dnapars or phylip neighbor. Related things:
6
+ # - it assumes the internal nodes (that phylip adds) have phylip-convention names, i.e. str(some_integer), and if they're not, it breaks
7
+ # - if you pass it (i.e. if you observe) internal nodes, phylip shunts them to zero-lengh branches hanging off of newly-inferred internal nodes. So you then have to try to figure out which nodes/leaves you can collapse
8
+ # - it counts offspring all the way down the tree, i.e. ancestors get credit for fitness improvements in their offspring
9
+ # - this is "handled" by the "affected by descendents" flag, but that doesn't solve the problem, it just alerts you to lineages where there's more likely to be a problem
10
+ # - it would be much better to use an lbi-style weighting that decreases contributions as they get further away
11
+ # - ignores pairs of sibling edges in which both edges have mutations, i.e. it can throw out a large fraction of mutations in cases where most branches have mutations
12
+ # - ignores branches with no siblings
13
+ # - ignores branches with multiple mutations
14
+ # notes
15
+ # - number of offspring for a node is set as the number of edges in the entire subtree below that node
16
+
17
+ ## Computing LONR scores in lineage trees
18
+
19
+ ## General
20
+ ## The program is written in R.
21
+ ## This analysis is divided into two parts:
22
+ ## First, a lineage tree is built with sequences provided in and aligned FASTA format.
23
+ ## There is an option to provide an outgroup sequence in order to detect more accurately the root sequence.
24
+ ## For less than 100 sequences, the Maximum parsimony method is used (dnapars Phylip program).
25
+ ## Otherwise, the Neighbor-joining method is used (neighbor Phylip program).
26
+ ## The Neighbor-joining method uses a distance matrix as input, which is computed using the dnadist Phylip program.
27
+ ## The internal sequences are reconstructed using the Fitch algorithm.
28
+ ## The tree is then divided into subtrees in order to ignore mutations occurring in very distant sequences.
29
+ ## Thus the user can specify the cut-off (in nucleotides) to cut long branches.
30
+ ## The default is 10 mutations.
31
+ ## The following LONR analysis is then performed for each subtree separately.
32
+ ## Second, mutations are detected between each pair of father and son sequences (at the nucleotide level).
33
+ ## The LONR score is calculated, for each mutation, as the log of the ratio between the sub-tree size of the son in which the mutation occurred and the sub-tree size of the son in which no mutation occurred at this position.
34
+
35
+ ## How to run
36
+ ## This program uses the Phylip-3.695 package. It is expecting it to be in the same folder as this script. The path to the program can be modified in the beginning of the script.
37
+ ## The package Biostrings and seqinr need to be installed.
38
+
39
+ ## Main function - compute.LONR()
40
+
41
+ ## Input
42
+ ## 1) in.dir – aligned FASTA file input directory
43
+ ## 2) out.dir – output directory for tree files and LONR results
44
+ ## 3) file – FASTA file name
45
+ ## 4) outgroup (optional) – outgroup sequence name. If
46
+ ## 5) cutoff - branches with more mutations than specified nu this cutoff will be trimmed, resulting in several subtrees (default - 10 )
47
+
48
+ ## Output
49
+ ## The following directories are created in out.dir:
50
+ ## Tree directory –
51
+ ## 1) filename.fasta -modified FASTA file if gaps were removed.
52
+ ## 2) filename.dis – (only for neighbor joining trees) contains distance matrix created by the dnadist program in the Phylip package.
53
+ ## 3) filename.phy – original sequences in alignment format.
54
+ ## 4) filename_edges.tab – tab-delimited file containing the tree edges, their weights and the distance in nucleotides between each two nodes.
55
+ ## 5) filename_names.tab – tab-delimited file matching between original sequence names and temporary names.
56
+ ## 6) filename_out.txt – Phylip output tree file
57
+ ## 7) filename_tree.txt – Phylip output tree file in Newick format
58
+
59
+ ## LONR directory –
60
+ ## 1) filename_lonr.csv – comma-separated file containing lonr results as followed:
61
+ ## i. mutation – mutated nucleotides (e.g. AC means A C)
62
+ ## ii. LONR – log(size of mutated sub-tree/size of un-mutated sub-tree)
63
+ ## iii. mutation.type – (S) Silent or (Replacement)
64
+ ## iv. position – in nucleotides, according to output FASTA file (see above)
65
+ ## v. father – sequence name from which occurred mutation
66
+ ## vi. son – sequence name to which occurred mutation
67
+ ## vii. flag – True if LONR score an internal node is affected by mutations occurring in its descendants
68
+ ## Clarifications
69
+ ## 1) The input sequences must already be aligned. If there are gaps, the consensus sequence of all the sequences is computed, and positions containing gaps are removed from all the sequences. The output FASTA file is created after this step.
70
+ ## 2) The dnapars program may create trees which are not completely binary. Thus, internal nodes which have more than two children are fixed by created an identical new child, which will receive the extra children.
71
+ ## This also happens is case no outgroup is provided and the root has three children (both in dnapars and neighbor programs).
72
+ ## 3) If the nucleotide sequence lengths are not a multiple of three and mutations occurred in the last nucleotides, these mutations are ignored since they cannot be typed (not a full codon).
73
+
74
+ # imports
75
+ suppressPackageStartupMessages(require(seqinr, quietly=TRUE, warn.conflicts=FALSE))
76
+ suppressPackageStartupMessages(require(Biostrings, quietly=TRUE, warn.conflicts=FALSE))
77
+
78
+ MIN.SEQ <- 3
79
+ MAX.SEQ <- 7000
80
+
81
+ # ----------------------------------------------------------------------------------------
82
+ # empty ones are all set by the calling python script after importing this code
83
+ G.phy.infname = 'inseqs.phy'
84
+ G.dnadist.fname = 'dnadist.dis'
85
+ G.phy.outfname = ''
86
+ G.phy.treefname = ''
87
+ G.outseqs.fname = ''
88
+ G.edgefname = ''
89
+ G.names.fname = ''
90
+ G.lonrfname = ''
91
+
92
+ # Create fasta file from a data.frame of sequences and headers
93
+ # out.dir - output directory
94
+ # nameSeq.df - data.frame of headers and sequences
95
+ write.FASTA <- function(out.dir, nameSeq.df) {
96
+ sequences <- nameSeq.df$seq
97
+ names(sequences) <- nameSeq.df$head
98
+ writeXStringSet(DNAStringSet(sequences), file=paste0(out.dir, G.outseqs.fname), width=1000)
99
+ }
100
+
101
+ # Change sequence names
102
+ # fasta.df - data frame of sequences and headers
103
+ # outgroup - outgroup sequence name
104
+ # Returns: data frame of sequences and headers with extra column of new headers
105
+ change.names <- function(fasta.df, outgroup = NULL){
106
+
107
+ if (!is.null(outgroup)){
108
+ outgroup.ind <- which(fasta.df$head == outgroup)
109
+ n.seq <- nrow(fasta.df)
110
+ # change sequence names to 'L_#'
111
+ ind <- 1:n.seq
112
+ ind <- ind[-outgroup.ind]
113
+
114
+ # original method, fails if outgroup is the last sequence in the file (it complains that the first line only adds n.seq-1 entries, whereas the rest of the data frame has n.seq entries [or something like that]):
115
+ ## fasta.df$head2[ind] <- sapply(1:(n.seq-1), function(x) { paste0('L', x)})
116
+ ## fasta.df$head2[outgroup.ind] <- fasta.df$head[outgroup.ind]
117
+
118
+ # this is weird and complicated so that it duplicates the original method ^
119
+ fasta.df$head2[1:n.seq] <- sapply(1:n.seq, function(x) { if(x == outgroup.ind) return(fasta.df$head[outgroup.ind]); if(x > outgroup.ind) x = x-1; return(paste0('L', x))})
120
+ }else{
121
+ n.seq <- nrow(fasta.df)
122
+ # change sequence names to 'L_#'
123
+ ind <- 1:n.seq
124
+ fasta.df$head2[ind] <- sapply(1:n.seq, function(x) { paste0('L', x)})
125
+ }
126
+ return(fasta.df)
127
+ }
128
+
129
+ # Create phylip input alignment file
130
+ # Arguments: fasta.df - data frame of sequences and headers
131
+ fasta2phylip <- function(fasta.df, workdir){
132
+
133
+ phy.df <- rbind(data.frame(head=sprintf('%-9s', nrow(fasta.df)), seq=nchar(fasta.df$seq[1]), stringsAsFactors=F),
134
+ data.frame(head=sprintf('%-9s', fasta.df$head2), seq=fasta.df$seq, stringsAsFactors=F))
135
+
136
+ write.table(phy.df, file=paste0(workdir, G.phy.infname), quote=F, sep=' ', col.names=F, row.names=F)
137
+ }
138
+
139
+ # Run dnapars (maximum parsimony)
140
+ # outgroup.ind - outgroup index in file (optional)
141
+ run.dnapars <- function(workdir, outgroup.ind){
142
+ print(' building trees with maximum parsimony')
143
+
144
+ curr.dir <- getwd()
145
+ setwd(workdir)
146
+
147
+ # options from dnapars program
148
+ # index of outgroup - last in data frame
149
+ if (length(outgroup.ind) != 0)
150
+ pars.options <- c(G.phy.infname, 'O', outgroup.ind, 'V', '1', '5', '.', '2', 'Y')
151
+ else
152
+ pars.options <- c(G.phy.infname, 'V', '1', '5', '.', '2', 'Y')
153
+
154
+ # run dnapars
155
+ system2('phylip', args='dnapars', input=pars.options, stdout=NULL) # not sure why this was just calling 'dnapars'? my phylip install doesn't seem to have put dnapars in my path, but 'phylip dnapars' seems to work ok
156
+
157
+ # move .phy file and output tree files
158
+ file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.phy.outfname))
159
+ file.rename(from=paste0(workdir, 'outtree'), to=paste0(workdir, G.phy.treefname))
160
+
161
+ setwd(curr.dir)
162
+ }
163
+
164
+ # Order sequence from root to leaves
165
+ #
166
+ # Arguments: root - root sequence name
167
+ # nameSeq.df - data frame with headers and sequences
168
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
169
+ # outgroup - outgroup sequence name (optional)
170
+ #
171
+ # Returns: nameSeq.df - data frame with headers and sequences, ordered by distance from root
172
+ order.nameSeq <- function(root, nameSeq.df, edge.df, outgroup){
173
+
174
+ n.seq <- nrow(nameSeq.df)
175
+ nameSeq.df2 <- nameSeq.df[which(nameSeq.df[,'head']==root),] # root
176
+ for(i in 1:n.seq){ #1 - root
177
+ sons <- edge.df[which(edge.df[,'from']==nameSeq.df2[i,'head']),2]
178
+ if(length(sons)>0){
179
+ outgroup.ind <- which(sons==outgroup)
180
+ if(length(outgroup.ind)!=0) { sons <- sons[-outgroup.ind] }
181
+ for(j in 1:length(sons))
182
+ nameSeq.df2 <- rbind(nameSeq.df2, nameSeq.df[which(nameSeq.df[,'head']==sons[j]),])
183
+ }
184
+ }
185
+ if (!is.null(outgroup)) # last - outgroup
186
+ nameSeq.df2 <- rbind(nameSeq.df[which(nameSeq.df[,'head']==outgroup),], nameSeq.df2) # add outgroup first
187
+ row.names(nameSeq.df2)<-NULL
188
+
189
+ return(nameSeq.df2)
190
+ }
191
+
192
+
193
+ # Parse dnapars output tree file, get internal sequences and tree structure
194
+ #
195
+ # Returns: list of:
196
+ # nameSeq.df - data frame with headers and sequences, ordered by distance from root
197
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
198
+ parse.dnapars <- function(workdir, outgroup = NULL){
199
+
200
+ # read output tree file
201
+ out.tree <- scan(paste0(workdir, G.phy.outfname), what='character',sep='\n',
202
+ blank.lines.skip=F, strip.white=F)
203
+ # check if tree was build
204
+ if (any(grepl('-1 trees in all found', out.tree))) { return(NULL) }
205
+
206
+ # get internal sequences
207
+ seq.start <- min(grep('From\\s+To\\s+Any Steps\\?\\s+State at upper node', out.tree, perl=T, fixed=F))
208
+ seq.empty <- grep('^\\s*$', out.tree[seq.start:length(out.tree)], perl=T, fixed=F)
209
+ seq.len <- seq.empty[min(which(seq.empty[-1] == (seq.empty[-length(seq.empty)] + 1)))]
210
+ seq.block <- paste(out.tree[(seq.start + 2):(seq.start + seq.len - 2)], collapse='\n')
211
+ seq.df <- read.table(textConnection(seq.block), as.is=T, fill=T, blank.lines.skip=T, row.names = NULL, header=F, stringsAsFactors=F)
212
+
213
+ # fix root lines and remove empty rows
214
+ fix.row <- which(seq.df[,3]!="yes" & seq.df[,3]!="no" & seq.df[,3]!="maybe")
215
+ if (!is.null(outgroup))
216
+ seq.df[fix.row, ] <- cbind(seq.df[fix.row, 1], seq.df[fix.row, 2],'no', seq.df[fix.row, 3:6], stringsAsFactors=F)
217
+ else
218
+ seq.df[fix.row, ] <- cbind(seq.df[fix.row, 1], seq.df[fix.row, 1],'no', seq.df[fix.row, 2:5], stringsAsFactors=F)
219
+
220
+ # save full sequences as a data frame
221
+ names <- unique(seq.df[, 2])
222
+ seq <- sapply(names, function(x) { paste(t(as.matrix(seq.df[seq.df[, 2] == x, -c(1:3)])), collapse='') })
223
+ nameSeq.df <- data.frame(head=names, seq=seq, stringsAsFactors=F, row.names=NULL)
224
+
225
+ # get tree structure
226
+ edge.start <- min(grep('between\\s+and\\s+length', out.tree, perl=T, fixed=F))
227
+ edge.len <- min(grep('^\\s*$', out.tree[edge.start:length(out.tree)], perl=T, fixed=F))
228
+ edge.block <- paste(out.tree[(edge.start + 2):(edge.start + edge.len - 2)], collapse='\n')
229
+ edge.df <- read.table(textConnection(edge.block), col.names=c('from', 'to', 'weight'), as.is=T, stringsAsFactors=F)
230
+
231
+ # order sequences by distance from root
232
+ root <- unique(edge.df$from)[!(unique(edge.df$from) %in% edge.df$to)]
233
+
234
+ #root <- seq.df[which(seq.df[,1]=='root')[1],2]
235
+ nameSeq.df <- order.nameSeq(root, nameSeq.df, edge.df, outgroup)
236
+
237
+ return(list(nameSeq.df, edge.df))
238
+ }
239
+
240
+ # Fix ambiguous nucleotides in internal sequences, according to IUPAC code
241
+ #
242
+ # Arguments: nameSeq.df - data frame with headers and sequences, order by distance from root
243
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
244
+ # outgroup - outgroup sequence name (optional)
245
+ #
246
+ # Returns: - nameSeq.df - data frame with fixed sequences
247
+ fix.internal <- function(nameSeq.df, edge.df, outgroup = NULL){
248
+
249
+ nucleotides <- c('A', 'C','G','T','-')
250
+
251
+ # from leaves to root (except outgroup)
252
+ # fix only internal nodes!
253
+ if (!is.null(outgroup))
254
+ intern.seq <- rev(setdiff(2:nrow(nameSeq.df),grep('^L', nameSeq.df[,'head'], perl=T, fixed=F)))
255
+ else
256
+ intern.seq <- rev(setdiff(1:nrow(nameSeq.df),grep('^L', nameSeq.df[,'head'], perl=T, fixed=F)))
257
+
258
+ for(i in intern.seq){
259
+ # get ambiguous nucleotides
260
+ curr.seq <- nameSeq.df[i,'seq']
261
+ amb.pos <- setdiff(1:nchar(curr.seq),unlist(lapply(nucleotides, function(x) {gregexpr(pattern =x,curr.seq)})))
262
+ if(length(amb.pos) > 0){
263
+ # get son sequences
264
+ sons.seq <- sapply(edge.df[which(edge.df[,'from']==nameSeq.df[i,'head']),'to'], function(x) {nameSeq.df[which(nameSeq.df[,'head']==x),'seq']})
265
+ # do not fix outgroup sequence
266
+ if (!is.null(outgroup)){
267
+ outgroup.ind <- which(names(sons.seq)==outgroup)
268
+ if(length(outgroup.ind)!=0)
269
+ sons.seq <- sons.seq[-outgroup.ind]
270
+ }
271
+
272
+ for(j in amb.pos){
273
+ amb.nuc <- substr(curr.seq,j,j)
274
+ sons.nuc <- substr(sons.seq,j,j)
275
+ # if deletion
276
+ if(amb.nuc == 'O')
277
+ curr.seq <- paste0(substr(curr.seq,1, j-1), '-', substr(curr.seq,j+1, nchar(curr.seq)))
278
+ else{
279
+ # check if at least one of the sons has one match for ambiguous letter
280
+ if(is.element(amb.nuc, c('?', 'X')))
281
+ matches <- which(is.element(sons.nuc, nucleotides))
282
+ else
283
+ matches <- which(is.element(sons.nuc, toupper(amb(amb.nuc, forceToLower = TRUE))))
284
+ if(length(matches)==1) # if only one son had a match
285
+ curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[matches], substr(curr.seq,j+1, nchar(curr.seq)))
286
+ else if(length(matches)==0){ # choose randomly from son nucleotides
287
+ rand <- sample(1:length(sons.nuc), 1, replace=T)
288
+ curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[rand], substr(curr.seq,j+1, nchar(curr.seq)))
289
+ }else{ # choose randomly from son nucleotides that matched
290
+ rand <- sample(1:length(matches), 1, replace=T)
291
+ curr.seq <- paste0(substr(curr.seq,1, j-1), sons.nuc[matches[rand]], substr(curr.seq,j+1, nchar(curr.seq)))
292
+ }
293
+ }
294
+ }
295
+ nameSeq.df[i,'seq'] <- curr.seq
296
+ }
297
+ }
298
+
299
+ return(nameSeq.df)
300
+ }
301
+
302
+ # Compute distance matrix (Kimura model) for neighbor joining using dnadist (Phylip package)
303
+ #
304
+ # Arguments:
305
+ # workdir - phylip directory
306
+ run.dnadist <- function(workdir){
307
+
308
+ # options from dnadist program
309
+ dnadist.options <- c(G.phy.infname, 'D', '2', 'Y')
310
+
311
+ # run dnadist
312
+ system2('phylip', args='dnadist', input=dnadist.options,, stdout=NULL)
313
+
314
+ # move .phy file and output tree files
315
+ file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.dnadist.fname))
316
+ }
317
+
318
+
319
+ # Run neighbor (Neighbor joining)
320
+ # outgroup.ind - outgroup index in file (optional)
321
+ run.neighbor <- function(workdir, outgroup.ind){
322
+ print(' building trees with neighbor joining')
323
+ curr.dir <- getwd()
324
+ setwd(workdir)
325
+
326
+ # run dnadist to create distance matrix
327
+ run.dnadist(workdir)
328
+
329
+ # options for neighbor program
330
+ # index of outgroup - last data frame
331
+ if (length(outgroup.ind) != 0 )
332
+ neigh.options <- c(G.dnadist.fname, 'O', outgroup.ind, '2', 'Y')
333
+ else
334
+ neigh.options <- c(G.dnadist.fname, '2', 'Y')
335
+
336
+ # run neighbor
337
+ system2('phylip', args='neighbor', input=neigh.options, stdout=NULL)
338
+
339
+ # move .phy, .dis and output tree files
340
+ file.rename(from=paste0(workdir, 'outfile'), to=paste0(workdir, G.phy.outfname))
341
+ file.rename(from=paste0(workdir, 'outtree'), to=paste0(workdir, G.phy.treefname))
342
+ file.remove(paste0(workdir, G.dnadist.fname))
343
+
344
+ setwd(curr.dir)
345
+ }
346
+
347
+ # Parse neighbor output tree file, get internal sequences and tree structure
348
+ # fasta.df - data frame of input sequences and headers
349
+ # outgroup - outgroup sequence name (optional)
350
+ # Returns: - list of:
351
+ # nameSeq.df - data frame with headers and sequences
352
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
353
+ parse.neighbor <- function(workdir, fasta.df, outgroup = NULL){
354
+
355
+ # read output tree file
356
+ out.tree <- scan(paste0(workdir, G.phy.outfname), what='character',sep='\n',
357
+ blank.lines.skip=F, strip.white=F)
358
+
359
+ # check if tree was build
360
+ if (any(grepl('-1 trees in all found', out.tree))) { return(NULL) }
361
+
362
+ # get tree structure
363
+ edge.start <- min(grep('Between\\s+And\\s+Length', out.tree, perl=T, fixed=F))
364
+ edge.len <- min(grep('^\\s*$', out.tree[edge.start:length(out.tree)], perl=T, fixed=F))
365
+ edge.block <- paste(out.tree[(edge.start + 2):(edge.start + edge.len - 2)], collapse='\n')
366
+ edge.df <- read.table(textConnection(edge.block), col.names=c('from', 'to', 'weight'), as.is=T)
367
+
368
+ # create nameSeq.df from input sequence only (for now)
369
+ nameSeq.df <- data.frame(head = fasta.df[,'head2'], seq = fasta.df[,'seq'], stringsAsFactors=F)
370
+
371
+ return(list(nameSeq.df, edge.df))
372
+ }
373
+
374
+ # Traverse tree in postorder (step 2 in Fitch's algorithm)
375
+ #
376
+ # Arguments: father/ sons - father and its 2 son names
377
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
378
+ # nameSeq.list - list of sequences with headers
379
+ #
380
+ # Returns: modified nameSeq.list - list of sequences with headers
381
+ traverse.up <- function(father, sons, edge.df, nameSeq.list){
382
+
383
+ sons.seq <- sapply(sons,function(x) NULL)
384
+
385
+ # check if each son sequence a leaf or is already reconstructed
386
+ for(i in sons){
387
+ if(!is.element(i, names(nameSeq.list)))
388
+ nameSeq.list <- traverse.up(i, edge.df[edge.df[,'from']==i,'to'], edge.df, nameSeq.list)
389
+ sons.seq[i] <- nameSeq.list[i]
390
+ }
391
+ seq.len <- length(sons.seq[[1]])
392
+
393
+ father.seq <- character(seq.len)
394
+ for(i in 1:seq.len){ # for each position in sequence
395
+ curr.nuc <- sapply(sons.seq, function(x) x[[i]])
396
+ nuc <- Reduce(intersect, strsplit(curr.nuc,""))
397
+ if(length(nuc)>0) # save intersection
398
+ father.seq[i] <- paste(nuc, collapse='')
399
+ else # save union
400
+ father.seq[i] <- paste( Reduce(union, strsplit(curr.nuc,'')),collapse="")
401
+ }
402
+ nameSeq.list[[father]]<-father.seq # save modified sequence
403
+
404
+ return(nameSeq.list)
405
+ }
406
+
407
+ # Recursively traverse tree in preorder (step 1 in Fitch's algorithm)
408
+ #
409
+ # Arguments: father/ son - father and son name
410
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
411
+ # nameSeq.list - list of sequences with headers
412
+ # outgroup - outgroup sequence name (optional)
413
+ #
414
+ # Returns: modified nameSeq.list - list of sequences with headers
415
+ traverse.down <- function(father, son, edge.df, nameSeq.list, outgroup = NULL){
416
+
417
+ son.seq <- nameSeq.list[[son]]
418
+ seq.len <- length(son.seq)
419
+
420
+ if(is.null(father)){ # root
421
+ for(i in 1:seq.len){
422
+ if(nchar(son.seq[i])>1){ # more than 1 option in intersection - choose randomly
423
+ #rand <- floor(runif(1, min=1, max=nchar(son.seq[i])+1))
424
+ rand <- sample(1:nchar(son.seq[i]), 1, replace=T)
425
+ son.seq[i] <- substr(son.seq[i],rand, rand)
426
+ }
427
+ }
428
+ }else{
429
+ father.seq <- nameSeq.list[[father]]
430
+ for(i in 1:seq.len){
431
+ if(nchar(son.seq[i])>1){
432
+ nuc <- Reduce(intersect, strsplit(c(father.seq[i],son.seq[i]),''))
433
+ # if only one nucleotide in intersection - keep it
434
+ if(length(nuc)==0){ # no intersection - choose randomly from son's options
435
+ rand <- sample(1:nchar(son.seq[i]), 1, replace=T)
436
+ son.seq[i] <- substr(son.seq[i],rand, rand)
437
+ }else # if intersection is not empty, but son has more than 1 option
438
+ son.seq[i] <- nuc
439
+ }
440
+ }
441
+ }
442
+ nameSeq.list[[son]] <- son.seq # save modified sequence
443
+
444
+ # recursive call for each son's sons
445
+ sons <- edge.df[edge.df[,'from']==son,'to']
446
+ if(length(sons) > 0){
447
+ # remove outgroup from root's children
448
+ if (!is.null(outgroup)){
449
+ outgroup.ind <- which(sons == outgroup)
450
+ if(length(outgroup.ind)!=0) { sons <- sons[-outgroup.ind] }
451
+ }
452
+ nameSeq.list <- traverse.down(son, sons[1], edge.df, nameSeq.list, outgroup)
453
+ nameSeq.list <- traverse.down(son, sons[2], edge.df, nameSeq.list, outgroup)
454
+ }
455
+ return(nameSeq.list)
456
+ }
457
+
458
+ # Reconstruct internal sequences with Fitch algorithm (for neighbor trees)
459
+ #
460
+ # Arguments: nameSeq.df - data frame with headers and input sequences only
461
+ # edge.df - data frame with columns - parent(from), child(to), distance(weight)
462
+ # outgroup - outgroup sequence name (optional)
463
+ #
464
+ # Returns: nameSeq.df - data frame with headers and all sequences, ordered by distance from root
465
+ get.internal <- function(nameSeq.df, edge.df, outgroup = NULL){
466
+
467
+ # find root
468
+ root <- as.character(unique(edge.df[!(edge.df[, 1] %in% edge.df[, 'to']), 1]))
469
+ root.sons <- edge.df[edge.df[,'from']==root,'to']
470
+
471
+ # remove outgroup from root's children
472
+ if (!is.null(outgroup)){
473
+ outgroup.ind <- which(root.sons == outgroup)
474
+ if(length(outgroup.ind)!=0) { root.sons <- root.sons[-outgroup.ind] }
475
+ }
476
+
477
+ # convert sequences to lists
478
+ nameSeq.list <- sapply(nameSeq.df[,'seq'], function(x) {strsplit(x, "", fixed=FALSE)})
479
+ names(nameSeq.list) <- nameSeq.df[,'head']
480
+
481
+ # Step 1 - preorder on tree - get intersection, otherwise - get union
482
+ nameSeq.list <- traverse.up(root, root.sons, edge.df, nameSeq.list)
483
+
484
+ # Step 2 - postorder on tree - get intersection, otherwise choose randomly
485
+ nameSeq.list <- traverse.down(NULL, root, edge.df, nameSeq.list, outgroup)
486
+
487
+ # convert sequence list to dataframe
488
+ nameSeq.list2 <-lapply(nameSeq.list,function(x) {paste(x, collapse="")})
489
+ nameSeq.df <-data.frame(head = names(nameSeq.list2), stringsAsFactors=F)
490
+ nameSeq.df$seq <- unlist(nameSeq.list2)
491
+
492
+ # order sequences in nameSeq.df by distance from root
493
+ nameSeq.df <- order.nameSeq(root, nameSeq.df, edge.df, outgroup)
494
+
495
+ return(nameSeq.df)
496
+ }
497
+
498
+ # Match old input sequence names with new in final output sequence data frame
499
+ #
500
+ # Arguments: nameSeq.df - data frame with headers and all sequences
501
+ # fasta.df - data frame with new and old headers and input sequences only
502
+ #
503
+ # Returns: nameSeq.df - data frame with headers and sequences, with old and new names
504
+ match.names <- function(fasta.df, nameSeq.df){
505
+
506
+ nameSeq.df$head2 <- rep("-", nrow(nameSeq.df))
507
+ for(i in 1:nrow(fasta.df))
508
+ nameSeq.df[which(fasta.df[i,'head2']==nameSeq.df[,'head']),'head2'] <- fasta.df[i,'head']
509
+
510
+ return(nameSeq.df)
511
+ }
512
+
513
+ # Compute edge lengths (distance in nucleotides)
514
+ #
515
+ # Arguments: nameSeq.df - data frame with headers and sequences
516
+ # edge.df - data frame with columns - parent(from), child(to), edge weight (weight)
517
+ #
518
+ # Returns: edge.df - data frame with columns - parent(from), child(to), edge weight (weight) , edge length (distance in nt)
519
+ compute.edge <- function(nameSeq.df, edge.df){
520
+
521
+ n.edge <- nrow(edge.df)
522
+ edge.df$distance <- rep(0, n.edge)
523
+ for(i in 1:n.edge){
524
+ from.seq <- unlist(strsplit(nameSeq.df[nameSeq.df[,'head']==as.character(edge.df[i,'from']),'seq'],''))
525
+ to.seq <- unlist(strsplit(nameSeq.df[nameSeq.df[,'head']==as.character(edge.df[i,'to']),'seq'],''))
526
+ edge.df[i,'distance'] <- length(which(from.seq!=to.seq))
527
+ }
528
+
529
+ return(edge.df)
530
+ }
531
+
532
+ # Convert parsimony tree into binary tree
533
+ #
534
+ # Arguments: nameSeq.df - data frame with headers and sequences
535
+ # edge.df - data frame with columns - parent(from), child(to), edge weight (weight)
536
+ # outgroup name (default - NULL)
537
+ #
538
+ # Returns: modified edge.df and nameSeq.df
539
+ convert.to.binary <- function(nameSeq.df, edge.df, outgroup = NULL){
540
+
541
+ # find nodes with more than 2 sons
542
+ son.count <- table(edge.df$from)
543
+ nodes <- names(son.count)[son.count == 3]
544
+
545
+ # get name of last internal node created by program
546
+ ind <- max(unique(edge.df$from)) + 1
547
+
548
+ for (i in nodes){
549
+ sons <- edge.df[edge.df$from==i, 'to']
550
+ if (!is.null(outgroup)){
551
+ if (outgroup %in% sons) # if outgroup is one of the root's sons - ok
552
+ next
553
+ }
554
+
555
+ # create new node named with current ind and attach 2 sons
556
+ edge.df[edge.df$to==sons[1], 'from'] <- ind
557
+ edge.df[edge.df$to==sons[2], 'from'] <- ind
558
+ # add new node has son of current node
559
+ edge.df <- rbind(edge.df, data.frame(from = i, to = ind, weight=0))
560
+ # add new node sequence - same as current node
561
+ nameSeq.df <- rbind(nameSeq.df, data.frame(head=ind, seq=nameSeq.df[nameSeq.df$head==i, 'seq']))
562
+ ind <- ind + 1
563
+ }
564
+
565
+ return(list(nameSeq.df, edge.df))
566
+
567
+ }
568
+
569
+ # Build phylogenetic tree with maximum parsimony or neighbor joining
570
+ # Arguments:
571
+ # fasta.df - data frame with input sequences and headers
572
+ # outgroup - outgroup sequence name (optional)
573
+ # Returns: nameSeq.list - list of sequences with headers
574
+ # edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
575
+ build.trees <- function(method, fasta.df, workdir, outgroup=NULL, existing.edgefile=NULL, existing.node.seqfile=NULL){
576
+
577
+ dir.create(workdir, recursive = T, showWarnings = FALSE)
578
+
579
+ # change input sequence name to shorter ones
580
+ fasta.df <- change.names(fasta.df, outgroup) # CHECK CP NUM!!!!!!!
581
+
582
+ # convert sequences to alignment format for phylip programs
583
+ fasta2phylip(fasta.df, workdir)
584
+
585
+ # find outgroup index in file
586
+ outgroup.ind <- which(fasta.df$head == outgroup)
587
+
588
+ if(!is.null(existing.edgefile)) {
589
+ stop('doesn\'t work yet (and probably a waste of time to fix it -- reimplement this stupid shit in python)')
590
+ edge.df <- read.csv(file=existing.edgefile, header=TRUE, sep=',', colClasses=c('character', 'character', 'numeric'))
591
+ nameSeq.df <- read.csv(file=existing.node.seqfile, header=TRUE, sep=',', colClasses=c('character', 'character'))
592
+ for(irow in 1:length(nameSeq.df$head)) {
593
+ input.name <- nameSeq.df$head[[irow]]
594
+ for(jrow in 1:length(fasta.df$head)) {
595
+ if(fasta.df$head[[jrow]] == input.name) { # see if this sequence is in the df with name translations (i.e. it's an input/leaf sequence)
596
+ new.name <- fasta.df$head2[[jrow]]
597
+ nameSeq.df$head[[irow]] <- new.name # and if it is, replace the input name with the new name
598
+ if(toString(nameSeq.df$seq[[irow]]) != toupper(toString(fasta.df$seq[[jrow]]))) # make sure the sequences are the same (I don't really understand the need for toString(), but it prints some extra "Level" stuff if I don't have it, so...)
599
+ stop(paste0('sequences don\'t match for ', input.name, ':\n', toString(nameSeq.df$seq[[irow]]), '\n', toupper(toString(fasta.df$seq[[jrow]]))))
600
+ for(krow in 1:length(edge.df$from)) { # also have to change the names in the edge df
601
+ if(edge.df$from[[krow]] == input.name) # this doesn't actually happen, since they only rename the leaves
602
+ edge.df$from[[krow]] <- new.name
603
+ if(edge.df$to[[krow]] == input.name)
604
+ edge.df$to[[krow]] <- new.name
605
+ }
606
+ break
607
+ }
608
+ }
609
+ }
610
+ ## # we need to convert the internal node names to integers here, since convert.to.binary() tries sorts them, which fails if they're strings. But this doesn't freaking work, because the class of the column is set the 'character' (above). fuck you, R
611
+ ## print(nameSeq.df$head)
612
+ ## ftmp <- function(x) { if(substr(x, 1, 1) == 'L') return(x); if(x == outgroup) return(x); return(strtoi(x)); }
613
+ ## edge.df$from <- sapply(edge.df$from, ftmp)
614
+ ## edge.df$to <- sapply(edge.df$to, ftmp)
615
+ ## nameSeq.df$head <- sapply(nameSeq.df$head, ftmp)
616
+ ## print(nameSeq.df$head)
617
+ ## stop('x')
618
+ } else if(method == 'dnapars') { # maximum parsimony
619
+ run.dnapars(workdir, outgroup.ind)
620
+
621
+ # parse dnapars output
622
+ tmp <- parse.dnapars(workdir, outgroup)
623
+ if(is.null(tmp)) { return(F) } # if tree building failed
624
+ nameSeq.df <- tmp[[1]]
625
+ edge.df <- tmp[[2]]
626
+ nameSeq.df <- fix.internal(nameSeq.df, edge.df, outgroup)
627
+ } else if(method == 'neighbor') { # neighbor joining
628
+ run.neighbor(workdir, outgroup.ind)
629
+
630
+ # parse neighbor output
631
+ tmp <- parse.neighbor(workdir, fasta.df, outgroup)
632
+ if(is.null(tmp)) { return(F) } # if tree building failed
633
+ nameSeq.df <- tmp[[1]]
634
+ edge.df <- tmp[[2]]
635
+ nameSeq.df <- get.internal(nameSeq.df, edge.df, outgroup)
636
+ } else {
637
+ stop(paste0('unhandled method: ', method))
638
+ }
639
+
640
+ # convert tree to binary
641
+ tmp <- convert.to.binary(nameSeq.df, edge.df, outgroup)
642
+ nameSeq.df <- tmp[[1]]
643
+ edge.df <- tmp[[2]]
644
+
645
+ # compute edge lengths
646
+ edge.df <- compute.edge(nameSeq.df, edge.df)
647
+
648
+ # retrieve old names for input sequences
649
+ nameSeq.df <- match.names(fasta.df, nameSeq.df)
650
+
651
+ # save output files
652
+ # save sequence as FASTA file
653
+ write.FASTA(workdir, nameSeq.df)
654
+ # save Fome/To/distances table (edges)
655
+ write.table(edge.df, file=paste0(workdir, G.edgefname), quote=F, sep='\t', col.names=T, row.names=F)
656
+ # save old and new sequence names
657
+ write.table(nameSeq.df[,c('head','head2')], file=paste0(workdir, G.names.fname), quote=F, sep='\t', col.names=T, row.names=F)
658
+
659
+ return(list(nameSeq.df, edge.df))
660
+ }
661
+
662
+ # Calculate subtree sizes for each internal node (recursive function)
663
+ #
664
+ # Arguments: node - current node for which the number of offsprings
665
+ # n.offspring - data frame with size of keft and right subtrees for each sequence
666
+ # edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
667
+ #
668
+ # Returns: updated n.offspring
669
+ get.number.offsprings <- function(node, n.offspring, edge.df){
670
+
671
+ # post order on tree
672
+ sons <- edge.df[edge.df[, 'from'] == node, 2]
673
+
674
+ # if node is a leaf - set number of offsprings to zero
675
+ if (length(sons) == 0)
676
+ return (n.offspring)
677
+
678
+ # sort sons alphabetically - so that the first is always the left son
679
+ sons <- sort(sons)
680
+
681
+ node.ind <- which(n.offspring$node == node)
682
+
683
+ # left subtree
684
+ n.offspring <- get.number.offsprings(sons[1], n.offspring, edge.df)
685
+ n.offspring[node.ind, 'left'] <- n.offspring[n.offspring$node == sons[1], 'left'] +
686
+ n.offspring[n.offspring$node == sons[1], 'right'] + 1
687
+
688
+ if (length(sons) == 1 ) # the other son was cut
689
+ n.offspring[node.ind, 'right'] <- 0
690
+ else{
691
+ # right subtree
692
+ n.offspring <- get.number.offsprings(sons[2], n.offspring, edge.df)
693
+ n.offspring[node.ind, 'right'] <- n.offspring[n.offspring$node == sons[2], 'left'] +
694
+ n.offspring[n.offspring$node == sons[2], 'right'] + 1
695
+ }
696
+
697
+ return (n.offspring)
698
+ }
699
+
700
+ # Analyze mutations between a pair of father-son and compute LONR scores for each mutation
701
+ #
702
+ # Arguments: n.offspring - data frame with size of left and right subtrees for each sequence
703
+ # father - father name
704
+ # father.char - father nucleotide sequence split into characters
705
+ # father.aa.char - father amino acid sequence split into characters
706
+ # son - son name
707
+ # son.side - left or right son of father (alphabetically)
708
+ # son.char - son nucleotide sequence split into characters
709
+ # son.aa.char - son amino acid sequence split into characters
710
+ # mut.pos - mutation positions (nucleotides)
711
+ # mutations - mutation table
712
+ # mutations.ind - index of last mutation inserted in mutations table
713
+ #
714
+ # Returns: list of
715
+ # - updated mutations and mutations.ind
716
+ analyze.mutations <- function(n.offspring, father, father.char, father.aa.char, son, son.side, son.char, son.aa.char, mut.pos, mutations, mutations.ind){
717
+
718
+ flag <- F
719
+ father.ind <- which(n.offspring$node==father)
720
+
721
+ # if at least one son was trimmed - do not analyze mutations
722
+ if (n.offspring[father.ind,'left']==0 | n.offspring[father.ind,'right']==0)
723
+ return(list(mutations, mutations.ind))
724
+
725
+ # compute LONR score
726
+ # log of subtree size where mutation occurred divided by subtree size where no mutation occurred
727
+ if (son.side == 'left')
728
+ lonr <- log(n.offspring[father.ind,'left']/n.offspring[father.ind,'right'])
729
+ else
730
+ lonr <- log(n.offspring[father.ind,'right']/n.offspring[father.ind,'left'])
731
+
732
+ # get son's subtree sizes
733
+ son.left <- n.offspring[n.offspring$node==son, 'left']
734
+ son.right <- n.offspring[n.offspring$node==son, 'right']
735
+ if (son.left != 0 & son.right != 0){
736
+ if (son.left > 2*son.right | 2*son.left < son.right){
737
+ if (son.side == 'left')
738
+ new.lonr <- log((2*min(son.left, son.right))/n.offspring[father.ind,'right'])
739
+ else
740
+ new.lonr <- log((2*min(son.left, son.right))/n.offspring[father.ind,'left'])
741
+ if (lonr/new.lonr < 0) # sign changed - add flag
742
+ flag = T
743
+ }
744
+ }
745
+
746
+ for (j in mut.pos){
747
+ # mutation (nt)
748
+ mutations[mutations.ind,'mutation'] <- paste0(father.char[j],son.char[j])
749
+ # LONR
750
+ mutations[mutations.ind,'LONR'] <- lonr
751
+ # mutation type
752
+ # if mutation occurred in last nucleotides which are not a full codon - do not analyse
753
+ if( length(father.aa.char) >= ceiling(j/3) ){
754
+ if (father.aa.char[ceiling(j/3)] != son.aa.char[ceiling(j/3)])
755
+ mutations[mutations.ind,'mutation.type'] <- 'R'
756
+ else
757
+ mutations[mutations.ind,'mutation.type'] <- 'S'
758
+ }else
759
+ next
760
+ # position (nt)
761
+ mutations[mutations.ind,'position'] <- j
762
+
763
+ # save father and son names
764
+ mutations[mutations.ind,'father'] <- father
765
+ mutations[mutations.ind,'son'] <- son
766
+ mutations[mutations.ind,'flag'] <- flag
767
+
768
+ mutations.ind <- mutations.ind + 1
769
+
770
+ # increase table size if needed
771
+ if (mutations.ind > nrow(mutations))
772
+ mutations <- rbind(mutations, data.frame(mutation = rep('', 1000), LONR = 0, mutation.type = '', position=0, father='', son='', flag=F, stringsAsFactors=F))
773
+ }
774
+
775
+ return(list(mutations, mutations.ind))
776
+ }
777
+
778
+
779
+ # Get mutations between each father-son pair (recursively)
780
+ #
781
+ # Arguments: node - current node for which the number of offsprings
782
+ # edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
783
+ # n.offspring - data frame with size of left and right subtrees for each sequence
784
+ # mutations - mutation table
785
+ # mutations.ind - index of last muation inserted in mutations table
786
+ #
787
+ # Returns: updated n.offspring
788
+ get.mutations <- function(node, edge.df, nameSeq.df, n.offspring, mutations, mutations.ind){
789
+
790
+ sons <- edge.df[edge.df[, 1] == node, 2]
791
+
792
+ # if node is a leaf -
793
+ if (length(sons) == 0)
794
+ return (list(mutations, mutations.ind))
795
+
796
+ sons <- sort(sons)
797
+
798
+ # get father sequence, translate and split into characters
799
+ father.seq <- nameSeq.df[nameSeq.df$head==node, 'seq']
800
+ father.char <- s2c(father.seq)
801
+ father.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(father.seq, 1, nchar(father.seq)-(nchar(father.seq)%%3))), "")),
802
+ numcode = 1, NAstring = "X", ambiguous = FALSE)
803
+
804
+ # get son sequences, translate and split into characters
805
+ son1.seq <- nameSeq.df[nameSeq.df$head==sons[1], 'seq']
806
+ son1.char <- s2c(son1.seq)
807
+ son1.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(son1.seq, 1, nchar(son1.seq)-(nchar(son1.seq)%%3))), "")),
808
+ numcode = 1, NAstring = "X", ambiguous = FALSE)
809
+ # get mutation positions
810
+ mut.pos.son1 <- which(father.char!=son1.char)
811
+
812
+ if (length(sons) > 1){
813
+ # get son sequences, translate and split into characters
814
+ son2.seq <- nameSeq.df[nameSeq.df$head==sons[2], 'seq']
815
+ son2.char <- s2c(son2.seq)
816
+ son2.aa.char <- seqinr::translate(unlist(strsplit(tolower(substr(son2.seq, 1, nchar(son2.seq)-(nchar(son2.seq)%%3))), "")),
817
+ numcode = 1, NAstring = "X", ambiguous = FALSE)
818
+ # get mutation positions
819
+ mut.pos.son2 <- which(father.char!=son2.char)
820
+ # do not analyze mutation in both sons
821
+ mut.pos.son1 <- setdiff(mut.pos.son1,intersect(mut.pos.son1, mut.pos.son2))
822
+ mut.pos.son2 <- setdiff(mut.pos.son2,intersect(mut.pos.son1, mut.pos.son2))
823
+ }
824
+ # analyze mutation in first son
825
+ res<- analyze.mutations(n.offspring, node, father.char, father.aa.char, sons[1], 'left', son1.char, son1.aa.char, mut.pos.son1, mutations, mutations.ind)
826
+ mutations <- res[[1]]
827
+ mutations.ind <- res[[2]]
828
+ # recursive call with sons
829
+ res <- get.mutations(sons[1], edge.df, nameSeq.df, n.offspring, mutations, mutations.ind)
830
+ mutations <- res[[1]]
831
+ mutations.ind <- res[[2]]
832
+
833
+ if (length(sons) > 1){
834
+ # analyze mutation in second son
835
+ res<- analyze.mutations(n.offspring, node, father.char, father.aa.char, sons[2], 'right', son2.char, son2.aa.char, mut.pos.son2, mutations, mutations.ind)
836
+ mutations <- res[[1]]
837
+ mutations.ind <- res[[2]]
838
+ res <- get.mutations(sons[2], edge.df, nameSeq.df, n.offspring, mutations, mutations.ind)
839
+ mutations <- res[[1]]
840
+ mutations.ind <- res[[2]]
841
+ }
842
+ return (list(mutations, mutations.ind))
843
+
844
+ }
845
+
846
+ # Calculate subtree sizes for each internal node, find mutations and compute LONR scores
847
+ #
848
+ # Arguments:
849
+ # nameSeq.df - data frame of sequences and headers
850
+ # edge.df - data frame with columns - parent(from), child(to), weight, distance(nt)
851
+ # outgroup - outgroup sequence name (optional)
852
+ #
853
+ # Returns: mutations - mutation table
854
+ compute.sub.trees <- function(nameSeq.df, edge.df, outgroup = NULL){
855
+
856
+ # remove outgroup if exists
857
+ if (!is.null(outgroup)) {
858
+ nameSeq.df <- nameSeq.df[-which(nameSeq.df$head == outgroup), ] # remove from sequence list
859
+ edge.df <- edge.df[-which(edge.df[, 'to'] == outgroup), ] # remove from edge table
860
+ }
861
+ n.seq <- length(nameSeq.df)
862
+
863
+ # find all subbtree roots
864
+ roots <- as.character(unique(edge.df[!(edge.df[, 1] %in% edge.df[, 'to']), 1]))
865
+ treesIDs <- unique(nameSeq.df$treeID)
866
+ all.mutations <- data.frame()
867
+ for (i in treesIDs){
868
+ sub.tree <- nameSeq.df[nameSeq.df$treeID==i,]
869
+ curr.root <- as.character(roots[roots%in%sub.tree$head])
870
+
871
+ # calculate the size of left and right subtrees for each node
872
+ n.offspring <- data.frame(node = sub.tree$head, left = 0, right = 0, stringsAsFactors=F)
873
+ n.offspring <- get.number.offsprings(curr.root, n.offspring, edge.df)
874
+
875
+ # get mutations between each father-son pair
876
+ mutations <- data.frame(mutation = rep('', 1000), LONR = 0, mutation.type = '', position=0, father='', son='', flag=F, stringsAsFactors=F)
877
+ res <- get.mutations(curr.root, edge.df, sub.tree, n.offspring, mutations, 1)
878
+
879
+ # remove end of table if not used
880
+ mutations <- res[[1]]
881
+ ind <- res[[2]]
882
+ if (nrow(mutations) > ind )
883
+ mutations <- mutations[-(ind:nrow(mutations)), ]
884
+
885
+ all.mutations <- rbind(all.mutations, mutations)
886
+ }
887
+ return(all.mutations)
888
+
889
+ }
890
+
891
+ # Compute consensus sequence and remove positions with gaps
892
+ remove.gaps <- function(infile){
893
+
894
+ # read FASTA file in alignment object
895
+ aligned.seq <-read.alignment(infile, format='fasta', forceToLower = F)
896
+
897
+ # comnpute consensus sequence
898
+ consensus.seq <- consensus(aligned.seq, method = "majority")
899
+
900
+ # remove columns containing gaps in consensus
901
+ gapped.pos <- rev(which(consensus.seq == '-'))
902
+ for (pos in gapped.pos)
903
+ aligned.seq[['seq']] <- paste0(substr(aligned.seq[['seq']], 1, pos-1), substr(aligned.seq[['seq']], pos+1, nchar(aligned.seq[['seq']])[1]))
904
+
905
+ # convert sequences and headers into list
906
+ fasta.df <- data.frame(head=aligned.seq[['nam']], seq = aligned.seq[['seq']], stringsAsFactors = F)
907
+
908
+ return (fasta.df)
909
+ }
910
+
911
+ # Dived trees into subtrees by cutting branches with more mutations than specified by cutoff
912
+ #
913
+ # Arguments: nameSeq.df - data frame with headers and sequences
914
+ # edge.df - data frame with columns - parent(from), child(to), edge weight (weight), edge length (distance in nt)
915
+ # cutoff - number of mutation (default - 10)
916
+ #
917
+ # Returns: modified nameSeq.df - new column treeID, specifying sub tree
918
+ # modified edge.df - without long branches
919
+ cut.trees <- function(nameSeq.df,edge.df,cutoff){
920
+
921
+ # add column for tree ID
922
+ nameSeq.df$treeID <- -1
923
+
924
+ # Cut edges longer than threshold
925
+ trim.edge.df <- edge.df[edge.df$distance<=cutoff,]
926
+
927
+ # Get roots of all subtrees
928
+ sub.roots <- setdiff(trim.edge.df$from, trim.edge.df$to)
929
+ tree.id = 1
930
+ # For each root, get subtree edges and nodes
931
+ trees <- list()
932
+ for(root in sub.roots) {
933
+ at.bottom <- F
934
+ nodes <- nameSeq.df[nameSeq.df$head==root,]
935
+ # Trace down tree until at all leaves
936
+ while(at.bottom != T) {
937
+ # Add children of all nodes in tree thus far
938
+ new.nodes <- unique(rbind(nodes, nameSeq.df[nameSeq.df$head %in% trim.edge.df$to[trim.edge.df$from %in% nodes$head],]))
939
+
940
+ # If no children are to be added (tree is complete)
941
+ if(nrow(nodes) == nrow(new.nodes)) {
942
+ nodes <- new.nodes
943
+ # delete internal sequences
944
+ #nodes <- nodes[nodes$head %in% nodes$head[grep('^L', nodes$head, perl=T, fixed=F)],]
945
+ if(nrow(nodes)!=0){
946
+ nameSeq.df[nameSeq.df$head %in% nodes$head,'treeID'] <- tree.id
947
+ tree.id <- tree.id + 1
948
+ }
949
+ at.bottom <- T
950
+ }else{
951
+ nodes <- new.nodes
952
+ }
953
+ }
954
+ }
955
+ sing <- which(nameSeq.df$treeID==-1)
956
+ if (length(sing) > 0 )
957
+ nameSeq.df <- nameSeq.df[-sing,]
958
+
959
+ return(list(nameSeq.df, trim.edge.df))
960
+ }
961
+
962
+ # make sure the dirs have trailing slashes (all the path manipulation assumes they do)
963
+ check.dirs <- function(dirname) {
964
+ if(nchar(dirname) == 0)
965
+ stop('unexpected zero length dir name')
966
+
967
+ last.char = substr(dirname, nchar(dirname), nchar(dirname) + 1)
968
+ if(last.char != '/') {
969
+ print(paste0('note: directory names must have trailing slashes, adding to ', dirname))
970
+ dirname = paste0(dirname, '/')
971
+ }
972
+
973
+ return(dirname)
974
+ }
975
+
976
+ # MAIN function - Builds lineage tree, find mutations within the tree and compute LONR scores
977
+ #
978
+ # Arguments:
979
+ # method - dnapars or neighbor
980
+ # infile - input fasta file
981
+ # workdir - temporary working directory
982
+ # outgroup - outgroup sequence name (optional)
983
+ compute.LONR <- function(method, infile, workdir, outgroup=NULL, existing.edgefile=NULL, existing.node.seqfile=NULL, cutoff=10){
984
+ workdir = check.dirs(workdir)
985
+
986
+ # remove gaps in consensus
987
+ fasta.df <- remove.gaps(infile)
988
+
989
+ # fail on small or huge files
990
+ n.seq <- nrow(fasta.df)
991
+ if(n.seq < MIN.SEQ)
992
+ stop('Not enough sequences to make tree with Phylip')
993
+ if(n.seq > MAX.SEQ)
994
+ stop('Too many sequences to make tree with Phylip')
995
+
996
+ #------------------------------------
997
+ # PART I - Build lineage tree
998
+ #------------------------------------
999
+ res <- build.trees(method, fasta.df, workdir, outgroup, existing.edgefile, existing.node.seqfile)
1000
+
1001
+ if (is.null(res))
1002
+ return(F)
1003
+ nameSeq.df <- res[[1]]
1004
+ edge.df <- res[[2]]
1005
+
1006
+ #------------------------------------
1007
+ # PART II - Compute LONR scores
1008
+ #------------------------------------
1009
+ res <- cut.trees(nameSeq.df,edge.df,cutoff)
1010
+ nameSeq.df <- res[[1]]
1011
+ edge.df <- res[[2]]
1012
+
1013
+
1014
+ LONR.table <- compute.sub.trees(nameSeq.df, edge.df, outgroup)
1015
+
1016
+ # write lonr output to csv
1017
+ write.table(LONR.table, file=paste0(workdir, G.lonrfname), quote=F, sep=',', col.names=T, row.names=F)
1018
+
1019
+ file.remove(paste0(workdir, G.phy.infname))
1020
+ }