viral_seq 1.0.13 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/viral_seq.rb CHANGED
@@ -37,6 +37,10 @@ require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
38
  require_relative "viral_seq/tcs_core"
39
39
  require_relative "viral_seq/tcs_json"
40
-
40
+ require_relative "viral_seq/tcs_dr"
41
+ require_relative "viral_seq/sdrm"
42
+ require_relative "viral_seq/recency"
41
43
 
42
44
  require "muscle_bio"
45
+ require "json"
46
+ require "securerandom"
@@ -1,11 +1,41 @@
1
1
  module ViralSeq
2
2
 
3
3
  # array for all amino acid one letter abbreviations
4
-
5
4
  AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
6
5
 
7
- SDRM_HIV_PR_LIST = {}
8
- SDRM_HIV_RT_LIST = {}
9
- SDRM_HIV_IN_LIST = {}
10
-
6
+ # R script for tcs_sdrm script
7
+
8
+ R_SCRIPT = 'setwd("PATH_TO_FASTA")
9
+ library(phangorn)
10
+ library(ape)
11
+ library(ggplot2)
12
+ library(scales)
13
+ library(ggforce)
14
+ library(cowplot)
15
+ library(magrittr)
16
+ library(gridExtra)
17
+ pdf("OUTPUT_PDF", onefile=T, width=11, height=8.5)
18
+ fileNames <- list.files()
19
+ for (fileName in fileNames) {
20
+ dna <- read.dna(fileName, format="fasta")
21
+ class(dna)
22
+ D<- dist.dna(dna, model="raw")
23
+ pi <- mean(D)
24
+ dist20 <- quantile(D, prob=c(0.20))
25
+ alldist <- data.frame(File=fileName, pi, dist20)
26
+ write.table(alldist,"OUTPUT_CSV",append=TRUE, sep = ",", row.names = FALSE, col.names=FALSE)
27
+ D2 <- dist.dna(dna, model="TN93")*100
28
+ def.par <- par(no.readonly = TRUE)
29
+ par(mfrow=c(1,2))
30
+ hist<-hist(D, main=fileName, xlab="% Pairwise Distance", ylab="Frequency", col="gray")
31
+ abline(v=dist20, col="royalblue",lwd=2)
32
+ abline(v=pi, col="red", lwd=2)
33
+ legend(x="topright", c("dist20", "pi"), col = c("royalblue", "red"), lwd = c(2,2), cex=0.5)
34
+ njtree<-NJ(D2)
35
+ njtreeplot <- plot(njtree, show.tip.label=F, "unrooted", main=fileName)
36
+ add.scale.bar(cex=0.7, font=2, col="red")
37
+ }
38
+ dev.off()'
39
+
40
+
11
41
  end
@@ -1,6 +1,6 @@
1
1
 
2
2
  module ViralSeq
3
- class SDRM
3
+ class SeqHash
4
4
 
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
@@ -39,8 +39,9 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = File.join(temp_dir, "_temp_muscle_in")
43
- temp_aln = File.join(temp_dir, "_temp_muscle_aln")
42
+ temp_name = "_" + SecureRandom.alphanumeric
43
+ temp_file = File.join(temp_dir, temp_name)
44
+ temp_aln = File.join(temp_dir, (temp_name + "_aln"))
44
45
  name = ">test"
45
46
  temp_in = File.open(temp_file,"w")
46
47
  temp_in.puts ">ref"
@@ -0,0 +1,52 @@
1
module ViralSeq

  # Recency prediction based on HIV MPID-NGS sequence diversity.
  # @see https://pubmed.ncbi.nlm.nih.gov/32663847 Ref: Zhou et al. J Infect Dis. 2021

  module Recency

    # Classify an infection from the TCS counts, pairwise diversity (pi) and
    # dist20 measured at the RT and V1V3 regions. A region is only used when it
    # has at least 3 TCS and a pi value; which regions are usable decides which
    # set of cutoffs is applied.
    # @param tcs_RT [Integer, nil] number of TCS at the RT region (nil counts as 0)
    # @param tcs_V1V3 [Integer, nil] number of TCS at the V1V3 region (nil counts as 0)
    # @param pi_RT [Float, nil] pairwise diversity at the RT region
    # @param pi_V1V3 [Float, nil] pairwise diversity at the V1V3 region
    # @param dist20_RT [Float, nil] dist20 at the RT region
    # @param dist20_V1V3 [Float, nil] dist20 at the V1V3 region
    # @return [String] determination of the recency: "recent", "chronic",
    #   "indeterminant" or "insufficient data"
    # @raise [ArgumentError] if pi reaches the cutoff but a dist20 value that is
    #   needed to finish the call was not supplied

    def self.define(tcs_RT: nil,
                    tcs_V1V3: nil,
                    pi_RT: nil,
                    dist20_RT: nil,
                    pi_V1V3: nil,
                    dist20_V1V3: nil)
      tcs_RT ||= 0
      tcs_V1V3 ||= 0
      rt_usable = tcs_RT >= 3 && pi_RT
      v1v3_usable = tcs_V1V3 >= 3 && pi_V1V3

      if rt_usable && v1v3_usable
        # both regions available: cutoffs apply to the summed values
        pi_sum = pi_RT + pi_V1V3
        if pi_sum < 0.0103
          "recent"
        elsif pi_sum >= 0.0103 &&
              (required_dist20(:dist20_RT, dist20_RT) + required_dist20(:dist20_V1V3, dist20_V1V3)) >= 0.006
          "chronic"
        else
          "indeterminant"
        end
      elsif rt_usable && tcs_V1V3 < 3
        # RT only
        if pi_RT < 0.0021
          "recent"
        elsif pi_RT >= 0.0021 && required_dist20(:dist20_RT, dist20_RT) >= 0.001
          "chronic"
        else
          "indeterminant"
        end
      elsif v1v3_usable
        # V1V3 only: this branch can confirm "chronic" but never calls "recent"
        if pi_V1V3 >= 0.0103 && required_dist20(:dist20_V1V3, dist20_V1V3) >= 0.006
          "chronic"
        else
          "insufficient data"
        end
      else
        "insufficient data"
      end
    end

    # Return +value+, or raise a descriptive error when a dist20 input that the
    # decision actually depends on is missing.
    def self.required_dist20(name, value)
      raise ArgumentError, "#{name} is required when pairwise diversity is at or above the cutoff" if value.nil?
      value
    end
    private_class_method :required_dist20
  end
end
@@ -1,43 +1,109 @@
1
1
  module ViralSeq
2
2
  class DRMs
3
- def initialize (mutation_list = {})
4
- @mutation_list = mutation_list
5
- end
6
-
7
- attr_accessor :mutation_list
8
- end
3
+ class << self
9
4
 
10
- def self.sdrm_hiv_pr(seq_hash)
11
- end
12
-
13
- def self.sdrm_hiv_rt(seq_hash)
14
- end
5
# Look up the surveillance drug resistance mutation (SDRM) table for one target.
# A fresh Hash is built on every call.
# @param options [Symbol] which table to return; supported values are
#   `:hcv_ns5a`, `:nrti`, `:nnrti`, `:hiv_pr` and `:hiv_in`
# @return [Hash] Hash of position_number => ['wildtype_codon', ['mutation_codons']]
# @raise [RuntimeError] if `options` is not one of the supported values
def sdrm_hash(options)
  case options
  when :hcv_ns5a
    {
      28  => ['M', %w[T]],
      30  => ['L', %w[H K R Q A S D]],
      31  => ['L', %w[M V F]],
      32  => ['P', %w[L]],
      44  => ['K', %w[R]],
      58  => ['H', %w[D P S]],
      64  => ['T', %w[A S]],
      77  => ['P', %w[A S]],
      78  => ['R', %w[K]],
      79  => ['T', %w[A]],
      83  => ['T', %w[M]],
      85  => ['S', %w[N H Y]],
      92  => ['A', %w[P T K E]],
      93  => ['Y', %w[C F H N]],
      107 => ['K', %w[T S]],
      121 => ['I', %w[V]],
      135 => ['T', %w[A]]
    }
  when :nrti
    {
      41  => ['M', %w[L]],
      65  => ['K', %w[R]],
      67  => ['D', %w[N G E]],
      69  => ['T', %w[D]],
      70  => ['K', %w[R E]],
      74  => ['L', %w[V I]],
      75  => ['V', %w[M T A S]],
      77  => ['F', %w[L]],
      115 => ['Y', %w[F]],
      116 => ['F', %w[Y]],
      151 => ['Q', %w[M]],
      184 => ['M', %w[V I]],
      210 => ['L', %w[W]],
      215 => ['T', %w[Y F I C D V E]],
      219 => ['K', %w[Q E N R]]
    }
  when :nnrti
    {
      100 => ['L', %w[I]],
      101 => ['K', %w[E P]],
      103 => ['K', %w[N S]],
      106 => ['V', %w[M A]],
      179 => ['V', %w[F D]],
      181 => ['Y', %w[C I V]],
      188 => ['Y', %w[L H C]],
      190 => ['G', %w[A S E]],
      225 => ['P', %w[H]],
      230 => ['M', %w[L]]
    }
  when :hiv_pr
    {
      23 => ['L', %w[I]],
      24 => ['L', %w[I]],
      30 => ['D', %w[N]],
      32 => ['V', %w[I]],
      46 => ['M', %w[I L]],
      47 => ['I', %w[V A]],
      48 => ['G', %w[V M]],
      50 => ['I', %w[V L]],
      53 => ['F', %w[L]],
      54 => ['I', %w[V L M T A S]],
      73 => ['G', %w[S T C A]],
      76 => ['L', %w[V]],
      82 => ['V', %w[A T S F L C M]],
      83 => ['N', %w[D]],
      84 => ['I', %w[V A C]],
      88 => ['N', %w[D S]],
      90 => ['L', %w[M]]
    }
  when :hiv_in
    {
      66  => ['T', %w[A I K]],
      74  => ['L', %w[M]],
      92  => ['E', %w[Q]],
      95  => ['Q', %w[K]],
      97  => ['T', %w[A]],
      121 => ['F', %w[Y]],
      140 => ['G', %w[A S C]],
      143 => ['Y', %w[C H R]],
      147 => ['S', %w[G]],
      148 => ['Q', %w[H K R]],
      155 => ['N', %w[S H]]
    }
  else
    raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
  end
end # end of #sdrm_hash
15
90
 
16
- def self.sdrm_hiv_in(seq_hash)
17
- end
18
-
19
- def self.list_from_json(file)
20
- end
21
-
22
- def self.list_from_csv(file)
23
- end
24
-
25
- def self.export_list_hiv_pr(file, format = :json)
26
- if foramt == :json
91
# Convert an SDRM table into an Array of Hashes suitable for JSON export.
# @param options [Symbol] passed unchanged to ViralSeq::DRMs.sdrm_hash
# @return [Array] one Hash per SDRM position with the keys
#   :position, :wildtypeCodon and :mutationCodons

def sdrm_json(options)
  ViralSeq::DRMs.sdrm_hash(options).map do |position, (wildtype, mutations)|
    {
      position: position,
      wildtypeCodon: wildtype,
      mutationCodons: mutations
    }
  end
end
28
107
  end
29
108
  end
30
-
31
- def self.export_list_hiv_rt(file, format = :json)
32
-
33
- end
34
-
35
- def self.export_list_hiv_in(file, format = :json)
36
-
37
- end
38
-
39
- def drm_analysis(seq_hash)
40
- mutation_list = self.mutation_list
41
-
42
- end
43
109
  end
@@ -11,7 +11,7 @@ module ViralSeq
11
11
  # # filter nt sequences with the reference coordinates
12
12
  # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
- # filtered_seqhash = filtered_seqhash.a3g[1]
14
+ # filtered_seqhash = filtered_seqhash.a3g[:filtered_seq]
15
15
  # # further filter out sequences with A3G hypermutations
16
16
  # filtered_seqhash.pi
17
17
  # # return pairwise diversity π
@@ -187,6 +187,25 @@ module ViralSeq
187
187
  return new_seqhash
188
188
  end
189
189
 
190
# Randomly pick sequences from this SeqHash and return them as a new SeqHash.
# The new title is the current title followed by "_sampled_" and `n`; the
# file attribute is carried over unchanged.
# @param n [Integer] number of sequences to sample
# @return [ViralSeq::SeqHash] sampled SeqHash

def sample(n = 1)
  picked = dna_hash.keys.sample(n)
  # project one of the parallel hashes onto the sampled names
  subset = ->(source) { picked.map { |name| [name, source[name]] }.to_h }

  ViralSeq::SeqHash.new(
    subset.call(dna_hash),
    subset.call(aa_hash),
    subset.call(qc_hash),
    title + "_sampled_" + n.to_s,
    file
  )
end
208
+
190
209
  # write the nt sequences to a FASTA format file
191
210
  # @param file [String] path to the FASTA output file
192
211
  # @return [NilClass]
@@ -394,7 +413,6 @@ module ViralSeq
394
413
  end
395
414
  end
396
415
  end
397
-
398
416
  consensus_seq += call_consensus_base(max_base_list)
399
417
  end
400
418
  return consensus_seq
@@ -583,8 +601,8 @@ module ViralSeq
583
601
  temp_dir=File.dirname($0)
584
602
  end
585
603
 
586
- temp_file = temp_dir + "/_temp_muscle_in"
587
- temp_aln = temp_dir + "/_temp_muscle_aln"
604
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
605
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
588
606
  File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
589
607
  if path_to_muscle
590
608
  unless ViralSeq.check_muscle?(path_to_muscle)
@@ -742,6 +760,7 @@ module ViralSeq
742
760
  seq_hash_unique_pass = []
743
761
 
744
762
  seq_hash_unique.each do |seq|
763
+ next if seq.nil?
745
764
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
765
  next unless loc # if locator tool fails, skip this seq.
747
766
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
@@ -808,7 +827,7 @@ module ViralSeq
808
827
  end # end of locator
809
828
  alias_method :loc, :sequence_locator
810
829
 
811
- # Remove squences with residual offspring Primer IDs.
830
+ # Remove sequences with residual offspring Primer IDs.
812
831
  # Compare PID with sequences which have identical sequences.
813
832
  # PIDs that differ by 1 base will be recognized. If PID1 is x times (cutoff) greater than PID2, PID2 will be discarded.
814
833
  # each sequence tag starting with ">" and the Primer ID sequence
@@ -1155,6 +1174,7 @@ module ViralSeq
1155
1174
  new_sh.aa_hash[k] = aa_hash[k]
1156
1175
  new_sh.qc_hash[k] = qc_hash[k]
1157
1176
  end
1177
+ new_sh.file = self.file
1158
1178
  new_sh.title = self.title + "_" + n.to_s
1159
1179
  return new_sh
1160
1180
  end
@@ -110,19 +110,21 @@ module ViralSeq
110
110
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
111
111
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
112
112
  joined_seq = {}
113
- seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
113
+ seq_pair_hash.each do |seq_name,seq_pair|
114
114
  r1_seq = seq_pair[0]
115
115
  r2_seq = seq_pair[1]
116
116
  if overlap.zero?
117
117
  joined_sequence = r1_seq + r2_seq
118
+ elsif diff.zero?
119
+ if r1_seq[-overlap..-1] == r2_seq[0,overlap]
120
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
121
+ end
118
122
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
119
123
  joined_sequence= r1_seq + r2_seq[overlap..-1]
120
124
  else
121
125
  next
122
126
  end
123
- seq_names.each do |seq_name|
124
- joined_seq[seq_name] = joined_sequence
125
- end
127
+ joined_seq[seq_name] = joined_sequence if joined_sequence
126
128
  end
127
129
 
128
130
  joined_seq_hash = ViralSeq::SeqHash.new
@@ -113,7 +113,7 @@ module ViralSeq
113
113
  def sdrm(option, start_aa = 1)
114
114
  aa_array = self.aa_array
115
115
  out_hash = {}
116
- sdrm = sdrm_hash(option)
116
+ sdrm = ViralSeq::DRMs.sdrm_hash(option)
117
117
  aa_length = aa_array.size
118
118
  end_aa = start_aa + aa_length - 1
119
119
  (start_aa..end_aa).each do |position|
@@ -535,88 +535,5 @@ module ViralSeq
535
535
  return aa_out
536
536
  end # end of #amino_acid_2
537
537
 
538
- # sdrm position hash
539
- def sdrm_hash(options)
540
- sdrm = {}
541
- case options
542
- when :hcv_ns5a
543
- sdrm[28] = ['M',['T']]
544
- sdrm[30] = ['L',['H','K','R','Q','A','S','D']]
545
- sdrm[31] = ['L',['M','V','F']]
546
- sdrm[32] = ['P',['L']]
547
- sdrm[44] = ['K',['R']]
548
- sdrm[58] = ['H',['D','P','S']]
549
- sdrm[64] = ['T',['A','S']]
550
- sdrm[77] = ['P',['A','S']]
551
- sdrm[78] = ['R',['K']]
552
- sdrm[79] = ['T',['A']]
553
- sdrm[83] = ['T',['M']]
554
- sdrm[85] = ['S',['N','H','Y']]
555
- sdrm[92] = ['A',['P','T','K','E']]
556
- sdrm[93] = ['Y',['C','F','H','N']]
557
- sdrm[107] = ['K',['T','S']]
558
- sdrm[121] = ['I',['V']]
559
- sdrm[135] = ['T',['A']]
560
- when :nrti
561
- sdrm[41] = ['M',['L']]
562
- sdrm[65] = ['K',['R']]
563
- sdrm[67] = ['D',['N','G','E']]
564
- sdrm[69] = ['T',['D']]
565
- sdrm[70] = ['K',['R','E']]
566
- sdrm[74] = ['L',['V','I']]
567
- sdrm[75] = ['V',['M','T','A','S']]
568
- sdrm[77] = ['F',['L']]
569
- sdrm[115] = ['Y',['F']]
570
- sdrm[116] = ['F',['Y']]
571
- sdrm[151] = ['Q',['M']]
572
- sdrm[184] = ['M',['V','I']]
573
- sdrm[210] = ['L',['W']]
574
- sdrm[215] = ["T",["Y","F","I","C","D","V","E"]]
575
- sdrm[219] = ["K",["Q","E","N","R"]]
576
- when :nnrti
577
- sdrm[100] = ['L',['I']]
578
- sdrm[101] = ['K',['E','P']]
579
- sdrm[103] = ['K',['N','S']]
580
- sdrm[106] = ['V',['M','A']]
581
- sdrm[179] = ['V',['F','D']]
582
- sdrm[181] = ['Y',['C','I','V']]
583
- sdrm[188] = ['Y',['L','H','C']]
584
- sdrm[190] = ['G',['A','S','E']]
585
- sdrm[225] = ['P',['H']]
586
- sdrm[230] = ['M',['L']]
587
- when :hiv_pr
588
- sdrm[23] = ['L',['I']]
589
- sdrm[24] = ['L',['I']]
590
- sdrm[30] = ['D',['N']]
591
- sdrm[32] = ['V',['I']]
592
- sdrm[46] = ['M',['I','L']]
593
- sdrm[47] = ['I',['V','A']]
594
- sdrm[48] = ['G',['V','M']]
595
- sdrm[50] = ['I',['V','L']]
596
- sdrm[53] = ['F',['L']]
597
- sdrm[54] = ['I',['V','L','M','T','A','S']]
598
- sdrm[73] = ['G',['S','T','C','A']]
599
- sdrm[76] = ['L',['V']]
600
- sdrm[82] = ['V',['A','T','S','F','L','C','M']]
601
- sdrm[83] = ['N',['D']]
602
- sdrm[84] = ['I',['V','A','C']]
603
- sdrm[88] = ['N',['D','S']]
604
- sdrm[90] = ['L',['M']]
605
- when :hiv_in
606
- sdrm[66] = ['T',['A','I','K']]
607
- sdrm[74] = ['L',['M']]
608
- sdrm[92] = ['E',['Q']]
609
- sdrm[95] = ['Q',['K']]
610
- sdrm[97] = ['T',['A']]
611
- sdrm[121] = ['F',['Y']]
612
- sdrm[140] = ['G',['A','S','C']]
613
- sdrm[143] = ["Y",["C","H","R"]]
614
- sdrm[147] = ['S',['G']]
615
- sdrm[148] = ['Q',['H','K','R']]
616
- sdrm[155] = ['N',['S','H']]
617
- else raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
618
- end
619
- return sdrm
620
- end
621
538
  end # end of ViralSeq::Sequence
622
539
  end # end of ViralSeq