viral_seq 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,652 @@
1
+ # viral_seq/sdrm_core.rb
2
+ # core functions for HIV SDRM analysis using MPID-DR protocol.
3
+ # More details for HIV Surveillance Drug Resistance Mutation (SDRM) can be found at
4
+ # https://hivdb.stanford.edu/pages/surveillance.html
5
+
6
+ # Including methods as:
7
+ # ViralSeq::sdrm_nrti
8
+ # ViralSeq::sdrm_nnrti
9
+ # ViralSeq::hiv_protease
10
+ # ViralSeq::sdrm_int
11
+ # ViralSeq::sdrm_pr_bulk
12
+ # ViralSeq::sdrm_rt_bulk
13
+ # ViralSeq::sdrm_in_bulk
14
+
15
+ # ViralSeq.sdrm_nrti(aa_arry, start_aa)
16
+ # ViralSeq.sdrm_nnrti(aa_arry, start_aa)
17
+ # ViralSeq.hiv_protease(aa_arry, start_aa)
18
+ # ViralSeq.sdrm_int(aa_arry, start_aa)
19
+ # # functions to identify SDRMs from a given sequence in an Array object
20
+ # # function names indicate which HIV drug resistance mutations it can identify
21
+ # # input an Array object for amino acid sequence ['A', 'M', 'L', ...]
22
+ # # start_aa is an Integer to indicate codon number of the 1st amino acid sequence in the input aa_array
23
+ # # return a Hash object for SDRMs identified. {:position =>[:wildtype_codon, :mutation_codon]}
24
+
25
+ # ViralSeq.sdrm_pr_bulk(sequence_hash, minority_cut_off)
26
+ # ViralSeq.sdrm_rt_bulk(sequence_hash, minority_cut_off)
27
+ # ViralSeq.sdrm_in_bulk(sequence_hash, minority_cut_off)
28
+ # # functions to identify SDRMs from a sequence hash object.
29
+ # # name of the functions indicate which region it works on
30
+ # # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
31
+ # # PR codon 1-99
32
+ # # RT codon 34-122, 152-236, two regions are linked
33
+ # # IN codon 53-174
34
+ # # sequence_hash is a Hash object of sequences {:name => :sequence, ...}
35
+ # # sequences usually need to be QCed (remove sequences with stop codon and a3g hypermutations) first
36
+ # # minority_cut_off is the Integer cut-off for minimal abundance of a mutation to be called as valid mutation
37
+ # # minority_cut_off can be obtained using ViralSeq::poisson_minority_cutoff function
38
+ # # return [point_mutation_list, linkage_list, report_list]
39
+ # =USAGE
40
+ # # example (example files from ID:VS053118-0566)
41
+ # sequence = ViralSeq.fasta_to_hash('spec/sample_files/sample_dr_sequences/pr.fasta')
42
+ # p_cut_off = ViralSeq.poisson_minority_cutoff(sequence)
43
+ # pr_sdrm = ViralSeq.sdrm_pr_bulk(sequence, p_cut_off)
44
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"
45
+ # pr_sdrm[0].each {|n| puts n.join(',')}
46
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
47
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
48
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
49
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
50
+ #
51
+ # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"
52
+ # pr_sdrm[1].each {|n| puts n.join(',')}
53
+ # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
54
+ # => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
55
+ # => PR,396,WT,149,0.37626,0.32837,0.42602,
56
+ # => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
57
+ # => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
58
+ #
59
+ # puts "region,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(",")
60
+ # pr_sdrm[2].each {|n|puts n.join(",")}
61
+ # => region,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
62
+ # => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63
+ # => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64
+ # => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65
+ # => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
66
+ # => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67
+ # => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
68
+ # => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69
+ # => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
70
+ # => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71
+ # => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72
+ # => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
73
+ # => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
74
+ # => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
75
+ # => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76
+ # => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
77
+ # => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78
+ # => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79
+ # => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
80
+ # => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81
+ # => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82
+ # => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83
+ # => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84
+ # => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85
+ # => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86
+ # => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87
+ # => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
88
+ # => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89
+ # => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90
+ # => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91
+ # => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92
+ # => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
93
+ # => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
94
+ # => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
95
+ # => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96
+ # => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97
+ # => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
98
+ # => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99
+ # => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100
+ # => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
101
+ # => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102
+ # => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
103
+ # => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
104
+ # => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105
+ # => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106
+ # => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107
+ # => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108
+ # => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109
+ # => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110
+ # => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111
+ # => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
112
+ # => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113
+ # => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114
+ # => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115
+ # => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116
+ # => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117
+ # => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
118
+ # => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
119
+ # => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120
+ # => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
121
+ # => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122
+ # => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123
+ # => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124
+ # => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125
+ # => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126
+ # => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127
+ # => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128
+ # => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129
+ # => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130
+ # => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131
+ # => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132
+ # => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
133
+ # => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
134
+ # => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135
+ # => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
136
+ # => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
137
+ # => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
+ # => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139
+ # => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140
+ # => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141
+ # => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
142
+ # => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143
+ # => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
144
+ # => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
145
+ # => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146
+ # => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147
+ # => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148
+ # => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
149
+ # => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150
+ # => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151
+ # => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152
+ # => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
153
+ # => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154
+ # => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155
+ # => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156
+ # => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157
+ # => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
158
+ # => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159
+ # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
160
+ # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161
+
162
+
163
+
164
+ module ViralSeq
165
+
166
# Identify NRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid array.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            that is not exactly one character long is split on "/" and
#            treated as a list of alternative residues (presumably how
#            ambiguous calls are encoded upstream -- confirm against the
#            translation code).
# start_aa - Integer codon number of the first element of aa_array
#            (default 1).
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding one entry
# per SDRM position covered by aa_array at which a listed mutation is
# observed. Keys are inserted in ascending position order.
def self.sdrm_nrti(aa_array,start_aa=1)
  # position => [wild-type residue, [SDRM residues]], in ascending order
  sdrm = {
    41  => ['M', ['L']],
    65  => ['K', ['R']],
    67  => ['D', ['N', 'G', 'E']],
    69  => ['T', ['D']],
    70  => ['K', ['R', 'E']],
    74  => ['L', ['V', 'I']],
    75  => ['V', ['M', 'T', 'A', 'S']],
    77  => ['F', ['L']],
    115 => ['Y', ['F']],
    116 => ['F', ['Y']],
    151 => ['Q', ['M']],
    184 => ['M', ['V', 'I']],
    210 => ['L', ['W']],
    215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
    219 => ['K', ['Q', 'E', 'N', 'R']]
  }
  end_aa = start_aa + aa_array.size - 1
  out_hash = {}
  sdrm.each do |position, (wt_aa, mutations)|
    # skip SDRM positions that aa_array does not cover
    next unless position.between?(start_aa, end_aa)

    test_aa = aa_array[position - start_aa]
    resistant =
      if test_aa.size == 1
        # the wild type is never in the mutation list, so membership suffices
        mutations.include?(test_aa)
      else
        # Array#& always returns an Array (truthy even when empty), so the
        # intersection must be tested for emptiness explicitly
        !(test_aa.split("/") & mutations).empty?
      end
    out_hash[position] = [wt_aa, test_aa] if resistant
  end
  out_hash
end
209
+
210
# Identify NNRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid array.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            that is not exactly one character long is split on "/" and
#            treated as a list of alternative residues (presumably how
#            ambiguous calls are encoded upstream -- confirm against the
#            translation code).
# start_aa - Integer codon number of the first element of aa_array
#            (default 1).
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding one entry
# per SDRM position covered by aa_array at which a listed mutation is
# observed. Keys are inserted in ascending position order.
def self.sdrm_nnrti(aa_array,start_aa=1)
  # position => [wild-type residue, [SDRM residues]], in ascending order
  sdrm = {
    100 => ['L', ['I']],
    101 => ['K', ['E', 'P']],
    103 => ['K', ['N', 'S']],
    106 => ['V', ['M', 'A']],
    179 => ['V', ['F', 'D']],
    181 => ['Y', ['C', 'I', 'V']],
    188 => ['Y', ['L', 'H', 'C']],
    190 => ['G', ['A', 'S', 'E']],
    225 => ['P', ['H']],
    230 => ['M', ['L']]
  }
  end_aa = start_aa + aa_array.size - 1
  out_hash = {}
  sdrm.each do |position, (wt_aa, mutations)|
    # skip SDRM positions that aa_array does not cover
    next unless position.between?(start_aa, end_aa)

    test_aa = aa_array[position - start_aa]
    resistant =
      if test_aa.size == 1
        # the wild type is never in the mutation list, so membership suffices
        mutations.include?(test_aa)
      else
        # Array#& always returns an Array (truthy even when empty), so the
        # intersection must be tested for emptiness explicitly
        !(test_aa.split("/") & mutations).empty?
      end
    out_hash[position] = [wt_aa, test_aa] if resistant
  end
  out_hash
end
247
+
248
# Identify HIV protease surveillance drug resistance mutations (SDRMs) in
# an amino acid array.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            that is not exactly one character long is split on "/" and
#            treated as a list of alternative residues (presumably how
#            ambiguous calls are encoded upstream -- confirm against the
#            translation code).
# start_aa - Integer codon number of the first element of aa_array
#            (default 1).
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding one entry
# per SDRM position covered by aa_array at which a listed mutation is
# observed. Keys are inserted in ascending position order.
def self.hiv_protease(aa_array,start_aa=1)
  # position => [wild-type residue, [SDRM residues]], in ascending order
  sdrm = {
    23 => ['L', ['I']],
    24 => ['L', ['I']],
    30 => ['D', ['N']],
    32 => ['V', ['I']],
    46 => ['M', ['I', 'L', 'V']], # M46V not on the SDRM list but we still include it.
    47 => ['I', ['V', 'A']],
    48 => ['G', ['V', 'M']],
    50 => ['I', ['V', 'L']],
    53 => ['F', ['Y']],
    54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
    73 => ['G', ['S', 'T', 'C', 'A']],
    76 => ['L', ['V']],
    82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
    83 => ['N', ['D']],
    84 => ['I', ['V', 'A', 'C']],
    85 => ['I', ['V']],
    88 => ['N', ['D', 'S']],
    90 => ['L', ['M']]
  }
  end_aa = start_aa + aa_array.size - 1
  out_hash = {}
  sdrm.each do |position, (wt_aa, mutations)|
    # skip SDRM positions that aa_array does not cover
    next unless position.between?(start_aa, end_aa)

    test_aa = aa_array[position - start_aa]
    resistant =
      if test_aa.size == 1
        # the wild type is never in the mutation list, so membership suffices
        mutations.include?(test_aa)
      else
        # Array#& always returns an Array (truthy even when empty), so the
        # intersection must be tested for emptiness explicitly
        !(test_aa.split("/") & mutations).empty?
      end
    out_hash[position] = [wt_aa, test_aa] if resistant
  end
  out_hash
end
294
+
295
# Identify HIV integrase drug resistance mutations in an amino acid array.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            that is not exactly one character long is split on "/" and
#            treated as a list of alternative residues (presumably how
#            ambiguous calls are encoded upstream -- confirm against the
#            translation code).
# start_aa - Integer codon number of the first element of aa_array
#            (default 1).
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding one entry
# per listed position covered by aa_array at which a listed mutation is
# observed. Keys are inserted in ascending position order.
def self.sdrm_int(aa_array,start_aa=1)
  # position => [wild-type residue, [resistance residues]], in ascending order
  sdrm = {
    66  => ['T', ['A', 'I', 'K']],
    74  => ['L', ['M']],
    92  => ['E', ['Q']],
    95  => ['Q', ['K']],
    97  => ['T', ['A']],
    121 => ['F', ['Y']],
    140 => ['G', ['A', 'S', 'C']],
    143 => ['Y', ['C', 'H', 'R']],
    147 => ['S', ['G']],
    148 => ['Q', ['H', 'K', 'R']],
    155 => ['N', ['S', 'H']]
  }
  end_aa = start_aa + aa_array.size - 1
  out_hash = {}
  sdrm.each do |position, (wt_aa, mutations)|
    # skip listed positions that aa_array does not cover
    next unless position.between?(start_aa, end_aa)

    test_aa = aa_array[position - start_aa]
    resistant =
      if test_aa.size == 1
        # the wild type is never in the mutation list, so membership suffices
        mutations.include?(test_aa)
      else
        # Array#& always returns an Array (truthy even when empty), so the
        # intersection must be tested for emptiness explicitly
        !(test_aa.split("/") & mutations).empty?
      end
    out_hash[position] = [wt_aa, test_aa] if resistant
  end
  out_hash
end
335
+
336
# Summarize HIV-1 protease (PR) SDRMs over a set of sequences.
# HIV-1 PR region SDRM based on HIVDB.stanford.edu
# only for MPID-DR MiSeq sequences, PR codon 1-99
#
# sequences - Hash {name => nucleotide sequence}.
# cutoff    - Poisson cutoff for minority variants (default 0); rows whose
#             count is below it are labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - substitution rate with 95% CI, rows of
#     [region, n_seq, position, wildtype, mutation, count, fraction,
#      CI_low, CI_high, label], sorted by position
#   linkage_list        - haplotype abundance with 95% CI, rows of
#     [region, n_seq, linkage, count, fraction, CI_low, CI_high, label],
#     most abundant first ("WT" when no SDRM is present)
#   report_list         - amino acid sequence report spreadsheet, rows of
#     [region, codon, n_seq, percentage for each ViralSeq::AMINO_ACID_LIST entry]
# All three lists are empty when sequences is empty.
def self.sdrm_pr_bulk(sequences, cutoff = 0)
  n_seq = sequences.size
  # nothing to tabulate, and no denominator for the frequencies
  return [[], [], []] if n_seq.zero?

  region = "PR"
  rf_label = 0
  start_codon_number = 1

  mut = {}      # position => [wildtype, [observed residue per affected sequence]]
  mut_com = []  # one SDRM hash per sequence, used for the linkage table
  aa = {}       # name => amino acid String
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.hiv_protease(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of SDRMs, e.g. "D30N+N88D"
  link2 = {}
  ViralSeq.count(mut_com).each do |combo, n|
    pattern = combo.size == 0 ? ['WT'] : combo.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = n
  end
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, (v/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # per-codon residue counts; the length of the first sequence sets the range
  div_aa = {}
  aa_strings = aa.values
  aa_strings[0].size.times do |offset|
    residues = aa_strings.map { |aa_string| aa_string[offset] }
    # only per-residue lookups are made below, so the counts are not sorted
    div_aa[start_codon_number + offset] = ViralSeq.count(residues).to_h
  end

  report_list = div_aa.map do |codon, counts|
    percentages = ViralSeq::AMINO_ACID_LIST.map do |amino_acid|
      (counts[amino_acid].to_f/n_seq*100).round(4)
    end
    [region, codon, n_seq] + percentages
  end

  [point_mutation_list, linkage_list, report_list]
end
425
+
426
+
427
# Summarize HIV-1 reverse transcriptase (RT) SDRMs over a set of sequences.
# HIV-1 RT region SDRM based on HIVDB.stanford.edu
# only for MPID-DR MiSeq sequences
# RT codon 34-122, 152-236 two regions are linked.
#
# sequences - Hash {name => nucleotide sequence}. Each sequence is cut
#             after 267 bases and a fixed filler is inserted between the
#             two parts before translation.
# cutoff    - Poisson cutoff for minority variants (default 0); rows whose
#             count is below it are labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - substitution rate with 95% CI, rows of
#     ["NRTI" or "NNRTI", n_seq, position, wildtype, mutation, count,
#      fraction, CI_low, CI_high, label], sorted by position
#   linkage_list        - haplotype abundance with 95% CI, rows of
#     [region, n_seq, linkage, count, fraction, CI_low, CI_high, label],
#     most abundant first ("WT" when no SDRM is present)
#   report_list         - amino acid sequence report spreadsheet, rows of
#     [region, codon, n_seq, percentage for each ViralSeq::AMINO_ACID_LIST entry]
# All three lists are empty when sequences is empty.
def self.sdrm_rt_bulk(sequences, cutoff = 0)
  n_seq = sequences.size
  # nothing to tabulate, and no denominator for the frequencies
  return [[], [], []] if n_seq.zero?

  region = "RT"
  rf_label = 1
  start_codon_number = 34
  r2_start_codon_number = 152
  # filler inserted between the two sequenced regions so that codon
  # numbering of the second region lines up after translation
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  # drug class => {position => [wildtype, [observed residue per affected sequence]]}
  mut_by_class = { "NRTI" => {}, "NNRTI" => {} }
  mut_com = []  # one merged SDRM hash per sequence, used for the linkage table
  r1_aa = {}    # name => amino acid String of the first region
  r2_aa = {}    # name => amino acid String of the second region
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq[0,267] + gap + seq[267..-1])
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array

    r1_aa[name] = aa_seq[0,89].join("")
    r2_aa[name] = aa_seq[-85..-1].join("")
    found = {
      "NRTI" => ViralSeq.sdrm_nrti(aa_seq, start_codon_number),
      "NNRTI" => ViralSeq.sdrm_nnrti(aa_seq, start_codon_number)
    }
    mut_com << found["NRTI"].merge(found["NNRTI"])

    found.each do |drug_class, record|
      record.each do |position, (wt, observed)|
        (mut_by_class[drug_class][position] ||= [wt, []])[1] << observed
      end
    end
  end

  # NRTI rows are appended before NNRTI rows, then everything is sorted
  point_mutation_list = []
  mut_by_class.each do |drug_class, mut|
    mut.each do |position, (wt, mut_list)|
      ViralSeq.count(mut_list).each do |m, number|
        ci = ViralSeq.r_binom_CI(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
      end
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of SDRMs, e.g. "K70R+K103N"
  link2 = {}
  ViralSeq.count(mut_com).each do |combo, n|
    pattern = combo.size == 0 ? ['WT'] : combo.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = n
  end
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, (v/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # per-codon residue counts for both regions; within each region the
  # length of the first sequence sets the range
  div_aa = {}
  [[r1_aa, start_codon_number], [r2_aa, r2_start_codon_number]].each do |aa_by_name, first_codon|
    aa_strings = aa_by_name.values
    aa_strings[0].size.times do |offset|
      residues = aa_strings.map { |aa_string| aa_string[offset] }
      # only per-residue lookups are made below, so the counts are not sorted
      div_aa[first_codon + offset] = ViralSeq.count(residues).to_h
    end
  end

  report_list = div_aa.map do |codon, counts|
    percentages = ViralSeq::AMINO_ACID_LIST.map do |amino_acid|
      (counts[amino_acid].to_f/n_seq*100).round(4)
    end
    [region, codon, n_seq] + percentages
  end

  [point_mutation_list, linkage_list, report_list]
end
560
+
561
# Summarize HIV-1 integrase (IN) SDRMs over a set of sequences.
# HIV-1 IN region SDRM based on HIVDB.stanford.edu
# only for MPID-DR MiSeq sequences
# IN codon 53-174
#
# sequences - Hash {name => nucleotide sequence}.
# cutoff    - Poisson cutoff for minority variants (default 0); rows whose
#             count is below it are labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - substitution rate with 95% CI, rows of
#     [region, n_seq, position, wildtype, mutation, count, fraction,
#      CI_low, CI_high, label], sorted by position
#   linkage_list        - haplotype abundance with 95% CI, rows of
#     [region, n_seq, linkage, count, fraction, CI_low, CI_high, label],
#     most abundant first ("WT" when no SDRM is present)
#   report_list         - amino acid sequence report spreadsheet, rows of
#     [region, codon, n_seq, percentage for each ViralSeq::AMINO_ACID_LIST entry]
# All three lists are empty when sequences is empty.
def self.sdrm_in_bulk(sequences, cutoff = 0)
  n_seq = sequences.size
  # nothing to tabulate, and no denominator for the frequencies
  return [[], [], []] if n_seq.zero?

  region = "IN"
  rf_label = 2
  start_codon_number = 53

  mut = {}      # position => [wildtype, [observed residue per affected sequence]]
  mut_com = []  # one SDRM hash per sequence, used for the linkage table
  aa = {}       # name => amino acid String
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.sdrm_int(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of SDRMs, e.g. "G140S+Q148H"
  link2 = {}
  ViralSeq.count(mut_com).each do |combo, n|
    pattern = combo.size == 0 ? ['WT'] : combo.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = n
  end
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, (v/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # per-codon residue counts; the length of the first sequence sets the range
  div_aa = {}
  aa_strings = aa.values
  aa_strings[0].size.times do |offset|
    residues = aa_strings.map { |aa_string| aa_string[offset] }
    # only per-residue lookups are made below, so the counts are not sorted
    div_aa[start_codon_number + offset] = ViralSeq.count(residues).to_h
  end

  report_list = div_aa.map do |codon, counts|
    percentages = ViralSeq::AMINO_ACID_LIST.map do |amino_acid|
      (counts[amino_acid].to_f/n_seq*100).round(4)
    end
    [region, codon, n_seq] + percentages
  end

  [point_mutation_list, linkage_list, report_list]
end
651
+
652
+ end