viral_seq 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,652 @@
1
+ # viral_seq/sdrm_core.rb
2
+ # core functions for HIV SDRM analysis using MPID-DR protocol.
3
+ # More details for HIV Surveillance Drug Resistance Mutation (SDRM) can be found at
4
+ # https://hivdb.stanford.edu/pages/surveillance.html
5
+
6
+ # Including methods as:
7
+ # ViralSeq::sdrm_nrti
8
+ # ViralSeq::sdrm_nnrti
9
+ # ViralSeq::hiv_protease
10
+ # ViralSeq::sdrm_int
11
+ # ViralSeq::sdrm_pr_bulk
12
+ # ViralSeq::sdrm_rt_bulk
13
+ # ViralSeq::sdrm_in_bulk
14
+
15
+ # ViralSeq.sdrm_nrti(aa_array, start_aa)
16
+ # ViralSeq.sdrm_nnrti(aa_array, start_aa)
17
+ # ViralSeq.hiv_protease(aa_array, start_aa)
18
+ # ViralSeq.sdrm_int(aa_array, start_aa)
19
+ # # functions to identify SDRMs from a given sequence in an Array object
20
+ # # function names indicate which HIV drug resistance mutations it can identify
21
+ # # input an Array object for amino acid sequence ['A', 'M', 'L', ...]
22
+ # # start_aa is an Integer to indicate codon number of the 1st amino acid sequence in the input aa_array
23
+ # # return a Hash object for SDRMs identified. {:position =>[:wildtype_codon, :mutation_codon]}
24
+
25
+ # ViralSeq.sdrm_pr_bulk(sequence_hash, minority_cut_off)
26
+ # ViralSeq.sdrm_rt_bulk(sequence_hash, minority_cut_off)
27
+ # ViralSeq.sdrm_in_bulk(sequence_hash, minority_cut_off)
28
+ # # functions to identify SDRMs from a sequence hash object.
29
+ # # name of the functions indicate which region it works on
30
+ # # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
31
+ # # PR codon 1-99
32
+ # # RT codon 34-122, 152-236, two regions are linked
33
+ # # IN codon 53-174
34
+ # # sequence_hash is a Hash object of sequences {:name => :sequence, ...}
35
+ # # sequences usually need to be QCed (remove sequences with stop codon and a3g hypermutations) first
36
+ # # minority_cut_off is the Integer cut-off for minimal abundance of a mutation to be called as valid mutation
37
+ # # minority_cut_off can be obtained using ViralSeq::poisson_minority_cutoff function
38
+ # # return [point_mutation_list, linkage_list, report_list]
39
+ # =USAGE
40
+ # # example (example files from ID:VS053118-0566)
41
+ # sequence = ViralSeq.fasta_to_hash('spec/sample_files/sample_dr_sequences/pr.fasta')
42
+ # p_cut_off = ViralSeq.poisson_minority_cutoff(sequence)
43
+ # pr_sdrm = ViralSeq.sdrm_pr_bulk(sequence, p_cut_off)
44
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"
45
+ # pr_sdrm[0].each {|n| puts n.join(',')}
46
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
47
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
48
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
49
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
50
+ #
51
+ # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"
52
+ # pr_sdrm[1].each {|n| puts n.join(',')}
53
+ # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
54
+ # => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
55
+ # => PR,396,WT,149,0.37626,0.32837,0.42602,
56
+ # => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
57
+ # => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
58
+ #
59
+ # puts "region,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(",")
60
+ # pr_sdrm[2].each {|n|puts n.join(",")}
61
+ # => region,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
62
+ # => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63
+ # => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64
+ # => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65
+ # => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
66
+ # => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67
+ # => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
68
+ # => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69
+ # => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
70
+ # => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71
+ # => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72
+ # => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
73
+ # => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
74
+ # => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
75
+ # => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76
+ # => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
77
+ # => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78
+ # => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79
+ # => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
80
+ # => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81
+ # => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82
+ # => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83
+ # => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84
+ # => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85
+ # => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86
+ # => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87
+ # => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
88
+ # => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89
+ # => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90
+ # => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91
+ # => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92
+ # => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
93
+ # => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
94
+ # => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
95
+ # => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96
+ # => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97
+ # => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
98
+ # => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99
+ # => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100
+ # => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
101
+ # => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102
+ # => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
103
+ # => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
104
+ # => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105
+ # => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106
+ # => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107
+ # => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108
+ # => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109
+ # => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110
+ # => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111
+ # => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
112
+ # => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113
+ # => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114
+ # => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115
+ # => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116
+ # => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117
+ # => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
118
+ # => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
119
+ # => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120
+ # => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
121
+ # => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122
+ # => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123
+ # => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124
+ # => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125
+ # => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126
+ # => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127
+ # => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128
+ # => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129
+ # => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130
+ # => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131
+ # => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132
+ # => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
133
+ # => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
134
+ # => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135
+ # => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
136
+ # => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
137
+ # => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
+ # => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139
+ # => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140
+ # => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141
+ # => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
142
+ # => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143
+ # => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
144
+ # => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
145
+ # => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146
+ # => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147
+ # => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148
+ # => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
149
+ # => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150
+ # => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151
+ # => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152
+ # => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
153
+ # => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154
+ # => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155
+ # => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156
+ # => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157
+ # => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
158
+ # => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159
+ # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
160
+ # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161
+
162
+
163
+
164
+ module ViralSeq
165
+
166
# Identify NRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of Strings, one element per codon. A one-character element
#            is a single amino acid; a longer element is treated as a
#            "/"-separated list of alternatives (e.g. "M/V").
# start_aa - Integer codon number of the first element of aa_array (default 1).
#
# Returns a Hash {position => [wild_type, observed]} holding only positions
# where a listed mutation was found. For a "/"-separated element the whole
# element is reported as `observed`.
def self.sdrm_nrti(aa_array, start_aa = 1)
  # position => [wild type, [surveillance mutations]]
  sdrm = {
    41  => ['M', ['L']],
    65  => ['K', ['R']],
    67  => ['D', ['N', 'G', 'E']],
    69  => ['T', ['D']],
    70  => ['K', ['R', 'E']],
    74  => ['L', ['V', 'I']],
    75  => ['V', ['M', 'T', 'A', 'S']],
    77  => ['F', ['L']],
    115 => ['Y', ['F']],
    116 => ['F', ['Y']],
    151 => ['Q', ['M']],
    184 => ['M', ['V', 'I']],
    210 => ['L', ['W']],
    215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
    219 => ['K', ['Q', 'E', 'N', 'R']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      # No mutation list contains its own wild type, so a membership test
      # alone is enough to exclude wild-type residues.
      out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
    elsif (test_aa.split('/') & mutations).any?
      # An empty Array is truthy in Ruby, so the intersection must be checked
      # with `any?`; otherwise every ambiguous residue would be reported.
      out_hash[position] = [wt_aa, test_aa]
    end
  end
  out_hash
end
209
+
210
# Identify NNRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of Strings, one element per codon. A one-character element
#            is a single amino acid; a longer element is treated as a
#            "/"-separated list of alternatives (e.g. "K/N").
# start_aa - Integer codon number of the first element of aa_array (default 1).
#
# Returns a Hash {position => [wild_type, observed]} holding only positions
# where a listed mutation was found. For a "/"-separated element the whole
# element is reported as `observed`.
def self.sdrm_nnrti(aa_array, start_aa = 1)
  # position => [wild type, [surveillance mutations]]
  sdrm = {
    100 => ['L', ['I']],
    101 => ['K', ['E', 'P']],
    103 => ['K', ['N', 'S']],
    106 => ['V', ['M', 'A']],
    179 => ['V', ['F', 'D']],
    181 => ['Y', ['C', 'I', 'V']],
    188 => ['Y', ['L', 'H', 'C']],
    190 => ['G', ['A', 'S', 'E']],
    225 => ['P', ['H']],
    230 => ['M', ['L']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      # No mutation list contains its own wild type, so a membership test
      # alone is enough to exclude wild-type residues.
      out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
    elsif (test_aa.split('/') & mutations).any?
      # An empty Array is truthy in Ruby, so the intersection must be checked
      # with `any?`; otherwise every ambiguous residue would be reported.
      out_hash[position] = [wt_aa, test_aa]
    end
  end
  out_hash
end
247
+
248
# Identify HIV protease surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of Strings, one element per codon. A one-character element
#            is a single amino acid; a longer element is treated as a
#            "/"-separated list of alternatives (e.g. "D/N").
# start_aa - Integer codon number of the first element of aa_array (default 1).
#
# Returns a Hash {position => [wild_type, observed]} holding only positions
# where a listed mutation was found. For a "/"-separated element the whole
# element is reported as `observed`.
def self.hiv_protease(aa_array, start_aa = 1)
  # position => [wild type, [surveillance mutations]]
  sdrm = {
    23 => ['L', ['I']],
    24 => ['L', ['I']],
    30 => ['D', ['N']],
    32 => ['V', ['I']],
    46 => ['M', ['I', 'L', 'V']], # M46V not on the SDRM list but we still include it.
    47 => ['I', ['V', 'A']],
    48 => ['G', ['V', 'M']],
    50 => ['I', ['V', 'L']],
    53 => ['F', ['Y']],
    54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
    73 => ['G', ['S', 'T', 'C', 'A']],
    76 => ['L', ['V']],
    82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
    83 => ['N', ['D']],
    84 => ['I', ['V', 'A', 'C']],
    85 => ['I', ['V']],
    88 => ['N', ['D', 'S']],
    90 => ['L', ['M']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      # No mutation list contains its own wild type, so a membership test
      # alone is enough to exclude wild-type residues.
      out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
    elsif (test_aa.split('/') & mutations).any?
      # An empty Array is truthy in Ruby, so the intersection must be checked
      # with `any?`; otherwise every ambiguous residue would be reported.
      out_hash[position] = [wt_aa, test_aa]
    end
  end
  out_hash
end
294
+
295
# Identify HIV integrase drug resistance mutations in an amino acid sequence.
#
# aa_array - Array of Strings, one element per codon. A one-character element
#            is a single amino acid; a longer element is treated as a
#            "/"-separated list of alternatives (e.g. "N/H").
# start_aa - Integer codon number of the first element of aa_array (default 1).
#
# Returns a Hash {position => [wild_type, observed]} holding only positions
# where a listed mutation was found. For a "/"-separated element the whole
# element is reported as `observed`.
def self.sdrm_int(aa_array, start_aa = 1)
  # position => [wild type, [resistance mutations]]
  sdrm = {
    66  => ['T', ['A', 'I', 'K']],
    74  => ['L', ['M']],
    92  => ['E', ['Q']],
    95  => ['Q', ['K']],
    97  => ['T', ['A']],
    121 => ['F', ['Y']],
    140 => ['G', ['A', 'S', 'C']],
    143 => ['Y', ['C', 'H', 'R']],
    147 => ['S', ['G']],
    148 => ['Q', ['H', 'K', 'R']],
    155 => ['N', ['S', 'H']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      # No mutation list contains its own wild type, so a membership test
      # alone is enough to exclude wild-type residues.
      out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
    elsif (test_aa.split('/') & mutations).any?
      # An empty Array is truthy in Ruby, so the intersection must be checked
      # with `any?`; otherwise every ambiguous residue would be reported.
      out_hash[position] = [wt_aa, test_aa]
    end
  end
  out_hash
end
335
+
336
# Summarize HIV-1 protease (PR) SDRMs over a set of sequences.
# Only for MPID-DR MiSeq sequences, PR codon 1-99.
#
# sequences - Hash {name => nucleotide sequence}.
# cutoff    - Poisson cutoff for minority variants; any count below it gets
#             a "*" label (default 0, i.e. nothing is labelled).
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of [region, n_seq, position, wild type,
#                         mutation, count, fraction, CI low, CI high, label],
#                         sorted by position
#   linkage_list        - rows of [region, n_seq, mutation pattern, count,
#                         fraction, CI low, CI high, label], most abundant
#                         pattern first; a sequence without SDRMs is "WT"
#   report_list         - one row per codon: [region, codon, n_seq, then the
#                         percentage of every entry of AMINO_ACID_LIST]
# An empty `sequences` hash yields three empty lists.
def self.sdrm_pr_bulk(sequences, cutoff = 0)
  region = "PR"
  rf_label = 0
  start_codon_number = 1
  n_seq = sequences.size
  # With no sequences there is no reference length for the per-codon report
  # (and every fraction would divide by zero), so return empty results.
  return [[], [], []] if n_seq.zero?

  mut = {}      # position => [wild type, [observed mutation, one per carrier]]
  mut_com = []  # SDRM hash of every sequence, used for the linkage summary
  aa = {}       # name => translated amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.hiv_protease(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Turn every distinct SDRM combination into a label such as "D30N+N88D".
  linkage_counts = {}
  ViralSeq.count(mut_com).each do |combination, number|
    pattern =
      if combination.empty?
        "WT"
      else
        combination.map { |position, (wt, m)| wt + position.to_s + m }.join("+")
      end
    linkage_counts[pattern] = number
  end

  linkage_list = []
  linkage_counts.sort_by { |_pattern, number| number }.reverse_each do |pattern, number|
    ci = ViralSeq.r_binom_CI(number, n_seq)
    label = number < cutoff ? "*" : ""
    linkage_list << [region, n_seq, pattern, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon amino acid composition; the first sequence sets the length.
  report_list = []
  aa_strings = aa.values
  aa_strings.first.size.times do |index|
    counts = ViralSeq.count(aa_strings.map { |aa_string| aa_string[index] })
    row = [region, start_codon_number + index, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      row << (counts[amino_acid].to_f/n_seq*100).round(4)
    end
    report_list << row
  end

  [point_mutation_list, linkage_list, report_list]
end
425
+
426
+
427
# Summarize HIV-1 reverse transcriptase (RT) SDRMs over a set of sequences.
# Only for MPID-DR MiSeq sequences; RT codon 34-122 and 152-236, the two
# regions are linked.
#
# sequences - Hash {name => nucleotide sequence}. The first 267 bases are
#             taken as region 1 and the remainder as region 2.
# cutoff    - Poisson cutoff for minority variants; any count below it gets
#             a "*" label (default 0, i.e. nothing is labelled).
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of ["NRTI" or "NNRTI", n_seq, position,
#                         wild type, mutation, count, fraction, CI low,
#                         CI high, label], sorted by position
#   linkage_list        - rows of [region, n_seq, mutation pattern, count,
#                         fraction, CI low, CI high, label], most abundant
#                         pattern first; a sequence without SDRMs is "WT"
#   report_list         - one row per codon: [region, codon, n_seq, then the
#                         percentage of every entry of AMINO_ACID_LIST]
# An empty `sequences` hash yields three empty lists.
def self.sdrm_rt_bulk(sequences, cutoff = 0)
  region = "RT"
  rf_label = 1
  start_codon_number = 34
  second_region_start = 152
  # Filler inserted between the two sequenced regions so that region 2 keeps
  # its codon numbering (presumably the reference sequence of the unsequenced
  # stretch -- confirm against the MPID-DR protocol).
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  n_seq = sequences.size
  # With no sequences there is no reference length for the per-codon report
  # (and every fraction would divide by zero), so return empty results.
  return [[], [], []] if n_seq.zero?

  mut_nrti = {}   # position => [wild type, [observed mutation, one per carrier]]
  mut_nnrti = {}  # same layout, NNRTI positions
  mut_com = []    # merged NRTI + NNRTI SDRM hash of every sequence
  r1_aa = {}      # name => amino acid string of region 1
  r2_aa = {}      # name => amino acid string of region 2
  sequences.each do |name, seq|
    linked_seq = seq[0, 267] + gap + seq[267..-1]
    s = ViralSeq::Sequence.new(name, linked_seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array

    r1_aa[name] = aa_seq[0, 89].join("")
    r2_aa[name] = aa_seq[-85..-1].join("")
    nrti = ViralSeq.sdrm_nrti(aa_seq, start_codon_number)
    nnrti = ViralSeq.sdrm_nnrti(aa_seq, start_codon_number)
    mut_com << nrti.merge(nnrti)

    nrti.each do |position, (wt, observed)|
      (mut_nrti[position] ||= [wt, []])[1] << observed
    end
    nnrti.each do |position, (wt, observed)|
      (mut_nnrti[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  [["NRTI", mut_nrti], ["NNRTI", mut_nnrti]].each do |drug_class, mutations|
    mutations.each do |position, (wt, mut_list)|
      ViralSeq.count(mut_list).each do |m, number|
        ci = ViralSeq.r_binom_CI(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
      end
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Turn every distinct SDRM combination into a label such as "K103N+M184V".
  linkage_counts = {}
  ViralSeq.count(mut_com).each do |combination, number|
    pattern =
      if combination.empty?
        "WT"
      else
        combination.map { |position, (wt, m)| wt + position.to_s + m }.join("+")
      end
    linkage_counts[pattern] = number
  end

  linkage_list = []
  linkage_counts.sort_by { |_pattern, number| number }.reverse_each do |pattern, number|
    ci = ViralSeq.r_binom_CI(number, n_seq)
    label = number < cutoff ? "*" : ""
    linkage_list << [region, n_seq, pattern, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon amino acid composition, region 1 first, then region 2; within
  # each region the first sequence sets the length.
  report_list = []
  [[start_codon_number, r1_aa.values], [second_region_start, r2_aa.values]].each do |first_codon, aa_strings|
    aa_strings.first.size.times do |index|
      counts = ViralSeq.count(aa_strings.map { |aa_string| aa_string[index] })
      row = [region, first_codon + index, n_seq]
      ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
        row << (counts[amino_acid].to_f/n_seq*100).round(4)
      end
      report_list << row
    end
  end

  [point_mutation_list, linkage_list, report_list]
end
560
+
561
# Summarize HIV-1 integrase (IN) SDRMs over a set of sequences.
# Only for MPID-DR MiSeq sequences, IN codon 53-174.
#
# sequences - Hash {name => nucleotide sequence}.
# cutoff    - Poisson cutoff for minority variants; any count below it gets
#             a "*" label (default 0, i.e. nothing is labelled).
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of [region, n_seq, position, wild type,
#                         mutation, count, fraction, CI low, CI high, label],
#                         sorted by position
#   linkage_list        - rows of [region, n_seq, mutation pattern, count,
#                         fraction, CI low, CI high, label], most abundant
#                         pattern first; a sequence without SDRMs is "WT"
#   report_list         - one row per codon: [region, codon, n_seq, then the
#                         percentage of every entry of AMINO_ACID_LIST]
# An empty `sequences` hash yields three empty lists.
def self.sdrm_in_bulk(sequences, cutoff = 0)
  region = "IN"
  rf_label = 2
  start_codon_number = 53
  n_seq = sequences.size
  # With no sequences there is no reference length for the per-codon report
  # (and every fraction would divide by zero), so return empty results.
  return [[], [], []] if n_seq.zero?

  mut = {}      # position => [wild type, [observed mutation, one per carrier]]
  mut_com = []  # SDRM hash of every sequence, used for the linkage summary
  aa = {}       # name => translated amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.sdrm_int(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Turn every distinct SDRM combination into a label such as "G140S+Q148H".
  linkage_counts = {}
  ViralSeq.count(mut_com).each do |combination, number|
    pattern =
      if combination.empty?
        "WT"
      else
        combination.map { |position, (wt, m)| wt + position.to_s + m }.join("+")
      end
    linkage_counts[pattern] = number
  end

  linkage_list = []
  linkage_counts.sort_by { |_pattern, number| number }.reverse_each do |pattern, number|
    ci = ViralSeq.r_binom_CI(number, n_seq)
    label = number < cutoff ? "*" : ""
    linkage_list << [region, n_seq, pattern, number, (number/n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon amino acid composition; the first sequence sets the length.
  report_list = []
  aa_strings = aa.values
  aa_strings.first.size.times do |index|
    counts = ViralSeq.count(aa_strings.map { |aa_string| aa_string[index] })
    row = [region, start_codon_number + index, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      row << (counts[amino_acid].to_f/n_seq*100).round(4)
    end
    report_list << row
  end

  [point_mutation_list, linkage_list, report_list]
end
651
+
652
+ end