viral_seq 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/viral_seq/a3g.rb +172 -0
- data/lib/viral_seq/fasta.rb +154 -0
- data/lib/viral_seq/hcv_dr.rb +54 -0
- data/lib/viral_seq/locator.rb +299 -0
- data/lib/viral_seq/math.rb +401 -0
- data/lib/viral_seq/misc.rb +103 -0
- data/lib/viral_seq/muscle.rb +89 -0
- data/lib/viral_seq/nt_variation.rb +148 -0
- data/lib/viral_seq/poisson_cutoff.rb +68 -0
- data/lib/viral_seq/refseq.rb +45 -0
- data/lib/viral_seq/sdrm_core.rb +652 -0
- data/lib/viral_seq/sequence.rb +392 -0
- data/lib/viral_seq/tcs_core.rb +556 -0
- data/lib/viral_seq/version.rb +6 -0
- data/lib/viral_seq.rb +41 -0
- data/viral_seq.gemspec +37 -0
- metadata +130 -0
@@ -0,0 +1,652 @@
|
|
1
|
+
# viral_seq/sdrm_core.rb
|
2
|
+
# core functions for HIV SDRM analysis using MPID-DR protocol.
|
3
|
+
# More details for HIV Surveillance Drug Resistance Mutation (SDRM) can be found at
|
4
|
+
# https://hivdb.stanford.edu/pages/surveillance.html
|
5
|
+
|
6
|
+
# Including methods as:
|
7
|
+
# ViralSeq::sdrm_nrti
|
8
|
+
# ViralSeq::sdrm_nnrti
|
9
|
+
# ViralSeq::hiv_protease
|
10
|
+
# ViralSeq::sdrm_int
|
11
|
+
# ViralSeq::sdrm_pr_bulk
|
12
|
+
# ViralSeq::sdrm_rt_bulk
|
13
|
+
# ViralSeq::sdrm_in_bulk
|
14
|
+
|
15
|
+
# ViralSeq.sdrm_nrti(aa_arry, start_aa)
|
16
|
+
# ViralSeq.sdrm_nnrti(aa_arry, start_aa)
|
17
|
+
# ViralSeq.hiv_protease(aa_arry, start_aa)
|
18
|
+
# ViralSeq.sdrm_int(aa_arry, start_aa)
|
19
|
+
# # functions to identify SDRMs from a given sequence in an Array object
|
20
|
+
# # function names indicate which HIV drug resistance mutations it can identify
|
21
|
+
# # input an Array object for amino acid sequence ['A', 'M', 'L', ...]
|
22
|
+
# # start_aa is an Integer to indicate codon number of the 1st amino acid sequence in the input aa_array
|
23
|
+
# # return a Hash object for SDRMs identified. {:position =>[:wildtype_codon, :mutation_codon]}
|
24
|
+
|
25
|
+
# ViralSeq.sdrm_pr_bulk(sequence_hash, minority_cut_off)
|
26
|
+
# ViralSeq.sdrm_rt_bulk(sequence_hash, minority_cut_off)
|
27
|
+
# ViralSeq.sdrm_in_bulk(sequence_hash, minority_cut_off)
|
28
|
+
# # functions to identify SDRMs from a sequence hash object.
|
29
|
+
# # name of the functions indicate which region it works on
|
30
|
+
# # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
|
31
|
+
# # PR codon 1-99
|
32
|
+
# # RT codon 34-122, 152-236, two regions are linked
|
33
|
+
# # IN codon 53-174
|
34
|
+
# # sequence_hash is a Hash object of sequences {:name => :sequence, ...}
|
35
|
+
# # sequences usually need to be QCed (remove sequences with stop codon and a3g hypermutations) first
|
36
|
+
# # minority_cut_off is the Integer cut-off for minimal abundance of a mutation to be called as valid mutation
|
37
|
+
# # minority_cut_off can be obtained using ViralSeq::poisson_minority_cutoff function
|
38
|
+
# # return [point_mutation_list, linkage_list, report_list]
|
39
|
+
# =USAGE
|
40
|
+
# # example (example files from ID:VS053118-0566)
|
41
|
+
# sequence = ViralSeq.fasta_to_hash('spec/sample_files/sample_dr_sequences/pr.fasta')
|
42
|
+
# p_cut_off = ViralSeq.poisson_minority_cutoff(sequence)
|
43
|
+
# pr_sdrm = ViralSeq.sdrm_pr_bulk(sequence, p_cut_off)
|
44
|
+
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"
|
45
|
+
# pr_sdrm[0].each {|n| puts n.join(',')}
|
46
|
+
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
|
47
|
+
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
|
48
|
+
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
|
49
|
+
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
|
50
|
+
#
|
51
|
+
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"
|
52
|
+
# pr_sdrm[1].each {|n| puts n.join(',')}
|
53
|
+
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
54
|
+
# => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
|
55
|
+
# => PR,396,WT,149,0.37626,0.32837,0.42602,
|
56
|
+
# => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
|
57
|
+
# => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
|
58
|
+
#
|
59
|
+
# puts "region,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(",")
|
60
|
+
# pr_sdrm[2].each {|n|puts n.join(",")}
|
61
|
+
# => region,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
|
62
|
+
# => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
63
|
+
# => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
64
|
+
# => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
65
|
+
# => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
66
|
+
# => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
67
|
+
# => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
|
68
|
+
# => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
69
|
+
# => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
|
70
|
+
# => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
71
|
+
# => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
72
|
+
# => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
73
|
+
# => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
|
74
|
+
# => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
|
75
|
+
# => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
76
|
+
# => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
|
77
|
+
# => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
78
|
+
# => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
79
|
+
# => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
|
80
|
+
# => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
81
|
+
# => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
82
|
+
# => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
83
|
+
# => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
84
|
+
# => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
85
|
+
# => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
86
|
+
# => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
87
|
+
# => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
88
|
+
# => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
89
|
+
# => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
90
|
+
# => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
91
|
+
# => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
92
|
+
# => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
93
|
+
# => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
94
|
+
# => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
95
|
+
# => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
96
|
+
# => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
97
|
+
# => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
98
|
+
# => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
99
|
+
# => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
100
|
+
# => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
|
101
|
+
# => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
102
|
+
# => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
|
103
|
+
# => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
|
104
|
+
# => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
105
|
+
# => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
106
|
+
# => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
107
|
+
# => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
108
|
+
# => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
109
|
+
# => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
110
|
+
# => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
111
|
+
# => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
112
|
+
# => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
113
|
+
# => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
114
|
+
# => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
115
|
+
# => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
116
|
+
# => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
117
|
+
# => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
118
|
+
# => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
|
119
|
+
# => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
120
|
+
# => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
|
121
|
+
# => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
122
|
+
# => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
123
|
+
# => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
124
|
+
# => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
125
|
+
# => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
126
|
+
# => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
127
|
+
# => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
128
|
+
# => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
129
|
+
# => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
130
|
+
# => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
131
|
+
# => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
132
|
+
# => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
|
133
|
+
# => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
|
134
|
+
# => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
135
|
+
# => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
136
|
+
# => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
137
|
+
# => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
|
+
# => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
139
|
+
# => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
140
|
+
# => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
141
|
+
# => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
142
|
+
# => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
143
|
+
# => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
144
|
+
# => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
|
145
|
+
# => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
146
|
+
# => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
147
|
+
# => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
148
|
+
# => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
|
149
|
+
# => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
150
|
+
# => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
151
|
+
# => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
152
|
+
# => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
153
|
+
# => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
154
|
+
# => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
155
|
+
# => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
156
|
+
# => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
157
|
+
# => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
158
|
+
# => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
159
|
+
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
160
|
+
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
module ViralSeq
|
165
|
+
|
166
|
+
# drug resistant mutation summary. input: amino acid array and starting codon, output, hash of summary
|
167
|
+
# Identify NRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            longer than one character is treated as a "/"-separated list of
#            alternative residues (e.g. "M/L").
# start_aa - Integer codon number of the first element of aa_array.
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding only the
# positions where an SDRM was found, in ascending position order.
def self.sdrm_nrti(aa_array, start_aa = 1)
  # position => [wild-type residue, [resistance residues]]
  sdrm = {
    41  => ['M', ['L']],
    65  => ['K', ['R']],
    67  => ['D', ['N', 'G', 'E']],
    69  => ['T', ['D']],
    70  => ['K', ['R', 'E']],
    74  => ['L', ['V', 'I']],
    75  => ['V', ['M', 'T', 'A', 'S']],
    77  => ['F', ['L']],
    115 => ['Y', ['F']],
    116 => ['F', ['Y']],
    151 => ['Q', ['M']],
    184 => ['M', ['V', 'I']],
    210 => ['L', ['W']],
    215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
    219 => ['K', ['Q', 'E', 'N', 'R']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, resistant = sdrm[position]
    if test_aa.size == 1
      hit = test_aa != wt_aa && resistant.include?(test_aa)
    else
      # Array#& returns [] when nothing matches, and [] is truthy in Ruby,
      # so the intersection must be tested for emptiness explicitly.
      hit = !(test_aa.split('/') & resistant).empty?
    end
    out_hash[position] = [wt_aa, test_aa] if hit
  end
  out_hash
end
|
209
|
+
|
210
|
+
# Identify NNRTI surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            longer than one character is treated as a "/"-separated list of
#            alternative residues (e.g. "K/N").
# start_aa - Integer codon number of the first element of aa_array.
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding only the
# positions where an SDRM was found, in ascending position order.
def self.sdrm_nnrti(aa_array, start_aa = 1)
  # position => [wild-type residue, [resistance residues]]
  sdrm = {
    100 => ['L', ['I']],
    101 => ['K', ['E', 'P']],
    103 => ['K', ['N', 'S']],
    106 => ['V', ['M', 'A']],
    179 => ['V', ['F', 'D']],
    181 => ['Y', ['C', 'I', 'V']],
    188 => ['Y', ['L', 'H', 'C']],
    190 => ['G', ['A', 'S', 'E']],
    225 => ['P', ['H']],
    230 => ['M', ['L']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, resistant = sdrm[position]
    if test_aa.size == 1
      hit = test_aa != wt_aa && resistant.include?(test_aa)
    else
      # Array#& returns [] when nothing matches, and [] is truthy in Ruby,
      # so the intersection must be tested for emptiness explicitly.
      hit = !(test_aa.split('/') & resistant).empty?
    end
    out_hash[position] = [wt_aa, test_aa] if hit
  end
  out_hash
end
|
247
|
+
|
248
|
+
#HIV protease surveillance mutations
|
249
|
+
|
250
|
+
# Identify HIV protease surveillance drug resistance mutations (SDRMs) in an
# amino acid sequence.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            longer than one character is treated as a "/"-separated list of
#            alternative residues (e.g. "D/N").
# start_aa - Integer codon number of the first element of aa_array.
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding only the
# positions where an SDRM was found, in ascending position order.
def self.hiv_protease(aa_array, start_aa = 1)
  # position => [wild-type residue, [resistance residues]]
  sdrm = {
    23 => ['L', ['I']],
    24 => ['L', ['I']],
    30 => ['D', ['N']],
    32 => ['V', ['I']],
    46 => ['M', ['I', 'L', 'V']], # M46V not on the SDRM list but we still include it.
    47 => ['I', ['V', 'A']],
    48 => ['G', ['V', 'M']],
    50 => ['I', ['V', 'L']],
    53 => ['F', ['Y']],
    54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
    73 => ['G', ['S', 'T', 'C', 'A']],
    76 => ['L', ['V']],
    82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
    83 => ['N', ['D']],
    84 => ['I', ['V', 'A', 'C']],
    85 => ['I', ['V']],
    88 => ['N', ['D', 'S']],
    90 => ['L', ['M']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, resistant = sdrm[position]
    if test_aa.size == 1
      hit = test_aa != wt_aa && resistant.include?(test_aa)
    else
      # Array#& returns [] when nothing matches, and [] is truthy in Ruby,
      # so the intersection must be tested for emptiness explicitly.
      hit = !(test_aa.split('/') & resistant).empty?
    end
    out_hash[position] = [wt_aa, test_aa] if hit
  end
  out_hash
end
|
294
|
+
|
295
|
+
#HIV integrase drug resistance mutations
|
296
|
+
|
297
|
+
# Identify HIV integrase drug resistance mutations in an amino acid sequence.
#
# aa_array - Array of amino acid Strings, one element per codon. An element
#            longer than one character is treated as a "/"-separated list of
#            alternative residues (e.g. "N/H").
# start_aa - Integer codon number of the first element of aa_array.
#
# Returns a Hash {position => [wildtype_aa, observed_aa]} holding only the
# positions where a listed mutation was found, in ascending position order.
def self.sdrm_int(aa_array, start_aa = 1)
  # position => [wild-type residue, [resistance residues]]
  sdrm = {
    66  => ['T', ['A', 'I', 'K']],
    74  => ['L', ['M']],
    92  => ['E', ['Q']],
    95  => ['Q', ['K']],
    97  => ['T', ['A']],
    121 => ['F', ['Y']],
    140 => ['G', ['A', 'S', 'C']],
    143 => ['Y', ['C', 'H', 'R']],
    147 => ['S', ['G']],
    148 => ['Q', ['H', 'K', 'R']],
    155 => ['N', ['S', 'H']]
  }

  out_hash = {}
  aa_array.each_with_index do |test_aa, index|
    position = start_aa + index
    next unless sdrm.key?(position)

    wt_aa, resistant = sdrm[position]
    if test_aa.size == 1
      hit = test_aa != wt_aa && resistant.include?(test_aa)
    else
      # Array#& returns [] when nothing matches, and [] is truthy in Ruby,
      # so the intersection must be tested for emptiness explicitly.
      hit = !(test_aa.split('/') & resistant).empty?
    end
    out_hash[position] = [wt_aa, test_aa] if hit
  end
  out_hash
end
|
335
|
+
|
336
|
+
# input sequence hash, and Poisson cutoff for minority variants.
|
337
|
+
# HIV-1 PR region SDRM based on HIVDB.stanford.edu
|
338
|
+
# only for MPID-DR MiSeq sequences, PR codon 1-99
|
339
|
+
# return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
340
|
+
# Summarise protease (PR) SDRMs across a set of sequences.
#
# sequences - Hash of {name => nucleotide sequence String}.
# cutoff    - Integer; any count below this value is labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of [region, n_seq, position, wildtype, mutation,
#                         count, fraction, CI_low, CI_high, label], sorted by
#                         position.
#   linkage_list        - rows of [region, n_seq, linkage, count, fraction,
#                         CI_low, CI_high, label], most abundant first; a
#                         sequence with no SDRM is reported as "WT".
#   report_list         - rows of [region, codon, n_seq, percentage per entry
#                         of ViralSeq::AMINO_ACID_LIST].
# All three lists are empty when sequences is empty.
def self.sdrm_pr_bulk(sequences, cutoff = 0)
  region = "PR"
  rf_label = 0 # NOTE(review): passed to Sequence#get_aa_array; presumably the reading frame - confirm
  start_codon_number = 1
  n_seq = sequences.size
  # With no input there is nothing to translate; the per-codon report below
  # would otherwise fail on aa.values[0] being nil.
  return [[], [], []] if n_seq.zero?

  mut = {}      # position => [wildtype, [observed residue, one entry per sequence]]
  mut_com = []  # one SDRM hash per sequence, used for linkage counting
  aa = {}       # name => translated amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.hiv_protease(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |record| record[2] }

  # Turn each distinct SDRM combination into a label such as "D30N+N88D".
  link2 = {}
  ViralSeq.count(mut_com).each do |k, v|
    pattern = k.empty? ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end
  linkage_list = link2.sort_by { |_key, value| value }.reverse.map do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon residue counts; the length of the first sequence sets the range.
  div_aa = {}
  aa_strings = aa.values
  aa_strings[0].size.times do |p|
    count_aas = ViralSeq.count(aa_strings.map { |r1| r1[p] })
    div_aa[start_codon_number + p] = count_aas.sort_by { |_k, v| v }.reverse.to_h
  end

  report_list = div_aa.map do |k, v|
    record = [region, k, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      # a residue never seen at this codon is nil, and nil.to_f is 0.0
      record << (v[amino_acid].to_f / n_seq * 100).round(4)
    end
    record
  end

  [point_mutation_list, linkage_list, report_list]
end
|
425
|
+
|
426
|
+
|
427
|
+
#input sequence hash, and Poisson cutoff for minority variants.
|
428
|
+
#HIV-1 RT region SDRM based on HIVDB.stanford.edu
|
429
|
+
#only for MPID-DR MiSeq sequences
|
430
|
+
#RT codon 34-122, 152-236 two regions are linked.
|
431
|
+
#return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
432
|
+
# Summarise reverse transcriptase (RT) SDRMs, NRTI and NNRTI, across a set of
# sequences. Each input sequence is split after its first 267 nucleotides and
# a fixed filler sequence is inserted between the two parts before
# translation. The first 89 and last 85 translated residues are reported as
# codons 34 onward and 152 onward respectively.
#
# sequences - Hash of {name => nucleotide sequence String}.
# cutoff    - Integer; any count below this value is labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of [class ("NRTI" or "NNRTI"), n_seq, position,
#                         wildtype, mutation, count, fraction, CI_low, CI_high,
#                         label], sorted by position.
#   linkage_list        - rows of [region, n_seq, linkage, count, fraction,
#                         CI_low, CI_high, label], most abundant first; a
#                         sequence with no SDRM is reported as "WT".
#   report_list         - rows of [region, codon, n_seq, percentage per entry
#                         of ViralSeq::AMINO_ACID_LIST].
# All three lists are empty when sequences is empty.
def self.sdrm_rt_bulk(sequences, cutoff = 0)
  region = "RT"
  rf_label = 1 # NOTE(review): passed to Sequence#get_aa_array; presumably the reading frame - confirm
  start_codon_number = 34
  # NOTE(review): presumably reference sequence for the unsequenced stretch
  # between the two amplicons (codons 123-151) - confirm
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  n_seq = sequences.size
  # With no input there is nothing to translate; the per-codon report below
  # would otherwise fail on r1_aa.values[0] being nil.
  return [[], [], []] if n_seq.zero?

  mut_nrti = {}   # position => [wildtype, [observed residue, one entry per sequence]]
  mut_nnrti = {}
  mut_com = []    # one merged NRTI+NNRTI hash per sequence, for linkage counting
  r1_aa = {}      # name => first 89 translated residues
  r2_aa = {}      # name => last 85 translated residues
  sequences.each do |name, seq|
    linked_seq = seq[0, 267] + gap + seq[267..-1]
    s = ViralSeq::Sequence.new(name, linked_seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array

    r1_aa[name] = aa_seq[0, 89].join("")
    r2_aa[name] = aa_seq[-85..-1].join("")
    nrti = ViralSeq.sdrm_nrti(aa_seq, start_codon_number)
    nnrti = ViralSeq.sdrm_nnrti(aa_seq, start_codon_number)
    mut_com << nrti.merge(nnrti)

    nrti.each do |position, (wt, observed)|
      (mut_nrti[position] ||= [wt, []])[1] << observed
    end
    nnrti.each do |position, (wt, observed)|
      (mut_nnrti[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  [["NRTI", mut_nrti], ["NNRTI", mut_nnrti]].each do |drug_class, mutations|
    mutations.each do |position, (wt, mut_list)|
      ViralSeq.count(mut_list).each do |m, number|
        ci = ViralSeq.r_binom_CI(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
      end
    end
  end
  point_mutation_list.sort_by! { |record| record[2] }

  # Turn each distinct SDRM combination into a label such as "M184V+K103N".
  link2 = {}
  ViralSeq.count(mut_com).each do |k, v|
    pattern = k.empty? ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end
  linkage_list = link2.sort_by { |_key, value| value }.reverse.map do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon residue counts for both stretches; the length of the first
  # sequence in each stretch sets the range.
  div_aa = {}
  [[34, r1_aa.values], [152, r2_aa.values]].each do |first_codon, aa_strings|
    aa_strings[0].size.times do |p|
      count_aas = ViralSeq.count(aa_strings.map { |r1| r1[p] })
      div_aa[first_codon + p] = count_aas.sort_by { |_k, v| v }.reverse.to_h
    end
  end

  report_list = div_aa.map do |k, v|
    record = [region, k, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      # a residue never seen at this codon is nil, and nil.to_f is 0.0
      record << (v[amino_acid].to_f / n_seq * 100).round(4)
    end
    record
  end

  [point_mutation_list, linkage_list, report_list]
end
|
560
|
+
|
561
|
+
#input sequence hash, and Poisson cutoff for minority variants.
|
562
|
+
#HIV-1 IN region SDRM based on HIVDB.stanford.edu
|
563
|
+
#only for MPID-DR MiSeq sequences
|
564
|
+
#IN codon 53-174
|
565
|
+
#return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
566
|
+
# Summarise integrase (IN) drug resistance mutations across a set of
# sequences. The first translated residue is numbered as codon 53.
#
# sequences - Hash of {name => nucleotide sequence String}.
# cutoff    - Integer; any count below this value is labelled "*".
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - rows of [region, n_seq, position, wildtype, mutation,
#                         count, fraction, CI_low, CI_high, label], sorted by
#                         position.
#   linkage_list        - rows of [region, n_seq, linkage, count, fraction,
#                         CI_low, CI_high, label], most abundant first; a
#                         sequence with no listed mutation is reported as "WT".
#   report_list         - rows of [region, codon, n_seq, percentage per entry
#                         of ViralSeq::AMINO_ACID_LIST].
# All three lists are empty when sequences is empty.
def self.sdrm_in_bulk(sequences, cutoff = 0)
  region = "IN"
  rf_label = 2 # NOTE(review): passed to Sequence#get_aa_array; presumably the reading frame - confirm
  start_codon_number = 53
  n_seq = sequences.size
  # With no input there is nothing to translate; the per-codon report below
  # would otherwise fail on aa.values[0] being nil.
  return [[], [], []] if n_seq.zero?

  mut = {}      # position => [wildtype, [observed residue, one entry per sequence]]
  mut_com = []  # one mutation hash per sequence, used for linkage counting
  aa = {}       # name => translated amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")
    record = ViralSeq.sdrm_int(aa_seq, start_codon_number)
    mut_com << record
    record.each do |position, (wt, observed)|
      (mut[position] ||= [wt, []])[1] << observed
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    ViralSeq.count(mut_list).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |record| record[2] }

  # Turn each distinct mutation combination into a label such as "E92Q+N155H".
  link2 = {}
  ViralSeq.count(mut_com).each do |k, v|
    pattern = k.empty? ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end
  linkage_list = link2.sort_by { |_key, value| value }.reverse.map do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon residue counts; the length of the first sequence sets the range.
  div_aa = {}
  aa_strings = aa.values
  aa_strings[0].size.times do |p|
    count_aas = ViralSeq.count(aa_strings.map { |r1| r1[p] })
    div_aa[start_codon_number + p] = count_aas.sort_by { |_k, v| v }.reverse.to_h
  end

  report_list = div_aa.map do |k, v|
    record = [region, k, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      # a residue never seen at this codon is nil, and nil.to_f is 0.0
      record << (v[amino_acid].to_f / n_seq * 100).round(4)
    end
    record
  end

  [point_mutation_list, linkage_list, report_list]
end
|
651
|
+
|
652
|
+
end
|