viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
@@ -1,148 +0,0 @@
|
|
1
|
-
# viral_seq/nt_variation
|
2
|
-
|
3
|
-
# contains functions to calculate Shannon's entropy, pairwise diversity, and TN93 distance
|
4
|
-
# ViralSeq::shannons_entropy
|
5
|
-
# ViralSeq::nucleotide_pi
|
6
|
-
# ViralSeq::TN93
|
7
|
-
|
8
|
-
# ViralSeq.shannons_entropy(sequences)
|
9
|
-
# # calculate Shannon's entropy, Euler's number as the base of logarithm
|
10
|
-
# # https://en.wikipedia.org/wiki/Entropy_(information_theory)
|
11
|
-
# # input sequences alignment in Array [:seq1, :seq2, ...] or Hash [:name => :sequence] object
|
12
|
-
# # it works on both nt and aa sequences
|
13
|
-
# # return a Hash object for entropy score at each position in the alignment {:position => :entropy, ...}
|
14
|
-
# # position starts at 1.
|
15
|
-
# =Usage
|
16
|
-
# # example
|
17
|
-
# sequence_file = 'spec/sample_files/sample_sequence_alignment_for_entropy.fasta'
|
18
|
-
# sequence_hash = ViralSeq.fasta_to_hash(sequence_file)
|
19
|
-
# entropy_hash = ViralSeq.shannons_entropy(sequence_hash)
|
20
|
-
# entropy_hash[3]
|
21
|
-
# => 0.0
|
22
|
-
# entropy_hash[14].round(3)
|
23
|
-
# => 0.639
|
24
|
-
# # This example is the sample input of LANL Entropy-One
|
25
|
-
# # https://www.hiv.lanl.gov/content/sequence/ENTROPY/entropy_one.html?sample_input=1
|
26
|
-
|
27
|
-
# ViralSeq.nucleotide_pi(sequences)
|
28
|
-
# # Function to calculate nucleotide diversity π.
|
29
|
-
# # Refer to https://en.wikipedia.org/wiki/Nucleotide_diversity
|
30
|
-
# # input sequences alignment in Array [:seq1, :seq2, ...] or Hash [:name => :sequence] object
|
31
|
-
# # nt sequence only
|
32
|
-
# # return π as a Float object
|
33
|
-
# =Usage
|
34
|
-
# # example
|
35
|
-
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
36
|
-
# ViralSeq.nucleotide_pi(sequences)
|
37
|
-
# => 0.16667
|
38
|
-
|
39
|
-
# ViralSeq.tn93(sequences)
|
40
|
-
# # TN93 distance function
|
41
|
-
# # tabulate pairwise comparison of sequence pairs in a sequence alignment
|
42
|
-
# # input sequences alignment in Array [:seq1, :seq2, ...] or Hash [:name => :sequence] object
|
43
|
-
# # nt sequence only
|
44
|
-
# # return pairwise distance table in Hash object {:diff => :freq, ... }
|
45
|
-
# # Note: :diff is the number of differing positions (Integer), not a percentage.
|
46
|
-
# =Usage
|
47
|
-
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
48
|
-
# ViralSeq.tn93(sequences)
|
49
|
-
# => {0=>1, 1=>8, 2=>6}
|
50
|
-
|
51
|
-
module ViralSeq
  # Calculate Shannon's entropy (natural logarithm, i.e. base e) for every
  # column of a sequence alignment. Works on both nt and aa sequences.
  # The character '*' is excluded from each column before counting.
  #
  # @param sequences [Array<String>, Hash] aligned sequences, or a Hash whose
  #   values are the aligned sequences
  # @return [Hash] {position => entropy}, positions start at 1; an empty Hash
  #   when no sequences are given
  # @raise [ArgumentError] if sequences is neither a Hash nor an Array
  def self.shannons_entropy(sequences)
    sequences = normalize_sequences(sequences)
    return {} if sequences.empty?

    entropy_hash = {}
    # The alignment length is taken from the first sequence.
    sequences[0].size.times do |position|
      column = sequences.map { |seq| seq[position] }
      column.delete('*')
      column_size = column.size.to_f
      entropy = 0
      ViralSeq.count(column).each do |_residue, freq|
        prob = freq / column_size
        entropy += (-prob * Math.log(prob))
      end
      entropy_hash[position + 1] = entropy
    end
    entropy_hash
  end

  # Nucleotide pairwise diversity (pi): the number of mismatching pairs divided
  # by the number of comparable pairs, summed over all alignment columns.
  # Only the characters A, C, G and T are compared; anything else (gaps,
  # ambiguity codes, lower-case letters, ...) is dropped from its column.
  #
  # @param sequences [Array<String>, Hash] aligned nt sequences, or a Hash
  #   whose values are the aligned sequences
  # @return [Float] pi rounded to 5 decimals; NaN when there is no comparable
  #   pair at all (e.g. empty input or a single sequence)
  # @raise [ArgumentError] if sequences is neither a Hash nor an Array
  def self.nucleotide_pi(sequences)
    sequences = normalize_sequences(sequences)
    first_seq = sequences.first
    alignment_length = first_seq ? first_seq.size : 0

    diver = 0
    com = 0
    alignment_length.times do |position|
      # Keep only unambiguous nucleotides. The previous pattern
      # /[^A|^C|^G|^T]/ also let '|' and '^' through.
      column = sequences.map { |seq| seq[position] }.reject { |nt| nt =~ /[^ACGT]/ }
      next if column.size == 1

      nt_count = ViralSeq.count(column)
      com += column.size * (column.size - 1) / 2
      a, c, g, t = %w[A C G T].map { |base| nt_count[base] }
      diver += a * c + a * t + a * g + c * t + c * g + t * g
    end
    (diver / com.to_f).round(5)
  end

  # Tabulate the pairwise distances between all sequence pairs of an
  # alignment. The distance of two different sequences is whatever
  # ViralSeq.compare_two_seq returns for them; identical sequences are
  # counted with distance 0.
  #
  # @param sequences [Array<String>, Hash] aligned nt sequences, or a Hash
  #   whose values are the aligned sequences
  # @return [Hash] {diff => number_of_pairs}, keys in ascending order,
  #   default value 0
  # @raise [ArgumentError] if sequences is neither a Hash nor an Array
  def self.tn93(sequences)
    sequences = normalize_sequences(sequences)
    seq_hash = ViralSeq.count(sequences)
    pair_counts = Hash.new(0)

    # Pairs made of two copies of the same sequence: n choose 2 per sequence.
    seq_hash.each do |_seq, copies|
      identical_pairs = copies * (copies - 1) / 2
      pair_counts[0] += identical_pairs if identical_pairs > 0
    end

    # Every pair of distinct sequences stands for (copies1 * copies2) pairs.
    seq_hash.keys.combination(2) do |s1, s2|
      pair_counts[ViralSeq.compare_two_seq(s1, s2)] += seq_hash[s1] * seq_hash[s2]
    end

    out_hash = Hash.new(0)
    pair_counts.sort_by { |diff, _freq| diff }.each { |diff, freq| out_hash[diff] = freq }
    out_hash
  end

  # Turn the accepted input shapes into an Array of sequences.
  def self.normalize_sequences(sequences)
    case sequences
    when Hash then sequences.values
    when Array then sequences
    else
      raise ArgumentError, "Wrong type of input sequences. it has to be Hash or Array object"
    end
  end
  private_class_method :normalize_sequences
end
|
@@ -1,68 +0,0 @@
|
|
1
|
-
# viral_seq/poisson_cutoff.rb
|
2
|
-
# define Poisson cut-off for minority variants.
|
3
|
-
# (Ref: Zhou, et al. J Virol 2015)
|
4
|
-
# ViralSeq::poisson_minority_cutoff
|
5
|
-
|
6
|
-
# ViralSeq.poisson_minority_cutoff(sequences, error_rate, fold_cutoff)
|
7
|
-
# # sequences: input sequences alignment in Array [:seq1, :seq2, ...] or Hash [:name => :sequence] object
|
8
|
-
# # error_rate: the residual sequencing error rate (default = 0.0001),
|
9
|
-
# # fold_cutoff: a fold cut-off to determine poisson minority cut-off. default = 20. i.e. <5% mutations from random method error.
|
10
|
-
# # example: cut-off = 2 means that mutations appearing at least 2 times are very likely to be true mutations instead of residual method errors.
|
11
|
-
# =Usage
|
12
|
-
# sequence_file = 'spec/sample_files/sample_sequence_for_poisson.fasta'
|
13
|
-
# sequences = ViralSeq.fasta_to_hash(sequence_file)
|
14
|
-
# ViralSeq.poisson_minority_cutoff(sequences)
|
15
|
-
# => 2
|
16
|
-
|
17
|
-
|
18
|
-
module ViralSeq

  # Define the Poisson cut-off for minority variants: the smallest number of
  # occurrences k for which the observed number of alignment positions with
  # k variant bases is at least fold_cutoff times the number expected from
  # the Poisson table returned by ViralSeq.poisson_distribution.
  #
  # @param sequences [Array<String>, Hash] aligned sequences, or a Hash whose
  #   values are the aligned sequences
  # @param error_rate [Float] residual sequencing error rate
  # @param fold_cutoff [Numeric] required fold of observed over expected
  # @return [Integer] 0 for empty input; otherwise the first key of the
  #   Poisson table that satisfies the fold cut-off, or 1 when none does
  # @raise [ArgumentError] if sequences is neither a Hash nor an Array
  def self.poisson_minority_cutoff(sequences, error_rate = 0.0001, fold_cutoff = 20)
    sequences = case sequences
                when Hash then sequences.values
                when Array then sequences
                else
                  raise ArgumentError, "Wrong type of input sequences. it has to be Hash or Array object"
                end
    return 0 if sequences.empty?

    # The alignment length is taken from the first sequence.
    seq_length = sequences[0].size
    rate = sequences.size * error_rate
    count_mut = ViralSeq.variant_for_poisson(sequences)
    poisson_hash = ViralSeq.poisson_distribution(rate, count_mut.keys.max)

    poisson_hash.each do |k, probability|
      expected = seq_length * probability
      observed = count_mut[k] || 0
      return k if observed >= fold_cutoff * expected
    end
    1
  end

  # Variant distribution used for the Poisson cut-off. For every alignment
  # position the number of sequences that do not carry the most common
  # character is determined; the result tells how many positions have each
  # of those numbers.
  #
  # @param seq [Array<String>] aligned sequences
  # @return [Hash] {variant_count => number_of_positions}, keys in ascending
  #   order; an empty Hash when no sequences are given
  def self.variant_for_poisson(seq)
    return {} if seq.empty?

    seq_size = seq.size
    var = (0...seq[0].size).map do |pos|
      column = seq.map { |s| s[pos] }
      seq_size - ViralSeq.count(column).values.max
    end
    ViralSeq.count(var).sort_by { |key, _value| key }.to_h
  end

end
|
data/lib/viral_seq/refseq.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
# viral_seq/refseq.rb
|
2
|
-
# store constants of reference sequences
|
3
|
-
|
4
|
-
# sequence of HIV-1 HXB2 (Genbank accession number K03455)
|
5
|
-
# https://www.ncbi.nlm.nih.gov/nuccore/K03455
|
6
|
-
|
7
|
-
# sequence of HIV-1 NL43 (Genbank accession number AF324493)
|
8
|
-
# https://www.ncbi.nlm.nih.gov/nuccore/AF324493
|
9
|
-
|
10
|
-
# sequence of SIV MAC239 (Genbank accession number M33262)
|
11
|
-
# https://www.ncbi.nlm.nih.gov/nucleotide/M33262
|
12
|
-
|
13
|
-
# ViralSeq::HXB2
|
14
|
-
# ViralSeq::NL43
|
15
|
-
# ViralSeq::MAC239
|
16
|
-
# ViralSeq::check_ref
|
17
|
-
|
18
|
-
module ViralSeq
|
19
|
-
HXB2 = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACA
GCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCA
ATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAG
CGACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATC
AACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
20
|
-
|
21
|
-
NL43 = "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAAATAAGGAGAGAAGAACAGCTTGTTACACCCTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGCATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGTGTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATA
GCCAAAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACAGTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAAGGGTGCCCACACTAATGATGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCATAATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATATGTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAGCA
ATTCATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGGAATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAAAAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATGGGTTGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATATGGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTTGGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGATTAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTACTAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGAATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGCCTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCATACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATATGGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGC
GACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGATAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGGTTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCTGCACGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCT
GGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
22
|
-
|
23
|
-
MAC239 = "GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAGGCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATACCCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCAGATTGGCGCCTGAACAGGGACTTGAAGGAGAGTGAGAGACTCCTGAGTACGGCTGAGTGAAGGCAGTAAGGGCGGCAGGAACCAACCACGACGGAGTGCTCCTATAAAGGCGCGGGTCGGTACCAGACGGCGTGAGGAGCGGGAGAGGAAGAGGCCTCCGGTTGCAGGTAAGTGCAACACAAAAAAGAAATAGCTGTCTTTTATCCAGGAAGGGGTAATAAGATAGAGTGGGAGATGGGCGTGAGAAACTCCGTCTTGTCAGGGAAGAAAGCAGATGAATTAGAAAAAATTAGGCTACGACCCAACGGAAAGAAAAAGTACATGTTGAAGCATGTAGTATGGGCAGCAAATGAATTAGATAGATTTGGATTAGCAGAAAGCCTGTTGGAGAACAAAGAAGGATGTCAAAAAATACTTTCGGTCTTAGCTCCATTAGTGCCAACAGGCTCAGAAAATTTAAAAAGCCTTTATAATACTGTCTGCGTCATCTGGTGCATTCACGCAGAAGAGAAAGTGAAACACACTGAGGAAGCAAAACAGATAGTGCAGAGACACCTAGTGGTGGAAACAGGAACAACAGAAACTATGCCAAAAACAAGTAGACCAACAGCACCATCTAGCGGCAGAGGAGGAAATTACCCAGTACAACAAATAGGTGGTAACTATGTCCACCTGCCATTAAGCCCGAGAACATTAAATGCCTGGGTAAAATTGATAGAGGAAAAGAAATTTGGAGCAGAAGTAGTGCCAGGATTTCAGGCACTGTCAGAAGGTTGCACCCCCTATGACATTAATCAGATGTTAAATTGTGTGGGAGACCATCAAGCGGCTATGCAGATTATCAGAGATATTATAAACGAGGAGGCTGCAGATTGGGACTTGCAGCACCCACAACCAGCTCCACAACAAGGACAAC
TTAGGGAGCCGTCAGGATCAGATATTGCAGGAACAACTAGTTCAGTAGATGAACAAATCCAGTGGATGTACAGACAACAGAACCCCATACCAGTAGGCAACATTTACAGGAGATGGATCCAACTGGGGTTGCAAAAATGTGTCAGAATGTATAACCCAACAAACATTCTAGATGTAAAACAAGGGCCAAAAGAGCCATTTCAGAGCTATGTAGACAGGTTCTACAAAAGTTTAAGAGCAGAACAGACAGATGCAGCAGTAAAGAATTGGATGACTCAAACACTGCTGATTCAAAATGCTAACCCAGATTGCAAGCTAGTGCTGAAGGGGCTGGGTGTGAATCCCACCCTAGAAGAAATGCTGACGGCTTGTCAAGGAGTAGGGGGGCCGGGACAGAAGGCTAGATTAATGGCAGAAGCCCTGAAAGAGGCCCTCGCACCAGTGCCAATCCCTTTTGCAGCAGCCCAACAGAGGGGACCAAGAAAGCCAATTAAGTGTTGGAATTGTGGGAAAGAGGGACACTCTGCAAGGCAATGCAGAGCCCCAAGAAGACAGGGATGCTGGAAATGTGGAAAAATGGACCATGTTATGGCCAAATGCCCAGACAGACAGGCGGGTTTTTTAGGCCTTGGTCCATGGGGAAAGAAGCCCCGCAATTTCCCCATGGCTCAAGTGCATCAGGGGCTGATGCCAACTGCTCCCCCAGAGGACCCAGCTGTGGATCTGCTAAAGAACTACATGCAGTTGGGCAAGCAGCAGAGAGAAAAGCAGAGAGAAAGCAGAGAGAAGCCTTACAAGGAGGTGACAGAGGATTTGCTGCACCTCAATTCTCTCTTTGGAGGAGACCAGTAGTCACTGCTCATATTGAAGGACAGCCTGTAGAAGTATTACTGGATACAGGGGCTGATGATTCTATTGTAACAGGAATAGAGTTAGGTCCACATTATACCCCAAAAATAGTAGGAGGAATAGGAGGTTTTATTAATACTAAAGAATACAAAAATGTAGAAATAGAAGTTTTAGGCAAAAGGATTAAAGGGACAATCATGACAGGGGACACCCCGATTAACATTTTTGGTAGAAATTTGCTAACAGCTCTGGGGATGTCTCTAAATTTTCCCATAGCTAAAGTAGAGCCTGTAAAAGTCGCCTTAAAGCCAGGAAAGGATGGACCAAAATTGAAGCAGTGGCCATTATCAAAAGAAAAGATAGTTGCATTAAGAGAAATCTGTGAAAAGATGGAAAAGGATGGTCAGTTGGAGGAAGCTCCCCCGACCAATCCATACAACACCCCCACATTTGCTATAAAGAAAAAGGATAAGAACAAATGGAGAATGCTGATAGATTTTAGGGAACTAAATAGGGTCACTCAGGACTTTACGGAAGTCCAATTAGGAATACCACACCCTGCAGGACTAGCAAAAAGGAAAAGAATTACAGTACTGGATATAGGTGATGCATATTTCTCCATACCTCTAGATGAAGAATTTAGGCAGTACACTGCCTTTACTTTACCATCAGTAAATAATGCAGAGCCAGGAAAACGATACATTTATAAGGTTCTGCCTCAGGGATGGAAGGGGTCACCAGCCATCTTCCAATACACTATGAGACATGTGCTAGAACCCTTCAGGAAGGCAAATCCAGATGTGACCTTAGTCCAGTATATGGATGACATCTTAATAGCTAGTGACAGGACAGACCTGGAACATGACAGGGTAGTTTTACAGTCAAAGGAACTCTTGAATAGCATAGGGTTTTCTACCCCAGAAGAGAAATTCCAAAAAGATCCCCCATTTCAATGGATGGGGTACGAATTGTGGCCAACAAAATGGAAGTTGCAAAAGATAGAGTTGCCACAAAGAGAGACCTGGACAGTGAATGATATACAGAAGTTAGTAGGAGTATTAAATTGGGCAGCTCAAATTTATCCAGGTATAAAAACCAAACATCTCTGTAGGTTAATTAGAGGAAAAATGACTCTAACAGAGGAAGTTCAGT
GGACTGAGATGGCAGAAGCAGAATATGAGGAAAATAAAATAATTCTCAGTCAGGAACAAGAAGGATGTTATTACCAAGAAGGCAAGCCATTAGAAGCCACGGTAATAAAGAGTCAGGACAATCAGTGGTCTTATAAAATTCACCAAGAAGACAAAATACTGAAAGTAGGAAAATTTGCAAAGATAAAGAATACACATACCAATGGAGTGAGACTATTAGCACATGTAATACAGAAAATAGGAAAGGAAGCAATAGTGATCTGGGGACAGGTCCCAAAATTCCACTTACCAGTTGAGAAGGATGTATGGGAACAGTGGTGGACAGACTATTGGCAGGTAACCTGGATACCGGAATGGGATTTTATCTCAACACCACCGCTAGTAAGATTAGTCTTCAATCTAGTGAAGGACCCTATAGAGGGAGAAGAAACCTATTATACAGATGGATCATGTAATAAACAGTCAAAAGAAGGGAAAGCAGGATATATCACAGATAGGGGCAAAGACAAAGTAAAAGTGTTAGAACAGACTACTAATCAACAAGCAGAATTGGAAGCATTTCTCATGGCATTGACAGACTCAGGGCCAAAGGCAAATATTATAGTAGATTCACAATATGTTATGGGAATAATAACAGGATGCCCTACAGAATCAGAGAGCAGGCTAGTTAATCAAATAATAGAAGAAATGATTAAAAAGTCAGAAATTTATGTAGCATGGGTACCAGCACACAAAGGTATAGGAGGAAACCAAGAAATAGACCACCTAGTTAGTCAAGGGATTAGACAAGTTCTCTTCTTGGAAAAGATAGAGCCAGCACAAGAAGAACATGATAAATACCATAGTAATGTAAAAGAATTGGTATTCAAATTTGGATTACCCAGAATAGTGGCCAGACAGATAGTAGACACCTGTGATAAATGTCATCAGAAAGGAGAGGCTATACATGGGCAGGCAAATTCAGATCTAGGGACTTGGCAAATGGATTGTACCCATCTAGAGGGAAAAATAATCATAGTTGCAGTACATGTAGCTAGTGGATTCATAGAAGCAGAGGTAATTCCACAAGAGACAGGAAGACAGACAGCACTATTTCTGTTAAAATTGGCAGGCAGATGGCCTATTACACATCTACACACAGATAATGGTGCTAACTTTGCTTCGCAAGAAGTAAAGATGGTTGCATGGTGGGCAGGGATAGAGCACACCTTTGGGGTACCATACAATCCACAGAGTCAGGGAGTAGTGGAAGCAATGAATCACCACCTGAAAAATCAAATAGATAGAATCAGGGAACAAGCAAATTCAGTAGAAACCATAGTATTAATGGCAGTTCATTGCATGAATTTTAAAAGAAGGGGAGGAATAGGGGATATGACTCCAGCAGAAAGATTAATTAACATGATCACTACAGAACAAGAGATACAATTTCAACAATCAAAAAACTCAAAATTTAAAAATTTTCGGGTCTATTACAGAGAAGGCAGAGATCAACTGTGGAAGGGACCCGGTGAGCTATTGTGGAAAGGGGAAGGAGCAGTCATCTTAAAGGTAGGGACAGACATTAAGGTAGTACCCAGAAGAAAGGCTAAAATTATCAAAGATTATGGAGGAGGAAAAGAGGTGGATAGCAGTTCCCACATGGAGGATACCGGAGAGGCTAGAGAGGTGGCATAGCCTCATAAAATATCTGAAATATAAAACTAAAGATCTACAAAAGGTTTGCTATGTGCCCCATTTTAAGGTCGGATGGGCATGGTGGACCTGCAGCAGAGTAATCTTCCCACTACAGGAAGGAAGCCATTTAGAAGTACAAGGGTATTGGCATTTGACACCAGAAAAAGGGTGGCTCAGTACTTATGCAGTGAGGATAACCTGGTACTCAAAGAACTTTTGGACAGATGTAACACCAAACTATGCAGACATTTTACTGCATAGCACTTATTTCCCTTGCTTTACAGCGGGAGAAGTGAGAAGGGCCATCAGGGGAGAACAACTGCT
GTCTTGCTGCAGGTTCCCGAGAGCTCATAAGTACCAGGTACCAAGCCTACAGTACTTAGCACTGAAAGTAGTAAGCGATGTCAGATCCCAGGGAGAGAATCCCACCTGGAAACAGTGGAGAAGAGACAATAGGAGAGGCCTTCGAATGGCTAAACAGAACAGTAGAGGAGATAAACAGAGAGGCGGTAAACCACCTACCAAGGGAGCTAATTTTCCAGGTTTGGCAAAGGTCTTGGGAATACTGGCATGATGAACAAGGGATGTCACCAAGCTATGTAAAATACAGATACTTGTGTTTAATACAAAAGGCTTTATTTATGCATTGCAAGAAAGGCTGTAGATGTCTAGGGGAAGGACATGGGGCAGGGGGATGGAGACCAGGACCTCCTCCTCCTCCCCCTCCAGGACTAGCATAAATGGAAGAAAGACCTCCAGAAAATGAAGGACCACAAAGGGAACCATGGGATGAATGGGTAGTGGAGGTTCTGGAAGAACTGAAAGAAGAAGCTTTAAAACATTTTGATCCTCGCTTGCTAACTGCACTTGGTAATCATATCTATAATAGACATGGAGACACCCTTGAGGGAGCAGGAGAACTCATTAGAATCCTCCAACGAGCGCTCTTCATGCATTTCAGAGGCGGATGCATCCACTCCAGAATCGGCCAACCTGGGGGAGGAAATCCTCTCTCAGCTATACCGCCCTCTAGAAGCATGCTATAACACATGCTATTGTAAAAAGTGTTGCTACCATTGCCAGTTTTGTTTTCTTAAAAAAGGCTTGGGGATATGTTATGAGCAATCACGAAAGAGAAGAAGAACTCCGAAAAAGGCTAAGGCTAATACATCTTCTGCATCAAACAAGTAAGTATGGGATGTCTTGGGAATCAGCTGCTTATCGCCATCTTGCTTTTAAGTGTCTATGGGATCTATTGTACTCTATATGTCACAGTCTTTTATGGTGTACCAGCTTGGAGGAATGCGACAATTCCCCTCTTTTGTGCAACCAAGAATAGGGATACTTGGGGAACAACTCAGTGCCTACCAGATAATGGTGATTATTCAGAAGTGGCCCTTAATGTTACAGAAAGCTTTGATGCCTGGAATAATACAGTCACAGAACAGGCAATAGAGGATGTATGGCAACTCTTTGAGACCTCAATAAAGCCTTGTGTAAAATTATCCCCATTATGCATTACTATGAGATGCAATAAAAGTGAGACAGATAGATGGGGATTGACAAAATCAATAACAACAACAGCATCAACAACATCAACGACAGCATCAGCAAAAGTAGACATGGTCAATGAGACTAGTTCTTGTATAGCCCAGGATAATTGCACAGGCTTGGAACAAGAGCAAATGATAAGCTGTAAATTCAACATGACAGGGTTAAAAAGAGACAAGAAAAAAGAGTACAATGAAACTTGGTACTCTGCAGATTTGGTATGTGAACAAGGGAATAACACTGGTAATGAAAGTAGATGTTACATGAACCACTGTAACACTTCTGTTATCCAAGAGTCTTGTGACAAACATTATTGGGATGCTATTAGATTTAGGTATTGTGCACCTCCAGGTTATGCTTTGCTTAGATGTAATGACACAAATTATTCAGGCTTTATGCCTAAATGTTCTAAGGTGGTGGTCTCTTCATGCACAAGGATGATGGAGACACAGACTTCTACTTGGTTTGGCTTTAATGGAACTAGAGCAGAAAATAGAACTTATATTTACTGGCATGGTAGGGATAATAGGACTATAATTAGTTTAAATAAGTATTATAATCTAACAATGAAATGTAGAAGACCAGGAAATAAGACAGTTTTACCAGTCACCATTATGTCTGGATTGGTTTTCCACTCACAACCAATCAATGATAGGCCAAAGCAGGCATGGTGTTGGTTTGGAGGAAAATGGAAGGATGCAATAAAAGAGGTGAAGCAGACCATTGTCAAACATCCCAGGTATACTGGAACTAACAATACTGATAAAATCAAT
TTGACGGCTCCTGGAGGAGGAGATCCGGAAGTTACCTTCATGTGGACAAATTGCAGAGGAGAGTTCCTCTACTGTAAAATGAATTGGTTTCTAAATTGGGTAGAAGATAGGAATACAGCTAACCAGAAGCCAAAGGAACAGCATAAAAGGAATTACGTGCCATGTCATATTAGACAAATAATCAACACTTGGCATAAAGTAGGCAAAAATGTTTATTTGCCTCCAAGAGAGGGAGACCTCACGTGTAACTCCACAGTGACCAGTCTCATAGCAAACATAGATTGGATTGATGGAAACCAAACTAATATCACCATGAGTGCAGAGGTGGCAGAACTGTATCGATTGGAATTGGGAGATTATAAATTAGTAGAGATCACTCCAATTGGCTTGGCCCCCACAGATGTGAAGAGGTACACTACTGGTGGCACCTCAAGAAATAAAAGAGGGGTCTTTGTGCTAGGGTTCTTGGGTTTTCTCGCAACGGCAGGTTCTGCAATGGGCGCGGCGTCGTTGACGCTGACCGCTCAGTCCCGAACTTTATTGGCTGGGATAGTGCAGCAACAGCAACAGCTGTTGGACGTGGTCAAGAGACAACAAGAATTGTTGCGACTGACCGTCTGGGGAACAAAGAACCTCCAGACTAGGGTCACTGCCATCGAGAAGTACTTAAAGGACCAGGCGCAGCTGAATGCTTGGGGATGTGCGTTTAGACAAGTCTGCCACACTACTGTACCATGGCCAAATGCAAGTCTAACACCAAAGTGGAACAATGAGACTTGGCAAGAGTGGGAGCGAAAGGTTGACTTCTTGGAAGAAAATATAACAGCCCTCCTAGAGGAGGCACAAATTCAACAAGAGAAGAACATGTATGAATTACAAAAGTTGAATAGCTGGGATGTGTTTGGCAATTGGTTTGACCTTGCTTCTTGGATAAAGTATATACAATATGGAGTTTATATAGTTGTAGGAGTAATACTGTTAAGAATAGTGATCTATATAGTACAAATGCTAGCTAAGTTAAGGCAGGGGTATAGGCCAGTGTTCTCTTCCCCACCCTCTTATTTCCAGCAGACCCATATCCAACAGGACCCGGCACTGCCAACCAGAGAAGGCAAAGAAAGAGACGGTGGAGAAGGCGGTGGCAACAGCTCCTGGCCTTGGCAGATAGAATATATTCATTTCCTGATCCGCCAACTGATACGCCTCTTGACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGCGACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGGTCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGAAGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGGGACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGATATAGATGAGTAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGCAATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGG
CCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCA"
|
24
|
-
|
25
|
-
|
26
|
-
# check if the reference option is recognized; return a copy of the chosen reference sequence, or nil (after printing a message) if it is not
|
27
|
-
# Resolve a reference-sequence option to its sequence.
# input: ref_option, one of :HXB2, :NL43 or :MAC239
# return: a duplicate of the matching reference constant, so the caller
#         can modify it without touching the constant itself;
#         nil when the option is not recognized (the error message is
#         printed to standard output instead of being raised).
def self.check_ref(ref_option)
  case ref_option
  when :HXB2   then HXB2.dup
  when :NL43   then NL43.dup
  when :MAC239 then MAC239.dup
  else
    raise StandardError, "reference sequence not recognized, choose from 'HXB2' (default), 'NL43', or 'MAC239'."
  end
rescue StandardError => e
  # Any StandardError raised above is reported and turned into a nil result.
  puts e.message
  nil
end
|
44
|
-
|
45
|
-
end
|
data/lib/viral_seq/sdrm_core.rb
DELETED
@@ -1,652 +0,0 @@
|
|
1
|
-
# viral_seq/sdrm_core.rb
|
2
|
-
# core functions for HIV SDRM analysis using MPID-DR protocol.
|
3
|
-
# More details for HIV Surveillance Drug Resistance Mutation (SDRM) can be found at
|
4
|
-
# https://hivdb.stanford.edu/pages/surveillance.html
|
5
|
-
|
6
|
-
# Including methods as:
|
7
|
-
# ViralSeq::sdrm_nrti
|
8
|
-
# ViralSeq::sdrm_nnrti
|
9
|
-
# ViralSeq::hiv_protease
|
10
|
-
# ViralSeq::sdrm_int
|
11
|
-
# ViralSeq::sdrm_pr_bulk
|
12
|
-
# ViralSeq::sdrm_rt_bulk
|
13
|
-
# ViralSeq::sdrm_in_bulk
|
14
|
-
|
15
|
-
# ViralSeq.sdrm_nrti(aa_array, start_aa)
|
16
|
-
# ViralSeq.sdrm_nnrti(aa_array, start_aa)
|
17
|
-
# ViralSeq.hiv_protease(aa_array, start_aa)
|
18
|
-
# ViralSeq.sdrm_int(aa_array, start_aa)
|
19
|
-
# # functions to identify SDRMs from a given sequence in an Array object
|
20
|
-
# # function names indicate which HIV drug resistance mutations it can identify
|
21
|
-
# # input an Array object for amino acid sequence ['A', 'M', 'L', ...]
|
22
|
-
# # start_aa is an Integer to indicate codon number of the 1st amino acid sequence in the input aa_array
|
23
|
-
# # return a Hash object for SDRMs identified. {:position =>[:wildtype_codon, :mutation_codon]}
|
24
|
-
|
25
|
-
# ViralSeq.sdrm_pr_bulk(sequence_hash, minority_cut_off)
|
26
|
-
# ViralSeq.sdrm_rt_bulk(sequence_hash, minority_cut_off)
|
27
|
-
# ViralSeq.sdrm_in_bulk(sequence_hash, minority_cut_off)
|
28
|
-
# # functions to identify SDRMs from a sequence hash object.
|
29
|
-
# # name of the functions indicate which region it works on
|
30
|
-
# # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
|
31
|
-
# # PR codon 1-99
|
32
|
-
# # RT codon 34-122, 152-236, two regions are linked
|
33
|
-
# # IN codon 53-174
|
34
|
-
# # sequence_hash is a Hash object of sequences {:name => :sequence, ...}
|
35
|
-
# # sequences usually need to be QCed (remove sequences with stop codon and a3g hypermutations) first
|
36
|
-
# # minority_cut_off is the Integer cut-off for minimal abundance of a mutation to be called as valid mutation
|
37
|
-
# # minority_cut_off can be obtained using ViralSeq::poisson_minority_cutoff function
|
38
|
-
# # return [point_mutation_list, linkage_list, report_list]
|
39
|
-
# =USAGE
|
40
|
-
# # example (example files from ID:VS053118-0566)
|
41
|
-
# sequence = ViralSeq.fasta_to_hash('spec/sample_files/sample_dr_sequences/pr.fasta')
|
42
|
-
# p_cut_off = ViralSeq.poisson_minority_cutoff(sequence)
|
43
|
-
# pr_sdrm = ViralSeq.sdrm_pr_bulk(sequence, p_cut_off)
|
44
|
-
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"
|
45
|
-
# pr_sdrm[0].each {|n| puts n.join(',')}
|
46
|
-
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
|
47
|
-
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
|
48
|
-
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
|
49
|
-
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
|
50
|
-
#
|
51
|
-
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"
|
52
|
-
# pr_sdrm[1].each {|n| puts n.join(',')}
|
53
|
-
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
54
|
-
# => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
|
55
|
-
# => PR,396,WT,149,0.37626,0.32837,0.42602,
|
56
|
-
# => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
|
57
|
-
# => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
|
58
|
-
#
|
59
|
-
# puts "position,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(",")
|
60
|
-
# pr_sdrm[2].each {|n|puts n.join(",")}
|
61
|
-
# => position,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
|
62
|
-
# => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
63
|
-
# => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
64
|
-
# => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
65
|
-
# => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
66
|
-
# => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
67
|
-
# => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
|
68
|
-
# => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
69
|
-
# => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
|
70
|
-
# => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
71
|
-
# => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
72
|
-
# => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
73
|
-
# => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
|
74
|
-
# => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
|
75
|
-
# => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
76
|
-
# => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
|
77
|
-
# => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
78
|
-
# => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
79
|
-
# => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
|
80
|
-
# => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
81
|
-
# => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
82
|
-
# => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
83
|
-
# => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
84
|
-
# => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
85
|
-
# => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
86
|
-
# => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
87
|
-
# => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
88
|
-
# => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
89
|
-
# => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
90
|
-
# => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
91
|
-
# => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
92
|
-
# => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
93
|
-
# => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
94
|
-
# => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
95
|
-
# => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
96
|
-
# => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
97
|
-
# => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
98
|
-
# => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
99
|
-
# => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
100
|
-
# => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
|
101
|
-
# => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
102
|
-
# => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
|
103
|
-
# => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
|
104
|
-
# => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
105
|
-
# => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
106
|
-
# => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
107
|
-
# => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
108
|
-
# => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
109
|
-
# => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
110
|
-
# => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
111
|
-
# => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
|
112
|
-
# => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
113
|
-
# => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
114
|
-
# => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
115
|
-
# => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
116
|
-
# => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
117
|
-
# => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
118
|
-
# => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
|
119
|
-
# => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
120
|
-
# => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
|
121
|
-
# => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
122
|
-
# => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
123
|
-
# => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
124
|
-
# => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
125
|
-
# => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
126
|
-
# => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
127
|
-
# => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
128
|
-
# => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
129
|
-
# => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
130
|
-
# => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
131
|
-
# => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
132
|
-
# => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
|
133
|
-
# => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
|
134
|
-
# => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
135
|
-
# => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
136
|
-
# => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
137
|
-
# => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
|
-
# => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
139
|
-
# => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
140
|
-
# => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
141
|
-
# => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
142
|
-
# => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
143
|
-
# => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
|
144
|
-
# => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
|
145
|
-
# => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
146
|
-
# => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
147
|
-
# => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
148
|
-
# => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
|
149
|
-
# => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
150
|
-
# => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
151
|
-
# => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
152
|
-
# => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
153
|
-
# => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
154
|
-
# => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
155
|
-
# => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
156
|
-
# => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
157
|
-
# => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
|
158
|
-
# => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
159
|
-
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
160
|
-
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
module ViralSeq
|
165
|
-
|
166
|
-
# drug resistant mutation summary. input: amino acid array and starting codon, output, hash of summary
|
167
|
-
# Identify NRTI surveillance drug resistance mutations in an amino acid array.
# input: aa_array, Array of amino acids, one element per codon. An element
#        longer than one character is treated as a "/"-separated list of
#        alternative residues at that codon (e.g. "M/L").
#        start_aa, Integer codon number of the first element of aa_array.
# return: Hash {position => [wild_type_aa, observed_aa]}, in ascending
#         position order, for every listed position carrying a listed mutation.
def self.sdrm_nrti(aa_array, start_aa = 1)
  sdrm = {
    41  => ['M', ['L']],
    65  => ['K', ['R']],
    67  => ['D', ['N', 'G', 'E']],
    69  => ['T', ['D']],
    70  => ['K', ['R', 'E']],
    74  => ['L', ['V', 'I']],
    75  => ['V', ['M', 'T', 'A', 'S']],
    77  => ['F', ['L']],
    115 => ['Y', ['F']],
    116 => ['F', ['Y']],
    151 => ['Q', ['M']],
    184 => ['M', ['V', 'I']],
    210 => ['L', ['W']],
    215 => ['T', ['Y', 'F', 'I', 'C', 'D', 'V', 'E']],
    219 => ['K', ['Q', 'E', 'N', 'R']]
  }
  out_hash = {}
  aa_array.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      out_hash[position] = [wt_aa, test_aa] if test_aa != wt_aa && mutations.include?(test_aa)
    else
      # An Array is always truthy in Ruby, even when empty, so the
      # intersection must be tested for emptiness explicitly; otherwise
      # every ambiguous residue would be reported as a mutation.
      alternatives = test_aa.split("/")
      out_hash[position] = [wt_aa, test_aa] unless (alternatives & mutations).empty?
    end
  end
  out_hash
end
|
209
|
-
|
210
|
-
# Identify NNRTI surveillance drug resistance mutations in an amino acid array.
# input: aa_array, Array of amino acids, one element per codon. An element
#        longer than one character is treated as a "/"-separated list of
#        alternative residues at that codon (e.g. "K/N").
#        start_aa, Integer codon number of the first element of aa_array.
# return: Hash {position => [wild_type_aa, observed_aa]}, in ascending
#         position order, for every listed position carrying a listed mutation.
def self.sdrm_nnrti(aa_array, start_aa = 1)
  sdrm = {
    100 => ['L', ['I']],
    101 => ['K', ['E', 'P']],
    103 => ['K', ['N', 'S']],
    106 => ['V', ['M', 'A']],
    179 => ['V', ['F', 'D']],
    181 => ['Y', ['C', 'I', 'V']],
    188 => ['Y', ['L', 'H', 'C']],
    190 => ['G', ['A', 'S', 'E']],
    225 => ['P', ['H']],
    230 => ['M', ['L']]
  }
  out_hash = {}
  aa_array.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      out_hash[position] = [wt_aa, test_aa] if test_aa != wt_aa && mutations.include?(test_aa)
    else
      # An Array is always truthy in Ruby, even when empty, so the
      # intersection must be tested for emptiness explicitly; otherwise
      # every ambiguous residue would be reported as a mutation.
      alternatives = test_aa.split("/")
      out_hash[position] = [wt_aa, test_aa] unless (alternatives & mutations).empty?
    end
  end
  out_hash
end
|
247
|
-
|
248
|
-
#HIV protease surveillance mutations
|
249
|
-
|
250
|
-
# Identify HIV protease surveillance drug resistance mutations in an amino acid array.
# input: aa_array, Array of amino acids, one element per codon. An element
#        longer than one character is treated as a "/"-separated list of
#        alternative residues at that codon (e.g. "D/N").
#        start_aa, Integer codon number of the first element of aa_array.
# return: Hash {position => [wild_type_aa, observed_aa]}, in ascending
#         position order, for every listed position carrying a listed mutation.
def self.hiv_protease(aa_array, start_aa = 1)
  sdrm = {
    23 => ['L', ['I']],
    24 => ['L', ['I']],
    30 => ['D', ['N']],
    32 => ['V', ['I']],
    46 => ['M', ['I', 'L', 'V']], # M46V not on the SDRM list but we still include it.
    47 => ['I', ['V', 'A']],
    48 => ['G', ['V', 'M']],
    50 => ['I', ['V', 'L']],
    53 => ['F', ['Y']],
    54 => ['I', ['V', 'L', 'M', 'T', 'A', 'S']],
    73 => ['G', ['S', 'T', 'C', 'A']],
    76 => ['L', ['V']],
    82 => ['V', ['A', 'T', 'S', 'F', 'L', 'C', 'M']],
    83 => ['N', ['D']],
    84 => ['I', ['V', 'A', 'C']],
    85 => ['I', ['V']],
    88 => ['N', ['D', 'S']],
    90 => ['L', ['M']]
  }
  out_hash = {}
  aa_array.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      out_hash[position] = [wt_aa, test_aa] if test_aa != wt_aa && mutations.include?(test_aa)
    else
      # An Array is always truthy in Ruby, even when empty, so the
      # intersection must be tested for emptiness explicitly; otherwise
      # every ambiguous residue would be reported as a mutation.
      alternatives = test_aa.split("/")
      out_hash[position] = [wt_aa, test_aa] unless (alternatives & mutations).empty?
    end
  end
  out_hash
end
|
294
|
-
|
295
|
-
#HIV integrase drug resistance mutations
|
296
|
-
|
297
|
-
# Identify HIV integrase drug resistance mutations in an amino acid array.
# input: aa_array, Array of amino acids, one element per codon. An element
#        longer than one character is treated as a "/"-separated list of
#        alternative residues at that codon (e.g. "N/H").
#        start_aa, Integer codon number of the first element of aa_array.
# return: Hash {position => [wild_type_aa, observed_aa]}, in ascending
#         position order, for every listed position carrying a listed mutation.
def self.sdrm_int(aa_array, start_aa = 1)
  sdrm = {
    66  => ['T', ['A', 'I', 'K']],
    74  => ['L', ['M']],
    92  => ['E', ['Q']],
    95  => ['Q', ['K']],
    97  => ['T', ['A']],
    121 => ['F', ['Y']],
    140 => ['G', ['A', 'S', 'C']],
    143 => ['Y', ['C', 'H', 'R']],
    147 => ['S', ['G']],
    148 => ['Q', ['H', 'K', 'R']],
    155 => ['N', ['S', 'H']]
  }
  out_hash = {}
  aa_array.each_with_index do |test_aa, array_position|
    position = start_aa + array_position
    next unless sdrm.key?(position)

    wt_aa, mutations = sdrm[position]
    if test_aa.size == 1
      out_hash[position] = [wt_aa, test_aa] if test_aa != wt_aa && mutations.include?(test_aa)
    else
      # An Array is always truthy in Ruby, even when empty, so the
      # intersection must be tested for emptiness explicitly; otherwise
      # every ambiguous residue would be reported as a mutation.
      alternatives = test_aa.split("/")
      out_hash[position] = [wt_aa, test_aa] unless (alternatives & mutations).empty?
    end
  end
  out_hash
end
|
335
|
-
|
336
|
-
# input sequence hash, and Poisson cutoff for minority variants.
|
337
|
-
# HIV-1 PR region SDRM based on HIVDB.stanford.edu
|
338
|
-
# only for MPID-DR MiSeq sequences, PR codon 1-99
|
339
|
-
# return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
340
|
-
# Summarize protease SDRMs over a set of sequences.
# input: sequences, Hash {name => nucleotide sequence}
#        cutoff, Integer; counts below this value are flagged with "*"
# return: [point_mutation_list, linkage_list, report_list]
#   point_mutation_list rows:
#     [region, n_seq, position, wild type, mutation, count, fraction, CI low, CI high, label]
#   linkage_list rows:
#     [region, n_seq, linkage pattern, count, fraction, CI low, CI high, label]
#   report_list rows:
#     [region, codon position, n_seq, percentage for each entry of AMINO_ACID_LIST...]
def self.sdrm_pr_bulk(sequences, cutoff = 0)
  region = "PR"
  rf_label = 0
  start_codon_number = 1
  n_seq = sequences.size

  observed = {}         # position => [wild type, [mutation seen in each carrying sequence]]
  per_sequence = []     # one mutation hash per input sequence
  aa_strings = {}       # name => translated amino acid string

  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa_strings[name] = aa_seq.join("")
    record = ViralSeq.hiv_protease(aa_seq)
    per_sequence << record
    record.each do |position, (wt, mutant)|
      observed[position] ||= [wt, []]
      observed[position][1] << mutant
    end
  end

  # One row per (position, mutation) pair.
  point_mutation_list = observed.flat_map do |position, (wt, mutants)|
    ViralSeq.count(mutants).map do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      [region, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Collapse each sequence's mutation set into a "D30N+N88D"-style label ("WT" when empty).
  link2 = {}
  ViralSeq.count(per_sequence).each do |mutations, v|
    pattern = mutations.size == 0 ? ['WT'] : mutations.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end

  # The count-sorted order is reversed so the largest counts come first.
  linkage_list = link2.sort_by { |_key, value| value }.reverse.map do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Per-codon amino acid composition; the first sequence sets the number of positions.
  aa_rows = aa_strings.values
  report_list = []
  aa_rows[0].size.times do |offset|
    column = aa_rows.map { |row| row[offset] }
    tally = ViralSeq.count(column).sort_by { |_k, v| v }.reverse.to_h
    record = [region, start_codon_number + offset, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      record << (tally[amino_acid].to_f / n_seq * 100).round(4)
    end
    report_list << record
  end

  [point_mutation_list, linkage_list, report_list]
end
|
425
|
-
|
426
|
-
|
427
|
-
#input sequence hash, and Poisson cutoff for minority variants.
|
428
|
-
#HIV-1 RT region SDRM based on HIVDB.stanford.edu
|
429
|
-
#only for MPID-DR MiSeq sequences
|
430
|
-
#RT codon 34-122, 152-236 two regions are linked.
|
431
|
-
#return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
432
|
-
# Build the three drug-resistance report tables for a set of RT sequences.
#
# sequences - collection yielding (name, nucleotide string) pairs, e.g. a Hash
#             of name => sequence. Each sequence is split after its first
#             267 nt and re-joined around a fixed filler before translation.
# cutoff    - counts strictly below this value are flagged with "*" in the
#             last column of the first two tables (default 0: nothing flagged).
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - one row per (position, mutant) pair:
#     [class ("NRTI"/"NNRTI"), n_seq, position, wild type, mutant, count,
#      frequency, CI bound, CI bound, label], sorted by position.
#   linkage_list - one row per combination of mutations found on a sequence
#     ("WT" when none): ["RT", n_seq, pattern, count, frequency,
#      CI bound, CI bound, label], most frequent first.
#   report_list - one row per codon: ["RT", codon, n_seq] followed by the
#     percentage of each residue in ViralSeq::AMINO_ACID_LIST.
# Returns [[], [], []] when +sequences+ is empty (the previous code raised
# NoMethodError on nil in that case).
def self.sdrm_rt_bulk(sequences, cutoff = 0)
  n_seq = sequences.size
  # Nothing to tabulate; bail out before indexing into the first sequence.
  return [[], [], []] if n_seq.zero?

  region = "RT"
  rf_label = 1
  start_codon_number = 34
  # Fixed filler spliced between the two halves of every read.
  # NOTE(review): presumably stands in for the unsequenced stretch between
  # the two linked RT regions - confirm against the assay design.
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  mut_nrti = {}
  mut_nnrti = {}
  mut_com = []
  r1_aa = {}
  r2_aa = {}

  sequences.each do |name, seq|
    joined = seq[0, 267] + gap + seq[267..-1]
    s = ViralSeq::Sequence.new(name, joined)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array

    # Keep only the two sequenced stretches for the per-codon report:
    # the first 89 and the last 85 residues.
    r1_aa[name] = aa_seq[0, 89].join("")
    r2_aa[name] = aa_seq[-85..-1].join("")

    nrti = ViralSeq.sdrm_nrti(aa_seq, start_codon_number)
    nnrti = ViralSeq.sdrm_nnrti(aa_seq, start_codon_number)
    mut_com << nrti.merge(nnrti)

    # Accumulate, per position, [wild type, [every mutant residue observed]].
    [[nrti, mut_nrti], [nnrti, mut_nnrti]].each do |found, tally|
      found.each do |position, mutation|
        tally[position] ||= [mutation[0], []]
        tally[position][1] << mutation[1]
      end
    end
  end

  # Table 1: per-position substitution counts, NRTI rows first, then NNRTI,
  # before the final sort by position.
  point_mutation_list = []
  [["NRTI", mut_nrti], ["NNRTI", mut_nnrti]].each do |drug_class, tally|
    tally.each do |position, mutation|
      wt = mutation[0]
      ViralSeq.count(mutation[1]).each do |m, number|
        ci = ViralSeq.r_binom_CI(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
      end
    end
  end
  point_mutation_list.sort_by! { |record| record[2] }

  # Table 2: abundance of each combination of mutations seen on one sequence.
  link2 = {}
  ViralSeq.count(mut_com).each do |combo, occurrences|
    pattern = combo.size == 0 ? ['WT'] : combo.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = occurrences
  end
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Table 3: residue distribution at every codon of both stretches. The
  # first stretch is numbered from codon 34, the second from codon 152.
  div_aa = {}
  [[r1_aa, start_codon_number], [r2_aa, 152]].each do |stretch, first_codon|
    aligned = stretch.values
    aligned[0].size.times do |offset|
      column = aligned.map { |residues| residues[offset] }
      div_aa[first_codon + offset] = ViralSeq.count(column).sort_by { |_k, v| v }.reverse.to_h
    end
  end

  report_list = div_aa.map do |codon, counts|
    # A residue absent from +counts+ yields nil, and nil.to_f is 0.0.
    percentages = ViralSeq::AMINO_ACID_LIST.map do |amino_acid|
      (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    [region, codon, n_seq] + percentages
  end

  [point_mutation_list, linkage_list, report_list]
end
|
560
|
-
|
561
|
-
#input sequence hash, and Poisson cutoff for minority variants.
|
562
|
-
#HIV-1 IN region SDRM based on HIVDB.stanford.edu
|
563
|
-
#only for MPID-DR MiSeq sequences
|
564
|
-
#IN codon 53-174
|
565
|
-
#return [substitution rate with 95% CI, haplotype abundance with 95% CI, amino acid sequence report spreadsheet]
|
566
|
-
# Build the three drug-resistance report tables for a set of IN sequences.
#
# sequences - collection yielding (name, nucleotide string) pairs, e.g. a Hash
#             of name => sequence.
# cutoff    - counts strictly below this value are flagged with "*" in the
#             last column of the first two tables (default 0: nothing flagged).
#
# Returns [point_mutation_list, linkage_list, report_list]:
#   point_mutation_list - one row per (position, mutant) pair:
#     ["IN", n_seq, position, wild type, mutant, count, frequency,
#      CI bound, CI bound, label], sorted by position.
#   linkage_list - one row per combination of mutations found on a sequence
#     ("WT" when none): ["IN", n_seq, pattern, count, frequency,
#      CI bound, CI bound, label], most frequent first.
#   report_list - one row per codon, numbered from 53: ["IN", codon, n_seq]
#     followed by the percentage of each residue in ViralSeq::AMINO_ACID_LIST.
# Returns [[], [], []] when +sequences+ is empty (the previous code raised
# NoMethodError on nil in that case).
def self.sdrm_in_bulk(sequences, cutoff = 0)
  n_seq = sequences.size
  # Nothing to tabulate; bail out before indexing into the first sequence.
  return [[], [], []] if n_seq.zero?

  region = "IN"
  rf_label = 2
  start_codon_number = 53
  mut = {}
  mut_com = []
  aa = {}

  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.get_aa_array(rf_label)
    aa_seq = s.aa_array
    aa[name] = aa_seq.join("")

    record = ViralSeq.sdrm_int(aa_seq, start_codon_number)
    mut_com << record
    # Accumulate, per position, [wild type, [every mutant residue observed]].
    record.each do |position, mutation|
      mut[position] ||= [mutation[0], []]
      mut[position][1] << mutation[1]
    end
  end

  # Table 1: per-position substitution counts.
  point_mutation_list = []
  mut.each do |position, mutation|
    wt = mutation[0]
    ViralSeq.count(mutation[1]).each do |m, number|
      ci = ViralSeq.r_binom_CI(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, (number / n_seq.to_f).round(5), ci[0], ci[1], label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Table 2: abundance of each combination of mutations seen on one sequence.
  link2 = {}
  ViralSeq.count(mut_com).each do |combo, occurrences|
    pattern = combo.size == 0 ? ['WT'] : combo.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = occurrences
  end
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq.r_binom_CI(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, (v / n_seq.to_f).round(5), ci[0], ci[1], label]
  end

  # Table 3: residue distribution at every codon. The column count is taken
  # from the first translated sequence.
  div_aa = {}
  aligned = aa.values
  aligned[0].size.times do |offset|
    column = aligned.map { |residues| residues[offset] }
    div_aa[start_codon_number + offset] = ViralSeq.count(column).sort_by { |_k, v| v }.reverse.to_h
  end

  report_list = div_aa.map do |codon, counts|
    # A residue absent from +counts+ yields nil, and nil.to_f is 0.0.
    percentages = ViralSeq::AMINO_ACID_LIST.map do |amino_acid|
      (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    [region, codon, n_seq] + percentages
  end

  [point_mutation_list, linkage_list, report_list]
end
|
651
|
-
|
652
|
-
end
|