viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/muscle.rb
CHANGED
@@ -1,89 +1,67 @@
|
|
1
|
-
# viral_seq/muscle.rb
|
2
|
-
# wrapper for MUSCLE (http://www.drive5.com/muscle)
|
3
|
-
# Including Methods as:
|
4
|
-
# ViralSeq::check_muscle
|
5
|
-
# ViralSeq::muscle_align
|
6
|
-
# ViralSeq::muscle_align_multi
|
7
|
-
|
8
|
-
# ViralSeq.check_muscle?(path_to_muscle)
|
9
|
-
# # check if the path_to_muscle provided is valid,
|
10
|
-
# # prompt error messages if MUSCLE is not found.
|
11
|
-
|
12
|
-
# ViralSeq.muscle_align(reference_seq, test_sequence, path_to_muscle)
|
13
|
-
# # takes a reference sequence and a test sequence as String object
|
14
|
-
# # without specification on path_to_muscle, MuscleBio will be called to run Muscle
|
15
|
-
# # specify path_to_muscle if other source of muscle needed
|
16
|
-
# # returns aligned reference sequence and test sequences
|
17
|
-
|
18
|
-
# ViralSeq.muscle_align_multi(sequence_hash, path_to_muscle)
|
19
|
-
# # input a sequence_hash object {:name=>:sequence,...}
|
20
|
-
# # without specification on path_to_muscle, MuscleBio will be called to run Muscle
|
21
|
-
# # specify path_to_muscle if other source of muscle needed
|
22
|
-
# # return aligned sequences an hash
|
23
1
|
|
24
2
|
module ViralSeq
|
3
|
+
# alignment using MUSCLE alignment program
|
4
|
+
# @see http://www.drive5.com/muscle MUSCLE download link
|
25
5
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return true
|
31
|
-
rescue Errno::ENOENT
|
32
|
-
puts "
|
33
|
-
Error: MUSCLE is not found for at the provided {path_to_muscle}!!
|
34
|
-
MUSLCE can be download at http://www.drive5.com/muscle
|
35
|
-
Add MUSCLE excutable path to $PATH using
|
36
|
-
$ export PATH=$PATH:/path/to/muscle
|
37
|
-
or
|
38
|
-
provide path_to_MUSCLE in the function arguments\n
|
39
|
-
"
|
40
|
-
return false
|
41
|
-
end
|
42
|
-
end
|
6
|
+
module Muscle
|
7
|
+
# check if path_to_muscle is correct, prompt error messages if MUSCLE is not found.
|
8
|
+
# @param path_to_muscle [String] path to the MUSCLE executable
|
9
|
+
# @return [boolean]
|
43
10
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
return nil;
|
11
|
+
def self.check_muscle?(path_to_muscle)
|
12
|
+
begin
|
13
|
+
`#{path_to_muscle} -version`
|
14
|
+
return true
|
15
|
+
rescue Errno::ENOENT
|
16
|
+
puts "
|
17
|
+
Error: MUSCLE is not found for at the provided {path_to_muscle}!!
|
18
|
+
MUSLCE can be download at http://www.drive5.com/muscle
|
19
|
+
Add MUSCLE excutable path to $PATH using
|
20
|
+
$ export PATH=$PATH:/path/to/muscle
|
21
|
+
or
|
22
|
+
provide path_to_MUSCLE in the function arguments\n
|
23
|
+
"
|
24
|
+
return false
|
59
25
|
end
|
60
|
-
|
61
|
-
else
|
62
|
-
MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
|
63
|
-
end
|
64
|
-
aln_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
|
65
|
-
File.unlink(temp_file)
|
66
|
-
File.unlink(temp_aln)
|
67
|
-
return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
|
68
|
-
end
|
26
|
+
end # end of .check_muscle?
|
69
27
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
if
|
76
|
-
|
77
|
-
|
78
|
-
|
28
|
+
# align a sequence with reference sequence Strings
|
29
|
+
# @param ref_seq [String] reference sequence
|
30
|
+
# @param test_seq [String] test sequence
|
31
|
+
# @param path_to_muscle [String], path to the MUSCLE executable. If not provided (the default), RubyGem::MuscleBio will be used
|
32
|
+
# @return [Array] a pair of [:ref_seq_aligned, :test_seq_aligned] or nil
|
33
|
+
# if the MUSCLE executable cannot be found
|
34
|
+
# @example
|
35
|
+
# seq1 = 'AAGGCGTAGGAC'
|
36
|
+
# seq2 = 'AAGCTTAGGACG'
|
37
|
+
# aligned_seqs = ViralSeq::Muscle.align(seq1,seq2)
|
38
|
+
# => ["AAGGCGTAGGAC-", "-AAGCTTAGGACG"]
|
39
|
+
|
40
|
+
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
|
+
temp_dir=File.dirname($0)
|
42
|
+
temp_file = temp_dir + "/_temp_muscle_in"
|
43
|
+
temp_aln = temp_dir + "/_temp_muscle_aln"
|
44
|
+
name = ">test"
|
45
|
+
temp_in = File.open(temp_file,"w")
|
46
|
+
temp_in.puts ">ref"
|
47
|
+
temp_in.puts ref_seq
|
48
|
+
temp_in.puts name
|
49
|
+
temp_in.puts test_seq
|
50
|
+
temp_in.close
|
51
|
+
if path_to_muscle
|
52
|
+
unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
|
53
|
+
File.unlink(temp_file)
|
54
|
+
return nil;
|
55
|
+
end
|
56
|
+
print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
|
57
|
+
else
|
58
|
+
MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
|
79
59
|
end
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
end
|
89
|
-
end
|
60
|
+
aln_seq_hash = ViralSeq::SeqHash.fa(temp_aln).dna_hash
|
61
|
+
File.unlink(temp_file)
|
62
|
+
File.unlink(temp_aln)
|
63
|
+
return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
|
64
|
+
end # end of .align
|
65
|
+
end # end of ViralSeq::Muscle
|
66
|
+
|
67
|
+
end # end of ViralSeq
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
module PID
|
5
|
+
|
6
|
+
# generate all Primer ID combinations given the length of Primer ID
|
7
|
+
# @param l [Integer] the length of the Primer ID.
|
8
|
+
# @example generate a pool of Primer IDs with length of 10
|
9
|
+
# primer_id_pool = ViralSeq::PID.generate_pool(10) # 10 is the length of Primer ID
|
10
|
+
# puts primer_id_pool.size #should be 4^10
|
11
|
+
# => 1048576
|
12
|
+
|
13
|
+
def self.generate_pool(l=8)
|
14
|
+
nt = ['A','T','C','G']
|
15
|
+
pid_pool = ['A','T','C','G']
|
16
|
+
(l-1).times do
|
17
|
+
pid_pool = pid_pool.product(nt)
|
18
|
+
pid_pool.collect! do |v|
|
19
|
+
v.join("")
|
20
|
+
end
|
21
|
+
end
|
22
|
+
return pid_pool
|
23
|
+
end # end of .generate_pool
|
24
|
+
|
25
|
+
end # end of PID
|
26
|
+
end # end of ViralSeq
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# viral_seq main module
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# HIV/SIV reference genome sequences, including HXB2, NL43, MAC239
|
5
|
+
# @see https://www.ncbi.nlm.nih.gov/nuccore/K03455 Reference sequence of HIV-1 HXB2 (Genbank accession number K03455)
|
6
|
+
# @see https://www.ncbi.nlm.nih.gov/nuccore/AF324493 Reference sequence of HIV-1 NL43 (Genbank accession number AF324493)
|
7
|
+
# @see https://www.ncbi.nlm.nih.gov/nucleotide/M33262 Reference sequence of SIV MAC239 (Genbank accession number M33262)
|
8
|
+
# @example retrieve the reference sequence for HIV NL43
|
9
|
+
# ViralSeq::RefSeq.get(:NL43)
|
10
|
+
# => "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA..."
|
11
|
+
|
12
|
+
module RefSeq
|
13
|
+
|
14
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
15
|
+
# @return [String] the reference sequence as a String object
|
16
|
+
|
17
|
+
def self.get(ref_option)
|
18
|
+
begin
|
19
|
+
case ref_option
|
20
|
+
when :HXB2
|
21
|
+
"TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAGAA
ATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATC
TAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAA
GAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCT
CCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
22
|
+
when :NL43
|
23
|
+
"TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAAATAAGGAGAGAAGAACAGCTTGTTACACCCTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGCATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGTGTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAAAA
ATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACAGTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAAGGGTGCCCACACTAATGATGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCATAATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATATGTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCATC
TAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGGAATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAAAAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATGGGTTGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATATGGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTTGGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGATTAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTACTAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGAATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGCCTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCATACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATATGGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAG
AGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGATAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGGTTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCTGCACGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATT
TGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
24
|
+
when :MAC239
|
25
|
+
"GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAGGCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATACCCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCAGATTGGCGCCTGAACAGGGACTTGAAGGAGAGTGAGAGACTCCTGAGTACGGCTGAGTGAAGGCAGTAAGGGCGGCAGGAACCAACCACGACGGAGTGCTCCTATAAAGGCGCGGGTCGGTACCAGACGGCGTGAGGAGCGGGAGAGGAAGAGGCCTCCGGTTGCAGGTAAGTGCAACACAAAAAAGAAATAGCTGTCTTTTATCCAGGAAGGGGTAATAAGATAGAGTGGGAGATGGGCGTGAGAAACTCCGTCTTGTCAGGGAAGAAAGCAGATGAATTAGAAAAAATTAGGCTACGACCCAACGGAAAGAAAAAGTACATGTTGAAGCATGTAGTATGGGCAGCAAATGAATTAGATAGATTTGGATTAGCAGAAAGCCTGTTGGAGAACAAAGAAGGATGTCAAAAAATACTTTCGGTCTTAGCTCCATTAGTGCCAACAGGCTCAGAAAATTTAAAAAGCCTTTATAATACTGTCTGCGTCATCTGGTGCATTCACGCAGAAGAGAAAGTGAAACACACTGAGGAAGCAAAACAGATAGTGCAGAGACACCTAGTGGTGGAAACAGGAACAACAGAAACTATGCCAAAAACAAGTAGACCAACAGCACCATCTAGCGGCAGAGGAGGAAATTACCCAGTACAACAAATAGGTGGTAACTATGTCCACCTGCCATTAAGCCCGAGAACATTAAATGCCTGGGTAAAATTGATAGAGGAAAAGAAATTTGGAGCAGAAGTAGTGCCAGGATTTCAGGCACTGTCAGAAGGTTGCACCCCCTATGACATTAATCAGATGTTAAATTGTGTGGGAGACCATCAAGCGGCTATGCAGATTATCAGAGATATTATAAACGAGGAGGCTGCAGATTGGGACTTGCAGCACCCACAACCAGCTCCACAACAAGGACAACTTAGGGAGC
CGTCAGGATCAGATATTGCAGGAACAACTAGTTCAGTAGATGAACAAATCCAGTGGATGTACAGACAACAGAACCCCATACCAGTAGGCAACATTTACAGGAGATGGATCCAACTGGGGTTGCAAAAATGTGTCAGAATGTATAACCCAACAAACATTCTAGATGTAAAACAAGGGCCAAAAGAGCCATTTCAGAGCTATGTAGACAGGTTCTACAAAAGTTTAAGAGCAGAACAGACAGATGCAGCAGTAAAGAATTGGATGACTCAAACACTGCTGATTCAAAATGCTAACCCAGATTGCAAGCTAGTGCTGAAGGGGCTGGGTGTGAATCCCACCCTAGAAGAAATGCTGACGGCTTGTCAAGGAGTAGGGGGGCCGGGACAGAAGGCTAGATTAATGGCAGAAGCCCTGAAAGAGGCCCTCGCACCAGTGCCAATCCCTTTTGCAGCAGCCCAACAGAGGGGACCAAGAAAGCCAATTAAGTGTTGGAATTGTGGGAAAGAGGGACACTCTGCAAGGCAATGCAGAGCCCCAAGAAGACAGGGATGCTGGAAATGTGGAAAAATGGACCATGTTATGGCCAAATGCCCAGACAGACAGGCGGGTTTTTTAGGCCTTGGTCCATGGGGAAAGAAGCCCCGCAATTTCCCCATGGCTCAAGTGCATCAGGGGCTGATGCCAACTGCTCCCCCAGAGGACCCAGCTGTGGATCTGCTAAAGAACTACATGCAGTTGGGCAAGCAGCAGAGAGAAAAGCAGAGAGAAAGCAGAGAGAAGCCTTACAAGGAGGTGACAGAGGATTTGCTGCACCTCAATTCTCTCTTTGGAGGAGACCAGTAGTCACTGCTCATATTGAAGGACAGCCTGTAGAAGTATTACTGGATACAGGGGCTGATGATTCTATTGTAACAGGAATAGAGTTAGGTCCACATTATACCCCAAAAATAGTAGGAGGAATAGGAGGTTTTATTAATACTAAAGAATACAAAAATGTAGAAATAGAAGTTTTAGGCAAAAGGATTAAAGGGACAATCATGACAGGGGACACCCCGATTAACATTTTTGGTAGAAATTTGCTAACAGCTCTGGGGATGTCTCTAAATTTTCCCATAGCTAAAGTAGAGCCTGTAAAAGTCGCCTTAAAGCCAGGAAAGGATGGACCAAAATTGAAGCAGTGGCCATTATCAAAAGAAAAGATAGTTGCATTAAGAGAAATCTGTGAAAAGATGGAAAAGGATGGTCAGTTGGAGGAAGCTCCCCCGACCAATCCATACAACACCCCCACATTTGCTATAAAGAAAAAGGATAAGAACAAATGGAGAATGCTGATAGATTTTAGGGAACTAAATAGGGTCACTCAGGACTTTACGGAAGTCCAATTAGGAATACCACACCCTGCAGGACTAGCAAAAAGGAAAAGAATTACAGTACTGGATATAGGTGATGCATATTTCTCCATACCTCTAGATGAAGAATTTAGGCAGTACACTGCCTTTACTTTACCATCAGTAAATAATGCAGAGCCAGGAAAACGATACATTTATAAGGTTCTGCCTCAGGGATGGAAGGGGTCACCAGCCATCTTCCAATACACTATGAGACATGTGCTAGAACCCTTCAGGAAGGCAAATCCAGATGTGACCTTAGTCCAGTATATGGATGACATCTTAATAGCTAGTGACAGGACAGACCTGGAACATGACAGGGTAGTTTTACAGTCAAAGGAACTCTTGAATAGCATAGGGTTTTCTACCCCAGAAGAGAAATTCCAAAAAGATCCCCCATTTCAATGGATGGGGTACGAATTGTGGCCAACAAAATGGAAGTTGCAAAAGATAGAGTTGCCACAAAGAGAGACCTGGACAGTGAATGATATACAGAAGTTAGTAGGAGTATTAAATTGGGCAGCTCAAATTTATCCAGGTATAAAAACCAAACATCTCTGTAGGTTAATTAGAGGAAAAATGACTCTAACAGAGGAAGTTCAGTGGACTGAGA
TGGCAGAAGCAGAATATGAGGAAAATAAAATAATTCTCAGTCAGGAACAAGAAGGATGTTATTACCAAGAAGGCAAGCCATTAGAAGCCACGGTAATAAAGAGTCAGGACAATCAGTGGTCTTATAAAATTCACCAAGAAGACAAAATACTGAAAGTAGGAAAATTTGCAAAGATAAAGAATACACATACCAATGGAGTGAGACTATTAGCACATGTAATACAGAAAATAGGAAAGGAAGCAATAGTGATCTGGGGACAGGTCCCAAAATTCCACTTACCAGTTGAGAAGGATGTATGGGAACAGTGGTGGACAGACTATTGGCAGGTAACCTGGATACCGGAATGGGATTTTATCTCAACACCACCGCTAGTAAGATTAGTCTTCAATCTAGTGAAGGACCCTATAGAGGGAGAAGAAACCTATTATACAGATGGATCATGTAATAAACAGTCAAAAGAAGGGAAAGCAGGATATATCACAGATAGGGGCAAAGACAAAGTAAAAGTGTTAGAACAGACTACTAATCAACAAGCAGAATTGGAAGCATTTCTCATGGCATTGACAGACTCAGGGCCAAAGGCAAATATTATAGTAGATTCACAATATGTTATGGGAATAATAACAGGATGCCCTACAGAATCAGAGAGCAGGCTAGTTAATCAAATAATAGAAGAAATGATTAAAAAGTCAGAAATTTATGTAGCATGGGTACCAGCACACAAAGGTATAGGAGGAAACCAAGAAATAGACCACCTAGTTAGTCAAGGGATTAGACAAGTTCTCTTCTTGGAAAAGATAGAGCCAGCACAAGAAGAACATGATAAATACCATAGTAATGTAAAAGAATTGGTATTCAAATTTGGATTACCCAGAATAGTGGCCAGACAGATAGTAGACACCTGTGATAAATGTCATCAGAAAGGAGAGGCTATACATGGGCAGGCAAATTCAGATCTAGGGACTTGGCAAATGGATTGTACCCATCTAGAGGGAAAAATAATCATAGTTGCAGTACATGTAGCTAGTGGATTCATAGAAGCAGAGGTAATTCCACAAGAGACAGGAAGACAGACAGCACTATTTCTGTTAAAATTGGCAGGCAGATGGCCTATTACACATCTACACACAGATAATGGTGCTAACTTTGCTTCGCAAGAAGTAAAGATGGTTGCATGGTGGGCAGGGATAGAGCACACCTTTGGGGTACCATACAATCCACAGAGTCAGGGAGTAGTGGAAGCAATGAATCACCACCTGAAAAATCAAATAGATAGAATCAGGGAACAAGCAAATTCAGTAGAAACCATAGTATTAATGGCAGTTCATTGCATGAATTTTAAAAGAAGGGGAGGAATAGGGGATATGACTCCAGCAGAAAGATTAATTAACATGATCACTACAGAACAAGAGATACAATTTCAACAATCAAAAAACTCAAAATTTAAAAATTTTCGGGTCTATTACAGAGAAGGCAGAGATCAACTGTGGAAGGGACCCGGTGAGCTATTGTGGAAAGGGGAAGGAGCAGTCATCTTAAAGGTAGGGACAGACATTAAGGTAGTACCCAGAAGAAAGGCTAAAATTATCAAAGATTATGGAGGAGGAAAAGAGGTGGATAGCAGTTCCCACATGGAGGATACCGGAGAGGCTAGAGAGGTGGCATAGCCTCATAAAATATCTGAAATATAAAACTAAAGATCTACAAAAGGTTTGCTATGTGCCCCATTTTAAGGTCGGATGGGCATGGTGGACCTGCAGCAGAGTAATCTTCCCACTACAGGAAGGAAGCCATTTAGAAGTACAAGGGTATTGGCATTTGACACCAGAAAAAGGGTGGCTCAGTACTTATGCAGTGAGGATAACCTGGTACTCAAAGAACTTTTGGACAGATGTAACACCAAACTATGCAGACATTTTACTGCATAGCACTTATTTCCCTTGCTTTACAGCGGGAGAAGTGAGAAGGGCCATCAGGGGAGAACAACTGCTGTCTTGCTG
CAGGTTCCCGAGAGCTCATAAGTACCAGGTACCAAGCCTACAGTACTTAGCACTGAAAGTAGTAAGCGATGTCAGATCCCAGGGAGAGAATCCCACCTGGAAACAGTGGAGAAGAGACAATAGGAGAGGCCTTCGAATGGCTAAACAGAACAGTAGAGGAGATAAACAGAGAGGCGGTAAACCACCTACCAAGGGAGCTAATTTTCCAGGTTTGGCAAAGGTCTTGGGAATACTGGCATGATGAACAAGGGATGTCACCAAGCTATGTAAAATACAGATACTTGTGTTTAATACAAAAGGCTTTATTTATGCATTGCAAGAAAGGCTGTAGATGTCTAGGGGAAGGACATGGGGCAGGGGGATGGAGACCAGGACCTCCTCCTCCTCCCCCTCCAGGACTAGCATAAATGGAAGAAAGACCTCCAGAAAATGAAGGACCACAAAGGGAACCATGGGATGAATGGGTAGTGGAGGTTCTGGAAGAACTGAAAGAAGAAGCTTTAAAACATTTTGATCCTCGCTTGCTAACTGCACTTGGTAATCATATCTATAATAGACATGGAGACACCCTTGAGGGAGCAGGAGAACTCATTAGAATCCTCCAACGAGCGCTCTTCATGCATTTCAGAGGCGGATGCATCCACTCCAGAATCGGCCAACCTGGGGGAGGAAATCCTCTCTCAGCTATACCGCCCTCTAGAAGCATGCTATAACACATGCTATTGTAAAAAGTGTTGCTACCATTGCCAGTTTTGTTTTCTTAAAAAAGGCTTGGGGATATGTTATGAGCAATCACGAAAGAGAAGAAGAACTCCGAAAAAGGCTAAGGCTAATACATCTTCTGCATCAAACAAGTAAGTATGGGATGTCTTGGGAATCAGCTGCTTATCGCCATCTTGCTTTTAAGTGTCTATGGGATCTATTGTACTCTATATGTCACAGTCTTTTATGGTGTACCAGCTTGGAGGAATGCGACAATTCCCCTCTTTTGTGCAACCAAGAATAGGGATACTTGGGGAACAACTCAGTGCCTACCAGATAATGGTGATTATTCAGAAGTGGCCCTTAATGTTACAGAAAGCTTTGATGCCTGGAATAATACAGTCACAGAACAGGCAATAGAGGATGTATGGCAACTCTTTGAGACCTCAATAAAGCCTTGTGTAAAATTATCCCCATTATGCATTACTATGAGATGCAATAAAAGTGAGACAGATAGATGGGGATTGACAAAATCAATAACAACAACAGCATCAACAACATCAACGACAGCATCAGCAAAAGTAGACATGGTCAATGAGACTAGTTCTTGTATAGCCCAGGATAATTGCACAGGCTTGGAACAAGAGCAAATGATAAGCTGTAAATTCAACATGACAGGGTTAAAAAGAGACAAGAAAAAAGAGTACAATGAAACTTGGTACTCTGCAGATTTGGTATGTGAACAAGGGAATAACACTGGTAATGAAAGTAGATGTTACATGAACCACTGTAACACTTCTGTTATCCAAGAGTCTTGTGACAAACATTATTGGGATGCTATTAGATTTAGGTATTGTGCACCTCCAGGTTATGCTTTGCTTAGATGTAATGACACAAATTATTCAGGCTTTATGCCTAAATGTTCTAAGGTGGTGGTCTCTTCATGCACAAGGATGATGGAGACACAGACTTCTACTTGGTTTGGCTTTAATGGAACTAGAGCAGAAAATAGAACTTATATTTACTGGCATGGTAGGGATAATAGGACTATAATTAGTTTAAATAAGTATTATAATCTAACAATGAAATGTAGAAGACCAGGAAATAAGACAGTTTTACCAGTCACCATTATGTCTGGATTGGTTTTCCACTCACAACCAATCAATGATAGGCCAAAGCAGGCATGGTGTTGGTTTGGAGGAAAATGGAAGGATGCAATAAAAGAGGTGAAGCAGACCATTGTCAAACATCCCAGGTATACTGGAACTAACAATACTGATAAAATCAATTTGACGGCT
CCTGGAGGAGGAGATCCGGAAGTTACCTTCATGTGGACAAATTGCAGAGGAGAGTTCCTCTACTGTAAAATGAATTGGTTTCTAAATTGGGTAGAAGATAGGAATACAGCTAACCAGAAGCCAAAGGAACAGCATAAAAGGAATTACGTGCCATGTCATATTAGACAAATAATCAACACTTGGCATAAAGTAGGCAAAAATGTTTATTTGCCTCCAAGAGAGGGAGACCTCACGTGTAACTCCACAGTGACCAGTCTCATAGCAAACATAGATTGGATTGATGGAAACCAAACTAATATCACCATGAGTGCAGAGGTGGCAGAACTGTATCGATTGGAATTGGGAGATTATAAATTAGTAGAGATCACTCCAATTGGCTTGGCCCCCACAGATGTGAAGAGGTACACTACTGGTGGCACCTCAAGAAATAAAAGAGGGGTCTTTGTGCTAGGGTTCTTGGGTTTTCTCGCAACGGCAGGTTCTGCAATGGGCGCGGCGTCGTTGACGCTGACCGCTCAGTCCCGAACTTTATTGGCTGGGATAGTGCAGCAACAGCAACAGCTGTTGGACGTGGTCAAGAGACAACAAGAATTGTTGCGACTGACCGTCTGGGGAACAAAGAACCTCCAGACTAGGGTCACTGCCATCGAGAAGTACTTAAAGGACCAGGCGCAGCTGAATGCTTGGGGATGTGCGTTTAGACAAGTCTGCCACACTACTGTACCATGGCCAAATGCAAGTCTAACACCAAAGTGGAACAATGAGACTTGGCAAGAGTGGGAGCGAAAGGTTGACTTCTTGGAAGAAAATATAACAGCCCTCCTAGAGGAGGCACAAATTCAACAAGAGAAGAACATGTATGAATTACAAAAGTTGAATAGCTGGGATGTGTTTGGCAATTGGTTTGACCTTGCTTCTTGGATAAAGTATATACAATATGGAGTTTATATAGTTGTAGGAGTAATACTGTTAAGAATAGTGATCTATATAGTACAAATGCTAGCTAAGTTAAGGCAGGGGTATAGGCCAGTGTTCTCTTCCCCACCCTCTTATTTCCAGCAGACCCATATCCAACAGGACCCGGCACTGCCAACCAGAGAAGGCAAAGAAAGAGACGGTGGAGAAGGCGGTGGCAACAGCTCCTGGCCTTGGCAGATAGAATATATTCATTTCCTGATCCGCCAACTGATACGCCTCTTGACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGCGACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGGTCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGAAGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGGGACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGATATAGATGAGTAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGCAATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTT
ATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCA"
|
26
|
+
else
|
27
|
+
raise StandardError.new("reference sequence not recognized, choose from :HXB2 (default), :NL43, or :MAC239.")
|
28
|
+
end
|
29
|
+
rescue StandardError => e
|
30
|
+
puts e.message
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
# Fisher's Exact Test Function Library
|
4
|
+
#
|
5
|
+
# Based on JavaScript version created by: Oyvind Langsrud,
|
6
|
+
# Ported to Ruby by Bryan Donovan
|
7
|
+
|
8
|
+
module Rubystats
|
9
|
+
# Fisher's exact test
|
10
|
+
class FishersExactTest
|
11
|
+
|
12
|
+
# Start every piece of cached state at 0.0.
def initialize
  # Table parameters and the probability cached between hyper0 calls.
  @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0

  # Running tail accumulators.
  @sleft = @sright = @sless = @slarg = 0.0

  # NOTE(review): presumably the final left / right / two-tailed p-values;
  # their use is not visible in this part of the file.
  @left = @right = @twotail = 0.0
end
|
28
|
+
|
29
|
+
# @see http://lib.stat.cmu.edu/apstat/245 Reference: "Lanczos, C. 'A precision approximation of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964." Translation of Alan Miller's FORTRAN-implementation.
|
30
|
+
|
31
|
+
# Natural logarithm of the gamma function, via a Lanczos series.
#
# @param z [Numeric] argument of the gamma function
# @return [Float] approximation of ln(gamma(z))
def lngamm(z)
  # [signed coefficient, offset added to z], listed in the order in which
  # the terms are accumulated so floating-point rounding is unchanged.
  shifted_terms = [
    [0.0000001659470187408462, 7],
    [0.000009934937113930748, 6],
    [-0.1385710331296526, 5],
    [12.50734324009056, 4],
    [-176.6150291498386, 3],
    [771.3234287757674, 2],
    [-1259.139216722289, 1]
  ]

  series = shifted_terms.reduce(0) do |acc, (coeff, offset)|
    acc + coeff / (z + offset)
  end
  series += 676.5203681218835 / (z)
  series += 0.9999999999995183

  ::Math.log(series) - 5.58106146679532777 - z + (z - 0.5) * ::Math.log(z + 6.5)
end
|
45
|
+
|
46
|
+
# Natural logarithm of n factorial, computed as lngamm(n + 1).
#
# @param n [Numeric] the value whose factorial is wanted
# @return [Numeric] 0 when n <= 1, otherwise lngamm(n + 1)
def lnfact(n)
  return 0 if n <= 1

  lngamm(n + 1)
end
|
53
|
+
|
54
|
+
def lnbico(n,k)
|
55
|
+
return lnfact(n) - lnfact(k) - lnfact(n-k)
|
56
|
+
end
|
57
|
+
|
58
|
+
def hyper_323(n11, n1_, n_1, n)
|
59
|
+
return ::Math.exp(lnbico(n1_, n11) + lnbico(n-n1_, n_1-n11) - lnbico(n, n_1))
|
60
|
+
end
|
61
|
+
|
62
|
+
def hyper(n11)
|
63
|
+
return hyper0(n11, 0, 0, 0)
|
64
|
+
end
|
65
|
+
|
66
|
+
def hyper0(n11i,n1_i,n_1i,ni)
|
67
|
+
if n1_i == 0 and n_1i ==0 and ni == 0
|
68
|
+
unless n11i % 10 == 0
|
69
|
+
if n11i == @sn11+1
|
70
|
+
@sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
|
71
|
+
@sn11 = n11i
|
72
|
+
return @sprob
|
73
|
+
end
|
74
|
+
if n11i == @sn11-1
|
75
|
+
@sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
|
76
|
+
@sn11 = n11i
|
77
|
+
return @sprob
|
78
|
+
end
|
79
|
+
end
|
80
|
+
@sn11 = n11i
|
81
|
+
else
|
82
|
+
@sn11 = n11i
|
83
|
+
@sn1_ = n1_i
|
84
|
+
@sn_1 = n_1i
|
85
|
+
@sn = ni
|
86
|
+
end
|
87
|
+
@sprob = hyper_323(@sn11,@sn1_,@sn_1,@sn)
|
88
|
+
return @sprob
|
89
|
+
end
|
90
|
+
|
91
|
+
def exact(n11,n1_,n_1,n)
|
92
|
+
|
93
|
+
p = i = j = prob = 0.0
|
94
|
+
|
95
|
+
max = n1_
|
96
|
+
max = n_1 if n_1 < max
|
97
|
+
min = n1_ + n_1 - n
|
98
|
+
min = 0 if min < 0
|
99
|
+
|
100
|
+
if min == max
|
101
|
+
@sless = 1
|
102
|
+
@sright = 1
|
103
|
+
@sleft = 1
|
104
|
+
@slarg = 1
|
105
|
+
return 1
|
106
|
+
end
|
107
|
+
|
108
|
+
prob = hyper0(n11,n1_,n_1,n)
|
109
|
+
@sleft = 0
|
110
|
+
|
111
|
+
p = hyper(min)
|
112
|
+
i = min + 1
|
113
|
+
while p < (0.99999999 * prob)
|
114
|
+
@sleft += p
|
115
|
+
p = hyper(i)
|
116
|
+
i += 1
|
117
|
+
end
|
118
|
+
|
119
|
+
i -= 1
|
120
|
+
|
121
|
+
if p < (1.00000001*prob)
|
122
|
+
@sleft += p
|
123
|
+
else
|
124
|
+
i -= 1
|
125
|
+
end
|
126
|
+
|
127
|
+
@sright = 0
|
128
|
+
|
129
|
+
p = hyper(max)
|
130
|
+
j = max - 1
|
131
|
+
while p < (0.99999999 * prob)
|
132
|
+
@sright += p
|
133
|
+
p = hyper(j)
|
134
|
+
j -= 1
|
135
|
+
end
|
136
|
+
j += 1
|
137
|
+
|
138
|
+
if p < (1.00000001*prob)
|
139
|
+
@sright += p
|
140
|
+
else
|
141
|
+
j += 1
|
142
|
+
end
|
143
|
+
|
144
|
+
if (i - n11).abs < (j - n11).abs
|
145
|
+
@sless = @sleft
|
146
|
+
@slarg = 1 - @sleft + prob
|
147
|
+
else
|
148
|
+
@sless = 1 - @sright + prob
|
149
|
+
@slarg = @sright
|
150
|
+
end
|
151
|
+
return prob
|
152
|
+
end
|
153
|
+
|
154
|
+
def calculate(n11_,n12_,n21_,n22_)
|
155
|
+
n11_ *= -1 if n11_ < 0
|
156
|
+
n12_ *= -1 if n12_ < 0
|
157
|
+
n21_ *= -1 if n21_ < 0
|
158
|
+
n22_ *= -1 if n22_ < 0
|
159
|
+
n1_ = n11_ + n12_
|
160
|
+
n_1 = n11_ + n21_
|
161
|
+
n = n11_ + n12_ + n21_ + n22_
|
162
|
+
exact(n11_,n1_,n_1,n)
|
163
|
+
left = @sless
|
164
|
+
right = @slarg
|
165
|
+
twotail = @sleft + @sright
|
166
|
+
twotail = 1 if twotail > 1
|
167
|
+
values_hash = { :left =>left, :right =>right, :twotail =>twotail }
|
168
|
+
return values_hash
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
@@ -0,0 +1,1043 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# ViralSeq::SeqHash class for operation on multiple sequences.
|
5
|
+
# @example read a FASTA sequence file of HIV PR sequences, make alignment, perform the QC location check, filter sequences with stop codons and APOBEC3g/f hypermutations, calculate pairwise diversity, calculate minority cut-off based on Poisson model, and examine for drug resistance mutations.
|
6
|
+
# my_pr_seqhash = ViralSeq::SeqHash.fa('my_pr_fasta_file.fasta')
|
7
|
+
# # new ViralSeq::SeqHash object from a FASTA file
|
8
|
+
# aligned_pr_seqhash = my_pr_seqhash.align
|
9
|
+
# # align with MUSCLE
|
10
|
+
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
|
+
# # filter nt sequences with the reference coordinates
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
|
13
|
+
# # return a new ViralSeq::SeqHash object without stop codons
|
14
|
+
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
|
+
# # further filter out sequences with A3G hypermutations
|
16
|
+
# filtered_seqhash.pi
|
17
|
+
# # return pairwise diversity π
|
18
|
+
# cut_off = filtered_seqhash.pm
|
19
|
+
# # return cut-off for minority variants based on Poisson model
|
20
|
+
# filtered_seqhash.sdrm_hiv_pr(cut_off)
|
21
|
+
# # examine for drug resistance mutations for PR region.
|
22
|
+
|
23
|
+
class SeqHash
|
24
|
+
# initialize a ViralSeq::SeqHash object
|
25
|
+
def initialize(dna_hash = {}, aa_hash = {}, qc_hash = {}, title = "", file = "")
  # @param dna_hash [Hash] name => nucleotide sequence string
  # @param aa_hash [Hash] name => amino acid sequence string
  # @param qc_hash [Hash] name => quality score string
  # @param title [String] title of the SeqHash object
  # @param file [String] path of the file the object was read from, if any
  @dna_hash = dna_hash
  @aa_hash = aa_hash
  @qc_hash = qc_hash
  @title = title
  @file = file
end
|
32
|
+
|
33
|
+
# @return [Hash] Hash object for :name => :sequence_string pairs
|
34
|
+
attr_accessor :dna_hash
|
35
|
+
|
36
|
+
# @return [Hash] Hash object for :name => :amino_acid_sequence_string pairs
|
37
|
+
attr_accessor :aa_hash
|
38
|
+
|
39
|
+
# @return [Hash] Hash object for :name => :qc_score_string pairs
|
40
|
+
attr_accessor :qc_hash
|
41
|
+
|
42
|
+
# @return [String] the title of the SeqHash object.
|
43
|
+
# default as the file basename if SeqHash object is initialized using ::fa or ::fq
|
44
|
+
attr_accessor :title
|
45
|
+
|
46
|
+
# @return [String] the file that is used to initialize SeqHash object, if it exists
|
47
|
+
attr_accessor :file
|
48
|
+
|
49
|
+
# initialize a new ViralSeq::SeqHash object from a FASTA format sequence file
|
50
|
+
# @param infile [String] path to the FASTA format sequence file
|
51
|
+
# @return [ViralSeq::SeqHash]
|
52
|
+
# @example new ViralSeq::SeqHash object from a FASTA file
|
53
|
+
# ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
54
|
+
|
55
|
+
def self.new_from_fasta(infile)
  # File.foreach closes the handle even when parsing raises.
  sequences = {}
  name = nil
  File.foreach(infile) do |raw|
    line = raw.delete("\u0000").chomp
    # skip blank lines and "=" separator lines
    next if line.empty? || line.start_with?("=")
    if line.start_with?(">")
      name = line
      sequences[name] = +""
    elsif name
      # sequence data before the first header has no name and is ignored
      sequences[name] << line.upcase
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = sequences
  seq_hash.title = File.basename(infile, ".*")
  seq_hash.file = infile
  seq_hash
end # end of ::new_from_fasta
|
77
|
+
|
78
|
+
# initialize a new ViralSeq::SeqHash object from a FASTA format sequence file of amino acid sequences
|
79
|
+
# @param infile [String] path to the FASTA format sequence file of aa sequences
|
80
|
+
# @return [ViralSeq::SeqHash]
|
81
|
+
|
82
|
+
def self.new_from_aa_fasta(infile)
  # File.foreach closes the handle even when parsing raises.
  sequences = {}
  name = nil
  File.foreach(infile) do |raw|
    line = raw.delete("\u0000").chomp
    # skip blank lines and "=" separator lines
    next if line.empty? || line.start_with?("=")
    if line.start_with?(">")
      name = line
      sequences[name] = +""
    elsif name
      # sequence data before the first header has no name and is ignored
      sequences[name] << line.upcase
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.aa_hash = sequences
  seq_hash.title = File.basename(infile, ".*")
  seq_hash.file = infile
  seq_hash
end # end of ::new_from_aa_fasta
|
104
|
+
|
105
|
+
# initialize a new ViralSeq::SeqHash object from a FASTQ format sequence file
|
106
|
+
# @param fastq_file [String] path to the FASTQ format sequence file
|
107
|
+
# @return [ViralSeq::SeqHash]
|
108
|
+
# @example new ViralSeq::SeqHash object from a FASTQ file
|
109
|
+
# ViralSeq::SeqHash.fq('my_fastq_file.fastq')
|
110
|
+
|
111
|
+
def self.new_from_fastq(fastq_file)
  # A FASTQ record is four lines: "@name", sequence, "+" separator, quality.
  # The hashes are filled directly; Hash[*array] on a large file can exhaust
  # the stack and raises on an odd element count (e.g. a trailing blank line).
  sequence_hash = {}
  quality_hash = {}
  File.foreach(fastq_file).each_slice(4) do |header, seq, _separator, qual|
    # only the leading "@" marks the record name; "@" elsewhere is kept
    name = header.chomp.sub(/\A@/, '>')
    next if name.empty? || seq.nil?
    sequence_hash[name] = seq.chomp
    quality_hash[name] = qual.to_s.chomp
  end

  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = sequence_hash
  seq_hash.qc_hash = quality_hash
  seq_hash.title = File.basename(fastq_file, ".*")
  seq_hash.file = fastq_file
  seq_hash
end # end of ::new_from_fastq
|
143
|
+
|
144
|
+
# initialize a ViralSeq::SeqHash object with an array of sequence strings
|
145
|
+
# @param master_tag [String] master tag to put in the sequence names
|
146
|
+
# @return [ViralSeq::SeqHash] No @qc_hash, @title will be the master_tag
|
147
|
+
|
148
|
+
def self.new_from_array(seq_array, master_tag = 'seq')
  # @param seq_array [Array<String>] sequence strings, named in input order
  # Names are "<master_tag>_<n>" with n starting at 1.
  named = {}
  seq_array.each.with_index(1) do |seq, n|
    named["#{master_tag}_#{n}"] = seq
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = named
  seq_hash.title = master_tag
  seq_hash
end # end of ::new_from_array
|
160
|
+
|
161
|
+
|
162
|
+
class << self
|
163
|
+
alias_method :fa, :new_from_fasta
|
164
|
+
alias_method :fq, :new_from_fastq
|
165
|
+
alias_method :aa_fa, :new_from_aa_fasta
|
166
|
+
alias_method :array, :new_from_array
|
167
|
+
end
|
168
|
+
|
169
|
+
# generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
|
170
|
+
# @return [String] relaxed sequencial phylip format in a String object
|
171
|
+
# @example convert fasta format to relaxed sequencial phylip format
|
172
|
+
# # my_fasta_file.fasta
|
173
|
+
# # >seq1
|
174
|
+
# # ATAAGAACG
|
175
|
+
# # >seq2
|
176
|
+
# # ATATGAACG
|
177
|
+
# # >seq3
|
178
|
+
# # ATGAGAACG
|
179
|
+
# my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
|
180
|
+
# puts my_seqhash.to_rsphylip
|
181
|
+
# # 3 9
|
182
|
+
# # seq1 ATAAGAACG
|
183
|
+
# # seq2 ATATGAACG
|
184
|
+
# # seq3 ATGAGAACG
|
185
|
+
|
186
|
+
def to_rsphylip
  seqs = dna_hash
  # nothing to lay out: header only
  return " 0 0\n" if seqs.empty?

  # Names are written without the FASTA ">" marker.
  names = seqs.keys.map { |k| k.tr(">", "") }
  # The name column is as wide as the longest name, with a minimum of 10.
  # (String#max would pick the lexicographically largest name instead.)
  name_block_l = [names.map(&:size).max, 10].max

  outline = +" #{seqs.size} #{seqs.values[0].size}\n"
  names.zip(seqs.values).each do |name, seq|
    # two spaces after the name column, then the sequence in blocks of 10
    outline << name.ljust(name_block_l + 2) << seq.scan(/.{1,10}/).join(" ") << "\n"
  end
  outline
end # end of #to_rsphylip
|
198
|
+
|
199
|
+
# translate the DNA sequences in @dna_hash to amino acid sequences. generate value for @aa_hash
|
200
|
+
# @param codon_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
|
201
|
+
# @return [NilClass]
|
202
|
+
# @example translate dna sequences from a FASTA format sequence file
|
203
|
+
# # my_fasta_file.fasta
|
204
|
+
# # >seq1
|
205
|
+
# # ATAAGAACG
|
206
|
+
# # >seq2
|
207
|
+
# # ATATGAACG
|
208
|
+
# # >seq3
|
209
|
+
# # ATGAGAACG
|
210
|
+
# my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
|
211
|
+
# my_seqhash.translate
|
212
|
+
# my_seqhash.aa_sequence
|
213
|
+
# => {">seq1"=>"IRT", ">seq2"=>"I*T", ">seq3"=>"MRT"}
|
214
|
+
|
215
|
+
def translate(codon_position = 0)
  # Rebuilds @aa_hash from scratch: one translated string per @dna_hash entry.
  @aa_hash = dna_hash.each_with_object({}) do |(name, seq), translated|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate(codon_position)
    translated[name] = s.aa_string
  end
  nil
end # end of #translate
|
225
|
+
|
226
|
+
# collapse @dna_hash to unique sequence hash.
|
227
|
+
# @param tag # the master tag for unique sequences,
|
228
|
+
# sequences will be named as (tag + "_" + order(Integer) + "_" + counts(Integer))
|
229
|
+
# @return [ViralSeq::SeqHash] new SeqHash object of unique sequence hash
|
230
|
+
# @example
|
231
|
+
# dna_hash = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA', '>seq4' => 'CCCC', '>seq5' => 'CCCC', '>seq6' => 'TTTT'}
|
232
|
+
# a_seq_hash = ViralSeq::SeqHash.new
|
233
|
+
# a_seq_hash.dna_hash = dna_hash
|
234
|
+
# uniq_sequence = a_seq_hash.uniq_dna_hash('master')
|
235
|
+
# => {">master_1_3"=>"AAAA", ">master_2_2"=>"CCCC", ">master_3_1"=>"TTTT"}
|
236
|
+
|
237
|
+
def uniq_dna_hash(tag = "sequence")
  # One entry per distinct sequence, named ">tag_<order>_<count>".
  new_seq = {}
  dna_hash.values.count_freq.each_with_index do |(seq, count), index|
    new_seq[">#{tag}_#{index + 1}_#{count}"] = seq
  end
  seq_hash = ViralSeq::SeqHash.new(new_seq)
  seq_hash.title = title + "_uniq"
  seq_hash.file = file
  seq_hash
end # end of #uniq_dna_hash
|
252
|
+
|
253
|
+
alias_method :uniq, :uniq_dna_hash
|
254
|
+
|
255
|
+
# given an Array of sequence tags, return a sub ViralSeq::SeqHash object with the sequence tags
|
256
|
+
# @param keys [Array] array of sequence tags
|
257
|
+
# @return [SeqHash] new SeqHash object with sequences of the input keys
|
258
|
+
|
259
|
+
def sub(keys)
  # Keys with no entry in @dna_hash are skipped. Amino acid and quality
  # entries are copied only when they exist, so the new object never holds
  # nil values.
  dna_subset = {}
  aa_subset = {}
  qc_subset = {}

  keys.each do |k|
    dna = dna_hash[k]
    next unless dna
    dna_subset[k] = dna
    aa = aa_hash[k]
    aa_subset[k] = aa unless aa.nil?
    qc = qc_hash[k]
    qc_subset[k] = qc unless qc.nil?
  end
  ViralSeq::SeqHash.new(dna_subset, aa_subset, qc_subset, title, file)
end
|
277
|
+
|
278
|
+
# screen for sequences with stop codons.
|
279
|
+
# @param (see #translate)
|
280
|
+
# @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
|
281
|
+
#
|
282
|
+
# # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
|
283
|
+
# # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
|
284
|
+
# @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
|
285
|
+
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
286
|
+
# my_seqhash.dna_hash
|
287
|
+
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
288
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[0]
|
289
|
+
# stop_codon_seqhash.dna_hash
|
290
|
+
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
291
|
+
# stop_codon_seqhash.aa_hash
|
292
|
+
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
293
|
+
# stop_codon_seqhash.title
|
294
|
+
# => "my_fasta_file_stop"
|
295
|
+
# filtered_seqhash = my_seqhash.stop_codon[1]
|
296
|
+
# filtered_seqhash.aa_hash
|
297
|
+
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
298
|
+
|
299
|
+
def stop_codon(codon_position = 0)
  # Note: #translate refreshes this object's own @aa_hash as a side effect.
  translate(codon_position)
  with_stop, without_stop = aa_hash.keys.partition { |k| aa_hash[k].include?('*') }
  seqhash1 = sub(with_stop)
  # only the stop-codon subset is re-titled
  seqhash1.title = title + "_stop"
  seqhash2 = sub(without_stop)
  [seqhash1, seqhash2]
end #end of #stop_codon
|
311
|
+
|
312
|
+
|
313
|
+
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
314
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
|
315
|
+
# @return [String] consensus sequence
|
316
|
+
# @example consensus sequence from an array of sequences.
|
317
|
+
# seq_array = %w{ ATTTTTTTTT
|
318
|
+
# AATTTTTTTT
|
319
|
+
# AAATTTTTTT
|
320
|
+
# AAAATTTTTT
|
321
|
+
# AAAAATTTTT
|
322
|
+
# AAAAAATTTT
|
323
|
+
# AAAAAAATTT
|
324
|
+
# AAAAAAAATT
|
325
|
+
# AAAAAAAAAT
|
326
|
+
# AAAAAAAAAA }
|
327
|
+
# my_seqhash = ViralSeq::SeqHash.array(seq_array)
|
328
|
+
# my_seqhash.consensus
|
329
|
+
# => 'AAAAAWTTTT'
|
330
|
+
# my_seqhash.consensus(0.7)
|
331
|
+
# => 'AAAANNNTTT'
|
332
|
+
|
333
|
+
def consensus(cutoff = 0.5)
  seq_array = dna_hash.values
  # no sequences: no consensus
  return "" if seq_array.empty?

  seq_size = seq_array.size
  consensus_seq = +""
  # the first sequence sets the alignment length
  seq_array[0].size.times do |position|
    base_count = seq_array.map { |seq| seq[position] }.count_freq
    # every base whose frequency reaches the cut-off contributes to the call
    max_base_list = base_count.select { |_base, v| v / seq_size.to_f >= cutoff }.keys
    consensus_seq << call_consensus_base(max_base_list)
  end
  consensus_seq
end #end of #consensus
|
355
|
+
|
356
|
+
# function to determine if the sequences have APOBEC3g/f hypermutation.
|
357
|
+
# # APOBEC3G/F pattern: GRD -> ARD
|
358
|
+
# # control pattern: G[YN|RC] -> A[YN|RC]
|
359
|
+
# # use the sample consensus to determine potential a3g sites
|
360
|
+
# # Two criteria to identify hypermutation
|
361
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
362
|
+
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
363
|
+
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
364
|
+
# # b/c Poisson model does not do well on small sample size.
|
365
|
+
# @return [Array] three values.
|
366
|
+
# first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
|
367
|
+
# second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
|
368
|
+
# third value, `array[2]`: a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
369
|
+
# # sequence tag
|
370
|
+
# # G to A mutation numbers at potential a3g positions
|
371
|
+
# # total potential a3g G positions
|
372
|
+
# # G to A mutation numbers at non a3g positions
|
373
|
+
# # total non a3g G positions
|
374
|
+
# # a3g G to A mutation rate / non-a3g G to A mutation rate
|
375
|
+
# # Fishers Exact P-value
|
376
|
+
# @example identify apobec3gf mutations from a sequence fasta file
|
377
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
378
|
+
# hypermut = my_seqhash.a3g
|
379
|
+
# hypermut[0].dna_hash.keys
|
380
|
+
# => [">Seq7", ">Seq14"]
|
381
|
+
# hypermut[1].dna_hash.keys
|
382
|
+
# => [">Seq1", ">Seq2", ">Seq5"]
|
383
|
+
# hypermut[2]
|
384
|
+
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
385
|
+
#
|
386
|
+
# @example identify apobec3gf mutations from another sequence fasta file
|
387
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
388
|
+
# hypermut = my_seqhash.a3g
|
389
|
+
# hypermut[2]
|
390
|
+
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
391
|
+
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
392
|
+
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
393
|
+
# @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
|
394
|
+
|
395
|
+
def a3g_hypermut
  # mut_hash: number of apobec3g/f mutations per sequence
  mut_hash = {}
  # hm_hash: statistic rows of sequences called hypermutated
  hm_hash = {}
  # out_hash: statistic rows of all sequences
  out_hash = {}

  # total G->A mutations at apobec3g/f positions.
  total = 0

  # make consensus sequence for the input sequence hash
  ref = consensus

  # obtain apobec3g positions and control positions
  apobec = apobec3gf(ref)
  mut = apobec[0]
  control = apobec[1]

  # Returns [number of "A" at the positions, number of non-gap positions].
  count_g_to_a = lambda do |seq, positions|
    sites = positions.reject { |n| seq[n] == "-" }
    [sites.count { |n| seq[n] == "A" }, sites.size]
  end

  dna_hash.each do |k, v|
    a, b = count_g_to_a.call(v, mut)     # muts, potential mut sites
    c, d = count_g_to_a.call(v, control) # control muts, potential controls
    mut_hash[k] = a
    total += a

    # a3g G->A rate over control G->A rate (may be Infinity or NaN)
    rr = (a / b.to_f) / (c / d.to_f)

    fet = ViralSeq::Rubystats::FishersExactTest.new
    perc = fet.calculate(b - a, d - c, a, c)[:twotail]
    info = [k, a, b, c, d, rr.round(2), perc]
    out_hash[k] = info
    hm_hash[k] = info if perc < 0.05
  end

  # Criterion 2 (Poisson outliers) is only applied to more than 20 sequences.
  if dna_hash.size > 20
    rate = total.to_f / dna_hash.size
    count_mut = mut_hash.values.count_freq
    # NOTE(review): this is the largest *frequency*, whereas
    # #poisson_minority_cutoff uses count_mut.keys.max; confirm which is
    # intended. Behaviour is left unchanged here.
    maxi_count = count_mut.values.max
    poisson_hash = ViralSeq::Math::PoissonDist.new(rate, maxi_count).poisson_hash
    cut_off = 0
    poisson_hash.each do |k, v|
      cal = dna_hash.size * v
      obs = count_mut[k]
      if obs >= 20 * cal
        cut_off = k
        break
      elsif k == maxi_count
        cut_off = maxi_count
      end
    end
    mut_hash.each do |k, v|
      hm_hash[k] = out_hash[k] if v > cut_off
    end
  end

  hm_seq_hash = ViralSeq::SeqHash.new
  hm_hash.each_key do |k|
    hm_seq_hash.dna_hash[k] = dna_hash[k]
  end
  hm_seq_hash.title = title + "_hypermut"
  hm_seq_hash.file = file
  filtered_seq_hash = sub(dna_hash.keys - hm_hash.keys)
  [hm_seq_hash, filtered_seq_hash, hm_hash.values]
end #end of #a3g_hypermut
|
484
|
+
|
485
|
+
alias_method :a3g, :a3g_hypermut
|
486
|
+
|
487
|
+
# Define Poisson cut-off for minority variants.
|
488
|
+
# @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 Ref: Zhou, et al. J Virol 2015
|
489
|
+
# @param error_rate [Float] estimated sequencing error rate
|
490
|
+
# @param fold_cutoff [Integer] a fold cut-off to determine poisson minority cut-off. default = 20. i.e. <5% mutations from random methods error.
|
491
|
+
# @return [Integer] a cut-off for minority variants (>=).
|
492
|
+
# @example obtain Poisson minority cut-off from the example sequence FASTA file.
|
493
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
494
|
+
# my_seqhash.pm
|
495
|
+
# => 2 # means that mutations appear at least 2 times are very likely to be a true mutation instead of random methods errors.
|
496
|
+
|
497
|
+
def poisson_minority_cutoff(error_rate = 0.0001, fold_cutoff = 20)
  sequences = dna_hash.values
  return 0 if sequences.empty?

  seq_length = sequences[0].size
  rate = sequences.size * error_rate
  count_mut = variant_for_poisson(sequences)
  max_count = count_mut.keys.max
  poisson_hash = ViralSeq::Math::PoissonDist.new(rate, max_count).poisson_hash

  # The cut-off is the first k whose observed count reaches fold_cutoff times
  # the count expected from the Poisson model.
  hit = poisson_hash.find do |k, v|
    expected = seq_length * v
    (count_mut[k] || 0) >= fold_cutoff * expected
  end
  # defaults to 1 when no k reaches the fold cut-off
  hit ? hit[0] : 1
end # end of #poisson_minority_cutoff
|
520
|
+
|
521
|
+
alias_method :pm, :poisson_minority_cutoff
|
522
|
+
|
523
|
+
|
524
|
+
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
525
|
+
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
526
|
+
# @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
|
527
|
+
|
528
|
+
def align(path_to_muscle = false)
  # Temp files sit next to the source file, or next to the running script
  # when the object was not read from a file.
  temp_dir = file.size > 0 ? File.dirname(file) : File.dirname($0)
  temp_file = temp_dir + "/_temp_muscle_in"
  temp_aln = temp_dir + "/_temp_muscle_aln"

  File.open(temp_file, 'w') { |f| dna_hash.each { |k, v| f.puts k; f.puts v } }
  if path_to_muscle
    return nil unless ViralSeq.check_muscle?(path_to_muscle)
    # NOTE(review): the paths are interpolated into a shell command unquoted;
    # paths containing spaces or shell metacharacters will break it.
    print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
  else
    MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
  end
  out_seq_hash = ViralSeq::SeqHash.fa(temp_aln)
  out_seq_hash.title = title + "_aligned"
  out_seq_hash.file = file
  out_seq_hash
ensure
  # Runs on every exit path, including a failed MUSCLE run or parse.
  [temp_file, temp_aln].each do |path|
    File.unlink(path) if path && File.exist?(path)
  end
end # end of align
|
555
|
+
|
556
|
+
# calculate Shannon's entropy, Euler's number as the base of logarithm
|
557
|
+
# @see https://en.wikipedia.org/wiki/Entropy_(information_theory) Entropy(Wikipedia)
|
558
|
+
# @param option [Symbol] the sequence type `:nt` or `:aa`
|
559
|
+
# @return [Hash] entropy score at each position in the alignment :position => :entropy ,
|
560
|
+
# # position starts at 1.
|
561
|
+
# @example caculate entropy from the example file
|
562
|
+
# sequence_file = 'spec/sample_files/sample_sequence_alignment_for_entropy.fasta'
|
563
|
+
# sequence_hash = ViralSeq::SeqHash.aa_fa(sequence_file)
|
564
|
+
# entropy_hash = sequence_hash.shannons_entropy(:aa)
|
565
|
+
# entropy_hash[3]
|
566
|
+
# => 0.0
|
567
|
+
# entropy_hash[14].round(3)
|
568
|
+
# => 0.639
|
569
|
+
# # This example is the sample input of LANL Entropy-One
|
570
|
+
# # https://www.hiv.lanl.gov/content/sequence/ENTROPY/entropy_one.html?sample_input=1
|
571
|
+
|
572
|
+
def shannons_entropy(option = :nt)
  sequences = (option == :aa ? aa_hash : dna_hash).values
  # no sequences: no positions to score
  return {} if sequences.empty?

  entropy_hash = {}
  # the first sequence sets the alignment length
  sequences[0].size.times do |position|
    column = sequences.map { |seq| seq[position] }
    # "*" entries are excluded from the frequency calculation
    column.delete('*')
    column_size = column.size
    entropy = 0
    column.count_freq.each_value do |count|
      freq = count / column_size.to_f
      entropy += (-freq * ::Math.log(freq))
    end
    # positions are reported 1-based
    entropy_hash[position + 1] = entropy
  end
  entropy_hash
end # end of shannons_entropy
|
596
|
+
|
597
|
+
# Function to calculate nucleotide diversity π, for nt sequence only
|
598
|
+
# @see https://en.wikipedia.org/wiki/Nucleotide_diversity Nucleotide Diversity (Wikipedia)
|
599
|
+
# @return [Float] nucleotide diversity π
|
600
|
+
# @example calculate π
|
601
|
+
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
602
|
+
# my_seqhash = ViralSeq::SeqHash.array(sequences)
|
603
|
+
# my_seqhash.pi
|
604
|
+
# => 0.16667
|
605
|
+
|
606
|
+
def nucleotide_pi
  sequences = dna_hash.values
  # no sequences: undefined, same as the 0/0 result for a single sequence
  return Float::NAN if sequences.empty?

  diver = 0 # pairwise differences summed over all positions
  com = 0   # pairwise comparisons summed over all positions
  # the first sequence sets the alignment length
  sequences[0].size.times do |n|
    # Only unambiguous bases are compared. Gaps, ambiguity codes and missing
    # positions of shorter sequences are dropped. (The previous pattern
    # /[^A|^C|^G|^T]/ also let "|", "^" and nil through.)
    nt = sequences.map { |s| s[n] }.grep(/\A[ACGT]\z/)
    next if nt.size == 1
    com += nt.size * (nt.size - 1) / 2
    a = nt.count("A")
    c = nt.count("C")
    t = nt.count("T")
    g = nt.count("G")
    diver += a*c + a*t + a*g + c*t + c*g + t*g
  end
  (diver / com.to_f).round(5)
end # end of #pi
|
634
|
+
|
635
|
+
alias_method :pi, :nucleotide_pi
|
636
|
+
|
637
|
+
# TN93 distance function, tabulate pairwise comparison of sequence pairs in a sequence alignment,
|
638
|
+
# nt sequence only
|
639
|
+
# @return [Hash] pairwise distance table in Hash object {:diff => :freq, ... }
|
640
|
+
# # Note: :diff in different positions (Integer), not percentage.
|
641
|
+
# @example calculate TN93 distribution
|
642
|
+
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
643
|
+
# my_seqhash = ViralSeq::SeqHash.array(sequences)
|
644
|
+
# my_seqhash.tn93
|
645
|
+
# => {0=>1, 1=>8, 2=>6}
|
646
|
+
|
647
|
+
def tn93
  seq_freq = dna_hash.values.count_freq
  # Counts are added straight into a table, so memory does not grow with the
  # number of sequence pairs.
  counts = Hash.new(0)

  # pairs of identical sequences differ at 0 positions
  seq_freq.each_value do |v|
    identical_pairs = v * (v - 1) / 2
    counts[0] += identical_pairs if identical_pairs > 0
  end

  # each pair of distinct sequences counts once per copy-pair
  seq_freq.keys.combination(2) do |s1, s2|
    counts[s1.compare_with(s2)] += seq_freq[s1] * seq_freq[s2]
  end

  # returned with keys in ascending order and a default of 0
  out_hash = Hash.new(0)
  counts.sort_by { |k, _v| k }.each do |k, v|
    out_hash[k] = v
  end
  out_hash
end # end of #tn93
|
671
|
+
|
672
|
+
# quality check for HIV sequences based on ViralSeq::Sequence#locator, check if sequences are in the target range
|
673
|
+
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
674
|
+
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
675
|
+
# @param indel [Boolean] allow indels or not, `true` or `false`
|
676
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
677
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
678
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criterias
|
679
|
+
# @example QC for sequences in a FASTA files
|
680
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_seq.fasta')
|
681
|
+
# filtered_seqhash = my_seqhash.hiv_seq_qc([4384,4386], 4750..4752, false, :HXB2)
|
682
|
+
# my_seqhash.dna_hash.size
|
683
|
+
# => 6
|
684
|
+
# filtered_seqhash.dna_hash.size
|
685
|
+
# => 4
|
686
|
+
|
687
|
+
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  # single positions become one-element ranges so #include? works for all inputs
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Group names by sequence so each distinct sequence is located only once.
  names_by_seq = {}
  dna_hash.each { |name, seq| (names_by_seq[seq] ||= []) << name }

  seq_pass = []
  names_by_seq.each do |seq, names|
    loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    # loc[3] is read as the indel flag; it is only checked when indels are not allowed
    seq_pass.concat(names) if indel || loc[3] == false
  end
  sub(seq_pass)
end # end of #hiv_seq_qc
|
715
|
+
|
716
|
+
|
717
|
+
# Remove sequences with residual offspring Primer IDs.
|
718
|
+
# Compare PID with sequences which have identical sequences.
|
719
|
+
# PIDs differ by 1 base will be recognized. If PID1 is x time (cutoff) greater than PID2, PID2 will be discarded.
|
720
|
+
# each sequence tag starting with ">" and the Primer ID sequence
|
721
|
+
# followed by the number of Primer ID appeared in the raw sequence
|
722
|
+
# the information sections in the tags are separated by underscore "_"
|
723
|
+
# example sequence tag: >AGGCGTAGA_32_sample1_RT
|
724
|
+
# @param cutoff [Integer] the fold cut-off to remove the potential residual offspring Primer IDs
|
725
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object without sequences containing residual offspring Primer ID
|
726
|
+
|
727
|
+
def filter_similar_pid(cutoff = 10)
  seq = self.dna_hash

  # One pass over the tags: for every distinct sequence collect the
  # [primer_id, count] pairs parsed from the names that carry it.
  # A tag looks like ">AGGCGTAGA_32_sample1_RT"; the leading ">" is dropped
  # before splitting on "_".
  pids_by_seq = Hash.new { |hash, s| hash[s] = [] }
  seq.each do |name, s|
    fields = name[1..-1].split("_")
    pids_by_seq[s] << [fields[0], fields[1]]
  end

  # Hash used as a set of Primer IDs to discard (constant-time lookup below).
  dup_pid = {}
  pids_by_seq.each_value do |tags|
    next if tags.size == 1
    # If the same PID occurs more than once for a sequence, the last count wins.
    pid_hash = Hash[tags]
    pid_hash.keys.combination(2) do |pid1, pid2|
      # Only PIDs differing by at most one base are considered related.
      next unless pid1.compare_with(pid2) <= 1
      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      # The less abundant PID is flagged when the other one is at least
      # `cutoff` times more abundant.
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  # A flagged PID removes every sequence tagged with it, whichever sequence
  # group it was flagged in.
  kept_names = seq.keys.reject do |name|
    dup_pid.key?(name.split("_")[0][1..-1])
  end
  self.sub(kept_names)
end # end of #filter_similar_pid
|
784
|
+
|
785
|
+
# Collapse sequences by a difference cut-off. Aligning the sequences before using this function is recommended.
|
786
|
+
# @param cutoff [Integer] nt base differences. collapse sequences within [cutoff] differences
|
787
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object of collapsed sequences
|
788
|
+
|
789
|
+
def collapse(cutoff=1)
  # Frequency of every distinct sequence, in order of first appearance.
  seq_freq = self.dna_hash.values.count_freq

  # Hash used as a set of sequences that get absorbed by a neighbour. A hash
  # gives constant-time membership tests and cannot accumulate duplicates.
  dupli_seq = {}
  seq_freq.keys.combination(2) do |seq1, seq2|
    next unless seq1.compare_with(seq2) <= cutoff
    # The less frequent sequence of a close pair is dropped; on a tie the
    # sequence that appeared first (seq1) is the one kept.
    absorbed = seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1
    dupli_seq[absorbed] = true
  end

  # Survivors are renamed ">seq_<running number>_<frequency>".
  seqhash = ViralSeq::SeqHash.new
  n = 0
  seq_freq.each do |seq, freq|
    next if dupli_seq.key?(seq)
    n += 1
    seqhash.dna_hash[">seq_#{n}_#{freq}"] = seq
  end
  seqhash
end # end of #collapse
|
825
|
+
|
826
|
+
# gap strip from a sequence alignment, all positions that contains gaps ('-') will be removed
|
827
|
+
# @param option [Symbol] sequence options for `:nt` or `:aa`
|
828
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
829
|
+
# @example gap strip for an array of sequences
|
830
|
+
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
831
|
+
# array = { AACCGGTT
|
832
|
+
# A-CCGGTT
|
833
|
+
# AAC-GGTT
|
834
|
+
# AACCG-TT
|
835
|
+
# AACCGGT- }
|
836
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
837
|
+
# puts my_seqhash.gap_strip.dna_hash.values
|
838
|
+
# ACGT
|
839
|
+
# ACGT
|
840
|
+
# ACGT
|
841
|
+
# ACGT
|
842
|
+
# ACGT
|
843
|
+
|
844
|
+
# @raise [RuntimeError] if `option` is neither `:nt` nor `:aa`
def gap_strip(option = :nt)
  sequence_alignment =
    case option
    when :nt then self.dna_hash
    when :aa then self.aa_hash
    else raise "Option `#{option}` not recognized"
    end

  # The alignment width is taken from the first sequence. An empty alignment
  # has width 0 and yields an empty result instead of failing on nil.
  seqs = sequence_alignment.values
  seq_size = seqs.empty? ? 0 : seqs[0].size

  # Columns in which no sequence carries a gap character.
  kept_columns = (0...seq_size).reject do |p|
    seqs.any? { |s| s[p] == "-" }
  end

  new_seq = {}
  sequence_alignment.each do |n, s|
    stripped = "".dup
    kept_columns.each { |p| stripped << s[p] }
    new_seq[n] = stripped
  end

  # Only the hash selected by `option` is replaced; the other sequence hash,
  # qc_hash and file are handed over as they are.
  new_seq_hash = ViralSeq::SeqHash.new
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  else
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  new_seq_hash
end
|
885
|
+
|
886
|
+
# gap strip from a sequence alignment at both ends, only positions at the ends that contains gaps ('-') will be removed.
|
887
|
+
# @param (see #gap_strip)
|
888
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
889
|
+
# @example gap strip for an array of sequences only at the ends
|
890
|
+
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
891
|
+
# array = { AACCGGTT
|
892
|
+
# A-CCGGTT
|
893
|
+
# AAC-GGTT
|
894
|
+
# AACCG-TT
|
895
|
+
# AACCGGT- }
|
896
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
897
|
+
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
898
|
+
# AACCGGT
|
899
|
+
# A-CCGGT
|
900
|
+
# AAC-GGT
|
901
|
+
# AACCG-T
|
902
|
+
# AACCGGT
|
903
|
+
|
904
|
+
# @raise [RuntimeError] if `option` is neither `:nt` nor `:aa`
def gap_strip_ends(option = :nt)
  sequence_alignment =
    case option
    when :nt then self.dna_hash
    when :aa then self.aa_hash
    else raise "Option #{option} not recognized"
    end

  # The alignment width is taken from the first sequence. An empty alignment
  # has width 0 and yields an empty result instead of failing on nil.
  seqs = sequence_alignment.values
  seq_size = seqs.empty? ? 0 : seqs[0].size
  gap_column = lambda { |p| seqs.any? { |s| s[p] == "-" } }

  # n1 / n2: number of consecutive gap-containing columns at the left /
  # right end of the alignment. Gaps in the interior are left alone.
  n1 = (0...seq_size).take_while { |p| gap_column.call(p) }.size
  n2 = (seq_size - 1).downto(0).take_while { |p| gap_column.call(p) }.size

  new_seq = {}
  sequence_alignment.each do |n, s|
    new_seq[n] = s[n1..(- n2 - 1)]
  end

  # Only the hash selected by `option` is replaced; the other sequence hash,
  # qc_hash and file are handed over as they are.
  new_seq_hash = ViralSeq::SeqHash.new
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  else
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  new_seq_hash
end
|
957
|
+
|
958
|
+
|
959
|
+
|
960
|
+
|
961
|
+
|
962
|
+
# start of private functions
|
963
|
+
private
|
964
|
+
|
965
|
+
# APOBEC3G/F mutation position identification,
|
966
|
+
# APOBEC3G/F pattern: GRD -> ARD,
|
967
|
+
# control pattern: G[YN|RC] -> A[YN|RC],
|
968
|
+
# Positions are 0-based indexes into the sequence after all "-" characters
# have been removed. The input string itself is left unmodified, so frozen
# strings are accepted.
# @param seq [String] nucleotide sequence, may contain "-" gap characters
# @return [Array(Array<Integer>, Array<Integer>)] two lists:
#   positions of a "G" that starts a G[A/G][A/G/T] trinucleotide, and
#   positions of every other "G" that has at least two bases after it
def apobec3gf(seq = '')
  ungapped = seq.delete("-")
  apobec_position = []
  control_position = []
  # The last two bases can never start a trinucleotide, so they are skipped.
  (0..(ungapped.size - 3)).each do |n|
    if ungapped[n, 3] =~ /G[AG][AGT]/
      apobec_position << n
    elsif ungapped[n] == "G"
      control_position << n
    end
  end
  [apobec_position, control_position]
end # end of #apobec3gf
|
983
|
+
|
984
|
+
# call consensus nucleotide, used by #consensus
|
985
|
+
# Maps a set of one to three distinct bases to a single base or ambiguity
# code. The argument is not modified (a sorted copy is used for the lookup),
# so frozen arrays are accepted.
# @param base_array [Array<String>] bases observed at one position
# @return [String] the single base itself, a two- or three-base ambiguity
#   code, or "N" for any other combination or array size
def call_consensus_base(base_array)
  case base_array.size
  when 1
    base_array[0]
  when 2, 3
    ambiguity_code = {
      ["A", "T"] => "W",
      ["C", "G"] => "S",
      ["A", "C"] => "M",
      ["G", "T"] => "K",
      ["A", "G"] => "R",
      ["C", "T"] => "Y",
      ["C", "G", "T"] => "B",
      ["A", "G", "T"] => "D",
      ["A", "C", "T"] => "H",
      ["A", "C", "G"] => "V"
    }
    # Keys are in sorted order, so the lookup ignores the input order.
    ambiguity_code.fetch(base_array.sort, "N")
  else
    "N"
  end
end # end of #call_consensus_base
|
1022
|
+
|
1023
|
+
# Input sequence array. output Variant distribution for Poisson cut-off
|
1024
|
+
# For every position (the width is taken from the first sequence) count how
# many sequences differ from the most common character at that position,
# then tabulate how often each such count occurs.
# @param seq [Array<String>] sequences, expected to be of equal length
# @return [Hash{Integer => Integer}] number of variant sequences at a
#   position => number of positions with that many variants, sorted by key;
#   an empty Hash when no sequences are given
def variant_for_poisson(seq)
  # Nothing to tabulate; also avoids calling #size on a missing first entry.
  return {} if seq.empty?

  seq_size = seq.size
  var = (0...seq[0].size).map do |pos|
    column = seq.map { |s| s[pos] }
    seq_size - column.count_freq.values.max
  end
  var.count_freq.sort_by { |key, _value| key }.to_h
end # end of #variant_for_poisson
|
1040
|
+
|
1041
|
+
end # end of SeqHash
|
1042
|
+
|
1043
|
+
end # end of ViralSeq
|