viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/muscle.rb
CHANGED
@@ -1,89 +1,67 @@
|
|
1
|
-
# viral_seq/muscle.rb
|
2
|
-
# wrapper for MUSCLE (http://www.drive5.com/muscle)
|
3
|
-
# Including Methods as:
|
4
|
-
# ViralSeq::check_muscle
|
5
|
-
# ViralSeq::muscle_align
|
6
|
-
# ViralSeq::muscle_align_multi
|
7
|
-
|
8
|
-
# ViralSeq.check_muscle?(path_to_muscle)
|
9
|
-
# # check if the path_to_muscle provided is valid,
|
10
|
-
# # prompt error messages if MUSCLE is not found.
|
11
|
-
|
12
|
-
# ViralSeq.muscle_align(reference_seq, test_sequence, path_to_muscle)
|
13
|
-
# # takes a reference sequence and a test sequence as String object
|
14
|
-
# # without specification on path_to_muscle, MuscleBio will be called to run Muscle
|
15
|
-
# # specify path_to_muscle if other source of muscle needed
|
16
|
-
# # returns aligned reference sequence and test sequences
|
17
|
-
|
18
|
-
# ViralSeq.muscle_align_multi(sequence_hash, path_to_muscle)
|
19
|
-
# # input a sequence_hash object {:name=>:sequence,...}
|
20
|
-
# # without specification on path_to_muscle, MuscleBio will be called to run Muscle
|
21
|
-
# # specify path_to_muscle if other source of muscle needed
|
22
|
-
# # return aligned sequences an hash
|
23
1
|
|
24
2
|
module ViralSeq
|
3
|
+
# alignment using MUSCLE alignment program
|
4
|
+
# @see http://www.drive5.com/muscle MUSCLE download link
|
25
5
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return true
|
31
|
-
rescue Errno::ENOENT
|
32
|
-
puts "
|
33
|
-
Error: MUSCLE is not found for at the provided {path_to_muscle}!!
|
34
|
-
MUSLCE can be download at http://www.drive5.com/muscle
|
35
|
-
Add MUSCLE excutable path to $PATH using
|
36
|
-
$ export PATH=$PATH:/path/to/muscle
|
37
|
-
or
|
38
|
-
provide path_to_MUSCLE in the function arguments\n
|
39
|
-
"
|
40
|
-
return false
|
41
|
-
end
|
42
|
-
end
|
6
|
+
module Muscle
|
7
|
+
# check if path_to_muscle is correct, prompt error messages if MUSCLE is not found.
|
8
|
+
# @param path_to_muscle [String] path to the MUSCLE executable
|
9
|
+
# @return [boolean]
|
43
10
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
return nil;
|
11
|
+
def self.check_muscle?(path_to_muscle)
|
12
|
+
begin
|
13
|
+
`#{path_to_muscle} -version`
|
14
|
+
return true
|
15
|
+
rescue Errno::ENOENT
|
16
|
+
puts "
|
17
|
+
Error: MUSCLE is not found for at the provided {path_to_muscle}!!
|
18
|
+
MUSLCE can be download at http://www.drive5.com/muscle
|
19
|
+
Add MUSCLE excutable path to $PATH using
|
20
|
+
$ export PATH=$PATH:/path/to/muscle
|
21
|
+
or
|
22
|
+
provide path_to_MUSCLE in the function arguments\n
|
23
|
+
"
|
24
|
+
return false
|
59
25
|
end
|
60
|
-
|
61
|
-
else
|
62
|
-
MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
|
63
|
-
end
|
64
|
-
aln_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
|
65
|
-
File.unlink(temp_file)
|
66
|
-
File.unlink(temp_aln)
|
67
|
-
return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
|
68
|
-
end
|
26
|
+
end # end of .check_muscle?
|
69
27
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
if
|
76
|
-
|
77
|
-
|
78
|
-
|
28
|
+
# align a test sequence against a reference sequence (both given as Strings)
|
29
|
+
# @param ref_seq [String] reference sequence
|
30
|
+
# @param test_seq [String] test sequence
|
31
|
+
# @param path_to_muscle [String] path to the MUSCLE executable; if not provided (the default), RubyGem::MuscleBio is used
|
32
|
+
# @return [Array] a pair of [:ref_seq_aligned, :test_seq_aligned] or nil
|
33
|
+
# if the MUSCLE executable cannot be found
|
34
|
+
# @example
|
35
|
+
# seq1 = 'AAGGCGTAGGAC'
|
36
|
+
# seq2 = 'AAGCTTAGGACG'
|
37
|
+
# aligned_seqs = ViralSeq::Muscle.align(seq1,seq2)
|
38
|
+
# => ["AAGGCGTAGGAC-", "-AAGCTTAGGACG"]
|
39
|
+
|
40
|
+
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
|
+
temp_dir=File.dirname($0)
|
42
|
+
temp_file = temp_dir + "/_temp_muscle_in"
|
43
|
+
temp_aln = temp_dir + "/_temp_muscle_aln"
|
44
|
+
name = ">test"
|
45
|
+
temp_in = File.open(temp_file,"w")
|
46
|
+
temp_in.puts ">ref"
|
47
|
+
temp_in.puts ref_seq
|
48
|
+
temp_in.puts name
|
49
|
+
temp_in.puts test_seq
|
50
|
+
temp_in.close
|
51
|
+
if path_to_muscle
|
52
|
+
unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
|
53
|
+
File.unlink(temp_file)
|
54
|
+
return nil;
|
55
|
+
end
|
56
|
+
print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
|
57
|
+
else
|
58
|
+
MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
|
79
59
|
end
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
end
|
89
|
-
end
|
60
|
+
aln_seq_hash = ViralSeq::SeqHash.fa(temp_aln).dna_hash
|
61
|
+
File.unlink(temp_file)
|
62
|
+
File.unlink(temp_aln)
|
63
|
+
return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
|
64
|
+
end # end of .align
|
65
|
+
end # end of ViralSeq::Muscle
|
66
|
+
|
67
|
+
end # end of ViralSeq
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
module PID
|
5
|
+
|
6
|
+
# generate all Primer ID combinations given the length of Primer ID
|
7
|
+
# @param l [Integer] the length of the Primer ID.
|
8
|
+
# @example generate a pool of Primer IDs with length of 10
|
9
|
+
# primer_id_pool = ViralSeq::PID.generate_pool(10) # 10 is the length of Primer ID
|
10
|
+
# puts primer_id_pool.size #should be 4^10
|
11
|
+
# => 1048576
|
12
|
+
|
13
|
+
def self.generate_pool(l=8)
|
14
|
+
nt = ['A','T','C','G']
|
15
|
+
pid_pool = ['A','T','C','G']
|
16
|
+
(l-1).times do
|
17
|
+
pid_pool = pid_pool.product(nt)
|
18
|
+
pid_pool.collect! do |v|
|
19
|
+
v.join("")
|
20
|
+
end
|
21
|
+
end
|
22
|
+
return pid_pool
|
23
|
+
end # end of .generate_pool
|
24
|
+
|
25
|
+
end # end of Pid
|
26
|
+
end # end of ViralSeq
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# viral_seq main module
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# HIV/SIV reference genome sequences, including HXB2, NL43, MAC239
|
5
|
+
# @see https://www.ncbi.nlm.nih.gov/nuccore/K03455 Reference sequence of HIV-1 HXB2 (Genbank accession number K03455)
|
6
|
+
# @see https://www.ncbi.nlm.nih.gov/nuccore/AF324493 Reference sequence of HIV-1 NL43 (Genbank accession number AF324493)
|
7
|
+
# @see https://www.ncbi.nlm.nih.gov/nucleotide/M33262 Reference sequence of SIV MAC239 (Genbank accession number M33262)
|
8
|
+
# @example retrieve the reference sequence for HIV NL43
|
9
|
+
# ViralSeq::RefSeq.get(:NL43)
|
10
|
+
# => "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA..."
|
11
|
+
|
12
|
+
module RefSeq
|
13
|
+
|
14
|
+
# @param ref_option [Symbol] name of the reference genome; options are `:HXB2`, `:NL43`, `:MAC239`
|
15
|
+
# @return [String] the reference sequence as a String object
|
16
|
+
|
17
|
+
def self.get(ref_option)
|
18
|
+
begin
|
19
|
+
case ref_option
|
20
|
+
when :HXB2
|
21
|
+
"TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAGAA
ATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATC
TAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAA
GAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCT
CCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
22
|
+
when :NL43
|
23
|
+
"TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAAATAAGGAGAGAAGAACAGCTTGTTACACCCTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGCATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGTGTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAAAA
ATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACAGTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAAGGGTGCCCACACTAATGATGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCATAATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATATGTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCATC
TAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGGAATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAAAAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATGGGTTGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATATGGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTTGGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGATTAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTACTAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGAATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGCCTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCATACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATATGGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAG
AGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGATAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGGTTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCTGCACGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATT
TGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
|
24
|
+
when :MAC239
|
25
|
+
"GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAGGCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATACCCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCAGATTGGCGCCTGAACAGGGACTTGAAGGAGAGTGAGAGACTCCTGAGTACGGCTGAGTGAAGGCAGTAAGGGCGGCAGGAACCAACCACGACGGAGTGCTCCTATAAAGGCGCGGGTCGGTACCAGACGGCGTGAGGAGCGGGAGAGGAAGAGGCCTCCGGTTGCAGGTAAGTGCAACACAAAAAAGAAATAGCTGTCTTTTATCCAGGAAGGGGTAATAAGATAGAGTGGGAGATGGGCGTGAGAAACTCCGTCTTGTCAGGGAAGAAAGCAGATGAATTAGAAAAAATTAGGCTACGACCCAACGGAAAGAAAAAGTACATGTTGAAGCATGTAGTATGGGCAGCAAATGAATTAGATAGATTTGGATTAGCAGAAAGCCTGTTGGAGAACAAAGAAGGATGTCAAAAAATACTTTCGGTCTTAGCTCCATTAGTGCCAACAGGCTCAGAAAATTTAAAAAGCCTTTATAATACTGTCTGCGTCATCTGGTGCATTCACGCAGAAGAGAAAGTGAAACACACTGAGGAAGCAAAACAGATAGTGCAGAGACACCTAGTGGTGGAAACAGGAACAACAGAAACTATGCCAAAAACAAGTAGACCAACAGCACCATCTAGCGGCAGAGGAGGAAATTACCCAGTACAACAAATAGGTGGTAACTATGTCCACCTGCCATTAAGCCCGAGAACATTAAATGCCTGGGTAAAATTGATAGAGGAAAAGAAATTTGGAGCAGAAGTAGTGCCAGGATTTCAGGCACTGTCAGAAGGTTGCACCCCCTATGACATTAATCAGATGTTAAATTGTGTGGGAGACCATCAAGCGGCTATGCAGATTATCAGAGATATTATAAACGAGGAGGCTGCAGATTGGGACTTGCAGCACCCACAACCAGCTCCACAACAAGGACAACTTAGGGAGC
CGTCAGGATCAGATATTGCAGGAACAACTAGTTCAGTAGATGAACAAATCCAGTGGATGTACAGACAACAGAACCCCATACCAGTAGGCAACATTTACAGGAGATGGATCCAACTGGGGTTGCAAAAATGTGTCAGAATGTATAACCCAACAAACATTCTAGATGTAAAACAAGGGCCAAAAGAGCCATTTCAGAGCTATGTAGACAGGTTCTACAAAAGTTTAAGAGCAGAACAGACAGATGCAGCAGTAAAGAATTGGATGACTCAAACACTGCTGATTCAAAATGCTAACCCAGATTGCAAGCTAGTGCTGAAGGGGCTGGGTGTGAATCCCACCCTAGAAGAAATGCTGACGGCTTGTCAAGGAGTAGGGGGGCCGGGACAGAAGGCTAGATTAATGGCAGAAGCCCTGAAAGAGGCCCTCGCACCAGTGCCAATCCCTTTTGCAGCAGCCCAACAGAGGGGACCAAGAAAGCCAATTAAGTGTTGGAATTGTGGGAAAGAGGGACACTCTGCAAGGCAATGCAGAGCCCCAAGAAGACAGGGATGCTGGAAATGTGGAAAAATGGACCATGTTATGGCCAAATGCCCAGACAGACAGGCGGGTTTTTTAGGCCTTGGTCCATGGGGAAAGAAGCCCCGCAATTTCCCCATGGCTCAAGTGCATCAGGGGCTGATGCCAACTGCTCCCCCAGAGGACCCAGCTGTGGATCTGCTAAAGAACTACATGCAGTTGGGCAAGCAGCAGAGAGAAAAGCAGAGAGAAAGCAGAGAGAAGCCTTACAAGGAGGTGACAGAGGATTTGCTGCACCTCAATTCTCTCTTTGGAGGAGACCAGTAGTCACTGCTCATATTGAAGGACAGCCTGTAGAAGTATTACTGGATACAGGGGCTGATGATTCTATTGTAACAGGAATAGAGTTAGGTCCACATTATACCCCAAAAATAGTAGGAGGAATAGGAGGTTTTATTAATACTAAAGAATACAAAAATGTAGAAATAGAAGTTTTAGGCAAAAGGATTAAAGGGACAATCATGACAGGGGACACCCCGATTAACATTTTTGGTAGAAATTTGCTAACAGCTCTGGGGATGTCTCTAAATTTTCCCATAGCTAAAGTAGAGCCTGTAAAAGTCGCCTTAAAGCCAGGAAAGGATGGACCAAAATTGAAGCAGTGGCCATTATCAAAAGAAAAGATAGTTGCATTAAGAGAAATCTGTGAAAAGATGGAAAAGGATGGTCAGTTGGAGGAAGCTCCCCCGACCAATCCATACAACACCCCCACATTTGCTATAAAGAAAAAGGATAAGAACAAATGGAGAATGCTGATAGATTTTAGGGAACTAAATAGGGTCACTCAGGACTTTACGGAAGTCCAATTAGGAATACCACACCCTGCAGGACTAGCAAAAAGGAAAAGAATTACAGTACTGGATATAGGTGATGCATATTTCTCCATACCTCTAGATGAAGAATTTAGGCAGTACACTGCCTTTACTTTACCATCAGTAAATAATGCAGAGCCAGGAAAACGATACATTTATAAGGTTCTGCCTCAGGGATGGAAGGGGTCACCAGCCATCTTCCAATACACTATGAGACATGTGCTAGAACCCTTCAGGAAGGCAAATCCAGATGTGACCTTAGTCCAGTATATGGATGACATCTTAATAGCTAGTGACAGGACAGACCTGGAACATGACAGGGTAGTTTTACAGTCAAAGGAACTCTTGAATAGCATAGGGTTTTCTACCCCAGAAGAGAAATTCCAAAAAGATCCCCCATTTCAATGGATGGGGTACGAATTGTGGCCAACAAAATGGAAGTTGCAAAAGATAGAGTTGCCACAAAGAGAGACCTGGACAGTGAATGATATACAGAAGTTAGTAGGAGTATTAAATTGGGCAGCTCAAATTTATCCAGGTATAAAAACCAAACATCTCTGTAGGTTAATTAGAGGAAAAATGACTCTAACAGAGGAAGTTCAGTGGACTGAGA
TGGCAGAAGCAGAATATGAGGAAAATAAAATAATTCTCAGTCAGGAACAAGAAGGATGTTATTACCAAGAAGGCAAGCCATTAGAAGCCACGGTAATAAAGAGTCAGGACAATCAGTGGTCTTATAAAATTCACCAAGAAGACAAAATACTGAAAGTAGGAAAATTTGCAAAGATAAAGAATACACATACCAATGGAGTGAGACTATTAGCACATGTAATACAGAAAATAGGAAAGGAAGCAATAGTGATCTGGGGACAGGTCCCAAAATTCCACTTACCAGTTGAGAAGGATGTATGGGAACAGTGGTGGACAGACTATTGGCAGGTAACCTGGATACCGGAATGGGATTTTATCTCAACACCACCGCTAGTAAGATTAGTCTTCAATCTAGTGAAGGACCCTATAGAGGGAGAAGAAACCTATTATACAGATGGATCATGTAATAAACAGTCAAAAGAAGGGAAAGCAGGATATATCACAGATAGGGGCAAAGACAAAGTAAAAGTGTTAGAACAGACTACTAATCAACAAGCAGAATTGGAAGCATTTCTCATGGCATTGACAGACTCAGGGCCAAAGGCAAATATTATAGTAGATTCACAATATGTTATGGGAATAATAACAGGATGCCCTACAGAATCAGAGAGCAGGCTAGTTAATCAAATAATAGAAGAAATGATTAAAAAGTCAGAAATTTATGTAGCATGGGTACCAGCACACAAAGGTATAGGAGGAAACCAAGAAATAGACCACCTAGTTAGTCAAGGGATTAGACAAGTTCTCTTCTTGGAAAAGATAGAGCCAGCACAAGAAGAACATGATAAATACCATAGTAATGTAAAAGAATTGGTATTCAAATTTGGATTACCCAGAATAGTGGCCAGACAGATAGTAGACACCTGTGATAAATGTCATCAGAAAGGAGAGGCTATACATGGGCAGGCAAATTCAGATCTAGGGACTTGGCAAATGGATTGTACCCATCTAGAGGGAAAAATAATCATAGTTGCAGTACATGTAGCTAGTGGATTCATAGAAGCAGAGGTAATTCCACAAGAGACAGGAAGACAGACAGCACTATTTCTGTTAAAATTGGCAGGCAGATGGCCTATTACACATCTACACACAGATAATGGTGCTAACTTTGCTTCGCAAGAAGTAAAGATGGTTGCATGGTGGGCAGGGATAGAGCACACCTTTGGGGTACCATACAATCCACAGAGTCAGGGAGTAGTGGAAGCAATGAATCACCACCTGAAAAATCAAATAGATAGAATCAGGGAACAAGCAAATTCAGTAGAAACCATAGTATTAATGGCAGTTCATTGCATGAATTTTAAAAGAAGGGGAGGAATAGGGGATATGACTCCAGCAGAAAGATTAATTAACATGATCACTACAGAACAAGAGATACAATTTCAACAATCAAAAAACTCAAAATTTAAAAATTTTCGGGTCTATTACAGAGAAGGCAGAGATCAACTGTGGAAGGGACCCGGTGAGCTATTGTGGAAAGGGGAAGGAGCAGTCATCTTAAAGGTAGGGACAGACATTAAGGTAGTACCCAGAAGAAAGGCTAAAATTATCAAAGATTATGGAGGAGGAAAAGAGGTGGATAGCAGTTCCCACATGGAGGATACCGGAGAGGCTAGAGAGGTGGCATAGCCTCATAAAATATCTGAAATATAAAACTAAAGATCTACAAAAGGTTTGCTATGTGCCCCATTTTAAGGTCGGATGGGCATGGTGGACCTGCAGCAGAGTAATCTTCCCACTACAGGAAGGAAGCCATTTAGAAGTACAAGGGTATTGGCATTTGACACCAGAAAAAGGGTGGCTCAGTACTTATGCAGTGAGGATAACCTGGTACTCAAAGAACTTTTGGACAGATGTAACACCAAACTATGCAGACATTTTACTGCATAGCACTTATTTCCCTTGCTTTACAGCGGGAGAAGTGAGAAGGGCCATCAGGGGAGAACAACTGCTGTCTTGCTG
CAGGTTCCCGAGAGCTCATAAGTACCAGGTACCAAGCCTACAGTACTTAGCACTGAAAGTAGTAAGCGATGTCAGATCCCAGGGAGAGAATCCCACCTGGAAACAGTGGAGAAGAGACAATAGGAGAGGCCTTCGAATGGCTAAACAGAACAGTAGAGGAGATAAACAGAGAGGCGGTAAACCACCTACCAAGGGAGCTAATTTTCCAGGTTTGGCAAAGGTCTTGGGAATACTGGCATGATGAACAAGGGATGTCACCAAGCTATGTAAAATACAGATACTTGTGTTTAATACAAAAGGCTTTATTTATGCATTGCAAGAAAGGCTGTAGATGTCTAGGGGAAGGACATGGGGCAGGGGGATGGAGACCAGGACCTCCTCCTCCTCCCCCTCCAGGACTAGCATAAATGGAAGAAAGACCTCCAGAAAATGAAGGACCACAAAGGGAACCATGGGATGAATGGGTAGTGGAGGTTCTGGAAGAACTGAAAGAAGAAGCTTTAAAACATTTTGATCCTCGCTTGCTAACTGCACTTGGTAATCATATCTATAATAGACATGGAGACACCCTTGAGGGAGCAGGAGAACTCATTAGAATCCTCCAACGAGCGCTCTTCATGCATTTCAGAGGCGGATGCATCCACTCCAGAATCGGCCAACCTGGGGGAGGAAATCCTCTCTCAGCTATACCGCCCTCTAGAAGCATGCTATAACACATGCTATTGTAAAAAGTGTTGCTACCATTGCCAGTTTTGTTTTCTTAAAAAAGGCTTGGGGATATGTTATGAGCAATCACGAAAGAGAAGAAGAACTCCGAAAAAGGCTAAGGCTAATACATCTTCTGCATCAAACAAGTAAGTATGGGATGTCTTGGGAATCAGCTGCTTATCGCCATCTTGCTTTTAAGTGTCTATGGGATCTATTGTACTCTATATGTCACAGTCTTTTATGGTGTACCAGCTTGGAGGAATGCGACAATTCCCCTCTTTTGTGCAACCAAGAATAGGGATACTTGGGGAACAACTCAGTGCCTACCAGATAATGGTGATTATTCAGAAGTGGCCCTTAATGTTACAGAAAGCTTTGATGCCTGGAATAATACAGTCACAGAACAGGCAATAGAGGATGTATGGCAACTCTTTGAGACCTCAATAAAGCCTTGTGTAAAATTATCCCCATTATGCATTACTATGAGATGCAATAAAAGTGAGACAGATAGATGGGGATTGACAAAATCAATAACAACAACAGCATCAACAACATCAACGACAGCATCAGCAAAAGTAGACATGGTCAATGAGACTAGTTCTTGTATAGCCCAGGATAATTGCACAGGCTTGGAACAAGAGCAAATGATAAGCTGTAAATTCAACATGACAGGGTTAAAAAGAGACAAGAAAAAAGAGTACAATGAAACTTGGTACTCTGCAGATTTGGTATGTGAACAAGGGAATAACACTGGTAATGAAAGTAGATGTTACATGAACCACTGTAACACTTCTGTTATCCAAGAGTCTTGTGACAAACATTATTGGGATGCTATTAGATTTAGGTATTGTGCACCTCCAGGTTATGCTTTGCTTAGATGTAATGACACAAATTATTCAGGCTTTATGCCTAAATGTTCTAAGGTGGTGGTCTCTTCATGCACAAGGATGATGGAGACACAGACTTCTACTTGGTTTGGCTTTAATGGAACTAGAGCAGAAAATAGAACTTATATTTACTGGCATGGTAGGGATAATAGGACTATAATTAGTTTAAATAAGTATTATAATCTAACAATGAAATGTAGAAGACCAGGAAATAAGACAGTTTTACCAGTCACCATTATGTCTGGATTGGTTTTCCACTCACAACCAATCAATGATAGGCCAAAGCAGGCATGGTGTTGGTTTGGAGGAAAATGGAAGGATGCAATAAAAGAGGTGAAGCAGACCATTGTCAAACATCCCAGGTATACTGGAACTAACAATACTGATAAAATCAATTTGACGGCT
CCTGGAGGAGGAGATCCGGAAGTTACCTTCATGTGGACAAATTGCAGAGGAGAGTTCCTCTACTGTAAAATGAATTGGTTTCTAAATTGGGTAGAAGATAGGAATACAGCTAACCAGAAGCCAAAGGAACAGCATAAAAGGAATTACGTGCCATGTCATATTAGACAAATAATCAACACTTGGCATAAAGTAGGCAAAAATGTTTATTTGCCTCCAAGAGAGGGAGACCTCACGTGTAACTCCACAGTGACCAGTCTCATAGCAAACATAGATTGGATTGATGGAAACCAAACTAATATCACCATGAGTGCAGAGGTGGCAGAACTGTATCGATTGGAATTGGGAGATTATAAATTAGTAGAGATCACTCCAATTGGCTTGGCCCCCACAGATGTGAAGAGGTACACTACTGGTGGCACCTCAAGAAATAAAAGAGGGGTCTTTGTGCTAGGGTTCTTGGGTTTTCTCGCAACGGCAGGTTCTGCAATGGGCGCGGCGTCGTTGACGCTGACCGCTCAGTCCCGAACTTTATTGGCTGGGATAGTGCAGCAACAGCAACAGCTGTTGGACGTGGTCAAGAGACAACAAGAATTGTTGCGACTGACCGTCTGGGGAACAAAGAACCTCCAGACTAGGGTCACTGCCATCGAGAAGTACTTAAAGGACCAGGCGCAGCTGAATGCTTGGGGATGTGCGTTTAGACAAGTCTGCCACACTACTGTACCATGGCCAAATGCAAGTCTAACACCAAAGTGGAACAATGAGACTTGGCAAGAGTGGGAGCGAAAGGTTGACTTCTTGGAAGAAAATATAACAGCCCTCCTAGAGGAGGCACAAATTCAACAAGAGAAGAACATGTATGAATTACAAAAGTTGAATAGCTGGGATGTGTTTGGCAATTGGTTTGACCTTGCTTCTTGGATAAAGTATATACAATATGGAGTTTATATAGTTGTAGGAGTAATACTGTTAAGAATAGTGATCTATATAGTACAAATGCTAGCTAAGTTAAGGCAGGGGTATAGGCCAGTGTTCTCTTCCCCACCCTCTTATTTCCAGCAGACCCATATCCAACAGGACCCGGCACTGCCAACCAGAGAAGGCAAAGAAAGAGACGGTGGAGAAGGCGGTGGCAACAGCTCCTGGCCTTGGCAGATAGAATATATTCATTTCCTGATCCGCCAACTGATACGCCTCTTGACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGCGACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGGTCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGAAGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGGGACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGATATAGATGAGTAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGCAATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTT
ATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCA"
|
26
|
+
else
|
27
|
+
raise StandardError.new("reference sequence not recognized, choose from :HXB2 (default), :NL43, or :MAC239.")
|
28
|
+
end
|
29
|
+
rescue StandardError => e
|
30
|
+
puts e.message
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
# Fisher's Exact Test Function Library
|
4
|
+
#
|
5
|
+
# Based on JavaScript version created by: Oyvind Langsrud,
|
6
|
+
# Ported to Ruby by Bryan Donovan
|
7
|
+
|
8
|
+
module Rubystats
|
9
|
+
# Fisher's exact test
|
10
|
+
class FishersExactTest
|
11
|
+
|
12
|
+
def initialize
  # Table values and probability remembered between hyper0 calls.
  @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0

  # Running tail accumulators.
  @sleft = @sright = @sless = @slarg = 0.0

  # Result slots; all start at zero.
  @left = @right = @twotail = 0.0
end
|
28
|
+
|
29
|
+
# @see http://lib.stat.cmu.edu/apstat/245 Reference: "Lanczos, C. 'A precision approximation of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964." Translation of Alan Miller's FORTRAN-implementation.
|
30
|
+
|
31
|
+
def lngamm(z)
  # Signed series terms as [coefficient, offset added to z], applied in the
  # order listed so the floating-point accumulation is unchanged.
  terms = [
    [0.0000001659470187408462, 7],
    [0.000009934937113930748, 6],
    [-0.1385710331296526, 5],
    [12.50734324009056, 4],
    [-176.6150291498386, 3],
    [771.3234287757674, 2],
    [-1259.139216722289, 1]
  ]

  series = 0
  terms.each { |coefficient, offset| series += coefficient / (z + offset) }
  series += 676.5203681218835 / (z)
  series += 0.9999999999995183

  log_series = ::Math.log(series)
  log_series - 5.58106146679532777 - z + (z - 0.5) * ::Math.log(z + 6.5)
end
|
45
|
+
|
46
|
+
# Returns 0 for n <= 1, otherwise delegates to lngamm(n + 1).
def lnfact(n)
  n <= 1 ? 0 : lngamm(n + 1)
end
|
53
|
+
|
54
|
+
def lnbico(n,k)
|
55
|
+
return lnfact(n) - lnfact(k) - lnfact(n-k)
|
56
|
+
end
|
57
|
+
|
58
|
+
def hyper_323(n11, n1_, n_1, n)
|
59
|
+
return ::Math.exp(lnbico(n1_, n11) + lnbico(n-n1_, n_1-n11) - lnbico(n, n_1))
|
60
|
+
end
|
61
|
+
|
62
|
+
def hyper(n11)
|
63
|
+
return hyper0(n11, 0, 0, 0)
|
64
|
+
end
|
65
|
+
|
66
|
+
def hyper0(n11i,n1_i,n_1i,ni)
|
67
|
+
if n1_i == 0 and n_1i ==0 and ni == 0
|
68
|
+
unless n11i % 10 == 0
|
69
|
+
if n11i == @sn11+1
|
70
|
+
@sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
|
71
|
+
@sn11 = n11i
|
72
|
+
return @sprob
|
73
|
+
end
|
74
|
+
if n11i == @sn11-1
|
75
|
+
@sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
|
76
|
+
@sn11 = n11i
|
77
|
+
return @sprob
|
78
|
+
end
|
79
|
+
end
|
80
|
+
@sn11 = n11i
|
81
|
+
else
|
82
|
+
@sn11 = n11i
|
83
|
+
@sn1_ = n1_i
|
84
|
+
@sn_1 = n_1i
|
85
|
+
@sn = ni
|
86
|
+
end
|
87
|
+
@sprob = hyper_323(@sn11,@sn1_,@sn_1,@sn)
|
88
|
+
return @sprob
|
89
|
+
end
|
90
|
+
|
91
|
+
def exact(n11,n1_,n_1,n)
|
92
|
+
|
93
|
+
p = i = j = prob = 0.0
|
94
|
+
|
95
|
+
max = n1_
|
96
|
+
max = n_1 if n_1 < max
|
97
|
+
min = n1_ + n_1 - n
|
98
|
+
min = 0 if min < 0
|
99
|
+
|
100
|
+
if min == max
|
101
|
+
@sless = 1
|
102
|
+
@sright = 1
|
103
|
+
@sleft = 1
|
104
|
+
@slarg = 1
|
105
|
+
return 1
|
106
|
+
end
|
107
|
+
|
108
|
+
prob = hyper0(n11,n1_,n_1,n)
|
109
|
+
@sleft = 0
|
110
|
+
|
111
|
+
p = hyper(min)
|
112
|
+
i = min + 1
|
113
|
+
while p < (0.99999999 * prob)
|
114
|
+
@sleft += p
|
115
|
+
p = hyper(i)
|
116
|
+
i += 1
|
117
|
+
end
|
118
|
+
|
119
|
+
i -= 1
|
120
|
+
|
121
|
+
if p < (1.00000001*prob)
|
122
|
+
@sleft += p
|
123
|
+
else
|
124
|
+
i -= 1
|
125
|
+
end
|
126
|
+
|
127
|
+
@sright = 0
|
128
|
+
|
129
|
+
p = hyper(max)
|
130
|
+
j = max - 1
|
131
|
+
while p < (0.99999999 * prob)
|
132
|
+
@sright += p
|
133
|
+
p = hyper(j)
|
134
|
+
j -= 1
|
135
|
+
end
|
136
|
+
j += 1
|
137
|
+
|
138
|
+
if p < (1.00000001*prob)
|
139
|
+
@sright += p
|
140
|
+
else
|
141
|
+
j += 1
|
142
|
+
end
|
143
|
+
|
144
|
+
if (i - n11).abs < (j - n11).abs
|
145
|
+
@sless = @sleft
|
146
|
+
@slarg = 1 - @sleft + prob
|
147
|
+
else
|
148
|
+
@sless = 1 - @sright + prob
|
149
|
+
@slarg = @sright
|
150
|
+
end
|
151
|
+
return prob
|
152
|
+
end
|
153
|
+
|
154
|
+
def calculate(n11_,n12_,n21_,n22_)
|
155
|
+
n11_ *= -1 if n11_ < 0
|
156
|
+
n12_ *= -1 if n12_ < 0
|
157
|
+
n21_ *= -1 if n21_ < 0
|
158
|
+
n22_ *= -1 if n22_ < 0
|
159
|
+
n1_ = n11_ + n12_
|
160
|
+
n_1 = n11_ + n21_
|
161
|
+
n = n11_ + n12_ + n21_ + n22_
|
162
|
+
exact(n11_,n1_,n_1,n)
|
163
|
+
left = @sless
|
164
|
+
right = @slarg
|
165
|
+
twotail = @sleft + @sright
|
166
|
+
twotail = 1 if twotail > 1
|
167
|
+
values_hash = { :left =>left, :right =>right, :twotail =>twotail }
|
168
|
+
return values_hash
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
@@ -0,0 +1,1043 @@
|
|
1
|
+
|
2
|
+
module ViralSeq
|
3
|
+
|
4
|
+
# ViralSeq::SeqHash class for operation on multiple sequences.
|
5
|
+
# @example read a FASTA sequence file of HIV PR sequences, make alignment, perform the QC location check, filter sequences with stop codons and APOBEC3g/f hypermutations, calculate pairwise diversity, calculate minority cut-off based on Poisson model, and examine for drug resistance mutations.
|
6
|
+
# my_pr_seqhash = ViralSeq::SeqHash.fa('my_pr_fasta_file.fasta')
|
7
|
+
# # new ViralSeq::SeqHash object from a FASTA file
|
8
|
+
# aligned_pr_seqhash = my_pr_seqhash.align
|
9
|
+
# # align with MUSCLE
|
10
|
+
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
|
+
# # filter nt sequences with the reference coordinates
|
12
|
+
# filtered_seqhash = filtered_seqhash.stop_codon[1]
|
13
|
+
# # return a new ViralSeq::SeqHash object without stop codons
|
14
|
+
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
|
+
# # further filter out sequences with A3G hypermutations
|
16
|
+
# filtered_seqhash.pi
|
17
|
+
# # return pairwise diversity π
|
18
|
+
# cut_off = filtered_seqhash.pm
|
19
|
+
# # return cut-off for minority variants based on Poisson model
|
20
|
+
# filtered_seqhash.sdrm_hiv_pr(cut_off)
|
21
|
+
# # examine for drug resistance mutations for PR region.
|
22
|
+
|
23
|
+
class SeqHash
|
24
|
+
# initialize a ViralSeq::SeqHash object
|
25
|
+
def initialize(dna_hash = {}, aa_hash = {}, qc_hash = {}, title = "", file = "")
  # three name-keyed hashes (nucleotide, amino acid, quality) ...
  @dna_hash, @aa_hash, @qc_hash = dna_hash, aa_hash, qc_hash
  # ... plus a label and the path of the originating file
  @title, @file = title, file
end
|
32
|
+
|
33
|
+
# @return [Hash] Hash object for :name => :sequence_string pairs
|
34
|
+
attr_accessor :dna_hash
|
35
|
+
|
36
|
+
# @return [Hash] Hash object for :name => :amino_acid_sequence_string pairs
|
37
|
+
attr_accessor :aa_hash
|
38
|
+
|
39
|
+
# @return [Hash] Hash object for :name => :qc_score_string pairs
|
40
|
+
attr_accessor :qc_hash
|
41
|
+
|
42
|
+
# @return [String] the title of the SeqHash object.
|
43
|
+
# default as the file basename if SeqHash object is initialized using ::fa or ::fq
|
44
|
+
attr_accessor :title
|
45
|
+
|
46
|
+
# @return [String] the file that is used to initialize SeqHash object, if it exists
|
47
|
+
attr_accessor :file
|
48
|
+
|
49
|
+
# initialize a new ViralSeq::SeqHash object from a FASTA format sequence file
|
50
|
+
# @param infile [String] path to the FASTA format sequence file
|
51
|
+
# @return [ViralSeq::SeqHash]
|
52
|
+
# @example new ViralSeq::SeqHash object from a FASTA file
|
53
|
+
# ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
54
|
+
|
55
|
+
def self.new_from_fasta(infile)
  # Parse a nucleotide FASTA file into a SeqHash.
  #   * header lines (">...") become keys, kept verbatim including ">"
  #   * sequence lines are upper-cased and concatenated under the latest header
  #   * NUL characters are stripped; empty lines and lines starting with "=" are skipped
  #   * text appearing before the first header is ignored (it previously raised
  #     NoMethodError on nil)
  return_hash = {}
  name = nil
  # File.foreach closes the file even if parsing raises.
  File.foreach(infile) do |raw|
    line = raw.delete("\u0000")
    next if line == "\n"
    next if line.start_with?("=")
    if line.start_with?(">")
      name = line.chomp
      return_hash[name] = "".dup
    elsif name
      # in-place append avoids re-allocating the whole sequence per line
      return_hash[name] << line.chomp.upcase
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = return_hash
  seq_hash.title = File.basename(infile,".*")
  seq_hash.file = infile
  return seq_hash
end # end of ::new_from_fasta
|
77
|
+
|
78
|
+
# initialize a new ViralSeq::SeqHash object from a FASTA format sequence file of amino acid sequences
|
79
|
+
# @param infile [String] path to the FASTA format sequence file of aa sequences
|
80
|
+
# @return [ViralSeq::SeqHash]
|
81
|
+
|
82
|
+
def self.new_from_aa_fasta(infile)
  # Parse an amino-acid FASTA file into a SeqHash (fills @aa_hash only).
  #   * header lines (">...") become keys, kept verbatim including ">"
  #   * sequence lines are upper-cased and concatenated under the latest header
  #   * NUL characters are stripped; empty lines and lines starting with "=" are skipped
  #   * text appearing before the first header is ignored (it previously raised
  #     NoMethodError on nil)
  return_hash = {}
  name = nil
  # File.foreach closes the file even if parsing raises.
  File.foreach(infile) do |raw|
    line = raw.delete("\u0000")
    next if line == "\n"
    next if line.start_with?("=")
    if line.start_with?(">")
      name = line.chomp
      return_hash[name] = "".dup
    elsif name
      # in-place append avoids re-allocating the whole sequence per line
      return_hash[name] << line.chomp.upcase
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.aa_hash = return_hash
  seq_hash.title = File.basename(infile,".*")
  seq_hash.file = infile
  return seq_hash
end # end of ::new_from_aa_fasta
|
104
|
+
|
105
|
+
# initialize a new ViralSeq::SeqHash object from a FASTQ format sequence file
|
106
|
+
# @param fastq_file [String] path to the FASTQ format sequence file
|
107
|
+
# @return [ViralSeq::SeqHash]
|
108
|
+
# @example new ViralSeq::SeqHash object from a FASTQ file
|
109
|
+
# ViralSeq::SeqHash.fq('my_fastq_file.fastq')
|
110
|
+
|
111
|
+
def self.new_from_fastq(fastq_file)
  # Parse a FASTQ file, read as consecutive 4-line records
  # (header, bases, separator, quality).
  #   * every "@" in the header line is turned into ">" and the result is the key
  #   * bases go to @dna_hash, quality strings to @qc_hash
  #   * a truncated trailing record contributes only the parts that are present
  #
  # The hashes are filled record by record; the previous Hash[*array] splat
  # could exhaust the argument stack on large files and raised on a
  # truncated record.
  sequence_hash = {}
  quality_hash = {}

  File.foreach(fastq_file).each_slice(4) do |header, bases, _separator, quality|
    name = header.tr('@','>').chomp
    sequence_hash[name] = bases.chomp if bases
    quality_hash[name] = quality.chomp if quality
  end

  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = sequence_hash
  seq_hash.qc_hash = quality_hash
  seq_hash.title = File.basename(fastq_file,".*")
  seq_hash.file = fastq_file
  return seq_hash
end # end of ::new_from_fastq
|
143
|
+
|
144
|
+
# initialize a ViralSeq::SeqHash object with an array of sequence strings
|
145
|
+
# @param master_tag [String] master tag to put in the sequence names
|
146
|
+
# @return [ViralSeq::SeqHash] No @qc_hash, @title will be the master_tag
|
147
|
+
|
148
|
+
def self.new_from_array(seq_array,master_tag = 'seq')
  # name each sequence "<master_tag>_<1-based position>"
  tagged = {}
  seq_array.each.with_index(1) do |seq, position|
    tagged[master_tag + "_" + position.to_s] = seq
  end

  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = tagged
  seq_hash.title = master_tag
  seq_hash
end # end of ::new_from_array
|
160
|
+
|
161
|
+
|
162
|
+
# short constructor names: SeqHash.fa, .fq, .aa_fa and .array
class << self
  alias fa new_from_fasta
  alias fq new_from_fastq
  alias aa_fa new_from_aa_fasta
  alias array new_from_array
end
|
168
|
+
|
169
|
+
# generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
|
170
|
+
# @return [String] relaxed sequential phylip format in a String object
|
171
|
+
# @example convert fasta format to relaxed sequential phylip format
|
172
|
+
# # my_fasta_file.fasta
|
173
|
+
# # >seq1
|
174
|
+
# # ATAAGAACG
|
175
|
+
# # >seq2
|
176
|
+
# # ATATGAACG
|
177
|
+
# # >seq3
|
178
|
+
# # ATGAGAACG
|
179
|
+
# my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
|
180
|
+
# puts my_seqhash.to_rsphylip
|
181
|
+
# # 3 9
|
182
|
+
# # seq1 ATAAGAACG
|
183
|
+
# # seq2 ATATGAACG
|
184
|
+
# # seq3 ATGAGAACG
|
185
|
+
|
186
|
+
def to_rsphylip
  # Layout: a header line " <number of sequences> <length of first sequence>",
  # then one line per sequence: the name (without ">"), padding up to the
  # name block width plus two spaces, and the sequence in blocks of 10.
  seqs = self.dna_hash
  seq_length = seqs.empty? ? 0 : seqs.values.first.size
  outline = "\s" + seqs.size.to_s + "\s" + seq_length.to_s + "\n"

  labels = seqs.keys.map { |n| n.tr(">", "") }
  # Width of the LONGEST name, never below 10. (The previous `names.max.size`
  # used the lexicographically largest name, so a longer name elsewhere made
  # the padding negative and raised ArgumentError.)
  name_block_l = [labels.map(&:size).max || 0, 10].max

  labels.zip(seqs.values).each do |label, seq|
    outline << label << "\s" * (name_block_l - label.size + 2) << seq.scan(/.{1,10}/).join("\s") << "\n"
  end
  return outline
end # end of #to_rsphylip
|
198
|
+
|
199
|
+
# translate the DNA sequences in @dna_hash to amino acid sequences. generate value for @aa_hash
|
200
|
+
# @param codon_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
|
201
|
+
# @return [NilClass]
|
202
|
+
# @example translate dna sequences from a FASTA format sequence file
|
203
|
+
# # my_fasta_file.fasta
|
204
|
+
# # >seq1
|
205
|
+
# # ATAAGAACG
|
206
|
+
# # >seq2
|
207
|
+
# # ATATGAACG
|
208
|
+
# # >seq3
|
209
|
+
# # ATGAGAACG
|
210
|
+
# my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
|
211
|
+
# my_seqhash.translate
|
212
|
+
# my_seqhash.aa_hash
|
213
|
+
# => {">seq1"=>"IRT", ">seq2"=>"I*T", ">seq3"=>"MRT"}
|
214
|
+
|
215
|
+
def translate(codon_position = 0)
  # rebuild @aa_hash from scratch, one ViralSeq::Sequence translation per entry
  @aa_hash = {}
  self.dna_hash.each_pair do |tag, nt_string|
    record = ViralSeq::Sequence.new(tag, nt_string)
    record.translate(codon_position)
    @aa_hash[tag] = record.aa_string
  end
  nil
end # end of #translate
|
225
|
+
|
226
|
+
# collapse @dna_hash to unique sequence hash.
|
227
|
+
# @param tag # the master tag for unique sequences,
|
228
|
+
# sequences will be named as (tag + "_" + order(Integer) + "_" + counts(Integer))
|
229
|
+
# @return [ViralSeq::SeqHash] new SeqHash object of unique sequence hash
|
230
|
+
# @example
|
231
|
+
# dna_hash = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA', '>seq4' => 'CCCC', '>seq5' => 'CCCC', '>seq6' => 'TTTT'}
|
232
|
+
# a_seq_hash = ViralSeq::SeqHash.new
|
233
|
+
# a_seq_hash.dna_hash = dna_hash
|
234
|
+
# uniq_sequence = a_seq_hash.uniq_dna_hash('master')
|
235
|
+
# => {">master_1_3"=>"AAAA", ">master_2_2"=>"CCCC", ">master_3_1"=>"TTTT"}
|
236
|
+
|
237
|
+
def uniq_dna_hash(tag = "sequence")
  # one entry per distinct sequence, named ">tag_<order>_<copies>"
  collapsed = {}
  self.dna_hash.values.count_freq.each_with_index do |(sequence, copies), index|
    collapsed[">" + tag + "_" + (index + 1).to_s + "_" + copies.to_s] = sequence
  end

  uniq_seq_hash = ViralSeq::SeqHash.new(collapsed)
  uniq_seq_hash.title = self.title + "_uniq"
  uniq_seq_hash.file = self.file
  uniq_seq_hash
end # end of #uniq_dna_hash
|
252
|
+
|
253
|
+
alias_method :uniq, :uniq_dna_hash
|
254
|
+
|
255
|
+
# given an Array of sequence tags, return a sub ViralSeq::SeqHash object with the sequence tags
|
256
|
+
# @param keys [Array] array of sequence tags
|
257
|
+
# @return [SeqHash] new SeqHash object with sequences of the input keys
|
258
|
+
|
259
|
+
def sub(keys)
  # only tags that have a (truthy) entry in @dna_hash are carried over
  found = keys.select { |tag| self.dna_hash[tag] }
  # copy the entries of `source` for those tags; a tag missing from `source` maps to nil
  pick = lambda do |source|
    found.each_with_object({}) { |tag, picked| picked[tag] = source[tag] }
  end

  ViralSeq::SeqHash.new(pick.call(self.dna_hash),
                        pick.call(self.aa_hash),
                        pick.call(self.qc_hash),
                        self.title,
                        self.file)
end
|
277
|
+
|
278
|
+
# screen for sequences with stop codons.
|
279
|
+
# @param (see #translate)
|
280
|
+
# @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
|
281
|
+
#
|
282
|
+
# # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
|
283
|
+
# # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
|
284
|
+
# @example given a hash of sequences, return a sub-hash containing only the sequences with stop codons
|
285
|
+
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
286
|
+
# my_seqhash.dna_hash
|
287
|
+
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
288
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[0]
|
289
|
+
# stop_codon_seqhash.dna_hash
|
290
|
+
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
291
|
+
# stop_codon_seqhash.aa_hash
|
292
|
+
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
293
|
+
# stop_codon_seqhash.title
|
294
|
+
# => "my_fasta_file_stop"
|
295
|
+
# filtered_seqhash = my_seqhash.stop_codon[1]
|
296
|
+
# filtered_seqhash.aa_hash
|
297
|
+
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
298
|
+
|
299
|
+
def stop_codon(codon_position = 0)
  # refresh @aa_hash, then split the tags on whether the translation contains '*'
  self.translate(codon_position)
  with_stop, without_stop = self.aa_hash.keys.partition do |tag|
    self.aa_hash[tag].include?('*')
  end

  stop_seqhash = self.sub(with_stop)
  stop_seqhash.title = self.title + "_stop"
  [stop_seqhash, self.sub(without_stop)]
end #end of #stop_codon
|
311
|
+
|
312
|
+
|
313
|
+
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
314
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
|
315
|
+
# @return [String] consensus sequence
|
316
|
+
# @example consensus sequence from an array of sequences.
|
317
|
+
# seq_array = %w{ ATTTTTTTTT
|
318
|
+
# AATTTTTTTT
|
319
|
+
# AAATTTTTTT
|
320
|
+
# AAAATTTTTT
|
321
|
+
# AAAAATTTTT
|
322
|
+
# AAAAAATTTT
|
323
|
+
# AAAAAAATTT
|
324
|
+
# AAAAAAAATT
|
325
|
+
# AAAAAAAAAT
|
326
|
+
# AAAAAAAAAA }
|
327
|
+
# my_seqhash = ViralSeq::SeqHash.array(seq_array)
|
328
|
+
# my_seqhash.consensus
|
329
|
+
# => 'AAAAAWTTTT'
|
330
|
+
# my_seqhash.consensus(0.7)
|
331
|
+
# => 'AAAANNNTTT'
|
332
|
+
|
333
|
+
def consensus(cutoff = 0.5)
  seq_array = self.dna_hash.values
  seq_size = seq_array.size

  # walk the columns of the first sequence's length, appending one called base each
  (0...seq_array[0].size).inject("") do |called, position|
    column = seq_array.map { |seq| seq[position] }
    # bases whose share of the column reaches the cut-off
    qualifying = column.count_freq.select { |_base, n| n/seq_size.to_f >= cutoff }.keys
    called + call_consensus_base(qualifying)
  end
end #end of #consensus
|
355
|
+
|
356
|
+
# function to determine if the sequences have APOBEC3g/f hypermutation.
|
357
|
+
# # APOBEC3G/F pattern: GRD -> ARD
|
358
|
+
# # control pattern: G[YN|RC] -> A[YN|RC]
|
359
|
+
# # use the sample consensus to determine potential a3g sites
|
360
|
+
# # Two criteria to identify hypermutation
|
361
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
|
362
|
+
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
363
|
+
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
364
|
+
# # b/c Poisson model does not do well on small sample size.
|
365
|
+
# @return [Array] three values.
|
366
|
+
# first value, `array[0]`: a ViralSeq::SeqHash object for sequences with hypermutations
|
367
|
+
# second value, `array[1]`: a ViralSeq::SeqHash object for sequences without hypermutations
|
368
|
+
# third value, `array[2]`: a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
369
|
+
# # sequence tag
|
370
|
+
# # G to A mutation numbers at potential a3g positions
|
371
|
+
# # total potential a3g G positions
|
372
|
+
# # G to A mutation numbers at non a3g positions
|
373
|
+
# # total non a3g G positions
|
374
|
+
# # a3g G to A mutation rate / non-a3g G to A mutation rate
|
375
|
+
# # Fishers Exact P-value
|
376
|
+
# @example identify apobec3gf mutations from a sequence fasta file
|
377
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
378
|
+
# hypermut = my_seqhash.a3g
|
379
|
+
# hypermut[0].dna_hash.keys
|
380
|
+
# => [">Seq7", ">Seq14"]
|
381
|
+
# hypermut[1].dna_hash.keys
|
382
|
+
# => [">Seq1", ">Seq2", ">Seq5"]
|
383
|
+
# hypermut[2]
|
384
|
+
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
385
|
+
#
|
386
|
+
# @example identify apobec3gf mutations from another sequence fasta file
|
387
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
388
|
+
# hypermut = my_seqhash.a3g
|
389
|
+
# hypermut[2]
|
390
|
+
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
391
|
+
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
392
|
+
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
393
|
+
# @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
|
394
|
+
|
395
|
+
def a3g_hypermut
  mut_hash = {}   # tag => number of "A" at the A3G positions
  hm_hash = {}    # tag => statistics row, sequences called hypermutated
  out_hash = {}   # tag => statistics row, every sequence

  # total "A" calls at A3G positions over all sequences
  total = 0

  # potential A3G positions and control positions come from the sample consensus
  ref = self.consensus
  positions = apobec3gf(ref)
  a3g_sites = positions[0]
  control_sites = positions[1]

  self.dna_hash.each do |tag, seq|
    # gaps are ignored; a = "A" calls, b = all remaining calls at A3G sites
    a3g_calls = a3g_sites.map { |n| seq[n] }.reject { |base| base == "-" }
    a = a3g_calls.count("A")
    b = a3g_calls.size
    mut_hash[tag] = a
    total += a

    # same tally at the control sites: c = "A" calls, d = all remaining calls
    control_calls = control_sites.map { |n| seq[n] }.reject { |base| base == "-" }
    c = control_calls.count("A")
    d = control_calls.size

    rr = (a/b.to_f)/(c/d.to_f)

    fisher = ViralSeq::Rubystats::FishersExactTest.new.calculate(b - a, d - c, a, c)
    perc = fisher[:twotail]
    info = [tag, a, b, c, d, rr.round(2), perc]
    out_hash[tag] = info
    hm_hash[tag] = info if perc < 0.05
  end

  # Poisson outlier screen, applied only to more than 20 sequences
  if self.dna_hash.size > 20
    rate = total.to_f/(self.dna_hash.size)
    count_mut = mut_hash.values.count_freq
    # NOTE(review): this is the largest frequency in count_mut, not the largest
    # mutation count (#poisson_minority_cutoff uses keys.max) - confirm intended.
    maxi_count = count_mut.values.max
    poisson_hash = ViralSeq::Math::PoissonDist.new(rate,maxi_count).poisson_hash
    cut_off = 0
    poisson_hash.each do |k, probability|
      expected = self.dna_hash.size * probability
      if count_mut[k] >= 20 * expected
        cut_off = k
        break
      elsif k == maxi_count
        cut_off = maxi_count
      end
    end
    mut_hash.each do |tag, mutations|
      hm_hash[tag] = out_hash[tag] if mutations > cut_off
    end
  end

  hm_seq_hash = ViralSeq::SeqHash.new
  hm_hash.each_key do |tag|
    hm_seq_hash.dna_hash[tag] = self.dna_hash[tag]
  end
  hm_seq_hash.title = self.title + "_hypermut"
  hm_seq_hash.file = self.file
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
  [hm_seq_hash, filtered_seq_hash, hm_hash.values]
end #end of #a3g_hypermut
|
484
|
+
|
485
|
+
alias_method :a3g, :a3g_hypermut
|
486
|
+
|
487
|
+
# Define Poisson cut-off for minority variants.
|
488
|
+
# @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 Ref: Zhou, et al. J Virol 2015
|
489
|
+
# @param error_rate [Float] estimated sequencing error rate
|
490
|
+
# @param fold_cutoff [Integer] a fold cut-off to determine poisson minority cut-off. default = 20. i.e. <5% mutations from random methods error.
|
491
|
+
# @return [Integer] a cut-off for minority variants (>=).
|
492
|
+
# @example obtain Poisson minority cut-off from the example sequence FASTA file.
|
493
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
494
|
+
# my_seqhash.pm
|
495
|
+
# => 2 # means that mutations appear at least 2 times are very likely to be a true mutation instead of random methods errors.
|
496
|
+
|
497
|
+
def poisson_minority_cutoff(error_rate = 0.0001, fold_cutoff = 20)
  sequences = self.dna_hash.values
  return 0 if sequences.size == 0

  seq_length = sequences[0].size
  rate = sequences.size * error_rate
  count_mut = variant_for_poisson(sequences)
  max_count = count_mut.keys.max
  poisson_hash = ViralSeq::Math::PoissonDist.new(rate, max_count).poisson_hash

  # first k whose observed count reaches fold_cutoff times the expected count
  hit = poisson_hash.find do |k, probability|
    (count_mut[k] || 0) >= fold_cutoff * (seq_length * probability)
  end
  hit ? hit[0] : 1
end # end of #poisson_minority_cutoff
|
520
|
+
|
521
|
+
alias_method :pm, :poisson_minority_cutoff
|
522
|
+
|
523
|
+
|
524
|
+
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
525
|
+
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
526
|
+
# @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
|
527
|
+
|
528
|
+
def align(path_to_muscle = false)
  # Writes @dna_hash to a temporary FASTA file next to the source file (or
  # next to the running script when no source file is known), runs MUSCLE on
  # it and reads the alignment back. Returns nil when the MUSCLE check fails.
  require 'shellwords' # stdlib; quotes the temp paths for the command line

  seq_hash = self.dna_hash
  temp_dir = self.file.size > 0 ? File.dirname(self.file) : File.dirname($0)
  temp_file = File.join(temp_dir, "_temp_muscle_in")
  temp_aln = File.join(temp_dir, "_temp_muscle_aln")

  begin
    File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}

    # Directories containing spaces or shell metacharacters used to break the
    # command line; plain paths are left unchanged by the escaping.
    in_arg = Shellwords.escape(temp_file)
    out_arg = Shellwords.escape(temp_aln)
    if path_to_muscle
      return nil unless ViralSeq.check_muscle?(path_to_muscle)
      print `#{path_to_muscle} -in #{in_arg} -out #{out_arg} -quiet`
    else
      MuscleBio.run("muscle -in #{in_arg} -out #{out_arg} -quiet")
    end

    out_seq_hash = ViralSeq::SeqHash.fa(temp_aln)
    out_seq_hash.title = self.title + "_aligned"
    out_seq_hash.file = self.file
    return out_seq_hash
  ensure
    # always clean up, including when MUSCLE or the parser raised
    [temp_file, temp_aln].each do |path|
      File.unlink(path) if File.exist?(path)
    end
  end
end # end of align
|
555
|
+
|
556
|
+
# calculate Shannon's entropy, Euler's number as the base of logarithm
|
557
|
+
# @see https://en.wikipedia.org/wiki/Entropy_(information_theory) Entropy(Wikipedia)
|
558
|
+
# @param option [Symbol] the sequence type `:nt` or `:aa`
|
559
|
+
# @return [Hash] entropy score at each position in the alignment :position => :entropy ,
|
560
|
+
# # position starts at 1.
|
561
|
+
# @example calculate entropy from the example file
|
562
|
+
# sequence_file = 'spec/sample_files/sample_sequence_alignment_for_entropy.fasta'
|
563
|
+
# sequence_hash = ViralSeq::SeqHash.aa_fa(sequence_file)
|
564
|
+
# entropy_hash = sequence_hash.shannons_entropy(:aa)
|
565
|
+
# entropy_hash[3]
|
566
|
+
# => 0.0
|
567
|
+
# entropy_hash[14].round(3)
|
568
|
+
# => 0.639
|
569
|
+
# # This example is the sample input of LANL Entropy-One
|
570
|
+
# # https://www.hiv.lanl.gov/content/sequence/ENTROPY/entropy_one.html?sample_input=1
|
571
|
+
|
572
|
+
def shannons_entropy(option = :nt)
  sequences = option == :aa ? self.aa_hash.values : self.dna_hash.values
  entropy_hash = {}

  sequences[0].size.times do |position|
    column = sequences.map { |seq| seq[position] }
    # stop codons do not take part in the calculation
    column.delete('*')
    column_size = column.size

    # sum of -p * ln(p) over the residues observed in this column
    entropy_hash[position + 1] = column.count_freq.inject(0) do |entropy, (_residue, n)|
      p = n/column_size.to_f
      entropy + (-p * ::Math.log(p))
    end
  end
  entropy_hash
end # end of shannons_entropy
|
596
|
+
|
597
|
+
# Function to calculate nucleotide diversity π, for nt sequence only
|
598
|
+
# @see https://en.wikipedia.org/wiki/Nucleotide_diversity Nucleotide Diversity (Wikipedia)
|
599
|
+
# @return [Float] nucleotide diversity π
|
600
|
+
# @example calculate π
|
601
|
+
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
602
|
+
# my_seqhash = ViralSeq::SeqHash.array(sequences)
|
603
|
+
# my_seqhash.pi
|
604
|
+
# => 0.16667
|
605
|
+
|
606
|
+
def nucleotide_pi
  # pi = (pairwise differences summed over all positions) /
  #      (pairwise comparisons summed over all positions), rounded to 5 digits.
  # Positions are taken from the length of the first sequence.
  sequences = self.dna_hash.values
  diver = 0 # pairwise differences
  com = 0   # pairwise comparisons

  sequences[0].size.times do |position|
    # Keep only unambiguous A/C/G/T calls. The former pattern
    # /[^A|^C|^G|^T]/ also let "|" and "^" through, and nil bases from
    # shorter sequences were counted as well.
    bases = sequences.map { |s| s[position] }.select { |b| %w[A C G T].include?(b) }
    next if bases.size == 1

    com += (bases.size)*(bases.size - 1)/2
    a = bases.count("A")
    c = bases.count("C")
    t = bases.count("T")
    g = bases.count("G")
    diver += a*c + a*t + a*g + c*t + c*g + t*g
  end

  return (diver/com.to_f).round(5)
end # end of #pi
|
634
|
+
|
635
|
+
alias_method :pi, :nucleotide_pi
|
636
|
+
|
637
|
+
# TN93 distance function, tabulate pairwise comparison of sequence pairs in a sequence alignment,
|
638
|
+
# nt sequence only
|
639
|
+
# @return [Hash] pairwise distance table in Hash object {:diff => :freq, ... }
|
640
|
+
# # Note: :diff in different positions (Integer), not percentage.
|
641
|
+
# @example calculate TN93 distribution
|
642
|
+
# sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
|
643
|
+
# my_seqhash = ViralSeq::SeqHash.array(sequences)
|
644
|
+
# my_seqhash.tn93
|
645
|
+
# => {0=>1, 1=>8, 2=>6}
|
646
|
+
|
647
|
+
def tn93
  copies = self.dna_hash.values.count_freq
  distances = []

  # pairs drawn from identical copies of one sequence differ at 0 positions
  copies.each_value do |n|
    (n * (n - 1) / 2).times { distances << 0 }
  end

  # each pair of distinct sequences counts once per combination of their copies
  copies.keys.combination(2) do |first, second|
    difference = first.compare_with(second)
    (copies[first] * copies[second]).times { distances << difference }
  end

  # table ordered by distance; missing distances read as 0
  table = Hash.new(0)
  distances.count_freq.sort_by { |distance, _n| distance }.each do |distance, n|
    table[distance] = n
  end
  table
end # end of #tn93
|
671
|
+
|
672
|
+
# quality check for HIV sequences based on ViralSeq::Sequence#locator, check if sequences are in the target range
|
673
|
+
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
674
|
+
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
675
|
+
# @param indel [Boolean] allow indels or not, `true` or `false`
|
676
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
677
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
678
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criterias
|
679
|
+
# @example QC for sequences in a FASTA files
|
680
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_seq.fasta')
|
681
|
+
# filtered_seqhash = my_seqhash.hiv_seq_qc([4384,4386], 4750..4752, false, :HXB2)
|
682
|
+
# my_seqhash.dna_hash.size
|
683
|
+
# => 6
|
684
|
+
# filtered_seqhash.dna_hash.size
|
685
|
+
# => 4
|
686
|
+
|
687
|
+
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  # A single position is widened to a one-element range so that #include?
  # works for Integer, Range and Array arguments alike.
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Group the tags by sequence so that every distinct sequence is located only
  # once. This replaces a nested scan that deleted from the hash while
  # iterating over it; the order of the resulting tags is unchanged.
  names_by_seq = Hash.new { |groups, seq| groups[seq] = [] }
  self.dna_hash.each { |seq_name, seq| names_by_seq[seq] << seq_name }

  seq_pass = []
  names_by_seq.each do |seq, seq_names|
    loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
    # loc[0] / loc[1] are compared with the allowed start / end positions
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    # loc[3] is only consulted when indels are not allowed; it must be exactly false
    seq_pass.concat(seq_names) if indel || loc[3] == false
  end
  self.sub(seq_pass)
end # end of #hiv_seq_qc
|
715
|
+
|
716
|
+
|
717
|
+
# Remove sequences with residual offspring Primer IDs.
|
718
|
+
# Compare PID with sequences which have identical sequences.
|
719
|
+
# PIDs that differ by 1 base will be recognized. If PID1 is x times (cutoff) greater than PID2, PID2 will be discarded.
|
720
|
+
# each sequence tag starting with ">" and the Primer ID sequence
|
721
|
+
# followed by the number of Primer ID appeared in the raw sequence
|
722
|
+
# the information sections in the tags are separated by underscore "_"
|
723
|
+
# example sequence tag: >AGGCGTAGA_32_sample1_RT
|
724
|
+
# @param cutoff [Integer] the fold cut-off to remove the potential residual offspring Primer IDs
|
725
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object without sequences containing residual offspring Primer ID
|
726
|
+
|
727
|
+
def filter_similar_pid(cutoff = 10)
  seq = self.dna_hash

  # For every distinct sequence collect the [pid, count] pairs parsed from
  # the names that carry it. Names look like ">PID_COUNT_...": the first
  # character is dropped and the first two "_"-separated fields are used.
  pid_by_seq = Hash.new { |hash, s| hash[s] = [] }
  seq.each do |name, s|
    pid, count = name[1..-1].split("_")
    pid_by_seq[s] << [pid, count]
  end

  # PIDs judged to be residual offspring; a Hash is used for O(1) lookup.
  dup_pid = {}
  pid_by_seq.each_value do |pid_counts|
    next if pid_counts.size == 1
    pid_hash = pid_counts.to_h
    pid_hash.keys.combination(2) do |pid1, pid2|
      # Only PIDs at most one difference apart are compared.
      next unless pid1.compare_with(pid2) <= 1
      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      # The less abundant PID is discarded when the other one is at least
      # `cutoff` times more abundant.
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  kept_names = seq.keys.reject do |name|
    dup_pid.key?(name.split("_")[0][1..-1])
  end
  self.sub(kept_names)
end # end of #filter_similar_pid
|
784
|
+
|
785
|
+
# Collapse sequences by a difference cut-off. Aligning the sequences before using this function is recommended.
|
786
|
+
# @param cutoff [Integer] nt base differences. collapse sequences within [cutoff] differences
|
787
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object of collapsed sequences
|
788
|
+
|
789
|
+
def collapse(cutoff=1)
  seq_freq = self.dna_hash.values.count_freq

  # Of every pair of distinct sequences within `cutoff` differences, the
  # less frequent one is marked for removal (the second one on a tie).
  removed = []
  unless seq_freq.size == 1
    seq_freq.keys.combination(2).each do |seq1, seq2|
      next if seq1.compare_with(seq2) > cutoff
      removed << (seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1)
    end
  end

  # Survivors are renamed ">seq_<running number>_<frequency>".
  seqhash = ViralSeq::SeqHash.new
  n = 0
  seq_freq.each do |seq, freq|
    next if removed.include?(seq)
    n += 1
    seqhash.dna_hash[">seq_" + n.to_s + '_' + freq.to_s] = seq
  end
  seqhash
end # end of #collapse
|
825
|
+
|
826
|
+
# Gap strip from a sequence alignment: all positions that contain gaps ('-') will be removed.
|
827
|
+
# @param option [Symbol] sequence options for `:nt` or `:aa`
|
828
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
829
|
+
# @example gap strip for an array of sequences
|
830
|
+
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
831
|
+
# array = { AACCGGTT
|
832
|
+
# A-CCGGTT
|
833
|
+
# AAC-GGTT
|
834
|
+
# AACCG-TT
|
835
|
+
# AACCGGT- }
|
836
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
837
|
+
# puts my_seqhash.gap_strip.dna_hash.values
|
838
|
+
# ACGT
|
839
|
+
# ACGT
|
840
|
+
# ACGT
|
841
|
+
# ACGT
|
842
|
+
# ACGT
|
843
|
+
|
844
|
+
def gap_strip(option = :nt)
  if option == :nt
    sequence_alignment = self.dna_hash
  elsif option == :aa
    sequence_alignment = self.aa_hash
  else
    raise "Option `#{option}` not recognized"
  end

  # The alignment length is taken from the first sequence; an empty
  # alignment simply has no columns (previously this raised on nil).
  sequences = sequence_alignment.values
  seq_size = sequences.empty? ? 0 : sequences[0].size

  # Positions where no sequence carries a gap character.
  kept_positions = (0...seq_size).reject do |p|
    sequences.any? { |s| s[p] == "-" }
  end

  new_seq = {}
  sequence_alignment.each do |n, s|
    new_seq[n] = kept_positions.each_with_object("".dup) { |p, str| str << s[p] }
  end

  new_seq_hash = ViralSeq::SeqHash.new
  # Only the selected hash is replaced; the other one is handed over as-is
  # (same object, not a copy).
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  elsif option == :aa
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  return new_seq_hash
end
|
885
|
+
|
886
|
+
# Gap strip from a sequence alignment at both ends: only the positions at the ends that contain gaps ('-') will be removed.
|
887
|
+
# @param (see #gap_strip)
|
888
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
889
|
+
# @example gap strip for an array of sequences only at the ends
|
890
|
+
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
891
|
+
# array = { AACCGGTT
|
892
|
+
# A-CCGGTT
|
893
|
+
# AAC-GGTT
|
894
|
+
# AACCG-TT
|
895
|
+
# AACCGGT- }
|
896
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
897
|
+
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
898
|
+
# AACCGGT
|
899
|
+
# A-CCGGT
|
900
|
+
# AAC-GGT
|
901
|
+
# AACCG-T
|
902
|
+
# AACCGGT
|
903
|
+
|
904
|
+
def gap_strip_ends(option = :nt)
  if option == :nt
    sequence_alignment = self.dna_hash
  elsif option == :aa
    sequence_alignment = self.aa_hash
  else
    raise "Option #{option} not recognized"
  end

  # The alignment length is taken from the first sequence; an empty
  # alignment simply has no columns (previously this raised on nil).
  sequences = sequence_alignment.values
  seq_size = sequences.empty? ? 0 : sequences[0].size

  # n1 / n2: number of consecutive gap-containing columns counted from the
  # left / right end. Only the ends are scanned, no full column matrix.
  n1 = (0...seq_size).take_while do |p|
    sequences.any? { |s| s[p] == "-" }
  end.size
  n2 = (0...seq_size).reverse_each.take_while do |p|
    sequences.any? { |s| s[p] == "-" }
  end.size

  new_seq = {}
  sequence_alignment.each do |n, s|
    new_seq[n] = s[n1..(- n2 - 1)]
  end

  new_seq_hash = ViralSeq::SeqHash.new
  # Only the selected hash is replaced; the other one is handed over as-is
  # (same object, not a copy).
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  elsif option == :aa
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  return new_seq_hash
end
|
957
|
+
|
958
|
+
|
959
|
+
|
960
|
+
|
961
|
+
|
962
|
+
# start of private functions
|
963
|
+
private
|
964
|
+
|
965
|
+
# APOBEC3G/F mutation position identification,
|
966
|
+
# APOBEC3G/F pattern: GRD -> ARD,
|
967
|
+
# control pattern: G[YN|RC] -> A[YN|RC],
|
968
|
+
# @param seq [String] nucleotide sequence; "-" characters are ignored
# @return [Array<Array<Integer>>] `[apobec_position, control_position]`,
#   each a list of 0-based indexes into the gap-free sequence
def apobec3gf(seq = '')
  # Work on a gap-free copy so the caller's string is never modified
  # (in-place `tr!` altered the argument and raised on frozen strings).
  seq = seq.delete("-")
  seq_length = seq.size
  apobec_position = []
  control_position = []
  (0..(seq_length - 3)).each do |n|
    tri_base = seq[n,3]
    # GRD: G, then A or G, then A, G or T. Inside a character class "|" is
    # a literal character, so it must not be used as a separator.
    if tri_base =~ /G[AG][AGT]/
      apobec_position << n
    elsif seq[n] == "G"
      # every other G that starts a trinucleotide counts as a control site
      control_position << n
    end
  end
  return [apobec_position,control_position]
end # end of #apobec3gf
|
983
|
+
|
984
|
+
# call consensus nucleotide, used by #consensus
|
985
|
+
# @param base_array [Array<String>] the distinct bases observed at one position
# @return [String] the single base itself, the ambiguity code for a known
#   combination of two or three of A/C/G/T, or "N" for anything else
def call_consensus_base(base_array)
  return base_array[0] if base_array.size == 1
  return "N" unless base_array.size.between?(2, 3)

  # Compare against a sorted copy; the caller's array is left untouched
  # (in-place `sort!` reordered it and raised on frozen arrays).
  case base_array.sort
  when ["A","T"] then "W"
  when ["C","G"] then "S"
  when ["A","C"] then "M"
  when ["G","T"] then "K"
  when ["A","G"] then "R"
  when ["C","T"] then "Y"
  when ["C","G","T"] then "B"
  when ["A","G","T"] then "D"
  when ["A","C","T"] then "H"
  when ["A","C","G"] then "V"
  else
    "N"
  end
end # end of #call_consensus_base
|
1022
|
+
|
1023
|
+
# Input sequence array. output Variant distribution for Poisson cut-off
|
1024
|
+
# @param seq [Array<String>] sequences; the length of the first one
#   determines how many positions are examined
# @return [Hash{Integer=>Integer}] for each observed variant count (number
#   of sequences differing from the most frequent character at a position)
#   the number of positions showing that count, ordered by variant count
def variant_for_poisson(seq)
  # No sequences means no positions to examine (previously raised on nil).
  return {} if seq.empty?

  seq_size = seq.size
  var = (0...seq[0].size).map do |pos|
    count_nt = seq.map { |s| s[pos] }.count_freq
    # sequences that do not carry the most frequent character at `pos`
    seq_size - count_nt.values.max
  end
  var_count = var.count_freq
  var_count.sort_by{|key,_value|key}.to_h
end # end of #variant_for_poisson
|
1040
|
+
|
1041
|
+
end # end of SeqHash
|
1042
|
+
|
1043
|
+
end # end of ViralSeq
|