viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,89 +1,67 @@
1
- # viral_seq/muscle.rb
2
- # wrapper for MUSCLE (http://www.drive5.com/muscle)
3
- # Including Methods as:
4
- # ViralSeq::check_muscle
5
- # ViralSeq::muscle_align
6
- # ViralSeq::muscle_align_multi
7
-
8
- # ViralSeq.check_muscle?(path_to_muscle)
9
- # # check if the path_to_muscle provided is valid,
10
- # # prompt error messages if MUSCLE is not found.
11
-
12
- # ViralSeq.muscle_align(reference_seq, test_sequence, path_to_muscle)
13
- # # takes a reference sequence and a test sequence as String object
14
- # # without specification on path_to_muscle, MuscleBio will be called to run Muscle
15
- # # specify path_to_muscle if other source of muscle needed
16
- # # returns aligned reference sequence and test sequences
17
-
18
- # ViralSeq.muscle_align_multi(sequence_hash, path_to_muscle)
19
- # # input a sequence_hash object {:name=>:sequence,...}
20
- # # without specification on path_to_muscle, MuscleBio will be called to run Muscle
21
- # # specify path_to_muscle if other source of muscle needed
22
- # # return aligned sequences an hash
23
1
 
24
2
  module ViralSeq
3
+ # alignment using MUSCLE alignment program
4
+ # @see http://www.drive5.com/muscle MUSCLE download link
25
5
 
26
- # check if path_to_muscle is correct
27
- def self.check_muscle?(path_to_muscle)
28
- begin
29
- `#{path_to_muscle} -version`
30
- return true
31
- rescue Errno::ENOENT
32
- puts "
33
- Error: MUSCLE is not found for at the provided {path_to_muscle}!!
34
- MUSLCE can be download at http://www.drive5.com/muscle
35
- Add MUSCLE excutable path to $PATH using
36
- $ export PATH=$PATH:/path/to/muscle
37
- or
38
- provide path_to_MUSCLE in the function arguments\n
39
- "
40
- return false
41
- end
42
- end
6
+ module Muscle
7
+ # check if path_to_muscle is correct, prompt error messages if MUSCLE is not found.
8
+ # @param path_to_muscle [String] path to muscle excutable
9
+ # @return [boolean]
43
10
 
44
- def self.muscle_align(ref_seq = "", test_seq = "", path_to_muscle = false)
45
- temp_dir=File.dirname($0)
46
- temp_file = temp_dir + "/_temp_muscle_in"
47
- temp_aln = temp_dir + "/_temp_muscle_aln"
48
- name = ">test"
49
- temp_in = File.open(temp_file,"w")
50
- temp_in.puts ">ref"
51
- temp_in.puts ref_seq
52
- temp_in.puts name
53
- temp_in.puts test_seq
54
- temp_in.close
55
- if path_to_muscle
56
- unless ViralSeq.check_muscle?(path_to_muscle)
57
- File.unlink(temp_file)
58
- return nil;
11
+ def self.check_muscle?(path_to_muscle)
12
+ begin
13
+ `#{path_to_muscle} -version`
14
+ return true
15
+ rescue Errno::ENOENT
16
+ puts "
17
+ Error: MUSCLE is not found for at the provided {path_to_muscle}!!
18
+ MUSLCE can be download at http://www.drive5.com/muscle
19
+ Add MUSCLE excutable path to $PATH using
20
+ $ export PATH=$PATH:/path/to/muscle
21
+ or
22
+ provide path_to_MUSCLE in the function arguments\n
23
+ "
24
+ return false
59
25
  end
60
- print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
61
- else
62
- MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
63
- end
64
- aln_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
65
- File.unlink(temp_file)
66
- File.unlink(temp_aln)
67
- return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
68
- end
26
+ end # end of .check_muscle?
69
27
 
70
- def self.muscle_align_multi(seq_hash = {}, path_to_muscle = false)
71
- temp_dir=File.dirname($0)
72
- temp_file = temp_dir + "/_temp_muscle_in"
73
- temp_aln = temp_dir + "/_temp_muscle_aln"
74
- File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
75
- if path_to_muscle
76
- unless ViralSeq.check_muscle?(path_to_muscle)
77
- File.unlink(temp_file)
78
- return nil
28
+ # align a sequence with reference sequence Strings
29
+ # @param ref_seq [String] reference sequence
30
+ # @param test_seq [String] test sequence
31
+ # @param path_to_muscle [String], path to MUSCLE excutable. if not provided (as default), it will use RubyGem::MuscleBio
32
+ # @return [Array] a pair of [:ref_seq_aligned, :test_seq_aligned] or nil
33
+ # if the cannot find MUSCLE excutable
34
+ # @example
35
+ # seq1 = 'AAGGCGTAGGAC'
36
+ # seq2 = 'AAGCTTAGGACG'
37
+ # aligned_seqs = ViralSeq::Muscle.align(seq1,seq2)
38
+ # => ["AAGGCGTAGGAC-", "-AAGCTTAGGACG"]
39
+
40
+ def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
+ temp_dir=File.dirname($0)
42
+ temp_file = temp_dir + "/_temp_muscle_in"
43
+ temp_aln = temp_dir + "/_temp_muscle_aln"
44
+ name = ">test"
45
+ temp_in = File.open(temp_file,"w")
46
+ temp_in.puts ">ref"
47
+ temp_in.puts ref_seq
48
+ temp_in.puts name
49
+ temp_in.puts test_seq
50
+ temp_in.close
51
+ if path_to_muscle
52
+ unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
53
+ File.unlink(temp_file)
54
+ return nil;
55
+ end
56
+ print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
57
+ else
58
+ MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
79
59
  end
80
- print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
81
- else
82
- MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
83
- end
84
- out_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
85
- File.unlink(temp_file)
86
- File.unlink(temp_aln)
87
- return out_seq_hash
88
- end
89
- end
60
+ aln_seq_hash = ViralSeq::SeqHash.fa(temp_aln).dna_hash
61
+ File.unlink(temp_file)
62
+ File.unlink(temp_aln)
63
+ return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
64
+ end # end of .align
65
+ end # end of ViralSeq::Muscle
66
+
67
+ end # end of ViralSeq
@@ -0,0 +1,26 @@
1
+
2
module ViralSeq
  # Helpers for working with Primer IDs (PID).
  module PID
    # Generate all Primer ID combinations for a given Primer ID length.
    # IDs are built from the nucleotides A, T, C and G, so a length of `l`
    # yields 4**l strings, ordered with A < T < C < G at every position.
    # A length of 0 yields the single empty ID (`[""]`) and a negative
    # length yields an empty pool.
    #
    # @param l [Integer] the length of the Primer ID
    # @return [Array<String>] every Primer ID of length `l`
    # @example generate a pool of Primer IDs with length of 10
    #   primer_id_pool = ViralSeq::PID.generate_pool(10) # 10 is the length of Primer ID
    #   puts primer_id_pool.size  #should be 4^10
    #   => 1048576
    def self.generate_pool(l = 8)
      %w[A T C G].repeated_permutation(l).map(&:join)
    end # end of .generate_pool
  end # end of PID
end # end of ViralSeq
@@ -0,0 +1,35 @@
1
+ # viral_seq main module
2
+ module ViralSeq
3
+
4
+ # HIV/SIV reference genome sequences, including HXB2, NL43, MAC239
5
+ # @see https://www.ncbi.nlm.nih.gov/nuccore/K03455 Reference sequence of HIV-1 HXB2 (Genbank accession number K03455)
6
+ # @see https://www.ncbi.nlm.nih.gov/nuccore/AF324493 Reference sequence of HIV-1 NL43 (Genbank accession number AF324493)
7
+ # @see https://www.ncbi.nlm.nih.gov/nucleotide/M33262 Reference sequence of SIV MAC239 (Genbank accession number M33262)
8
+ # @example retrieve the reference sequence for HIV NL43
9
+ # ViralSeq::RefSeq.get(:NL43)
10
+ # => "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA..."
11
+
12
+ module RefSeq
13
+
14
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
15
+ # @return [String] the reference sequence as a String object
16
+
17
+ def self.get(ref_option)
18
+ begin
19
+ case ref_option
20
+ when :HXB2
21
+ "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAG
AAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTA
TCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACG
AAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAG
CTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
22
+ when :NL43
23
+ "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAAATAAGGAGAGAAGAACAGCTTGTTACACCCTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGCATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGTGTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAA
AAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACAGTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAAGGGTGCCCACACTAATGATGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCATAATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATATGTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCA
TCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGGAATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAAAAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATGGGTTGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATATGGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTTGGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGATTAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTACTAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGAATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGCCTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCATACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATATGGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGA
AGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGATAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGGTTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCTGCACGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGA
TTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
24
+ when :MAC239
25
+ "GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAGGCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATACCCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCAGATTGGCGCCTGAACAGGGACTTGAAGGAGAGTGAGAGACTCCTGAGTACGGCTGAGTGAAGGCAGTAAGGGCGGCAGGAACCAACCACGACGGAGTGCTCCTATAAAGGCGCGGGTCGGTACCAGACGGCGTGAGGAGCGGGAGAGGAAGAGGCCTCCGGTTGCAGGTAAGTGCAACACAAAAAAGAAATAGCTGTCTTTTATCCAGGAAGGGGTAATAAGATAGAGTGGGAGATGGGCGTGAGAAACTCCGTCTTGTCAGGGAAGAAAGCAGATGAATTAGAAAAAATTAGGCTACGACCCAACGGAAAGAAAAAGTACATGTTGAAGCATGTAGTATGGGCAGCAAATGAATTAGATAGATTTGGATTAGCAGAAAGCCTGTTGGAGAACAAAGAAGGATGTCAAAAAATACTTTCGGTCTTAGCTCCATTAGTGCCAACAGGCTCAGAAAATTTAAAAAGCCTTTATAATACTGTCTGCGTCATCTGGTGCATTCACGCAGAAGAGAAAGTGAAACACACTGAGGAAGCAAAACAGATAGTGCAGAGACACCTAGTGGTGGAAACAGGAACAACAGAAACTATGCCAAAAACAAGTAGACCAACAGCACCATCTAGCGGCAGAGGAGGAAATTACCCAGTACAACAAATAGGTGGTAACTATGTCCACCTGCCATTAAGCCCGAGAACATTAAATGCCTGGGTAAAATTGATAGAGGAAAAGAAATTTGGAGCAGAAGTAGTGCCAGGATTTCAGGCACTGTCAGAAGGTTGCACCCCCTATGACATTAATCAGATGTTAAATTGTGTGGGAGACCATCAAGCGGCTATGCAGATTATCAGAGATATTATAAACGAGGAGGCTGCAGATTGGGACTTGCAGCACCCACAACCAGCTCCACAACAAGGACAACTTAGGGA
GCCGTCAGGATCAGATATTGCAGGAACAACTAGTTCAGTAGATGAACAAATCCAGTGGATGTACAGACAACAGAACCCCATACCAGTAGGCAACATTTACAGGAGATGGATCCAACTGGGGTTGCAAAAATGTGTCAGAATGTATAACCCAACAAACATTCTAGATGTAAAACAAGGGCCAAAAGAGCCATTTCAGAGCTATGTAGACAGGTTCTACAAAAGTTTAAGAGCAGAACAGACAGATGCAGCAGTAAAGAATTGGATGACTCAAACACTGCTGATTCAAAATGCTAACCCAGATTGCAAGCTAGTGCTGAAGGGGCTGGGTGTGAATCCCACCCTAGAAGAAATGCTGACGGCTTGTCAAGGAGTAGGGGGGCCGGGACAGAAGGCTAGATTAATGGCAGAAGCCCTGAAAGAGGCCCTCGCACCAGTGCCAATCCCTTTTGCAGCAGCCCAACAGAGGGGACCAAGAAAGCCAATTAAGTGTTGGAATTGTGGGAAAGAGGGACACTCTGCAAGGCAATGCAGAGCCCCAAGAAGACAGGGATGCTGGAAATGTGGAAAAATGGACCATGTTATGGCCAAATGCCCAGACAGACAGGCGGGTTTTTTAGGCCTTGGTCCATGGGGAAAGAAGCCCCGCAATTTCCCCATGGCTCAAGTGCATCAGGGGCTGATGCCAACTGCTCCCCCAGAGGACCCAGCTGTGGATCTGCTAAAGAACTACATGCAGTTGGGCAAGCAGCAGAGAGAAAAGCAGAGAGAAAGCAGAGAGAAGCCTTACAAGGAGGTGACAGAGGATTTGCTGCACCTCAATTCTCTCTTTGGAGGAGACCAGTAGTCACTGCTCATATTGAAGGACAGCCTGTAGAAGTATTACTGGATACAGGGGCTGATGATTCTATTGTAACAGGAATAGAGTTAGGTCCACATTATACCCCAAAAATAGTAGGAGGAATAGGAGGTTTTATTAATACTAAAGAATACAAAAATGTAGAAATAGAAGTTTTAGGCAAAAGGATTAAAGGGACAATCATGACAGGGGACACCCCGATTAACATTTTTGGTAGAAATTTGCTAACAGCTCTGGGGATGTCTCTAAATTTTCCCATAGCTAAAGTAGAGCCTGTAAAAGTCGCCTTAAAGCCAGGAAAGGATGGACCAAAATTGAAGCAGTGGCCATTATCAAAAGAAAAGATAGTTGCATTAAGAGAAATCTGTGAAAAGATGGAAAAGGATGGTCAGTTGGAGGAAGCTCCCCCGACCAATCCATACAACACCCCCACATTTGCTATAAAGAAAAAGGATAAGAACAAATGGAGAATGCTGATAGATTTTAGGGAACTAAATAGGGTCACTCAGGACTTTACGGAAGTCCAATTAGGAATACCACACCCTGCAGGACTAGCAAAAAGGAAAAGAATTACAGTACTGGATATAGGTGATGCATATTTCTCCATACCTCTAGATGAAGAATTTAGGCAGTACACTGCCTTTACTTTACCATCAGTAAATAATGCAGAGCCAGGAAAACGATACATTTATAAGGTTCTGCCTCAGGGATGGAAGGGGTCACCAGCCATCTTCCAATACACTATGAGACATGTGCTAGAACCCTTCAGGAAGGCAAATCCAGATGTGACCTTAGTCCAGTATATGGATGACATCTTAATAGCTAGTGACAGGACAGACCTGGAACATGACAGGGTAGTTTTACAGTCAAAGGAACTCTTGAATAGCATAGGGTTTTCTACCCCAGAAGAGAAATTCCAAAAAGATCCCCCATTTCAATGGATGGGGTACGAATTGTGGCCAACAAAATGGAAGTTGCAAAAGATAGAGTTGCCACAAAGAGAGACCTGGACAGTGAATGATATACAGAAGTTAGTAGGAGTATTAAATTGGGCAGCTCAAATTTATCCAGGTATAAAAACCAAACATCTCTGTAGGTTAATTAGAGGAAAAATGACTCTAACAGAGGAAGTTCAGTGGACTGA
GATGGCAGAAGCAGAATATGAGGAAAATAAAATAATTCTCAGTCAGGAACAAGAAGGATGTTATTACCAAGAAGGCAAGCCATTAGAAGCCACGGTAATAAAGAGTCAGGACAATCAGTGGTCTTATAAAATTCACCAAGAAGACAAAATACTGAAAGTAGGAAAATTTGCAAAGATAAAGAATACACATACCAATGGAGTGAGACTATTAGCACATGTAATACAGAAAATAGGAAAGGAAGCAATAGTGATCTGGGGACAGGTCCCAAAATTCCACTTACCAGTTGAGAAGGATGTATGGGAACAGTGGTGGACAGACTATTGGCAGGTAACCTGGATACCGGAATGGGATTTTATCTCAACACCACCGCTAGTAAGATTAGTCTTCAATCTAGTGAAGGACCCTATAGAGGGAGAAGAAACCTATTATACAGATGGATCATGTAATAAACAGTCAAAAGAAGGGAAAGCAGGATATATCACAGATAGGGGCAAAGACAAAGTAAAAGTGTTAGAACAGACTACTAATCAACAAGCAGAATTGGAAGCATTTCTCATGGCATTGACAGACTCAGGGCCAAAGGCAAATATTATAGTAGATTCACAATATGTTATGGGAATAATAACAGGATGCCCTACAGAATCAGAGAGCAGGCTAGTTAATCAAATAATAGAAGAAATGATTAAAAAGTCAGAAATTTATGTAGCATGGGTACCAGCACACAAAGGTATAGGAGGAAACCAAGAAATAGACCACCTAGTTAGTCAAGGGATTAGACAAGTTCTCTTCTTGGAAAAGATAGAGCCAGCACAAGAAGAACATGATAAATACCATAGTAATGTAAAAGAATTGGTATTCAAATTTGGATTACCCAGAATAGTGGCCAGACAGATAGTAGACACCTGTGATAAATGTCATCAGAAAGGAGAGGCTATACATGGGCAGGCAAATTCAGATCTAGGGACTTGGCAAATGGATTGTACCCATCTAGAGGGAAAAATAATCATAGTTGCAGTACATGTAGCTAGTGGATTCATAGAAGCAGAGGTAATTCCACAAGAGACAGGAAGACAGACAGCACTATTTCTGTTAAAATTGGCAGGCAGATGGCCTATTACACATCTACACACAGATAATGGTGCTAACTTTGCTTCGCAAGAAGTAAAGATGGTTGCATGGTGGGCAGGGATAGAGCACACCTTTGGGGTACCATACAATCCACAGAGTCAGGGAGTAGTGGAAGCAATGAATCACCACCTGAAAAATCAAATAGATAGAATCAGGGAACAAGCAAATTCAGTAGAAACCATAGTATTAATGGCAGTTCATTGCATGAATTTTAAAAGAAGGGGAGGAATAGGGGATATGACTCCAGCAGAAAGATTAATTAACATGATCACTACAGAACAAGAGATACAATTTCAACAATCAAAAAACTCAAAATTTAAAAATTTTCGGGTCTATTACAGAGAAGGCAGAGATCAACTGTGGAAGGGACCCGGTGAGCTATTGTGGAAAGGGGAAGGAGCAGTCATCTTAAAGGTAGGGACAGACATTAAGGTAGTACCCAGAAGAAAGGCTAAAATTATCAAAGATTATGGAGGAGGAAAAGAGGTGGATAGCAGTTCCCACATGGAGGATACCGGAGAGGCTAGAGAGGTGGCATAGCCTCATAAAATATCTGAAATATAAAACTAAAGATCTACAAAAGGTTTGCTATGTGCCCCATTTTAAGGTCGGATGGGCATGGTGGACCTGCAGCAGAGTAATCTTCCCACTACAGGAAGGAAGCCATTTAGAAGTACAAGGGTATTGGCATTTGACACCAGAAAAAGGGTGGCTCAGTACTTATGCAGTGAGGATAACCTGGTACTCAAAGAACTTTTGGACAGATGTAACACCAAACTATGCAGACATTTTACTGCATAGCACTTATTTCCCTTGCTTTACAGCGGGAGAAGTGAGAAGGGCCATCAGGGGAGAACAACTGCTGTCTTGC
TGCAGGTTCCCGAGAGCTCATAAGTACCAGGTACCAAGCCTACAGTACTTAGCACTGAAAGTAGTAAGCGATGTCAGATCCCAGGGAGAGAATCCCACCTGGAAACAGTGGAGAAGAGACAATAGGAGAGGCCTTCGAATGGCTAAACAGAACAGTAGAGGAGATAAACAGAGAGGCGGTAAACCACCTACCAAGGGAGCTAATTTTCCAGGTTTGGCAAAGGTCTTGGGAATACTGGCATGATGAACAAGGGATGTCACCAAGCTATGTAAAATACAGATACTTGTGTTTAATACAAAAGGCTTTATTTATGCATTGCAAGAAAGGCTGTAGATGTCTAGGGGAAGGACATGGGGCAGGGGGATGGAGACCAGGACCTCCTCCTCCTCCCCCTCCAGGACTAGCATAAATGGAAGAAAGACCTCCAGAAAATGAAGGACCACAAAGGGAACCATGGGATGAATGGGTAGTGGAGGTTCTGGAAGAACTGAAAGAAGAAGCTTTAAAACATTTTGATCCTCGCTTGCTAACTGCACTTGGTAATCATATCTATAATAGACATGGAGACACCCTTGAGGGAGCAGGAGAACTCATTAGAATCCTCCAACGAGCGCTCTTCATGCATTTCAGAGGCGGATGCATCCACTCCAGAATCGGCCAACCTGGGGGAGGAAATCCTCTCTCAGCTATACCGCCCTCTAGAAGCATGCTATAACACATGCTATTGTAAAAAGTGTTGCTACCATTGCCAGTTTTGTTTTCTTAAAAAAGGCTTGGGGATATGTTATGAGCAATCACGAAAGAGAAGAAGAACTCCGAAAAAGGCTAAGGCTAATACATCTTCTGCATCAAACAAGTAAGTATGGGATGTCTTGGGAATCAGCTGCTTATCGCCATCTTGCTTTTAAGTGTCTATGGGATCTATTGTACTCTATATGTCACAGTCTTTTATGGTGTACCAGCTTGGAGGAATGCGACAATTCCCCTCTTTTGTGCAACCAAGAATAGGGATACTTGGGGAACAACTCAGTGCCTACCAGATAATGGTGATTATTCAGAAGTGGCCCTTAATGTTACAGAAAGCTTTGATGCCTGGAATAATACAGTCACAGAACAGGCAATAGAGGATGTATGGCAACTCTTTGAGACCTCAATAAAGCCTTGTGTAAAATTATCCCCATTATGCATTACTATGAGATGCAATAAAAGTGAGACAGATAGATGGGGATTGACAAAATCAATAACAACAACAGCATCAACAACATCAACGACAGCATCAGCAAAAGTAGACATGGTCAATGAGACTAGTTCTTGTATAGCCCAGGATAATTGCACAGGCTTGGAACAAGAGCAAATGATAAGCTGTAAATTCAACATGACAGGGTTAAAAAGAGACAAGAAAAAAGAGTACAATGAAACTTGGTACTCTGCAGATTTGGTATGTGAACAAGGGAATAACACTGGTAATGAAAGTAGATGTTACATGAACCACTGTAACACTTCTGTTATCCAAGAGTCTTGTGACAAACATTATTGGGATGCTATTAGATTTAGGTATTGTGCACCTCCAGGTTATGCTTTGCTTAGATGTAATGACACAAATTATTCAGGCTTTATGCCTAAATGTTCTAAGGTGGTGGTCTCTTCATGCACAAGGATGATGGAGACACAGACTTCTACTTGGTTTGGCTTTAATGGAACTAGAGCAGAAAATAGAACTTATATTTACTGGCATGGTAGGGATAATAGGACTATAATTAGTTTAAATAAGTATTATAATCTAACAATGAAATGTAGAAGACCAGGAAATAAGACAGTTTTACCAGTCACCATTATGTCTGGATTGGTTTTCCACTCACAACCAATCAATGATAGGCCAAAGCAGGCATGGTGTTGGTTTGGAGGAAAATGGAAGGATGCAATAAAAGAGGTGAAGCAGACCATTGTCAAACATCCCAGGTATACTGGAACTAACAATACTGATAAAATCAATTTGACGG
CTCCTGGAGGAGGAGATCCGGAAGTTACCTTCATGTGGACAAATTGCAGAGGAGAGTTCCTCTACTGTAAAATGAATTGGTTTCTAAATTGGGTAGAAGATAGGAATACAGCTAACCAGAAGCCAAAGGAACAGCATAAAAGGAATTACGTGCCATGTCATATTAGACAAATAATCAACACTTGGCATAAAGTAGGCAAAAATGTTTATTTGCCTCCAAGAGAGGGAGACCTCACGTGTAACTCCACAGTGACCAGTCTCATAGCAAACATAGATTGGATTGATGGAAACCAAACTAATATCACCATGAGTGCAGAGGTGGCAGAACTGTATCGATTGGAATTGGGAGATTATAAATTAGTAGAGATCACTCCAATTGGCTTGGCCCCCACAGATGTGAAGAGGTACACTACTGGTGGCACCTCAAGAAATAAAAGAGGGGTCTTTGTGCTAGGGTTCTTGGGTTTTCTCGCAACGGCAGGTTCTGCAATGGGCGCGGCGTCGTTGACGCTGACCGCTCAGTCCCGAACTTTATTGGCTGGGATAGTGCAGCAACAGCAACAGCTGTTGGACGTGGTCAAGAGACAACAAGAATTGTTGCGACTGACCGTCTGGGGAACAAAGAACCTCCAGACTAGGGTCACTGCCATCGAGAAGTACTTAAAGGACCAGGCGCAGCTGAATGCTTGGGGATGTGCGTTTAGACAAGTCTGCCACACTACTGTACCATGGCCAAATGCAAGTCTAACACCAAAGTGGAACAATGAGACTTGGCAAGAGTGGGAGCGAAAGGTTGACTTCTTGGAAGAAAATATAACAGCCCTCCTAGAGGAGGCACAAATTCAACAAGAGAAGAACATGTATGAATTACAAAAGTTGAATAGCTGGGATGTGTTTGGCAATTGGTTTGACCTTGCTTCTTGGATAAAGTATATACAATATGGAGTTTATATAGTTGTAGGAGTAATACTGTTAAGAATAGTGATCTATATAGTACAAATGCTAGCTAAGTTAAGGCAGGGGTATAGGCCAGTGTTCTCTTCCCCACCCTCTTATTTCCAGCAGACCCATATCCAACAGGACCCGGCACTGCCAACCAGAGAAGGCAAAGAAAGAGACGGTGGAGAAGGCGGTGGCAACAGCTCCTGGCCTTGGCAGATAGAATATATTCATTTCCTGATCCGCCAACTGATACGCCTCTTGACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGCGACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGGTCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGAAGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGGGACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGATATAGATGAGTAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGCAATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACAC
TTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCA"
26
+ else
27
+ raise StandardError.new("reference sequence not recognized, choose from :HXB2 (default), :NL43, or :MAC239.")
28
+ end
29
+ rescue StandardError => e
30
+ puts e.message
31
+ return nil
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,172 @@
1
+
2
module ViralSeq
  # Fisher's Exact Test Function Library
  #
  # Based on JavaScript version created by: Oyvind Langsrud,
  # Ported to Ruby by Bryan Donovan

  module Rubystats
    # Fisher's exact test on a 2x2 table.
    # Use #calculate as the entry point; the other methods are the building
    # blocks it relies on and share state through instance variables.
    class FishersExactTest
      # Signed leading terms of the series used by #lngamm, as
      # [coefficient, offset] pairs; each adds coefficient / (z + offset).
      LANCZOS_TERMS = [
        [0.0000001659470187408462, 7],
        [0.000009934937113930748, 6],
        [-0.1385710331296526, 5],
        [12.50734324009056, 4],
        [-176.6150291498386, 3],
        [771.3234287757674, 2],
        [-1259.139216722289, 1]
      ].freeze

      def initialize
        # parameters and value of the most recent hypergeometric evaluation
        @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0
        # tail sums written by #exact
        @sleft = @sright = @sless = @slarg = 0.0
        # initialised for parity with the original port; no method here reads them
        @left = @right = @twotail = 0.0
      end

      # @see http://lib.stat.cmu.edu/apstat/245 Reference: "Lanczos, C. 'A precision approximation of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964." Translation of Alan Miller's FORTRAN-implementation.
      # @param z [Numeric]
      # @return [Float] the log-gamma approximation used by #lnfact
      def lngamm(z)
        series = 0
        LANCZOS_TERMS.each { |coefficient, offset| series += coefficient / (z + offset) }
        series += 676.5203681218835 / (z)
        series += 0.9999999999995183

        ::Math.log(series) - 5.58106146679532777 - z + (z - 0.5) * ::Math.log(z + 6.5)
      end

      # @return [Numeric] 0 for n <= 1, otherwise lngamm(n + 1)
      def lnfact(n)
        n <= 1 ? 0 : lngamm(n + 1)
      end

      # @return [Numeric] lnfact(n) - lnfact(k) - lnfact(n - k)
      def lnbico(n, k)
        lnfact(n) - lnfact(k) - lnfact(n - k)
      end

      # Hypergeometric probability evaluated directly from the log terms.
      def hyper_323(n11, n1_, n_1, n)
        ::Math.exp(lnbico(n1_, n11) + lnbico(n - n1_, n_1 - n11) - lnbico(n, n_1))
      end

      # Probability for n11 using the margins remembered from the last full #hyper0 call.
      def hyper(n11)
        hyper0(n11, 0, 0, 0)
      end

      # With all three margins zero the stored margins are reused; otherwise
      # the given margins are stored first. Either way @sprob is updated and returned.
      def hyper0(n11i, n1_i, n_1i, ni)
        if n1_i == 0 && n_1i == 0 && ni == 0
          # One step away from the previous n11: update the stored probability
          # by a ratio instead of recomputing it. Multiples of 10 always take
          # the direct computation below.
          if n11i % 10 != 0
            if n11i == @sn11 + 1
              @sprob *= ((@sn1_ - @sn11) / (n11i.to_f)) * ((@sn_1 - @sn11) / (n11i.to_f + @sn - @sn1_ - @sn_1))
              @sn11 = n11i
              return @sprob
            end
            if n11i == @sn11 - 1
              @sprob *= ((@sn11) / (@sn1_ - n11i.to_f)) * ((@sn11 + @sn - @sn1_ - @sn_1) / (@sn_1 - n11i.to_f))
              @sn11 = n11i
              return @sprob
            end
          end
          @sn11 = n11i
        else
          @sn11 = n11i
          @sn1_ = n1_i
          @sn_1 = n_1i
          @sn = ni
        end
        @sprob = hyper_323(@sn11, @sn1_, @sn_1, @sn)
        @sprob
      end

      # Fills @sleft, @sright, @sless and @slarg for the table described by
      # n11 and its margins.
      # @return [Numeric] probability of the observed table (1 when only one table is possible)
      def exact(n11, n1_, n_1, n)
        max = n_1 < n1_ ? n_1 : n1_
        min = n1_ + n_1 - n
        min = 0 if min < 0

        if min == max
          @sless = 1
          @sright = 1
          @sleft = 1
          @slarg = 1
          return 1
        end

        prob = hyper0(n11, n1_, n_1, n)

        # walk up from the smallest n11 while the table is less likely than the observed one
        @sleft = 0
        tail = hyper(min)
        i = min + 1
        while tail < (0.99999999 * prob)
          @sleft += tail
          tail = hyper(i)
          i += 1
        end
        i -= 1
        if tail < (1.00000001 * prob)
          @sleft += tail
        else
          i -= 1
        end

        # same walk, downwards from the largest n11
        @sright = 0
        tail = hyper(max)
        j = max - 1
        while tail < (0.99999999 * prob)
          @sright += tail
          tail = hyper(j)
          j -= 1
        end
        j += 1
        if tail < (1.00000001 * prob)
          @sright += tail
        else
          j += 1
        end

        if (i - n11).abs < (j - n11).abs
          @sless = @sleft
          @slarg = 1 - @sleft + prob
        else
          @sless = 1 - @sright + prob
          @slarg = @sright
        end
        prob
      end

      # @param n11_ [Integer] cell counts of the 2x2 table; negative values are negated
      # @param n12_ [Integer]
      # @param n21_ [Integer]
      # @param n22_ [Integer]
      # @return [Hash] `:left`, `:right` and `:twotail` values (two-tail capped at 1)
      def calculate(n11_, n12_, n21_, n22_)
        n11_, n12_, n21_, n22_ = [n11_, n12_, n21_, n22_].map { |cell| cell < 0 ? cell * -1 : cell }
        row_total = n11_ + n12_
        column_total = n11_ + n21_
        grand_total = n11_ + n12_ + n21_ + n22_
        exact(n11_, row_total, column_total, grand_total)

        both_tails = @sleft + @sright
        both_tails = 1 if both_tails > 1
        { :left => @sless, :right => @slarg, :twotail => both_tails }
      end
    end
  end
end
@@ -0,0 +1,1043 @@
1
+
2
+ module ViralSeq
3
+
4
+ # ViralSeq::SeqHash class for operation on multiple sequences.
5
+ # @example read a FASTA sequence file of HIV PR sequences, make alignment, perform the QC location check, filter sequences with stop codons and APOBEC3g/f hypermutations, calculate pairwise diversity, calculate minority cut-off based on Poisson model, and examine for drug resistance mutations.
6
+ # my_pr_seqhash = ViralSeq::SeqHash.fa('my_pr_fasta_file.fasta')
7
+ # # new ViralSeq::SeqHash object from a FASTA file
8
+ # aligned_pr_seqhash = my_pr_seqhash.align
9
+ # # align with MUSCLE
10
+ # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
+ # # filter nt sequences with the reference coordinates
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
13
+ # # return a new ViralSeq::SeqHash object without stop codons
14
+ # filtered_seqhash = filtered_seqhash.a3g[1]
15
+ # # further filter out sequences with A3G hypermutations
16
+ # filtered_seqhash.pi
17
+ # # return pairwise diversity π
18
+ # cut_off = filtered_seqhash.pm
19
+ # # return cut-off for minority variants based on Poisson model
20
+ # filtered_seqhash.sdrm_hiv_pr(cut_off)
21
+ # # examine for drug resistance mutations for PR region.
22
+
23
+ class SeqHash
24
+ # initialize a ViralSeq::SeqHash object
25
+ def initialize (dna_hash = {}, aa_hash = {}, qc_hash = {}, title = "", file = "")
26
+ @dna_hash = dna_hash
27
+ @aa_hash = aa_hash
28
+ @qc_hash = qc_hash
29
+ @title = title
30
+ @file = file
31
+ end
32
+
33
+ # @return [Hash] Hash object for :name => :sequence_string pairs
34
attr_accessor :dna_hash

# @return [Hash] amino acid sequences, :name => :amino_acid_sequence_string
attr_accessor :aa_hash

# @return [Hash] quality strings, :name => :qc_score_string
attr_accessor :qc_hash

# @return [String] title of the SeqHash object;
#   the ::fa and ::fq constructors set it to the basename of the input file
attr_accessor :title

# @return [String] path of the file the SeqHash object was built from, if any
attr_accessor :file
48
+
49
+ # initialize a new ViralSeq::SeqHash object from a FASTA format sequence file
50
+ # @param infile [String] path to the FASTA format sequence file
51
+ # @return [ViralSeq::SeqHash]
52
+ # @example new ViralSeq::SeqHash object from a FASTA file
53
+ # ViralSeq::SeqHash.fa('my_fasta_file.fasta')
54
+
55
+ def self.new_from_fasta(infile)
56
+ f=File.open(infile,"r")
57
+ return_hash = {}
58
+ name = ""
59
+ while line = f.gets do
60
+ line.tr!("\u0000","")
61
+ next if line == "\n"
62
+ next if line =~ /^\=/
63
+ if line =~ /^\>/
64
+ name = line.chomp
65
+ return_hash[name] = ""
66
+ else
67
+ return_hash[name] += line.chomp.upcase
68
+ end
69
+ end
70
+ f.close
71
+ seq_hash = ViralSeq::SeqHash.new
72
+ seq_hash.dna_hash = return_hash
73
+ seq_hash.title = File.basename(infile,".*")
74
+ seq_hash.file = infile
75
+ return seq_hash
76
+ end # end of ::new_from_fasta
77
+
78
+ # initialize a new ViralSeq::SeqHash object from a FASTA format sequence file of amino acid sequences
79
+ # @param infile [String] path to the FASTA format sequence file of aa sequences
80
+ # @return [ViralSeq::SeqHash]
81
+
82
+ def self.new_from_aa_fasta(infile)
83
+ f=File.open(infile,"r")
84
+ return_hash = {}
85
+ name = ""
86
+ while line = f.gets do
87
+ line.tr!("\u0000","")
88
+ next if line == "\n"
89
+ next if line =~ /^\=/
90
+ if line =~ /^\>/
91
+ name = line.chomp
92
+ return_hash[name] = ""
93
+ else
94
+ return_hash[name] += line.chomp.upcase
95
+ end
96
+ end
97
+ f.close
98
+ seq_hash = ViralSeq::SeqHash.new
99
+ seq_hash.aa_hash = return_hash
100
+ seq_hash.title = File.basename(infile,".*")
101
+ seq_hash.file = infile
102
+ return seq_hash
103
+ end # end of ::new_from_fasta
104
+
105
+ # initialize a new ViralSeq::SeqHash object from a FASTQ format sequence file
106
+ # @param fastq_file [String] path to the FASTQ format sequence file
107
+ # @return [ViralSeq::SeqHash]
108
+ # @example new ViralSeq::SeqHash object from a FASTQ file
109
+ # ViralSeq::SeqHash.fq('my_fastq_file.fastq')
110
+
111
+ def self.new_from_fastq(fastq_file)
112
+ count = 0
113
+ sequence_a = []
114
+ quality_a = []
115
+ count_seq = 0
116
+
117
+ File.open(fastq_file,'r') do |file|
118
+ file.readlines.collect do |line|
119
+ count +=1
120
+ count_m = count % 4
121
+ if count_m == 1
122
+ line.tr!('@','>')
123
+ sequence_a << line.chomp
124
+ quality_a << line.chomp
125
+ count_seq += 1
126
+ elsif count_m == 2
127
+ sequence_a << line.chomp
128
+ elsif count_m == 0
129
+ quality_a << line.chomp
130
+ end
131
+ end
132
+ end
133
+ sequence_hash = Hash[*sequence_a]
134
+ quality_hash = Hash[*quality_a]
135
+
136
+ seq_hash = ViralSeq::SeqHash.new
137
+ seq_hash.dna_hash = sequence_hash
138
+ seq_hash.qc_hash = quality_hash
139
+ seq_hash.title = File.basename(fastq_file,".*")
140
+ seq_hash.file = fastq_file
141
+ return seq_hash
142
+ end # end of ::new_from_fastq
143
+
144
+ # initialize a ViralSeq::SeqHash object with an array of sequence strings
145
+ # @param master_tag [String] master tag to put in the sequence names
146
+ # @return [ViralSeq::SeqHash] No @qc_hash, @title will be the master_tag
147
+
148
+ def self.new_from_array(seq_array,master_tag = 'seq')
149
+ n = 1
150
+ hash = {}
151
+ seq_array.each do |seq|
152
+ hash[master_tag + "_" + n.to_s] = seq
153
+ n += 1
154
+ end
155
+ seq_hash = ViralSeq::SeqHash.new
156
+ seq_hash.dna_hash = hash
157
+ seq_hash.title = master_tag
158
+ return seq_hash
159
+ end # end of ::new_from_array
160
+
161
+
162
+ class << self
163
+ alias_method :fa, :new_from_fasta
164
+ alias_method :fq, :new_from_fastq
165
+ alias_method :aa_fa, :new_from_aa_fasta
166
+ alias_method :array, :new_from_array
167
+ end
168
+
169
+ # generate sequences in relaxed sequencial phylip format from a ViralSeq::SeqHash object
170
+ # @return [String] relaxed sequencial phylip format in a String object
171
+ # @example convert fasta format to relaxed sequencial phylip format
172
+ # # my_fasta_file.fasta
173
+ # # >seq1
174
+ # # ATAAGAACG
175
+ # # >seq2
176
+ # # ATATGAACG
177
+ # # >seq3
178
+ # # ATGAGAACG
179
+ # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
180
+ # puts my_seqhash.to_rsphylip
181
+ # # 3 9
182
+ # # seq1 ATAAGAACG
183
+ # # seq2 ATATGAACG
184
+ # # seq3 ATGAGAACG
185
+
186
+ def to_rsphylip
187
+ seqs = self.dna_hash
188
+ outline = "\s" + seqs.size.to_s + "\s" + seqs.values[0].size.to_s + "\n"
189
+ names = seqs.keys
190
+ names.collect!{|n| n.tr(">", "")}
191
+ max_name_l = names.max.size
192
+ max_name_l > 10 ? name_block_l = max_name_l : name_block_l = 10
193
+ seqs.each do |k,v|
194
+ outline += k + "\s" * (name_block_l - k.size + 2) + v.scan(/.{1,10}/).join("\s") + "\n"
195
+ end
196
+ return outline
197
+ end # end of #to_rsphylip
198
+
199
+ # translate the DNA sequences in @dna_hash to amino acid sequences. generate value for @aa_hash
200
+ # @param codon_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
201
+ # @return [NilClass]
202
+ # @example translate dna sequences from a FASTA format sequence file
203
+ # # my_fasta_file.fasta
204
+ # # >seq1
205
+ # # ATAAGAACG
206
+ # # >seq2
207
+ # # ATATGAACG
208
+ # # >seq3
209
+ # # ATGAGAACG
210
+ # my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
211
+ # my_seqhash.translate
212
+ # my_seqhash.aa_hash
213
+ # => {">seq1"=>"IRT", ">seq2"=>"I*T", ">seq3"=>"MRT"}
214
+
215
+ def translate(codon_position = 0)
216
+ seqs = self.dna_hash
217
+ @aa_hash = {}
218
+ seqs.each do |name, seq|
219
+ s = ViralSeq::Sequence.new(name, seq)
220
+ s.translate(codon_position)
221
+ @aa_hash[name] = s.aa_string
222
+ end
223
+ return nil
224
+ end # end of #translate
225
+
226
+ # collapse @dna_hash to unique sequence hash.
227
+ # @param tag # the master tag for unique sequences,
228
+ # sequences will be named as (tag + "_" + order(Integer) + "_" + counts(Integer))
229
+ # @return [ViralSeq::SeqHash] new SeqHash object of unique sequence hash
230
+ # @example
231
+ # dna_hash = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA', '>seq4' => 'CCCC', '>seq5' => 'CCCC', '>seq6' => 'TTTT'}
232
+ # a_seq_hash = ViralSeq::SeqHash.new
233
+ # a_seq_hash.dna_hash = dna_hash
234
+ # uniq_sequence = a_seq_hash.uniq_dna_hash('master')
235
+ # => {">master_1_3"=>"AAAA", ">master_2_2"=>"CCCC", ">master_3_1"=>"TTTT"}
236
+
237
+ def uniq_dna_hash(tag = "sequence")
238
+ seqs = self.dna_hash
239
+ uni = seqs.values.count_freq
240
+ new_seq = {}
241
+ n = 1
242
+ uni.each do |s,c|
243
+ name = ">" + tag + "_" + n.to_s + "_" + c.to_s
244
+ new_seq[name] = s
245
+ n += 1
246
+ end
247
+ seq_hash = ViralSeq::SeqHash.new(new_seq)
248
+ seq_hash.title = self.title + "_uniq"
249
+ seq_hash.file = self.file
250
+ return seq_hash
251
+ end # end of #uniq_dna_hash
252
+
253
+ alias_method :uniq, :uniq_dna_hash
254
+
255
+ # given an Array of sequence tags, return a sub ViralSeq::SeqHash object with the sequence tags
256
+ # @param keys [Array] array of sequence tags
257
+ # @return [SeqHash] new SeqHash object with sequences of the input keys
258
+
259
+ def sub(keys)
260
+ h1 = {}
261
+ h2 = {}
262
+ h3 = {}
263
+
264
+ keys.each do |k|
265
+ dna = self.dna_hash[k]
266
+ next unless dna
267
+ h1[k] = dna
268
+ aa = self.aa_hash[k]
269
+ h2[k] = aa
270
+ qc = self.qc_hash[k]
271
+ h3[k] = qc
272
+ end
273
+ title = self.title
274
+ file = self.file
275
+ ViralSeq::SeqHash.new(h1,h2,h3,title,file)
276
+ end
277
+
278
+ # screen for sequences with stop codons.
279
+ # @param (see #translate)
280
+ # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
281
+ #
282
+ # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
283
+ # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
284
+ # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
285
+ # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
286
+ # my_seqhash.dna_hash
287
+ # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
288
+ # stop_codon_seqhash = my_seqhash.stop_codon[0]
289
+ # stop_codon_seqhash.dna_hash
290
+ # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
291
+ # stop_codon_seqhash.aa_hash
292
+ # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
293
+ # stop_codon_seqhash.title
294
+ # => "my_fasta_file_stop"
295
+ # filtered_seqhash = my_seqhash.stop_codon[1]
296
+ # filtered_seqhash.aa_hash
297
+ # {">seq1"=>"IRT", ">seq3"=>"MRT"}
298
+
299
+ def stop_codon(codon_position = 0)
300
+ self.translate(codon_position)
301
+ keys = []
302
+ self.aa_hash.each do |k,v|
303
+ keys << k if v.include?('*')
304
+ end
305
+ seqhash1 = self.sub(keys)
306
+ seqhash1.title = self.title + "_stop"
307
+ keys2 = self.aa_hash.keys - keys
308
+ seqhash2 = self.sub(keys2)
309
+ return [seqhash1, seqhash2]
310
+ end #end of #stop_codon
311
+
312
+
313
+ # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
314
+ # @param cutoff [Float] majority cut-off for calling consensus bases. Defaults to simple majority (0.5). A position with 15% "A" and 85% "G" will be called as "G" with a 20% cut-off and as "R" with a 10% cut-off.
315
+ # @return [String] consensus sequence
316
+ # @example consensus sequence from an array of sequences.
317
+ # seq_array = %w{ ATTTTTTTTT
318
+ # AATTTTTTTT
319
+ # AAATTTTTTT
320
+ # AAAATTTTTT
321
+ # AAAAATTTTT
322
+ # AAAAAATTTT
323
+ # AAAAAAATTT
324
+ # AAAAAAAATT
325
+ # AAAAAAAAAT
326
+ # AAAAAAAAAA }
327
+ # my_seqhash = ViralSeq::SeqHash.array(seq_array)
328
+ # my_seqhash.consensus
329
+ # => 'AAAAAWTTTT'
330
+ # my_seqhash.consensus(0.7)
331
+ # => 'AAAANNNTTT'
332
+
333
+ def consensus(cutoff = 0.5)
334
+ seq_array = self.dna_hash.values
335
+ seq_length = seq_array[0].size
336
+ seq_size = seq_array.size
337
+ consensus_seq = ""
338
+ (0..(seq_length - 1)).each do |position|
339
+ all_base = []
340
+ seq_array.each do |seq|
341
+ all_base << seq[position]
342
+ end
343
+ base_count = all_base.count_freq
344
+ max_base_list = []
345
+
346
+ base_count.each do |k,v|
347
+ if v/seq_size.to_f >= cutoff
348
+ max_base_list << k
349
+ end
350
+ end
351
+ consensus_seq += call_consensus_base(max_base_list)
352
+ end
353
+ return consensus_seq
354
+ end #end of #consensus
355
+
356
+ # function to determine if the sequences have APOBEC3g/f hypermutation.
357
+ # # APOBEC3G/F pattern: GRD -> ARD
358
+ # # control pattern: G[YN|RC] -> A[YN|RC]
359
+ # # use the sample consensus to determine potential a3g sites
360
+ # # Two criteria to identify hypermutation
361
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
362
+ # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
363
+ # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
364
+ # # b/c Poisson model does not do well on small sample size.
365
+ # @return [Array] three values.
366
+ # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
367
+ # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
368
+ # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
369
+ # # sequence tag
370
+ # # G to A mutation numbers at potential a3g positions
371
+ # # total potential a3g G positions
372
+ # # G to A mutation numbers at non a3g positions
373
+ # # total non a3g G positions
374
+ # # a3g G to A mutation rate / non-a3g G to A mutation rate
375
+ # # Fishers Exact P-value
376
+ # @example identify apobec3gf mutations from a sequence fasta file
377
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
378
+ # hypermut = my_seqhash.a3g
379
+ # hypermut[0].dna_hash.keys
380
+ # => [">Seq7", ">Seq14"]
381
+ # hypermut[1].dna_hash.keys
382
+ # => [">Seq1", ">Seq2", ">Seq5"]
383
+ # hypermut[2]
384
+ # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
385
+ #
386
+ # @example identify apobec3gf mutations from another sequence fasta file
387
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
388
+ # hypermut = my_seqhash.a3g
389
+ # hypermut[2]
390
+ # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
391
+ # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
392
+ # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
393
+ # @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
394
+
395
+ def a3g_hypermut
396
+ # mut_hash number of apobec3g/f mutations per sequence
397
+ mut_hash = {}
398
+ hm_hash = {}
399
+ out_hash = {}
400
+
401
+ # total G->A mutations at apobec3g/f positions.
402
+ total = 0
403
+
404
+ # make consensus sequence for the input sequence hash
405
+ ref = self.consensus
406
+
407
+ # obtain apobec3g positions and control positions
408
+ apobec = apobec3gf(ref)
409
+ mut = apobec[0]
410
+ control = apobec[1]
411
+
412
+ self.dna_hash.each do |k,v|
413
+ a = 0 # muts
414
+ b = 0 # potential mut sites
415
+ c = 0 # control muts
416
+ d = 0 # potenrial controls
417
+ mut.each do |n|
418
+ next if v[n] == "-"
419
+ if v[n] == "A"
420
+ a += 1
421
+ b += 1
422
+ else
423
+ b += 1
424
+ end
425
+ end
426
+ mut_hash[k] = a
427
+ total += a
428
+
429
+ control.each do |n|
430
+ next if v[n] == "-"
431
+ if v[n] == "A"
432
+ c += 1
433
+ d += 1
434
+ else
435
+ d += 1
436
+ end
437
+ end
438
+ rr = (a/b.to_f)/(c/d.to_f)
439
+
440
+ t1 = b - a
441
+ t2 = d - c
442
+
443
+ fet = ViralSeq::Rubystats::FishersExactTest.new
444
+ fisher = fet.calculate(t1,t2,a,c)
445
+ perc = fisher[:twotail]
446
+ info = [k, a, b, c, d, rr.round(2), perc]
447
+ out_hash[k] = info
448
+ if perc < 0.05
449
+ hm_hash[k] = info
450
+ end
451
+ end
452
+
453
+ if self.dna_hash.size > 20
454
+ rate = total.to_f/(self.dna_hash.size)
455
+ count_mut = mut_hash.values.count_freq
456
+ maxi_count = count_mut.values.max
457
+ poisson_hash = ViralSeq::Math::PoissonDist.new(rate,maxi_count).poisson_hash
458
+ cut_off = 0
459
+ poisson_hash.each do |k,v|
460
+ cal = self.dna_hash.size * v
461
+ obs = count_mut[k]
462
+ if obs >= 20 * cal
463
+ cut_off = k
464
+ break
465
+ elsif k == maxi_count
466
+ cut_off = maxi_count
467
+ end
468
+ end
469
+ mut_hash.each do |k,v|
470
+ if v > cut_off
471
+ hm_hash[k] = out_hash[k]
472
+ end
473
+ end
474
+ end
475
+ hm_seq_hash = ViralSeq::SeqHash.new
476
+ hm_hash.each do |k,_v|
477
+ hm_seq_hash.dna_hash[k] = self.dna_hash[k]
478
+ end
479
+ hm_seq_hash.title = self.title + "_hypermut"
480
+ hm_seq_hash.file = self.file
481
+ filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
482
+ return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
483
+ end #end of #a3g_hypermut
484
+
485
+ alias_method :a3g, :a3g_hypermut
486
+
487
+ # Define Poission cut-off for minority variants.
488
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 Ref: Zhou, et al. J Virol 2015
489
+ # @param error_rate [Float] estimated sequencing error rate
490
+ # @param fold_cutoff [Integer] a fold cut-off to determine poisson minority cut-off. default = 20. i.e. <5% mutations from random methods error.
491
+ # @return [Integer] a cut-off for minority variants (>=).
492
+ # @example obtain Poisson minority cut-off from the example sequence FASTA file.
493
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
494
+ # my_seqhash.pm
495
+ # => 2 # means that mutations appear at least 2 times are very likely to be a true mutation instead of random methods errors.
496
+
497
+ def poisson_minority_cutoff(error_rate = 0.0001, fold_cutoff = 20)
498
+ sequences = self.dna_hash.values
499
+ if sequences.size == 0
500
+ return 0
501
+ else
502
+ cut_off = 1
503
+ l = sequences[0].size
504
+ rate = sequences.size * error_rate
505
+ count_mut = variant_for_poisson(sequences)
506
+ max_count = count_mut.keys.max
507
+ poisson_hash = ViralSeq::Math::PoissonDist.new(rate, max_count).poisson_hash
508
+
509
+ poisson_hash.each do |k,v|
510
+ cal = l * v
511
+ obs = count_mut[k] ? count_mut[k] : 0
512
+ if obs >= fold_cutoff * cal
513
+ cut_off = k
514
+ break
515
+ end
516
+ end
517
+ return cut_off
518
+ end
519
+ end # end of #poisson_minority_cutoff
520
+
521
+ alias_method :pm, :poisson_minority_cutoff
522
+
523
+
524
+ # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
525
+ # @param path_to_muscle [String], path to MUSCLE excutable. if not provided (as default), it will use RubyGem::MuscleBio
526
+ # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
527
+
528
+ def align(path_to_muscle = false)
529
+ seq_hash = self.dna_hash
530
+ if self.file.size > 0
531
+ temp_dir = File.dirname(self.file)
532
+ else
533
+ temp_dir=File.dirname($0)
534
+ end
535
+
536
+ temp_file = temp_dir + "/_temp_muscle_in"
537
+ temp_aln = temp_dir + "/_temp_muscle_aln"
538
+ File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
539
+ if path_to_muscle
540
+ unless ViralSeq.check_muscle?(path_to_muscle)
541
+ File.unlink(temp_file)
542
+ return nil
543
+ end
544
+ print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
545
+ else
546
+ MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
547
+ end
548
+ out_seq_hash = ViralSeq::SeqHash.fa(temp_aln)
549
+ out_seq_hash.title = self.title + "_aligned"
550
+ out_seq_hash.file = self.file
551
+ File.unlink(temp_file)
552
+ File.unlink(temp_aln)
553
+ return out_seq_hash
554
+ end # end of align
555
+
556
+ # calculate Shannon's entropy, Euler's number as the base of logarithm
557
+ # @see https://en.wikipedia.org/wiki/Entropy_(information_theory) Entropy(Wikipedia)
558
+ # @param option [Symbol] the sequence type `:nt` or `:aa`
559
+ # @return [Hash] entropy score at each position in the alignment :position => :entropy ,
560
+ # # position starts at 1.
561
+ # @example caculate entropy from the example file
562
+ # sequence_file = 'spec/sample_files/sample_sequence_alignment_for_entropy.fasta'
563
+ # sequence_hash = ViralSeq::SeqHash.aa_fa(sequence_file)
564
+ # entropy_hash = sequence_hash.shannons_entropy(:aa)
565
+ # entropy_hash[3]
566
+ # => 0.0
567
+ # entropy_hash[14].round(3)
568
+ # => 0.639
569
+ # # This example is the sample input of LANL Entropy-One
570
+ # # https://www.hiv.lanl.gov/content/sequence/ENTROPY/entropy_one.html?sample_input=1
571
+
572
+ def shannons_entropy(option = :nt)
573
+ sequences = if option == :aa
574
+ self.aa_hash.values
575
+ else
576
+ self.dna_hash.values
577
+ end
578
+ entropy_hash = {}
579
+ seq_l = sequences[0].size
580
+ (0..(seq_l - 1)).each do |position|
581
+ element = []
582
+ sequences.each do |seq|
583
+ element << seq[position]
584
+ end
585
+ entropy = 0
586
+ element.delete('*')
587
+ element_size = element.size
588
+ element.count_freq.each do |_k,v|
589
+ p = v/element_size.to_f
590
+ entropy += (-p * ::Math.log(p))
591
+ end
592
+ entropy_hash[(position + 1)] = entropy
593
+ end
594
+ return entropy_hash
595
+ end # end of shannons_entropy
596
+
597
+ # Function to calculate nucleotide diversity π, for nt sequence only
598
+ # @see https://en.wikipedia.org/wiki/Nucleotide_diversity Nucleotide Diversity (Wikipedia)
599
+ # @return [Float] nucleotide diversity π
600
+ # @example calculate π
601
+ # sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
602
+ # my_seqhash = ViralSeq::SeqHash.array(sequences)
603
+ # my_seqhash.pi
604
+ # => 0.16667
605
+
606
+ def nucleotide_pi
607
+ sequences = self.dna_hash.values
608
+ seq_length = sequences[0].size - 1
609
+ nt_position_hash = {}
610
+ (0..seq_length).each do |n|
611
+ nt_position_hash[n] = []
612
+ sequences.each do |s|
613
+ nt_position_hash[n] << s[n]
614
+ end
615
+ end
616
+ diver = 0
617
+ com = 0
618
+ nt_position_hash.each do |_p,nt|
619
+ nt.delete_if {|n| n =~ /[^A|^C|^G|^T]/}
620
+ next if nt.size == 1
621
+ nt_count = nt.count_freq
622
+ combination = (nt.size)*(nt.size - 1)/2
623
+ com += combination
624
+ a = nt_count["A"]
625
+ c = nt_count["C"]
626
+ t = nt_count["T"]
627
+ g = nt_count["G"]
628
+ div = a*c + a*t + a*g + c*t + c*g + t*g
629
+ diver += div
630
+ end
631
+ pi = (diver/com.to_f).round(5)
632
+ return pi
633
+ end # end of #pi
634
+
635
+ alias_method :pi, :nucleotide_pi
636
+
637
+ # TN93 distance function, tabulate pairwise comparison of sequence pairs in a sequence alignment,
638
+ # nt sequence only
639
+ # @return [Hash] pairwise distance table in Hash object {:diff => :freq, ... }
640
+ # # Note: :diff in different positions (Integer), not percentage.
641
+ # @example calculate TN93 distribution
642
+ # sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
643
+ # my_seqhash = ViralSeq::SeqHash.array(sequences)
644
+ # my_seqhash.tn93
645
+ # => {0=>1, 1=>8, 2=>6}
646
+
647
def tn93
  # Tally how many sequence pairs are separated by each number of
  # differences (as reported by String#compare_with).
  copies = dna_hash.values.count_freq
  distances = []

  # identical copies of one sequence are zero differences apart
  copies.each_value do |n|
    distances.concat([0] * (n * (n - 1) / 2))
  end

  # each pair of distinct sequences is compared once and weighted by the
  # number of copy pairs it stands for
  copies.keys.combination(2) do |first, second|
    gap = first.compare_with(second)
    distances.concat([gap] * (copies[first] * copies[second]))
  end

  # result is ordered by distance and answers 0 for distances never seen
  table = Hash.new(0)
  distances.count_freq.sort_by { |differences, _freq| differences }.each do |differences, freq|
    table[differences] = freq
  end
  table
end # end of #tn93
671
+
672
+ # quality check for HIV sequences based on ViralSeq::Sequence#locator, check if sequences are in the target range
673
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
673
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
674
+ # @param indel [Boolean] allow indels or not, `true` or `false`
676
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
677
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
678
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criteria
679
+ # @example QC for sequences in a FASTA files
680
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_seq.fasta')
681
+ # filtered_seqhash = my_seqhash.hiv_seq_qc([4384,4386], 4750..4752, false, :HXB2)
682
+ # my_seqhash.dna_hash.size
683
+ # => 6
684
+ # filtered_seqhash.dna_hash.size
685
+ # => 4
686
+
687
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  # Keep the sequences whose located start and end positions fall inside
  # the requested positions. A single Integer is treated as a one-element
  # range; a Range or Array is used as given through #include?.
  # Returns the result of #sub called with the names of passing sequences.
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Group names by sequence in one pass so that every distinct sequence is
  # located only once and the hash is never modified while it is iterated.
  names_by_seq = Hash.new { |hash, seq| hash[seq] = [] }
  dna_hash.each { |seq_name, seq| names_by_seq[seq] << seq_name }

  seq_pass = []
  names_by_seq.each do |seq, names|
    loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
    # loc[0] and loc[1] are matched against the start and end positions
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    # loc[3] (presumably the locator's indel flag) only matters when indels are disallowed
    next unless indel || loc[3] == false
    seq_pass.concat(names)
  end
  sub(seq_pass)
end # end of #hiv_seq_qc
715
+
716
+
717
+ # Remove sequences with residual offspring Primer IDs.
717
+ # Compare the PIDs of sequences that are identical.
718
+ # PIDs that differ by at most 1 base are recognized as similar. If PID1 is at least x times (cutoff) more abundant than PID2, PID2 will be discarded.
720
+ # each sequence tag starting with ">" and the Primer ID sequence
721
+ # followed by the number of Primer ID appeared in the raw sequence
722
+ # the information sections in the tags are separated by underscore "_"
723
+ # example sequence tag: >AGGCGTAGA_32_sample1_RT
724
+ # @param cutoff [Integer] the fold cut-off to remove the potential residual offspring Primer IDs
725
+ # @return [ViralSeq::SeqHash] a new SeqHash object without sequences containing residual offspring Primer IDs
726
+
727
def filter_similar_pid(cutoff = 10)
  # Sequence names are read as ">PID_count_..."; the leading character is
  # dropped and the first two underscore-separated fields are used.
  seq = dna_hash.dup

  # [primer_id, count] pairs collected per distinct sequence
  pids_by_seq = {}
  seq.each do |name, s|
    fields = name[1..-1].split("_")
    (pids_by_seq[s] ||= []) << [fields[0], fields[1]]
  end

  dup_pid = []
  pids_by_seq.each_value do |entries|
    next if entries.size == 1
    counts = Hash[entries]
    counts.keys.combination(2) do |pid1, pid2|
      # only Primer IDs at most one difference apart are candidates
      next unless pid1.compare_with(pid2) <= 1
      n1 = counts[pid1].to_i
      n2 = counts[pid2].to_i
      # the rarer Primer ID is flagged when the other is `cutoff` times as abundant
      if n1 >= cutoff * n2
        dup_pid << pid2
      elsif n2 >= cutoff * n1
        dup_pid << pid1
      end
    end
  end

  kept_names = seq.keys.reject do |name|
    dup_pid.include?(name.split("_")[0][1..-1])
  end
  sub(kept_names)
end # end of #filter_similar_pid
784
+
785
+ # Collapse sequences by difference cut-offs. Suggesting aligning before using this function.
786
+ # @param cutoff [Integer] nt base differences. collapse sequences within [cutoff] differences
787
+ # @return [ViralSeq::SeqHash] a new SeqHash object of collapsed sequences
788
+
789
def collapse(cutoff=1)
  # Sequences within `cutoff` differences of a more frequent sequence are
  # dropped; survivors are renamed ">seq_<n>_<freq>" in their original order.
  seq_freq = dna_hash.values.count_freq

  # For every pair within the cut-off, mark the less frequent sequence as
  # absorbed; on a tie the second sequence of the pair is the one marked.
  absorbed = []
  seq_freq.keys.combination(2) do |seq1, seq2|
    next unless seq1.compare_with(seq2) <= cutoff
    absorbed << (seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1)
  end

  seqhash = ViralSeq::SeqHash.new
  survivors = seq_freq.reject { |seq, _freq| absorbed.include?(seq) }
  survivors.each_with_index do |(seq, freq), index|
    seqhash.dna_hash[">seq_#{index + 1}_#{freq}"] = seq
  end
  seqhash
end # end of #collapse
825
+
826
+ # gap strip from a sequence alignment, all positions that contains gaps ('-') will be removed
827
+ # @param option [Symbol] sequence options for `:nt` or `:aa`
828
+ # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
829
+ # @example gap strip for an array of sequences
830
+ # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
831
+ # array = { AACCGGTT
832
+ # A-CCGGTT
833
+ # AAC-GGTT
834
+ # AACCG-TT
835
+ # AACCGGT- }
836
+ # my_seqhash = ViralSeq::SeqHash.array(array)
837
+ # puts my_seqhash.gap_strip.dna_hash.values
838
+ # ACGT
839
+ # ACGT
840
+ # ACGT
841
+ # ACGT
842
+ # ACGT
843
+
844
def gap_strip(option = :nt)
  # Remove every alignment column in which at least one sequence has "-".
  # The alignment width is taken from the first sequence.
  sequence_alignment =
    case option
    when :nt then dna_hash
    when :aa then aa_hash
    else raise "Option `#{option}` not recognized"
    end

  aligned = sequence_alignment.values
  width = aligned[0].size
  # positions at which no sequence carries a gap
  gap_free = (0...width).reject do |pos|
    aligned.any? { |s| s[pos] == "-" }
  end

  new_seq = {}
  sequence_alignment.each do |name, s|
    new_seq[name] = gap_free.inject("") { |stripped, pos| stripped + s[pos] }
  end

  # only the selected sequence type is replaced; the other is carried over as is
  new_seq_hash = ViralSeq::SeqHash.new
  if option == :aa
    new_seq_hash.dna_hash = dna_hash
    new_seq_hash.aa_hash = new_seq
  else
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = aa_hash
  end
  new_seq_hash.qc_hash = qc_hash
  new_seq_hash.title = title + "_strip"
  new_seq_hash.file = file
  new_seq_hash
end
885
+
886
+ # gap strip from a sequence alignment at both ends, only positions at the ends that contains gaps ('-') will be removed.
887
+ # @param (see #gap_strip)
888
+ # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
889
+ # @example gap strip for an array of sequences only at the ends
890
+ # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
891
+ # array = { AACCGGTT
892
+ # A-CCGGTT
893
+ # AAC-GGTT
894
+ # AACCG-TT
895
+ # AACCGGT- }
896
+ # my_seqhash = ViralSeq::SeqHash.array(array)
897
+ # puts my_seqhash.gap_strip_ends.dna_hash.values
898
+ # AACCGGT
899
+ # A-CCGGT
900
+ # AAC-GGT
901
+ # AACCG-T
902
+ # AACCGGT
903
+
904
def gap_strip_ends(option = :nt)
  # Trim the leading and trailing runs of alignment columns that contain
  # "-" in at least one sequence; gaps further inside are kept.
  sequence_alignment =
    case option
    when :nt then dna_hash
    when :aa then aa_hash
    else raise "Option #{option} not recognized"
    end

  aligned = sequence_alignment.values
  width = aligned[0].size
  # one flag per column: does any sequence have a gap there?
  gapped = (0...width).map do |pos|
    aligned.any? { |s| s[pos] == "-" }
  end
  leading = gapped.take_while { |flag| flag }.size
  trailing = gapped.reverse.take_while { |flag| flag }.size

  new_seq = {}
  sequence_alignment.each do |name, s|
    new_seq[name] = s[leading..(-trailing - 1)]
  end

  # only the selected sequence type is replaced; the other is carried over as is
  new_seq_hash = ViralSeq::SeqHash.new
  if option == :aa
    new_seq_hash.dna_hash = dna_hash
    new_seq_hash.aa_hash = new_seq
  else
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = aa_hash
  end
  new_seq_hash.qc_hash = qc_hash
  new_seq_hash.title = title + "_strip"
  new_seq_hash.file = file
  new_seq_hash
end
957
+
958
+
959
+
960
+
961
+
962
+ # start of private functions
963
+ private
964
+
965
+ # APOBEC3G/F mutation position identification,
966
+ # APOBEC3G/F pattern: GRD -> ARD,
967
+ # control pattern: G[YN|RC] -> A[YN|RC],
968
def apobec3gf(seq = '')
  # Returns [apobec_positions, control_positions]: 0-based indices into the
  # gap-free sequence. A "G" followed by A/G and then A/G/T is an APOBEC3G/F
  # site; any other "G" at a scanned position is a control site. The last
  # two bases are never scanned because they cannot start a trinucleotide.
  #
  # Work on a gap-free copy so the caller's string is left untouched
  # (this also makes frozen strings acceptable).
  seq = seq.delete("-")
  apobec_position = []
  control_position = []
  (0..(seq.size - 3)).each do |n|
    # plain character classes: a "|" inside [...] would be matched literally
    if seq[n, 3].match?(/G[AG][AGT]/)
      apobec_position << n
    elsif seq[n] == "G"
      control_position << n
    end
  end
  [apobec_position, control_position]
end # end of #apobec3gf
983
+
984
+ # call consensus nucleotide, used by #consensus
985
def call_consensus_base(base_array)
  # Returns the IUPAC code for the given bases: a single base is returned
  # as is; two or three distinct bases from A/C/G/T map to their ambiguity
  # code; any other input yields "N".
  return base_array[0] if base_array.size == 1
  return "N" unless [2, 3].include?(base_array.size)

  # sort a copy so the caller's array keeps its order (and may be frozen)
  case base_array.sort
  when %w[A T] then "W"
  when %w[C G] then "S"
  when %w[A C] then "M"
  when %w[G T] then "K"
  when %w[A G] then "R"
  when %w[C T] then "Y"
  when %w[C G T] then "B"
  when %w[A G T] then "D"
  when %w[A C T] then "H"
  when %w[A C G] then "V"
  else "N"
  end
end # end of #call_consensus_base
1022
+
1023
+ # Input sequence array. output Variant distribution for Poisson cut-off
1024
def variant_for_poisson(seq)
  # For each position (length taken from the first sequence) count the
  # sequences that do not carry the most common residue, then tabulate how
  # many positions share each count, ordered by count.
  total = seq.size
  minority_counts = (0...seq[0].size).map do |pos|
    column = seq.map { |s| s[pos] }
    total - column.count_freq.values.max
  end
  minority_counts.count_freq.sort_by { |count, _positions| count }.to_h
end # end of #variant_for_poisson
1040
+
1041
+ end # end of SeqHash
1042
+
1043
+ end # end of ViralSeq