viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,89 +1,67 @@
1
- # viral_seq/muscle.rb
2
- # wrapper for MUSCLE (http://www.drive5.com/muscle)
3
- # Including Methods as:
4
- # ViralSeq::check_muscle
5
- # ViralSeq::muscle_align
6
- # ViralSeq::muscle_align_multi
7
-
8
- # ViralSeq.check_muscle?(path_to_muscle)
9
- # # check if the path_to_muscle provided is valid,
10
- # # prompt error messages if MUSCLE is not found.
11
-
12
- # ViralSeq.muscle_align(reference_seq, test_sequence, path_to_muscle)
13
- # # takes a reference sequence and a test sequence as String object
14
- # # without specification on path_to_muscle, MuscleBio will be called to run Muscle
15
- # # specify path_to_muscle if other source of muscle needed
16
- # # returns aligned reference sequence and test sequences
17
-
18
- # ViralSeq.muscle_align_multi(sequence_hash, path_to_muscle)
19
- # # input a sequence_hash object {:name=>:sequence,...}
20
- # # without specification on path_to_muscle, MuscleBio will be called to run Muscle
21
- # # specify path_to_muscle if other source of muscle needed
22
- # # return aligned sequences an hash
23
1
 
24
2
  module ViralSeq
3
+ # alignment using MUSCLE alignment program
4
+ # @see http://www.drive5.com/muscle MUSCLE download link
25
5
 
26
- # check if path_to_muscle is correct
27
- def self.check_muscle?(path_to_muscle)
28
- begin
29
- `#{path_to_muscle} -version`
30
- return true
31
- rescue Errno::ENOENT
32
- puts "
33
- Error: MUSCLE is not found for at the provided {path_to_muscle}!!
34
- MUSLCE can be download at http://www.drive5.com/muscle
35
- Add MUSCLE excutable path to $PATH using
36
- $ export PATH=$PATH:/path/to/muscle
37
- or
38
- provide path_to_MUSCLE in the function arguments\n
39
- "
40
- return false
41
- end
42
- end
6
+ module Muscle
7
+ # check if path_to_muscle is correct, prompt error messages if MUSCLE is not found.
8
+ # @param path_to_muscle [String] path to muscle executable
9
+ # @return [Boolean]
43
10
 
44
- def self.muscle_align(ref_seq = "", test_seq = "", path_to_muscle = false)
45
- temp_dir=File.dirname($0)
46
- temp_file = temp_dir + "/_temp_muscle_in"
47
- temp_aln = temp_dir + "/_temp_muscle_aln"
48
- name = ">test"
49
- temp_in = File.open(temp_file,"w")
50
- temp_in.puts ">ref"
51
- temp_in.puts ref_seq
52
- temp_in.puts name
53
- temp_in.puts test_seq
54
- temp_in.close
55
- if path_to_muscle
56
- unless ViralSeq.check_muscle?(path_to_muscle)
57
- File.unlink(temp_file)
58
- return nil;
11
+ def self.check_muscle?(path_to_muscle)
12
+ begin
13
+ `#{path_to_muscle} -version`
14
+ return true
15
+ rescue Errno::ENOENT
16
+ puts "
17
+ Error: MUSCLE is not found for at the provided {path_to_muscle}!!
18
+ MUSLCE can be download at http://www.drive5.com/muscle
19
+ Add MUSCLE excutable path to $PATH using
20
+ $ export PATH=$PATH:/path/to/muscle
21
+ or
22
+ provide path_to_MUSCLE in the function arguments\n
23
+ "
24
+ return false
59
25
  end
60
- print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
61
- else
62
- MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
63
- end
64
- aln_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
65
- File.unlink(temp_file)
66
- File.unlink(temp_aln)
67
- return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
68
- end
26
+ end # end of .check_muscle?
69
27
 
70
- def self.muscle_align_multi(seq_hash = {}, path_to_muscle = false)
71
- temp_dir=File.dirname($0)
72
- temp_file = temp_dir + "/_temp_muscle_in"
73
- temp_aln = temp_dir + "/_temp_muscle_aln"
74
- File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
75
- if path_to_muscle
76
- unless ViralSeq.check_muscle?(path_to_muscle)
77
- File.unlink(temp_file)
78
- return nil
28
+ # align a sequence with reference sequence Strings
29
+ # @param ref_seq [String] reference sequence
30
+ # @param test_seq [String] test sequence
31
+ # @param path_to_muscle [String] path to the MUSCLE executable. If not provided (the default), it will use RubyGem::MuscleBio
32
+ # @return [Array] a pair of [:ref_seq_aligned, :test_seq_aligned] or nil
33
+ # if the MUSCLE executable cannot be found
34
+ # @example
35
+ # seq1 = 'AAGGCGTAGGAC'
36
+ # seq2 = 'AAGCTTAGGACG'
37
+ # aligned_seqs = ViralSeq::Muscle.align(seq1,seq2)
38
+ # => ["AAGGCGTAGGAC-", "-AAGCTTAGGACG"]
39
+
40
+ def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
+ temp_dir=File.dirname($0)
42
+ temp_file = temp_dir + "/_temp_muscle_in"
43
+ temp_aln = temp_dir + "/_temp_muscle_aln"
44
+ name = ">test"
45
+ temp_in = File.open(temp_file,"w")
46
+ temp_in.puts ">ref"
47
+ temp_in.puts ref_seq
48
+ temp_in.puts name
49
+ temp_in.puts test_seq
50
+ temp_in.close
51
+ if path_to_muscle
52
+ unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
53
+ File.unlink(temp_file)
54
+ return nil;
55
+ end
56
+ print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
57
+ else
58
+ MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
79
59
  end
80
- print `#{path_to_muscle} -in #{temp_file} -out #{temp_aln} -quiet`
81
- else
82
- MuscleBio.run("muscle -in #{temp_file} -out #{temp_aln} -quiet")
83
- end
84
- out_seq_hash = ViralSeq.fasta_to_hash(temp_aln)
85
- File.unlink(temp_file)
86
- File.unlink(temp_aln)
87
- return out_seq_hash
88
- end
89
- end
60
+ aln_seq_hash = ViralSeq::SeqHash.fa(temp_aln).dna_hash
61
+ File.unlink(temp_file)
62
+ File.unlink(temp_aln)
63
+ return [aln_seq_hash[">ref"], aln_seq_hash[">test"]]
64
+ end # end of .align
65
+ end # end of ViralSeq::Muscle
66
+
67
+ end # end of ViralSeq
@@ -0,0 +1,26 @@
1
+
2
+ module ViralSeq
3
+
4
+ module PID
5
+
6
+ # generate all Primer ID combinations given the length of Primer ID
7
+ # @param l [Integer] the length of the Primer ID.
8
+ # @example generate a pool of Primer IDs with length of 10
9
+ # primer_id_pool = ViralSeq::PID.generate_pool(10) # 10 is the length of Primer ID
10
+ # puts primer_id_pool.size #should be 4^10
11
+ # => 1048576
12
+
13
+ def self.generate_pool(l=8)
14
+ nt = ['A','T','C','G']
15
+ pid_pool = ['A','T','C','G']
16
+ (l-1).times do
17
+ pid_pool = pid_pool.product(nt)
18
+ pid_pool.collect! do |v|
19
+ v.join("")
20
+ end
21
+ end
22
+ return pid_pool
23
+ end # end of .generate_pool
24
+
25
+ end # end of PID
26
+ end # end of ViralSeq
@@ -0,0 +1,35 @@
1
+ # viral_seq main module
2
+ module ViralSeq
3
+
4
+ # HIV/SIV reference genome sequences, including HXB2, NL43, MAC239
5
+ # @see https://www.ncbi.nlm.nih.gov/nuccore/K03455 Reference sequence of HIV-1 HXB2 (Genbank accession number K03455)
6
+ # @see https://www.ncbi.nlm.nih.gov/nuccore/AF324493 Reference sequence of HIV-1 NL43 (Genbank accession number AF324493)
7
+ # @see https://www.ncbi.nlm.nih.gov/nucleotide/M33262 Reference sequence of SIV MAC239 (Genbank accession number M33262)
8
+ # @example retrieve the reference sequence for HIV NL43
9
+ # ViralSeq::RefSeq.get(:NL43)
10
+ # => "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA..."
11
+
12
+ module RefSeq
13
+
14
+ # @param ref_option [Symbol] name of the reference genome; options are `:HXB2`, `:NL43`, `:MAC239`
15
+ # @return [String] the reference sequence as a String object
16
+
17
+ def self.get(ref_option)
18
+ begin
19
+ case ref_option
20
+ when :HXB2
21
+ "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAG
AAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTA
TCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACG
AAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAG
CTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
22
+ when :NL43
23
+ "TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAAATAAGGAGAGAAGAACAGCTTGTTACACCCTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGCATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGTGTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAA
AAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACAGTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAAGGGTGCCCACACTAATGATGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCATAATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATATGTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCA
TCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGGAATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAAAAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATGGGTTGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATATGGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTTGGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGATTAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTACTAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGAATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGCCTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCATACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATATGGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGA
AGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGATAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGGTTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCTGCACGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGA
TTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA"
24
+ when :MAC239
25
+ "GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAGGCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATACCCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCAGATTGGCGCCTGAACAGGGACTTGAAGGAGAGTGAGAGACTCCTGAGTACGGCTGAGTGAAGGCAGTAAGGGCGGCAGGAACCAACCACGACGGAGTGCTCCTATAAAGGCGCGGGTCGGTACCAGACGGCGTGAGGAGCGGGAGAGGAAGAGGCCTCCGGTTGCAGGTAAGTGCAACACAAAAAAGAAATAGCTGTCTTTTATCCAGGAAGGGGTAATAAGATAGAGTGGGAGATGGGCGTGAGAAACTCCGTCTTGTCAGGGAAGAAAGCAGATGAATTAGAAAAAATTAGGCTACGACCCAACGGAAAGAAAAAGTACATGTTGAAGCATGTAGTATGGGCAGCAAATGAATTAGATAGATTTGGATTAGCAGAAAGCCTGTTGGAGAACAAAGAAGGATGTCAAAAAATACTTTCGGTCTTAGCTCCATTAGTGCCAACAGGCTCAGAAAATTTAAAAAGCCTTTATAATACTGTCTGCGTCATCTGGTGCATTCACGCAGAAGAGAAAGTGAAACACACTGAGGAAGCAAAACAGATAGTGCAGAGACACCTAGTGGTGGAAACAGGAACAACAGAAACTATGCCAAAAACAAGTAGACCAACAGCACCATCTAGCGGCAGAGGAGGAAATTACCCAGTACAACAAATAGGTGGTAACTATGTCCACCTGCCATTAAGCCCGAGAACATTAAATGCCTGGGTAAAATTGATAGAGGAAAAGAAATTTGGAGCAGAAGTAGTGCCAGGATTTCAGGCACTGTCAGAAGGTTGCACCCCCTATGACATTAATCAGATGTTAAATTGTGTGGGAGACCATCAAGCGGCTATGCAGATTATCAGAGATATTATAAACGAGGAGGCTGCAGATTGGGACTTGCAGCACCCACAACCAGCTCCACAACAAGGACAACTTAGGGA
GCCGTCAGGATCAGATATTGCAGGAACAACTAGTTCAGTAGATGAACAAATCCAGTGGATGTACAGACAACAGAACCCCATACCAGTAGGCAACATTTACAGGAGATGGATCCAACTGGGGTTGCAAAAATGTGTCAGAATGTATAACCCAACAAACATTCTAGATGTAAAACAAGGGCCAAAAGAGCCATTTCAGAGCTATGTAGACAGGTTCTACAAAAGTTTAAGAGCAGAACAGACAGATGCAGCAGTAAAGAATTGGATGACTCAAACACTGCTGATTCAAAATGCTAACCCAGATTGCAAGCTAGTGCTGAAGGGGCTGGGTGTGAATCCCACCCTAGAAGAAATGCTGACGGCTTGTCAAGGAGTAGGGGGGCCGGGACAGAAGGCTAGATTAATGGCAGAAGCCCTGAAAGAGGCCCTCGCACCAGTGCCAATCCCTTTTGCAGCAGCCCAACAGAGGGGACCAAGAAAGCCAATTAAGTGTTGGAATTGTGGGAAAGAGGGACACTCTGCAAGGCAATGCAGAGCCCCAAGAAGACAGGGATGCTGGAAATGTGGAAAAATGGACCATGTTATGGCCAAATGCCCAGACAGACAGGCGGGTTTTTTAGGCCTTGGTCCATGGGGAAAGAAGCCCCGCAATTTCCCCATGGCTCAAGTGCATCAGGGGCTGATGCCAACTGCTCCCCCAGAGGACCCAGCTGTGGATCTGCTAAAGAACTACATGCAGTTGGGCAAGCAGCAGAGAGAAAAGCAGAGAGAAAGCAGAGAGAAGCCTTACAAGGAGGTGACAGAGGATTTGCTGCACCTCAATTCTCTCTTTGGAGGAGACCAGTAGTCACTGCTCATATTGAAGGACAGCCTGTAGAAGTATTACTGGATACAGGGGCTGATGATTCTATTGTAACAGGAATAGAGTTAGGTCCACATTATACCCCAAAAATAGTAGGAGGAATAGGAGGTTTTATTAATACTAAAGAATACAAAAATGTAGAAATAGAAGTTTTAGGCAAAAGGATTAAAGGGACAATCATGACAGGGGACACCCCGATTAACATTTTTGGTAGAAATTTGCTAACAGCTCTGGGGATGTCTCTAAATTTTCCCATAGCTAAAGTAGAGCCTGTAAAAGTCGCCTTAAAGCCAGGAAAGGATGGACCAAAATTGAAGCAGTGGCCATTATCAAAAGAAAAGATAGTTGCATTAAGAGAAATCTGTGAAAAGATGGAAAAGGATGGTCAGTTGGAGGAAGCTCCCCCGACCAATCCATACAACACCCCCACATTTGCTATAAAGAAAAAGGATAAGAACAAATGGAGAATGCTGATAGATTTTAGGGAACTAAATAGGGTCACTCAGGACTTTACGGAAGTCCAATTAGGAATACCACACCCTGCAGGACTAGCAAAAAGGAAAAGAATTACAGTACTGGATATAGGTGATGCATATTTCTCCATACCTCTAGATGAAGAATTTAGGCAGTACACTGCCTTTACTTTACCATCAGTAAATAATGCAGAGCCAGGAAAACGATACATTTATAAGGTTCTGCCTCAGGGATGGAAGGGGTCACCAGCCATCTTCCAATACACTATGAGACATGTGCTAGAACCCTTCAGGAAGGCAAATCCAGATGTGACCTTAGTCCAGTATATGGATGACATCTTAATAGCTAGTGACAGGACAGACCTGGAACATGACAGGGTAGTTTTACAGTCAAAGGAACTCTTGAATAGCATAGGGTTTTCTACCCCAGAAGAGAAATTCCAAAAAGATCCCCCATTTCAATGGATGGGGTACGAATTGTGGCCAACAAAATGGAAGTTGCAAAAGATAGAGTTGCCACAAAGAGAGACCTGGACAGTGAATGATATACAGAAGTTAGTAGGAGTATTAAATTGGGCAGCTCAAATTTATCCAGGTATAAAAACCAAACATCTCTGTAGGTTAATTAGAGGAAAAATGACTCTAACAGAGGAAGTTCAGTGGACTGA
GATGGCAGAAGCAGAATATGAGGAAAATAAAATAATTCTCAGTCAGGAACAAGAAGGATGTTATTACCAAGAAGGCAAGCCATTAGAAGCCACGGTAATAAAGAGTCAGGACAATCAGTGGTCTTATAAAATTCACCAAGAAGACAAAATACTGAAAGTAGGAAAATTTGCAAAGATAAAGAATACACATACCAATGGAGTGAGACTATTAGCACATGTAATACAGAAAATAGGAAAGGAAGCAATAGTGATCTGGGGACAGGTCCCAAAATTCCACTTACCAGTTGAGAAGGATGTATGGGAACAGTGGTGGACAGACTATTGGCAGGTAACCTGGATACCGGAATGGGATTTTATCTCAACACCACCGCTAGTAAGATTAGTCTTCAATCTAGTGAAGGACCCTATAGAGGGAGAAGAAACCTATTATACAGATGGATCATGTAATAAACAGTCAAAAGAAGGGAAAGCAGGATATATCACAGATAGGGGCAAAGACAAAGTAAAAGTGTTAGAACAGACTACTAATCAACAAGCAGAATTGGAAGCATTTCTCATGGCATTGACAGACTCAGGGCCAAAGGCAAATATTATAGTAGATTCACAATATGTTATGGGAATAATAACAGGATGCCCTACAGAATCAGAGAGCAGGCTAGTTAATCAAATAATAGAAGAAATGATTAAAAAGTCAGAAATTTATGTAGCATGGGTACCAGCACACAAAGGTATAGGAGGAAACCAAGAAATAGACCACCTAGTTAGTCAAGGGATTAGACAAGTTCTCTTCTTGGAAAAGATAGAGCCAGCACAAGAAGAACATGATAAATACCATAGTAATGTAAAAGAATTGGTATTCAAATTTGGATTACCCAGAATAGTGGCCAGACAGATAGTAGACACCTGTGATAAATGTCATCAGAAAGGAGAGGCTATACATGGGCAGGCAAATTCAGATCTAGGGACTTGGCAAATGGATTGTACCCATCTAGAGGGAAAAATAATCATAGTTGCAGTACATGTAGCTAGTGGATTCATAGAAGCAGAGGTAATTCCACAAGAGACAGGAAGACAGACAGCACTATTTCTGTTAAAATTGGCAGGCAGATGGCCTATTACACATCTACACACAGATAATGGTGCTAACTTTGCTTCGCAAGAAGTAAAGATGGTTGCATGGTGGGCAGGGATAGAGCACACCTTTGGGGTACCATACAATCCACAGAGTCAGGGAGTAGTGGAAGCAATGAATCACCACCTGAAAAATCAAATAGATAGAATCAGGGAACAAGCAAATTCAGTAGAAACCATAGTATTAATGGCAGTTCATTGCATGAATTTTAAAAGAAGGGGAGGAATAGGGGATATGACTCCAGCAGAAAGATTAATTAACATGATCACTACAGAACAAGAGATACAATTTCAACAATCAAAAAACTCAAAATTTAAAAATTTTCGGGTCTATTACAGAGAAGGCAGAGATCAACTGTGGAAGGGACCCGGTGAGCTATTGTGGAAAGGGGAAGGAGCAGTCATCTTAAAGGTAGGGACAGACATTAAGGTAGTACCCAGAAGAAAGGCTAAAATTATCAAAGATTATGGAGGAGGAAAAGAGGTGGATAGCAGTTCCCACATGGAGGATACCGGAGAGGCTAGAGAGGTGGCATAGCCTCATAAAATATCTGAAATATAAAACTAAAGATCTACAAAAGGTTTGCTATGTGCCCCATTTTAAGGTCGGATGGGCATGGTGGACCTGCAGCAGAGTAATCTTCCCACTACAGGAAGGAAGCCATTTAGAAGTACAAGGGTATTGGCATTTGACACCAGAAAAAGGGTGGCTCAGTACTTATGCAGTGAGGATAACCTGGTACTCAAAGAACTTTTGGACAGATGTAACACCAAACTATGCAGACATTTTACTGCATAGCACTTATTTCCCTTGCTTTACAGCGGGAGAAGTGAGAAGGGCCATCAGGGGAGAACAACTGCTGTCTTGC
TGCAGGTTCCCGAGAGCTCATAAGTACCAGGTACCAAGCCTACAGTACTTAGCACTGAAAGTAGTAAGCGATGTCAGATCCCAGGGAGAGAATCCCACCTGGAAACAGTGGAGAAGAGACAATAGGAGAGGCCTTCGAATGGCTAAACAGAACAGTAGAGGAGATAAACAGAGAGGCGGTAAACCACCTACCAAGGGAGCTAATTTTCCAGGTTTGGCAAAGGTCTTGGGAATACTGGCATGATGAACAAGGGATGTCACCAAGCTATGTAAAATACAGATACTTGTGTTTAATACAAAAGGCTTTATTTATGCATTGCAAGAAAGGCTGTAGATGTCTAGGGGAAGGACATGGGGCAGGGGGATGGAGACCAGGACCTCCTCCTCCTCCCCCTCCAGGACTAGCATAAATGGAAGAAAGACCTCCAGAAAATGAAGGACCACAAAGGGAACCATGGGATGAATGGGTAGTGGAGGTTCTGGAAGAACTGAAAGAAGAAGCTTTAAAACATTTTGATCCTCGCTTGCTAACTGCACTTGGTAATCATATCTATAATAGACATGGAGACACCCTTGAGGGAGCAGGAGAACTCATTAGAATCCTCCAACGAGCGCTCTTCATGCATTTCAGAGGCGGATGCATCCACTCCAGAATCGGCCAACCTGGGGGAGGAAATCCTCTCTCAGCTATACCGCCCTCTAGAAGCATGCTATAACACATGCTATTGTAAAAAGTGTTGCTACCATTGCCAGTTTTGTTTTCTTAAAAAAGGCTTGGGGATATGTTATGAGCAATCACGAAAGAGAAGAAGAACTCCGAAAAAGGCTAAGGCTAATACATCTTCTGCATCAAACAAGTAAGTATGGGATGTCTTGGGAATCAGCTGCTTATCGCCATCTTGCTTTTAAGTGTCTATGGGATCTATTGTACTCTATATGTCACAGTCTTTTATGGTGTACCAGCTTGGAGGAATGCGACAATTCCCCTCTTTTGTGCAACCAAGAATAGGGATACTTGGGGAACAACTCAGTGCCTACCAGATAATGGTGATTATTCAGAAGTGGCCCTTAATGTTACAGAAAGCTTTGATGCCTGGAATAATACAGTCACAGAACAGGCAATAGAGGATGTATGGCAACTCTTTGAGACCTCAATAAAGCCTTGTGTAAAATTATCCCCATTATGCATTACTATGAGATGCAATAAAAGTGAGACAGATAGATGGGGATTGACAAAATCAATAACAACAACAGCATCAACAACATCAACGACAGCATCAGCAAAAGTAGACATGGTCAATGAGACTAGTTCTTGTATAGCCCAGGATAATTGCACAGGCTTGGAACAAGAGCAAATGATAAGCTGTAAATTCAACATGACAGGGTTAAAAAGAGACAAGAAAAAAGAGTACAATGAAACTTGGTACTCTGCAGATTTGGTATGTGAACAAGGGAATAACACTGGTAATGAAAGTAGATGTTACATGAACCACTGTAACACTTCTGTTATCCAAGAGTCTTGTGACAAACATTATTGGGATGCTATTAGATTTAGGTATTGTGCACCTCCAGGTTATGCTTTGCTTAGATGTAATGACACAAATTATTCAGGCTTTATGCCTAAATGTTCTAAGGTGGTGGTCTCTTCATGCACAAGGATGATGGAGACACAGACTTCTACTTGGTTTGGCTTTAATGGAACTAGAGCAGAAAATAGAACTTATATTTACTGGCATGGTAGGGATAATAGGACTATAATTAGTTTAAATAAGTATTATAATCTAACAATGAAATGTAGAAGACCAGGAAATAAGACAGTTTTACCAGTCACCATTATGTCTGGATTGGTTTTCCACTCACAACCAATCAATGATAGGCCAAAGCAGGCATGGTGTTGGTTTGGAGGAAAATGGAAGGATGCAATAAAAGAGGTGAAGCAGACCATTGTCAAACATCCCAGGTATACTGGAACTAACAATACTGATAAAATCAATTTGACGG
CTCCTGGAGGAGGAGATCCGGAAGTTACCTTCATGTGGACAAATTGCAGAGGAGAGTTCCTCTACTGTAAAATGAATTGGTTTCTAAATTGGGTAGAAGATAGGAATACAGCTAACCAGAAGCCAAAGGAACAGCATAAAAGGAATTACGTGCCATGTCATATTAGACAAATAATCAACACTTGGCATAAAGTAGGCAAAAATGTTTATTTGCCTCCAAGAGAGGGAGACCTCACGTGTAACTCCACAGTGACCAGTCTCATAGCAAACATAGATTGGATTGATGGAAACCAAACTAATATCACCATGAGTGCAGAGGTGGCAGAACTGTATCGATTGGAATTGGGAGATTATAAATTAGTAGAGATCACTCCAATTGGCTTGGCCCCCACAGATGTGAAGAGGTACACTACTGGTGGCACCTCAAGAAATAAAAGAGGGGTCTTTGTGCTAGGGTTCTTGGGTTTTCTCGCAACGGCAGGTTCTGCAATGGGCGCGGCGTCGTTGACGCTGACCGCTCAGTCCCGAACTTTATTGGCTGGGATAGTGCAGCAACAGCAACAGCTGTTGGACGTGGTCAAGAGACAACAAGAATTGTTGCGACTGACCGTCTGGGGAACAAAGAACCTCCAGACTAGGGTCACTGCCATCGAGAAGTACTTAAAGGACCAGGCGCAGCTGAATGCTTGGGGATGTGCGTTTAGACAAGTCTGCCACACTACTGTACCATGGCCAAATGCAAGTCTAACACCAAAGTGGAACAATGAGACTTGGCAAGAGTGGGAGCGAAAGGTTGACTTCTTGGAAGAAAATATAACAGCCCTCCTAGAGGAGGCACAAATTCAACAAGAGAAGAACATGTATGAATTACAAAAGTTGAATAGCTGGGATGTGTTTGGCAATTGGTTTGACCTTGCTTCTTGGATAAAGTATATACAATATGGAGTTTATATAGTTGTAGGAGTAATACTGTTAAGAATAGTGATCTATATAGTACAAATGCTAGCTAAGTTAAGGCAGGGGTATAGGCCAGTGTTCTCTTCCCCACCCTCTTATTTCCAGCAGACCCATATCCAACAGGACCCGGCACTGCCAACCAGAGAAGGCAAAGAAAGAGACGGTGGAGAAGGCGGTGGCAACAGCTCCTGGCCTTGGCAGATAGAATATATTCATTTCCTGATCCGCCAACTGATACGCCTCTTGACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGCGACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGGTCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGAAGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGGGACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGATATAGATGAGTAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGCAATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCATCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACAC
TTATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACCGCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGGGAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTCGCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTAGACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCCATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACCCTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGCA"
26
+ else
27
+ raise StandardError.new("reference sequence not recognized, choose from :HXB2 (default), :NL43, or :MAC239.")
28
+ end
29
+ rescue StandardError => e
30
+ puts e.message
31
+ return nil
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,172 @@
1
+
2
module ViralSeq
  # Fisher's Exact Test function library.
  #
  # Based on the JavaScript version created by Oyvind Langsrud,
  # ported to Ruby by Bryan Donovan.
  module Rubystats
    # Fisher's exact test for a 2x2 contingency table.
    #
    # The object keeps the table margins and the last hypergeometric
    # probability between calls so that neighbouring probabilities can be
    # derived incrementally instead of being recomputed from scratch.
    class FishersExactTest
      def initialize
        # margins and probability remembered from the previous evaluation
        @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0

        # tail sums produced by #exact
        @sleft = @sright = @sless = @slarg = 0.0

        @left = @right = @twotail = 0.0
      end

      # Natural log of the gamma function (Lanczos approximation).
      # @see http://lib.stat.cmu.edu/apstat/245 Reference: "Lanczos, C. 'A precision approximation of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964." Translation of Alan Miller's FORTRAN-implementation.
      def lngamm(z)
        # the terms are accumulated in this exact order
        series_terms = [
          0.0000001659470187408462 / (z + 7),
          0.000009934937113930748 / (z + 6),
          -0.1385710331296526 / (z + 5),
          12.50734324009056 / (z + 4),
          -176.6150291498386 / (z + 3),
          771.3234287757674 / (z + 2),
          -1259.139216722289 / (z + 1),
          676.5203681218835 / (z),
          0.9999999999995183
        ]
        series = series_terms.reduce(0) { |acc, term| acc + term }

        ::Math.log(series) - 5.58106146679532777 - z + (z - 0.5) * ::Math.log(z + 6.5)
      end

      # Natural log of n!; 0 for n <= 1.
      def lnfact(n)
        n <= 1 ? 0 : lngamm(n + 1)
      end

      # Natural log of the binomial coefficient "n choose k".
      def lnbico(n, k)
        lnfact(n) - lnfact(k) - lnfact(n - k)
      end

      # Hypergeometric probability of observing n11 given the margins.
      def hyper_323(n11, n1_, n_1, n)
        ::Math.exp(lnbico(n1_, n11) + lnbico(n - n1_, n_1 - n11) - lnbico(n, n_1))
      end

      # Probability for n11 using the margins remembered from the last
      # call to #hyper0 that supplied them.
      def hyper(n11)
        hyper0(n11, 0, 0, 0)
      end

      # Hypergeometric probability with caching. When all three margins
      # are zero the stored margins are reused; a step of +/-1 from the
      # previous n11 is then derived from the previous probability, except
      # when n11 is a multiple of 10, where it is recomputed in full.
      def hyper0(n11i, n1_i, n_1i, ni)
        if n1_i == 0 && n_1i == 0 && ni == 0
          if n11i % 10 != 0
            if n11i == @sn11 + 1
              @sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
              @sn11 = n11i
              return @sprob
            elsif n11i == @sn11 - 1
              @sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
              @sn11 = n11i
              return @sprob
            end
          end
          @sn11 = n11i
        else
          @sn11 = n11i
          @sn1_ = n1_i
          @sn_1 = n_1i
          @sn = ni
        end
        @sprob = hyper_323(@sn11, @sn1_, @sn_1, @sn)
        @sprob
      end

      # Fills @sleft, @sright, @sless and @slarg for the given table and
      # returns the probability of the observed table (1 when only one
      # table is possible).
      def exact(n11, n1_, n_1, n)
        upper = n_1 < n1_ ? n_1 : n1_
        lower = n1_ + n_1 - n
        lower = 0 if lower < 0

        if lower == upper
          @sless = 1
          @sright = 1
          @sleft = 1
          @slarg = 1
          return 1
        end

        observed = hyper0(n11, n1_, n_1, n)

        # walk up from the smallest table while it is less likely than
        # the observed one
        @sleft = 0
        tail = hyper(lower)
        left_idx = lower + 1
        while tail < (0.99999999 * observed)
          @sleft += tail
          tail = hyper(left_idx)
          left_idx += 1
        end
        left_idx -= 1
        if tail < (1.00000001 * observed)
          @sleft += tail
        else
          left_idx -= 1
        end

        # walk down from the largest table in the same way
        @sright = 0
        tail = hyper(upper)
        right_idx = upper - 1
        while tail < (0.99999999 * observed)
          @sright += tail
          tail = hyper(right_idx)
          right_idx -= 1
        end
        right_idx += 1
        if tail < (1.00000001 * observed)
          @sright += tail
        else
          right_idx += 1
        end

        if (left_idx - n11).abs < (right_idx - n11).abs
          @sless = @sleft
          @slarg = 1 - @sleft + observed
        else
          @sless = 1 - @sright + observed
          @slarg = @sright
        end
        observed
      end

      # Run the test on the four cell counts (negative counts are taken
      # as their absolute value).
      # @return [Hash] p-values under the keys :left, :right and :twotail
      def calculate(n11_, n12_, n21_, n22_)
        n11_ = n11_.abs
        n12_ = n12_.abs
        n21_ = n21_.abs
        n22_ = n22_.abs
        row_total = n11_ + n12_
        col_total = n11_ + n21_
        grand_total = n11_ + n12_ + n21_ + n22_
        exact(n11_, row_total, col_total, grand_total)
        two_sided = @sleft + @sright
        two_sided = 1 if two_sided > 1
        { :left => @sless, :right => @slarg, :twotail => two_sided }
      end
    end
  end
end
@@ -0,0 +1,1043 @@
1
+
2
+ module ViralSeq
3
+
4
+ # ViralSeq::SeqHash class for operation on multiple sequences.
5
+ # @example read a FASTA sequence file of HIV PR sequences, make alignment, perform the QC location check, filter sequences with stop codons and APOBEC3g/f hypermutations, calculate pairwise diversity, calculate minority cut-off based on Poisson model, and examine for drug resistance mutations.
6
+ # my_pr_seqhash = ViralSeq::SeqHash.fa('my_pr_fasta_file.fasta')
7
+ # # new ViralSeq::SeqHash object from a FASTA file
8
+ # aligned_pr_seqhash = my_pr_seqhash.align
9
+ # # align with MUSCLE
10
+ # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
+ # # filter nt sequences with the reference coordinates
12
+ # filtered_seqhash = filtered_seqhash.stop_codon[1]
13
+ # # return a new ViralSeq::SeqHash object without stop codons
14
+ # filtered_seqhash = filtered_seqhash.a3g[1]
15
+ # # further filter out sequences with A3G hypermutations
16
+ # filtered_seqhash.pi
17
+ # return pairwise diversity π
18
+ # cut_off = filtered_seqhash.pm
19
+ # # return cut-off for minority variants based on Poisson model
20
+ # filtered_seqhash.sdrm_hiv_pr(cut_off)
21
+ # # examine for drug resistance mutations for PR region.
22
+
23
+ class SeqHash
24
+ # initialize a ViralSeq::SeqHash object
25
def initialize(dna_hash = {}, aa_hash = {}, qc_hash = {}, title = "", file = "")
  # the three hashes are all keyed by sequence name
  @dna_hash, @aa_hash, @qc_hash = dna_hash, aa_hash, qc_hash
  # descriptive metadata: display title and originating file path
  @title, @file = title, file
end
32
+
33
+ # @return [Hash] Hash object for :name => :sequence_string pairs
34
+ attr_accessor :dna_hash
35
+
36
+ # @return [Hash] Hash object for :name => :amino_acid_sequence_string pairs
37
+ attr_accessor :aa_hash
38
+
39
+ # @return [Hash] Hash object for :name => :qc_score_string pairs
40
+ attr_accessor :qc_hash
41
+
42
+ # @return [String] the title of the SeqHash object.
43
+ # default as the file basename if SeqHash object is initialized using ::fa or ::fq
44
+ attr_accessor :title
45
+
46
+ # @return [String] the file that is used to initialize SeqHash object, if it exists
47
+ attr_accessor :file
48
+
49
+ # initialize a new ViralSeq::SeqHash object from a FASTA format sequence file
50
+ # @param infile [String] path to the FASTA format sequence file
51
+ # @return [ViralSeq::SeqHash]
52
+ # @example new ViralSeq::SeqHash object from a FASTA file
53
+ # ViralSeq::SeqHash.fa('my_fasta_file.fasta')
54
+
55
def self.new_from_fasta(infile)
  # Parse a nucleotide FASTA file into a SeqHash.
  # Header lines (starting with ">") become keys, ">" included; the
  # sequence lines that follow are upcased and concatenated as the value.
  # NUL characters are stripped; blank lines and lines starting with "="
  # are ignored.
  sequences = {}
  current_name = ""
  # block form guarantees the handle is closed even if parsing raises
  File.open(infile, "r") do |f|
    f.each_line do |raw|
      # chomp before the blank test so CRLF-only lines count as blank too
      line = raw.delete("\u0000").chomp
      next if line.empty? || line.start_with?("=")
      if line.start_with?(">")
        current_name = line
        sequences[current_name] = ""
      else
        sequences[current_name] += line.upcase
      end
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = sequences
  seq_hash.title = File.basename(infile, ".*")
  seq_hash.file = infile
  seq_hash
end # end of ::new_from_fasta
77
+
78
+ # initialize a new ViralSeq::SeqHash object from a FASTA format sequence file of amino acid sequences
79
+ # @param infile [String] path to the FASTA format sequence file of aa sequences
80
+ # @return [ViralSeq::SeqHash]
81
+
82
def self.new_from_aa_fasta(infile)
  # Parse an amino-acid FASTA file into a SeqHash whose aa_hash is filled.
  # Header lines (starting with ">") become keys, ">" included; the
  # sequence lines that follow are upcased and concatenated as the value.
  # NUL characters are stripped; blank lines and lines starting with "="
  # are ignored.
  sequences = {}
  current_name = ""
  # block form guarantees the handle is closed even if parsing raises
  File.open(infile, "r") do |f|
    f.each_line do |raw|
      # chomp before the blank test so CRLF-only lines count as blank too
      line = raw.delete("\u0000").chomp
      next if line.empty? || line.start_with?("=")
      if line.start_with?(">")
        current_name = line
        sequences[current_name] = ""
      else
        sequences[current_name] += line.upcase
      end
    end
  end
  seq_hash = ViralSeq::SeqHash.new
  seq_hash.aa_hash = sequences
  seq_hash.title = File.basename(infile, ".*")
  seq_hash.file = infile
  seq_hash
end # end of ::new_from_aa_fasta
104
+
105
+ # initialize a new ViralSeq::SeqHash object from a FASTQ format sequence file
106
+ # @param fastq_file [String] path to the FASTQ format sequence file
107
+ # @return [ViralSeq::SeqHash]
108
+ # @example new ViralSeq::SeqHash object from a FASTQ file
109
+ # ViralSeq::SeqHash.fq('my_fastq_file.fastq')
110
+
111
def self.new_from_fastq(fastq_file)
  # Parse a FASTQ file (4 lines per record) into a SeqHash.
  # Line 1 of each record is the name, with "@" rewritten to ">" so names
  # match the FASTA convention; line 2 is the sequence; line 4 is the
  # quality string. dna_hash and qc_hash are both keyed by that name.
  sequence_hash = {}
  quality_hash = {}

  File.open(fastq_file, 'r') do |file|
    # stream record by record instead of loading the whole file and
    # splatting it into Hash[], which fails on large inputs
    file.each_line.each_slice(4) do |header, bases, _separator, quality|
      # an incomplete final record (e.g. a stray trailing blank line) is ignored
      next if quality.nil?
      name = header.chomp.tr('@', '>')
      sequence_hash[name] = bases.chomp
      quality_hash[name] = quality.chomp
    end
  end

  seq_hash = ViralSeq::SeqHash.new
  seq_hash.dna_hash = sequence_hash
  seq_hash.qc_hash = quality_hash
  seq_hash.title = File.basename(fastq_file, ".*")
  seq_hash.file = fastq_file
  seq_hash
end # end of ::new_from_fastq
143
+
144
+ # initialize a ViralSeq::SeqHash object with an array of sequence strings
145
+ # @param master_tag [String] master tag to put in the sequence names
146
+ # @return [ViralSeq::SeqHash] No @qc_hash, @title will be the master_tag
147
+
148
def self.new_from_array(seq_array, master_tag = 'seq')
  # name every sequence "<master_tag>_<1-based position in the array>"
  named = {}
  seq_array.each.with_index(1) do |seq, position|
    named[master_tag + "_" + position.to_s] = seq
  end
  built = ViralSeq::SeqHash.new
  built.dna_hash = named
  built.title = master_tag
  built
end # end of ::new_from_array
160
+
161
+
162
class << self
  # short constructor names for the ::new_from_* class methods
  alias fa new_from_fasta
  alias fq new_from_fastq
  alias aa_fa new_from_aa_fasta
  alias array new_from_array
end
168
+
169
+ # generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
170
+ # @return [String] relaxed sequential phylip format in a String object
171
+ # @example convert fasta format to relaxed sequencial phylip format
172
+ # # my_fasta_file.fasta
173
+ # # >seq1
174
+ # # ATAAGAACG
175
+ # # >seq2
176
+ # # ATATGAACG
177
+ # # >seq3
178
+ # # ATGAGAACG
179
+ # my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
180
+ # puts my_seqhash.to_rsphylip
181
+ # # 3 9
182
+ # # seq1 ATAAGAACG
183
+ # # seq2 ATATGAACG
184
+ # # seq3 ATGAGAACG
185
+
186
def to_rsphylip
  # Render @dna_hash as relaxed sequential PHYLIP text: a header line
  # " <number of sequences> <length of first sequence>", then one line per
  # sequence holding the name (without ">"), padding, and the sequence in
  # space-separated blocks of 10.
  seqs = dna_hash
  alignment_length = seqs.empty? ? 0 : seqs.values.first.size
  header = "\s" + seqs.size.to_s + "\s" + alignment_length.to_s + "\n"

  names = seqs.keys.map { |name| name.tr(">", "") }
  # width of the name column: the longest name, but never below 10.
  # (the lexicographic maximum is not necessarily the longest name and
  # would make the padding negative for longer names)
  name_block_l = [names.map(&:size).max.to_i, 10].max

  rows = names.zip(seqs.values).map do |name, seq|
    name + "\s" * (name_block_l - name.size + 2) + seq.scan(/.{1,10}/).join("\s") + "\n"
  end
  header + rows.join
end # end of #to_rsphylip
198
+
199
+ # translate the DNA sequences in @dna_hash to amino acid sequences. generate value for @aa_hash
200
+ # @param codon_position [Integer] option `0`, `1` or `2`, indicating 1st, 2nd, 3rd reading frames
201
+ # @return [NilClass]
202
+ # @example translate dna sequences from a FASTA format sequence file
203
+ # # my_fasta_file.fasta
204
+ # # >seq1
205
+ # # ATAAGAACG
206
+ # # >seq2
207
+ # # ATATGAACG
208
+ # # >seq3
209
+ # # ATGAGAACG
210
+ # my_seqhash = ViralSeq::SeqHash.fa(my_fasta_file.fasta)
211
+ # my_seqhash.translate
212
+ # my_seqhash.aa_hash
213
+ # => {">seq1"=>"IRT", ">seq2"=>"I*T", ">seq3"=>"MRT"}
214
+
215
def translate(codon_position = 0)
  # start from a clean @aa_hash, then fill it one sequence at a time
  @aa_hash = {}
  dna_hash.each_pair do |tag, nt_string|
    record = ViralSeq::Sequence.new(tag, nt_string)
    record.translate(codon_position)
    @aa_hash[tag] = record.aa_string
  end
  nil
end # end of #translate
225
+
226
+ # collapse @dna_hash to unique sequence hash.
227
+ # @param tag # the master tag for unique sequences,
228
+ # sequences will be named as (tag + "_" + order(Integer) + "_" + counts(Integer))
229
+ # @return [ViralSeq::SeqHash] new SeqHash object of unique sequence hash
230
+ # @example
231
+ # dna_hash = {'>seq1' => 'AAAA','>seq2' => 'AAAA', '>seq3' => 'AAAA', '>seq4' => 'CCCC', '>seq5' => 'CCCC', '>seq6' => 'TTTT'}
232
+ # a_seq_hash = ViralSeq::SeqHash.new
233
+ # a_seq_hash.dna_hash = dna_hash
234
+ # uniq_sequence = a_seq_hash.uniq_dna_hash('master')
235
+ # => {">master_1_3"=>"AAAA", ">master_2_2"=>"CCCC", ">master_3_1"=>"TTTT"}
236
+
237
def uniq_dna_hash(tag = "sequence")
  # one entry per distinct sequence, named ">tag_<order>_<copy number>"
  collapsed = {}
  dna_hash.values.count_freq.each_with_index do |(seq, copies), idx|
    collapsed[">" + tag + "_" + (idx + 1).to_s + "_" + copies.to_s] = seq
  end
  result = ViralSeq::SeqHash.new(collapsed)
  result.title = self.title + "_uniq"
  result.file = self.file
  result
end # end of #uniq_dna_hash
252
+
253
+ alias_method :uniq, :uniq_dna_hash
254
+
255
+ # given an Array of sequence tags, return a sub ViralSeq::SeqHash object with the sequence tags
256
+ # @param keys [Array] array of sequence tags
257
+ # @return [SeqHash] new SeqHash object with sequences of the input keys
258
+
259
def sub(keys)
  # Build a new SeqHash restricted to the given sequence names.
  # Names absent from @dna_hash are skipped. Amino-acid and quality
  # entries are copied only when they exist, so the result carries no
  # nil placeholder values for sequences that were never translated or
  # have no quality string.
  dna_subset = {}
  aa_subset = {}
  qc_subset = {}

  keys.each do |key|
    dna = dna_hash[key]
    next unless dna
    dna_subset[key] = dna
    aa_subset[key] = aa_hash[key] if aa_hash.key?(key)
    qc_subset[key] = qc_hash[key] if qc_hash.key?(key)
  end
  ViralSeq::SeqHash.new(dna_subset, aa_subset, qc_subset, self.title, self.file)
end
277
+
278
+ # screen for sequences with stop codons.
279
+ # @param (see #translate)
280
+ # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
281
+ #
282
+ # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
283
+ # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
284
+ # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
285
+ # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
286
+ # my_seqhash.dna_hash
287
+ # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
288
+ # stop_codon_seqhash = my_seqhash.stop_codon[0]
289
+ # stop_codon_seqhash.dna_hash
290
+ # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
291
+ # stop_codon_seqhash.aa_hash
292
+ # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
293
+ # stop_codon_seqhash.title
294
+ # => "my_fasta_file_stop"
295
+ # filtered_seqhash = my_seqhash.stop_codon[1]
296
+ # filtered_seqhash.aa_hash
297
+ # {">seq1"=>"IRT", ">seq3"=>"MRT"}
298
+
299
def stop_codon(codon_position = 0)
  # refresh @aa_hash for the requested reading frame
  translate(codon_position)
  # split the names by whether the translation contains a stop ('*')
  stopped, clean = aa_hash.keys.partition { |tag| aa_hash[tag].include?('*') }
  with_stop = sub(stopped)
  with_stop.title = self.title + "_stop"
  [with_stop, sub(clean)]
end #end of #stop_codon
311
+
312
+
313
+ # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
314
+ # @param cutoff [Float] majority cut-off for calling consensus bases. default at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
315
+ # @return [String] consensus sequence
316
+ # @example consensus sequence from an array of sequences.
317
+ # seq_array = %w{ ATTTTTTTTT
318
+ # AATTTTTTTT
319
+ # AAATTTTTTT
320
+ # AAAATTTTTT
321
+ # AAAAATTTTT
322
+ # AAAAAATTTT
323
+ # AAAAAAATTT
324
+ # AAAAAAAATT
325
+ # AAAAAAAAAT
326
+ # AAAAAAAAAA }
327
+ # my_seqhash = ViralSeq::SeqHash.array(seq_array)
328
+ # my_seqhash.consensus
329
+ # => 'AAAAAWTTTT'
330
+ # my_seqhash.consensus(0.7)
331
+ # => 'AAAANNNTTT'
332
+
333
def consensus(cutoff = 0.5)
  # Call one consensus base per alignment column. A base takes part in
  # the call when its frequency in the column is >= cutoff; the selected
  # bases are passed to call_consensus_base, which returns the character
  # for that column. The alignment length is taken from the first sequence.
  seq_array = dna_hash.values
  # nothing to summarise: return an empty consensus instead of raising
  return "" if seq_array.empty?

  seq_size = seq_array.size.to_f
  (0...seq_array.first.size).map do |position|
    column = seq_array.map { |seq| seq[position] }
    majority = column.count_freq.select { |_base, count| count / seq_size >= cutoff }.keys
    call_consensus_base(majority)
  end.join
end #end of #consensus
355
+
356
+ # function to determine if the sequences have APOBEC3g/f hypermutation.
357
+ # # APOBEC3G/F pattern: GRD -> ARD
358
+ # # control pattern: G[YN|RC] -> A[YN|RC]
359
+ # # use the sample consensus to determine potential a3g sites
360
+ # # Two criteria to identify hypermutation
361
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
362
+ # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
363
+ # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
364
+ # # b/c Poisson model does not do well on small sample size.
365
+ # @return [Array] three values.
366
+ # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
367
+ # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
368
+ # third value, `array[2]`: a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
369
+ # # sequence tag
370
+ # # G to A mutation numbers at potential a3g positions
371
+ # # total potential a3g G positions
372
+ # # G to A mutation numbers at non a3g positions
373
+ # # total non a3g G positions
374
+ # # a3g G to A mutation rate / non-a3g G to A mutation rate
375
+ # # Fishers Exact P-value
376
+ # @example identify apobec3gf mutations from a sequence fasta file
377
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
378
+ # hypermut = my_seqhash.a3g
379
+ # hypermut[0].dna_hash.keys
380
+ # => [">Seq7", ">Seq14"]
381
+ # hypermut[1].dna_hash.keys
382
+ # => [">Seq1", ">Seq2", ">Seq5"]
383
+ # hypermut[2]
384
+ # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
385
+ #
386
+ # @example identify apobec3gf mutations from another sequence fasta file
387
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
388
+ # hypermut = my_seqhash.a3g
389
+ # hypermut[2]
390
+ # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
391
+ # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
392
+ # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
393
+ # @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
394
+
395
def a3g_hypermut
  mut_hash = {} # G->A count at potential A3G sites, per sequence
  hm_hash = {}  # statistics rows of the sequences called hypermutated
  out_hash = {} # statistics rows of every sequence

  # total G->A mutations at apobec3g/f positions
  total = 0

  # the sample consensus serves as the reference for locating the sites
  ref = self.consensus
  sites = apobec3gf(ref)
  mut = sites[0]
  control = sites[1]

  # returns [positions holding "A", positions that are not a gap]
  tally_sites = lambda do |seq, positions|
    observed = positions.reject { |n| seq[n] == "-" }
    [observed.count { |n| seq[n] == "A" }, observed.size]
  end

  self.dna_hash.each do |name, seq|
    a, b = tally_sites.call(seq, mut)     # muts / potential mut sites
    mut_hash[name] = a
    total += a
    c, d = tally_sites.call(seq, control) # control muts / potential controls

    rr = (a/b.to_f)/(c/d.to_f)

    fisher = ViralSeq::Rubystats::FishersExactTest.new.calculate(b - a, d - c, a, c)
    perc = fisher[:twotail]
    info = [name, a, b, c, d, rr.round(2), perc]
    out_hash[name] = info
    hm_hash[name] = info if perc < 0.05
  end

  # criterion 2: Poisson outliers, only applied to more than 20 sequences
  if self.dna_hash.size > 20
    rate = total.to_f/(self.dna_hash.size)
    count_mut = mut_hash.values.count_freq
    # NOTE(review): this is the largest frequency, not the largest mutation
    # count (#poisson_minority_cutoff uses keys.max) - confirm it is intended
    maxi_count = count_mut.values.max
    poisson_hash = ViralSeq::Math::PoissonDist.new(rate,maxi_count).poisson_hash
    cut_off = 0
    poisson_hash.each do |k, prob|
      expected = self.dna_hash.size * prob
      observed = count_mut[k]
      if observed >= 20 * expected
        cut_off = k
        break
      elsif k == maxi_count
        cut_off = maxi_count
      end
    end
    mut_hash.each do |name, muts|
      hm_hash[name] = out_hash[name] if muts > cut_off
    end
  end

  hm_seq_hash = ViralSeq::SeqHash.new
  hm_hash.each_key do |name|
    hm_seq_hash.dna_hash[name] = self.dna_hash[name]
  end
  hm_seq_hash.title = self.title + "_hypermut"
  hm_seq_hash.file = self.file
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
  [hm_seq_hash, filtered_seq_hash, hm_hash.values]
end #end of #a3g_hypermut
484
+
485
+ alias_method :a3g, :a3g_hypermut
486
+
487
+ # Define Poisson cut-off for minority variants.
488
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 Ref: Zhou, et al. J Virol 2015
489
+ # @param error_rate [Float] estimated sequencing error rate
490
+ # @param fold_cutoff [Integer] a fold cut-off to determine poisson minority cut-off. default = 20. i.e. <5% mutations from random methods error.
491
+ # @return [Integer] a cut-off for minority variants (>=).
492
+ # @example obtain Poisson minority cut-off from the example sequence FASTA file.
493
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
494
+ # my_seqhash.pm
495
+ # => 2 # means that mutations appear at least 2 times are very likely to be a true mutation instead of random methods errors.
496
+
497
def poisson_minority_cutoff(error_rate = 0.0001, fold_cutoff = 20)
  sequences = dna_hash.values
  # no sequences, no cut-off
  return 0 if sequences.size == 0

  alignment_length = sequences[0].size
  rate = sequences.size * error_rate
  count_mut = variant_for_poisson(sequences)
  max_count = count_mut.keys.max
  poisson_hash = ViralSeq::Math::PoissonDist.new(rate, max_count).poisson_hash

  # first entry whose observed count reaches fold_cutoff times the
  # expected count (alignment length * Poisson probability)
  hit = poisson_hash.find do |k, prob|
    observed = count_mut[k] || 0
    observed >= fold_cutoff * (alignment_length * prob)
  end
  hit ? hit[0] : 1
end # end of #poisson_minority_cutoff
520
+
521
+ alias_method :pm, :poisson_minority_cutoff
522
+
523
+
524
+ # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
525
+ # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
526
+ # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
527
+
528
def align(path_to_muscle = false)
  # Align @dna_hash with MUSCLE and return the alignment as a new SeqHash.
  # The sequences are written to a temporary file next to self.file (or
  # next to the running script when no file is set); MUSCLE writes its
  # output to a second temporary file, which is parsed back with ::fa.
  # Returns nil when path_to_muscle is given but fails
  # ViralSeq.check_muscle?.
  require 'shellwords' # stdlib; quotes the temp-file paths for the shell

  temp_dir = self.file.size > 0 ? File.dirname(self.file) : File.dirname($0)
  temp_file = temp_dir + "/_temp_muscle_in"
  temp_aln = temp_dir + "/_temp_muscle_aln"

  begin
    File.open(temp_file, 'w') do |f|
      self.dna_hash.each do |name, seq|
        f.puts name
        f.puts seq
      end
    end

    # escaping leaves ordinary paths unchanged and keeps directories that
    # contain spaces or shell metacharacters from breaking the command
    muscle_args = "-in #{Shellwords.escape(temp_file)} -out #{Shellwords.escape(temp_aln)} -quiet"
    if path_to_muscle
      return nil unless ViralSeq.check_muscle?(path_to_muscle)
      print `#{path_to_muscle} #{muscle_args}`
    else
      MuscleBio.run("muscle #{muscle_args}")
    end

    out_seq_hash = ViralSeq::SeqHash.fa(temp_aln)
    out_seq_hash.title = self.title + "_aligned"
    out_seq_hash.file = self.file
    out_seq_hash
  ensure
    # remove the temp files on every exit path, including failures
    [temp_file, temp_aln].each { |path| File.unlink(path) if File.exist?(path) }
  end
end # end of align
555
+
556
+ # calculate Shannon's entropy, Euler's number as the base of logarithm
557
+ # @see https://en.wikipedia.org/wiki/Entropy_(information_theory) Entropy(Wikipedia)
558
+ # @param option [Symbol] the sequence type `:nt` or `:aa`
559
+ # @return [Hash] entropy score at each position in the alignment :position => :entropy ,
560
+ # # position starts at 1.
561
+ # @example calculate entropy from the example file
562
+ # sequence_file = 'spec/sample_files/sample_sequence_alignment_for_entropy.fasta'
563
+ # sequence_hash = ViralSeq::SeqHash.aa_fa(sequence_file)
564
+ # entropy_hash = sequence_hash.shannons_entropy(:aa)
565
+ # entropy_hash[3]
566
+ # => 0.0
567
+ # entropy_hash[14].round(3)
568
+ # => 0.639
569
+ # # This example is the sample input of LANL Entropy-One
570
+ # # https://www.hiv.lanl.gov/content/sequence/ENTROPY/entropy_one.html?sample_input=1
571
+
572
def shannons_entropy(option = :nt)
  # pick the amino-acid or the nucleotide sequences
  sequences = (option == :aa ? aa_hash : dna_hash).values
  entropy_hash = {}
  sequences[0].size.times do |position|
    column = sequences.map { |seq| seq[position] }
    # stop codons are left out of the frequency calculation
    column.delete('*')
    column_size = column.size.to_f
    # sum of -p * ln(p) over the residues seen in this column
    entropy_hash[position + 1] = column.count_freq.values.reduce(0) do |acc, count|
      freq = count / column_size
      acc + (-freq * ::Math.log(freq))
    end
  end
  entropy_hash
end # end of shannons_entropy
596
+
597
+ # Function to calculate nucleotide diversity π, for nt sequence only
598
+ # @see https://en.wikipedia.org/wiki/Nucleotide_diversity Nucleotide Diversity (Wikipedia)
599
+ # @return [Float] nucleotide diversity π
600
+ # @example calculate π
601
+ # sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
602
+ # my_seqhash = ViralSeq::SeqHash.array(sequences)
603
+ # my_seqhash.pi
604
+ # => 0.16667
605
+
606
def nucleotide_pi
  # Nucleotide diversity: the number of pairwise differences summed over
  # all alignment columns, divided by the number of pairwise comparisons
  # summed over those columns, rounded to 5 decimals. The alignment length
  # is taken from the first sequence.
  bases = %w[A C G T]
  sequences = dna_hash.values
  diver = 0
  com = 0
  sequences[0].size.times do |position|
    # only unambiguous bases take part; gaps, ambiguity codes, stray
    # characters such as "|" or "^" and the nil of shorter sequences drop out
    column = sequences.map { |seq| seq[position] }.select { |nt| bases.include?(nt) }
    next if column.size == 1
    com += column.size * (column.size - 1) / 2
    a, c, g, t = bases.map { |base| column.count(base) }
    # every pair made of two different bases counts as one difference
    diver += a*c + a*t + a*g + c*t + c*g + t*g
  end
  (diver / com.to_f).round(5)
end # end of #pi
634
+
635
+ alias_method :pi, :nucleotide_pi
636
+
637
+ # TN93 distance function, tabulate pairwise comparison of sequence pairs in a sequence alignment,
638
+ # nt sequence only
639
+ # @return [Hash] pairwise distance table in Hash object {:diff => :freq, ... }
640
+ # # Note: :diff in different positions (Integer), not percentage.
641
+ # @example calculate TN93 distribution
642
+ # sequences = %w{ AAGGCCTT ATGGCCTT AAGGCGTT AAGGCCTT AACGCCTT AAGGCCAT }
643
+ # my_seqhash = ViralSeq::SeqHash.array(sequences)
644
+ # my_seqhash.tn93
645
+ # => {0=>1, 1=>8, 2=>6}
646
+
647
def tn93
  # Tabulates, for every pair of sequences, the number of differing positions
  # and returns {differences => number_of_pairs}, ordered by ascending
  # difference, with 0 as the default for absent keys.
  # Pair counts are accumulated directly from the frequency of each unique
  # sequence, so no per-pair list is ever built.
  seq_freq = self.dna_hash.values.count_freq
  pair_counts = Hash.new(0)

  # pairs of identical sequences: v choose 2 pairs at distance 0
  seq_freq.each_value do |v|
    identical_pairs = v * (v - 1) / 2
    pair_counts[0] += identical_pairs if identical_pairs > 0
  end

  # every pair of distinct unique sequences stands for freq1 * freq2 real pairs
  seq_freq.keys.combination(2) do |s1, s2|
    pair_counts[s1.compare_with(s2)] += seq_freq[s1] * seq_freq[s2]
  end

  out_hash = Hash.new(0)
  pair_counts.sort_by { |k, _v| k }.each do |k, v|
    out_hash[k] = v
  end
  out_hash
end # end of #tn93
671
+
672
+ # quality check for HIV sequences based on ViralSeq::Sequence#locator, check if sequences are in the target range
673
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
674
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
675
+ # @param indel [Boolean] allow indels or not, `true` or `false`
676
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
677
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
678
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criteria
679
+ # @example QC for sequences in a FASTA files
680
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_seq.fasta')
681
+ # filtered_seqhash = my_seqhash.hiv_seq_qc([4384,4386], 4750..4752, false, :HXB2)
682
+ # my_seqhash.dna_hash.size
683
+ # => 6
684
+ # filtered_seqhash.dna_hash.size
685
+ # => 4
686
+
687
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  # Keeps the sequences whose located start and end (loc[0], loc[1]) fall in
  # start_nt / end_nt. When indel is false, sequences whose loc[3] flag is not
  # false are rejected as well. Each unique sequence is located only once.
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)
  seq_hash = self.dna_hash

  # sequence => names carrying it, gathered in a single pass
  names_by_seq = Hash.new { |hash, seq| hash[seq] = [] }
  seq_hash.each { |seq_name, seq| names_by_seq[seq] << seq_name }

  seq_pass = []
  seq_hash.values.uniq.each do |seq|
    loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
    # NOTE(review): guards against locator returning nil for a sequence it
    # cannot place; such a sequence is treated as failing QC - confirm.
    next unless loc
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    next unless indel || loc[3] == false

    seq_pass.concat(names_by_seq[seq])
  end
  self.sub(seq_pass)
end # end of #hiv_seq_qc
715
+
716
+
717
+ # Remove sequences with residual offspring Primer IDs.
718
+ # Compare PID with sequences which have identical sequences.
719
+ # PIDs that differ by 1 base will be recognized. If PID1 is x times (cutoff) more abundant than PID2, PID2 will be discarded.
720
+ # each sequence tag starting with ">" and the Primer ID sequence
721
+ # followed by the number of Primer ID appeared in the raw sequence
722
+ # the information sections in the tags are separated by underscore "_"
723
+ # example sequence tag: >AGGCGTAGA_32_sample1_RT
724
+ # @param cutoff [Integer] the fold cut-off to remove the potential residual offspring Primer IDs
725
+ # @return [ViralSeq::SeqHash] a new SeqHash object without sequences containing residual offspring Primer ID
726
+
727
def filter_similar_pid(cutoff = 10)
  # Sequence names are expected to look like ">PID_count_...". Within each
  # group of identical sequences, two PIDs that differ at no more than one
  # position are compared by count; the one that is at least `cutoff` times
  # less abundant is flagged. Every sequence whose PID was flagged is dropped.
  seq = self.dna_hash

  # sequence => [[pid, count], ...], gathered in a single pass
  pid_counts_by_seq = Hash.new { |hash, s| hash[s] = [] }
  seq.each do |name, s|
    pid, count = name[1..-1].split("_") # strip the leading ">"
    pid_counts_by_seq[s] << [pid, count]
  end

  dup_pid = {}
  pid_counts_by_seq.each_value do |pid_counts|
    next if pid_counts.size == 1

    pid_hash = Hash[pid_counts]
    pid_hash.keys.combination(2) do |pid1, pid2|
      next unless pid1.compare_with(pid2) <= 1

      n1 = pid_hash[pid1].to_i
      n2 = pid_hash[pid2].to_i
      if n1 >= cutoff * n2
        dup_pid[pid2] = true
      elsif n2 >= cutoff * n1
        dup_pid[pid1] = true
      end
    end
  end

  kept_names = seq.keys.reject do |name|
    dup_pid.key?(name.split("_")[0][1..-1])
  end
  self.sub(kept_names)
end # end of #filter_similar_pid
784
+
785
+ # Collapse sequences by difference cut-offs. Suggesting aligning before using this function.
786
+ # @param cutoff [Integer] nt base differences. collapse sequences within [cutoff] differences
787
+ # @return [ViralSeq::SeqHash] a new SeqHash object of collapsed sequences
788
+
789
def collapse(cutoff=1)
  # Among unique sequences, whenever two differ by no more than `cutoff`
  # positions the less frequent one is dropped (the second of the pair on a
  # tie). Survivors are returned in a new SeqHash, named ">seq_<n>_<freq>".
  seq_freq = self.dna_hash.values.count_freq

  dropped = []
  seq_freq.keys.combination(2).each do |seq1, seq2|
    next if seq1.compare_with(seq2) > cutoff

    dropped << (seq_freq[seq1] >= seq_freq[seq2] ? seq2 : seq1)
  end

  survivors = seq_freq.reject { |seq, _freq| dropped.include?(seq) }

  seqhash = ViralSeq::SeqHash.new
  survivors.each_with_index do |(seq, freq), index|
    seqhash.dna_hash[">seq_#{index + 1}_#{freq}"] = seq
  end
  seqhash
end # end of #collapse
825
+
826
+ # gap strip from a sequence alignment, all positions that contains gaps ('-') will be removed
827
+ # @param option [Symbol] sequence options for `:nt` or `:aa`
828
+ # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
829
+ # @example gap strip for an array of sequences
830
+ # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
831
+ # array = { AACCGGTT
832
+ # A-CCGGTT
833
+ # AAC-GGTT
834
+ # AACCG-TT
835
+ # AACCGGT- }
836
+ # my_seqhash = ViralSeq::SeqHash.array(array)
837
+ # puts my_seqhash.gap_strip.dna_hash.values
838
+ # ACGT
839
+ # ACGT
840
+ # ACGT
841
+ # ACGT
842
+ # ACGT
843
+
844
def gap_strip(option = :nt)
  # Removes every alignment column in which at least one sequence has a gap
  # ("-"). `option` selects which hash is stripped (:nt => dna_hash,
  # :aa => aa_hash); the other hash, qc_hash and file are carried over and
  # "_strip" is appended to the title. Raises RuntimeError for other options.
  # An empty alignment yields an empty stripped hash.
  if option == :nt
    sequence_alignment = self.dna_hash
  elsif option == :aa
    sequence_alignment = self.aa_hash
  else
    raise "Option `#{option}` not recognized"
  end

  sequences = sequence_alignment.values
  # the number of columns is taken from the first sequence
  seq_size = sequences.empty? ? 0 : sequences[0].size
  kept_positions = (0...seq_size).reject do |position|
    sequences.any? { |s| s[position] == "-" }
  end

  new_seq = {}
  sequence_alignment.each do |name, s|
    new_seq[name] = kept_positions.map { |position| s[position] }.join
  end

  new_seq_hash = ViralSeq::SeqHash.new
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  elsif option == :aa
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  new_seq_hash
end
885
+
886
+ # gap strip from a sequence alignment at both ends, only positions at the ends that contains gaps ('-') will be removed.
887
+ # @param (see #gap_strip)
888
+ # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
889
+ # @example gap strip for an array of sequences only at the ends
890
+ # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
891
+ # array = { AACCGGTT
892
+ # A-CCGGTT
893
+ # AAC-GGTT
894
+ # AACCG-TT
895
+ # AACCGGT- }
896
+ # my_seqhash = ViralSeq::SeqHash.array(array)
897
+ # puts my_seqhash.gap_strip_ends.dna_hash.values
898
+ # AACCGGT
899
+ # A-CCGGT
900
+ # AAC-GGT
901
+ # AACCG-T
902
+ # AACCGGT
903
+
904
def gap_strip_ends(option = :nt)
  # Trims only the leading and trailing runs of alignment columns that
  # contain a gap ("-") in at least one sequence; gaps inside the alignment
  # are kept. `option` selects which hash is trimmed (:nt => dna_hash,
  # :aa => aa_hash); the other hash, qc_hash and file are carried over and
  # "_strip" is appended to the title. Raises RuntimeError for other options.
  # An empty alignment yields an empty trimmed hash.
  if option == :nt
    sequence_alignment = self.dna_hash
  elsif option == :aa
    sequence_alignment = self.aa_hash
  else
    raise "Option #{option} not recognized"
  end

  sequences = sequence_alignment.values
  # the number of columns is taken from the first sequence
  seq_size = sequences.empty? ? 0 : sequences[0].size
  positions = (0...seq_size).to_a
  gapped = lambda { |position| sequences.any? { |s| s[position] == "-" } }

  # n1 / n2: number of gapped columns at the start / end of the alignment
  n1 = positions.take_while { |position| gapped.call(position) }.size
  n2 = positions.reverse.take_while { |position| gapped.call(position) }.size

  new_seq = {}
  sequence_alignment.each do |name, s|
    new_seq[name] = s[n1..(- n2 - 1)]
  end

  new_seq_hash = ViralSeq::SeqHash.new
  if option == :nt
    new_seq_hash.dna_hash = new_seq
    new_seq_hash.aa_hash = self.aa_hash
  elsif option == :aa
    new_seq_hash.dna_hash = self.dna_hash
    new_seq_hash.aa_hash = new_seq
  end
  new_seq_hash.qc_hash = self.qc_hash
  new_seq_hash.title = self.title + "_strip"
  new_seq_hash.file = self.file
  new_seq_hash
end
957
+
958
+
959
+
960
+
961
+
962
+ # start of private functions
963
+ private
964
+
965
+ # APOBEC3G/F mutation position identification,
966
+ # APOBEC3G/F pattern: GRD -> ARD,
967
+ # control pattern: G[YN|RC] -> A[YN|RC],
968
def apobec3gf(seq = '')
  # Scans the gap-free sequence and returns [apobec_positions,
  # control_positions]: 0-based indexes (in the gap-free sequence) of every G
  # that starts a GRD trinucleotide (G, then A/G, then A/G/T), and of every
  # other G that still has two bases after it.
  # Works on a copy, so the caller's string is left untouched and frozen
  # strings are accepted.
  seq = seq.delete("-")
  apobec_position = []
  control_position = []
  (0..(seq.size - 3)).each do |n|
    if seq[n, 3] =~ /G[AG][AGT]/
      apobec_position << n
    elsif seq[n] == "G"
      control_position << n
    end
  end
  [apobec_position, control_position]
end # end of #apobec3gf
983
+
984
+ # call consensus nucleotide, used by #consensus
985
def call_consensus_base(base_array)
  # Maps the set of bases seen at one position to a single letter: the base
  # itself for one element, the IUPAC ambiguity code for a recognised
  # combination of two or three of A/C/G/T, and "N" for anything else.
  # The lookup uses a sorted copy, so the caller's array is not reordered.
  return base_array[0] if base_array.size == 1
  return "N" unless base_array.size.between?(2, 3)

  iupac_codes = {
    %w[A T] => "W",
    %w[C G] => "S",
    %w[A C] => "M",
    %w[G T] => "K",
    %w[A G] => "R",
    %w[C T] => "Y",
    %w[C G T] => "B",
    %w[A G T] => "D",
    %w[A C T] => "H",
    %w[A C G] => "V"
  }
  iupac_codes.fetch(base_array.sort, "N")
end # end of #call_consensus_base
1022
+
1023
+ # Input sequence array. output Variant distribution for Poisson cut-off
1024
def variant_for_poisson(seq)
  # For each position (length taken from the first sequence) counts how many
  # sequences differ from the most common character there, then returns
  # {number_of_variants => number_of_positions}, ordered by ascending key.
  # An empty input yields an empty hash.
  return {} if seq.empty?

  seq_size = seq.size
  var = (0...seq[0].size).map do |pos|
    count_nt = seq.map { |s| s[pos] }.count_freq
    seq_size - count_nt.values.max
  end
  var.count_freq.sort_by { |key, _value| key }.to_h
end # end of #variant_for_poisson
1040
+
1041
+ end # end of SeqHash
1042
+
1043
+ end # end of ViralSeq