viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97c823fe653d6655cf938120d163befe805933463dfe339811e115e89f5aac3b
4
- data.tar.gz: df185d45cae565437b54f43a1a03b44186e3295c95732effe1383a32a3c916d9
3
+ metadata.gz: 9055ee4b893bdff77117a2a9c005166637c177b0ed243a5362488ccf7d893e76
4
+ data.tar.gz: 87faa7b60c47eecc6f1e3267d4f2a0df549dc70d935d8adabaf54994e60b8ab4
5
5
  SHA512:
6
- metadata.gz: 6d8652dc6bafe65d9dd42f46aad49c5d8f4a9788760575fd28586462e3781b8c47d618e6891539f91f7c9a0a782934b9b56e24220eac834dbb8d6ca1aeca4c50
7
- data.tar.gz: 1d36a265a5af049f563ec87bed6517d32e015c42108558e63de5fea272ebcf92f5262ff6ed96f1b2c31eb36e1a4aaa9beb00d40bf89a4f5d6ca4fe1809b1bbcb
6
+ metadata.gz: c5a3d9aab73cd1e8b696527392c6caaa0a4eec485fe0dbf38a7db456ddce115288f2ae735717ec9595cc4f732cb6afee8dca750b0ebfc703112a5df7196230ca
7
+ data.tar.gz: f0f040bb1c70f3569ae132023f367f945c408ba73d8d495976ceb0cc2538d7104a56f2009f89789eacbfe45921c017e40578fcb4ccd1df489f75d83d7b733a85
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (0.3.0)
4
+ viral_seq (1.0.0)
5
5
  muscle_bio (~> 0.4)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # viral_seq
1
+ # ViralSeq
2
2
 
3
3
  A Ruby Gem containing bioinformatics tools for processing viral NGS data.
4
4
 
@@ -15,6 +15,12 @@ Load all ViralSeq classes by requiring 'viral_seq.rb'
15
15
  #!/usr/bin/env ruby
16
16
  require 'viral_seq'
17
17
 
18
+ ## Updates
19
+
20
+ Version 1.0.0-07092019:
21
+
22
+ 1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
23
+
18
24
  ## Development
19
25
 
20
26
  Bug reports and pull requests are welcome on GitHub at https://github.com/ViralSeq/viral_seq. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -0,0 +1,16 @@
1
+ # additional functions for Class::Integer
2
+
3
class Integer
  # Factorial of the receiver, exposed through the unary `!` operator.
  #
  # NOTE: defining this method replaces logical negation for Integers, so
  # `!5` evaluates to 120 instead of `false`.
  #
  # @return [Integer, nil] factorial of the receiver; 1 for 0, and nil for a
  #   negative receiver (the range `1..self` is empty, so there is nothing to multiply)
  # @example factorial for 5
  #   !5
  #   => 120
  def !
    return 1 if zero?

    (1..self).reduce(:*)
  end
end
@@ -0,0 +1,7 @@
1
module ViralSeq

  # One-letter abbreviations of the 20 amino acids, followed by "*".
  # The order of this list defines the column order of the per-position
  # amino acid reports built from it.
  # Frozen so that shared callers cannot reorder or extend it by accident.
  AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *].freeze

end
@@ -0,0 +1,132 @@
1
+ # additional statistic/math functions to Module::Enumerable
2
+ # @example median number
3
+ # array = [1,2,3,4,5,6,7,8,9,10]
4
+ # array.median
5
+ # => 5.5
6
+ # @example sum
7
+ # array = [1,2,3,4,5,6,7,8,9,10]
8
+ # array.sum
9
+ # => 55
10
+ # @example average number (mean)
11
+ # array = [1,2,3,4,5,6,7,8,9,10]
12
+ # array.mean
13
+ # => 5.5
14
+ # @example sample variance
15
+ # array = [1,2,3,4,5,6,7,8,9,10]
16
+ # array.sample_variance
17
+ # => 9.166666666666666
18
+ # @example standard deviation
19
+ # array = [1,2,3,4,5,6,7,8,9,10]
20
+ # array.stdev
21
+ # => 3.0276503540974917
22
+ # @example upper quartile
23
+ # array = [1,2,3,4,5,6,7,8,9,10]
24
+ # array.upper_quartile
25
+ # => 7.5
26
+ # @example lower_quartile
27
+ # array = [1,2,3,4,5,6,7,8,9,10]
28
+ # array.lower_quartile
29
+ # => 3.5
30
+ # @example count frequency of elements in an array
31
+ # array = %w{cat dog monkey cat cat cat monkey}
32
+ # array.count_freq
33
+ # => {"cat"=>4, "dog"=>1, "monkey"=>2}
34
+ # @example count frequency as percentage of elements in an array
35
+ # array = %w{cat dog monkey cat cat cat monkey}
36
+ # array.count_freq2
37
+ # => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
38
module Enumerable

  # generate median number
  # @return [Numeric, nil] median number, or nil when there are no elements
  def median
    sorted = sort
    len = sorted.length
    return nil if len.zero?

    mid = len / 2
    len.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]).to_f / 2
  end

  # generate summed value
  #
  # Accepts the same arguments as Ruby's built-in Enumerable#sum (an initial
  # value and an optional block), so code that relies on the standard
  # signature keeps working for classes that inherit #sum from Enumerable.
  # @param init [Numeric] value the accumulation starts from
  # @yield [element] optional block mapping each element to the value to add
  # @return [Numeric] summed value
  def sum(init = 0, &block)
    if block
      inject(init) { |accum, i| accum + block.call(i) }
    else
      inject(init) { |accum, i| accum + i }
    end
  end

  # generate mean number
  # @return [Float] mean value
  def mean
    sum / length.to_f
  end

  # generate sample variance (divides by n - 1)
  # @return [Float] sample variance
  def sample_variance
    m = mean
    squared_deviation = inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviation / (length - 1).to_f
  end

  # generate standard deviation
  # @return [Float] standard deviation
  def stdev
    Math.sqrt(sample_variance)
  end

  # generate upper quartile value
  # @return [Numeric, nil] upper quartile value, or nil when there are no elements
  def upper_quartile
    sorted = sort
    return nil if sorted.empty?

    quartile_from_sorted(sorted, 0.25 * (3 * sorted.length))
  end

  # generate lower quartile value
  # @return [Numeric, nil] lower quartile value, or nil when there are no elements
  def lower_quartile
    sorted = sort
    return nil if sorted.empty?

    quartile_from_sorted(sorted, 0.25 * sorted.length + 1)
  end

  # tabulate elements and frequencies of an Enumerable
  # @return [Hash] return a hash of :element => :freq_count
  def count_freq
    each_with_object(Hash.new(0)) { |element, counts| counts[element] += 1 }
  end

  # tabulate elements and frequencies (as fractions of the total) of an Enumerable
  # @param decimal [Integer] decimals of frequency
  # @return [Hash] return a hash of :element => :fraction
  def count_freq2(decimal = 2)
    counts = count_freq
    total_elements = counts.values.inject(0) { |accum, c| accum + c }.to_f
    counts.each_with_object(Hash.new(0)) do |(key, value), freqs|
      freqs[key] = (value / total_elements).round(decimal)
    end
  end

  private

  # Value at a 1-based, possibly fractional, rank of an already sorted array,
  # linearly interpolated between the two neighbouring elements.
  def quartile_from_sorted(sorted, rank)
    whole = rank.truncate
    # a rank below 1 has no element before it; clamp instead of wrapping to the last element
    below = sorted[[whole - 1, 0].max]
    # no element above (e.g. a single-element collection): nothing to interpolate with
    return below if whole >= sorted.length

    below + ((sorted[whole] - below) * (rank - whole))
  end

end
@@ -0,0 +1,45 @@
1
+ # additional methods for Class::Hash required for ViralSeq
2
+
3
class Hash

  # subtract one hash (h2) from the other (h1) if the keys are identical
  # (only keys are compared; the values stored under them are ignored)
  # @param other_hash [Hash] the hash that needs to be subtracted from the receiver
  # @return [Hash] hash after subtraction
  # @example subtract h2 from h1 if the keys match
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)
  #   => {"Bird" => 2, "Snake" => 10}
  def difference(other_hash)
    reject { |key, _value| other_hash.key?(key) }
  end

  # return a new hash with the unique values of input hash as keys,
  # and the keys of the unique values of input hash in an array as values of the new hash
  #
  # Built in a single pass over the receiver: output keys appear in order of
  # first occurrence and each array lists the original keys in iteration order.
  # @return [Hash] a new hash of :uniq_value_of_original_hash => :array_of_keys
  # @example
  #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
  #   hash.uniq_hash
  #   => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}
  def uniq_hash
    each_with_object({}) do |(key, value), out_hash|
      (out_hash[value] ||= []) << key
    end
  end
end
@@ -0,0 +1,454 @@
1
+
2
+ module ViralSeq
3
+ class SeqHash
4
+
5
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
7
+ # PR codon 1-99
8
+ # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
9
+ # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
10
+ # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
11
+ #
12
+ # # point_mutation_list: two-dimensional array for the following information,
13
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
14
+ # # linkage_list: two-dimensional array for the following information,
15
+ # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
16
+ # # report_list: two-dimensional array for the following information,
17
+ # # [position,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*]
18
+ # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
19
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
20
+ # p_cut_off = my_seqhash.pm
21
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
22
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
23
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
24
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
25
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
26
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
27
+ #
28
+ # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
30
+ # => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
31
+ # => PR,396,WT,149,0.37626,0.32837,0.42602,
32
+ # => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
33
+ # => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
34
+ #
35
+ # puts "position,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(","); pr_sdrm[2].each {|n|puts n.join(",")}
36
+ # => position,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
37
+ # => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38
+ # => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39
+ # => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40
+ # => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
41
+ # => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42
+ # => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
43
+ # => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44
+ # => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
45
+ # => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46
+ # => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47
+ # => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
48
+ # => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
49
+ # => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
50
+ # => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51
+ # => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
52
+ # => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53
+ # => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54
+ # => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
55
+ # => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56
+ # => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57
+ # => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58
+ # => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59
+ # => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60
+ # => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61
+ # => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62
+ # => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
63
+ # => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64
+ # => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65
+ # => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66
+ # => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67
+ # => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
68
+ # => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
69
+ # => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
70
+ # => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71
+ # => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72
+ # => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
73
+ # => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74
+ # => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75
+ # => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
76
+ # => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77
+ # => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
78
+ # => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
79
+ # => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80
+ # => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81
+ # => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82
+ # => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83
+ # => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84
+ # => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85
+ # => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86
+ # => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
87
+ # => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88
+ # => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89
+ # => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90
+ # => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91
+ # => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92
+ # => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
93
+ # => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
94
+ # => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95
+ # => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
96
+ # => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97
+ # => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98
+ # => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99
+ # => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100
+ # => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101
+ # => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102
+ # => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103
+ # => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104
+ # => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105
+ # => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106
+ # => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107
+ # => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
108
+ # => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
109
+ # => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110
+ # => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
111
+ # => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
112
+ # => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113
+ # => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114
+ # => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115
+ # => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116
+ # => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
117
+ # => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118
+ # => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
119
+ # => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
120
+ # => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121
+ # => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122
+ # => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123
+ # => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
124
+ # => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125
+ # => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126
+ # => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127
+ # => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
128
+ # => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129
+ # => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130
+ # => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131
+ # => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132
+ # => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
133
+ # => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134
+ # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
135
+ # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136
+
137
# Tabulate SDRMs for the HIV PR region (codon numbering starts at 1).
#
# Each sequence in #dna_hash is translated in reading frame 0 and screened with
# ViralSeq::Sequence#sdrm(:hiv_pr).
# NOTE(review): each screening result is used here as a Hash of
# position => [wild_type, mutation]; confirm against ViralSeq::Sequence#sdrm.
#
# @param cutoff [Integer] counts below this value are labelled with "*"
# @return [Array] `[point_mutation_list, linkage_list, report_list]`;
#   three empty arrays when there are no sequences
def sdrm_hiv_pr(cutoff = 0)
  sequences = dna_hash
  n_seq = sequences.size
  # the per-position report reads the first translated sequence, so bail out early on empty input
  return [[], [], []] if n_seq.zero?

  region = "PR"
  rf_label = 0
  start_codon_number = 1

  mut = {}     # position => [wild type, [mutation seen in each carrying sequence]]
  mut_com = [] # one screening result per sequence, tallied below for linkage
  aa = {}      # sequence name => amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate(rf_label)
    aa[name] = s.aa_string
    record = s.sdrm(:hiv_pr)
    mut_com << record
    record.each do |position, mutation|
      (mut[position] ||= [mutation[0], []])[1] << mutation[1]
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    mut_list.count_freq.each do |m, number|
      ci = ViralSeq::Math::BinomCI.new(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of mutations, e.g. "D30N+N88D", or "WT" when there is none
  link2 = {}
  mut_com.count_freq.each do |mutations, count|
    pattern = mutations.empty? ? ['WT'] : mutations.map { |position, m| m[0] + position.to_s + m[1] }
    link2[pattern.join("+")] = count
  end
  # most frequent combination first
  linkage_list = link2.sort_by { |_pattern, count| count }.reverse.map do |pattern, count|
    ci = ViralSeq::Math::BinomCI.new(count, n_seq)
    label = count < cutoff ? "*" : ""
    [region, n_seq, pattern, count, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # percentage of every amino acid at every position; the first sequence sets the number of positions
  report_list = []
  aa_strings = aa.values
  aa_strings[0].size.times do |idx|
    counts = aa_strings.map { |aa_string| aa_string[idx] }.count_freq
    row = [region, start_codon_number + idx, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      row << (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    report_list << row
  end

  [point_mutation_list, linkage_list, report_list]
end
222
+
223
+
224
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV RT region.
225
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
226
+ # RT codon 34-122, 152-236, two regions are linked
227
+ # @param (see #sdrm_hiv_pr)
228
+ # @return (see #sdrm_hiv_pr)
229
+
230
# Tabulate SDRMs for the HIV RT region (two linked segments, codons 34-122 and 152-236).
#
# Every sequence in #dna_hash is split after its first 267 bases and a fixed
# filler sequence is inserted between the two parts before translation in
# reading frame 1. The first 89 and last 85 amino acids are then reported, and
# the joined sequence is screened with ViralSeq::Sequence#sdrm for :nrti and
# :nnrti mutations, numbered from codon 34.
# NOTE(review): each screening result is used here as a Hash of
# position => [wild_type, mutation]; confirm against ViralSeq::Sequence#sdrm.
#
# @param cutoff [Integer] counts below this value are labelled with "*"
# @return [Array] `[point_mutation_list, linkage_list, report_list]`;
#   three empty arrays when there are no sequences
def sdrm_hiv_rt(cutoff = 0)
  sequences = dna_hash
  n_seq = sequences.size
  # the per-position report reads the first translated sequence, so bail out early on empty input
  return [[], [], []] if n_seq.zero?

  region = "RT"
  rf_label = 1
  start_codon_number = 34
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  mut_nrti = {}  # position => [wild type, [mutation seen in each carrying sequence]]
  mut_nnrti = {} # same layout, for the :nnrti screen
  mut_com = []   # merged NRTI + NNRTI result per sequence, tallied below for linkage
  r1_aa = {}     # sequence name => first 89 amino acids
  r2_aa = {}     # sequence name => last 85 amino acids
  sequences.each do |name, seq|
    joined = seq[0, 267] + gap + seq[267..-1]
    s = ViralSeq::Sequence.new(name, joined)
    s.translate(rf_label)

    r1_aa[name] = s.aa_string[0, 89]
    r2_aa[name] = s.aa_string[-85..-1]
    nrti = s.sdrm(:nrti, start_codon_number)
    nnrti = s.sdrm(:nnrti, start_codon_number)
    mut_com << nrti.merge(nnrti)

    nrti.each do |position, mutation|
      (mut_nrti[position] ||= [mutation[0], []])[1] << mutation[1]
    end
    nnrti.each do |position, mutation|
      (mut_nnrti[position] ||= [mutation[0], []])[1] << mutation[1]
    end
  end

  # NRTI rows are appended before NNRTI rows, then everything is ordered by position
  point_mutation_list = []
  [["NRTI", mut_nrti], ["NNRTI", mut_nnrti]].each do |drug_class, mutations|
    mutations.each do |position, (wt, mut_list)|
      mut_list.count_freq.each do |m, number|
        ci = ViralSeq::Math::BinomCI.new(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
      end
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of mutations, or "WT" when there is none
  link2 = {}
  mut_com.count_freq.each do |mutations, count|
    pattern = mutations.empty? ? ['WT'] : mutations.map { |position, m| m[0] + position.to_s + m[1] }
    link2[pattern.join("+")] = count
  end
  # most frequent combination first
  linkage_list = link2.sort_by { |_pattern, count| count }.reverse.map do |pattern, count|
    ci = ViralSeq::Math::BinomCI.new(count, n_seq)
    label = count < cutoff ? "*" : ""
    [region, n_seq, pattern, count, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # percentage of every amino acid at every position: first segment from codon 34,
  # second segment from codon 152; the first sequence sets the number of positions
  report_list = []
  [[r1_aa.values, 34], [r2_aa.values, 152]].each do |aa_strings, first_codon|
    aa_strings[0].size.times do |idx|
      counts = aa_strings.map { |aa_string| aa_string[idx] }.count_freq
      row = [region, first_codon + idx, n_seq]
      ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
        row << (counts[amino_acid].to_f / n_seq * 100).round(4)
      end
      report_list << row
    end
  end

  [point_mutation_list, linkage_list, report_list]
end
359
+
360
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV IN region.
361
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
362
+ # IN codon 53-174
363
+ # @param (see #sdrm_hiv_pr)
364
+ # @return (see #sdrm_hiv_pr)
365
+
366
# Tabulate SDRMs for the HIV IN region (codon numbering starts at 53).
#
# Each sequence in #dna_hash is translated in reading frame 2 and screened with
# ViralSeq::Sequence#sdrm(:hiv_in, 53).
# NOTE(review): each screening result is used here as a Hash of
# position => [wild_type, mutation]; confirm against ViralSeq::Sequence#sdrm.
#
# @param cutoff [Integer] counts below this value are labelled with "*"
# @return [Array] `[point_mutation_list, linkage_list, report_list]`;
#   three empty arrays when there are no sequences
def sdrm_hiv_in(cutoff = 0)
  sequences = dna_hash
  n_seq = sequences.size
  # the per-position report reads the first translated sequence, so bail out early on empty input
  return [[], [], []] if n_seq.zero?

  region = "IN"
  rf_label = 2
  start_codon_number = 53

  mut = {}     # position => [wild type, [mutation seen in each carrying sequence]]
  mut_com = [] # one screening result per sequence, tallied below for linkage
  aa = {}      # sequence name => amino acid string
  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate(rf_label)
    aa[name] = s.aa_string
    record = s.sdrm(:hiv_in, start_codon_number)
    mut_com << record
    record.each do |position, mutation|
      (mut[position] ||= [mutation[0], []])[1] << mutation[1]
    end
  end

  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    mut_list.count_freq.each do |m, number|
      ci = ViralSeq::Math::BinomCI.new(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # name each distinct combination of mutations, or "WT" when there is none
  link2 = {}
  mut_com.count_freq.each do |mutations, count|
    pattern = mutations.empty? ? ['WT'] : mutations.map { |position, m| m[0] + position.to_s + m[1] }
    link2[pattern.join("+")] = count
  end
  # most frequent combination first
  linkage_list = link2.sort_by { |_pattern, count| count }.reverse.map do |pattern, count|
    ci = ViralSeq::Math::BinomCI.new(count, n_seq)
    label = count < cutoff ? "*" : ""
    [region, n_seq, pattern, count, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # percentage of every amino acid at every position; the first sequence sets the number of positions
  report_list = []
  aa_strings = aa.values
  aa_strings[0].size.times do |idx|
    counts = aa_strings.map { |aa_string| aa_string[idx] }.count_freq
    row = [region, start_codon_number + idx, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      row << (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    report_list << row
  end

  [point_mutation_list, linkage_list, report_list]
end
452
+
453
+ end # end of ViralSeq::SeqHash
454
+ end # end of ViralSeq