viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97c823fe653d6655cf938120d163befe805933463dfe339811e115e89f5aac3b
4
- data.tar.gz: df185d45cae565437b54f43a1a03b44186e3295c95732effe1383a32a3c916d9
3
+ metadata.gz: 9055ee4b893bdff77117a2a9c005166637c177b0ed243a5362488ccf7d893e76
4
+ data.tar.gz: 87faa7b60c47eecc6f1e3267d4f2a0df549dc70d935d8adabaf54994e60b8ab4
5
5
  SHA512:
6
- metadata.gz: 6d8652dc6bafe65d9dd42f46aad49c5d8f4a9788760575fd28586462e3781b8c47d618e6891539f91f7c9a0a782934b9b56e24220eac834dbb8d6ca1aeca4c50
7
- data.tar.gz: 1d36a265a5af049f563ec87bed6517d32e015c42108558e63de5fea272ebcf92f5262ff6ed96f1b2c31eb36e1a4aaa9beb00d40bf89a4f5d6ca4fe1809b1bbcb
6
+ metadata.gz: c5a3d9aab73cd1e8b696527392c6caaa0a4eec485fe0dbf38a7db456ddce115288f2ae735717ec9595cc4f732cb6afee8dca750b0ebfc703112a5df7196230ca
7
+ data.tar.gz: f0f040bb1c70f3569ae132023f367f945c408ba73d8d495976ceb0cc2538d7104a56f2009f89789eacbfe45921c017e40578fcb4ccd1df489f75d83d7b733a85
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (0.3.0)
4
+ viral_seq (1.0.0)
5
5
  muscle_bio (~> 0.4)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # viral_seq
1
+ # ViralSeq
2
2
 
3
3
  A Ruby Gem containing bioinformatics tools for processing viral NGS data.
4
4
 
@@ -15,6 +15,12 @@ Load all ViralSeq classes by requiring 'viral_seq.rb'
15
15
  #!/usr/bin/env ruby
16
16
  require 'viral_seq'
17
17
 
18
+ ## Updates
19
+
20
+ Version 1.0.0-07092019:
21
+
22
+ 1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
23
+
18
24
  ## Development
19
25
 
20
26
  Bug reports and pull requests are welcome on GitHub at https://github.com/ViralSeq/viral_seq. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -0,0 +1,16 @@
1
+ # additional functions for Class::Integer
2
+
3
class Integer
  # Factorial of the receiver, exposed through the unary `!` operator.
  #
  # NOTE: defining `!` here replaces Ruby's logical negation for Integers, so
  # `!n` / `not n` on an Integer yields a number (or nil) rather than `false`.
  #
  # @return [Integer, nil] the factorial of the receiver; 1 when the receiver
  #   is 0; nil when the receiver is negative (the range 1..self is empty)
  # @example factorial for 5
  #   !5
  #   => 120
  def !
    return 1 if zero?

    (1..self).reduce(:*)
  end
end
@@ -0,0 +1,7 @@
1
module ViralSeq

  # One-letter abbreviations of the 20 amino acids, followed by "*".
  # Frozen so that code iterating over this shared list cannot modify it
  # by accident.
  AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *].freeze

end
@@ -0,0 +1,132 @@
1
+ # additional statistic/math functions to Module::Enumerable
2
+ # @example median number
3
+ # array = [1,2,3,4,5,6,7,8,9,10]
4
+ # array.median
5
+ # => 5.5
6
+ # @example sum
7
+ # array = [1,2,3,4,5,6,7,8,9,10]
8
+ # array.sum
9
+ # => 55
10
+ # @example average number (mean)
11
+ # array = [1,2,3,4,5,6,7,8,9,10]
12
+ # array.mean
13
+ # => 5.5
14
+ # @example sample variance
15
+ # array = [1,2,3,4,5,6,7,8,9,10]
16
+ # array.sample_variance
17
+ # => 9.166666666666666
18
+ # @example standard deviation
19
+ # array = [1,2,3,4,5,6,7,8,9,10]
20
+ # array.stdev
21
+ # => 3.0276503540974917
22
+ # @example upper quartile
23
+ # array = [1,2,3,4,5,6,7,8,9,10]
24
+ # array.upper_quartile
25
+ # => 7.5
26
+ # @example lower_quartile
27
+ # array = [1,2,3,4,5,6,7,8,9,10]
28
+ # array.lower_quartile
29
+ # => 3.5
30
+ # @example count frequency of elements in an array
31
+ # array = %w{cat dog monkey cat cat cat monkey}
32
+ # array.count_freq
33
+ # => {"cat"=>4, "dog"=>1, "monkey"=>2}
34
+ # @example count frequency as percentage of elements in an array
35
+ # array = %w{cat dog monkey cat cat cat monkey}
36
+ # array.count_freq2
37
+ # => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
38
module Enumerable

  # generate median number
  # Works on any Enumerable (the sorted copy is an Array, so no `length`
  # method is required on the receiver).
  # @return [Numeric, nil] the middle element for an odd number of elements,
  #   the Float mean of the two middle elements for an even number,
  #   nil for an empty collection
  def median
    sorted = self.sort
    len = sorted.length
    return nil if len == 0

    mid = len / 2
    len.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]).to_f / 2
  end

  # generate summed value
  # Accepts the same optional start value and block as the built-in
  # Enumerable#sum, so callers that rely on those forms keep working.
  # @param init [Numeric] value the accumulation starts from
  # @yield [element] optional block; when given, the block results are summed
  # @return [Numeric] summed value
  def sum(init = 0)
    if block_given?
      self.inject(init) { |accum, i| accum + yield(i) }
    else
      self.inject(init) { |accum, i| accum + i }
    end
  end

  # generate mean number
  # @return [Float] mean value (NaN for an empty collection of numbers)
  def mean
    # `count` is defined by Enumerable itself, unlike `length`
    self.sum / self.count.to_f
  end

  # generate sample variance (divides by n - 1)
  # @return [Float] sample variance
  def sample_variance
    m = self.mean
    squared_dev = self.inject(0) { |accum, i| accum + (i - m)**2 }
    squared_dev / (self.count - 1).to_f
  end

  # generate standard deviation (square root of the sample variance)
  # @return [Float] standard deviation
  def stdev
    Math.sqrt(self.sample_variance)
  end

  # generate upper quartile value
  # Interpolates linearly between the two sorted elements surrounding the
  # (1-based) rank 0.75 * n.
  # @return [Numeric, nil] upper quartile value, nil for an empty collection
  def upper_quartile
    sorted = self.sort
    return nil if sorted.empty?

    rank = 0.25 * (3 * sorted.length)
    idx = rank.truncate
    # clamp both neighbours to the bounds of the array
    below = sorted[[idx - 1, 0].max]
    above = idx < sorted.length ? sorted[idx] : below
    below + ((above - below) * (rank - idx))
  end

  # generate lower quartile value
  # Interpolates linearly between the two sorted elements surrounding the
  # (1-based) rank 0.25 * n + 1.
  # @return [Numeric, nil] lower quartile value, nil for an empty collection
  def lower_quartile
    sorted = self.sort
    return nil if sorted.empty?

    rank = 0.25 * sorted.length + 1
    idx = rank.truncate
    # clamp both neighbours to the bounds of the array; for a single element
    # the rank (1.25) points past the end, so reuse the last element
    below = sorted[[idx - 1, 0].max]
    above = idx < sorted.length ? sorted[idx] : below
    below + ((above - below) * (rank - idx))
  end

  # tabulate elements and frequencies of an Enumerable
  # @return [Hash] a hash of :element => :freq_count; looking up an element
  #   that never occurred returns 0

  def count_freq
    self.each_with_object(Hash.new(0)) do |element, counts|
      counts[element] += 1
    end
  end

  # tabulate elements and frequencies (as fraction of all elements) of an Enumerable
  # @param decimal [Integer] decimals of frequency
  # @return [Hash] a hash of :element => :fraction, each rounded to `decimal`
  #   places; looking up an element that never occurred returns 0

  def count_freq2(decimal = 2)
    counts = Hash.new(0)
    self.each { |element| counts[element] += 1 }
    # total derived from the tally, so the receiver does not need `size`
    total_elements = counts.values.inject(0) { |accum, c| accum + c }
    fractions = Hash.new(0)
    counts.each do |key, value|
      fractions[key] = (value / total_elements.to_f).round(decimal)
    end
    fractions
  end

end
@@ -0,0 +1,45 @@
1
+ # addition methods for Class::Hash required for ViralSeq
2
+
3
class Hash

  # subtract one hash (h2) from the other (h1): every entry of the receiver
  # whose key is also present in other_hash is dropped (values are not compared)
  # @param other_hash [Hash] the hash whose keys are removed from the receiver
  # @return [Hash] a new hash after subtraction; the receiver is not modified
  # @example subtract h2 from h1 if the keys match
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)
  #   => {"Bird" => 2, "Snake" => 10}

  def difference(other_hash)
    reject { |key, _value| other_hash.key?(key) }
  end

  # return a new hash with the unique values of input hash as keys,
  # and the keys of the unique values of input hash in an array as values of the new hash
  #
  # Built in a single pass over the receiver. Values are grouped by Hash key
  # equality (eql?/hash), i.e. the same rule `values.uniq` applies.
  # @return [Hash] a new hash of :uniq_value_of_original_hash => :array_of_keys,
  #   ordered by first appearance of each value
  # @example
  #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
  #   hash.uniq_hash
  #   => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}

  def uniq_hash
    each_with_object({}) do |(key, value), grouped|
      (grouped[value] ||= []) << key
    end
  end
end
@@ -0,0 +1,454 @@
1
+
2
+ module ViralSeq
3
+ class SeqHash
4
+
5
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
7
+ # PR codon 1-99
8
+ # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
9
+ # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
10
+ # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
11
+ #
12
+ # # point_mutation_list: two-dimensional array for the following information,
13
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
14
+ # # linkage_list: two-dimensional array for the following information,
15
+ # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
16
+ # # report_list: two-dimensional array for the following information,
17
+ # # [position,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*]
18
+ # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
19
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
20
+ # p_cut_off = my_seqhash.pm
21
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
22
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
23
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
24
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
25
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
26
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
27
+ #
28
+ # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
30
+ # => PR,396,D30N+N88D,245,0.61869,0.56884,0.66674,
31
+ # => PR,396,WT,149,0.37626,0.32837,0.42602,
32
+ # => PR,396,D30N,1,0.00253,6.0e-05,0.01399,*
33
+ # => PR,396,D30N+I50V+N88D,1,0.00253,6.0e-05,0.01399,*
34
+ #
35
+ # puts "position,codon,tcs_number," + ViralSeq::AMINO_ACID_LIST.join(","); pr_sdrm[2].each {|n|puts n.join(",")}
36
+ # => position,codon,tcs_number,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
37
+ # => PR,1,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38
+ # => PR,2,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39
+ # => PR,3,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40
+ # => PR,4,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
41
+ # => PR,5,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42
+ # => PR,6,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
43
+ # => PR,7,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44
+ # => PR,8,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
45
+ # => PR,9,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46
+ # => PR,10,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47
+ # => PR,11,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
48
+ # => PR,12,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,62.1212,0.0,0.0,0.0,0.0
49
+ # => PR,13,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.1313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.8687,0.0,0.0,0.0
50
+ # => PR,14,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51
+ # => PR,15,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6263,0.0,0.0,0.0
52
+ # => PR,16,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53
+ # => PR,17,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54
+ # => PR,18,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.5051,0.0,0.0,0.0,0.0,0.0,0.0
55
+ # => PR,19,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56
+ # => PR,20,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57
+ # => PR,21,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58
+ # => PR,22,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59
+ # => PR,23,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60
+ # => PR,24,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61
+ # => PR,25,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62
+ # => PR,26,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
63
+ # => PR,27,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64
+ # => PR,28,396,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65
+ # => PR,29,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66
+ # => PR,30,396,0.0,0.0,37.6263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.3737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67
+ # => PR,31,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
68
+ # => PR,32,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
69
+ # => PR,33,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
70
+ # => PR,34,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71
+ # => PR,35,396,0.0,0.0,62.1212,37.6263,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72
+ # => PR,36,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
73
+ # => PR,37,396,0.0,0.0,37.8788,61.8687,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74
+ # => PR,38,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75
+ # => PR,39,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
76
+ # => PR,40,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77
+ # => PR,41,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0
78
+ # => PR,42,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
79
+ # => PR,43,396,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80
+ # => PR,44,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81
+ # => PR,45,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82
+ # => PR,46,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83
+ # => PR,47,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84
+ # => PR,48,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85
+ # => PR,49,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86
+ # => PR,50,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.0
87
+ # => PR,51,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88
+ # => PR,52,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89
+ # => PR,53,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90
+ # => PR,54,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91
+ # => PR,55,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92
+ # => PR,56,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
93
+ # => PR,57,396,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0
94
+ # => PR,58,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95
+ # => PR,59,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
96
+ # => PR,60,396,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97
+ # => PR,61,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98
+ # => PR,62,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99
+ # => PR,63,396,0.0,0.0,0.0,0.0,0.0,0.0,0.2525,0.0,0.0,37.8788,0.0,0.0,61.8687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100
+ # => PR,64,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101
+ # => PR,65,396,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102
+ # => PR,66,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103
+ # => PR,67,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104
+ # => PR,68,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105
+ # => PR,69,396,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106
+ # => PR,70,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107
+ # => PR,71,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,37.8788,0.0,0.0,0.0
108
+ # => PR,72,396,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.1212,0.0,0.0,0.0,0.0
109
+ # => PR,73,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110
+ # => PR,74,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
111
+ # => PR,75,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
112
+ # => PR,76,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113
+ # => PR,77,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114
+ # => PR,78,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115
+ # => PR,79,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116
+ # => PR,80,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
117
+ # => PR,81,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118
+ # => PR,82,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
119
+ # => PR,83,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.4949,0.0,0.0,0.0,0.5051,0.0,0.0,0.0,0.0,0.0
120
+ # => PR,84,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121
+ # => PR,85,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122
+ # => PR,86,396,0.0,0.0,0.0,0.5051,0.0,99.4949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123
+ # => PR,87,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
124
+ # => PR,88,396,0.0,0.0,62.1212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.8788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125
+ # => PR,89,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126
+ # => PR,90,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127
+ # => PR,91,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
128
+ # => PR,92,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129
+ # => PR,93,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130
+ # => PR,94,396,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131
+ # => PR,95,396,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132
+ # => PR,96,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
133
+ # => PR,97,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134
+ # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
135
+ # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136
+
137
def sdrm_hiv_pr(cutoff = 0)
  sequences = self.dna_hash
  # Nothing to analyse: return three empty lists instead of failing further
  # down when the first translated sequence is looked up.
  return [[], [], []] if sequences.empty?

  region = "PR"
  rf_label = 0
  start_codon_number = 1
  n_seq = sequences.size

  mut = {}      # position => [wild-type residue, [mutant residue seen in each sequence]]
  mut_com = []  # the SDRM record of every sequence, used for the linkage table
  aa = {}       # sequence name => translated amino acid string

  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate(rf_label)
    aa[name] = s.aa_string
    record = s.sdrm(:hiv_pr)
    mut_com << record
    record.each do |position, mutation|
      mut[position] ||= [mutation[0], []]
      mut[position][1] << mutation[1]
    end
  end

  # One row per (position, mutant residue) with its count and binomial CI.
  # Rows whose count is below `cutoff` are flagged with "*".
  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    mut_list.count_freq.each do |m, number|
      ci = ViralSeq::Math::BinomCI.new(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Linkage: identical per-sequence records are counted together and named
  # by joining their mutations with "+" ('WT' when a record has no mutation).
  link2 = {}
  mut_com.count_freq.each do |k, v|
    pattern = k.size == 0 ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end

  # most frequent linkage pattern first
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq::Math::BinomCI.new(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # Per-codon report: percentage of sequences carrying each amino acid.
  # The number of codons is taken from the first translated sequence.
  report_list = []
  aa_strings = aa.values
  aa_strings[0].size.times do |p|
    counts = aa_strings.map { |aa_string| aa_string[p] }.count_freq
    record = [region, start_codon_number + p, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      # a residue that never occurs yields nil/0, which to_f turns into 0.0
      record << (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    report_list << record
  end

  return [point_mutation_list, linkage_list, report_list]
end
222
+
223
+
224
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV RT region.
225
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
226
+ # RT codon 34-122, 152-236, two regions are linked
227
+ # @param (see #sdrm_hiv_pr)
228
+ # @return (see #sdrm_hiv_pr)
229
+
230
def sdrm_hiv_rt(cutoff = 0)
  sequences = self.dna_hash
  # Nothing to analyse: return three empty lists instead of failing further
  # down when the first translated sequence is looked up.
  return [[], [], []] if sequences.empty?

  region = "RT"
  rf_label = 1
  start_codon_number = 34
  second_region_start = 152
  # Fixed nucleotide filler inserted between the two sequenced stretches so
  # that codon numbering stays continuous.
  # NOTE(review): presumably reference sequence for the unsequenced codons
  # between the two regions - confirm against the protocol.
  gap = "AGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC"

  n_seq = sequences.size
  mut_nrti = {}   # position => [wild-type residue, [mutant residue seen in each sequence]]
  mut_nnrti = {}  # same layout, for the :nnrti records
  mut_com = []    # merged NRTI + NNRTI record of every sequence, for the linkage table
  r1_aa = {}      # sequence name => first 89 translated residues
  r2_aa = {}      # sequence name => last 85 translated residues

  sequences.each do |name, seq|
    # the first 267 nt and the remainder are joined around the filler
    r1 = seq[0, 267]
    r2 = seq[267..-1]
    s = ViralSeq::Sequence.new(name, r1 + gap + r2)
    s.translate(rf_label)

    r1_aa[name] = s.aa_string[0, 89]
    r2_aa[name] = s.aa_string[-85..-1]
    nrti = s.sdrm(:nrti, start_codon_number)
    nnrti = s.sdrm(:nnrti, start_codon_number)
    mut_com << (nrti.merge(nnrti))

    [[nrti, mut_nrti], [nnrti, mut_nnrti]].each do |record, tally|
      record.each do |position, mutation|
        tally[position] ||= [mutation[0], []]
        tally[position][1] << mutation[1]
      end
    end
  end

  # One row per (position, mutant residue), labelled by drug class rather
  # than by region. Rows whose count is below `cutoff` are flagged with "*".
  point_mutation_list = []
  [["NRTI", mut_nrti], ["NNRTI", mut_nnrti]].each do |drug_class, mut|
    mut.each do |position, (wt, mut_list)|
      mut_list.count_freq.each do |m, number|
        ci = ViralSeq::Math::BinomCI.new(number, n_seq)
        label = number < cutoff ? "*" : ""
        point_mutation_list << [drug_class, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
      end
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Linkage: identical per-sequence records are counted together and named
  # by joining their mutations with "+" ('WT' when a record has no mutation).
  link2 = {}
  mut_com.count_freq.each do |k, v|
    pattern = k.size == 0 ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end

  # most frequent linkage pattern first
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq::Math::BinomCI.new(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # Per-codon report for both stretches: percentage of sequences carrying
  # each amino acid. The number of codons per stretch is taken from the
  # first translated sequence.
  report_list = []
  [[r1_aa.values, start_codon_number], [r2_aa.values, second_region_start]].each do |aa_strings, first_codon|
    aa_strings[0].size.times do |p|
      counts = aa_strings.map { |aa_string| aa_string[p] }.count_freq
      record = [region, first_codon + p, n_seq]
      ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
        # a residue that never occurs yields nil/0, which to_f turns into 0.0
        record << (counts[amino_acid].to_f / n_seq * 100).round(4)
      end
      report_list << record
    end
  end

  return [point_mutation_list, linkage_list, report_list]
end
359
+
360
+ # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV IN region.
361
+ # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
362
+ # IN codon 53-174
363
+ # @param (see #sdrm_hiv_pr)
364
+ # @return (see #sdrm_hiv_pr)
365
+
366
def sdrm_hiv_in(cutoff = 0)
  sequences = self.dna_hash
  # Nothing to analyse: return three empty lists instead of failing further
  # down when the first translated sequence is looked up.
  return [[], [], []] if sequences.empty?

  region = "IN"
  rf_label = 2
  start_codon_number = 53
  n_seq = sequences.size

  mut = {}      # position => [wild-type residue, [mutant residue seen in each sequence]]
  mut_com = []  # the SDRM record of every sequence, used for the linkage table
  aa = {}       # sequence name => translated amino acid string

  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate(rf_label)
    aa[name] = s.aa_string
    record = s.sdrm(:hiv_in, start_codon_number)
    mut_com << record
    record.each do |position, mutation|
      mut[position] ||= [mutation[0], []]
      mut[position][1] << mutation[1]
    end
  end

  # One row per (position, mutant residue) with its count and binomial CI.
  # Rows whose count is below `cutoff` are flagged with "*".
  point_mutation_list = []
  mut.each do |position, (wt, mut_list)|
    mut_list.count_freq.each do |m, number|
      ci = ViralSeq::Math::BinomCI.new(number, n_seq)
      label = number < cutoff ? "*" : ""
      point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
    end
  end
  point_mutation_list.sort_by! { |row| row[2] }

  # Linkage: identical per-sequence records are counted together and named
  # by joining their mutations with "+" ('WT' when a record has no mutation).
  link2 = {}
  mut_com.count_freq.each do |k, v|
    pattern = k.size == 0 ? ['WT'] : k.map { |p, m| m[0] + p.to_s + m[1] }
    link2[pattern.join("+")] = v
  end

  # most frequent linkage pattern first
  linkage_list = []
  link2.sort_by { |_key, value| value }.reverse.each do |k, v|
    ci = ViralSeq::Math::BinomCI.new(v, n_seq)
    label = v < cutoff ? "*" : ""
    linkage_list << [region, n_seq, k, v, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # Per-codon report: percentage of sequences carrying each amino acid.
  # The number of codons is taken from the first translated sequence.
  report_list = []
  aa_strings = aa.values
  aa_strings[0].size.times do |p|
    counts = aa_strings.map { |aa_string| aa_string[p] }.count_freq
    record = [region, start_codon_number + p, n_seq]
    ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
      # a residue that never occurs yields nil/0, which to_f turns into 0.0
      record << (counts[amino_acid].to_f / n_seq * 100).round(4)
    end
    report_list << record
  end

  return [point_mutation_list, linkage_list, report_list]
end
452
+
453
+ end # end of ViralSeq::SeqHash
454
+ end # end of ViralSeq