viral_seq 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/viral_seq/a3g.rb +172 -0
- data/lib/viral_seq/fasta.rb +154 -0
- data/lib/viral_seq/hcv_dr.rb +54 -0
- data/lib/viral_seq/locator.rb +299 -0
- data/lib/viral_seq/math.rb +401 -0
- data/lib/viral_seq/misc.rb +103 -0
- data/lib/viral_seq/muscle.rb +89 -0
- data/lib/viral_seq/nt_variation.rb +148 -0
- data/lib/viral_seq/poisson_cutoff.rb +68 -0
- data/lib/viral_seq/refseq.rb +45 -0
- data/lib/viral_seq/sdrm_core.rb +652 -0
- data/lib/viral_seq/sequence.rb +392 -0
- data/lib/viral_seq/tcs_core.rb +556 -0
- data/lib/viral_seq/version.rb +6 -0
- data/lib/viral_seq.rb +41 -0
- data/viral_seq.gemspec +37 -0
- metadata +130 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
# viral_seq/hcv_dr
|
2
|
+
# HCV resistant mutation interpretation
|
3
|
+
# ViralSeq::hcv_ns5a
|
4
|
+
|
5
|
+
# ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
|
6
|
+
# # amino_acid_sequence_array is Array object of the amino acid sequence.
|
7
|
+
# # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
|
8
|
+
# # start_aa_position is the starting aa number of the input sequence as Integer
|
9
|
+
|
10
|
+
module ViralSeq
  # Report HCV NS5A resistance-associated substitutions present in an
  # amino acid sequence.
  #
  # aa_array - Array of amino acid Strings, one element per position. A
  #            mixture is written as residues joined by "/" (e.g. "M/T").
  # start_aa - Integer NS5A position of the first element of aa_array
  #            (default 1).
  #
  # Returns a Hash of {position => [wild_type_aa, observed_aa]} containing
  # only the positions at which a listed resistance residue was observed.
  # For a mixture, the position is reported when at least one member of
  # the mixture is a listed resistance residue.
  def self.hcv_ns5a(aa_array,start_aa=1)
    # position => [wild-type residue, resistance residues]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H','K','R','Q','A','S','D']],
      31  => ['L', ['M','V','F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D','P','S']],
      64  => ['T', ['A','S']],
      77  => ['P', ['A','S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N','H','Y']],
      92  => ['A', ['P','T','K','E']],
      93  => ['Y', ['C','F','H','N']],
      107 => ['K', ['T','S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      next unless sdrm.key?(position)

      wt_aa, resistant = sdrm[position]
      observed = test_aa.size == 1 ? [test_aa] : test_aa.split("/")
      # An Array intersection is truthy even when empty, so test for an
      # actual overlap instead of the intersection object itself.
      next unless observed.any? { |aa| resistant.include?(aa) }

      out_hash[position] = [wt_aa, test_aa]
    end
    out_hash
  end
end
|
@@ -0,0 +1,299 @@
|
|
1
|
+
# viral_seq/locator.rb
|
2
|
+
|
3
|
+
# Including following methods:
|
4
|
+
# ViralSeq::sequence_locator
|
5
|
+
# ViralSeq::sequence_clip
|
6
|
+
# ViralSeq::qc_hiv_seq_check
|
7
|
+
|
8
|
+
# HIV sequence locator function
|
9
|
+
# resembling HIV Sequence Locator from LANL
|
10
|
+
# https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
|
11
|
+
# require MUSCLE (http://www.drive5.com/muscle) installed
|
12
|
+
# current version only supports nucleotide sequence, not for amino acid sequence.
|
13
|
+
|
14
|
+
# =USAGE1
|
15
|
+
# # Find the location of a sequence
|
16
|
+
# ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
|
17
|
+
# # input_sequence: String of nucleotide sequence
|
18
|
+
# # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
|
19
|
+
# # path_to_muscle: path to the muscle executable.
|
20
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
21
|
+
# # specify path_to_muscle if other source of muscle needed
|
22
|
+
# # function returns an array of
|
23
|
+
# # start_location (Integer)
|
24
|
+
# # end_location (Integer)
|
25
|
+
# # percentage_of_similarity_to_reference_sequence (Float)
|
26
|
+
# # containing_indel? (Boolean)
|
27
|
+
# # aligned_input_sequence (String)
|
28
|
+
# # aligned_reference_sequence (String)
|
29
|
+
# # example code
|
30
|
+
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
31
|
+
# p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
|
32
|
+
# => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
|
33
|
+
|
34
|
+
# =USAGE2
|
35
|
+
# ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
|
36
|
+
# # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
|
37
|
+
# # return nil if the input sequence is not in the range
|
38
|
+
# # input_sequence: String of nucleotide sequence
|
39
|
+
# # start_position and end_position: Integer of the start and end reference number of the sub-sequence
|
40
|
+
# # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
|
41
|
+
# # path_to_muscle: path to the muscle executable.
|
42
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
43
|
+
# # specify path_to_muscle if other source of muscle needed
|
44
|
+
# # example code
|
45
|
+
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
46
|
+
# p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
|
47
|
+
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
48
|
+
|
49
|
+
# =USAGE3
|
50
|
+
# ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
|
51
|
+
# # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
|
52
|
+
# # and a boolean value for allowing indels,
|
53
|
+
# # path_to_muscle: path to the muscle executable.
|
54
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
55
|
+
# # specify path_to_muscle if other source of muscle needed
|
56
|
+
# # return a sequence sub-hash that meets the criteria
|
57
|
+
# # example code
|
58
|
+
# sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
|
59
|
+
# filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
|
60
|
+
# puts sequence_hash.size
|
61
|
+
# => 6
|
62
|
+
# puts filtered_sequence_hash.size
|
63
|
+
# => 4
|
64
|
+
|
65
|
+
module ViralSeq
|
66
|
+
|
67
|
+
# Locate a nucleotide sequence on a reference genome by repeated MUSCLE
# alignment against a progressively trimmed reference.
#
# seq            - String nucleotide sequence to locate.
# ref_option     - reference selector passed to ViralSeq.check_ref.
# path_to_muscle - passed through unchanged to ViralSeq.muscle_align.
#
# Returns [start, end, similarity_percent, indel?, aligned_seq, aligned_ref],
# or [0,0,0,0,0,0,0] when the computed window is empty, or nil (after
# printing diagnostics) when any step raises.
def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)
  # leading gaps / aligned core / trailing gaps of an aligned query
  flank_pattern = /^(\-*)(\w.*\w)(\-*)$/

  # For an aligned query string return
  # [leading gaps, trailing gaps, first run, first inner gap, last run, last inner gap].
  measure = lambda do |aligned|
    aligned =~ flank_pattern
    lead = Regexp.last_match(1).size
    trail = Regexp.last_match(3).size
    core = Regexp.last_match(2)
    core =~ /^(\w+)(\-*)\w/
    head_run = Regexp.last_match(1).size
    head_gap = Regexp.last_match(2).size
    core =~ /\w(\-*)(\w+)$/
    tail_run = Regexp.last_match(2).size
    tail_gap = Regexp.last_match(1).size
    [lead, trail, head_run, head_gap, tail_run, tail_gap]
  end

  # ViralSeq.check_muscle(path_to_muscle)
  ori_ref = ViralSeq.check_ref(ref_option)

  begin
    ori_ref_l = ori_ref.size
    l1 = 0 # bases trimmed from the 5' end of the original reference
    l2 = 0 # bases trimmed from the 3' end of the original reference

    # First pass: align against the whole reference.
    aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    aln_test =~ flank_pattern
    gap_begin = Regexp.last_match(1).size
    gap_end = Regexp.last_match(3).size
    aln_core = Regexp.last_match(2)
    ref = aln_seq[0]
    ref = ref[gap_begin..(-gap_end - 1)]
    ref_size = ref.size

    # The aligned span is much longer than the query: narrow the reference
    # around the longest uninterrupted ACGT run and align again.
    if ref_size > 1.3 * (seq.size)
      l1 += gap_begin
      l2 += gap_end
      max_seq = aln_core.scan(/[ACGT]+/).max_by(&:length)
      aln_core =~ /#{max_seq}/
      before_aln_seq = Regexp.last_match.pre_match
      post_aln_seq = Regexp.last_match.post_match
      before_aln = before_aln_seq.size
      post_aln = post_aln_seq.size
      b1 = (1.3 * before_aln_seq.scan(/[ACGT]+/).join("").size).to_i
      b2 = (1.3 * post_aln_seq.scan(/[ACGT]+/).join("").size).to_i

      long_head = before_aln > seq.size
      long_tail = post_aln > seq.size
      if long_head && long_tail
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
        l1 += before_aln - b1
        l2 += post_aln - b2
      elsif long_head
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
        l1 += before_aln - b1
      elsif long_tail
        ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
        l2 = l2 + post_aln - b2
      end

      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ flank_pattern
      gap_begin = Regexp.last_match(1).size
      gap_end = Regexp.last_match(3).size
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end - 1)]
    end

    gap_begin, gap_end, s1, g1, s2, g2 = measure.call(aln_seq[1])
    l1 += gap_begin
    l2 += gap_end
    realign = false

    # A short terminal fragment separated from the rest by a long gap is
    # treated as misplaced: drop the reference under it and realign.
    if g1 == g2 && (s1 + g1 + s2) == ref.size
      if s1 > s2 && g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      elsif s1 < s2 && g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
    else
      if g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
      if g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      end
    end

    while realign
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      gap_begin, gap_end, s1, g1, s2, g2 = measure.call(aln_seq[1])
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end - 1)]
      l1 += gap_begin
      l2 += gap_end
      realign = false
      if g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
      if g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      end
    end

    ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    ref = aln_seq[0]

    # refine alignment
    if ref =~ /^(\-+)/
      l1 -= Regexp.last_match(1).size
    elsif ref =~ /(\-+)$/
      l2 += Regexp.last_match(1).size
    end

    return [0,0,0,0,0,0,0] unless (ori_ref_l - l2 - 1) >= l1

    ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    ref = aln_seq[0]

    # Percentage of alignment columns where query and reference agree.
    ref_size = ref.size
    sim_count = (0..(ref_size - 1)).count { |n| ref[n] == aln_test[n] }
    similarity = (sim_count / ref_size.to_f * 100).round(1)

    loc_p1 = l1 + 1
    loc_p2 = ori_ref_l - l2
    indel = seq.size != (loc_p2 - loc_p1 + 1) || aln_test.include?("-")
    return [loc_p1, loc_p2, similarity, indel, aln_test, ref]
  rescue => e
    puts "Unexpected error occured."
    puts "Exception Class: #{ e.class.name }"
    puts "Exception Message: #{ e.message }"
    puts "Exception Backtrace: #{ e.backtrace[0] }"
    puts "ViralSeq.sequence_locator returns nil"
    return nil
  end
end
|
240
|
+
|
241
|
+
# sequence clip function
|
242
|
+
# Cut the part of +seq+ that corresponds to reference positions p1..p2.
#
# seq            - String nucleotide sequence.
# p1, p2         - Integer start and end positions on the reference.
# ref_option     - reference selector, passed to ViralSeq.sequence_locator.
# path_to_muscle - passed to ViralSeq.sequence_locator.
#
# Returns the clipped String with alignment gaps removed, or nil when the
# sequence could not be located or does not cover p1..p2.
def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # The locator returns nil after an internal error and an all-zero array
  # when no window was found; neither carries an alignment to clip.
  return nil if loc.nil?

  l1 = loc[0]
  l2 = loc[1]
  aligned_seq = loc[4]
  aligned_ref = loc[5]
  return nil unless aligned_seq.is_a?(String) && aligned_ref.is_a?(String)
  return nil unless p1 >= l1 && p2 <= l2

  # Walk in from the 5' end until the reference coordinate reaches p1;
  # gap columns in the reference consume a column but no coordinate.
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # Same walk from the 3' end; g2 starts at 1 so that -g2 indexes the
  # last kept column.
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  aligned_seq[g1..(-g2)].tr("-","")
end
|
266
|
+
|
267
|
+
# batch quality check of HIV sequences based on ViralSeq.sequence_locator
|
268
|
+
# input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
|
269
|
+
# and allow the sequence to contain indels
|
270
|
+
# return a hash of filtered sequences
|
271
|
+
|
272
|
+
# Keep only the sequences whose located start and end positions fall in
# the accepted ranges.
#
# seq_hash       - Hash of {sequence_name => sequence String}; not modified.
# start_nt       - Integer, Array or Range of accepted start positions.
# end_nt         - Integer, Array or Range of accepted end positions.
# indel          - when false, sequences the locator flags as containing
#                  indels are rejected (default true).
# ref_option     - reference selector, passed to ViralSeq.sequence_locator.
# path_to_muscle - passed to ViralSeq.sequence_locator.
#
# Returns a new Hash of {sequence_name => sequence} for the passing entries.
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Locate each distinct sequence once, then fan the verdict out to every
  # name sharing it. The caller's hash is only read, never mutated.
  names_by_seq = seq_hash.keys.group_by { |seq_name| seq_hash[seq_name] }

  seq_pass = {}
  names_by_seq.each do |seq, seq_names|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    next if loc.nil? # locator failed for this sequence
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
    next unless indel || loc[3] == false

    seq_names.each { |seq_name| seq_pass[seq_name] = seq }
  end
  seq_pass
end
|
298
|
+
|
299
|
+
end
|
@@ -0,0 +1,401 @@
|
|
1
|
+
# lib/math.rb
|
2
|
+
|
3
|
+
# math and statistic functions
|
4
|
+
# including the following methods
|
5
|
+
# ViralSeq::count
|
6
|
+
# ViralSeq::count_percentage
|
7
|
+
# ViralSeq::poisson_distribution
|
8
|
+
# ViralSeq::r_binom_CI
|
9
|
+
# Enumerable#median
|
10
|
+
# Enumerable#sum
|
11
|
+
# Enumerable#mean
|
12
|
+
# Enumerable#sample_variance
|
13
|
+
# Enumerable#stdev
|
14
|
+
# Enumerable#upper_quartile
|
15
|
+
# Enumerable#lower_quartile
|
16
|
+
# Integer#!
|
17
|
+
# Rubystats::FishersExactTest
|
18
|
+
# RandomGaussian::new
|
19
|
+
# RandomGaussian#rand
|
20
|
+
|
21
|
+
module ViralSeq
|
22
|
+
|
23
|
+
# count elements in a array, return a hash of {:element1 => number1, :element2 => number2, ...}
|
24
|
+
# =Usage
|
25
|
+
# array = %w{cat dog monkey cat cat cat monkey}
|
26
|
+
# ViralSeq.count(array)
|
27
|
+
# => {"cat"=>4, "dog"=>1, "monkey"=>2}
|
28
|
+
|
29
|
+
# Tally how many times each element occurs in +array+.
# Returns a Hash of {element => occurrences}; looking up an element that
# was never seen yields 0.
def self.count(array)
  array.each_with_object(Hash.new(0)) { |element, tally| tally[element] += 1 }
end
|
36
|
+
|
37
|
+
# count elements in a array, return a hash of {:element1 => frequency1, :element2 => frequency2, ...}
|
38
|
+
# default decimal as 2
|
39
|
+
# =Usage
|
40
|
+
# array = %w{cat dog monkey cat cat cat monkey}
|
41
|
+
# ViralSeq.count_percentage(array)
|
42
|
+
# => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
|
43
|
+
|
44
|
+
# Relative frequency of each element in +array+, rounded to +decimal+
# places. Returns a Hash of {element => frequency}; looking up an element
# that was never seen yields 0.
def self.count_percentage(array,decimal = 2)
  total = array.size.to_f
  occurrences = Hash.new(0)
  array.each { |element| occurrences[element] += 1 }

  occurrences.each_with_object(Hash.new(0)) do |(element, seen), frequencies|
    frequencies[element] = (seen / total).round(decimal)
  end
end
|
56
|
+
|
57
|
+
# poisson distribution. input lambda and maximum k, return a hash with keys as k
|
58
|
+
# default k value is 5, meaning calculate up to 5 events.
|
59
|
+
#
|
60
|
+
# Poisson Distribution (https://en.wikipedia.org/wiki/Poisson_distribution)
|
61
|
+
# An event can occur 0, 1, 2, … times in an interval.
|
62
|
+
# The average number of events in an interval is designated λ (lambda).
|
63
|
+
# λ is the event rate, also called the rate parameter.
|
64
|
+
# The probability of observing k events in an interval is given by the equation
|
65
|
+
#
|
66
|
+
# P(k events in interval) = e^(-λ) * λ^k / k!
|
67
|
+
#
|
68
|
+
# λ is the average number of events per interval
|
69
|
+
# e is the number 2.71828... (Euler's number) the base of the natural logarithms
|
70
|
+
# k takes values 0, 1, 2, …
|
71
|
+
# k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
|
72
|
+
#
|
73
|
+
# =USAGE
|
74
|
+
# # We assume the mutation rate is 0.005 (event rate λ),
|
75
|
+
# # we would like to calculate the probability of 3 mutations on one sequence
|
76
|
+
# prob_hash = ViralSeq::poisson_distribution(0.005)
|
77
|
+
# => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
|
78
|
+
# prob_hash[3]
|
79
|
+
# => 2.072942664984755e-08
|
80
|
+
|
81
|
+
# Poisson probabilities P(n events) = e^(-rate) * rate^n / n! for n = 0..k.
#
# rate - Numeric event rate (lambda).
# k    - Integer largest event count to evaluate (default 5).
#
# Returns a Hash of {n => probability}.
def self.poisson_distribution(rate,k = 5)
  out_hash = {}
  # Running n! kept as an exact Integer, so the method does not depend on
  # a redefined Integer#! being loaded.
  factorial = 1
  (0..k).each do |n|
    factorial *= n if n > 0
    out_hash[n] = (rate**n * Math::E**(-rate)) / factorial
  end
  out_hash
end
|
89
|
+
|
90
|
+
|
91
|
+
# require R pre-installed
|
92
|
+
# calculate binomial 95% confidence intervals by R. refer to R function binom.test
|
93
|
+
# input number x and n, return an array as [lower_interval, upper_interval]
|
94
|
+
#
|
95
|
+
# =USAGE
|
96
|
+
# # mutation M184V found in 3 out of 923 sequences, the 95% confidence interval is
|
97
|
+
# ViralSeq.r_binom_CI(3, 923)
|
98
|
+
# => [0.02223, 0.19234]
|
99
|
+
#
|
100
|
+
# Binomial confidence interval computed by R's binom.test (Rscript must be
# on the PATH).
#
# x - number of successes; anything Integer() accepts (a Float is truncated).
# n - number of trials; anything Integer() accepts (a Float is truncated).
#
# Returns [lower, upper], each rounded to 5 decimals.
# Raises ArgumentError/TypeError when x or n is not integer-like.
def self.r_binom_CI(x= 0, n= 0)
  # Both values are interpolated into a shell command line; coercing them
  # to Integer first guarantees only digits (and a sign) can reach it.
  x = Integer(x)
  n = Integer(n)
  r_output = `Rscript -e 'binom.test(#{x},#{n})$conf.int[1];binom.test(#{x},#{n})$conf.int[2]'`
  lines = r_output.split "\n"
  # Each output line is expected to look like "[1] 0.0006703"; drop the
  # 4-character "[1] " prefix before parsing.
  low = lines[0].chomp[4..-1].to_f
  high = lines[1].chomp[4..-1].to_f
  return [low.round(5), high.round(5)]
end
|
107
|
+
|
108
|
+
end
|
109
|
+
|
110
|
+
# statistic methods
|
111
|
+
# :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
|
112
|
+
# =USAGE
|
113
|
+
# array = [1,2,3,4,5,6,7,8,9,10]
|
114
|
+
# array.median
|
115
|
+
# => 5.5
|
116
|
+
# array.sum
|
117
|
+
# => 55
|
118
|
+
# array.mean
|
119
|
+
# => 5.5
|
120
|
+
# array.sample_variance
|
121
|
+
# => 9.166666666666666
|
122
|
+
# array.stdev
|
123
|
+
# => 3.0276503540974917
|
124
|
+
# array.upper_quartile
|
125
|
+
# => 7.5
|
126
|
+
# array.lower_quartile
|
127
|
+
# => 3.5
|
128
|
+
|
129
|
+
# Descriptive statistics mixed into every Enumerable.
# Elements are expected to be numeric.
module Enumerable
  # Middle value of the sorted elements (mean of the two middle values for
  # an even count). Returns nil for an empty collection.
  def median
    sorted = sort
    len = sorted.size
    return nil if len.zero?
    len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
  end

  # Sum of the elements, added left to right starting from +init+.
  # Accepts the same optional initial value and block as the built-in
  # Enumerable#sum so code relying on that signature keeps working.
  def sum(init = 0)
    if block_given?
      inject(init) { |accum, i| accum + yield(i) }
    else
      inject(init) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN for an empty collection).
  def mean
    sum / count.to_f
  end

  # Unbiased sample variance (divides by count - 1).
  def sample_variance
    m = mean
    squares = inject(0) { |accum, i| accum + (i - m)**2 }
    squares / (count - 1).to_f
  end

  # Sample standard deviation.
  def stdev
    Math.sqrt(sample_variance)
  end

  # Third quartile, linearly interpolated between the two sorted elements
  # around rank 0.75 * count. Returns nil for an empty collection.
  def upper_quartile
    sorted_array = sort
    return nil if sorted_array.empty?
    u = (0.25*(3*sorted_array.length))
    rank = u.truncate
    sample = sorted_array[rank - 1]
    sample1 = sorted_array[rank]
    sample + ((sample1 - sample) * (u - rank))
  end

  # First quartile, linearly interpolated between the two sorted elements
  # around rank 0.25 * count + 1. Returns nil for an empty collection.
  def lower_quartile
    sorted_array = sort
    return nil if sorted_array.empty?
    u = 0.25*sorted_array.length + 1
    rank = u.truncate
    sample = sorted_array[rank - 1]
    # With a single element the upper neighbour does not exist; clamp to
    # the last element instead of reading past the end.
    sample1 = sorted_array[[rank, sorted_array.length - 1].min]
    sample + ((sample1 - sample) * (u - rank))
  end
end
|
180
|
+
|
181
|
+
# factorial method for an Integer
|
182
|
+
# Integer.!
|
183
|
+
class Integer
  # Factorial of the receiver: 0.! is 1, n.! is 1 * 2 * ... * n.
  # A negative receiver yields nil (the range 1..self is empty).
  #
  # NOTE(review): this replaces Integer's logical-not operator, so `!n`
  # on an Integer also returns the factorial rather than false.
  def !
    zero? ? 1 : (1..self).inject(:*)
  end
end
|
192
|
+
|
193
|
+
|
194
|
+
# Fisher's Exact Test Function Library
|
195
|
+
#
|
196
|
+
# Based on JavaScript version created by: Oyvind Langsrud
|
197
|
+
# Ported to Ruby by Bryan Donovan
|
198
|
+
|
199
|
+
module Rubystats
  # Fisher's exact test for a 2x2 contingency table.
  # The instance caches the last hypergeometric term so neighbouring terms
  # can be derived by a ratio instead of being recomputed.
  class FishersExactTest

    def initialize
      # cached table parameters and the probability computed for them
      @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0
      # tail accumulators filled in by #exact
      @sleft = @sright = @sless = @slarg = 0.0
      @left = @right = @twotail = 0.0
    end

    # Natural log of the gamma function (Lanczos approximation).
    # Reference: "Lanczos, C. 'A precision approximation
    # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
    # Translation of Alan Miller's FORTRAN-implementation
    # See http://lib.stat.cmu.edu/apstat/245
    def lngamm(z)
      # [coefficient, shift] pairs, accumulated in this exact order
      terms = [
        [0.0000001659470187408462, 7],
        [0.000009934937113930748, 6],
        [-0.1385710331296526, 5],
        [12.50734324009056, 4],
        [-176.6150291498386, 3],
        [771.3234287757674, 2],
        [-1259.139216722289, 1],
        [676.5203681218835, 0]
      ]
      series = terms.inject(0) { |acc, (coefficient, shift)| acc + coefficient / (z + shift) }
      series += 0.9999999999995183

      Math.log(series) - 5.58106146679532777 - z + (z - 0.5) * Math.log(z + 6.5)
    end

    # ln(n!)
    def lnfact(n)
      n <= 1 ? 0 : lngamm(n + 1)
    end

    # ln of the binomial coefficient C(n, k)
    def lnbico(n,k)
      lnfact(n) - lnfact(k) - lnfact(n - k)
    end

    # Hypergeometric probability of n11 given row total n1_, column total
    # n_1 and grand total n.
    def hyper_323(n11, n1_, n_1, n)
      Math.exp(lnbico(n1_, n11) + lnbico(n - n1_, n_1 - n11) - lnbico(n, n_1))
    end

    # Probability of n11 for the table parameters cached by the last full
    # hyper0 call.
    def hyper(n11)
      hyper0(n11, 0, 0, 0)
    end

    # With all three totals zero, reuse the cached totals and, when n11i is
    # adjacent to the cached n11, update the cached probability by a ratio.
    # Otherwise (or when n11i is a multiple of 10) compute it directly.
    def hyper0(n11i,n1_i,n_1i,ni)
      reuse_cached_totals = n1_i == 0 && n_1i == 0 && ni == 0
      if reuse_cached_totals
        if n11i % 10 != 0
          if n11i == @sn11 + 1
            @sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
            @sn11 = n11i
            return @sprob
          elsif n11i == @sn11 - 1
            @sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
            @sn11 = n11i
            return @sprob
          end
        end
        @sn11 = n11i
      else
        @sn11, @sn1_, @sn_1, @sn = n11i, n1_i, n_1i, ni
      end
      @sprob = hyper_323(@sn11, @sn1_, @sn_1, @sn)
    end

    # Fill the tail accumulators for the observed n11 and return the
    # probability of the observed table.
    def exact(n11,n1_,n_1,n)
      max = n_1 < n1_ ? n_1 : n1_
      lowest = n1_ + n_1 - n
      min = lowest < 0 ? 0 : lowest

      if min == max
        @sless = @sright = @sleft = @slarg = 1
        return 1
      end

      prob = hyper0(n11, n1_, n_1, n)

      # accumulate from the low end while terms are rarer than the observed one
      @sleft = 0
      p = hyper(min)
      i = min + 1
      while p < (0.99999999 * prob)
        @sleft += p
        p = hyper(i)
        i += 1
      end
      i -= 1
      if p < (1.00000001*prob)
        @sleft += p
      else
        i -= 1
      end

      # same from the high end
      @sright = 0
      p = hyper(max)
      j = max - 1
      while p < (0.99999999 * prob)
        @sright += p
        p = hyper(j)
        j -= 1
      end
      j += 1
      if p < (1.00000001*prob)
        @sright += p
      else
        j += 1
      end

      if (i - n11).abs < (j - n11).abs
        @sless = @sleft
        @slarg = 1 - @sleft + prob
      else
        @sless = 1 - @sright + prob
        @slarg = @sright
      end
      prob
    end

    # Run the test on the four cell counts of a 2x2 table (negative counts
    # are sign-flipped). Returns {:left, :right, :twotail} p-values.
    def calculate(n11_,n12_,n21_,n22_)
      n11_, n12_, n21_, n22_ = [n11_, n12_, n21_, n22_].map do |cell|
        cell < 0 ? cell * -1 : cell
      end
      row_total = n11_ + n12_
      column_total = n11_ + n21_
      grand_total = n11_ + n12_ + n21_ + n22_
      exact(n11_, row_total, column_total, grand_total)

      twotail = @sleft + @sright
      twotail = 1 if twotail > 1
      { :left => @sless, :right => @slarg, :twotail => twotail }
    end
  end
end
|
364
|
+
|
365
|
+
|
366
|
+
# generate values from the standard normal distribution with given mean and standard deviation
|
367
|
+
# See http://en.wikipedia.org/wiki/Box-Muller_transform
|
368
|
+
#
|
369
|
+
# RandomGaussian.new(mean, sd, rng)
|
370
|
+
# # generate RandomGaussian instance with given mean and standard deviation
|
371
|
+
# # default value: mean = 0.0, sd = 1.0
|
372
|
+
#
|
373
|
+
# RandomGaussian.rand
|
374
|
+
# # generate a random number that falls in the pre-defined gaussian distribution
|
375
|
+
# =USAGE
|
376
|
+
# # example
|
377
|
+
# a = RandomGaussian.new
|
378
|
+
# a.rand
|
379
|
+
# numbers = []
|
380
|
+
# 10.times {numbers << a.rand.round(5)}
|
381
|
+
# numbers
|
382
|
+
# [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
|
383
|
+
|
384
|
+
|
385
|
+
# Normally distributed random numbers via the Box-Muller transform.
class RandomGaussian
  # mean - centre of the distribution (default 0.0)
  # sd   - standard deviation (default 1.0)
  # rng  - callable returning a uniform random number; defaults to Kernel.rand
  def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
    @mean = mean
    @sd = sd
    @rng = rng
    @compute_next_pair = false
  end

  # Return the next value. Every other call draws two uniform numbers and
  # produces a pair; the call in between hands out the stored second value.
  def rand
    @compute_next_pair = !@compute_next_pair
    return @g1 unless @compute_next_pair

    angle = 2 * Math::PI * @rng.call
    radius = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
    @g1 = @mean + radius * Math.sin(angle)
    @g0 = @mean + radius * Math.cos(angle)
  end
end
|