viral_seq 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/viral_seq/a3g.rb +172 -0
- data/lib/viral_seq/fasta.rb +154 -0
- data/lib/viral_seq/hcv_dr.rb +54 -0
- data/lib/viral_seq/locator.rb +299 -0
- data/lib/viral_seq/math.rb +401 -0
- data/lib/viral_seq/misc.rb +103 -0
- data/lib/viral_seq/muscle.rb +89 -0
- data/lib/viral_seq/nt_variation.rb +148 -0
- data/lib/viral_seq/poisson_cutoff.rb +68 -0
- data/lib/viral_seq/refseq.rb +45 -0
- data/lib/viral_seq/sdrm_core.rb +652 -0
- data/lib/viral_seq/sequence.rb +392 -0
- data/lib/viral_seq/tcs_core.rb +556 -0
- data/lib/viral_seq/version.rb +6 -0
- data/lib/viral_seq.rb +41 -0
- data/viral_seq.gemspec +37 -0
- metadata +130 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
# viral_seq/hcv_dr
|
2
|
+
# HCV resistant mutation interpretation
|
3
|
+
# ViralSeq::hcv_ns5a
|
4
|
+
|
5
|
+
# ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
|
6
|
+
# # amino_acid_sequence_array is Array object of the amino acid sequence.
|
7
|
+
# # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
|
8
|
+
# # start_aa_position is the starting aa number of the input sequence as Integer
|
9
|
+
|
10
|
+
module ViralSeq
  # Positions checked by ViralSeq.hcv_ns5a.
  # Each entry maps an amino acid position to
  # [wild-type residue, [residues reported as mutations at that position]].
  HCV_NS5A_SDRM = {
    28  => ['M', ['T']],
    30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
    31  => ['L', ['M', 'V', 'F']],
    32  => ['P', ['L']],
    44  => ['K', ['R']],
    58  => ['H', ['D', 'P', 'S']],
    64  => ['T', ['A', 'S']],
    77  => ['P', ['A', 'S']],
    78  => ['R', ['K']],
    79  => ['T', ['A']],
    83  => ['T', ['M']],
    85  => ['S', ['N', 'H', 'Y']],
    92  => ['A', ['P', 'T', 'K', 'E']],
    93  => ['Y', ['C', 'F', 'H', 'N']],
    107 => ['K', ['T', 'S']],
    121 => ['I', ['V']],
    135 => ['T', ['A']]
  }.freeze

  # Report the listed mutations found in an amino acid sequence.
  #
  # aa_array: Array of amino acid calls, one String per position. A call is
  #           either a single residue ("L") or a mixture joined by "/" ("L/M").
  # start_aa: position number of the first element of aa_array (default 1).
  #
  # Returns a Hash of {position => [wild_type_residue, observed_call]} for every
  # listed position whose call is, or contains, one of the listed mutations.
  def self.hcv_ns5a(aa_array, start_aa = 1)
    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      wt_aa, mutations = HCV_NS5A_SDRM[position]
      next unless wt_aa

      if test_aa.size == 1
        out_hash[position] = [wt_aa, test_aa] if mutations.include?(test_aa)
      elsif (test_aa.split("/") & mutations).any?
        # An Array intersection is truthy even when empty, so the mixture must
        # be tested for actually sharing a residue with the mutation list.
        out_hash[position] = [wt_aa, test_aa]
      end
    end
    out_hash
  end
end
|
@@ -0,0 +1,299 @@
|
|
1
|
+
# viral_seq/locator.rb
|
2
|
+
|
3
|
+
# Including following methods:
|
4
|
+
# ViralSeq::sequence_locator
|
5
|
+
# ViralSeq::sequence_clip
|
6
|
+
# ViralSeq::qc_hiv_seq_check
|
7
|
+
|
8
|
+
# HIV sequence locator function
|
9
|
+
# resembling HIV Sequence Locator from LANL
|
10
|
+
# https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
|
11
|
+
# require MUSCLE (http://www.drive5.com/muscle) installed
|
12
|
+
# current version only supports nucleotide sequence, not for amino acid sequence.
|
13
|
+
|
14
|
+
# =USAGE1
|
15
|
+
# # Find the location of a sequence
|
16
|
+
# ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
|
17
|
+
# # input_sequence: String of nucleotide sequence
|
18
|
+
# # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
|
19
|
+
# # path_to_muscle: path to the muscle executable.
|
20
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
21
|
+
# # specify path_to_muscle if other source of muscle needed
|
22
|
+
# # function returns an array of
|
23
|
+
# # start_location (Integer)
|
24
|
+
# # end_location (Integer)
|
25
|
+
# # percentage_of_similarity_to_reference_sequence (Float)
|
26
|
+
# # containing_indel? (Boolean)
|
27
|
+
# # aligned_input_sequence (String)
|
28
|
+
# # aligned_reference_sequence (String)
|
29
|
+
# # example code
|
30
|
+
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
31
|
+
# p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
|
32
|
+
# => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
|
33
|
+
|
34
|
+
# =USAGE2
|
35
|
+
# ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
|
36
|
+
# # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
|
37
|
+
# # return nil if the input sequence is not in the range
|
38
|
+
# # input_sequence: String of nucleotide sequence
|
39
|
+
# # start_position and end_position: Integer of the start and end reference number of the sub-sequence
|
40
|
+
# # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
|
41
|
+
# # path_to_muscle: path to the muscle executable.
|
42
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
43
|
+
# # specify path_to_muscle if other source of muscle needed
|
44
|
+
# # example code
|
45
|
+
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
46
|
+
# p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
|
47
|
+
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
48
|
+
|
49
|
+
# =USAGE3
|
50
|
+
# ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
|
51
|
+
# # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
|
52
|
+
# # and a boolean value for allowing indels,
|
53
|
+
# # path_to_muscle: path to the muscle executable.
|
54
|
+
# # Default as :false, will call MuscleBio to run Muscle
|
55
|
+
# # specify path_to_muscle if other source of muscle needed
|
56
|
+
# # return a sequence sub-hash that meets the criteria
|
57
|
+
# # example code
|
58
|
+
# sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
|
59
|
+
# filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
|
60
|
+
# puts sequence_hash.size
|
61
|
+
# => 6
|
62
|
+
# puts filtered_sequence_hash.size
|
63
|
+
# => 4
|
64
|
+
|
65
|
+
module ViralSeq

  # Locate a nucleotide sequence on a reference genome by repeated alignment.
  #
  # seq:            String, the nucleotide sequence to locate.
  # ref_option:     passed to ViralSeq.check_ref to obtain the reference String.
  # path_to_muscle: passed through unchanged to ViralSeq.muscle_align.
  #
  # Returns [start_location, end_location, similarity_percent, indel?,
  # aligned_input_sequence, aligned_reference_sequence] on success,
  # an Array of seven zeros when the trimmed reference window is empty,
  # and nil (after printing the error to stdout) when an exception is raised.
  def self.sequence_locator(seq = '', ref_option = :HXB2, path_to_muscle = false)

    # ViralSeq.check_muscle(path_to_muscle)
    ori_ref = ViralSeq.check_ref(ref_option)

    begin
      ori_ref_l = ori_ref.size
      # l1 / l2 accumulate how many reference bases are trimmed from the
      # start / end; the final window is ori_ref[l1..(ori_ref_l - l2 - 1)].
      l1 = 0
      l2 = 0

      aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      # $1 / $3 are the leading / trailing gap runs of the aligned query.
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test2 = $2
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
      ref_size = ref.size
      if ref_size > 1.3 * seq.size
        # The aligned query spans much more reference than its own length:
        # anchor on its longest ungapped run and re-align against a narrower window.
        l1 = l1 + gap_begin
        l2 = l2 + gap_end
        max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
        aln_test2 =~ /#{max_seq}/
        before_aln_seq = $`
        before_aln = $`.size
        post_aln_seq = $'
        post_aln = $'.size
        before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
        b1 = (1.3 * before_aln_seq_size).to_i
        post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
        b2 = (1.3 * post_aln_seq_size).to_i
        if (before_aln > seq.size) && (post_aln <= seq.size)
          ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
          l1 = l1 + (before_aln - b1)
        elsif (post_aln > seq.size) && (before_aln <= seq.size)
          ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
          l2 = l2 + post_aln - b2
        elsif (post_aln > seq.size) && (before_aln > seq.size)
          ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
          l1 = l1 + (before_aln - b1)
          l2 = l2 + (post_aln - b2)
        end

        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
        gap_begin = $1.size
        gap_end = $3.size
        ref = aln_seq[0]
        ref = ref[gap_begin..(-gap_end-1)]
      end

      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test = $2
      # s1 / g1: first run of bases of the query and the gap run after it.
      aln_test =~ /^(\w+)(\-*)\w/
      s1 = $1.size
      g1 = $2.size
      # s2 / g2: last run of bases of the query and the gap run before it.
      aln_test =~ /\w(\-*)(\w+)$/
      s2 = $2.size
      g2 = $1.size

      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      repeat = 0

      # A short flanking run separated from the rest by a gap more than twice
      # its length is cut off the reference window, and the alignment repeated.
      if g1 == g2 && (s1 + g1 + s2) == ref.size
        if s1 > s2 && g2 > 2 * s2
          ref = ref[0..(-g2-1)]
          repeat = 1
          l2 = l2 + g2
        elsif s1 < s2 && g1 > 2 * s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
      else
        if g1 > 2 * s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
        if g2 > 2 * s2
          ref = ref[0..(-g2 - 1)]
          repeat = 1
          l2 = l2 + g2
        end
      end

      while repeat == 1
        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
        gap_begin = $1.size
        gap_end = $3.size
        aln_test = $2
        aln_test =~ /^(\w+)(\-*)\w/
        s1 = $1.size
        g1 = $2.size
        aln_test =~ /\w(\-*)(\w+)$/
        s2 = $2.size
        g2 = $1.size
        ref = aln_seq[0]
        ref = ref[gap_begin..(-gap_end-1)]
        l1 = l1 + gap_begin
        l2 = l2 + gap_end
        repeat = 0
        if g1 > 2 * s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
        if g2 > 2 * s2
          ref = ref[0..(-g2 - 1)]
          repeat = 1
          l2 = l2 + g2
        end
      end
      ref = ori_ref[l1..(ori_ref_l - l2 - 1)]

      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      ref = aln_seq[0]

      # refine alignment
      if ref =~ /^(\-+)/
        l1 = l1 - $1.size
      elsif ref =~ /(\-+)$/
        l2 = l2 + $1.size
      end
      # A query that overhangs the start of the reference would push l1 below
      # zero; a negative range start would then index from the END of ori_ref.
      l1 = 0 if l1 < 0

      if (ori_ref_l - l2 - 1) >= l1
        ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        ref = aln_seq[0]

        ref_size = ref.size
        sim_count = (0...ref_size).count { |n| ref[n] == aln_test[n] }
        similarity = (sim_count / ref_size.to_f * 100).round(1)

        loc_p1 = l1 + 1
        loc_p2 = ori_ref_l - l2
        # An indel is reported when the located span and the query differ in
        # length, or when the aligned query contains a gap.
        indel = seq.size != (loc_p2 - loc_p1 + 1) || aln_test.include?("-")
        return [loc_p1, loc_p2, similarity, indel, aln_test, ref]
      else
        return [0,0,0,0,0,0,0]
      end
    rescue => e
      puts "Unexpected error occured."
      puts "Exception Class: #{ e.class.name }"
      puts "Exception Message: #{ e.message }"
      puts "Exception Backtrace: #{ e.backtrace[0] }"
      puts "ViralSeq.sequence_locator returns nil"
      return nil
    end
  end

  # sequence clip function
  #
  # Locate seq with ViralSeq.sequence_locator and return the part of it that
  # lies between reference positions p1 and p2 (inclusive), gaps removed.
  # Returns nil when the located range does not cover p1..p2, or when the
  # locator could not place the sequence.
  def self.sequence_clip(seq = '', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    # The locator returns nil after an error and a row of zeros when it has no
    # alignment to report; neither carries aligned strings to clip from.
    return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

    l1 = loc[0]
    l2 = loc[1]
    return nil unless p1 >= l1 && p2 <= l2

    aligned_seq = loc[4]
    aligned_ref = loc[5]

    # Walk the aligned reference from the left until reference position p1;
    # g1 counts alignment columns, l1 only advances on non-gap reference bases.
    g1 = 0
    aligned_ref.each_char do |char|
      break if l1 == p1
      g1 += 1
      l1 += 1 unless char == "-"
    end

    # Same walk from the right towards p2; g2 starts at 1 so that -g2 is the
    # negative index of the last column to keep.
    g2 = 1
    aligned_ref.reverse.each_char do |char|
      break if l2 == p2
      g2 += 1
      l2 -= 1 unless char == "-"
    end
    aligned_seq[g1..(-g2)].tr("-", "")
  end

  # batch quality check of HIV sequences based on ViralSeq.sequence_locator
  # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
  # and allow the sequence to contain indels
  # return a hash of filtered sequences
  #
  # Each distinct sequence is located once. seq_hash itself is not modified.
  # Sequences the locator fails on (nil result) are treated as not passing.
  def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel = true, ref_option = :HXB2, path_to_muscle = false)
    start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
    end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

    seq_pass = {}
    # Group names by identical sequence so duplicates share one locator call.
    seq_hash.group_by { |_name, seq| seq }.each_value do |entries|
      seq = entries.first.last
      loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
      next unless loc
      next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
      next unless indel || loc[3] == false

      entries.each { |seq_name, original_seq| seq_pass[seq_name] = original_seq }
    end
    seq_pass
  end

end
|
@@ -0,0 +1,401 @@
|
|
1
|
+
# lib/math.rb
|
2
|
+
|
3
|
+
# math and statistic functions
|
4
|
+
# including the following methods
|
5
|
+
# ViralSeq::count
|
6
|
+
# ViralSeq::count_percentage
|
7
|
+
# ViralSeq::poisson_distribution
|
8
|
+
# ViralSeq::r_binom_CI
|
9
|
+
# Enumerable#median
|
10
|
+
# Enumerable#sum
|
11
|
+
# Enumerable#mean
|
12
|
+
# Enumerable#sample_variance
|
13
|
+
# Enumerable#stdev
|
14
|
+
# Enumerable#upper_quartile
|
15
|
+
# Enumerable#lower_quartile
|
16
|
+
# Integer#!
|
17
|
+
# Rubystats::FishersExactTest
|
18
|
+
# RandomGaussian::new
|
19
|
+
# RandomGaussian#rand
|
20
|
+
|
21
|
+
module ViralSeq

  # count elements in an array, return a hash of {element1 => number1, element2 => number2, ...}
  # The returned hash answers 0 for elements that were not seen.
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count(array)
  #   => {"cat"=>4, "dog"=>1, "monkey"=>2}

  def self.count(array)
    array.each_with_object(Hash.new(0)) { |element, tally| tally[element] += 1 }
  end

  # count elements in an array, return a hash of {element1 => frequency1, element2 => frequency2, ...}
  # Frequencies are fractions of the array size, rounded to `decimal` places (default 2).
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count_percentage(array)
  #   => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}

  def self.count_percentage(array, decimal = 2)
    total_elements = array.size.to_f
    ViralSeq.count(array).each_with_object(Hash.new(0)) do |(element, occurrences), frequencies|
      frequencies[element] = (occurrences / total_elements).round(decimal)
    end
  end

  # poisson distribution. input lambda and maximum k, return a hash with keys as k
  # default k value is 5, meaning calculate up to 5 events.
  #
  # Poisson Distribution (https://en.wikipedia.org/wiki/Poisson_distribution)
  #   An event can occur 0, 1, 2, … times in an interval.
  #   The average number of events in an interval is designated λ (lambda).
  #   λ is the event rate, also called the rate parameter.
  #   The probability of observing k events in an interval is given by the equation
  #
  #   P(k events in interval) = e^(-λ) * λ^k / k!
  #
  #   λ is the average number of events per interval
  #   e is the number 2.71828... (Euler's number) the base of the natural logarithms
  #   k takes values 0, 1, 2, …
  #   k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
  #
  # =USAGE
  #   # We assume the mutation rate is 0.005 (event rate λ),
  #   # we would like to calculate the probability of 3 mutations on one sequence
  #   prob_hash = ViralSeq::poisson_distribution(0.005)
  #   => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
  #   prob_hash[3]
  #   => 2.072942664984755e-08

  def self.poisson_distribution(rate, k = 5)
    # k! is carried along as a running product, so the method does not depend
    # on the `!` operator being redefined for Integer.
    factorial = 1
    (0..k).each_with_object({}) do |n, out_hash|
      factorial *= n if n > 0
      out_hash[n] = (rate**n * Math::E**(-rate)) / factorial
    end
  end


  # require R pre-installed
  # calculate binomial 95% confidence intervals by R. refer to R function binom.test
  # input number x and n, return an array as [lower_interval, upper_interval],
  # each rounded to 5 decimal places.
  #
  # Raises ArgumentError when x or n is not a plain non-negative number, and
  # RuntimeError when Rscript does not print the two interval bounds.
  #
  # =USAGE
  #   # mutation M184V found in 3 out of 923 sequences
  #   ViralSeq.r_binom_CI(3, 923)
  #
  def self.r_binom_CI(x= 0, n= 0)
    # x and n are pasted into a shell command, so only plain numbers may pass.
    [x, n].each do |value|
      unless value.to_s =~ /\A\d+(\.\d+)?\z/
        raise ArgumentError, "r_binom_CI expects non-negative numbers, got #{value.inspect}"
      end
    end

    r_output = `Rscript -e 'binom.test(#{x},#{n})$conf.int[1];binom.test(#{x},#{n})$conf.int[2]'`
    lines = r_output.split "\n"
    raise "Rscript did not return a confidence interval; check that R is installed" if lines.size < 2

    # Each printed line looks like "[1] 0.0123"; drop the 4-character "[1] " prefix.
    low = lines[0].chomp[4..-1].to_f
    high = lines[1].chomp[4..-1].to_f
    return [low.round(5), high.round(5)]
  end

end
|
109
|
+
|
110
|
+
# statistic methods
|
111
|
+
# :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
|
112
|
+
# =USAGE
|
113
|
+
# array = [1,2,3,4,5,6,7,8,9,10]
|
114
|
+
# array.median
|
115
|
+
# => 5.5
|
116
|
+
# array.sum
|
117
|
+
# => 55
|
118
|
+
# array.mean
|
119
|
+
# => 5.5
|
120
|
+
# array.sample_variance
|
121
|
+
# => 9.166666666666666
|
122
|
+
# array.stdev
|
123
|
+
# => 3.0276503540974917
|
124
|
+
# array.upper_quartile
|
125
|
+
# => 7.5
|
126
|
+
# array.lower_quartile
|
127
|
+
# => 3.5
|
128
|
+
|
129
|
+
module Enumerable
  # Median of the elements. Returns the middle element for an odd count, the
  # mean of the two middle elements (as Float) for an even count, and nil for
  # an empty collection.
  def median
    sorted = sort
    len = sorted.length
    return nil if len.zero?

    mid = len / 2
    len.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]).to_f / 2
  end

  # Sum of the elements. Only defined when Ruby does not already provide
  # Enumerable#sum, so the built-in version (which also accepts an initial
  # value and a block) is never replaced.
  unless method_defined?(:sum)
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as Float (NaN for an empty collection).
  def mean
    sum / count.to_f
  end

  # Sample variance: squared deviations from the mean divided by (n - 1).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (count - 1).to_f
  end

  # Sample standard deviation (square root of sample_variance).
  def stdev
    Math.sqrt(sample_variance)
  end

  # Upper quartile, linearly interpolated at 1-based rank 0.75 * n of the
  # sorted elements. Returns nil for an empty collection.
  def upper_quartile
    sorted = sort
    return nil if sorted.empty?

    interpolate_sorted_rank(sorted, 0.25 * (3 * sorted.length))
  end

  # Lower quartile, linearly interpolated at 1-based rank 0.25 * n + 1 of the
  # sorted elements. Returns nil for an empty collection.
  def lower_quartile
    sorted = sort
    return nil if sorted.empty?

    interpolate_sorted_rank(sorted, 0.25 * sorted.length + 1)
  end

  private

  # Value at a fractional 1-based rank of `sorted`, interpolating linearly
  # between the two neighbouring elements.
  def interpolate_sorted_rank(sorted, rank)
    whole = rank.truncate
    below = sorted[whole.abs - 1]
    # With a single element there is no neighbour above; reuse the element
    # itself instead of reading past the end of the array.
    above = sorted[whole.abs] || below
    below + ((above - below) * (rank - whole))
  end
end
|
180
|
+
|
181
|
+
# factorial method for an Integer
|
182
|
+
# Integer.!
|
183
|
+
class Integer
  # Factorial of the receiver, exposed through the `!` operator method
  # (so both `n.!` and `!n` return n!).
  #
  # Returns 1 for 0, the product 1 * 2 * ... * n for positive n, and nil for
  # negative n (the range 1..n is empty, so there is nothing to multiply).
  #
  # NOTE: `!` is Ruby's logical-negation operator; with this definition in
  # place `!some_integer` yields a number instead of `false`.
  def !
    return 1 if zero?

    (1..self).inject(:*)
  end
end
|
192
|
+
|
193
|
+
|
194
|
+
# Fisher's Exact Test Function Library
|
195
|
+
#
|
196
|
+
# Based on JavaScript version created by: Oyvind Langsrud
|
197
|
+
# Ported to Ruby by Bryan Donovan
|
198
|
+
|
199
|
+
module Rubystats
  # Fisher's exact test for a 2x2 contingency table.
  # Entry point is #calculate; the other methods are its building blocks.
  class FishersExactTest

    def initialize
      # Table margins and probability remembered between successive hyper0 calls.
      @sn11 = 0.0
      @sn1_ = 0.0
      @sn_1 = 0.0
      @sn = 0.0
      @sprob = 0.0

      # Tail sums filled in by #exact.
      @sleft = 0.0
      @sright = 0.0
      @sless = 0.0
      @slarg = 0.0

      @left = 0.0
      @right = 0.0
      @twotail = 0.0
    end

    # Reference: "Lanczos, C. 'A precision approximation
    # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
    # Translation of Alan Miller's FORTRAN-implementation
    # See http://lib.stat.cmu.edu/apstat/245
    def lngamm(z)
      # [coefficient, offset] pairs, accumulated in this exact order as
      # coefficient / (z + offset).
      terms = [
        [0.0000001659470187408462, 7],
        [0.000009934937113930748, 6],
        [-0.1385710331296526, 5],
        [12.50734324009056, 4],
        [-176.6150291498386, 3],
        [771.3234287757674, 2],
        [-1259.139216722289, 1],
        [676.5203681218835, 0]
      ]
      series = terms.inject(0) { |acc, (coefficient, offset)| acc + coefficient / (z + offset) }
      series += 0.9999999999995183

      Math.log(series) - 5.58106146679532777 - z + (z - 0.5) * Math.log(z + 6.5)
    end

    # ln(n!) via lngamm(n + 1); 0 for n <= 1.
    def lnfact(n)
      return 0 if n <= 1

      lngamm(n + 1)
    end

    # ln of the binomial coefficient "n choose k".
    def lnbico(n, k)
      lnfact(n) - lnfact(k) - lnfact(n - k)
    end

    # Hypergeometric probability of n11 given row total n1_, column total n_1
    # and grand total n, computed from log binomial coefficients.
    def hyper_323(n11, n1_, n_1, n)
      log_prob = lnbico(n1_, n11) + lnbico(n - n1_, n_1 - n11) - lnbico(n, n_1)
      Math.exp(log_prob)
    end

    # Probability of n11 for the margins remembered from the last full hyper0 call.
    def hyper(n11)
      hyper0(n11, 0, 0, 0)
    end

    # With all three margins zero, reuse the remembered margins: when n11i is
    # one step away from the remembered n11 (and not a multiple of 10) the
    # remembered probability is updated by a ratio, otherwise it is recomputed
    # with hyper_323. With non-zero margins, remember them and recompute.
    def hyper0(n11i, n1_i, n_1i, ni)
      if n1_i == 0 && n_1i == 0 && ni == 0
        unless n11i % 10 == 0
          if n11i == @sn11 + 1
            row_ratio = (@sn1_ - @sn11) / (n11i.to_f)
            col_ratio = (@sn_1 - @sn11) / (n11i.to_f + @sn - @sn1_ - @sn_1)
            @sprob *= row_ratio * col_ratio
            @sn11 = n11i
            return @sprob
          end
          if n11i == @sn11 - 1
            row_ratio = (@sn11) / (@sn1_ - n11i.to_f)
            col_ratio = (@sn11 + @sn - @sn1_ - @sn_1) / (@sn_1 - n11i.to_f)
            @sprob *= row_ratio * col_ratio
            @sn11 = n11i
            return @sprob
          end
        end
        @sn11 = n11i
      else
        @sn11, @sn1_, @sn_1, @sn = n11i, n1_i, n_1i, ni
      end
      @sprob = hyper_323(@sn11, @sn1_, @sn_1, @sn)
    end

    # Probability of the observed table; also fills @sleft, @sright, @sless
    # and @slarg. Returns 1 when the margins allow only one possible table.
    def exact(n11, n1_, n_1, n)
      upper = [n1_, n_1].min
      lower = [n1_ + n_1 - n, 0].max

      if lower == upper
        @sless = 1
        @sright = 1
        @sleft = 1
        @slarg = 1
        return 1
      end

      prob = hyper0(n11, n1_, n_1, n)

      # Sum terms upwards from the smallest possible n11 while they stay below
      # the observed probability.
      @sleft = 0
      term = hyper(lower)
      i = lower + 1
      while term < (0.99999999 * prob)
        @sleft += term
        term = hyper(i)
        i += 1
      end
      i -= 1
      if term < (1.00000001 * prob)
        @sleft += term
      else
        i -= 1
      end

      # Same walk downwards from the largest possible n11.
      @sright = 0
      term = hyper(upper)
      j = upper - 1
      while term < (0.99999999 * prob)
        @sright += term
        term = hyper(j)
        j -= 1
      end
      j += 1
      if term < (1.00000001 * prob)
        @sright += term
      else
        j += 1
      end

      if (i - n11).abs < (j - n11).abs
        @sless = @sleft
        @slarg = 1 - @sleft + prob
      else
        @sless = 1 - @sright + prob
        @slarg = @sright
      end
      prob
    end

    # Run the test on the four cell counts of a 2x2 table (negative counts are
    # sign-flipped). Returns { :left, :right, :twotail } with twotail capped at 1.
    def calculate(n11_, n12_, n21_, n22_)
      n11_, n12_, n21_, n22_ = [n11_, n12_, n21_, n22_].map { |cell| cell < 0 ? cell * -1 : cell }
      row_total = n11_ + n12_
      col_total = n11_ + n21_
      grand_total = n11_ + n12_ + n21_ + n22_
      exact(n11_, row_total, col_total, grand_total)

      twotail = @sleft + @sright
      twotail = 1 if twotail > 1
      { :left => @sless, :right => @slarg, :twotail => twotail }
    end
  end
end
|
364
|
+
|
365
|
+
|
366
|
+
# generate values from the standard normal distribution with given mean and standard deviation
|
367
|
+
# See http://en.wikipedia.org/wiki/Box-Muller_transform
|
368
|
+
#
|
369
|
+
# RandomGaussian.new(mean, sd, rng)
|
370
|
+
# # generate RandomGaussian instance with given mean and standard deviation
|
371
|
+
# # default value: mean = 0.0, sd = 1.0
|
372
|
+
#
|
373
|
+
# RandomGaussian.rand
|
374
|
+
# # generate a random number that falls in the pre-defined gaussian distribution
|
375
|
+
# =USAGE
|
376
|
+
# # example
|
377
|
+
# a = RandomGaussian.new
|
378
|
+
# a.rand
|
379
|
+
# numbers = []
|
380
|
+
# 10.times {numbers << a.rand.round(5)}
|
381
|
+
# numbers
|
382
|
+
# [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
|
383
|
+
|
384
|
+
|
385
|
+
class RandomGaussian
  # mean: centre of the distribution (default 0.0)
  # sd:   standard deviation (default 1.0)
  # rng:  callable returning the uniform random numbers to transform
  #       (default: Kernel.rand)
  def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
    @mean = mean
    @sd = sd
    @rng = rng
    @compute_next_pair = false
  end

  # Return the next value. Values are produced in pairs with the Box-Muller
  # transform: every other call draws two numbers from the rng, returns the
  # cosine-based value and keeps the sine-based one for the following call.
  def rand
    @compute_next_pair = !@compute_next_pair
    return @g1 unless @compute_next_pair

    angle = 2 * Math::PI * @rng.call
    radius = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
    @g1 = @mean + radius * Math.sin(angle)
    @g0 = @mean + radius * Math.cos(angle)
  end
end
|