viral_seq 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,54 @@
1
+ # viral_seq/hcv_dr
2
+ # HCV resistant mutation interpretation
3
+ # ViralSeq::hcv_ns5a
4
+
5
+ # ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
6
+ # # amino_acid_sequence_array is Array object of the amino acid sequence.
7
+ # # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
8
+ # # start_aa_position is the starting aa number of the input sequence as Integer
9
+
10
module ViralSeq
  # Scan an HCV NS5A amino acid sequence for the resistance-associated
  # substitutions listed in the table below.
  #
  # @param aa_array [Array<String>] one entry per residue; either a single
  #   amino acid letter or several letters joined by "/" for a mixture
  #   (for example "L/M")
  # @param start_aa [Integer] NS5A position of the first element of aa_array
  # @return [Hash{Integer => Array<String>}] position => [wild_type_aa, observed_aa]
  #   for every position where at least one observed residue is a listed mutation
  def self.hcv_ns5a(aa_array, start_aa = 1)
    # position => [wild-type residue, [resistance-associated residues]]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
      31  => ['L', ['M', 'V', 'F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D', 'P', 'S']],
      64  => ['T', ['A', 'S']],
      77  => ['P', ['A', 'S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N', 'H', 'Y']],
      92  => ['A', ['P', 'T', 'K', 'E']],
      93  => ['Y', ['C', 'F', 'H', 'N']],
      107 => ['K', ['T', 'S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      next unless sdrm.key?(position)

      wt_aa, resistant = sdrm[position]
      # A single residue splits to a one-element list, so one test covers both
      # plain and mixed calls. The intersection must be checked with `any?`:
      # an empty Array is truthy in Ruby, so testing the intersection itself
      # would flag every mixture.
      observed = test_aa.split('/')
      out_hash[position] = [wt_aa, test_aa] if (observed & resistant).any?
    end
    out_hash
  end
end
@@ -0,0 +1,299 @@
1
+ # viral_seq/locator.rb
2
+
3
+ # Including following methods:
4
+ # ViralSeq::sequence_locator
5
+ # ViralSeq::sequence_clip
6
+ # ViralSeq::qc_hiv_seq_check
7
+
8
+ # HIV sequence locator function
9
+ # resembling HIV Sequence Locator from LANL
10
+ # https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
11
+ # require MUSCLE (http://www.drive5.com/muscle) installed
12
+ # current version only supports nucleotide sequence, not for amino acid sequence.
13
+
14
+ # =USAGE1
15
+ # # Find the location of a sequence
16
+ # ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
17
+ # # input_sequence: String of nucleotide sequence
18
+ # # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
19
+ # # path_to_muscle: path to the muscle executable.
20
+ # # Default as :false, will call MuscleBio to run Muscle
21
+ # # specify path_to_muscle if other source of muscle needed
22
+ # # function returns an array of
23
+ # # start_location (Integer)
24
+ # # end_location (Integer)
25
+ # # percentage_of_similarity_to_reference_sequence (Float)
26
+ # # containing_indel? (Boolean)
27
+ # # aligned_input_sequence (String)
28
+ # # aligned_reference_sequence (String)
29
+ # # example code
30
+ # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
31
+ # p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
32
+ # => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
33
+
34
+ # =USAGE2
35
+ # ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
36
+ # # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
37
+ # # return nil if the input sequence is not in the range
38
+ # # input_sequence: String of nucleotide sequence
39
+ # # start_position and end_position: Integer of the start and end reference number of the sub-sequence
40
+ # # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
41
+ # # path_to_muscle: path to the muscle executable.
42
+ # # Default as :false, will call MuscleBio to run Muscle
43
+ # # specify path_to_muscle if other source of muscle needed
44
+ # # example code
45
+ # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
46
+ # p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
47
+ # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
48
+
49
+ # =USAGE3
50
+ # ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
51
+ # # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
52
+ # # and a boolean value for allowing indels,
53
+ # # path_to_muscle: path to the muscle executable.
54
+ # # Default as :false, will call MuscleBio to run Muscle
55
+ # # specify path_to_muscle if other source of muscle needed
56
+ # # return a sequence sub-hash that meets the criteria
57
+ # # example code
58
+ # sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
59
+ # filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
60
+ # puts sequence_hash.size
61
+ # => 6
62
+ # puts filtered_sequence_hash.size
63
+ # => 4
64
+
65
+ module ViralSeq
66
+
67
# Locate a nucleotide sequence on a reference genome by repeatedly aligning it
# (via ViralSeq.muscle_align) against a progressively narrower reference window.
#
# @param seq [String] query nucleotide sequence
# @param ref_option [Symbol] reference selector handed to ViralSeq.check_ref
# @param path_to_muscle [String, false] forwarded unchanged to ViralSeq.muscle_align
# @return [Array, nil]
#   [start, end, similarity_percent, indel?, aligned_query, aligned_reference] on success;
#   [0,0,0,0,0,0,0] when the final reference window is empty;
#   nil (after printing diagnostics to stdout) when any StandardError is raised
def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)

  # ViralSeq.check_muscle(path_to_muscle)
  ori_ref = ViralSeq.check_ref(ref_option)

  begin
    ori_ref_l = ori_ref.size
    # l1 / l2: number of reference bases trimmed from the left / right end so far
    l1 = 0
    l2 = 0

    aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    # split the aligned query into leading gaps, aligned core, trailing gaps
    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
    gap_begin = $1.size
    gap_end = $3.size
    aln_test2 = $2
    ref = aln_seq[0]
    ref = ref[gap_begin..(-gap_end-1)]
    ref_size = ref.size
    # the query is spread over a much longer reference stretch: narrow the
    # window around the longest ungapped run of the query and re-align
    if ref_size > 1.3*(seq.size)
      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
      aln_test2 =~ /#{max_seq}/
      before_aln_seq = $`
      before_aln = $`.size
      post_aln_seq = $'
      post_aln = $'.size
      before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
      b1 = (1.3 * before_aln_seq_size).to_i
      post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
      b2 = (1.3 * post_aln_seq_size).to_i
      if (before_aln > seq.size) and (post_aln <= seq.size)
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
        l1 = l1 + (before_aln - b1)
      elsif (post_aln > seq.size) and (before_aln <= seq.size)
        ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
        l2 = l2 + post_aln - b2
      elsif (post_aln > seq.size) and (before_aln > seq.size)
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
        l1 = l1 + (before_aln - b1)
        l2 = l2 + (post_aln - b2)
      end

      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
    end

    aln_test = aln_seq[1]
    aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
    gap_begin = $1.size
    gap_end = $3.size
    aln_test = $2
    # s1 / g1: first run of bases and the gap run that follows it
    aln_test =~ /^(\w+)(\-*)\w/
    s1 = $1.size
    g1 = $2.size
    # s2 / g2: last run of bases and the gap run that precedes it
    aln_test =~ /\w(\-*)(\w+)$/
    s2 = $2.size
    g2 = $1.size

    l1 = l1 + gap_begin
    l2 = l2 + gap_end
    repeat = 0

    # a short flank separated from the rest by a gap more than twice its
    # length is dropped together with that gap, then the alignment is redone
    if g1 == g2 and (s1 + g1 + s2) == ref.size
      if s1 > s2 and g2 > 2*s2
        ref = ref[0..(-g2-1)]
        repeat = 1
        l2 = l2 + g2
      elsif s1 < s2 and g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
    else
      if g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
      if g2 > 2*s2
        ref = ref[0..(-g2 - 1)]
        repeat = 1
        l2 = l2 + g2
      end
    end

    while repeat == 1
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test = $2
      aln_test =~ /^(\w+)(\-*)\w/
      s1 = $1.size
      g1 = $2.size
      aln_test =~ /\w(\-*)(\w+)$/
      s2 = $2.size
      g2 = $1.size
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      repeat = 0
      if g1 > 2*s1
        ref = ref[g1..-1]
        repeat = 1
        l1 = l1 + g1
      end
      if g2 > 2*s2
        ref = ref[0..(-g2 - 1)]
        repeat = 1
        l2 = l2 + g2
      end
    end
    ref = ori_ref[l1..(ori_ref_l - l2 - 1)]


    aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
    aln_test = aln_seq[1]
    ref = aln_seq[0]

    #refine alignment

    if ref =~ /^(\-+)/
      l1 = l1 - $1.size
    elsif ref =~ /(\-+)$/
      l2 = l2 + $1.size
    end

    # A negative l1 would make ori_ref[l1..] count from the END of the
    # reference; clamp so the window starts at the first reference base.
    l1 = 0 if l1 < 0

    if (ori_ref_l - l2 - 1) >= l1
      ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      ref = aln_seq[0]

      # percentage of alignment columns where query and reference agree
      ref_size = ref.size
      sim_count = 0
      (0..(ref_size-1)).each do |n|
        ref_base = ref[n]
        test_base = aln_test[n]
        sim_count += 1 if ref_base == test_base
      end
      similarity = (sim_count/ref_size.to_f*100).round(1)

      # convert the 0-based trim counts into 1-based reference coordinates
      loc_p1 = l1 + 1
      loc_p2 = ori_ref_l - l2
      if seq.size != (loc_p2 - loc_p1 + 1)
        indel = true
      elsif aln_test.include?("-")
        indel = true
      else
        indel = false
      end
      return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
    else
      return [0,0,0,0,0,0,0]
    end
  rescue => e
    puts "Unexpected error occurred."
    puts "Exception Class: #{ e.class.name }"
    puts "Exception Message: #{ e.message }"
    puts "Exception Backtrace: #{ e.backtrace[0] }"
    puts "ViralSeq.sequence_locator returns nil"
    return nil
  end
end
240
+
241
+ # sequence clip function
242
# Cut the part of `seq` that corresponds to reference positions p1..p2.
#
# @param seq [String] query nucleotide sequence
# @param p1 [Integer] first reference position of the wanted sub-sequence
# @param p2 [Integer] last reference position of the wanted sub-sequence
# @param ref_option [Symbol] forwarded to ViralSeq.sequence_locator
# @param path_to_muscle [String, false] forwarded to ViralSeq.sequence_locator
# @return [String, nil] the clipped sequence with alignment gaps removed, or nil
#   when the locator fails or the located range does not cover p1..p2
def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # The locator returns nil on error and an all-zero array when nothing could
  # be located; neither carries aligned strings to clip from.
  return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

  l1 = loc[0]
  l2 = loc[1]
  return nil unless p1 >= l1 && p2 <= l2

  aligned_seq = loc[4]
  aligned_ref = loc[5]

  # Walk alignment columns from the left until the reference coordinate
  # reaches p1; gap columns in the reference do not advance the coordinate.
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # Same walk from the right; g2 starts at 1 so it can be used directly as a
  # negative end index.
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  aligned_seq[g1..(-g2)].tr("-","")
end
266
+
267
+ # batch quality check of HIV sequences based on ViralSeq.sequence_locator
268
+ # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
269
+ # and allow the sequence to contain indels
270
+ # return a hash of filtered sequences
271
+
272
# Batch quality check built on ViralSeq.sequence_locator: keep only the
# sequences whose located start and end fall in the accepted positions.
#
# @param seq_hash [Hash{String => String}] sequence name => sequence; not modified
# @param start_nt [Integer, Range, Array] accepted start position(s)
# @param end_nt [Integer, Range, Array] accepted end position(s)
# @param indel [Boolean] when falsy, sequences the locator flags as containing
#   indels are rejected
# @param ref_option [Symbol] forwarded to ViralSeq.sequence_locator
# @param path_to_muscle [String, false] forwarded to ViralSeq.sequence_locator
# @return [Hash{String => String}] the entries of seq_hash that pass
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Locate each distinct sequence only once; identical sequences share a verdict.
  passing = seq_hash.values.uniq.select do |seq|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    # the locator returns nil when it hit an error: treat as a failed check
    next false if loc.nil?
    start_nt.include?(loc[0]) && end_nt.include?(loc[1]) && (indel || loc[3] == false)
  end

  # Collect every name carrying a passing sequence. The input hash is left
  # untouched so the caller can still use it after filtering.
  seq_pass = {}
  passing.each do |seq|
    seq_hash.each do |seq_name, original_seq|
      seq_pass[seq_name] = seq if original_seq == seq
    end
  end
  seq_pass
end
298
+
299
+ end
@@ -0,0 +1,401 @@
1
+ # lib/math.rb
2
+
3
+ # math and statistic functions
4
+ # including the following methods
5
+ # ViralSeq::count
6
+ # ViralSeq::count_percentage
7
+ # ViralSeq::poisson_distribution
8
+ # ViralSeq::r_binom_CI
9
+ # Enumerable#median
10
+ # Enumerable#sum
11
+ # Enumerable#mean
12
+ # Enumerable#sample_variance
13
+ # Enumerable#stdev
14
+ # Enumerable#upper_quartile
15
+ # Enumerable#lower_quartile
16
+ # Integer#!
17
+ # Rubystats::FishersExactTest
18
+ # RandomGaussian::new
19
+ # RandomGaussian#rand
20
+
21
module ViralSeq

  # Count how many times each element occurs in an array.
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count(array)
  #   => {"cat"=>4, "dog"=>1, "monkey"=>2}
  #
  # @param array [Array] elements to count
  # @return [Hash] element => number of occurrences (missing keys read as 0)
  def self.count(array)
    array.each_with_object(Hash.new(0)) do |element, hash|
      hash[element] += 1
    end
  end

  # Frequency of each element in an array, as a fraction of the array size.
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count_percentage(array)
  #   => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
  #
  # @param array [Array] elements to count
  # @param decimal [Integer] number of decimals kept when rounding (default 2)
  # @return [Hash] element => rounded frequency (missing keys read as 0)
  def self.count_percentage(array, decimal = 2)
    total_elements = array.size.to_f
    count(array).each_with_object(Hash.new(0)) do |(key, value), hash|
      hash[key] = (value / total_elements).round(decimal)
    end
  end

  # Poisson distribution (https://en.wikipedia.org/wiki/Poisson_distribution)
  #
  #   P(k events in interval) = e^(-λ) * λ^k / k!
  #
  # where λ is the average number of events per interval, e is Euler's number
  # and k! is the factorial of k.
  #
  # =USAGE
  #   # We assume the mutation rate is 0.005 (event rate λ),
  #   # we would like to calculate the probability of 3 mutations on one sequence
  #   prob_hash = ViralSeq::poisson_distribution(0.005)
  #   => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
  #   prob_hash[3]
  #   => 2.072942664984755e-08
  #
  # @param rate [Numeric] event rate λ
  # @param k [Integer] largest number of events to evaluate (default 5)
  # @return [Hash{Integer => Float}] n => P(n events) for n in 0..k
  def self.poisson_distribution(rate, k = 5)
    # The factorial is accumulated here rather than through a patched
    # Integer#!, so the method does not depend on a global core-class override.
    factorial = 1
    (0..k).each_with_object({}) do |n, out_hash|
      factorial *= n if n > 0
      out_hash[n] = (rate**n * Math::E**(-rate)) / factorial
    end
  end


  # Binomial confidence interval computed by R's binom.test (conf.int).
  # Requires the `Rscript` executable to be available on the PATH.
  #
  # =USAGE
  #   # mutation M184V found in 3 out of 923 sequences
  #   ViralSeq.r_binom_CI(3, 923)
  #   # => [lower_interval, upper_interval], each rounded to 5 decimals
  #
  # @param x [Integer] number of successes
  # @param n [Integer] number of trials
  # @return [Array<Float>] [lower_interval, upper_interval]
  # @raise [ArgumentError, TypeError] when x or n cannot be converted to Integer
  # @raise [RuntimeError] when Rscript does not print the two expected lines
  def self.r_binom_CI(x= 0, n= 0)
    # Coerce before interpolating: the values end up inside a shell command.
    x = Integer(x)
    n = Integer(n)
    r_output = `Rscript -e 'binom.test(#{x},#{n})$conf.int[1];binom.test(#{x},#{n})$conf.int[2]'`
    lines = r_output.split "\n"
    raise "Rscript binom.test(#{x},#{n}) returned no confidence interval" if lines.size < 2
    # each line is read from its fifth character on, skipping R's "[1] " index prefix
    low = lines[0].chomp[4..-1].to_f
    high = lines[1].chomp[4..-1].to_f
    return [low.round(5), high.round(5)]
  end

end
109
+
110
+ # statistic methods
111
+ # :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
112
+ # =USAGE
113
+ # array = [1,2,3,4,5,6,7,8,9,10]
114
+ # array.median
115
+ # => 5.5
116
+ # array.sum
117
+ # => 55
118
+ # array.mean
119
+ # => 5.5
120
+ # array.sample_variance
121
+ # => 9.166666666666666
122
+ # array.stdev
123
+ # => 3.0276503540974917
124
+ # array.upper_quartile
125
+ # => 7.5
126
+ # array.lower_quartile
127
+ # => 3.5
128
+
129
module Enumerable
  # Middle value of the sorted elements; the mean of the two middle values
  # (as a Float) when the number of elements is even.
  # @return [Object, Float, nil] nil when there are no elements
  def median
    sorted = sort
    len = sorted.length
    return nil if len.zero?
    len.odd? ? sorted[len / 2] : (sorted[len / 2 - 1] + sorted[len / 2]).to_f / 2
  end

  # Only supply #sum where Ruby does not already provide one, so the built-in
  # version (which also accepts an initial value and a block) is not replaced.
  unless method_defined?(:sum)
    # Sum of all elements, starting from 0.
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN when there are no elements).
  def mean
    values = to_a
    values.sum / values.length.to_f
  end

  # Sample variance: squared deviations from the mean divided by (n - 1).
  def sample_variance
    values = to_a
    m = values.mean
    squared_deviations = values.inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (values.length - 1).to_f
  end

  # Sample standard deviation (square root of #sample_variance).
  def stdev
    Math.sqrt(sample_variance)
  end

  # Upper quartile, linearly interpolated between the sorted elements around
  # 1-based rank 0.75 * n.
  # @return [Numeric, nil] nil when there are no elements
  def upper_quartile
    sorted_array = sort
    return nil if sorted_array.empty?
    u = 0.25 * (3 * sorted_array.length)
    rank = u.truncate
    # rank is 0 only for a single element; use that element on both sides
    sample = sorted_array[[rank - 1, 0].max]
    sample1 = sorted_array[rank]
    sample + ((sample1 - sample) * (u - rank))
  end

  # Lower quartile, linearly interpolated between the sorted elements around
  # 1-based rank 0.25 * n + 1.
  # @return [Numeric, nil] nil when there are no elements
  def lower_quartile
    sorted_array = sort
    return nil if sorted_array.empty?
    u = 0.25 * sorted_array.length + 1
    rank = u.truncate
    sample = sorted_array[rank - 1]
    # for a single element the upper neighbour would be past the end
    sample1 = sorted_array[[rank, sorted_array.length - 1].min]
    sample + ((sample1 - sample) * (u - rank))
  end
end
180
+
181
+ # factorial method for an Integer
182
+ # Integer.!
183
class Integer
  # Factorial of the receiver, exposed through the `!` method, so both
  # `5.!` and `!5` evaluate to 120.
  # Zero yields 1; a negative receiver yields nil because the range from 1 up
  # to it is empty.
  def !
    return 1 if self == 0
    1.upto(self).inject(:*)
  end
end
192
+
193
+
194
+ # Fisher's Exact Test Function Library
195
+ #
196
+ # Based on JavaScript version created by: Oyvind Langsrud
197
+ # Ported to Ruby by Bryan Donovan
198
+
199
module Rubystats
  # Fisher's exact test for a 2x2 contingency table.
  # Use #calculate with the four cell counts; the other methods are the
  # numerical building blocks and keep running state in instance variables.
  class FishersExactTest

    def initialize
      # parameters and probability of the most recent hypergeometric evaluation
      @sn11 = @sn1_ = @sn_1 = @sn = @sprob = 0.0

      # tail sums accumulated by #exact
      @sleft = @sright = @sless = @slarg = 0.0

      @left = @right = @twotail = 0.0
    end

    # Natural log of the gamma function.
    # Reference: "Lanczos, C. 'A precision approximation
    # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
    # Translation of Alan Miller's FORTRAN-implementation
    # See http://lib.stat.cmu.edu/apstat/245
    def lngamm(z)
      series = 0
      series += 0.0000001659470187408462 / (z+7)
      series += 0.000009934937113930748 / (z+6)
      series -= 0.1385710331296526 / (z+5)
      series += 12.50734324009056 / (z+4)
      series -= 176.6150291498386 / (z+3)
      series += 771.3234287757674 / (z+2)
      series -= 1259.139216722289 / (z+1)
      series += 676.5203681218835 / (z)
      series += 0.9999999999995183

      Math.log(series)-5.58106146679532777-z+(z-0.5) * Math.log(z+6.5)
    end

    # Natural log of n! (0 for n <= 1).
    def lnfact(n)
      n <= 1 ? 0 : lngamm(n+1)
    end

    # Natural log of the binomial coefficient "n choose k".
    def lnbico(n,k)
      lnfact(n) - lnfact(k) - lnfact(n-k)
    end

    # Hypergeometric probability of n11 given row total n1_, column total n_1
    # and grand total n, evaluated directly from log binomial coefficients.
    def hyper_323(n11, n1_, n_1, n)
      log_prob = lnbico(n1_, n11) + lnbico(n-n1_, n_1-n11) - lnbico(n, n_1)
      Math.exp(log_prob)
    end

    # Probability for a new n11 using the totals stored by the last #hyper0 call.
    def hyper(n11)
      hyper0(n11, 0, 0, 0)
    end

    # Hypergeometric probability with memoised state.
    # When all three totals are 0 the stored totals are reused; if n11i is one
    # step away from the stored n11 (and not a multiple of 10) the stored
    # probability is updated by a ratio instead of being recomputed.
    def hyper0(n11i,n1_i,n_1i,ni)
      reuse_totals = (n1_i == 0 && n_1i == 0 && ni == 0)

      if reuse_totals
        unless n11i % 10 == 0
          if n11i == @sn11+1
            @sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
            @sn11 = n11i
            return @sprob
          elsif n11i == @sn11-1
            @sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
            @sn11 = n11i
            return @sprob
          end
        end
        @sn11 = n11i
      else
        @sn11, @sn1_, @sn_1, @sn = n11i, n1_i, n_1i, ni
      end

      @sprob = hyper_323(@sn11,@sn1_,@sn_1,@sn)
    end

    # Probability of the observed table; as a side effect fills @sleft,
    # @sright, @sless and @slarg with the tail sums read by #calculate.
    def exact(n11,n1_,n_1,n)
      # feasible range of n11 given the margins
      max = n_1 < n1_ ? n_1 : n1_
      min = n1_ + n_1 - n
      min = 0 if min < 0

      if min == max
        @sless = @sright = @sleft = @slarg = 1
        return 1
      end

      prob = hyper0(n11,n1_,n_1,n)

      # left tail: add up tables less probable than the observed one
      @sleft = 0
      p = hyper(min)
      i = min + 1
      while p < (0.99999999 * prob)
        @sleft += p
        p = hyper(i)
        i += 1
      end
      i -= 1
      if p < (1.00000001*prob)
        @sleft += p
      else
        i -= 1
      end

      # right tail: same walk starting from the largest feasible n11
      @sright = 0
      p = hyper(max)
      j = max - 1
      while p < (0.99999999 * prob)
        @sright += p
        p = hyper(j)
        j -= 1
      end
      j += 1
      if p < (1.00000001*prob)
        @sright += p
      else
        j += 1
      end

      if (i - n11).abs < (j - n11).abs
        @sless = @sleft
        @slarg = 1 - @sleft + prob
      else
        @sless = 1 - @sright + prob
        @slarg = @sright
      end
      prob
    end

    # Run the test on the four cells of a 2x2 table (negative counts are
    # replaced by their magnitude).
    # Returns a hash with :left, :right and :twotail p-values.
    def calculate(n11_,n12_,n21_,n22_)
      n11_, n12_, n21_, n22_ = [n11_, n12_, n21_, n22_].map do |cell|
        cell < 0 ? cell * -1 : cell
      end
      row_total = n11_ + n12_
      col_total = n11_ + n21_
      grand_total = n11_ + n12_ + n21_ + n22_
      exact(n11_, row_total, col_total, grand_total)

      twotail = @sleft + @sright
      twotail = 1 if twotail > 1
      { :left => @sless, :right => @slarg, :twotail => twotail }
    end
  end
end
364
+
365
+
366
+ # generate values from a normal (Gaussian) distribution with given mean and standard deviation
367
+ # See http://en.wikipedia.org/wiki/Box-Muller_transform
368
+ #
369
+ # RandomGaussian.new(mean, sd, rng)
370
+ # # generate RandomGaussian instance with given mean and standard deviation
371
+ # # default value: mean = 0.0, sd = 1.0
372
+ #
373
+ # RandomGaussian.rand
374
+ # # generate a random number that falls in the pre-defined gaussian distribution
375
+ # =USAGE
376
+ # # example
377
+ # a = RandomGaussian.new
378
+ # a.rand
379
+ # numbers = []
380
+ # 10.times {numbers << a.rand.round(5)}
381
+ # numbers
382
+ # [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
383
+
384
+
385
class RandomGaussian
  # mean: centre of the distribution (default 0.0)
  # sd:   standard deviation (default 1.0)
  # rng:  callable invoked for each uniform random number (default Kernel.rand)
  def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
    @mean = mean
    @sd = sd
    @rng = rng
    @compute_next_pair = false
  end

  # Return one Gaussian random number.
  # Values are produced in pairs: every other call draws two numbers from the
  # rng, computes a fresh pair and returns the first value; the call in
  # between returns the stored second value.
  def rand
    @compute_next_pair = !@compute_next_pair
    return @g1 unless @compute_next_pair

    angle = 2 * Math::PI * @rng.call
    radius = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
    @g1 = @mean + radius * Math.sin(angle)
    @g0 = @mean + radius * Math.cos(angle)
  end
end