viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/hcv_dr.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
# viral_seq/hcv_dr
|
2
|
-
# HCV resistant mutation interpretation
|
3
|
-
# ViralSeq::hcv_ns5a
|
4
|
-
|
5
|
-
# ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
|
6
|
-
# # amino_acid_sequence_array is Array object of the amino acid sequence.
|
7
|
-
# # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
|
8
|
-
# # start_aa_position is the starting aa number of the input sequence as Integer
|
9
|
-
|
10
|
-
module ViralSeq
  # Report HCV NS5A resistance-associated substitutions found in an amino acid
  # sequence.
  #
  # @param aa_array [Array<String>] one entry per amino acid position; an entry
  #   is either a single-letter amino acid or a "/"-separated mixture such as
  #   "M/T"
  # @param start_aa [Integer] NS5A position number of the first entry of
  #   +aa_array+
  # @return [Hash{Integer => Array<String>}] position => [wild_type_aa, observed_entry]
  #   for every listed position that carries a listed substitution
  def self.hcv_ns5a(aa_array,start_aa=1)
    # position => [wild-type amino acid, [resistance-associated substitutions]]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
      31  => ['L', ['M', 'V', 'F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D', 'P', 'S']],
      64  => ['T', ['A', 'S']],
      77  => ['P', ['A', 'S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N', 'H', 'Y']],
      92  => ['A', ['P', 'T', 'K', 'E']],
      93  => ['Y', ['C', 'F', 'H', 'N']],
      107 => ['K', ['T', 'S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.each_with_index do |test_aa, array_position|
      position = start_aa + array_position
      next unless sdrm.key?(position)

      wt_aa, resistant_aas = sdrm[position]
      if test_aa.size == 1
        # No wild-type residue appears in its own substitution list, so the
        # membership test alone also excludes the wild type.
        out_hash[position] = [wt_aa, test_aa] if resistant_aas.include?(test_aa)
      elsif (test_aa.split("/") & resistant_aas).any?
        # A mixture counts only when at least one of its components is a listed
        # substitution. (An Array is always truthy in Ruby, even when empty, so
        # the intersection itself cannot be used as the condition.)
        out_hash[position] = [wt_aa, test_aa]
      end
    end
    out_hash
  end
end
|
data/lib/viral_seq/locator.rb
DELETED
@@ -1,299 +0,0 @@
|
|
1
|
-
# viral_seq/locator.rb
|
2
|
-
|
3
|
-
# Including following methods:
|
4
|
-
# ViralSeq::sequence_locator
|
5
|
-
# ViralSeq::sequence_clip
|
6
|
-
# ViralSeq::qc_hiv_seq_check
|
7
|
-
|
8
|
-
# HIV sequence locator function
|
9
|
-
# resembling HIV Sequence Locator from LANL
|
10
|
-
# https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
|
11
|
-
# require MUSCLE (http://www.drive5.com/muscle) installed
|
12
|
-
# current version only supports nucleotide sequence, not for amino acid sequence.
|
13
|
-
|
14
|
-
# =USAGE1
|
15
|
-
# # Find the location of a sequence
|
16
|
-
# ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
|
17
|
-
# # input_sequence: String of nucleotide sequence
|
18
|
-
# # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
|
19
|
-
# # path_to_muscle: path to the muscle executable.
|
20
|
-
# # Defaults to false, in which case MuscleBio is called to run MUSCLE
|
21
|
-
# # specify path_to_muscle if other source of muscle needed
|
22
|
-
# # function returns an array of
|
23
|
-
# # start_location (Integer)
|
24
|
-
# # end_location (Integer)
|
25
|
-
# # percentage_of_similarity_to_reference_sequence (Float)
|
26
|
-
# # containing_indel? (Boolean)
|
27
|
-
# # aligned_input_sequence (String)
|
28
|
-
# # aligned_reference_sequence (String)
|
29
|
-
# # example code
|
30
|
-
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
31
|
-
# p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
|
32
|
-
# => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
|
33
|
-
|
34
|
-
# =USAGE2
|
35
|
-
# ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
|
36
|
-
# # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
|
37
|
-
# # return nil if the input sequence is not in the range
|
38
|
-
# # input_sequence: String of nucleotide sequence
|
39
|
-
# # start_position and end_position: Integer of the start and end reference number of the sub-sequence
|
40
|
-
# # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
|
41
|
-
# # path_to_muscle: path to the muscle executable.
|
42
|
-
# # Default as :false, will call MuscleBio to run Muscle
|
43
|
-
# # specify path_to_muscle if other source of muscle needed
|
44
|
-
# # example code
|
45
|
-
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
46
|
-
# p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
|
47
|
-
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
48
|
-
|
49
|
-
# =USAGE3
|
50
|
-
# ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
|
51
|
-
# # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
|
52
|
-
# # and a boolean value for allowing indels,
|
53
|
-
# # path_to_muscle: path to the muscle executable.
|
54
|
-
# # Default as :false, will call MuscleBio to run Muscle
|
55
|
-
# # specify path_to_muscle if other source of muscle needed
|
56
|
-
# # return a sequence sub-hash that meets the criteria
|
57
|
-
# # example code
|
58
|
-
# sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
|
59
|
-
# filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
|
60
|
-
# puts sequence_hash.size
|
61
|
-
# => 6
|
62
|
-
# puts filtered_sequence_hash.size
|
63
|
-
# => 4
|
64
|
-
|
65
|
-
module ViralSeq

  # Locate a nucleotide sequence on a reference genome by repeatedly aligning
  # it against the reference and trimming the reference to the matched region.
  #
  # Relies on ViralSeq.check_ref (to obtain the reference sequence) and
  # ViralSeq.muscle_align (to align two sequences); both are defined outside
  # this file.
  #
  # @param seq [String] nucleotide sequence to locate
  # @param ref_option [Symbol] passed to ViralSeq.check_ref
  # @param path_to_muscle [String, false] passed through to ViralSeq.muscle_align
  # @return [Array, nil]
  #   [start_location, end_location, similarity_percent, indel?, aligned_seq, aligned_ref]
  #   on success; an all-zero Array when the computed window is empty; nil
  #   when any StandardError is raised while locating (details are printed
  #   to standard output)
  def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)

    # ViralSeq.check_muscle(path_to_muscle)
    ori_ref = ViralSeq.check_ref(ref_option)

    begin
      ori_ref_l = ori_ref.size
      # l1 / l2: number of reference positions trimmed from the 5' / 3' end
      l1 = 0
      l2 = 0

      # first pass: align against the whole reference and drop the reference
      # columns that face the leading / trailing gaps of the query
      aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test2 = $2
      ref = aln_seq[0]
      ref = ref[gap_begin..(-gap_end-1)]
      ref_size = ref.size
      if ref_size > 1.3*(seq.size)
        # the aligned query is spread over a region much longer than itself:
        # anchor on its longest ungapped stretch and narrow the reference
        l1 = l1 + gap_begin
        l2 = l2 + gap_end
        max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
        aln_test2 =~ /#{max_seq}/
        before_aln_seq = $`
        before_aln = $`.size
        post_aln_seq = $'
        post_aln = $'.size
        before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
        b1 = (1.3 * before_aln_seq_size).to_i
        post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
        b2 = (1.3 * post_aln_seq_size).to_i
        if (before_aln > seq.size) and (post_aln <= seq.size)
          ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
          l1 = l1 + (before_aln - b1)
        elsif (post_aln > seq.size) and (before_aln <= seq.size)
          ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
          l2 = l2 + post_aln - b2
        elsif (post_aln > seq.size) and (before_aln > seq.size)
          ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
          l1 = l1 + (before_aln - b1)
          l2 = l2 + (post_aln - b2)
        end

        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
        gap_begin = $1.size
        gap_end = $3.size
        ref = aln_seq[0]
        ref = ref[gap_begin..(-gap_end-1)]
      end

      # s1/g1: length of the first ungapped run of the query and of the gap
      # that follows it; s2/g2: the same measured from the 3' end
      aln_test = aln_seq[1]
      aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
      gap_begin = $1.size
      gap_end = $3.size
      aln_test = $2
      aln_test =~ /^(\w+)(\-*)\w/
      s1 = $1.size
      g1 = $2.size
      aln_test =~ /\w(\-*)(\w+)$/
      s2 = $2.size
      g2 = $1.size

      l1 = l1 + gap_begin
      l2 = l2 + gap_end
      repeat = 0

      # a short run separated from the rest by a gap more than twice its own
      # length is cut off together with the reference it was aligned to
      if g1 == g2 and (s1 + g1 + s2) == ref.size
        if s1 > s2 and g2 > 2*s2
          ref = ref[0..(-g2-1)]
          repeat = 1
          l2 = l2 + g2
        elsif s1 < s2 and g1 > 2*s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
      else
        if g1 > 2*s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
        if g2 > 2*s2
          ref = ref[0..(-g2 - 1)]
          repeat = 1
          l2 = l2 + g2
        end
      end

      # keep re-aligning against the trimmed reference until no further
      # trimming is triggered
      while repeat == 1
        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
        gap_begin = $1.size
        gap_end = $3.size
        aln_test = $2
        aln_test =~ /^(\w+)(\-*)\w/
        s1 = $1.size
        g1 = $2.size
        aln_test =~ /\w(\-*)(\w+)$/
        s2 = $2.size
        g2 = $1.size
        ref = aln_seq[0]
        ref = ref[gap_begin..(-gap_end-1)]
        l1 = l1 + gap_begin
        l2 = l2 + gap_end
        repeat = 0
        if g1 > 2*s1
          ref = ref[g1..-1]
          repeat = 1
          l1 = l1 + g1
        end
        if g2 > 2*s2
          ref = ref[0..(-g2 - 1)]
          repeat = 1
          l2 = l2 + g2
        end
      end
      ref = ori_ref[l1..(ori_ref_l - l2 - 1)]


      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      aln_test = aln_seq[1]
      ref = aln_seq[0]

      #refine alignment

      if ref =~ /^(\-+)/
        l1 = l1 - $1.size
      elsif ref =~ /(\-+)$/
        l2 = l2 + $1.size
      end

      if (ori_ref_l - l2 - 1) >= l1
        ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
        aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
        aln_test = aln_seq[1]
        ref = aln_seq[0]

        # percentage of alignment columns in which query and reference agree
        ref_size = ref.size
        sim_count = 0
        (0..(ref_size-1)).each do |n|
          ref_base = ref[n]
          test_base = aln_test[n]
          sim_count += 1 if ref_base == test_base
        end
        similarity = (sim_count/ref_size.to_f*100).round(1)

        loc_p1 = l1 + 1
        loc_p2 = ori_ref_l - l2
        if seq.size != (loc_p2 - loc_p1 + 1)
          indel = true
        elsif aln_test.include?("-")
          indel = true
        else
          indel = false
        end
        return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
      else
        # NOTE(review): this failure value has seven entries while the success
        # value has six; kept as-is for compatibility with existing callers.
        return [0,0,0,0,0,0,0]
      end
    rescue => e
      puts "Unexpected error occured."
      puts "Exception Class: #{ e.class.name }"
      puts "Exception Message: #{ e.message }"
      puts "Exception Backtrace: #{ e.backtrace[0] }"
      puts "ViralSeq.sequence_locator returns nil"
      return nil
    end
  end

  # sequence clip function
  #
  # Return the part of +seq+ that covers reference positions +p1+..+p2+.
  #
  # @param seq [String] nucleotide sequence
  # @param p1 [Integer] first reference position of the wanted sub-sequence
  # @param p2 [Integer] last reference position of the wanted sub-sequence
  # @param ref_option [Symbol] passed to ViralSeq.sequence_locator
  # @param path_to_muscle [String, false] passed to ViralSeq.sequence_locator
  # @return [String, nil] the clipped sequence with alignment gaps removed, or
  #   nil when the sequence could not be located or does not cover p1..p2
  def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    # The locator returns nil after an error and an all-zero Array when it
    # cannot place the sequence; neither carries aligned strings to clip.
    return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

    l1 = loc[0]
    l2 = loc[1]
    return nil unless (p1 >= l1) && (p2 <= l2)

    aligned_seq = loc[4]
    aligned_ref = loc[5]

    # count alignment columns from the 5' end up to reference position p1;
    # gap columns in the reference do not advance the reference position
    g1 = 0
    aligned_ref.each_char do |char|
      break if l1 == p1
      g1 += 1
      l1 += 1 unless char == "-"
    end

    # same from the 3' end down to p2; starts at 1 because it is used as a
    # negative (from-the-end) index below
    g2 = 1
    aligned_ref.reverse.each_char do |char|
      break if l2 == p2
      g2 += 1
      l2 -= 1 unless char == "-"
    end

    aligned_seq[g1..(-g2)].tr("-","")
  end

  # batch quality check of HIV sequences based on ViralSeq.sequence_locator
  #
  # @param seq_hash [Hash{String => String}] sequence name => sequence; it is
  #   not modified
  # @param start_nt [Integer, Array, Range] accepted start position(s)
  # @param end_nt [Integer, Array, Range] accepted end position(s)
  # @param indel [Boolean] when false, sequences flagged as containing indels
  #   by the locator are rejected
  # @param ref_option [Symbol] passed to ViralSeq.sequence_locator
  # @param path_to_muscle [String, false] passed to ViralSeq.sequence_locator
  # @return [Hash{String => String}] the entries of +seq_hash+ whose sequence
  #   passes, grouped by sequence in order of first appearance
  def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
    start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
    end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

    # Group names by sequence so that each distinct sequence is located only
    # once and the caller's hash never has to be rescanned or mutated.
    names_by_seq = {}
    seq_hash.each do |seq_name, seq|
      (names_by_seq[seq] ||= []) << seq_name
    end

    seq_pass = {}
    names_by_seq.each do |seq, seq_names|
      loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
      next if loc.nil? # locator failed on this sequence; treat as not passing
      next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])
      next unless indel || loc[3] == false

      seq_names.each { |seq_name| seq_pass[seq_name] = seq }
    end
    seq_pass
  end

end
|
data/lib/viral_seq/misc.rb
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
# viral_seq/misc.rb
|
2
|
-
|
3
|
-
# miscellaneous methods
|
4
|
-
# including
|
5
|
-
# Hash#copyhash
|
6
|
-
# Hash#difference
|
7
|
-
# Hash#uniq_hash
|
8
|
-
# ViralSeq::tail
|
9
|
-
|
10
|
-
class Hash

  # Hash#copyhash
  # Build a new Hash holding the same key/value pairs (a shallow copy: the
  # keys and values themselves are not duplicated).
  # Unlike plain assignment, the result is a different object:
  #   h1 = {1=>'a'}
  #   h2 = h1
  #   h3 = h1.copyhash
  #   h1.object_id == h2.object_id  # => true
  #   h1.object_id == h3.object_id  # => false
  #
  # @return [Hash]
  def copyhash
    each_with_object({}) { |(key, value), copy| copy.store(key, value) }
  end

  # Return the entries of this hash whose keys are absent from +other+.
  # Only keys are compared; values are ignored.
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)  # => {"Bird" => 2, "Snake" => 10}
  #
  # @param other [Hash]
  # @return [Hash]
  def difference(other)
    reject { |key, _value| other.has_key?(key) }
  end

  # Invert the hash, collecting every key that maps to the same value.
  #   hash = {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}
  #   hash.uniq_hash  # => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}
  #
  # Values are grouped in a single pass using Hash key equality (eql?/hash).
  # The result keeps values in order of first appearance and keys in their
  # original order.
  #
  # @return [Hash{Object => Array}]
  def uniq_hash
    each_with_object({}) do |(key, value), out_hash|
      (out_hash[value] ||= []) << key
    end
  end
end
|
69
|
-
|
70
|
-
# Tail function for file as 'tail' in bash.
module ViralSeq
  # Return the last +n+ newline-terminated lines of a file as one String.
  #
  # The file is scanned backwards in 512-byte chunks until n + 1 newline
  # characters have been seen (or the start of the file is reached), so a final
  # line without a trailing newline is returned in addition to the n lines
  # before it.
  #
  # @param path [String] path of the file to read
  # @param n [Integer] number of lines wanted
  # @return [String] the tail of the file; the whole file when it has fewer
  #   lines than requested
  def self.tail(path, n)
    # Block form so the handle is closed even when reading raises.
    File.open(path, "r") do |file|
      buffer_s = 512
      line_count = 0
      file.seek(0, IO::SEEK_END)

      offset = file.pos # we start at the end

      while line_count <= n && offset > 0
        to_read = [buffer_s, offset].min

        file.seek(offset - to_read)
        data = file.read(to_read)

        data.reverse.each_char do |c|
          if c == "\n"
            line_count += 1
            # Stop on the newline that ends the line just before the wanted
            # region and leave offset pointing right after it. Deciding here,
            # rather than on the following character, keeps the result correct
            # when this newline is the first byte of a chunk or of the file.
            break if line_count > n
          end
          offset -= 1
        end
      end

      file.seek(offset)
      file.read
    end
  end
end
|