viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
data/lib/viral_seq/hcv_dr.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
# viral_seq/hcv_dr
|
2
|
-
# HCV resistant mutation interpretation
|
3
|
-
# ViralSeq::hcv_ns5a
|
4
|
-
|
5
|
-
# ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
|
6
|
-
# # amino_acid_sequence_array is Array object of the amino acid sequence.
|
7
|
-
# # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
|
8
|
-
# # start_aa_position is the starting aa number of the input sequence as Integer
|
9
|
-
|
10
|
-
module ViralSeq
  # Report HCV NS5A resistance-associated substitutions found in an
  # amino acid sequence.
  #
  # aa_array - indexable collection of amino acid calls, one per position.
  #            A call is either a single letter ("M") or a mixture written
  #            with "/" separators ("M/T").
  # start_aa - NS5A position number of the first element of aa_array
  #            (default 1).
  #
  # Returns a Hash mapping each position that carries a listed substitution
  # to [wild_type_aa, observed_call]. Positions that are not in the table,
  # or whose call contains none of the listed substitutions, are omitted.
  def self.hcv_ns5a(aa_array,start_aa=1)
    # position => [wild-type residue, substitutions reported at that position]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H','K','R','Q','A','S','D']],
      31  => ['L', ['M','V','F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D','P','S']],
      64  => ['T', ['A','S']],
      77  => ['P', ['A','S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N','H','Y']],
      92  => ['A', ['P','T','K','E']],
      93  => ['Y', ['C','F','H','N']],
      107 => ['K', ['T','S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.size.times do |index|
      position = start_aa + index
      wt_aa, resistant = sdrm[position]
      next unless wt_aa

      test_aa = aa_array[index]
      # An Array intersection is truthy even when empty, so the emptiness
      # must be tested explicitly; otherwise every mixture is reported.
      next if (test_aa.split("/") & resistant).empty?

      out_hash[position] = [wt_aa, test_aa]
    end
    out_hash
  end
end
|
data/lib/viral_seq/locator.rb
DELETED
@@ -1,299 +0,0 @@
|
|
1
|
-
# viral_seq/locator.rb
|
2
|
-
|
3
|
-
# Including following methods:
|
4
|
-
# ViralSeq::sequence_locator
|
5
|
-
# ViralSeq::sequence_clip
|
6
|
-
# ViralSeq::qc_hiv_seq_check
|
7
|
-
|
8
|
-
# HIV sequence locator function
|
9
|
-
# resembling HIV Sequence Locator from LANL
|
10
|
-
# https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
|
11
|
-
# require MUSCLE (http://www.drive5.com/muscle) installed
|
12
|
-
# the current version only supports nucleotide sequences, not amino acid sequences.
|
13
|
-
|
14
|
-
# =USAGE1
|
15
|
-
# # Find the location of a sequence
|
16
|
-
# ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
|
17
|
-
# # input_sequence: String of nucleotide sequence
|
18
|
-
# # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
|
19
|
-
# # path_to_muscle: path to the muscle executable.
|
20
|
-
# # Default as :false, will call MuscleBio to run Muscle
|
21
|
-
# # specify path_to_muscle if other source of muscle needed
|
22
|
-
# # function returns an array of
|
23
|
-
# # start_location (Integer)
|
24
|
-
# # end_location (Integer)
|
25
|
-
# # percentage_of_similarity_to_reference_sequence (Float)
|
26
|
-
# # containing_indel? (Boolean)
|
27
|
-
# # aligned_input_sequence (String)
|
28
|
-
# # aligned_reference_sequence (String)
|
29
|
-
# # example code
|
30
|
-
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
31
|
-
# p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
|
32
|
-
# => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
|
33
|
-
|
34
|
-
# =USAGE2
|
35
|
-
# ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
|
36
|
-
# # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
|
37
|
-
# # return nil if the input sequence is not in the range
|
38
|
-
# # input_sequence: String of nucleotide sequence
|
39
|
-
# # start_position and end_position: Integer of the start and end reference number of the sub-sequence
|
40
|
-
# # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
|
41
|
-
# # path_to_muscle: path to the muscle executable.
|
42
|
-
# # Default as :false, will call MuscleBio to run Muscle
|
43
|
-
# # specify path_to_muscle if other source of muscle needed
|
44
|
-
# # example code
|
45
|
-
# seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
|
46
|
-
# p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
|
47
|
-
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
48
|
-
|
49
|
-
# =USAGE3
|
50
|
-
# ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
|
51
|
-
# # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
|
52
|
-
# # and a boolean value for allowing indels,
|
53
|
-
# # path_to_muscle: path to the muscle executable.
|
54
|
-
# # Default as :false, will call MuscleBio to run Muscle
|
55
|
-
# # specify path_to_muscle if other source of muscle needed
|
56
|
-
# # return a sequence sub-hash that meets the criteria
|
57
|
-
# # example code
|
58
|
-
# sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
|
59
|
-
# filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
|
60
|
-
# puts sequence_hash.size
|
61
|
-
# => 6
|
62
|
-
# puts filtered_sequence_hash.size
|
63
|
-
# => 4
|
64
|
-
|
65
|
-
module ViralSeq
|
66
|
-
|
67
|
-
# Locate a nucleotide sequence on a reference genome by repeatedly aligning
# it against a shrinking window of the reference.
#
# seq            - query sequence (String).
# ref_option     - passed to ViralSeq.check_ref to obtain the reference.
# path_to_muscle - passed through to ViralSeq.muscle_align.
#
# Returns [start, end, similarity_percent, indel?, aligned_query,
# aligned_reference]; [0,0,0,0,0,0,0] when the final window is empty; or nil
# (after printing diagnostics) when a StandardError is raised while locating.
def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)
  ori_ref = ViralSeq.check_ref(ref_option)

  begin
    ori_ref_l = ori_ref.size
    l1 = 0 # reference bases trimmed from the 5' end so far
    l2 = 0 # reference bases trimmed from the 3' end so far

    # [leading gap count, trailing gap count, gap-trimmed core] of an aligned query
    outer_gaps = lambda do |aligned|
      aligned =~ /^(\-*)(\w.*\w)(\-*)$/
      [Regexp.last_match(1).size, Regexp.last_match(3).size, Regexp.last_match(2)]
    end

    # [first base run, first gap run, last base run, last gap run] sizes of a core
    inner_runs = lambda do |core|
      core =~ /^(\w+)(\-*)\w/
      head_bases = Regexp.last_match(1).size
      head_gap = Regexp.last_match(2).size
      core =~ /\w(\-*)(\w+)$/
      [head_bases, head_gap, Regexp.last_match(2).size, Regexp.last_match(1).size]
    end

    aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
    gap_begin, gap_end, core = outer_gaps.call(aln_seq[1])
    ref = aln_seq[0][gap_begin..(-gap_end - 1)]
    ref_size = ref.size
    query_size = seq.size

    # The query spreads over a much longer stretch of reference than its own
    # length: narrow the window around its longest ungapped run and realign.
    if ref_size > 1.3 * query_size
      l1 += gap_begin
      l2 += gap_end
      max_seq = core.scan(/[ACGT]+/).max_by(&:length)
      core =~ /#{max_seq}/
      upstream = Regexp.last_match.pre_match
      downstream = Regexp.last_match.post_match
      before_aln = upstream.size
      post_aln = downstream.size
      b1 = (1.3 * upstream.scan(/[ACGT]+/).join("").size).to_i
      b2 = (1.3 * downstream.scan(/[ACGT]+/).join("").size).to_i
      long_head = before_aln > query_size
      long_tail = post_aln > query_size

      if long_head && !long_tail
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
        l1 += before_aln - b1
      elsif long_tail && !long_head
        ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
        l2 += post_aln - b2
      elsif long_tail && long_head
        ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
        l1 += before_aln - b1
        l2 += post_aln - b2
      end

      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      gap_begin, gap_end, = outer_gaps.call(aln_seq[1])
      ref = aln_seq[0][gap_begin..(-gap_end - 1)]
    end

    gap_begin, gap_end, core = outer_gaps.call(aln_seq[1])
    s1, g1, s2, g2 = inner_runs.call(core)
    l1 += gap_begin
    l2 += gap_end
    realign = false

    if g1 == g2 && (s1 + g1 + s2) == ref.size
      # a single internal gap: drop the side holding the shorter base run
      if s1 > s2 && g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      elsif s1 < s2 && g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
    else
      if g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
      if g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      end
    end

    # keep trimming and realigning until no oversized edge gap remains
    while realign
      aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
      gap_begin, gap_end, core = outer_gaps.call(aln_seq[1])
      s1, g1, s2, g2 = inner_runs.call(core)
      ref = aln_seq[0][gap_begin..(-gap_end - 1)]
      l1 += gap_begin
      l2 += gap_end
      realign = false
      if g1 > 2 * s1
        ref = ref[g1..-1]
        realign = true
        l1 += g1
      end
      if g2 > 2 * s2
        ref = ref[0..(-g2 - 1)]
        realign = true
        l2 += g2
      end
    end

    ref = ViralSeq.muscle_align(ori_ref[l1..(ori_ref_l - l2 - 1)], seq, path_to_muscle)[0]

    # refine alignment: account for gaps at the edge of the aligned reference
    if ref =~ /^(\-+)/
      l1 -= Regexp.last_match(1).size
    elsif ref =~ /(\-+)$/
      l2 += Regexp.last_match(1).size
    end

    last_index = ori_ref_l - l2 - 1
    return [0,0,0,0,0,0,0] if last_index < l1

    aln_seq = ViralSeq.muscle_align(ori_ref[l1..last_index], seq, path_to_muscle)
    aln_test = aln_seq[1]
    ref = aln_seq[0]

    ref_size = ref.size
    sim_count = (0...ref_size).count { |n| ref[n] == aln_test[n] }
    similarity = (sim_count/ref_size.to_f*100).round(1)

    loc_p1 = l1 + 1
    loc_p2 = ori_ref_l - l2
    indel = seq.size != (loc_p2 - loc_p1 + 1) || aln_test.include?("-")

    [loc_p1, loc_p2, similarity, indel, aln_test, ref]
  rescue => e
    puts "Unexpected error occured."
    puts "Exception Class: #{ e.class.name }"
    puts "Exception Message: #{ e.message }"
    puts "Exception Backtrace: #{ e.backtrace[0] }"
    puts "ViralSeq.sequence_locator returns nil"
    nil
  end
end
|
240
|
-
|
241
|
-
# sequence clip function
|
242
|
-
# Cut the part of an input sequence that corresponds to reference positions
# p1..p2 (inclusive).
#
# seq            - query sequence (String).
# p1, p2         - first and last reference positions of the wanted region.
# ref_option     - passed through to ViralSeq.sequence_locator.
# path_to_muscle - passed through to ViralSeq.sequence_locator.
#
# Returns the clipped sequence with alignment gaps removed, or nil when the
# sequence could not be located or does not cover the whole p1..p2 range.
def self.sequence_clip(seq='', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # sequence_locator returns nil when it rescued an error ...
  return nil if loc.nil?

  l1 = loc[0]
  l2 = loc[1]
  aligned_seq = loc[4]
  aligned_ref = loc[5]
  # ... and an all-zero array when nothing could be located; neither case
  # carries aligned strings to clip from.
  return nil unless aligned_seq.is_a?(String) && aligned_ref.is_a?(String)
  return nil unless p1 >= l1 && p2 <= l2

  # Walk the aligned reference from the left until reference position p1 is
  # reached; g1 counts the alignment columns to skip (gaps included).
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == "-"
  end

  # Same from the right; g2 starts at 1 so that -g2 is a valid end index.
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == "-"
  end

  aligned_seq[g1..(-g2)].tr("-", "")
end
|
266
|
-
|
267
|
-
# batch quality check of HIV sequences based on ViralSeq.sequence_locator
|
268
|
-
# input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
|
269
|
-
# and allow the sequence to contain indels
|
270
|
-
# return a hash of filtered sequences
|
271
|
-
|
272
|
-
# Filter a sequence hash down to the sequences that locate to the expected
# region of the reference genome.
#
# seq_hash       - Hash of sequence name => sequence String. It is not
#                  modified.
# start_nt       - accepted start position(s): Integer, Array or Range.
# end_nt         - accepted end position(s): Integer, Array or Range.
# indel          - when false, sequences that the locator flags as
#                  containing indels are rejected (default true).
# ref_option     - passed through to ViralSeq.sequence_locator.
# path_to_muscle - passed through to ViralSeq.sequence_locator.
#
# Returns a new Hash of name => sequence for the sequences that pass.
# Each distinct sequence is located only once.
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel=true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  passing = seq_hash.values.uniq.select do |seq|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    # the locator returns nil when it failed; such a sequence cannot pass
    next false if loc.nil?

    start_nt.include?(loc[0]) && end_nt.include?(loc[1]) && (indel || loc[3] == false)
  end

  # Collect every name carrying a passing sequence without deleting from the
  # caller's hash.
  passing.each_with_object({}) do |seq, seq_pass|
    seq_hash.each do |seq_name, original_seq|
      seq_pass[seq_name] = seq if original_seq == seq
    end
  end
end
|
298
|
-
|
299
|
-
end
|
data/lib/viral_seq/misc.rb
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
# viral_seq/misc.rb
|
2
|
-
|
3
|
-
# miscellaneous methods
|
4
|
-
# including
|
5
|
-
# Hash#copyhash
|
6
|
-
# Hash#difference
|
7
|
-
# Hash#uniq_hash
|
8
|
-
# ViralSeq::tail
|
9
|
-
|
10
|
-
class Hash

  # Build a new Hash object holding the same key/value pairs as the receiver.
  # Unlike plain assignment, the result is a separate object:
  #   h1 = {1=>'a'}
  #   h1.copyhash.object_id == h1.object_id  # => false
  def copyhash
    each_with_object(Hash.new) do |(key, value), duplicate|
      duplicate.store(key, value)
    end
  end

  # Return the pairs of the receiver whose keys are absent from +other+.
  #   h1 = {"Cat" => 100, "Dog" => 5, "Bird" => 2, "Snake" => 10}
  #   h2 = {"Cat" => 100, "Dog" => 5, "Bison" => 30}
  #   h1.difference(h2)  # => {"Bird" => 2, "Snake" => 10}
  def difference(other)
    select { |key, _value| !other.has_key?(key) }
  end

  # Invert the receiver: each distinct value becomes a key that maps to the
  # Array of keys holding that value.
  #   {1=>"A", 2=>"A", 3=>"C", 4=>"C", 5=>"T"}.uniq_hash
  #   # => {"A"=>[1, 2], "C"=>[3, 4], "T"=>[5]}
  def uniq_hash
    values.uniq.each_with_object({}) do |uniq_va, grouped|
      holders = select { |_key, value| value == uniq_va }.keys
      grouped[uniq_va] = holders unless holders.empty?
    end
  end
end
|
69
|
-
|
70
|
-
# Tail function for file as 'tail' in bash.
|
71
|
-
module ViralSeq
  # Return the end of a file, like `tail -n` in a shell.
  #
  # path - path of the file to read.
  # n    - number of trailing lines wanted.
  #
  # The file is scanned backwards in 512-byte chunks, counting "\n" bytes,
  # until more than n newlines have been seen or the start of the file is
  # reached. Everything after the last counted newline is returned as a
  # String. The file is closed before returning, also when reading raises.
  def self.tail(path, n)
    File.open(path, "r") do |file|
      buffer_s = 512
      line_count = 0
      file.seek(0, IO::SEEK_END)
      offset = file.pos # we start at the end

      while line_count <= n && offset > 0
        to_read = [offset, buffer_s].min
        file.seek(offset - to_read)
        data = file.read(to_read)

        data.reverse.each_char do |c|
          offset -= 1
          next unless c == "\n"

          line_count += 1
          next unless line_count > n

          # offset now points at the newline that ends the unwanted part;
          # step past it right away so the result never starts with that
          # newline, even when it is the first byte of a chunk.
          offset += 1
          break
        end
      end

      file.seek(offset)
      file.read
    end
  end
end
|