viral_seq 1.9.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -1
- data/README.md +133 -119
- data/bin/locator +2 -2
- data/bin/tcs +38 -38
- data/bin/tcs_sdrm +2 -2
- data/lib/viral_seq/R.rb +3 -1
- data/lib/viral_seq/pid.rb +1 -4
- data/lib/viral_seq/seq_hash.rb +48 -12
- data/lib/viral_seq/sequence.rb +22 -171
- data/lib/viral_seq/string.rb +3 -6
- data/lib/viral_seq/tcs_core.rb +4 -0
- data/lib/viral_seq/tcs_dr.rb +82 -1
- data/lib/viral_seq/util/drm_versions_config.json +52 -0
- data/lib/viral_seq/version.rb +2 -2
- data/lib/viral_seq.rb +2 -0
- data/viral_seq.gemspec +5 -0
- metadata +31 -5
data/lib/viral_seq/pid.rb
CHANGED
@@ -14,10 +14,7 @@ module ViralSeq
|
|
14
14
|
nt = ['A','T','C','G']
|
15
15
|
pid_pool = ['A','T','C','G']
|
16
16
|
(l-1).times do
|
17
|
-
pid_pool = pid_pool.product(nt)
|
18
|
-
pid_pool.collect! do |v|
|
19
|
-
v.join("")
|
20
|
-
end
|
17
|
+
pid_pool = pid_pool.product(nt).map(&:join)
|
21
18
|
end
|
22
19
|
return pid_pool
|
23
20
|
end # end of .generate_primer_id_pool
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -656,7 +656,7 @@ module ViralSeq
|
|
656
656
|
|
657
657
|
def nt_variants
|
658
658
|
return_obj = {}
|
659
|
-
|
659
|
+
|
660
660
|
tcs_number = self.size
|
661
661
|
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
662
662
|
fdr_hash = self.fdr
|
@@ -869,7 +869,7 @@ module ViralSeq
|
|
869
869
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
870
870
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
871
871
|
# @param indel [Boolean] allow indels or not, `true` or `false`
|
872
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
872
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
873
873
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
874
874
|
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criteria
|
875
875
|
# @example QC for sequences in a FASTA file
|
@@ -880,17 +880,19 @@ module ViralSeq
|
|
880
880
|
# filtered_seqhash.dna_hash.size
|
881
881
|
# => 4
|
882
882
|
|
883
|
-
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2
|
884
|
-
start_nt = start_nt
|
885
|
-
end_nt = end_nt
|
883
|
+
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2)
|
884
|
+
start_nt = position_helper(start_nt)
|
885
|
+
end_nt = position_helper(end_nt)
|
886
|
+
|
886
887
|
seq_hash = self.dna_hash.dup
|
887
888
|
seq_hash_unique = seq_hash.values.uniq
|
888
889
|
seq_hash_unique_pass = []
|
889
890
|
|
890
|
-
seq_hash_unique.
|
891
|
-
|
892
|
-
loc =
|
893
|
-
|
891
|
+
batch_locator = VirustLocator::Locator.exec(seq_hash_unique.join("\s"), "nt", 1, ref_option).split("\n")
|
892
|
+
seq_hash_unique.each_with_index do |seq, i|
|
893
|
+
loc = batch_locator[i]
|
894
|
+
loc = locator_helper(loc)
|
895
|
+
next unless loc
|
894
896
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
895
897
|
if indel
|
896
898
|
seq_hash_unique_pass << seq
|
@@ -898,8 +900,11 @@ module ViralSeq
|
|
898
900
|
seq_hash_unique_pass << seq
|
899
901
|
end
|
900
902
|
end
|
903
|
+
|
901
904
|
end
|
905
|
+
|
902
906
|
seq_pass = []
|
907
|
+
|
903
908
|
seq_hash_unique_pass.each do |seq|
|
904
909
|
seq_hash.each do |seq_name, orginal_seq|
|
905
910
|
if orginal_seq == seq
|
@@ -909,10 +914,10 @@ module ViralSeq
|
|
909
914
|
end
|
910
915
|
end
|
911
916
|
self.sub(seq_pass)
|
912
|
-
end # end of #hiv_seq_qc
|
917
|
+
end # end of #hiv_seq_qc
|
913
918
|
|
914
919
|
# sequence locator for SeqHash object, resembling HIV Sequence Locator from LANL
|
915
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
920
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
916
921
|
# @return [Array] two dimensional array `[[],[],[],...]` for each sequence, including the following information:
|
917
922
|
#
|
918
923
|
# title of the SeqHash object (String)
|
@@ -1341,7 +1346,7 @@ module ViralSeq
|
|
1341
1346
|
seq_hash_unique = seq_hash.uniq_hash
|
1342
1347
|
trimmed_seq_hash = {}
|
1343
1348
|
seq_hash_unique.each do |seq, names|
|
1344
|
-
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option
|
1349
|
+
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option).dna
|
1345
1350
|
names.each do |name|
|
1346
1351
|
trimmed_seq_hash[name] = trimmed_seq
|
1347
1352
|
end
|
@@ -1431,6 +1436,37 @@ module ViralSeq
|
|
1431
1436
|
var_count.sort_by{|key,_value|key}.to_h
|
1432
1437
|
end # end of #varaint_for_poisson
|
1433
1438
|
|
1439
|
+
# helper for start/end position for #hiv_seq_qc
|
1440
|
+
def position_helper(position)
|
1441
|
+
if position.is_a?(Range)
|
1442
|
+
return position
|
1443
|
+
elsif position.is_a?(Integer)
|
1444
|
+
return position..position
|
1445
|
+
elsif position.is_a?(String)
|
1446
|
+
return position.to_i..position.to_i
|
1447
|
+
elsif position.is_a?(Array)
|
1448
|
+
return position[0].to_i..position[1].to_i
|
1449
|
+
else
|
1450
|
+
raise "Position #{position} not recognized"
|
1451
|
+
end
|
1452
|
+
end # position_helper
|
1453
|
+
|
1454
|
+
# helper for batch locator
|
1455
|
+
# @param loc [String] the output of batch locator
|
1456
|
+
# @return [Array] the locator information in an array
|
1457
|
+
def locator_helper(loc)
|
1458
|
+
loc = loc.split("\t")
|
1459
|
+
loc[0] = loc[0].to_i
|
1460
|
+
loc[1] = loc[1].to_i
|
1461
|
+
loc[2] = loc[2].to_f.round(1)
|
1462
|
+
if loc[3].to_s.downcase == "true"
|
1463
|
+
loc[3] = true
|
1464
|
+
else
|
1465
|
+
loc[3] = false
|
1466
|
+
end
|
1467
|
+
return loc
|
1468
|
+
end
|
1469
|
+
|
1434
1470
|
end # end of SeqHash
|
1435
1471
|
|
1436
1472
|
end # end of ViralSeq
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -165,7 +165,7 @@ module ViralSeq
|
|
165
165
|
|
166
166
|
# HIV sequence locator function, resembling HIV Sequence Locator from LANL
|
167
167
|
# # current version only supports nucleotide sequence, not for amino acid sequence.
|
168
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
168
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
169
169
|
# @param algorithm [Integer] algorithm option passed through to VirustLocator::Locator.exec, defaults to 1
|
170
170
|
# @return [Array] an array of the following info:
|
171
171
|
#
|
@@ -181,182 +181,32 @@ module ViralSeq
|
|
181
181
|
#
|
182
182
|
# aligned_reference_sequence (String)
|
183
183
|
#
|
184
|
-
# @example identify the location of the input sequence on the
|
184
|
+
# @example identify the location of the input sequence on the HXB2 genome
|
185
185
|
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
186
186
|
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
187
|
-
# loc = s.locator(:
|
188
|
-
# h = ViralSeq::SeqHash.new; h.dna_hash['
|
187
|
+
# loc = s.locator(:HXB2)
|
188
|
+
# h = ViralSeq::SeqHash.new; h.dna_hash['HXB2'] = loc[5]; h.dna_hash[s.name] = loc[4]
|
189
189
|
# rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
|
190
|
-
# puts "The input sequence \"#{s.name}\" is located on the
|
191
|
-
# => The input sequence "my_sequence" is located on the
|
192
|
-
# => It is
|
190
|
+
# puts "The input sequence \"#{s.name}\" is located on the HXB2 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].round(1).to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
|
191
|
+
# => The input sequence "my_sequence" is located on the HXB2 nt sequence from 2333 to 2433.
|
192
|
+
# => It is 97.0% similar to the reference.
|
193
193
|
# => It does not have indels.
|
194
194
|
# => The alignment is
|
195
|
-
# =>
|
195
|
+
# => HXB2 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
|
196
196
|
# => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
|
197
197
|
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
198
|
-
|
199
|
-
def locator(ref_option = :HXB2, path_to_muscle = false)
|
198
|
+
def locator(ref_option = :HXB2, algorithm = 1)
|
200
199
|
seq = self.dna
|
201
|
-
|
202
|
-
|
200
|
+
ref = ref_option.to_s
|
203
201
|
begin
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
211
|
-
gap_begin = $1.size
|
212
|
-
gap_end = $3.size
|
213
|
-
aln_test2 = $2
|
214
|
-
ref = aln_seq[0]
|
215
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
216
|
-
ref_size = ref.size
|
217
|
-
if ref_size > 1.3*(seq.size)
|
218
|
-
l1 = l1 + gap_begin
|
219
|
-
l2 = l2 + gap_end
|
220
|
-
max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
|
221
|
-
aln_test2 =~ /#{max_seq}/
|
222
|
-
before_aln_seq = $`
|
223
|
-
before_aln = $`.size
|
224
|
-
post_aln_seq = $'
|
225
|
-
post_aln = $'.size
|
226
|
-
before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
|
227
|
-
b1 = (1.3 * before_aln_seq_size).to_i
|
228
|
-
post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
|
229
|
-
b2 = (1.3 * post_aln_seq_size).to_i
|
230
|
-
if (before_aln > seq.size) and (post_aln <= seq.size)
|
231
|
-
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
|
232
|
-
l1 = l1 + (before_aln - b1)
|
233
|
-
elsif (post_aln > seq.size) and (before_aln <= seq.size)
|
234
|
-
ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
|
235
|
-
l2 = l2 + post_aln - b2
|
236
|
-
elsif (post_aln > seq.size) and (before_aln > seq.size)
|
237
|
-
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
|
238
|
-
l1 = l1 + (before_aln - b1)
|
239
|
-
l2 = l2 + (post_aln - b2)
|
240
|
-
end
|
241
|
-
|
242
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
243
|
-
aln_test = aln_seq[1]
|
244
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
245
|
-
gap_begin = $1.size
|
246
|
-
gap_end = $3.size
|
247
|
-
ref = aln_seq[0]
|
248
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
249
|
-
end
|
250
|
-
|
251
|
-
aln_test = aln_seq[1]
|
252
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
253
|
-
gap_begin = $1.size
|
254
|
-
gap_end = $3.size
|
255
|
-
aln_test = $2
|
256
|
-
aln_test =~ /^(\w+)(\-*)\w/
|
257
|
-
s1 = $1.size
|
258
|
-
g1 = $2.size
|
259
|
-
aln_test =~ /\w(\-*)(\w+)$/
|
260
|
-
s2 = $2.size
|
261
|
-
g2 = $1.size
|
262
|
-
|
263
|
-
l1 = l1 + gap_begin
|
264
|
-
l2 = l2 + gap_end
|
265
|
-
repeat = 0
|
266
|
-
|
267
|
-
if g1 == g2 and (s1 + g1 + s2) == ref.size
|
268
|
-
if s1 > s2 and g2 >= s2
|
269
|
-
ref = ref[0..(-g2-1)]
|
270
|
-
repeat = 1
|
271
|
-
l2 = l2 + g2
|
272
|
-
elsif s1 < s2 and g1 >= s1
|
273
|
-
ref = ref[g1..-1]
|
274
|
-
repeat = 1
|
275
|
-
l1 = l1 + g1
|
276
|
-
end
|
277
|
-
else
|
278
|
-
if g1 >= s1
|
279
|
-
ref = ref[g1..-1]
|
280
|
-
repeat = 1
|
281
|
-
l1 = l1 + g1
|
282
|
-
end
|
283
|
-
if g2 >= s2
|
284
|
-
ref = ref[0..(-g2 - 1)]
|
285
|
-
repeat = 1
|
286
|
-
l2 = l2 + g2
|
287
|
-
end
|
288
|
-
end
|
289
|
-
|
290
|
-
while repeat == 1
|
291
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
292
|
-
aln_test = aln_seq[1]
|
293
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
294
|
-
gap_begin = $1.size
|
295
|
-
gap_end = $3.size
|
296
|
-
aln_test = $2
|
297
|
-
aln_test =~ /^(\w+)(\-*)\w/
|
298
|
-
s1 = $1.size
|
299
|
-
g1 = $2.size
|
300
|
-
aln_test =~ /\w(\-*)(\w+)$/
|
301
|
-
s2 = $2.size
|
302
|
-
g2 = $1.size
|
303
|
-
ref = aln_seq[0]
|
304
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
305
|
-
l1 = l1 + gap_begin
|
306
|
-
l2 = l2 + gap_end
|
307
|
-
repeat = 0
|
308
|
-
if g1 >= s1
|
309
|
-
ref = ref[g1..-1]
|
310
|
-
repeat = 1
|
311
|
-
l1 = l1 + g1
|
312
|
-
end
|
313
|
-
if g2 >= s2
|
314
|
-
ref = ref[0..(-g2 - 1)]
|
315
|
-
repeat = 1
|
316
|
-
l2 = l2 + g2
|
317
|
-
end
|
318
|
-
end
|
319
|
-
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
320
|
-
|
321
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
322
|
-
aln_test = aln_seq[1]
|
323
|
-
ref = aln_seq[0]
|
324
|
-
|
325
|
-
#refine alignment
|
326
|
-
|
327
|
-
if ref =~ /^(\-+)/
|
328
|
-
l1 = l1 - $1.size
|
329
|
-
elsif ref =~ /(\-+)$/
|
330
|
-
l2 = l2 - $1.size
|
331
|
-
end
|
332
|
-
|
333
|
-
if (ori_ref_l - l2 - 1) >= l1
|
334
|
-
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
335
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
336
|
-
aln_test = aln_seq[1]
|
337
|
-
ref = aln_seq[0]
|
338
|
-
|
339
|
-
ref_size = ref.size
|
340
|
-
sim_count = 0
|
341
|
-
(0..(ref_size-1)).each do |n|
|
342
|
-
ref_base = ref[n]
|
343
|
-
test_base = aln_test[n]
|
344
|
-
sim_count += 1 if ref_base == test_base
|
345
|
-
end
|
346
|
-
similarity = (sim_count/ref_size.to_f*100).round(1)
|
347
|
-
|
348
|
-
loc_p1 = l1 + 1
|
349
|
-
loc_p2 = ori_ref_l - l2
|
350
|
-
if seq.size != (loc_p2 - loc_p1 + 1)
|
351
|
-
indel = true
|
352
|
-
elsif aln_test.include?("-")
|
353
|
-
indel = true
|
354
|
-
else
|
355
|
-
indel = false
|
356
|
-
end
|
357
|
-
return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
|
202
|
+
loc = VirustLocator::Locator.exec(seq, "nt", algorithm, ref).split("\t")
|
203
|
+
loc[0] = loc[0].to_i
|
204
|
+
loc[1] = loc[1].to_i
|
205
|
+
loc[2] = loc[2].to_f.round(1)
|
206
|
+
if loc[3].to_s.downcase == "true"
|
207
|
+
loc[3] = true
|
358
208
|
else
|
359
|
-
|
209
|
+
loc[3] = false
|
360
210
|
end
|
361
211
|
rescue => e
|
362
212
|
puts "Unexpected error occured."
|
@@ -366,12 +216,13 @@ module ViralSeq
|
|
366
216
|
puts "ViralSeq.sequence_locator returns nil"
|
367
217
|
return nil
|
368
218
|
end
|
369
|
-
|
219
|
+
return loc
|
220
|
+
end #end of locator
|
370
221
|
|
371
222
|
# Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
|
372
223
|
# @param p1 [Integer] start position number on the reference genome
|
373
224
|
# @param p2 [Integer] end position number on the reference genome
|
374
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
225
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
375
226
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
376
227
|
# @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object covering the input range on the reference genome, or nil
|
377
228
|
# if either the start or end position is beyond the range of the target sequence.
|
@@ -381,8 +232,8 @@ module ViralSeq
|
|
381
232
|
# s.sequence_clip(2333, 2433, :HXB2).dna
|
382
233
|
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
383
234
|
|
384
|
-
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2
|
385
|
-
loc = self.locator(ref_option
|
235
|
+
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2)
|
236
|
+
loc = self.locator(ref_option)
|
386
237
|
l1 = loc[0]
|
387
238
|
l2 = loc[1]
|
388
239
|
if (p1 >= l1) & (p2 <= l2)
|
data/lib/viral_seq/string.rb
CHANGED
@@ -56,13 +56,13 @@ class String
|
|
56
56
|
Regexp.new match
|
57
57
|
end
|
58
58
|
|
59
|
-
# parse the nucleotide sequences as an Array of Array
|
59
|
+
# parse the nucleotide sequences as an Array of Array
|
60
60
|
# @return [Array] Array of Array at each position
|
61
61
|
# @example parse a sequence with ambiguities to Array of Array
|
62
62
|
# "ATRWCG".nt_to_array
|
63
63
|
# => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
|
64
|
-
|
65
|
-
def nt_to_array
|
64
|
+
|
65
|
+
def nt_to_array
|
66
66
|
return_array = []
|
67
67
|
self.each_char.each do |base|
|
68
68
|
base_array = base.to_list
|
@@ -75,9 +75,6 @@ class String
|
|
75
75
|
# compare the given nt sequence string with the ref sequence string
|
76
76
|
# @param ref [String] the ref sequence string to compare with
|
77
77
|
# @return [Integer] Number of differences
|
78
|
-
# @example parse a sequence with ambiguities to Array of Array
|
79
|
-
# "ATRWCG".nt_to_array
|
80
|
-
# => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
|
81
78
|
|
82
79
|
def nt_diff(ref)
|
83
80
|
count_diff = 0
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -331,6 +331,10 @@ module ViralSeq
|
|
331
331
|
return false
|
332
332
|
elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
333
333
|
return false
|
334
|
+
elsif seq =~ /G{11}/ # a string of poly-G indicates poor quality in 2-color chemistry
|
335
|
+
return false
|
336
|
+
elsif seq =~ /C{11}/ # a string of poly-C indicates poor quality in 2-color chemistry
|
337
|
+
return false
|
334
338
|
elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
335
339
|
return false
|
336
340
|
elsif seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
data/lib/viral_seq/tcs_dr.rb
CHANGED
@@ -186,7 +186,7 @@ module ViralSeq
|
|
186
186
|
:trim=>false},
|
187
187
|
{:region=>"PR",
|
188
188
|
:cdna=>
|
189
|
-
"
|
189
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
|
190
190
|
:forward=>
|
191
191
|
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
|
192
192
|
:majority=>0,
|
@@ -247,6 +247,87 @@ module ViralSeq
|
|
247
247
|
]
|
248
248
|
},
|
249
249
|
|
250
|
+
"v4" => {:platform_error_rate=>0.01,
|
251
|
+
:primer_pairs=>
|
252
|
+
[{:region=>"RT",
|
253
|
+
:cdna=>
|
254
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTAAGGAATGGAGGTTCTTTCTGATG",
|
255
|
+
:forward=>
|
256
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
|
257
|
+
:majority=>0,
|
258
|
+
:end_join=>true,
|
259
|
+
:end_join_option=>1,
|
260
|
+
:overlap=>0,
|
261
|
+
:TCS_QC=>true,
|
262
|
+
:ref_genome=>"HXB2",
|
263
|
+
:ref_start=>2648,
|
264
|
+
:ref_end=>3209,
|
265
|
+
:indel=>true,
|
266
|
+
:trim=>false},
|
267
|
+
{:region=>"PR",
|
268
|
+
:cdna=>
|
269
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
|
270
|
+
:forward=>
|
271
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
|
272
|
+
:majority=>0,
|
273
|
+
:end_join=>true,
|
274
|
+
:end_join_option=>3,
|
275
|
+
:TCS_QC=>true,
|
276
|
+
:ref_genome=>"HXB2",
|
277
|
+
:ref_start=>0,
|
278
|
+
:ref_end=>2591,
|
279
|
+
:indel=>true,
|
280
|
+
:trim=>true,
|
281
|
+
:trim_ref=>"HXB2",
|
282
|
+
:trim_ref_start=>2253,
|
283
|
+
:trim_ref_end=>2549},
|
284
|
+
{:region=>"IN",
|
285
|
+
:cdna=>
|
286
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCATCACCTGCCATCTGTTTTCCAT",
|
287
|
+
:forward=>"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGCAGAAGTTATYCCAGCAGAAACA",
|
288
|
+
:majority=>0,
|
289
|
+
:end_join=>true,
|
290
|
+
:end_join_option=>2,
|
291
|
+
:overlap=>3,
|
292
|
+
:TCS_QC=>true,
|
293
|
+
:ref_genome=>"HXB2",
|
294
|
+
:ref_start=>4509,
|
295
|
+
:ref_end=>5040,
|
296
|
+
:indel=>true,
|
297
|
+
:trim=>false},
|
298
|
+
{:region=>"V1V3",
|
299
|
+
:cdna=>
|
300
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
|
301
|
+
:forward=>
|
302
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
|
303
|
+
:majority=>0,
|
304
|
+
:end_join=>true,
|
305
|
+
:end_join_option=>1,
|
306
|
+
:overlap=>0,
|
307
|
+
:TCS_QC=>true,
|
308
|
+
:ref_genome=>"HXB2",
|
309
|
+
:ref_start=>6585,
|
310
|
+
:ref_end=>7205..7210,
|
311
|
+
:indel=>true,
|
312
|
+
:trim=>false},
|
313
|
+
{:region=>"CA",
|
314
|
+
:cdna=>
|
315
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCAACAAGGTTTCTGTCATCCAATTTTTTAC",
|
316
|
+
:forward=>
|
317
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGTCAGCCAAAATTACCCTATAGTGC",
|
318
|
+
:majority=>0,
|
319
|
+
:end_join=>true,
|
320
|
+
:end_join_option=>1,
|
321
|
+
:overlap=>0,
|
322
|
+
:TCS_QC=>true,
|
323
|
+
:ref_genome=>"HXB2",
|
324
|
+
:ref_start=>1196,
|
325
|
+
:ref_end=>1725,
|
326
|
+
:indel=>true,
|
327
|
+
:trim=>false}
|
328
|
+
]
|
329
|
+
},
|
330
|
+
|
250
331
|
|
251
332
|
}
|
252
333
|
|
@@ -54,6 +54,58 @@
|
|
54
54
|
}
|
55
55
|
}
|
56
56
|
|
57
|
+
},
|
58
|
+
{
|
59
|
+
"version": "v4",
|
60
|
+
"DRM_range": {
|
61
|
+
"CAI": [56,57, 66, 67, 70, 74, 105, 107],
|
62
|
+
"PI": [23, 24, 30, 32, 46, 47, 48, 50, 53, 54, 73, 76, 82, 83, 84, 88, 90],
|
63
|
+
"NRTI": [41, 65, 67, 69, 70, 74, 75, 77, 115, 116, 151, 184, 210, 215, 219],
|
64
|
+
"NNRTI": [98, 100, 101, 103, 106, 138, 179, 181, 188, 190],
|
65
|
+
"INSTI": [95, 97, 121, 140, 143, 147, 148, 155, 263]
|
66
|
+
},
|
67
|
+
"seq_coord": {
|
68
|
+
"CA": {
|
69
|
+
"minimum": 1196,
|
70
|
+
"maximum": 1725,
|
71
|
+
"gap": {
|
72
|
+
"minimum": 1466,
|
73
|
+
"maximum": 1471
|
74
|
+
}
|
75
|
+
},
|
76
|
+
"PR": {
|
77
|
+
"minimum": 2253,
|
78
|
+
"maximum": 2549
|
79
|
+
},
|
80
|
+
"RT": {
|
81
|
+
"minimum": 2648,
|
82
|
+
"maximum": 3209,
|
83
|
+
"gap": {
|
84
|
+
"minimum": 2915,
|
85
|
+
"maximum": 2949
|
86
|
+
}
|
87
|
+
},
|
88
|
+
"IN": {
|
89
|
+
"minimum": 4509,
|
90
|
+
"maximum": 5040
|
91
|
+
}
|
92
|
+
},
|
93
|
+
"seq_drm_correlation": {
|
94
|
+
"CA": ["CAI"],
|
95
|
+
"RT": ["NRTI", "NNRTI"],
|
96
|
+
"PR": ["PI"],
|
97
|
+
"IN": ["INSTI"]
|
98
|
+
},
|
99
|
+
"ref_info": {
|
100
|
+
"ref_type": "HXB2",
|
101
|
+
"ref_coord": {
|
102
|
+
"CA": [1186,1878],
|
103
|
+
"PR": [2253,2549],
|
104
|
+
"RT": [2550,3869],
|
105
|
+
"IN": [4230,5096]
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
57
109
|
},
|
58
110
|
{
|
59
111
|
"version": "v1",
|
data/lib/viral_seq/version.rb
CHANGED
data/lib/viral_seq.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -37,6 +37,9 @@ Gem::Specification.new do |spec|
|
|
37
37
|
# muscle_bio gem required
|
38
38
|
spec.add_runtime_dependency "muscle_bio", "= 0.4"
|
39
39
|
|
40
|
+
# virust-locator-ruby required
|
41
|
+
spec.add_runtime_dependency "virust-locator-ruby", "~> 0.3"
|
42
|
+
|
40
43
|
# colorize gem required
|
41
44
|
spec.add_runtime_dependency "colorize", "~> 0.1"
|
42
45
|
|
@@ -47,4 +50,6 @@ Gem::Specification.new do |spec|
|
|
47
50
|
spec.add_runtime_dependency "combine_pdf", "~> 1.0", '>= 1.0.0'
|
48
51
|
|
49
52
|
spec.requirements << 'R required for some functions'
|
53
|
+
|
54
|
+
spec.add_dependency "shellwords", "~> 0.2"
|
50
55
|
end
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
8
8
|
- Michael Clark
|
9
|
-
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bundler
|
@@ -67,6 +66,20 @@ dependencies:
|
|
67
66
|
- - '='
|
68
67
|
- !ruby/object:Gem::Version
|
69
68
|
version: '0.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: virust-locator-ruby
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.3'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.3'
|
70
83
|
- !ruby/object:Gem::Dependency
|
71
84
|
name: colorize
|
72
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -141,6 +154,20 @@ dependencies:
|
|
141
154
|
- - ">="
|
142
155
|
- !ruby/object:Gem::Version
|
143
156
|
version: 1.0.0
|
157
|
+
- !ruby/object:Gem::Dependency
|
158
|
+
name: shellwords
|
159
|
+
requirement: !ruby/object:Gem::Requirement
|
160
|
+
requirements:
|
161
|
+
- - "~>"
|
162
|
+
- !ruby/object:Gem::Version
|
163
|
+
version: '0.2'
|
164
|
+
type: :runtime
|
165
|
+
prerelease: false
|
166
|
+
version_requirements: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - "~>"
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0.2'
|
144
171
|
description: |-
|
145
172
|
A Ruby Gem with bioinformatics tools for processing viral NGS data.
|
146
173
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
@@ -226,8 +253,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
226
253
|
version: 1.3.6
|
227
254
|
requirements:
|
228
255
|
- R required for some functions
|
229
|
-
rubygems_version: 3.
|
230
|
-
signing_key:
|
256
|
+
rubygems_version: 3.6.7
|
231
257
|
specification_version: 4
|
232
258
|
summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
233
259
|
test_files: []
|