viral_seq 1.0.6 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -4
- data/README.md +116 -45
- data/bin/locator +31 -9
- data/bin/tcs +454 -0
- data/lib/viral_seq.rb +4 -1
- data/lib/viral_seq/constant.rb +5 -1
- data/lib/viral_seq/hash.rb +1 -1
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/muscle.rb +2 -2
- data/lib/viral_seq/sdrm.rb +43 -0
- data/lib/viral_seq/seq_hash.rb +173 -42
- data/lib/viral_seq/seq_hash_pair.rb +16 -6
- data/lib/viral_seq/tcs_core.rb +305 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -1
- data/viral_seq.gemspec +5 -1
- metadata +24 -5
data/lib/viral_seq.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (c)
|
1
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
2
2
|
#
|
3
3
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
4
|
# of this software and associated documentation files (the "Software"), to deal
|
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/constant.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module ViralSeq
|
2
|
-
|
2
|
+
|
3
3
|
# array for all amino acid one letter abbreviations
|
4
4
|
|
5
5
|
AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
|
6
6
|
|
7
|
+
SDRM_HIV_PR_LIST = {}
|
8
|
+
SDRM_HIV_RT_LIST = {}
|
9
|
+
SDRM_HIV_IN_LIST = {}
|
10
|
+
|
7
11
|
end
|
data/lib/viral_seq/hash.rb
CHANGED
data/lib/viral_seq/hivdr.rb
CHANGED
data/lib/viral_seq/muscle.rb
CHANGED
@@ -39,8 +39,8 @@ module ViralSeq
|
|
39
39
|
|
40
40
|
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
41
|
temp_dir = Dir.home
|
42
|
-
temp_file = temp_dir
|
43
|
-
temp_aln = temp_dir
|
42
|
+
temp_file = File.join(temp_dir, "_temp_muscle_in")
|
43
|
+
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
44
44
|
name = ">test"
|
45
45
|
temp_in = File.open(temp_file,"w")
|
46
46
|
temp_in.puts ">ref"
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
class DRMs
|
3
|
+
def initialize (mutation_list = {})
|
4
|
+
@mutation_list = mutation_list
|
5
|
+
end
|
6
|
+
|
7
|
+
attr_accessor :mutation_list
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.sdrm_hiv_pr(seq_hash)
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.sdrm_hiv_rt(seq_hash)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.sdrm_hiv_in(seq_hash)
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.list_from_json(file)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.list_from_csv(file)
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.export_list_hiv_pr(file, format = :json)
|
26
|
+
if foramt == :json
|
27
|
+
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.export_list_hiv_rt(file, format = :json)
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.export_list_hiv_in(file, format = :json)
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
def drm_analysis(seq_hash)
|
40
|
+
mutation_list = self.mutation_list
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -130,8 +130,8 @@ module ViralSeq
|
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
133
|
-
sequence_hash = Hash[
|
134
|
-
quality_hash = Hash[
|
133
|
+
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
134
|
+
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
135
135
|
|
136
136
|
seq_hash = ViralSeq::SeqHash.new
|
137
137
|
seq_hash.dna_hash = sequence_hash
|
@@ -181,6 +181,7 @@ module ViralSeq
|
|
181
181
|
new_seqhash = ViralSeq::SeqHash.new
|
182
182
|
new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
|
183
183
|
new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
|
184
|
+
new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
|
184
185
|
new_seqhash.title = self.title + "_with_" + sh2.title
|
185
186
|
new_seqhash.file = self.file + "," + sh2.file
|
186
187
|
return new_seqhash
|
@@ -248,10 +249,12 @@ module ViralSeq
|
|
248
249
|
def translate(codon_position = 0)
|
249
250
|
seqs = self.dna_hash
|
250
251
|
@aa_hash = {}
|
251
|
-
seqs.each do |
|
252
|
-
s = ViralSeq::Sequence.new(name, seq)
|
252
|
+
seqs.uniq_hash.each do |seq, array_of_name|
|
253
|
+
s = ViralSeq::Sequence.new('name', seq)
|
253
254
|
s.translate(codon_position)
|
254
|
-
|
255
|
+
array_of_name.each do |name|
|
256
|
+
@aa_hash[name] = s.aa_string
|
257
|
+
end
|
255
258
|
end
|
256
259
|
return nil
|
257
260
|
end # end of #translate
|
@@ -310,41 +313,45 @@ module ViralSeq
|
|
310
313
|
|
311
314
|
# screen for sequences with stop codons.
|
312
315
|
# @param (see #translate)
|
313
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
314
317
|
#
|
315
|
-
# #
|
316
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
317
320
|
#   @example given a hash of sequences, return a sub-hash with only the sequences that contain stop codons
|
318
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
319
322
|
# my_seqhash.dna_hash
|
320
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
321
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
322
325
|
# stop_codon_seqhash.dna_hash
|
323
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
324
327
|
# stop_codon_seqhash.aa_hash
|
325
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
326
329
|
# stop_codon_seqhash.title
|
327
330
|
# => "my_fasta_file_stop"
|
328
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
329
332
|
# filtered_seqhash.aa_hash
|
330
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
331
334
|
|
332
335
|
def stop_codon(codon_position = 0)
|
333
336
|
self.translate(codon_position)
|
334
337
|
keys = []
|
335
|
-
self.aa_hash
|
336
|
-
|
338
|
+
aa_seqs = self.aa_hash
|
339
|
+
aa_seqs.uniq_hash.each do |seq,array_of_name|
|
340
|
+
keys += array_of_name if seq.include?('*')
|
337
341
|
end
|
338
342
|
seqhash1 = self.sub(keys)
|
339
343
|
seqhash1.title = self.title + "_stop"
|
340
|
-
keys2 =
|
344
|
+
keys2 = aa_seqs.keys - keys
|
341
345
|
seqhash2 = self.sub(keys2)
|
342
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
343
350
|
end #end of #stop_codon
|
344
351
|
|
345
352
|
|
346
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
347
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
#   @param cutoff [Float] majority cut-off for calling consensus bases. Default is (0.5). For example, a position with 15% "A" and 85% "G" will be called as "G" with a 20% cut-off and as "R" with a 10% cut-off. Using (0) applies the simple majority rule (no cut-off).
|
348
355
|
# @return [String] consensus sequence
|
349
356
|
# @example consensus sequence from an array of sequences.
|
350
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -376,11 +383,18 @@ module ViralSeq
|
|
376
383
|
base_count = all_base.count_freq
|
377
384
|
max_base_list = []
|
378
385
|
|
379
|
-
|
380
|
-
|
381
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
382
395
|
end
|
383
396
|
end
|
397
|
+
|
384
398
|
consensus_seq += call_consensus_base(max_base_list)
|
385
399
|
end
|
386
400
|
return consensus_seq
|
@@ -391,14 +405,14 @@ module ViralSeq
|
|
391
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
392
406
|
# # use the sample consensus to determine potential a3g sites
|
393
407
|
# # Two criteria to identify hypermutation
|
394
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
395
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
396
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
397
411
|
# # b/c Poisson model does not do well on small sample size.
|
398
|
-
# @return [
|
399
|
-
#
|
400
|
-
#
|
401
|
-
#
|
412
|
+
#   @return [Hash] three pairs.
|
413
|
+
#     :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
414
|
+
#     :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
415
|
+
#     :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
402
416
|
# # sequence tag
|
403
417
|
# # G to A mutation numbers at potential a3g positions
|
404
418
|
# # total potential a3g G positions
|
@@ -409,17 +423,17 @@ module ViralSeq
|
|
409
423
|
# @example identify apobec3gf mutations from a sequence fasta file
|
410
424
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
411
425
|
# hypermut = my_seqhash.a3g
|
412
|
-
# hypermut[
|
426
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
413
427
|
# => [">Seq7", ">Seq14"]
|
414
|
-
# hypermut[
|
428
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
415
429
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
416
|
-
# hypermut[
|
430
|
+
# hypermut[:stats]
|
417
431
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
418
432
|
#
|
419
433
|
# @example identify apobec3gf mutations from another sequence fasta file
|
420
434
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
421
435
|
# hypermut = my_seqhash.a3g
|
422
|
-
# hypermut[
|
436
|
+
# hypermut[:stats]
|
423
437
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
424
438
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
425
439
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -512,7 +526,10 @@ module ViralSeq
|
|
512
526
|
hm_seq_hash.title = self.title + "_hypermut"
|
513
527
|
hm_seq_hash.file = self.file
|
514
528
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
515
|
-
return
|
529
|
+
return { a3g_seq: hm_seq_hash,
|
530
|
+
filtered_seq: filtered_seq_hash,
|
531
|
+
stats: hm_hash.values
|
532
|
+
}
|
516
533
|
end #end of #a3g_hypermut
|
517
534
|
|
518
535
|
alias_method :a3g, :a3g_hypermut
|
@@ -532,7 +549,7 @@ module ViralSeq
|
|
532
549
|
if sequences.size == 0
|
533
550
|
return 0
|
534
551
|
else
|
535
|
-
cut_off =
|
552
|
+
cut_off = Float::INFINITY
|
536
553
|
l = sequences[0].size
|
537
554
|
rate = sequences.size * error_rate
|
538
555
|
count_mut = variant_for_poisson(sequences)
|
@@ -541,7 +558,7 @@ module ViralSeq
|
|
541
558
|
|
542
559
|
poisson_hash.each do |k,v|
|
543
560
|
cal = l * v
|
544
|
-
obs = count_mut[k] ? count_mut[k] :
|
561
|
+
obs = count_mut[k] ? count_mut[k] : 1
|
545
562
|
if obs >= fold_cutoff * cal
|
546
563
|
cut_off = k
|
547
564
|
break
|
@@ -726,6 +743,7 @@ module ViralSeq
|
|
726
743
|
|
727
744
|
seq_hash_unique.each do |seq|
|
728
745
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
746
|
+
next unless loc # if locator tool fails, skip this seq.
|
729
747
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
730
748
|
if indel
|
731
749
|
seq_hash_unique_pass << seq
|
@@ -904,11 +922,11 @@ module ViralSeq
|
|
904
922
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
905
923
|
# @example gap strip for an array of sequences
|
906
924
|
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
907
|
-
# array = { AACCGGTT
|
908
|
-
#
|
909
|
-
#
|
910
|
-
#
|
911
|
-
#
|
925
|
+
# array = %w{ AACCGGTT
|
926
|
+
# A-CCGGTT
|
927
|
+
# AAC-GGTT
|
928
|
+
# AACCG-TT
|
929
|
+
# AACCGGT- }
|
912
930
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
913
931
|
# puts my_seqhash.gap_strip.dna_hash.values
|
914
932
|
# ACGT
|
@@ -963,12 +981,11 @@ module ViralSeq
|
|
963
981
|
# @param (see #gap_strip)
|
964
982
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
965
983
|
# @example gap strip for an array of sequences only at the ends
|
966
|
-
# array =
|
967
|
-
#
|
968
|
-
#
|
969
|
-
#
|
970
|
-
#
|
971
|
-
# AACCGGT- }
|
984
|
+
# array = %w{ AACCGGTT
|
985
|
+
# A-CCGGTT
|
986
|
+
# AAC-GGTT
|
987
|
+
# AACCG-TT
|
988
|
+
# AACCGGT- }
|
972
989
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
973
990
|
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
974
991
|
# AACCGGT
|
@@ -1048,7 +1065,121 @@ module ViralSeq
|
|
1048
1065
|
return new_seqhash
|
1049
1066
|
end
|
1050
1067
|
|
1068
|
+
# return a table of frequencies of nucleotides at each position.
|
1069
|
+
# @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
|
1070
|
+
# @param head [Boolean] if the head of table is included.
|
1071
|
+
#   @return [Array] a two-dimensional array of the frequency table,
|
1072
|
+
# including the following info:
|
1073
|
+
# position on the sequence (starting from 1)
|
1074
|
+
# consensus nucleotide
|
1075
|
+
# total sequence numbers
|
1076
|
+
#       frequency of A as a fraction of all sequences, shows "-" if A is the reference base
|
1077
|
+
#       frequency of C as a fraction of all sequences, shows "-" if C is the reference base
|
1078
|
+
#       frequency of G as a fraction of all sequences, shows "-" if G is the reference base
|
1079
|
+
#       frequency of T as a fraction of all sequences, shows "-" if T is the reference base
|
1080
|
+
#
|
1081
|
+
# @example error table for an array of sequences
|
1082
|
+
# array = %w{ AACCGGTT
|
1083
|
+
# AGCCGGTT
|
1084
|
+
# AACTGCTT
|
1085
|
+
# AACCGTTA
|
1086
|
+
# AACCGGTA }
|
1087
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
1088
|
+
# my_seqhash.error_table.each {|r| puts r.join(',')}
|
1089
|
+
# position,consensus,total_seq_number,A,C,G,T
|
1090
|
+
# 1,A,5,-,,,
|
1091
|
+
# 2,A,5,-,,0.2,
|
1092
|
+
# 3,C,5,,-,,
|
1093
|
+
# 4,C,5,,-,,0.2
|
1094
|
+
# 5,G,5,,,-,
|
1095
|
+
# 6,G,5,,0.2,-,0.2
|
1096
|
+
# 7,T,5,,,,-
|
1097
|
+
# 8,T,5,0.4,,,-
|
1098
|
+
|
1099
|
+
def error_table(ref = self.consensus, head = true)
|
1100
|
+
|
1101
|
+
table = []
|
1102
|
+
if head
|
1103
|
+
table << %w{
|
1104
|
+
position
|
1105
|
+
consensus
|
1106
|
+
total_seq_number
|
1107
|
+
A
|
1108
|
+
C
|
1109
|
+
G
|
1110
|
+
T
|
1111
|
+
}
|
1112
|
+
end
|
1113
|
+
ref_size = ref.size
|
1114
|
+
|
1115
|
+
(0..(ref_size - 1)).each do |position|
|
1116
|
+
ref_base = ref[position]
|
1117
|
+
nts = []
|
1118
|
+
|
1119
|
+
self.dna_hash.each do |_k,v|
|
1120
|
+
nts << v[position]
|
1121
|
+
end
|
1122
|
+
|
1123
|
+
freq = nts.count_freq
|
1124
|
+
freq2 = {}
|
1125
|
+
|
1126
|
+
freq.each do |nt,c|
|
1127
|
+
if nt == ref_base
|
1128
|
+
freq2[nt] = '-'
|
1129
|
+
else
|
1130
|
+
freq2[nt] = (c/(self.size).to_f)
|
1131
|
+
end
|
1132
|
+
end
|
1133
|
+
|
1134
|
+
table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
|
1135
|
+
end
|
1136
|
+
|
1137
|
+
return table
|
1051
1138
|
|
1139
|
+
end # end of error_table
|
1140
|
+
|
1141
|
+
# randomly select n number of sequences from the original SeqHash object
|
1142
|
+
# @param n [Integer] number of sequences to randomly select
|
1143
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
|
1144
|
+
|
1145
|
+
def random_select(n = 100)
|
1146
|
+
new_sh = ViralSeq::SeqHash.new
|
1147
|
+
dna_hash = self.dna_hash
|
1148
|
+
aa_hash = self.aa_hash
|
1149
|
+
qc_hash = self.qc_hash
|
1150
|
+
|
1151
|
+
keys = dna_hash.keys.sample(n)
|
1152
|
+
|
1153
|
+
keys.each do |k|
|
1154
|
+
new_sh.dna_hash[k] = dna_hash[k]
|
1155
|
+
new_sh.aa_hash[k] = aa_hash[k]
|
1156
|
+
new_sh.qc_hash[k] = qc_hash[k]
|
1157
|
+
end
|
1158
|
+
new_sh.title = self.title + "_" + n.to_s
|
1159
|
+
return new_sh
|
1160
|
+
end
|
1161
|
+
|
1162
|
+
# trim dna sequences based on the provided reference coordinates.
|
1163
|
+
#   @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
1164
|
+
#   @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
1165
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1166
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1167
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1168
|
+
|
1169
|
+
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1170
|
+
seq_hash = self.dna_hash.dup
|
1171
|
+
seq_hash_unique = seq_hash.uniq_hash
|
1172
|
+
trimmed_seq_hash = {}
|
1173
|
+
seq_hash_unique.each do |seq, names|
|
1174
|
+
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
1175
|
+
names.each do |name|
|
1176
|
+
trimmed_seq_hash[name] = trimmed_seq
|
1177
|
+
end
|
1178
|
+
end
|
1179
|
+
return_seq_hash = self.dup
|
1180
|
+
return_seq_hash.dna_hash = trimmed_seq_hash
|
1181
|
+
return return_seq_hash
|
1182
|
+
end
|
1052
1183
|
|
1053
1184
|
# start of private functions
|
1054
1185
|
private
|