viral_seq 1.0.5 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -4
- data/README.md +110 -38
- data/bin/locator +31 -9
- data/bin/tcs +450 -0
- data/lib/viral_seq.rb +4 -1
- data/lib/viral_seq/hash.rb +1 -1
- data/lib/viral_seq/hivdr.rb +2 -0
- data/lib/viral_seq/muscle.rb +2 -2
- data/lib/viral_seq/seq_hash.rb +220 -41
- data/lib/viral_seq/seq_hash_pair.rb +16 -6
- data/lib/viral_seq/tcs_core.rb +303 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -1
- data/viral_seq.gemspec +5 -1
- metadata +23 -5
data/lib/viral_seq.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (c)
|
1
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
2
2
|
#
|
3
3
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
4
|
# of this software and associated documentation files (the "Software"), to deal
|
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/hash.rb
CHANGED
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -5,6 +5,8 @@ module ViralSeq
|
|
5
5
|
# functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
|
6
6
|
# works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
|
7
7
|
# PR codon 1-99
|
8
|
+
# RT codon 34-122 (HXB2 2650-2914) and 152-236(3001-3257)
|
9
|
+
# IN codon 53-174 (HXB2 4384-4751)
|
8
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
9
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
10
12
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
data/lib/viral_seq/muscle.rb
CHANGED
@@ -39,8 +39,8 @@ module ViralSeq
|
|
39
39
|
|
40
40
|
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
41
|
temp_dir = Dir.home
|
42
|
-
temp_file = temp_dir
|
43
|
-
temp_aln = temp_dir
|
42
|
+
temp_file = File.join(temp_dir, "_temp_muscle_in")
|
43
|
+
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
44
44
|
name = ">test"
|
45
45
|
temp_in = File.open(temp_file,"w")
|
46
46
|
temp_in.puts ">ref"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -130,8 +130,8 @@ module ViralSeq
|
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
133
|
-
sequence_hash = Hash[
|
134
|
-
quality_hash = Hash[
|
133
|
+
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
134
|
+
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
135
135
|
|
136
136
|
seq_hash = ViralSeq::SeqHash.new
|
137
137
|
seq_hash.dna_hash = sequence_hash
|
@@ -166,6 +166,40 @@ module ViralSeq
|
|
166
166
|
alias_method :array, :new_from_array
|
167
167
|
end
|
168
168
|
|
169
|
+
# the size of nt sequence hash of the SeqHash object
|
170
|
+
# @return [Integer] size of nt sequence hash of the SeqHash object
|
171
|
+
|
172
|
+
def size
|
173
|
+
self.dna_hash.size
|
174
|
+
end
|
175
|
+
|
176
|
+
# combine SeqHash objects
|
177
|
+
# @param sh2 [ViralSeq::SeqHash] another SeqHash
|
178
|
+
# @return [ViralSeq::SeqHash] combined SeqHash
|
179
|
+
|
180
|
+
def +(sh2)
|
181
|
+
new_seqhash = ViralSeq::SeqHash.new
|
182
|
+
new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
|
183
|
+
new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
|
184
|
+
new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
|
185
|
+
new_seqhash.title = self.title + "_with_" + sh2.title
|
186
|
+
new_seqhash.file = self.file + "," + sh2.file
|
187
|
+
return new_seqhash
|
188
|
+
end
|
189
|
+
|
190
|
+
# write the nt sequences to a FASTA format file
|
191
|
+
# @param file [String] path to the FASTA output file
|
192
|
+
# @return [NilClass]
|
193
|
+
|
194
|
+
def write_nt_fa(file)
|
195
|
+
File.open(file, 'w') do |f|
|
196
|
+
self.dna_hash.each do |k,v|
|
197
|
+
f.puts k
|
198
|
+
f.puts v
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
169
203
|
# generate sequences in relaxed sequential phylip format from a ViralSeq::SeqHash object
|
170
204
|
# @return [String] relaxed sequential phylip format in a String object
|
171
205
|
# @example convert fasta format to relaxed sequential phylip format
|
@@ -215,10 +249,12 @@ module ViralSeq
|
|
215
249
|
def translate(codon_position = 0)
|
216
250
|
seqs = self.dna_hash
|
217
251
|
@aa_hash = {}
|
218
|
-
seqs.each do |
|
219
|
-
s = ViralSeq::Sequence.new(name, seq)
|
252
|
+
seqs.uniq_hash.each do |seq, array_of_name|
|
253
|
+
s = ViralSeq::Sequence.new('name', seq)
|
220
254
|
s.translate(codon_position)
|
221
|
-
|
255
|
+
array_of_name.each do |name|
|
256
|
+
@aa_hash[name] = s.aa_string
|
257
|
+
end
|
222
258
|
end
|
223
259
|
return nil
|
224
260
|
end # end of #translate
|
@@ -277,41 +313,45 @@ module ViralSeq
|
|
277
313
|
|
278
314
|
# screen for sequences with stop codons.
|
279
315
|
# @param (see #translate)
|
280
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
281
317
|
#
|
282
|
-
# #
|
283
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
284
320
|
# @example given a hash of sequences, return a sub-hash containing only the sequences with stop codons
|
285
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
286
322
|
# my_seqhash.dna_hash
|
287
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
288
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
289
325
|
# stop_codon_seqhash.dna_hash
|
290
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
291
327
|
# stop_codon_seqhash.aa_hash
|
292
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
293
329
|
# stop_codon_seqhash.title
|
294
330
|
# => "my_fasta_file_stop"
|
295
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
296
332
|
# filtered_seqhash.aa_hash
|
297
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
298
334
|
|
299
335
|
def stop_codon(codon_position = 0)
|
300
336
|
self.translate(codon_position)
|
301
337
|
keys = []
|
302
|
-
self.aa_hash
|
303
|
-
|
338
|
+
aa_seqs = self.aa_hash
|
339
|
+
aa_seqs.uniq_hash.each do |seq,array_of_name|
|
340
|
+
keys += array_of_name if seq.include?('*')
|
304
341
|
end
|
305
342
|
seqhash1 = self.sub(keys)
|
306
343
|
seqhash1.title = self.title + "_stop"
|
307
|
-
keys2 =
|
344
|
+
keys2 = aa_seqs.keys - keys
|
308
345
|
seqhash2 = self.sub(keys2)
|
309
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
310
350
|
end #end of #stop_codon
|
311
351
|
|
312
352
|
|
313
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
314
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. Default is (0.5). For example, a position with 15% "A" and 85% "G" will be called as "G" with a 20% cut-off and as "R" with a 10% cut-off. Using (0) will simply apply the majority rule (no cut-off).
|
315
355
|
# @return [String] consensus sequence
|
316
356
|
# @example consensus sequence from an array of sequences.
|
317
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -343,11 +383,18 @@ module ViralSeq
|
|
343
383
|
base_count = all_base.count_freq
|
344
384
|
max_base_list = []
|
345
385
|
|
346
|
-
|
347
|
-
|
348
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
349
395
|
end
|
350
396
|
end
|
397
|
+
|
351
398
|
consensus_seq += call_consensus_base(max_base_list)
|
352
399
|
end
|
353
400
|
return consensus_seq
|
@@ -358,14 +405,14 @@ module ViralSeq
|
|
358
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
359
406
|
# # use the sample consensus to determine potential a3g sites
|
360
407
|
# # Two criteria to identify hypermutation
|
361
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
362
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
363
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
364
411
|
# # b/c Poisson model does not do well on small sample size.
|
365
|
-
# @return [
|
366
|
-
#
|
367
|
-
#
|
368
|
-
#
|
412
|
+
# @return [Hash] three pairs.
|
413
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
414
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
415
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
369
416
|
# # sequence tag
|
370
417
|
# # G to A mutation numbers at potential a3g positions
|
371
418
|
# # total potential a3g G positions
|
@@ -376,17 +423,17 @@ module ViralSeq
|
|
376
423
|
# @example identify apobec3gf mutations from a sequence fasta file
|
377
424
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
378
425
|
# hypermut = my_seqhash.a3g
|
379
|
-
# hypermut[
|
426
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
380
427
|
# => [">Seq7", ">Seq14"]
|
381
|
-
# hypermut[
|
428
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
382
429
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
383
|
-
# hypermut[
|
430
|
+
# hypermut[:stats]
|
384
431
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
385
432
|
#
|
386
433
|
# @example identify apobec3gf mutations from another sequence fasta file
|
387
434
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
388
435
|
# hypermut = my_seqhash.a3g
|
389
|
-
# hypermut[
|
436
|
+
# hypermut[:stats]
|
390
437
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
391
438
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
392
439
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -479,7 +526,10 @@ module ViralSeq
|
|
479
526
|
hm_seq_hash.title = self.title + "_hypermut"
|
480
527
|
hm_seq_hash.file = self.file
|
481
528
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
482
|
-
return
|
529
|
+
return { a3g_seq: hm_seq_hash,
|
530
|
+
filtered_seq: filtered_seq_hash,
|
531
|
+
stats: hm_hash.values
|
532
|
+
}
|
483
533
|
end #end of #a3g_hypermut
|
484
534
|
|
485
535
|
alias_method :a3g, :a3g_hypermut
|
@@ -693,6 +743,7 @@ module ViralSeq
|
|
693
743
|
|
694
744
|
seq_hash_unique.each do |seq|
|
695
745
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
746
|
+
next unless loc # if locator tool fails, skip this seq.
|
696
747
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
697
748
|
if indel
|
698
749
|
seq_hash_unique_pass << seq
|
@@ -748,7 +799,7 @@ module ViralSeq
|
|
748
799
|
s.rc!
|
749
800
|
loc2 = s.locator(ref_option)
|
750
801
|
loc1[2] >= loc2[2] ? (direction = :+; loc = loc1): (direction = :-; loc = loc2)
|
751
|
-
|
802
|
+
|
752
803
|
names.each do |name|
|
753
804
|
out_array << ([title, name, ref_option.to_s, direction.to_s] + loc)
|
754
805
|
end
|
@@ -871,11 +922,11 @@ module ViralSeq
|
|
871
922
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
872
923
|
# @example gap strip for an array of sequences
|
873
924
|
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
874
|
-
# array = { AACCGGTT
|
875
|
-
#
|
876
|
-
#
|
877
|
-
#
|
878
|
-
#
|
925
|
+
# array = %w{ AACCGGTT
|
926
|
+
# A-CCGGTT
|
927
|
+
# AAC-GGTT
|
928
|
+
# AACCG-TT
|
929
|
+
# AACCGGT- }
|
879
930
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
880
931
|
# puts my_seqhash.gap_strip.dna_hash.values
|
881
932
|
# ACGT
|
@@ -930,12 +981,11 @@ module ViralSeq
|
|
930
981
|
# @param (see #gap_strip)
|
931
982
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
932
983
|
# @example gap strip for an array of sequences only at the ends
|
933
|
-
# array =
|
934
|
-
#
|
935
|
-
#
|
936
|
-
#
|
937
|
-
#
|
938
|
-
# AACCGGT- }
|
984
|
+
# array = %w{ AACCGGTT
|
985
|
+
# A-CCGGTT
|
986
|
+
# AAC-GGTT
|
987
|
+
# AACCG-TT
|
988
|
+
# AACCGGT- }
|
939
989
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
940
990
|
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
941
991
|
# AACCGGT
|
@@ -999,8 +1049,137 @@ module ViralSeq
|
|
999
1049
|
end
|
1000
1050
|
|
1001
1051
|
|
1052
|
+
# mutate @dna_hash based on the error_rate
|
1053
|
+
# @param error_rate [Float] error rate used to mutate sequences.
|
1054
|
+
# @return [ViralSeq::SeqHash] new SeqHash object of mutated sequences.
|
1055
|
+
|
1056
|
+
def mutation(error_rate = 0.01)
|
1057
|
+
new_seqhash = ViralSeq::SeqHash.new
|
1058
|
+
dna = {}
|
1059
|
+
self.dna_hash.each do |name, seq|
|
1060
|
+
dna[name + '_mut-' + error_rate.to_s] = seq.mutation(error_rate)
|
1061
|
+
end
|
1062
|
+
new_seqhash.dna_hash = dna
|
1063
|
+
new_seqhash.title = self.title + "_mut-" + error_rate.to_s
|
1064
|
+
new_seqhash.file = self.file
|
1065
|
+
return new_seqhash
|
1066
|
+
end
|
1067
|
+
|
1068
|
+
# return a table of frequencies of nucleotides at each position.
|
1069
|
+
# @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
|
1070
|
+
# @param head [Boolean] if the head of table is included.
|
1071
|
+
# @return [Array] a two-dimension array of the frequency table,
|
1072
|
+
# including the following info:
|
1073
|
+
# position on the sequence (starting from 1)
|
1074
|
+
# consensus nucleotide
|
1075
|
+
# total sequence numbers
|
1076
|
+
# percentage of A, shows "-" if agrees with consensus
|
1077
|
+
# percentage of C, shows "-" if agrees with consensus
|
1078
|
+
# percentage of G, shows "-" if agrees with consensus
|
1079
|
+
# percentage of T, shows "-" if agrees with consensus
|
1080
|
+
#
|
1081
|
+
# @example error table for an array of sequences
|
1082
|
+
# array = %w{ AACCGGTT
|
1083
|
+
# AGCCGGTT
|
1084
|
+
# AACTGCTT
|
1085
|
+
# AACCGTTA
|
1086
|
+
# AACCGGTA }
|
1087
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
1088
|
+
# my_seqhash.error_table.each {|r| puts r.join(',')}
|
1089
|
+
# position,consensus,total_seq_number,A,C,G,T
|
1090
|
+
# 1,A,5,-,,,
|
1091
|
+
# 2,A,5,-,,0.2,
|
1092
|
+
# 3,C,5,,-,,
|
1093
|
+
# 4,C,5,,-,,0.2
|
1094
|
+
# 5,G,5,,,-,
|
1095
|
+
# 6,G,5,,0.2,-,0.2
|
1096
|
+
# 7,T,5,,,,-
|
1097
|
+
# 8,T,5,0.4,,,-
|
1098
|
+
|
1099
|
+
def error_table(ref = self.consensus, head = true)
|
1100
|
+
|
1101
|
+
table = []
|
1102
|
+
if head
|
1103
|
+
table << %w{
|
1104
|
+
position
|
1105
|
+
consensus
|
1106
|
+
total_seq_number
|
1107
|
+
A
|
1108
|
+
C
|
1109
|
+
G
|
1110
|
+
T
|
1111
|
+
}
|
1112
|
+
end
|
1113
|
+
ref_size = ref.size
|
1114
|
+
|
1115
|
+
(0..(ref_size - 1)).each do |position|
|
1116
|
+
ref_base = ref[position]
|
1117
|
+
nts = []
|
1118
|
+
|
1119
|
+
self.dna_hash.each do |_k,v|
|
1120
|
+
nts << v[position]
|
1121
|
+
end
|
1122
|
+
|
1123
|
+
freq = nts.count_freq
|
1124
|
+
freq2 = {}
|
1125
|
+
|
1126
|
+
freq.each do |nt,c|
|
1127
|
+
if nt == ref_base
|
1128
|
+
freq2[nt] = '-'
|
1129
|
+
else
|
1130
|
+
freq2[nt] = (c/(self.size).to_f)
|
1131
|
+
end
|
1132
|
+
end
|
1133
|
+
|
1134
|
+
table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
|
1135
|
+
end
|
1136
|
+
|
1137
|
+
return table
|
1138
|
+
|
1139
|
+
end # end of error_table
|
1140
|
+
|
1141
|
+
# randomly select n number of sequences from the original SeqHash object
|
1142
|
+
# @param n [Integer] number of sequences to randomly select
|
1143
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
|
1144
|
+
|
1145
|
+
def random_select(n = 100)
|
1146
|
+
new_sh = ViralSeq::SeqHash.new
|
1147
|
+
dna_hash = self.dna_hash
|
1148
|
+
aa_hash = self.aa_hash
|
1149
|
+
qc_hash = self.qc_hash
|
1150
|
+
|
1151
|
+
keys = dna_hash.keys.sample(n)
|
1002
1152
|
|
1153
|
+
keys.each do |k|
|
1154
|
+
new_sh.dna_hash[k] = dna_hash[k]
|
1155
|
+
new_sh.aa_hash[k] = aa_hash[k]
|
1156
|
+
new_sh.qc_hash[k] = qc_hash[k]
|
1157
|
+
end
|
1158
|
+
new_sh.title = self.title + "_" + n.to_s
|
1159
|
+
return new_sh
|
1160
|
+
end
|
1003
1161
|
|
1162
|
+
# trim dna sequences based on the provided reference coordinates.
|
1163
|
+
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1164
|
+
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1165
|
+
# @param ref_option [Symbol] name of the reference genome, options are `:HXB2`, `:NL43`, `:MAC239`
|
1166
|
+
# @param path_to_muscle [String] path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1167
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1168
|
+
|
1169
|
+
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1170
|
+
seq_hash = self.dna_hash.dup
|
1171
|
+
seq_hash_unique = seq_hash.uniq_hash
|
1172
|
+
trimmed_seq_hash = {}
|
1173
|
+
seq_hash_unique.each do |seq, names|
|
1174
|
+
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
1175
|
+
names.each do |name|
|
1176
|
+
trimmed_seq_hash[name] = trimmed_seq
|
1177
|
+
end
|
1178
|
+
end
|
1179
|
+
return_seq_hash = self.dup
|
1180
|
+
return_seq_hash.dna_hash = trimmed_seq_hash
|
1181
|
+
return return_seq_hash
|
1182
|
+
end
|
1004
1183
|
|
1005
1184
|
# start of private functions
|
1006
1185
|
private
|
@@ -7,7 +7,7 @@ module ViralSeq
|
|
7
7
|
# @example join the paired-end sequences with an overlap of 100 bp
|
8
8
|
# my_seqhashpair.join1(100)
|
9
9
|
# @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
|
10
|
-
# my_seqhashpair.
|
10
|
+
# my_seqhashpair.join2(model: :indiv)
|
11
11
|
|
12
12
|
class SeqHashPair
|
13
13
|
|
@@ -80,6 +80,12 @@ module ViralSeq
|
|
80
80
|
alias_method :fa, :new_from_fasta
|
81
81
|
end
|
82
82
|
|
83
|
+
# the size of nt sequence hash of the SeqHashPair object
|
84
|
+
# @return [Integer] size of nt sequence hash of the SeqHashPair object
|
85
|
+
def size
|
86
|
+
self.dna_hash.size
|
87
|
+
end
|
88
|
+
|
83
89
|
# Pair-end join function for KNOWN overlap size.
|
84
90
|
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
91
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
@@ -104,17 +110,21 @@ module ViralSeq
|
|
104
110
|
raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
|
105
111
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
106
112
|
joined_seq = {}
|
107
|
-
seq_pair_hash.each do |
|
113
|
+
seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
|
108
114
|
r1_seq = seq_pair[0]
|
109
115
|
r2_seq = seq_pair[1]
|
110
116
|
if overlap.zero?
|
111
|
-
|
117
|
+
joined_sequence = r1_seq + r2_seq
|
112
118
|
elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
|
113
|
-
|
119
|
+
joined_sequence= r1_seq + r2_seq[overlap..-1]
|
114
120
|
else
|
115
121
|
next
|
116
122
|
end
|
123
|
+
seq_names.each do |seq_name|
|
124
|
+
joined_seq[seq_name] = joined_sequence
|
125
|
+
end
|
117
126
|
end
|
127
|
+
|
118
128
|
joined_seq_hash = ViralSeq::SeqHash.new
|
119
129
|
joined_seq_hash.dna_hash = joined_seq
|
120
130
|
joined_seq_hash.title = self.title + "_joined"
|
@@ -139,7 +149,7 @@ module ViralSeq
|
|
139
149
|
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
150
|
# my_seqhashpair.join2.dna_hash
|
141
151
|
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
-
# my_seqhashpair.join2(model :indiv).dna_hash
|
152
|
+
# my_seqhashpair.join2(model: :indiv).dna_hash
|
143
153
|
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
154
|
|
145
155
|
def join2(model: :con, diff: 0.0)
|
@@ -207,7 +217,7 @@ module ViralSeq
|
|
207
217
|
# {minimal overlap set to 4. }
|
208
218
|
def overlap_matrix(sequence1, sequence2)
|
209
219
|
min_overlap = 4
|
210
|
-
max_overlap = [sequence1.size, sequence2.size].
|
220
|
+
max_overlap = [sequence1.size, sequence2.size].min
|
211
221
|
matrix_hash = {}
|
212
222
|
(min_overlap..max_overlap).each do |overlap|
|
213
223
|
matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
|