viral_seq 1.0.7 → 1.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +119 -50
- data/bin/locator +20 -0
- data/bin/tcs +454 -0
- data/lib/viral_seq.rb +4 -1
- data/lib/viral_seq/constant.rb +5 -1
- data/lib/viral_seq/enumerable.rb +0 -10
- data/lib/viral_seq/hash.rb +1 -1
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/sdrm.rb +43 -0
- data/lib/viral_seq/seq_hash.rb +61 -25
- data/lib/viral_seq/seq_hash_pair.rb +7 -1
- data/lib/viral_seq/tcs_core.rb +305 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -1
- data/viral_seq.gemspec +1 -1
- metadata +10 -5
data/lib/viral_seq.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (c)
|
1
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
2
2
|
#
|
3
3
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
4
|
# of this software and associated documentation files (the "Software"), to deal
|
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/constant.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module ViralSeq
|
2
|
-
|
2
|
+
|
3
3
|
# array for all amino acid one letter abbreviations
|
4
4
|
|
5
5
|
AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
|
6
6
|
|
7
|
+
SDRM_HIV_PR_LIST = {}
|
8
|
+
SDRM_HIV_RT_LIST = {}
|
9
|
+
SDRM_HIV_IN_LIST = {}
|
10
|
+
|
7
11
|
end
|
data/lib/viral_seq/enumerable.rb
CHANGED
@@ -3,10 +3,6 @@
|
|
3
3
|
# array = [1,2,3,4,5,6,7,8,9,10]
|
4
4
|
# array.median
|
5
5
|
# => 5.5
|
6
|
-
# @example sum
|
7
|
-
# array = [1,2,3,4,5,6,7,8,9,10]
|
8
|
-
# array.sum
|
9
|
-
# => 55
|
10
6
|
# @example average number (mean)
|
11
7
|
# array = [1,2,3,4,5,6,7,8,9,10]
|
12
8
|
# array.mean
|
@@ -45,12 +41,6 @@ module Enumerable
|
|
45
41
|
len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
|
46
42
|
end
|
47
43
|
|
48
|
-
# generate summed value
|
49
|
-
# @return [Numeric] summed value
|
50
|
-
def sum
|
51
|
-
self.inject(0){|accum, i| accum + i }
|
52
|
-
end
|
53
|
-
|
54
44
|
# generate mean number
|
55
45
|
# @return [Float] mean value
|
56
46
|
def mean
|
data/lib/viral_seq/hash.rb
CHANGED
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
class DRMs
|
3
|
+
def initialize (mutation_list = {})
|
4
|
+
@mutation_list = mutation_list
|
5
|
+
end
|
6
|
+
|
7
|
+
attr_accessor :mutation_list
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.sdrm_hiv_pr(seq_hash)
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.sdrm_hiv_rt(seq_hash)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.sdrm_hiv_in(seq_hash)
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.list_from_json(file)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.list_from_csv(file)
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.export_list_hiv_pr(file, format = :json)
|
26
|
+
if foramt == :json
|
27
|
+
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.export_list_hiv_rt(file, format = :json)
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.export_list_hiv_in(file, format = :json)
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
def drm_analysis(seq_hash)
|
40
|
+
mutation_list = self.mutation_list
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -130,8 +130,8 @@ module ViralSeq
|
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
133
|
-
sequence_hash = Hash[
|
134
|
-
quality_hash = Hash[
|
133
|
+
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
134
|
+
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
135
135
|
|
136
136
|
seq_hash = ViralSeq::SeqHash.new
|
137
137
|
seq_hash.dna_hash = sequence_hash
|
@@ -181,6 +181,7 @@ module ViralSeq
|
|
181
181
|
new_seqhash = ViralSeq::SeqHash.new
|
182
182
|
new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
|
183
183
|
new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
|
184
|
+
new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
|
184
185
|
new_seqhash.title = self.title + "_with_" + sh2.title
|
185
186
|
new_seqhash.file = self.file + "," + sh2.file
|
186
187
|
return new_seqhash
|
@@ -312,22 +313,22 @@ module ViralSeq
|
|
312
313
|
|
313
314
|
# screen for sequences with stop codons.
|
314
315
|
# @param (see #translate)
|
315
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
316
317
|
#
|
317
|
-
# #
|
318
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
319
320
|
# @example given a hash of sequences, return a sub-hash with only the sequences that contain stop codons
|
320
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
321
322
|
# my_seqhash.dna_hash
|
322
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
323
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
324
325
|
# stop_codon_seqhash.dna_hash
|
325
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
326
327
|
# stop_codon_seqhash.aa_hash
|
327
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
328
329
|
# stop_codon_seqhash.title
|
329
330
|
# => "my_fasta_file_stop"
|
330
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
331
332
|
# filtered_seqhash.aa_hash
|
332
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
333
334
|
|
@@ -342,12 +343,15 @@ module ViralSeq
|
|
342
343
|
seqhash1.title = self.title + "_stop"
|
343
344
|
keys2 = aa_seqs.keys - keys
|
344
345
|
seqhash2 = self.sub(keys2)
|
345
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
346
350
|
end #end of #stop_codon
|
347
351
|
|
348
352
|
|
349
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
350
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will simply use the majority rule (no cutoff)
|
351
355
|
# @return [String] consensus sequence
|
352
356
|
# @example consensus sequence from an array of sequences.
|
353
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -379,11 +383,18 @@ module ViralSeq
|
|
379
383
|
base_count = all_base.count_freq
|
380
384
|
max_base_list = []
|
381
385
|
|
382
|
-
|
383
|
-
|
384
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
385
395
|
end
|
386
396
|
end
|
397
|
+
|
387
398
|
consensus_seq += call_consensus_base(max_base_list)
|
388
399
|
end
|
389
400
|
return consensus_seq
|
@@ -394,14 +405,14 @@ module ViralSeq
|
|
394
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
395
406
|
# # use the sample consensus to determine potential a3g sites
|
396
407
|
# # Two criteria to identify hypermutation
|
397
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
398
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
399
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
400
411
|
# # b/c Poisson model does not do well on small sample size.
|
401
|
-
# @return [
|
402
|
-
#
|
403
|
-
#
|
404
|
-
#
|
412
|
+
# @return [Hash] three pairs.
|
413
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
414
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
415
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
405
416
|
# # sequence tag
|
406
417
|
# # G to A mutation numbers at potential a3g positions
|
407
418
|
# # total potential a3g G positions
|
@@ -412,17 +423,17 @@ module ViralSeq
|
|
412
423
|
# @example identify apobec3gf mutations from a sequence fasta file
|
413
424
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
414
425
|
# hypermut = my_seqhash.a3g
|
415
|
-
# hypermut[
|
426
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
416
427
|
# => [">Seq7", ">Seq14"]
|
417
|
-
# hypermut[
|
428
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
418
429
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
419
|
-
# hypermut[
|
430
|
+
# hypermut[:stats]
|
420
431
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
421
432
|
#
|
422
433
|
# @example identify apobec3gf mutations from another sequence fasta file
|
423
434
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
424
435
|
# hypermut = my_seqhash.a3g
|
425
|
-
# hypermut[
|
436
|
+
# hypermut[:stats]
|
426
437
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
427
438
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
428
439
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -515,7 +526,10 @@ module ViralSeq
|
|
515
526
|
hm_seq_hash.title = self.title + "_hypermut"
|
516
527
|
hm_seq_hash.file = self.file
|
517
528
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
518
|
-
return
|
529
|
+
return { a3g_seq: hm_seq_hash,
|
530
|
+
filtered_seq: filtered_seq_hash,
|
531
|
+
stats: hm_hash.values
|
532
|
+
}
|
519
533
|
end #end of #a3g_hypermut
|
520
534
|
|
521
535
|
alias_method :a3g, :a3g_hypermut
|
@@ -535,7 +549,7 @@ module ViralSeq
|
|
535
549
|
if sequences.size == 0
|
536
550
|
return 0
|
537
551
|
else
|
538
|
-
cut_off =
|
552
|
+
cut_off = Float::INFINITY
|
539
553
|
l = sequences[0].size
|
540
554
|
rate = sequences.size * error_rate
|
541
555
|
count_mut = variant_for_poisson(sequences)
|
@@ -544,7 +558,7 @@ module ViralSeq
|
|
544
558
|
|
545
559
|
poisson_hash.each do |k,v|
|
546
560
|
cal = l * v
|
547
|
-
obs = count_mut[k] ? count_mut[k] :
|
561
|
+
obs = count_mut[k] ? count_mut[k] : 1
|
548
562
|
if obs >= fold_cutoff * cal
|
549
563
|
cut_off = k
|
550
564
|
break
|
@@ -729,6 +743,7 @@ module ViralSeq
|
|
729
743
|
|
730
744
|
seq_hash_unique.each do |seq|
|
731
745
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
746
|
+
next unless loc # if locator tool fails, skip this seq.
|
732
747
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
733
748
|
if indel
|
734
749
|
seq_hash_unique_pass << seq
|
@@ -1144,6 +1159,27 @@ module ViralSeq
|
|
1144
1159
|
return new_sh
|
1145
1160
|
end
|
1146
1161
|
|
1162
|
+
# trim dna sequences based on the provided reference coordinates.
|
1163
|
+
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1164
|
+
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1165
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1166
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1167
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1168
|
+
|
1169
|
+
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1170
|
+
seq_hash = self.dna_hash.dup
|
1171
|
+
seq_hash_unique = seq_hash.uniq_hash
|
1172
|
+
trimmed_seq_hash = {}
|
1173
|
+
seq_hash_unique.each do |seq, names|
|
1174
|
+
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
1175
|
+
names.each do |name|
|
1176
|
+
trimmed_seq_hash[name] = trimmed_seq
|
1177
|
+
end
|
1178
|
+
end
|
1179
|
+
return_seq_hash = self.dup
|
1180
|
+
return_seq_hash.dna_hash = trimmed_seq_hash
|
1181
|
+
return return_seq_hash
|
1182
|
+
end
|
1147
1183
|
|
1148
1184
|
# start of private functions
|
1149
1185
|
private
|
@@ -80,6 +80,12 @@ module ViralSeq
|
|
80
80
|
alias_method :fa, :new_from_fasta
|
81
81
|
end
|
82
82
|
|
83
|
+
# the size of nt sequence hash of the SeqHashPair object
|
84
|
+
# @return [Integer] size of nt sequence hash of the SeqHashPair object
|
85
|
+
def size
|
86
|
+
self.dna_hash.size
|
87
|
+
end
|
88
|
+
|
83
89
|
# Pair-end join function for KNOWN overlap size.
|
84
90
|
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
91
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
@@ -211,7 +217,7 @@ module ViralSeq
|
|
211
217
|
# {minimal overlap set to 4. }
|
212
218
|
def overlap_matrix(sequence1, sequence2)
|
213
219
|
min_overlap = 4
|
214
|
-
max_overlap = [sequence1.size, sequence2.size].
|
220
|
+
max_overlap = [sequence1.size, sequence2.size].min
|
215
221
|
matrix_hash = {}
|
216
222
|
(min_overlap..max_overlap).each do |overlap|
|
217
223
|
matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
|
@@ -0,0 +1,305 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
|
3
|
+
# Core functions for `tcs` pipeline
|
4
|
+
|
5
|
+
class TcsCore
|
6
|
+
class << self
|
7
|
+
|
8
|
+
# methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
|
9
|
+
|
10
|
+
def calculate_cut_off(m, error_rate = 0.02)
|
11
|
+
n = 0
|
12
|
+
case error_rate
|
13
|
+
when 0.005...0.015
|
14
|
+
if m <= 10
|
15
|
+
n = 2
|
16
|
+
else
|
17
|
+
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
18
|
+
end
|
19
|
+
|
20
|
+
when 0...0.005
|
21
|
+
if m <= 10
|
22
|
+
n = 2
|
23
|
+
else
|
24
|
+
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
25
|
+
end
|
26
|
+
|
27
|
+
else
|
28
|
+
if m <= 10
|
29
|
+
n = 2
|
30
|
+
elsif m <= 8500
|
31
|
+
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
32
|
+
else
|
33
|
+
n = 0.0079 * m + 9.4869
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
n = n.round
|
38
|
+
n = 2 if n < 3
|
39
|
+
return n
|
40
|
+
end
|
41
|
+
|
42
|
+
# identify which file in the directory is R1 file, and which is R2 file based on file names
|
43
|
+
# input as directory (Dir object or a string of path)
|
44
|
+
# by default, .gz files will be unzipped.
|
45
|
+
# return as a hash of {r1_file: file1, r2_file: file2}
|
46
|
+
def r1r2(directory, unzip = true)
|
47
|
+
files = []
|
48
|
+
Dir.chdir(directory) { files = Dir.glob "*" }
|
49
|
+
r1_file = ""
|
50
|
+
r2_file = ""
|
51
|
+
files.each do |f|
|
52
|
+
tag = parser_file_name(f)[:tag]
|
53
|
+
|
54
|
+
if tag.include? "R1"
|
55
|
+
unzip ? r1_file = unzip_r(directory, f) : r1_file = File.join(directory, f)
|
56
|
+
elsif tag.include? "R2"
|
57
|
+
unzip ? r2_file = unzip_r(directory, f) : r2_file = File.join(directory, f)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
return { r1_file: r1_file, r2_file: r2_file }
|
61
|
+
end # end of ViralSeq:TcsCore.r1r2
|
62
|
+
|
63
|
+
# sort directories containing multiple r1 and r2 files.
|
64
|
+
# use the library name (first string before "_") to separate libraries
|
65
|
+
# out_dir is the Dir object or string of the output directory, by default named as directory + "_sorted"
|
66
|
+
# return a hash as { with_both_r1_r2: [lib1, lib2, ...], missing_r1: [lib1, lib2, ...], missing_r2: [lib1, lib2, ...], error: [lib1, lib2, ...]}
|
67
|
+
|
68
|
+
def sort_by_lib(directory, out_dir = directory + "_sorted")
|
69
|
+
Dir.mkdir(out_dir) unless File.directory?(out_dir)
|
70
|
+
files = []
|
71
|
+
Dir.chdir(directory) {files = Dir.glob("*")}
|
72
|
+
|
73
|
+
files.each do |file|
|
74
|
+
path = File.join(directory,file)
|
75
|
+
index = file.split("_")[0]
|
76
|
+
index_dir = File.join(out_dir, index)
|
77
|
+
Dir.mkdir(index_dir) unless File.directory?(index_dir)
|
78
|
+
File.rename(path, File.join(index_dir, file))
|
79
|
+
end
|
80
|
+
|
81
|
+
return_obj = { with_both_r1_r2: [],
|
82
|
+
missing_r1: [],
|
83
|
+
missing_r2: [],
|
84
|
+
error: []
|
85
|
+
}
|
86
|
+
|
87
|
+
libs = []
|
88
|
+
Dir.chdir(out_dir) { libs = Dir.glob('*') }
|
89
|
+
libs.each do |lib|
|
90
|
+
file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
|
91
|
+
if !file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
92
|
+
return_obj[:with_both_r1_r2] << lib
|
93
|
+
elsif file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
94
|
+
return_obj[:missing_r1] << lib
|
95
|
+
elsif file_check[:r2_file].empty? and !file_check[:r1_file].empty?
|
96
|
+
return_obj[:missing_r2] << lib
|
97
|
+
else
|
98
|
+
return_obj[:error] << lib
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return return_obj
|
102
|
+
end
|
103
|
+
|
104
|
+
# sort array of file names to determine if there are potential errors
|
105
|
+
# input name_array array of file names
|
106
|
+
# output hash { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
|
107
|
+
# need to change for each file name have an error code. and a bool to show if all pass
|
108
|
+
def validate_file_name(name_array)
|
109
|
+
errors = {
|
110
|
+
file_type_error: [] ,
|
111
|
+
missing_r1_file: [] ,
|
112
|
+
missing_r2_file: [] ,
|
113
|
+
extra_r1_r2_file: [],
|
114
|
+
no_region_tag: [] ,
|
115
|
+
multiple_region_tag: []
|
116
|
+
}
|
117
|
+
|
118
|
+
passed_libs = {}
|
119
|
+
|
120
|
+
name_with_r1_r2 = []
|
121
|
+
|
122
|
+
name_array.each do |name|
|
123
|
+
tag = parser_file_name(name)[:tag]
|
124
|
+
if name !~ /\.fastq\Z|\.fastq\.gz\Z/
|
125
|
+
errors[:file_type_error] << name
|
126
|
+
elsif tag.count("R1") == 0 and tag.count("R2") == 0
|
127
|
+
errors[:no_region_tag] << name
|
128
|
+
elsif tag.count("R1") > 0 and tag.count("R2") > 0
|
129
|
+
errors[:multiple_region_tag] << name
|
130
|
+
elsif tag.count("R1") > 1 or tag.count("R2") > 1
|
131
|
+
errors[:multiple_region_tag] << name
|
132
|
+
else
|
133
|
+
name_with_r1_r2 << name
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
libs = {}
|
138
|
+
|
139
|
+
name_with_r1_r2.map do |name|
|
140
|
+
libname = parser_file_name(name)[:libname]
|
141
|
+
libs[libname] ||= []
|
142
|
+
libs[libname] << name
|
143
|
+
end
|
144
|
+
|
145
|
+
libs.each do |libname, files|
|
146
|
+
count_r1_file = 0
|
147
|
+
count_r2_file = 0
|
148
|
+
files.each do |name|
|
149
|
+
tag = parser_file_name(name)[:tag]
|
150
|
+
if tag.include? "R1"
|
151
|
+
count_r1_file += 1
|
152
|
+
elsif tag.include? "R2"
|
153
|
+
count_r2_file += 1
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
if count_r1_file > 1 or count_r2_file > 1
|
158
|
+
errors[:extra_r1_r2_file] += files
|
159
|
+
elsif count_r1_file.zero?
|
160
|
+
errors[:missing_r1_file] += files
|
161
|
+
elsif count_r2_file.zero?
|
162
|
+
errors[:missing_r2_file] += files
|
163
|
+
else
|
164
|
+
passed_libs[libname] = files
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
passed_names = []
|
169
|
+
|
170
|
+
passed_libs.values.each { |names| passed_names += names}
|
171
|
+
|
172
|
+
if passed_names.size < name_array.size
|
173
|
+
pass = false
|
174
|
+
else
|
175
|
+
pass = true
|
176
|
+
end
|
177
|
+
|
178
|
+
return { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
|
179
|
+
end
|
180
|
+
|
181
|
+
# filter r1 raw sequences for non-specific primers.
|
182
|
+
# input r1_sh, SeqHash obj.
|
183
|
+
# return a Hash { r1_passed_seq: filtered Hash of sequence name and seq pairs, forward_starting_number: size of the leading N stretch plus the size of the remaining primer }
|
184
|
+
|
185
|
+
def filter_r1(r1_sh, forward_primer)
|
186
|
+
if forward_primer.match(/(N+)(\w+)$/)
|
187
|
+
forward_n = $1.size
|
188
|
+
forward_bio_primer = $2
|
189
|
+
else
|
190
|
+
forward_n = 0
|
191
|
+
forward_bio_primer = forward_primer
|
192
|
+
end
|
193
|
+
forward_bio_primer_size = forward_bio_primer.size
|
194
|
+
forward_starting_number = forward_n + forward_bio_primer_size
|
195
|
+
forward_primer_ref = forward_bio_primer.nt_parser
|
196
|
+
|
197
|
+
r1_passed_seq = {}
|
198
|
+
r1_raw = r1_sh.dna_hash
|
199
|
+
|
200
|
+
proc_filter = proc do |name|
|
201
|
+
seq = r1_raw[name]
|
202
|
+
next unless general_filter seq
|
203
|
+
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
204
|
+
if primer_region_seq =~ forward_primer_ref
|
205
|
+
new_name = remove_tag name
|
206
|
+
r1_passed_seq[new_name] = seq
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
r1_raw.keys.map do |name|
|
211
|
+
proc_filter.call name
|
212
|
+
end
|
213
|
+
|
214
|
+
return { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
|
215
|
+
end # end of filter_r1
|
216
|
+
|
217
|
+
# filter r2 raw sequences for non-specific primers.
|
218
|
+
# input r2_sh, SeqHash obj.
|
219
|
+
# return a Hash { r2_passed_seq: filtered Hash of sequence name and seq pairs, pid_length: length of PID, reverse_starting_number: pid_length plus the size of the remaining primer }
|
220
|
+
def filter_r2(r2_sh, cdna_primer)
|
221
|
+
r2_raw = r2_sh.dna_hash
|
222
|
+
cdna_primer.match(/(N+)(\w+)$/)
|
223
|
+
pid_length = $1.size
|
224
|
+
cdna_bio_primer = $2
|
225
|
+
cdna_bio_primer_size = cdna_bio_primer.size
|
226
|
+
reverse_starting_number = pid_length + cdna_bio_primer_size
|
227
|
+
cdna_primer_ref = cdna_bio_primer.nt_parser
|
228
|
+
r2_passed_seq = {}
|
229
|
+
proc_filter = proc do |name|
|
230
|
+
seq = r2_raw[name]
|
231
|
+
next unless general_filter seq
|
232
|
+
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
233
|
+
if primer_region_seq =~ cdna_primer_ref
|
234
|
+
new_name = remove_tag name
|
235
|
+
r2_passed_seq[new_name] = seq
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
r2_raw.keys.map do |name|
|
240
|
+
proc_filter.call name
|
241
|
+
end
|
242
|
+
|
243
|
+
return { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
|
244
|
+
end # end of filter_r2
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
# puts error message in the log file handler, and abort with the same infor
|
249
|
+
|
250
|
+
def log_and_abort(log, infor)
|
251
|
+
log.puts Time.now.to_s + "\t" + infor
|
252
|
+
log.close
|
253
|
+
abort infor.red.bold
|
254
|
+
end
|
255
|
+
|
256
|
+
private
|
257
|
+
|
258
|
+
def unzip_r(indir, f)
|
259
|
+
r_file = File.join(indir, f)
|
260
|
+
if f =~ /.gz/
|
261
|
+
`gzip -d #{r_file}`
|
262
|
+
new_f = f.sub ".gz", ""
|
263
|
+
r_file = File.join(indir, new_f)
|
264
|
+
end
|
265
|
+
return r_file
|
266
|
+
end
|
267
|
+
|
268
|
+
def parser_file_name(file_name)
|
269
|
+
t = file_name.split(".")[0].split("_")
|
270
|
+
if t.size == 1
|
271
|
+
libname = "lib"
|
272
|
+
tag = [ t[0].upcase ]
|
273
|
+
else
|
274
|
+
libname = t[0]
|
275
|
+
tag = t[1..-1].map(&:upcase)
|
276
|
+
end
|
277
|
+
return {libname: libname, tag: tag}
|
278
|
+
end
|
279
|
+
|
280
|
+
def general_filter(seq)
|
281
|
+
if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
282
|
+
return false
|
283
|
+
elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
284
|
+
return false
|
285
|
+
elsif seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
286
|
+
return false
|
287
|
+
else
|
288
|
+
return true
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
# remove region info tags from the raw MiSeq sequences.
|
293
|
+
def remove_tag(seq_name)
|
294
|
+
if seq_name =~ /\s/
|
295
|
+
new_tag = $`
|
296
|
+
else
|
297
|
+
new_tag = seq_name[0..-3]
|
298
|
+
end
|
299
|
+
end
|
300
|
+
|
301
|
+
end # end of class << self
|
302
|
+
|
303
|
+
end # end of TcsCore module
|
304
|
+
|
305
|
+
end # end of main module
|