viral_seq 1.0.8 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +120 -57
- data/bin/tcs +140 -214
- data/lib/viral_seq.rb +3 -0
- data/lib/viral_seq/constant.rb +5 -1
- data/lib/viral_seq/enumerable.rb +0 -10
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/math.rb +3 -3
- data/lib/viral_seq/sdrm.rb +43 -0
- data/lib/viral_seq/seq_hash.rb +38 -24
- data/lib/viral_seq/seq_hash_pair.rb +6 -0
- data/lib/viral_seq/tcs_core.rb +305 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -2
- data/viral_seq.gemspec +1 -1
- metadata +8 -7
- data/bin/tcs_json_generator +0 -170
data/lib/viral_seq.rb
CHANGED
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/constant.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module ViralSeq
|
2
|
-
|
2
|
+
|
3
3
|
# array for all amino acid one letter abbreviations
|
4
4
|
|
5
5
|
AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
|
6
6
|
|
7
|
+
SDRM_HIV_PR_LIST = {}
|
8
|
+
SDRM_HIV_RT_LIST = {}
|
9
|
+
SDRM_HIV_IN_LIST = {}
|
10
|
+
|
7
11
|
end
|
data/lib/viral_seq/enumerable.rb
CHANGED
@@ -3,10 +3,6 @@
|
|
3
3
|
# array = [1,2,3,4,5,6,7,8,9,10]
|
4
4
|
# array.median
|
5
5
|
# => 5.5
|
6
|
-
# @example sum
|
7
|
-
# array = [1,2,3,4,5,6,7,8,9,10]
|
8
|
-
# array.sum
|
9
|
-
# => 55
|
10
6
|
# @example average number (mean)
|
11
7
|
# array = [1,2,3,4,5,6,7,8,9,10]
|
12
8
|
# array.mean
|
@@ -45,12 +41,6 @@ module Enumerable
|
|
45
41
|
len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
|
46
42
|
end
|
47
43
|
|
48
|
-
# generate summed value
|
49
|
-
# @return [Numeric] summed value
|
50
|
-
def sum
|
51
|
-
self.inject(0){|accum, i| accum + i }
|
52
|
-
end
|
53
|
-
|
54
44
|
# generate mean number
|
55
45
|
# @return [Float] mean value
|
56
46
|
def mean
|
data/lib/viral_seq/hivdr.rb
CHANGED
data/lib/viral_seq/math.rb
CHANGED
@@ -67,7 +67,7 @@ module ViralSeq
|
|
67
67
|
@k = k
|
68
68
|
@poisson_hash = {}
|
69
69
|
(0..k).each do |n|
|
70
|
-
p = (rate**n * ::Math::E**(-rate))
|
70
|
+
p = (rate**n * ::Math::E**(-rate))/n.factorial
|
71
71
|
@poisson_hash[n] = p
|
72
72
|
end
|
73
73
|
end
|
@@ -155,9 +155,9 @@ class Integer
|
|
155
155
|
# factorial method for an Integer
|
156
156
|
# @return [Integer] factorial for given Integer
|
157
157
|
# @example factorial for 5
|
158
|
-
#
|
158
|
+
# 5.factorial
|
159
159
|
# => 120
|
160
|
-
def
|
160
|
+
def factorial
|
161
161
|
if self == 0
|
162
162
|
return 1
|
163
163
|
else
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module ViralSeq
  # Holder for a drug-resistance mutation list.
  # NOTE(review): every analysis / import / export method below is still an
  # empty stub that returns nil; only the mutation list accessor does real work.
  class DRMs
    # @param mutation_list [Hash] mutation definitions, defaults to an empty Hash.
    #   The expected layout of the Hash is not established in this file.
    def initialize(mutation_list = {})
      @mutation_list = mutation_list
    end

    attr_accessor :mutation_list
  end

  # Stub. @param seq_hash is currently unused. @return [nil]
  def self.sdrm_hiv_pr(seq_hash)
  end

  # Stub. @param seq_hash is currently unused. @return [nil]
  def self.sdrm_hiv_rt(seq_hash)
  end

  # Stub. @param seq_hash is currently unused. @return [nil]
  def self.sdrm_hiv_in(seq_hash)
  end

  # Stub. @param file is currently unused. @return [nil]
  def self.list_from_json(file)
  end

  # Stub. @param file is currently unused. @return [nil]
  def self.list_from_csv(file)
  end

  # Stub for exporting the HIV PR list.
  # @param file currently unused
  # @param format [Symbol] output format selector, defaults to :json
  # @return [nil] the :json branch is still empty
  def self.export_list_hiv_pr(file, format = :json)
    # was `foramt`, an undefined name that raised NameError on every call
    if format == :json

    end
  end

  # Stub. @param file and @param format are currently unused. @return [nil]
  def self.export_list_hiv_rt(file, format = :json)

  end

  # Stub. @param file and @param format are currently unused. @return [nil]
  def self.export_list_hiv_in(file, format = :json)

  end

  # Reads the receiver's mutation list; no analysis is implemented yet.
  # NOTE(review): defined at module level (after DRMs is closed), so it relies
  # on the object it is mixed into responding to #mutation_list -- confirm intent.
  # @param seq_hash currently unused
  # @return the receiver's mutation_list
  def drm_analysis(seq_hash)
    mutation_list = self.mutation_list

  end
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -313,22 +313,22 @@ module ViralSeq
|
|
313
313
|
|
314
314
|
# screen for sequences with stop codons.
|
315
315
|
# @param (see #translate)
|
316
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
317
317
|
#
|
318
|
-
# #
|
319
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
320
320
|
# @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
|
321
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
322
322
|
# my_seqhash.dna_hash
|
323
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
324
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
325
325
|
# stop_codon_seqhash.dna_hash
|
326
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
327
327
|
# stop_codon_seqhash.aa_hash
|
328
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
329
329
|
# stop_codon_seqhash.title
|
330
330
|
# => "my_fasta_file_stop"
|
331
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
332
332
|
# filtered_seqhash.aa_hash
|
333
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
334
334
|
|
@@ -343,12 +343,15 @@ module ViralSeq
|
|
343
343
|
seqhash1.title = self.title + "_stop"
|
344
344
|
keys2 = aa_seqs.keys - keys
|
345
345
|
seqhash2 = self.sub(keys2)
|
346
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
347
350
|
end #end of #stop_codon
|
348
351
|
|
349
352
|
|
350
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
351
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will simply apply the majority rule (no cutoff)
|
352
355
|
# @return [String] consensus sequence
|
353
356
|
# @example consensus sequence from an array of sequences.
|
354
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -380,11 +383,18 @@ module ViralSeq
|
|
380
383
|
base_count = all_base.count_freq
|
381
384
|
max_base_list = []
|
382
385
|
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
386
395
|
end
|
387
396
|
end
|
397
|
+
|
388
398
|
consensus_seq += call_consensus_base(max_base_list)
|
389
399
|
end
|
390
400
|
return consensus_seq
|
@@ -395,14 +405,14 @@ module ViralSeq
|
|
395
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
396
406
|
# # use the sample consensus to determine potential a3g sites
|
397
407
|
# # Two criteria to identify hypermutation
|
398
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
399
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
400
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
401
411
|
# # b/c Poisson model does not do well on small sample size.
|
402
|
-
# @return [
|
403
|
-
#
|
404
|
-
#
|
405
|
-
#
|
412
|
+
# @return [Hash] three pairs.
|
413
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
414
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
415
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
406
416
|
# # sequence tag
|
407
417
|
# # G to A mutation numbers at potential a3g positions
|
408
418
|
# # total potential a3g G positions
|
@@ -413,17 +423,17 @@ module ViralSeq
|
|
413
423
|
# @example identify apobec3gf mutations from a sequence fasta file
|
414
424
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
415
425
|
# hypermut = my_seqhash.a3g
|
416
|
-
# hypermut[
|
426
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
417
427
|
# => [">Seq7", ">Seq14"]
|
418
|
-
# hypermut[
|
428
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
419
429
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
420
|
-
# hypermut[
|
430
|
+
# hypermut[:stats]
|
421
431
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
422
432
|
#
|
423
433
|
# @example identify apobec3gf mutations from another sequence fasta file
|
424
434
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
425
435
|
# hypermut = my_seqhash.a3g
|
426
|
-
# hypermut[
|
436
|
+
# hypermut[:stats]
|
427
437
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
428
438
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
429
439
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -516,7 +526,10 @@ module ViralSeq
|
|
516
526
|
hm_seq_hash.title = self.title + "_hypermut"
|
517
527
|
hm_seq_hash.file = self.file
|
518
528
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
519
|
-
return
|
529
|
+
return { a3g_seq: hm_seq_hash,
|
530
|
+
filtered_seq: filtered_seq_hash,
|
531
|
+
stats: hm_hash.values
|
532
|
+
}
|
520
533
|
end #end of #a3g_hypermut
|
521
534
|
|
522
535
|
alias_method :a3g, :a3g_hypermut
|
@@ -536,7 +549,7 @@ module ViralSeq
|
|
536
549
|
if sequences.size == 0
|
537
550
|
return 0
|
538
551
|
else
|
539
|
-
cut_off =
|
552
|
+
cut_off = Float::INFINITY
|
540
553
|
l = sequences[0].size
|
541
554
|
rate = sequences.size * error_rate
|
542
555
|
count_mut = variant_for_poisson(sequences)
|
@@ -545,7 +558,7 @@ module ViralSeq
|
|
545
558
|
|
546
559
|
poisson_hash.each do |k,v|
|
547
560
|
cal = l * v
|
548
|
-
obs = count_mut[k] ? count_mut[k] :
|
561
|
+
obs = count_mut[k] ? count_mut[k] : 1
|
549
562
|
if obs >= fold_cutoff * cal
|
550
563
|
cut_off = k
|
551
564
|
break
|
@@ -730,6 +743,7 @@ module ViralSeq
|
|
730
743
|
|
731
744
|
seq_hash_unique.each do |seq|
|
732
745
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
746
|
+
next unless loc # if locator tool fails, skip this seq.
|
733
747
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
734
748
|
if indel
|
735
749
|
seq_hash_unique_pass << seq
|
@@ -1151,7 +1165,7 @@ module ViralSeq
|
|
1151
1165
|
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1152
1166
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1153
1167
|
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1154
|
-
|
1168
|
+
|
1155
1169
|
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1156
1170
|
seq_hash = self.dna_hash.dup
|
1157
1171
|
seq_hash_unique = seq_hash.uniq_hash
|
@@ -80,6 +80,12 @@ module ViralSeq
|
|
80
80
|
alias_method :fa, :new_from_fasta
|
81
81
|
end
|
82
82
|
|
83
|
+
# Number of entries in the nt sequence hash of this SeqHashPair object.
# @return [Integer] size of the object's dna_hash
def size
dna_hash.size
end
|
88
|
+
|
83
89
|
# Pair-end join function for KNOWN overlap size.
|
84
90
|
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
91
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
@@ -0,0 +1,305 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
|
3
|
+
# Core functions for `tcs` pipeline
|
4
|
+
|
5
|
+
class TcsCore
|
6
|
+
class << self
|
7
|
+
|
8
|
+
# Calculate the TCS consensus cut-off from the maximum number of PIDs and the
# platform error rate.
# @param m [Numeric] maximum number of PIDs
# @param error_rate [Float] platform error rate; selects the fitted curve:
#   [0, 0.005), [0.005, 0.015), or the default curve for anything else
# @return [Integer] rounded cut-off, never lower than 2
def calculate_cut_off(m, error_rate = 0.02)
  # every curve uses the fixed cut-off of 2 up to 10 PIDs
  return 2 if m <= 10

  fitted =
    case error_rate
    when 0...0.005
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.005...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    else
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # linear extrapolation above 8500 PIDs
        0.0079 * m + 9.4869
      end
    end

  # rounded values below 3 collapse to the floor of 2
  [fitted.round, 2].max
end
|
41
|
+
|
42
|
+
# Work out from the file names which entry of a directory is the R1 file and
# which is the R2 file.
# @param directory [Dir, String] directory to scan
# @param unzip [Boolean] when true (default) each matching file is passed
#   through unzip_r, otherwise its path is simply joined onto the directory
# @return [Hash] { r1_file: path, r2_file: path }; a file that was not found
#   is reported as an empty string
def r1r2(directory, unzip = true)
  entries = Dir.chdir(directory) { Dir.glob("*") }
  found = { r1_file: "", r2_file: "" }

  entries.each do |entry|
    tags = parser_file_name(entry)[:tag]
    # R1 wins when a name carries both tags
    slot =
      if tags.include?("R1")
        :r1_file
      elsif tags.include?("R2")
        :r2_file
      end
    next unless slot

    # a later match overwrites an earlier one
    found[slot] = unzip ? unzip_r(directory, entry) : File.join(directory, entry)
  end

  found
end # end of ViralSeq:TcsCore.r1r2
|
62
|
+
|
63
|
+
# Sort a directory that contains R1 and R2 files of multiple libraries.
# Every entry is moved into a sub-directory of out_dir named after its library
# (the part of the file name before the first "_"), then each library
# directory is checked with ViralSeq::TcsCore.r1r2.
# @param directory [String] directory holding the mixed files
# @param out_dir [Dir, String] output directory, by default directory + "_sorted"
# @return [Hash] { with_both_r1_r2: [lib, ...], missing_r1: [lib, ...],
#   missing_r2: [lib, ...], error: [lib, ...] }
def sort_by_lib(directory, out_dir = directory + "_sorted")
  Dir.mkdir(out_dir) unless File.directory?(out_dir)

  # the listing is taken once, before anything is moved
  Dir.chdir(directory) { Dir.glob("*") }.each do |file_name|
    source = File.join(directory, file_name)
    lib_dir = File.join(out_dir, file_name.split("_")[0])
    Dir.mkdir(lib_dir) unless File.directory?(lib_dir)
    File.rename(source, File.join(lib_dir, file_name))
  end

  report = { with_both_r1_r2: [],
             missing_r1: [],
             missing_r2: [],
             error: []
           }

  Dir.chdir(out_dir) { Dir.glob('*') }.each do |lib|
    check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
    has_r1 = !check[:r1_file].empty?
    has_r2 = !check[:r2_file].empty?

    bucket =
      if has_r1 && has_r2
        :with_both_r1_r2
      elsif has_r2
        :missing_r1
      elsif has_r1
        :missing_r2
      else
        # neither file was recognised
        :error
      end
    report[bucket] << lib
  end

  report
end
|
103
|
+
|
104
|
+
# Screen an array of file names for potential problems.
# A name is rejected when it does not end in .fastq / .fastq.gz, when it has
# no R1/R2 tag, or when it has both tags or a repeated tag. The remaining
# names are grouped by library name, and a library passes only when it has
# exactly one R1 file and one R2 file.
# @param name_array [Array<String>] file names
# @return [Hash] { errors: Hash of error category => names,
#   all_pass: true when every input name passed,
#   passed_names: Array of passed names,
#   passed_libs: Hash of library name => its passed names }
def validate_file_name(name_array)
  errors = {
    file_type_error: [],
    missing_r1_file: [],
    missing_r2_file: [],
    extra_r1_r2_file: [],
    no_region_tag: [],
    multiple_region_tag: []
  }

  # first pass: per-name checks; names without a problem move on
  tagged_names = name_array.select do |name|
    tags = parser_file_name(name)[:tag]
    r1_tags = tags.count("R1")
    r2_tags = tags.count("R2")

    problem =
      if name !~ /\.fastq\Z|\.fastq\.gz\Z/
        :file_type_error
      elsif r1_tags == 0 && r2_tags == 0
        :no_region_tag
      elsif (r1_tags > 0 && r2_tags > 0) || r1_tags > 1 || r2_tags > 1
        :multiple_region_tag
      end

    errors[problem] << name if problem
    problem.nil?
  end

  # second pass: per-library checks
  grouped = tagged_names.group_by { |name| parser_file_name(name)[:libname] }
  passed_libs = {}

  grouped.each do |libname, files|
    tag_lists = files.map { |name| parser_file_name(name)[:tag] }
    r1_total = tag_lists.count { |tags| tags.include?("R1") }
    r2_total = tag_lists.count { |tags| !tags.include?("R1") && tags.include?("R2") }

    if r1_total > 1 || r2_total > 1
      errors[:extra_r1_r2_file].concat(files)
    elsif r1_total.zero?
      errors[:missing_r1_file].concat(files)
    elsif r2_total.zero?
      errors[:missing_r2_file].concat(files)
    else
      passed_libs[libname] = files
    end
  end

  passed_names = passed_libs.values.flatten(1)

  { errors: errors,
    all_pass: passed_names.size >= name_array.size,
    passed_names: passed_names,
    passed_libs: passed_libs }
end
|
180
|
+
|
181
|
+
# Filter R1 raw sequences for non-specific primers.
# The forward primer may start with a run of N; the part after that run is
# the primer sequence that must be found at the matching offset of each read.
# @param r1_sh SeqHash object; only its dna_hash is read
# @param forward_primer [String] forward primer, optionally with leading N
# @return [Hash] { r1_passed_seq: Hash of de-tagged sequence name => sequence,
#   forward_starting_number: length of the N run plus the primer sequence }
def filter_r1(r1_sh, forward_primer)
  primer_match = forward_primer.match(/(N+)(\w+)$/)
  if primer_match
    forward_n = primer_match[1].size
    forward_bio_primer = primer_match[2]
  else
    # no N run: the whole string is the primer sequence
    forward_n = 0
    forward_bio_primer = forward_primer
  end
  forward_bio_primer_size = forward_bio_primer.size
  forward_primer_ref = forward_bio_primer.nt_parser

  r1_passed_seq = {}
  r1_sh.dna_hash.each do |name, seq|
    next unless general_filter(seq)
    # keep only reads whose primer region matches the primer pattern
    next unless seq[forward_n, forward_bio_primer_size] =~ forward_primer_ref

    r1_passed_seq[remove_tag(name)] = seq
  end

  { r1_passed_seq: r1_passed_seq,
    forward_starting_number: forward_n + forward_bio_primer_size }
end # end of filter_r1
|
216
|
+
|
217
|
+
# Filter R2 raw sequences for non-specific primers.
# The cDNA primer must contain a run of N (the PID) followed by the primer
# sequence; the primer sequence is looked up right after the PID in each read.
# @param r2_sh SeqHash object; only its dna_hash is read
# @param cdna_primer [String] cDNA primer containing the N run
# @return [Hash] { r2_passed_seq: Hash of de-tagged sequence name => sequence,
#   pid_length: length of the N run,
#   reverse_starting_number: pid_length plus the primer sequence length }
# @raise [ArgumentError] when cdna_primer has no N run followed by a primer
#   sequence (previously this surfaced as NoMethodError on nil)
def filter_r2(r2_sh, cdna_primer)
  primer_match = cdna_primer.match(/(N+)(\w+)$/)
  unless primer_match
    raise ArgumentError, "cDNA primer must contain a run of N followed by the primer sequence: #{cdna_primer.inspect}"
  end

  pid_length = primer_match[1].size
  cdna_bio_primer = primer_match[2]
  cdna_bio_primer_size = cdna_bio_primer.size
  cdna_primer_ref = cdna_bio_primer.nt_parser

  r2_passed_seq = {}
  r2_sh.dna_hash.each do |name, seq|
    next unless general_filter(seq)
    # keep only reads whose primer region matches the primer pattern
    next unless seq[pid_length, cdna_bio_primer_size] =~ cdna_primer_ref

    r2_passed_seq[remove_tag(name)] = seq
  end

  { r2_passed_seq: r2_passed_seq,
    pid_length: pid_length,
    reverse_starting_number: pid_length + cdna_bio_primer_size }
end # end of filter_r2
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
# Write a time-stamped error message to the log handle, close the handle and
# abort the program with the same message.
# @param log an open log handle; it receives puts and close
# @param infor [String] the message; it is also the abort text, styled through
#   String#red and String#bold, which are not defined in this file
def log_and_abort(log, infor)
  stamped = Time.now.to_s + "\t" + infor
  log.puts stamped
  log.close
  abort infor.red.bold
end
|
255
|
+
|
256
|
+
private
|
257
|
+
|
258
|
+
# Decompress a gzipped read file in place and return the path to use.
# Only names that END in ".gz" are treated as compressed. The old check
# (/.gz/) matched any character followed by "gz" anywhere in the name, and the
# old rename dropped the first ".gz" occurrence instead of the suffix.
# @param indir [String] directory containing the file
# @param f [String] file name inside indir
# @return [String] path of the decompressed file, or the joined path
#   unchanged when f does not end in ".gz"
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  return r_file unless f =~ /\.gz\z/

  # argument-list form: no shell, so spaces or metacharacters in the path are safe
  # best effort, as before: a failing gzip is not reported here
  system("gzip", "-d", r_file)
  File.join(indir, f.sub(/\.gz\z/, ""))
end
|
267
|
+
|
268
|
+
# Split a file name into a library name and upper-cased tags.
# Only the part before the first "." is used, and it is split on "_".
# With a single field the library name is "lib" and that field is the only
# tag; otherwise the first field is the library name and the rest are tags.
# A name with an empty stem ("" or ".fastq") yields library "lib" and no tags
# instead of raising NoMethodError as before.
# @param file_name [String]
# @return [Hash] { libname: String, tag: Array<String> }
def parser_file_name(file_name)
  # to_s guards against split(".") returning an empty array
  fields = file_name.split(".")[0].to_s.split("_")
  if fields.size <= 1
    { libname: "lib", tag: fields.map(&:upcase) }
  else
    { libname: fields[0], tag: fields[1..-1].map(&:upcase) }
  end
end
|
279
|
+
|
280
|
+
# General quality screen for a raw read.
# @param seq [String] nucleotide sequence
# @return [Boolean] false when the read has an N anywhere except the first or
#   last position, or a run of 11 A or 11 T (poly-A / poly-T indicates adaptor
#   sequence); true otherwise
def general_filter(seq)
  return false if seq[1..-2] =~ /N/   # ambiguity inside the read
  return false if seq =~ /A{11}/      # poly-A run
  return false if seq =~ /T{11}/      # poly-T run

  true
end
|
291
|
+
|
292
|
+
# Remove the region info tag from a raw MiSeq sequence name.
# @param seq_name [String] raw sequence name
# @return [String] everything before the first whitespace character, or the
#   name without its last two characters when it contains no whitespace
def remove_tag(seq_name)
  space_at = seq_name =~ /\s/
  space_at ? seq_name[0...space_at] : seq_name[0..-3]
end
|
300
|
+
|
301
|
+
end # end of class << self
|
302
|
+
|
303
|
+
end # end of TcsCore module
|
304
|
+
|
305
|
+
end # end of main module
|