viral_seq 1.0.7 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/viral_seq.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
1
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
2
2
  #
3
3
  # Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  # of this software and associated documentation files (the "Software"), to deal
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
35
35
  require_relative "viral_seq/sequence"
36
36
  require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
+ require_relative "viral_seq/tcs_core"
39
+ require_relative "viral_seq/tcs_json"
40
+
38
41
 
39
42
  require "muscle_bio"
@@ -1,7 +1,11 @@
1
1
  module ViralSeq
2
-
2
+
3
3
  # array for all amino acid one letter abbreviations
4
4
 
5
5
  AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
6
6
 
7
+ SDRM_HIV_PR_LIST = {}
8
+ SDRM_HIV_RT_LIST = {}
9
+ SDRM_HIV_IN_LIST = {}
10
+
7
11
  end
@@ -3,10 +3,6 @@
3
3
  # array = [1,2,3,4,5,6,7,8,9,10]
4
4
  # array.median
5
5
  # => 5.5
6
- # @example sum
7
- # array = [1,2,3,4,5,6,7,8,9,10]
8
- # array.sum
9
- # => 55
10
6
  # @example average number (mean)
11
7
  # array = [1,2,3,4,5,6,7,8,9,10]
12
8
  # array.mean
@@ -45,12 +41,6 @@ module Enumerable
45
41
  len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
46
42
  end
47
43
 
48
- # generate summed value
49
- # @return [Numeric] summed value
50
- def sum
51
- self.inject(0){|accum, i| accum + i }
52
- end
53
-
54
44
  # generate mean number
55
45
  # @return [Float] mean value
56
46
  def mean
@@ -1,4 +1,4 @@
1
- # addition methods for Class::Hash required for ViralSeq
1
+ # additional methods for Class::Hash required for ViralSeq
2
2
 
3
3
  class Hash
4
4
 
@@ -1,6 +1,6 @@
1
1
 
2
2
  module ViralSeq
3
- class SeqHash
3
+ class SDRM
4
4
 
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
@@ -0,0 +1,43 @@
1
module ViralSeq
  # Holder for a drug-resistance mutation list.
  #
  # NOTE(review): the class body ends right after the accessor, so every
  # method defined below belongs to the ViralSeq module itself rather than
  # to DRMs. This looks like work in progress -- confirm the intended
  # nesting before moving anything.
  class DRMs
    # @param mutation_list [Hash] mutation list kept by this object, defaults to an empty Hash
    def initialize(mutation_list = {})
      @mutation_list = mutation_list
    end

    attr_accessor :mutation_list
  end

  # Stub, not implemented yet.
  # @param seq_hash sequence input (unused for now)
  # @return [nil]
  def self.sdrm_hiv_pr(seq_hash)
  end

  # Stub, not implemented yet.
  # @param seq_hash sequence input (unused for now)
  # @return [nil]
  def self.sdrm_hiv_rt(seq_hash)
  end

  # Stub, not implemented yet.
  # @param seq_hash sequence input (unused for now)
  # @return [nil]
  def self.sdrm_hiv_in(seq_hash)
  end

  # Stub, not implemented yet.
  # @param file path of a JSON file (unused for now)
  # @return [nil]
  def self.list_from_json(file)
  end

  # Stub, not implemented yet.
  # @param file path of a CSV file (unused for now)
  # @return [nil]
  def self.list_from_csv(file)
  end

  # Stub for exporting the PR list; the JSON branch has no body yet.
  # @param file output path (unused for now)
  # @param format [Symbol] export format, defaults to `:json`
  # @return [nil]
  def self.export_list_hiv_pr(file, format = :json)
    # The condition previously referenced the misspelled local `foramt`,
    # which raised NameError on every call.
    if format == :json
      # TODO: JSON export not implemented yet
    end
  end

  # Stub, not implemented yet.
  # @param file output path (unused for now)
  # @param format [Symbol] export format, defaults to `:json`
  # @return [nil]
  def self.export_list_hiv_rt(file, format = :json)
  end

  # Stub, not implemented yet.
  # @param file output path (unused for now)
  # @param format [Symbol] export format, defaults to `:json`
  # @return [nil]
  def self.export_list_hiv_in(file, format = :json)
  end

  # Unfinished analysis entry point; currently only reads the receiver's
  # mutation list and returns it.
  # NOTE(review): this is an instance method of the ViralSeq module and calls
  # `self.mutation_list`, which the module itself does not define --
  # presumably it was meant to live inside DRMs; confirm.
  # @param seq_hash sequence input (unused for now)
  # @return whatever `self.mutation_list` returns
  def drm_analysis(seq_hash)
    self.mutation_list
  end
end
@@ -9,7 +9,7 @@ module ViralSeq
9
9
  # # align with MUSCLE
10
10
  # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
11
  # # filter nt sequences with the reference coordinates
12
- # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
14
  # filtered_seqhash = filtered_seqhash.a3g[1]
15
15
  # # further filter out sequences with A3G hypermutations
@@ -130,8 +130,8 @@ module ViralSeq
130
130
  end
131
131
  end
132
132
  end
133
- sequence_hash = Hash[*sequence_a]
134
- quality_hash = Hash[*quality_a]
133
+ sequence_hash = Hash[sequence_a.each_slice(2).to_a]
134
+ quality_hash = Hash[quality_a.each_slice(2).to_a]
135
135
 
136
136
  seq_hash = ViralSeq::SeqHash.new
137
137
  seq_hash.dna_hash = sequence_hash
@@ -181,6 +181,7 @@ module ViralSeq
181
181
  new_seqhash = ViralSeq::SeqHash.new
182
182
  new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
183
183
  new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
184
+ new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
184
185
  new_seqhash.title = self.title + "_with_" + sh2.title
185
186
  new_seqhash.file = self.file + "," + sh2.file
186
187
  return new_seqhash
@@ -312,22 +313,22 @@ module ViralSeq
312
313
 
313
314
  # screen for sequences with stop codons.
314
315
  # @param (see #translate)
315
- # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
316
+ # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
316
317
  #
317
- # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
318
- # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
318
+ # # :with_stop_codon : ViralSeq::SeqHash object with stop codons
319
+ # # :without_stop_codon: ViralSeq::SeqHash object without stop codons
319
320
  # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
320
321
  # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
321
322
  # my_seqhash.dna_hash
322
323
  # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
323
- # stop_codon_seqhash = my_seqhash.stop_codon[0]
324
+ # stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
324
325
  # stop_codon_seqhash.dna_hash
325
326
  # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
326
327
  # stop_codon_seqhash.aa_hash
327
328
  # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
328
329
  # stop_codon_seqhash.title
329
330
  # => "my_fasta_file_stop"
330
- # filtered_seqhash = my_seqhash.stop_codon[1]
331
+ # filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
331
332
  # filtered_seqhash.aa_hash
332
333
  # {">seq1"=>"IRT", ">seq3"=>"MRT"}
333
334
 
@@ -342,12 +343,15 @@ module ViralSeq
342
343
  seqhash1.title = self.title + "_stop"
343
344
  keys2 = aa_seqs.keys - keys
344
345
  seqhash2 = self.sub(keys2)
345
- return [seqhash1, seqhash2]
346
+ return {
347
+ with_stop_codon: seqhash1,
348
+ without_stop_codon: seqhash2
349
+ }
346
350
  end #end of #stop_codon
347
351
 
348
352
 
349
353
  # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
350
- # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
354
+ # @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will simply use the majority rule (no cutoff)
351
355
  # @return [String] consensus sequence
352
356
  # @example consensus sequence from an array of sequences.
353
357
  # seq_array = %w{ ATTTTTTTTT
@@ -379,11 +383,18 @@ module ViralSeq
379
383
  base_count = all_base.count_freq
380
384
  max_base_list = []
381
385
 
382
- base_count.each do |k,v|
383
- if v/seq_size.to_f >= cutoff
384
- max_base_list << k
386
+ if cutoff.zero?
387
+ max_count = base_count.values.max
388
+ max_base_hash = base_count.select {|_k,v| v == max_count}
389
+ max_base_list = max_base_hash.keys
390
+ else
391
+ base_count.each do |k,v|
392
+ if v/seq_size.to_f >= cutoff
393
+ max_base_list << k
394
+ end
385
395
  end
386
396
  end
397
+
387
398
  consensus_seq += call_consensus_base(max_base_list)
388
399
  end
389
400
  return consensus_seq
@@ -394,14 +405,14 @@ module ViralSeq
394
405
  # # control pattern: G[YN|RC] -> A[YN|RC]
395
406
  # # use the sample consensus to determine potential a3g sites
396
407
  # # Two criteria to identify hypermutation
397
- # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
408
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
398
409
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
399
410
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
400
411
  # # b/c Poisson model does not do well on small sample size.
401
- # @return [Array] three values.
402
- # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
403
- # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
404
- # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
412
+ # @return [Hash] three pairs.
413
+ # :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
414
+ # :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
415
+ # :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
405
416
  # # sequence tag
406
417
  # # G to A mutation numbers at potential a3g positions
407
418
  # # total potential a3g G positions
@@ -412,17 +423,17 @@ module ViralSeq
412
423
  # @example identify apobec3gf mutations from a sequence fasta file
413
424
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
414
425
  # hypermut = my_seqhash.a3g
415
- # hypermut[0].dna_hash.keys
426
+ # hypermut[:a3g_seq].dna_hash.keys
416
427
  # => [">Seq7", ">Seq14"]
417
- # hypermut[1].dna_hash.keys
428
+ # hypermut[:filtered_seq].dna_hash.keys
418
429
  # => [">Seq1", ">Seq2", ">Seq5"]
419
- # hypermut[2]
430
+ # hypermut[:stats]
420
431
  # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
421
432
  #
422
433
  # @example identify apobec3gf mutations from another sequence fasta file
423
434
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
424
435
  # hypermut = my_seqhash.a3g
425
- # hypermut[2]
436
+ # hypermut[:stats]
426
437
  # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
427
438
  # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
428
439
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -515,7 +526,10 @@ module ViralSeq
515
526
  hm_seq_hash.title = self.title + "_hypermut"
516
527
  hm_seq_hash.file = self.file
517
528
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
518
- return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
529
+ return { a3g_seq: hm_seq_hash,
530
+ filtered_seq: filtered_seq_hash,
531
+ stats: hm_hash.values
532
+ }
519
533
  end #end of #a3g_hypermut
520
534
 
521
535
  alias_method :a3g, :a3g_hypermut
@@ -535,7 +549,7 @@ module ViralSeq
535
549
  if sequences.size == 0
536
550
  return 0
537
551
  else
538
- cut_off = 1
552
+ cut_off = Float::INFINITY
539
553
  l = sequences[0].size
540
554
  rate = sequences.size * error_rate
541
555
  count_mut = variant_for_poisson(sequences)
@@ -544,7 +558,7 @@ module ViralSeq
544
558
 
545
559
  poisson_hash.each do |k,v|
546
560
  cal = l * v
547
- obs = count_mut[k] ? count_mut[k] : 0
561
+ obs = count_mut[k] ? count_mut[k] : 1
548
562
  if obs >= fold_cutoff * cal
549
563
  cut_off = k
550
564
  break
@@ -729,6 +743,7 @@ module ViralSeq
729
743
 
730
744
  seq_hash_unique.each do |seq|
731
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
+ next unless loc # if locator tool fails, skip this seq.
732
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
733
748
  if indel
734
749
  seq_hash_unique_pass << seq
@@ -1144,6 +1159,27 @@ module ViralSeq
1144
1159
  return new_sh
1145
1160
  end
1146
1161
 
1162
+ # trim dna sequences based on the provided reference coordinates.
1163
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1164
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1165
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
1166
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
1167
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
1168
+
1169
+ def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
1170
+ seq_hash = self.dna_hash.dup
1171
+ seq_hash_unique = seq_hash.uniq_hash
1172
+ trimmed_seq_hash = {}
1173
+ seq_hash_unique.each do |seq, names|
1174
+ trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
1175
+ names.each do |name|
1176
+ trimmed_seq_hash[name] = trimmed_seq
1177
+ end
1178
+ end
1179
+ return_seq_hash = self.dup
1180
+ return_seq_hash.dna_hash = trimmed_seq_hash
1181
+ return return_seq_hash
1182
+ end
1147
1183
 
1148
1184
  # start of private functions
1149
1185
  private
@@ -80,6 +80,12 @@ module ViralSeq
80
80
  alias_method :fa, :new_from_fasta
81
81
  end
82
82
 
83
+ # the size of nt sequence hash of the SeqHashPair object
84
+ # @return [Integer] size of nt sequence hash of the SeqHashPair object
85
+ def size
86
+ self.dna_hash.size
87
+ end
88
+
83
89
  # Pair-end join function for KNOWN overlap size.
84
90
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
91
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -211,7 +217,7 @@ module ViralSeq
211
217
  # {minimal overlap set to 4. }
212
218
  def overlap_matrix(sequence1, sequence2)
213
219
  min_overlap = 4
214
- max_overlap = [sequence1.size, sequence2.size].max
220
+ max_overlap = [sequence1.size, sequence2.size].min
215
221
  matrix_hash = {}
216
222
  (min_overlap..max_overlap).each do |overlap|
217
223
  matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
@@ -0,0 +1,305 @@
1
module ViralSeq

  # Core functions for `tcs` pipeline

  class TcsCore
    class << self

      # Calculate the TCS consensus cut-off from the maximum number of PIDs
      # and the platform error rate, using one fitted polynomial per
      # error-rate band.
      # @param m [Numeric] maximum number of PIDs
      # @param error_rate [Float] platform error rate, defaults to 0.02
      # @return [Integer] consensus cut-off; any value that rounds below 3 is reported as 2
      def calculate_cut_off(m, error_rate = 0.02)
        # every error-rate band uses the same floor for small inputs
        return 2 if m <= 10

        n = case error_rate
            when 0.005...0.015
              1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
            when 0...0.005
              -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
            else
              if m <= 8500
                -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
              else
                0.0079 * m + 9.4869
              end
            end

        n = n.round
        n < 3 ? 2 : n
      end

      # Identify which file in the directory is the R1 file and which is the
      # R2 file, based on the tags in the file names.
      # @param directory directory to scan (Dir object or a string of path)
      # @param unzip [Boolean] when true (default), `.gz` files are decompressed with `gzip -d`
      # @return [Hash] `{ r1_file: path1, r2_file: path2 }`; a value stays "" when no such file is found
      def r1r2(directory, unzip = true)
        files = []
        Dir.chdir(directory) { files = Dir.glob "*" }
        r1_file = ""
        r2_file = ""
        files.each do |f|
          tag = parser_file_name(f)[:tag]

          if tag.include? "R1"
            r1_file = unzip ? unzip_r(directory, f) : File.join(directory, f)
          elsif tag.include? "R2"
            r2_file = unzip ? unzip_r(directory, f) : File.join(directory, f)
          end
        end
        { r1_file: r1_file, r2_file: r2_file }
      end # end of ViralSeq:TcsCore.r1r2

      # Sort a directory containing multiple r1 and r2 files into one
      # sub-directory per library. The library name is the first string
      # before "_" in each file name. Files are moved (renamed), not copied.
      # @param directory source directory
      # @param out_dir output directory, by default named as directory + "_sorted"
      # @return [Hash] `{ with_both_r1_r2: [lib1, ...], missing_r1: [...], missing_r2: [...], error: [...] }`
      def sort_by_lib(directory, out_dir = directory + "_sorted")
        Dir.mkdir(out_dir) unless File.directory?(out_dir)
        files = []
        Dir.chdir(directory) { files = Dir.glob("*") }

        files.each do |file|
          path = File.join(directory, file)
          index = file.split("_")[0]
          index_dir = File.join(out_dir, index)
          Dir.mkdir(index_dir) unless File.directory?(index_dir)
          File.rename(path, File.join(index_dir, file))
        end

        return_obj = { with_both_r1_r2: [],
                       missing_r1: [],
                       missing_r2: [],
                       error: []
                     }

        libs = []
        Dir.chdir(out_dir) { libs = Dir.glob('*') }
        libs.each do |lib|
          file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
          has_r1 = !file_check[:r1_file].empty?
          has_r2 = !file_check[:r2_file].empty?

          if has_r1 && has_r2
            return_obj[:with_both_r1_r2] << lib
          elsif has_r2
            return_obj[:missing_r1] << lib
          elsif has_r1
            return_obj[:missing_r2] << lib
          else
            return_obj[:error] << lib
          end
        end
        return_obj
      end

      # Check an array of file names for potential errors.
      # @param name_array [Array<String>] file names
      # @return [Hash] `{ errors: Hash of error type => file names, all_pass: Boolean,
      #   passed_names: Array of names that passed, passed_libs: Hash of libname => names }`
      # TODO (from the original author): give each file name an error code.
      def validate_file_name(name_array)
        errors = {
                   file_type_error: [] ,
                   missing_r1_file: [] ,
                   missing_r2_file: [] ,
                   extra_r1_r2_file: [],
                   no_region_tag: [] ,
                   multiple_region_tag: []
                 }

        passed_libs = {}
        name_with_r1_r2 = []

        name_array.each do |name|
          tag = parser_file_name(name)[:tag]
          region_tags = tag.count("R1") + tag.count("R2")
          if name !~ /\.fastq\Z|\.fastq\.gz\Z/
            errors[:file_type_error] << name
          elsif region_tags.zero?
            errors[:no_region_tag] << name
          elsif region_tags > 1
            # covers both "R1 and R2 in one name" and a repeated R1 / R2 tag
            errors[:multiple_region_tag] << name
          else
            name_with_r1_r2 << name
          end
        end

        libs = {}
        name_with_r1_r2.each do |name|
          libname = parser_file_name(name)[:libname]
          (libs[libname] ||= []) << name
        end

        libs.each do |libname, files|
          count_r1_file = 0
          count_r2_file = 0
          files.each do |name|
            tag = parser_file_name(name)[:tag]
            if tag.include? "R1"
              count_r1_file += 1
            elsif tag.include? "R2"
              count_r2_file += 1
            end
          end

          if count_r1_file > 1 || count_r2_file > 1
            errors[:extra_r1_r2_file] += files
          elsif count_r1_file.zero?
            errors[:missing_r1_file] += files
          elsif count_r2_file.zero?
            errors[:missing_r2_file] += files
          else
            passed_libs[libname] = files
          end
        end

        passed_names = []
        passed_libs.values.each { |names| passed_names += names }

        pass = passed_names.size >= name_array.size

        { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
      end

      # Filter r1 raw sequences for non-specific primers.
      # @param r1_sh SeqHash obj; only its `dna_hash` is read
      # @param forward_primer [String] forward primer, optionally starting with a block of N
      # @return [Hash] `{ r1_passed_seq: Hash of trimmed name => sequence, forward_starting_number: Integer }`
      def filter_r1(r1_sh, forward_primer)
        primer_match = forward_primer.match(/(N+)(\w+)$/)
        if primer_match
          forward_n = primer_match[1].size
          forward_bio_primer = primer_match[2]
        else
          forward_n = 0
          forward_bio_primer = forward_primer
        end
        forward_bio_primer_size = forward_bio_primer.size
        forward_starting_number = forward_n + forward_bio_primer_size
        forward_primer_ref = forward_bio_primer.nt_parser

        r1_passed_seq = {}
        r1_sh.dna_hash.each do |name, seq|
          next unless general_filter seq
          primer_region_seq = seq[forward_n, forward_bio_primer_size]
          if primer_region_seq =~ forward_primer_ref
            r1_passed_seq[remove_tag(name)] = seq
          end
        end

        { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
      end # end of filter_r1

      # Filter r2 raw sequences for non-specific primers.
      # @param r2_sh SeqHash obj; only its `dna_hash` is read
      # @param cdna_primer [String] cDNA primer; must contain a block of N (the PID) followed by the primer
      # @return [Hash] `{ r2_passed_seq: Hash of trimmed name => sequence, pid_length: Integer, reverse_starting_number: Integer }`
      # @raise [ArgumentError] when the primer does not match the expected N-block pattern
      def filter_r2(r2_sh, cdna_primer)
        r2_raw = r2_sh.dna_hash
        primer_match = cdna_primer.match(/(N+)(\w+)$/)
        # without this check a primer lacking the N block failed later with
        # an opaque NoMethodError on nil
        unless primer_match
          raise ArgumentError, "cDNA primer must contain a PID block of N followed by the primer sequence"
        end
        pid_length = primer_match[1].size
        cdna_bio_primer = primer_match[2]
        cdna_bio_primer_size = cdna_bio_primer.size
        reverse_starting_number = pid_length + cdna_bio_primer_size
        cdna_primer_ref = cdna_bio_primer.nt_parser

        r2_passed_seq = {}
        r2_raw.each do |name, seq|
          next unless general_filter seq
          primer_region_seq = seq[pid_length, cdna_bio_primer_size]
          if primer_region_seq =~ cdna_primer_ref
            r2_passed_seq[remove_tag(name)] = seq
          end
        end

        { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
      end # end of filter_r2

      # Write a time-stamped error message to the log file handle, close the
      # handle, then abort with the same message.
      # @param log an open IO-like object (must respond to `puts` and `close`)
      # @param infor [String] the message; `red` / `bold` must be available on String
      def log_and_abort(log, infor)
        log.puts Time.now.to_s + "\t" + infor
        log.close
        abort infor.red.bold
      end

      private

      # Decompress a `.gz` file in place with `gzip -d` and return the path
      # of the resulting file; other files are returned untouched.
      # The argument-list form of `system` bypasses the shell, so paths with
      # spaces or shell metacharacters are passed to gzip verbatim.
      def unzip_r(indir, f)
        r_file = File.join(indir, f)
        if f =~ /\.gz\z/
          system("gzip", "-d", r_file)
          r_file = File.join(indir, f.sub(/\.gz\z/, ""))
        end
        r_file
      end

      # Split a file name into the library name and a list of upper-cased tags.
      # Only the part before the first "." is considered; it is split on "_".
      # A name with a single part gets the library name "lib". A name with
      # an empty base (e.g. ".DS_Store") yields an empty tag list instead of
      # raising.
      def parser_file_name(file_name)
        t = file_name.split(".")[0].to_s.split("_")
        if t.size <= 1
          libname = "lib"
          tag = t.map(&:upcase)
        else
          libname = t[0]
          tag = t[1..-1].map(&:upcase)
        end
        { libname: libname, tag: tag }
      end

      # Generic read filter; returns false when the read should be discarded.
      def general_filter(seq)
        return false if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
        return false if seq =~ /A{11}/    # a string of poly-A indicates adaptor sequence
        return false if seq =~ /T{11}/    # a string of poly-T indicates adaptor sequence
        true
      end

      # remove region info tags from the raw MiSeq sequences.
      # Keeps everything before the first whitespace; when there is no
      # whitespace the last two characters are dropped.
      def remove_tag(seq_name)
        first_space = seq_name =~ /\s/
        first_space ? seq_name[0...first_space] : seq_name[0..-3]
      end

    end # end of class << self

  end # end of TcsCore module

end # end of main module