viral_seq 1.0.8 → 1.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/viral_seq.rb CHANGED
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
35
35
  require_relative "viral_seq/sequence"
36
36
  require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
+ require_relative "viral_seq/tcs_core"
39
+ require_relative "viral_seq/tcs_json"
40
+
38
41
 
39
42
  require "muscle_bio"
@@ -1,7 +1,11 @@
1
1
module ViralSeq
  # One-letter abbreviations for all amino acids, with "*" as the last entry.
  AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *]

  # Empty hashes at load time.
  # NOTE(review): names suggest SDRM lists for the HIV PR, RT and IN regions,
  # filled elsewhere -- confirm against the code that populates them.
  SDRM_HIV_PR_LIST = {}
  SDRM_HIV_RT_LIST = {}
  SDRM_HIV_IN_LIST = {}
end
@@ -3,10 +3,6 @@
3
3
  # array = [1,2,3,4,5,6,7,8,9,10]
4
4
  # array.median
5
5
  # => 5.5
6
- # @example sum
7
- # array = [1,2,3,4,5,6,7,8,9,10]
8
- # array.sum
9
- # => 55
10
6
  # @example average number (mean)
11
7
  # array = [1,2,3,4,5,6,7,8,9,10]
12
8
  # array.mean
@@ -45,12 +41,6 @@ module Enumerable
45
41
  len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
46
42
  end
47
43
 
48
- # generate summed value
49
- # @return [Numeric] summed value
50
- def sum
51
- self.inject(0){|accum, i| accum + i }
52
- end
53
-
54
44
  # generate mean number
55
45
  # @return [Float] mean value
56
46
  def mean
@@ -1,6 +1,6 @@
1
1
 
2
2
  module ViralSeq
3
- class SeqHash
3
+ class SDRM
4
4
 
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
@@ -67,7 +67,7 @@ module ViralSeq
67
67
  @k = k
68
68
  @poisson_hash = {}
69
69
  (0..k).each do |n|
70
- p = (rate**n * ::Math::E**(-rate))/!n
70
+ p = (rate**n * ::Math::E**(-rate))/n.factorial
71
71
  @poisson_hash[n] = p
72
72
  end
73
73
  end
@@ -155,9 +155,9 @@ class Integer
155
155
  # factorial method for an Integer
156
156
  # @return [Integer] factorial for given Integer
157
157
  # @example factorial for 5
158
- # !5
158
+ # 5.factorial
159
159
  # => 120
160
- def !
160
+ def factorial
161
161
  if self == 0
162
162
  return 1
163
163
  else
@@ -0,0 +1,43 @@
1
module ViralSeq
  # Holder for a mutation list.
  class DRMs
    # @param mutation_list [Hash] initial mutation list, empty by default
    def initialize(mutation_list = {})
      @mutation_list = mutation_list
    end

    attr_accessor :mutation_list
  end

  # NOTE(review): everything below is defined on the ViralSeq module itself,
  # not on DRMs, and is still an empty stub -- confirm the intended owner
  # before implementing.

  # @param seq_hash not used yet (stub)
  # @return [nil]
  def self.sdrm_hiv_pr(seq_hash)
  end

  # @param seq_hash not used yet (stub)
  # @return [nil]
  def self.sdrm_hiv_rt(seq_hash)
  end

  # @param seq_hash not used yet (stub)
  # @return [nil]
  def self.sdrm_hiv_in(seq_hash)
  end

  # @param file not used yet (stub)
  # @return [nil]
  def self.list_from_json(file)
  end

  # @param file not used yet (stub)
  # @return [nil]
  def self.list_from_csv(file)
  end

  # @param file not used yet (stub)
  # @param format [Symbol] export format, `:json` by default
  # @return [nil]
  def self.export_list_hiv_pr(file, format = :json)
    # the parameter was previously misspelled here (`foramt`), which raised
    # NameError on every call
    if format == :json

    end
  end

  # @param file not used yet (stub)
  # @param format [Symbol] export format, `:json` by default
  # @return [nil]
  def self.export_list_hiv_rt(file, format = :json)

  end

  # @param file not used yet (stub)
  # @param format [Symbol] export format, `:json` by default
  # @return [nil]
  def self.export_list_hiv_in(file, format = :json)

  end

  # Instance-level stub: reads `mutation_list` from the receiver and returns it.
  # NOTE(review): the receiver must respond to `mutation_list`; as a plain
  # module method this is only reachable through inclusion -- confirm intent.
  # @param seq_hash not used yet (stub)
  def drm_analysis(seq_hash)
    mutation_list = self.mutation_list

  end
end
@@ -9,7 +9,7 @@ module ViralSeq
9
9
  # # align with MUSCLE
10
10
  # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
11
  # # filter nt sequences with the reference coordinates
12
- # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
14
  # filtered_seqhash = filtered_seqhash.a3g[1]
15
15
  # # further filter out sequences with A3G hypermutations
@@ -313,22 +313,22 @@ module ViralSeq
313
313
 
314
314
  # screen for sequences with stop codons.
315
315
  # @param (see #translate)
316
- # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
316
+ # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
317
317
  #
318
- # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
319
- # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
318
+ # # :with_stop_codon : ViralSeq::SeqHash object with stop codons
319
+ # # :without_stop_codon: ViralSeq::SeqHash object without stop codons
320
320
  # @example given a hash of sequences, return a sub-hash with sequences only contains stop codons
321
321
  # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
322
322
  # my_seqhash.dna_hash
323
323
  # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
324
- # stop_codon_seqhash = my_seqhash.stop_codon[0]
324
+ # stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
325
325
  # stop_codon_seqhash.dna_hash
326
326
  # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
327
327
  # stop_codon_seqhash.aa_hash
328
328
  # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
329
329
  # stop_codon_seqhash.title
330
330
  # => "my_fasta_file_stop"
331
- # filtered_seqhash = my_seqhash.stop_codon[1]
331
+ # filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
332
332
  # filtered_seqhash.aa_hash
333
333
  # {">seq1"=>"IRT", ">seq3"=>"MRT"}
334
334
 
@@ -343,12 +343,15 @@ module ViralSeq
343
343
  seqhash1.title = self.title + "_stop"
344
344
  keys2 = aa_seqs.keys - keys
345
345
  seqhash2 = self.sub(keys2)
346
- return [seqhash1, seqhash2]
346
+ return {
347
+ with_stop_codon: seqhash1,
348
+ without_stop_codon: seqhash2
349
+ }
347
350
  end #end of #stop_codon
348
351
 
349
352
 
350
353
  # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
351
- # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
354
+ # @param cutoff [Float] majority cut-off for calling consensus bases. Default is (0.5). A position with 15% "A" and 85% "G" will be called as "G" with a 20% cut-off and as "R" with a 10% cut-off. Using (0) applies the simple majority rule (no cut-off).
352
355
  # @return [String] consensus sequence
353
356
  # @example consensus sequence from an array of sequences.
354
357
  # seq_array = %w{ ATTTTTTTTT
@@ -380,11 +383,18 @@ module ViralSeq
380
383
  base_count = all_base.count_freq
381
384
  max_base_list = []
382
385
 
383
- base_count.each do |k,v|
384
- if v/seq_size.to_f >= cutoff
385
- max_base_list << k
386
+ if cutoff.zero?
387
+ max_count = base_count.values.max
388
+ max_base_hash = base_count.select {|_k,v| v == max_count}
389
+ max_base_list = max_base_hash.keys
390
+ else
391
+ base_count.each do |k,v|
392
+ if v/seq_size.to_f >= cutoff
393
+ max_base_list << k
394
+ end
386
395
  end
387
396
  end
397
+
388
398
  consensus_seq += call_consensus_base(max_base_list)
389
399
  end
390
400
  return consensus_seq
@@ -395,14 +405,14 @@ module ViralSeq
395
405
  # # control pattern: G[YN|RC] -> A[YN|RC]
396
406
  # # use the sample consensus to determine potential a3g sites
397
407
  # # Two criteria to identify hypermutation
398
- # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
408
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
399
409
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
400
410
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
401
411
  # # b/c Poisson model does not do well on small sample size.
402
- # @return [Array] three values.
403
- # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
404
- # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
405
- # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
412
+ # @return [Hash] three pairs.
413
+ # :a3g_seq: a ViralSeq:SeqHash object for sequences with hypermutations
414
+ # :filtered_seq : a ViralSeq:SeqHash object for sequences without hypermutations
415
+ # :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
406
416
  # # sequence tag
407
417
  # # G to A mutation numbers at potential a3g positions
408
418
  # # total potential a3g G positions
@@ -413,17 +423,17 @@ module ViralSeq
413
423
  # @example identify apobec3gf mutations from a sequence fasta file
414
424
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
415
425
  # hypermut = my_seqhash.a3g
416
- # hypermut[0].dna_hash.keys
426
+ # hypermut[:a3g_seq].dna_hash.keys
417
427
  # => [">Seq7", ">Seq14"]
418
- # hypermut[1].dna_hash.keys
428
+ # hypermut[:filtered_seq].dna_hash.keys
419
429
  # => [">Seq1", ">Seq2", ">Seq5"]
420
- # hypermut[2]
430
+ # hypermut[:stats]
421
431
  # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
422
432
  #
423
433
  # @example identify apobec3gf mutations from another sequence fasta file
424
434
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
425
435
  # hypermut = my_seqhash.a3g
426
- # hypermut[2]
436
+ # hypermut[:stats]
427
437
  # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
428
438
  # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
429
439
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -516,7 +526,10 @@ module ViralSeq
516
526
  hm_seq_hash.title = self.title + "_hypermut"
517
527
  hm_seq_hash.file = self.file
518
528
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
519
- return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
529
+ return { a3g_seq: hm_seq_hash,
530
+ filtered_seq: filtered_seq_hash,
531
+ stats: hm_hash.values
532
+ }
520
533
  end #end of #a3g_hypermut
521
534
 
522
535
  alias_method :a3g, :a3g_hypermut
@@ -536,7 +549,7 @@ module ViralSeq
536
549
  if sequences.size == 0
537
550
  return 0
538
551
  else
539
- cut_off = 1
552
+ cut_off = Float::INFINITY
540
553
  l = sequences[0].size
541
554
  rate = sequences.size * error_rate
542
555
  count_mut = variant_for_poisson(sequences)
@@ -545,7 +558,7 @@ module ViralSeq
545
558
 
546
559
  poisson_hash.each do |k,v|
547
560
  cal = l * v
548
- obs = count_mut[k] ? count_mut[k] : 0
561
+ obs = count_mut[k] ? count_mut[k] : 1
549
562
  if obs >= fold_cutoff * cal
550
563
  cut_off = k
551
564
  break
@@ -730,6 +743,7 @@ module ViralSeq
730
743
 
731
744
  seq_hash_unique.each do |seq|
732
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
+ next unless loc # if locator tool fails, skip this seq.
733
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
734
748
  if indel
735
749
  seq_hash_unique_pass << seq
@@ -1151,7 +1165,7 @@ module ViralSeq
1151
1165
  # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
1152
1166
  # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
1153
1167
  # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
1154
-
1168
+
1155
1169
  def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
1156
1170
  seq_hash = self.dna_hash.dup
1157
1171
  seq_hash_unique = seq_hash.uniq_hash
@@ -80,6 +80,12 @@ module ViralSeq
80
80
  alias_method :fa, :new_from_fasta
81
81
  end
82
82
 
83
# Number of entries in the nt sequence hash (`dna_hash`) of this object.
# @return [Integer] size of the nt sequence hash
def size
  dna_hash.size
end
88
+
83
89
  # Pair-end join function for KNOWN overlap size.
84
90
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
91
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -0,0 +1,305 @@
1
+ module ViralSeq
2
+
3
+ # Core functions for `tcs` pipeline
4
+
5
+ class TcsCore
6
+ class << self
7
+
8
# Calculate the TCS consensus cut-off from the maximum number of PIDs and the
# platform error rate.
#
# A fitted formula in `m` is chosen by error-rate band:
#   0     <= error_rate < 0.005  -> low-error polynomial
#   0.005 <= error_rate < 0.015  -> mid-error polynomial
#   anything else                -> default polynomial up to m = 8500,
#                                   linear formula above that
#
# @param m [Numeric] maximum number of PIDs
# @param error_rate [Float] platform error rate, 0.02 by default
# @return [Integer] rounded cut-off, never smaller than 2
def calculate_cut_off(m, error_rate = 0.02)
  # every error-rate band yields 2 for ten PIDs or fewer
  return 2 if m <= 10

  estimate =
    case error_rate
    when 0...0.005
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    when 0.005...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    else
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        0.0079 * m + 9.4869
      end
    end

  # anything that rounds below 3 collapses to the floor of 2
  [estimate.round, 2].max
end
41
+
42
# Identify which file in a directory is the R1 file and which is the R2 file,
# based on the tags parsed from the file names.
# @param directory [String, Dir] directory to scan
# @param unzip [Boolean] when true (default) each matching file is passed
#   through `unzip_r`; otherwise its path is returned as-is
# @return [Hash] `{ r1_file: path, r2_file: path }`; a value stays `""` when
#   no matching file is found. If several files carry the same tag, the last
#   one listed wins.
def r1r2(directory, unzip = true)
  entries = Dir.chdir(directory) { Dir.glob "*" }
  located = { r1_file: "", r2_file: "" }

  entries.each do |entry|
    tags = parser_file_name(entry)[:tag]
    slot =
      if tags.include? "R1"
        :r1_file
      elsif tags.include? "R2"
        :r2_file
      end
    next unless slot

    located[slot] = unzip ? unzip_r(directory, entry) : File.join(directory, entry)
  end

  located
end # end of ViralSeq:TcsCore.r1r2
62
+
63
# Sort a directory containing multiple R1 and R2 files into one
# sub-directory per library. The library name is the part of the file name
# before the first "_". Files are moved (renamed) into `out_dir/<library>`.
# @param directory [String] directory holding the raw files
# @param out_dir [String] output directory, `directory + "_sorted"` by default;
#   created when missing
# @return [Hash] `{ with_both_r1_r2: [...], missing_r1: [...], missing_r2: [...], error: [...] }`
#   where each array lists library names; `:error` collects libraries with
#   neither an R1 nor an R2 file
def sort_by_lib(directory, out_dir = directory + "_sorted")
  Dir.mkdir(out_dir) unless File.directory?(out_dir)

  entries = Dir.chdir(directory) { Dir.glob("*") }
  entries.each do |entry|
    lib_dir = File.join(out_dir, entry.split("_")[0])
    Dir.mkdir(lib_dir) unless File.directory?(lib_dir)
    File.rename(File.join(directory, entry), File.join(lib_dir, entry))
  end

  summary = {
    with_both_r1_r2: [],
    missing_r1: [],
    missing_r2: [],
    error: []
  }

  Dir.chdir(out_dir) { Dir.glob('*') }.each do |lib|
    # r1r2 is called with its default `unzip` argument
    found = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
    has_r1 = !found[:r1_file].empty?
    has_r2 = !found[:r2_file].empty?

    bucket =
      if has_r1 && has_r2
        :with_both_r1_r2
      elsif has_r2
        :missing_r1
      elsif has_r1
        :missing_r2
      else
        :error
      end
    summary[bucket] << lib
  end

  summary
end
103
+
104
# Check an array of file names for potential problems.
#
# Each name is first checked for a `.fastq` / `.fastq.gz` extension and only
# then parsed, so names the parser cannot handle (e.g. dotfiles) are reported
# as `:file_type_error` instead of raising. Names that pass are grouped by
# library name, and each library must hold exactly one R1 and one R2 file.
#
# @param name_array [Array<String>] file names to validate
# @return [Hash] with keys
#   `:errors` [Hash] error category => array of offending file names
#     (`:file_type_error`, `:missing_r1_file`, `:missing_r2_file`,
#     `:extra_r1_r2_file`, `:no_region_tag`, `:multiple_region_tag`),
#   `:all_pass` [Boolean] true when every input name passed,
#   `:passed_names` [Array<String>] names belonging to valid libraries,
#   `:passed_libs` [Hash] library name => its file names
def validate_file_name(name_array)
  errors = {
    file_type_error: [],
    missing_r1_file: [],
    missing_r2_file: [],
    extra_r1_r2_file: [],
    no_region_tag: [],
    multiple_region_tag: []
  }

  name_with_r1_r2 = []

  name_array.each do |name|
    # extension check comes first: non-fastq names are never parsed
    if name !~ /\.fastq\Z|\.fastq\.gz\Z/
      errors[:file_type_error] << name
      next
    end

    tag = parser_file_name(name)[:tag]
    r1_tags = tag.count("R1")
    r2_tags = tag.count("R2")

    if r1_tags.zero? && r2_tags.zero?
      errors[:no_region_tag] << name
    elsif (r1_tags > 0 && r2_tags > 0) || r1_tags > 1 || r2_tags > 1
      errors[:multiple_region_tag] << name
    else
      name_with_r1_r2 << name
    end
  end

  libs = name_with_r1_r2.group_by { |name| parser_file_name(name)[:libname] }

  passed_libs = {}
  libs.each do |libname, files|
    tags = files.map { |name| parser_file_name(name)[:tag] }
    count_r1_file = tags.count { |t| t.include? "R1" }
    count_r2_file = tags.count { |t| t.include? "R2" }

    if count_r1_file > 1 || count_r2_file > 1
      errors[:extra_r1_r2_file].concat(files)
    elsif count_r1_file.zero?
      errors[:missing_r1_file].concat(files)
    elsif count_r2_file.zero?
      errors[:missing_r2_file].concat(files)
    else
      passed_libs[libname] = files
    end
  end

  passed_names = passed_libs.values.flatten(1)

  {
    errors: errors,
    all_pass: passed_names.size >= name_array.size,
    passed_names: passed_names,
    passed_libs: passed_libs
  }
end
180
+
181
# Filter R1 raw sequences for non-specific primers.
#
# The forward primer is split into a run of `N`s and the primer sequence that
# follows it. When the primer has no such run, the whole primer is used and
# the offset is 0. A read is kept when it passes `general_filter` and its
# primer region matches the pattern returned by `nt_parser`.
#
# @param r1_sh [ViralSeq::SeqHash] R1 reads (its `dna_hash` is read)
# @param forward_primer [String] forward primer sequence
# @return [Hash] `{ r1_passed_seq: Hash, forward_starting_number: Integer }`.
#   `r1_passed_seq` maps the `remove_tag`-cleaned read name to its sequence;
#   `forward_starting_number` is the `N` run length plus the primer length.
def filter_r1(r1_sh, forward_primer)
  primer_parts = forward_primer.match(/(N+)(\w+)$/)
  if primer_parts
    forward_n = primer_parts[1].size
    forward_bio_primer = primer_parts[2]
  else
    forward_n = 0
    forward_bio_primer = forward_primer
  end

  forward_bio_primer_size = forward_bio_primer.size
  forward_primer_ref = forward_bio_primer.nt_parser

  r1_passed_seq = {}
  r1_sh.dna_hash.each do |name, seq|
    next unless general_filter seq
    # the primer is expected right after the leading N positions
    next unless seq[forward_n, forward_bio_primer_size] =~ forward_primer_ref

    r1_passed_seq[remove_tag(name)] = seq
  end

  {
    r1_passed_seq: r1_passed_seq,
    forward_starting_number: forward_n + forward_bio_primer_size
  }
end # end of filter_r1
216
+
217
# Filter R2 raw sequences for non-specific primers.
#
# The cDNA primer must contain a run of `N`s followed by the primer sequence.
# The length of that run is returned as the PID length. A read is kept when
# it passes `general_filter` and its primer region, located right after the
# PID, matches the pattern returned by `nt_parser`.
#
# @param r2_sh [ViralSeq::SeqHash] R2 reads (its `dna_hash` is read)
# @param cdna_primer [String] cDNA primer sequence
# @return [Hash] `{ r2_passed_seq: Hash, pid_length: Integer, reverse_starting_number: Integer }`.
#   `r2_passed_seq` maps the `remove_tag`-cleaned read name to its sequence;
#   `reverse_starting_number` is the PID length plus the primer length.
# @raise [ArgumentError] when `cdna_primer` has no `N` run followed by a
#   primer sequence (this previously failed with NoMethodError on nil)
def filter_r2(r2_sh, cdna_primer)
  primer_parts = cdna_primer.match(/(N+)(\w+)$/)
  unless primer_parts
    raise ArgumentError,
          "cDNA primer must contain an N block followed by the primer sequence: #{cdna_primer.inspect}"
  end

  pid_length = primer_parts[1].size
  cdna_bio_primer = primer_parts[2]
  cdna_bio_primer_size = cdna_bio_primer.size
  cdna_primer_ref = cdna_bio_primer.nt_parser

  r2_passed_seq = {}
  r2_sh.dna_hash.each do |name, seq|
    next unless general_filter seq
    # the primer is expected right after the PID
    next unless seq[pid_length, cdna_bio_primer_size] =~ cdna_primer_ref

    r2_passed_seq[remove_tag(name)] = seq
  end

  {
    r2_passed_seq: r2_passed_seq,
    pid_length: pid_length,
    reverse_starting_number: pid_length + cdna_bio_primer_size
  }
end # end of filter_r2
245
+
246
+
247
+
248
# Write a timestamped error message to the log handle, close the handle, then
# abort the process with the same message.
# @param log [IO] open log file handle; it is closed by this method
# @param infor [String] the error message
# NOTE(review): `red` and `bold` are not core String methods -- presumably
# supplied by a colouring gem loaded elsewhere; confirm.
def log_and_abort(log, infor)
  entry = Time.now.to_s + "\t" + infor
  log.puts entry
  log.close
  abort infor.red.bold
end
255
+
256
+ private
257
+
258
# Decompress a gzipped file in place with `gzip -d` and return the path of
# the resulting file. Files whose name does not end in ".gz" are left alone
# and their path is returned unchanged.
#
# The command is run without a shell, so directory or file names containing
# spaces or shell metacharacters are passed to gzip verbatim.
#
# @param indir [String] directory containing the file
# @param f [String] file name inside `indir`
# @return [String] path to the (decompressed) file
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  # only a real ".gz" suffix counts; a name merely containing "gz" does not
  return r_file unless f.end_with?(".gz")

  # best effort, as before: a failed gzip run is reported but does not raise
  warn "gzip -d failed for #{r_file}" unless system("gzip", "-d", r_file)

  File.join(indir, f.sub(/\.gz\z/, ""))
end
267
+
268
# Split a file name into a library name and a list of upper-cased tags.
#
# Only the part before the first "." is considered; it is split on "_".
# With two or more fields, the first field is the library name and the rest
# are tags. With a single field, the library name defaults to "lib" and that
# field is the only tag. A name with an empty base (e.g. "" or ".DS_Store")
# yields the default library name and no tags instead of raising.
#
# @param file_name [String] file name to parse
# @return [Hash] `{ libname: String, tag: Array<String> }`
def parser_file_name(file_name)
  # `first.to_s` guards the empty-string case, where split returns []
  fields = file_name.split(".").first.to_s.split("_")

  if fields.size <= 1
    { libname: "lib", tag: fields.map(&:upcase) }
  else
    { libname: fields.first, tag: fields.drop(1).map(&:upcase) }
  end
end
279
+
280
# Basic quality screen for a raw read.
# @param seq [String] nucleotide sequence
# @return [Boolean] false when the read has an "N" anywhere except the first
#   and last position, or contains a run of 11 "A"s or 11 "T"s (taken as
#   adaptor sequence); true otherwise
def general_filter(seq)
  interior_ambiguity = seq[1..-2] =~ /N/
  adaptor_run = seq =~ /A{11}/ || seq =~ /T{11}/
  !(interior_ambiguity || adaptor_run)
end
291
+
292
# Remove region info tags from the raw MiSeq sequence names.
# @param seq_name [String] raw sequence name
# @return [String] everything before the first whitespace character, or,
#   when the name has no whitespace, the name without its last two characters
def remove_tag(seq_name)
  cut = seq_name =~ /\s/
  cut ? seq_name[0...cut] : seq_name[0..-3]
end
300
+
301
+ end # end of class << self
302
+
303
+ end # end of TcsCore class
304
+
305
+ end # end of main module