viral_seq 1.0.9 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/viral_seq.rb CHANGED
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
35
35
  require_relative "viral_seq/sequence"
36
36
  require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
+ require_relative "viral_seq/tcs_core"
39
+ require_relative "viral_seq/tcs_json"
40
+
38
41
 
39
42
  require "muscle_bio"
@@ -1,7 +1,11 @@
1
1
module ViralSeq

  # Array of all amino acid one-letter abbreviations; the final entry is "*".
  AMINO_ACID_LIST = %w[A C D E F G H I K L M N P Q R S T V W Y *]

  # Initially empty hashes.
  # NOTE(review): presumably placeholders for the HIV PR / RT / IN SDRM
  # lists, filled in elsewhere -- confirm against the callers.
  SDRM_HIV_PR_LIST = {}
  SDRM_HIV_RT_LIST = {}
  SDRM_HIV_IN_LIST = {}

end
@@ -3,10 +3,6 @@
3
3
  # array = [1,2,3,4,5,6,7,8,9,10]
4
4
  # array.median
5
5
  # => 5.5
6
- # @example sum
7
- # array = [1,2,3,4,5,6,7,8,9,10]
8
- # array.sum
9
- # => 55
10
6
  # @example average number (mean)
11
7
  # array = [1,2,3,4,5,6,7,8,9,10]
12
8
  # array.mean
@@ -45,12 +41,6 @@ module Enumerable
45
41
  len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
46
42
  end
47
43
 
48
- # generate summed value
49
- # @return [Numeric] summed value
50
- def sum
51
- self.inject(0){|accum, i| accum + i }
52
- end
53
-
54
44
  # generate mean number
55
45
  # @return [Float] mean value
56
46
  def mean
@@ -1,6 +1,6 @@
1
1
 
2
2
  module ViralSeq
3
- class SeqHash
3
+ class SDRM
4
4
 
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
@@ -67,7 +67,7 @@ module ViralSeq
67
67
  @k = k
68
68
  @poisson_hash = {}
69
69
  (0..k).each do |n|
70
- p = (rate**n * ::Math::E**(-rate))/!n
70
+ p = (rate**n * ::Math::E**(-rate))/n.factorial
71
71
  @poisson_hash[n] = p
72
72
  end
73
73
  end
@@ -155,9 +155,9 @@ class Integer
155
155
  # factorial method for an Integer
156
156
  # @return [Integer] factorial for given Integer
157
157
  # @example factorial for 5
158
- # !5
158
+ # 5.factorial
159
159
  # => 120
160
- def !
160
+ def factorial
161
161
  if self == 0
162
162
  return 1
163
163
  else
@@ -0,0 +1,43 @@
1
module ViralSeq

  # Container for a drug-resistance mutation list.
  class DRMs
    # @param mutation_list [Hash] mutation list held by this object, defaults to an empty Hash
    def initialize(mutation_list = {})
      @mutation_list = mutation_list
    end

    # @return [Hash] the mutation list given at construction (readable and writable)
    attr_accessor :mutation_list
  end

  # The module-level methods below are stubs: their bodies are empty, so each
  # one currently returns nil.

  # @param seq_hash [Object] not used yet
  # @return [nil]
  def self.sdrm_hiv_pr(seq_hash)
  end

  # @param seq_hash [Object] not used yet
  # @return [nil]
  def self.sdrm_hiv_rt(seq_hash)
  end

  # @param seq_hash [Object] not used yet
  # @return [nil]
  def self.sdrm_hiv_in(seq_hash)
  end

  # @param file [Object] not used yet
  # @return [nil]
  def self.list_from_json(file)
  end

  # @param file [Object] not used yet
  # @return [nil]
  def self.list_from_csv(file)
  end

  # @param file [Object] not used yet
  # @param format [Symbol] requested export format, defaults to :json
  # @return [nil]
  def self.export_list_hiv_pr(file, format = :json)
    # The condition previously referenced a misspelled, undefined name and
    # raised NameError on every call; it now tests the `format` parameter.
    if format == :json
      # JSON export is not implemented yet.
    end
  end

  # @param file [Object] not used yet
  # @param format [Symbol] requested export format, defaults to :json
  # @return [nil]
  def self.export_list_hiv_rt(file, format = :json)
  end

  # @param file [Object] not used yet
  # @param format [Symbol] requested export format, defaults to :json
  # @return [nil]
  def self.export_list_hiv_in(file, format = :json)
  end

  # Stub: reads the receiver's mutation list and returns it.
  # NOTE(review): this is defined at module level, after the DRMs class has
  # been closed, yet it calls `self.mutation_list`; it looks like it was
  # meant to be a DRMs instance method -- confirm before relying on it.
  # @param seq_hash [Object] not used yet
  def drm_analysis(seq_hash)
    mutation_list = self.mutation_list
  end
end
@@ -9,7 +9,7 @@ module ViralSeq
9
9
  # # align with MUSCLE
10
10
  # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
11
  # # filter nt sequences with the reference coordinates
12
- # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
14
  # filtered_seqhash = filtered_seqhash.a3g[1]
15
15
  # # further filter out sequences with A3G hypermutations
@@ -351,7 +351,7 @@ module ViralSeq
351
351
 
352
352
 
353
353
  # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
354
- # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
354
+ # @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5). A position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will simply use the majority rule (no cut-off).
355
355
  # @return [String] consensus sequence
356
356
  # @example consensus sequence from an array of sequences.
357
357
  # seq_array = %w{ ATTTTTTTTT
@@ -383,11 +383,18 @@ module ViralSeq
383
383
  base_count = all_base.count_freq
384
384
  max_base_list = []
385
385
 
386
- base_count.each do |k,v|
387
- if v/seq_size.to_f >= cutoff
388
- max_base_list << k
386
+ if cutoff.zero?
387
+ max_count = base_count.values.max
388
+ max_base_hash = base_count.select {|_k,v| v == max_count}
389
+ max_base_list = max_base_hash.keys
390
+ else
391
+ base_count.each do |k,v|
392
+ if v/seq_size.to_f >= cutoff
393
+ max_base_list << k
394
+ end
389
395
  end
390
396
  end
397
+
391
398
  consensus_seq += call_consensus_base(max_base_list)
392
399
  end
393
400
  return consensus_seq
@@ -398,7 +405,7 @@ module ViralSeq
398
405
  # # control pattern: G[YN|RC] -> A[YN|RC]
399
406
  # # use the sample consensus to determine potential a3g sites
400
407
  # # Two criteria to identify hypermutation
401
- # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
408
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
402
409
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
403
410
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
404
411
  # # b/c Poisson model does not do well on small sample size.
@@ -542,7 +549,7 @@ module ViralSeq
542
549
  if sequences.size == 0
543
550
  return 0
544
551
  else
545
- cut_off = 1
552
+ cut_off = Float::INFINITY
546
553
  l = sequences[0].size
547
554
  rate = sequences.size * error_rate
548
555
  count_mut = variant_for_poisson(sequences)
@@ -551,7 +558,7 @@ module ViralSeq
551
558
 
552
559
  poisson_hash.each do |k,v|
553
560
  cal = l * v
554
- obs = count_mut[k] ? count_mut[k] : 0
561
+ obs = count_mut[k] ? count_mut[k] : 1
555
562
  if obs >= fold_cutoff * cal
556
563
  cut_off = k
557
564
  break
@@ -80,6 +80,12 @@ module ViralSeq
80
80
  alias_method :fa, :new_from_fasta
81
81
  end
82
82
 
83
+ # the size of nt sequence hash of the SeqHashPair object
84
+ # @return [Integer] size of nt sequence hash of the SeqHashPair object
85
def size
  # delegate to the nt sequence hash held by this object
  dna_hash.size
end
88
+
83
89
  # Pair-end join function for KNOWN overlap size.
84
90
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
91
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -0,0 +1,332 @@
1
module ViralSeq

  # Core functions for the `tcs` pipeline.
  #
  # Every method is defined on the singleton (`class << self`), so they are
  # called as `ViralSeq::TcsCore.method_name`.
  class TcsCore
    class << self

      # Calculate the TCS consensus cut-off based on the maximum number of
      # PIDs and the platform error rate.
      #
      # @param m [Numeric] maximum number of PIDs
      # @param error_rate [Float] platform error rate; selects which fitted
      #   polynomial is used (0...0.005, 0.005...0.015, or anything else)
      # @return [Integer] consensus cut-off, never lower than 2
      def calculate_cut_off(m, error_rate = 0.02)
        # every model returns 2 for m <= 10
        return 2 if m <= 10

        n = case error_rate
            when 0.005...0.015
              1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
            when 0...0.005
              -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
            else
              if m <= 8500
                -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
              else
                0.0079 * m + 9.4869
              end
            end

        n = n.round
        # rounded values below 3 are reported as 2
        n < 3 ? 2 : n
      end

      # Identify which file in the directory is the R1 file and which is the
      # R2 file, based on the file names.
      #
      # @param directory [Dir, String] directory to scan (non-hidden entries only)
      # @param unzip [Boolean] when true (default), `.gz` files are
      #   decompressed with `gzip -d` and the decompressed path is returned
      # @return [Hash] `{ r1_file: path, r2_file: path }`; a path is an empty
      #   String when no matching file was found
      def r1r2(directory, unzip = true)
        files = []
        Dir.chdir(directory) { files = Dir.glob("*") }
        r1_file = ""
        r2_file = ""
        files.each do |f|
          tag = parser_file_name(f)[:tag]

          if tag.include?("R1")
            r1_file = unzip ? unzip_r(directory, f) : File.join(directory, f)
          elsif tag.include?("R2")
            r2_file = unzip ? unzip_r(directory, f) : File.join(directory, f)
          end
        end
        { r1_file: r1_file, r2_file: r2_file }
      end # end of ViralSeq:TcsCore.r1r2

      # Sort a directory containing multiple R1 and R2 files into one
      # sub-directory per library. The library name is the first string
      # before "_" in the file name. Files are MOVED into the output
      # directory.
      #
      # @param directory [String] directory holding the raw files
      # @param out_dir [String] output directory, by default `directory + "_sorted"`
      # @return [Hash] `{ with_both_r1_r2: [lib, ...], missing_r1: [lib, ...],
      #   missing_r2: [lib, ...], error: [lib, ...] }`
      def sort_by_lib(directory, out_dir = directory + "_sorted")
        Dir.mkdir(out_dir) unless File.directory?(out_dir)
        files = []
        Dir.chdir(directory) { files = Dir.glob("*") }

        files.each do |file|
          path = File.join(directory, file)
          index = file.split("_")[0]
          index_dir = File.join(out_dir, index)
          Dir.mkdir(index_dir) unless File.directory?(index_dir)
          File.rename(path, File.join(index_dir, file))
        end

        return_obj = {
          with_both_r1_r2: [],
          missing_r1: [],
          missing_r2: [],
          error: []
        }

        libs = []
        Dir.chdir(out_dir) { libs = Dir.glob("*") }
        libs.each do |lib|
          file_check = r1r2(File.join(out_dir, lib))
          has_r1 = !file_check[:r1_file].empty?
          has_r2 = !file_check[:r2_file].empty?

          category =
            if has_r1 && has_r2
              :with_both_r1_r2
            elsif has_r2
              :missing_r1
            elsif has_r1
              :missing_r2
            else
              :error
            end
          return_obj[category] << lib
        end
        return_obj
      end

      # Check an array of file names for potential errors.
      #
      # @param name_array [Array<String>] array of file names
      # @return [Hash] `{ allPass: Boolean, files: [ { fileName:, errors:, libName: }, ... ] }`;
      #   `errors` is nil for a file without errors, `libName` is nil for a
      #   file whose library did not pass
      def validate_file_name(name_array)
        errors = {
          file_type_error: [],
          missing_r1_file: [],
          missing_r2_file: [],
          extra_r1_r2_file: [],
          no_region_tag: [],
          multiple_region_tag: []
        }

        # first pass: per-file checks (file type and R1/R2 tags)
        name_with_r1_r2 = []
        name_array.each do |name|
          tag = parser_file_name(name)[:tag]
          r1_count = tag.count("R1")
          r2_count = tag.count("R2")

          if name !~ /\.fastq\Z|\.fastq\.gz\Z/
            errors[:file_type_error] << name
          elsif r1_count.zero? && r2_count.zero?
            errors[:no_region_tag] << name
          elsif (r1_count > 0 && r2_count > 0) || r1_count > 1 || r2_count > 1
            errors[:multiple_region_tag] << name
          else
            name_with_r1_r2 << name
          end
        end

        # second pass: per-library checks (exactly one R1 and one R2 file)
        libs = name_with_r1_r2.group_by { |name| parser_file_name(name)[:libname] }

        passed_libs = {}
        libs.each do |libname, files|
          count_r1_file = 0
          count_r2_file = 0
          files.each do |name|
            tag = parser_file_name(name)[:tag]
            if tag.include?("R1")
              count_r1_file += 1
            elsif tag.include?("R2")
              count_r2_file += 1
            end
          end

          if count_r1_file > 1 || count_r2_file > 1
            errors[:extra_r1_r2_file].concat(files)
          elsif count_r1_file.zero?
            errors[:missing_r1_file].concat(files)
          elsif count_r2_file.zero?
            errors[:missing_r2_file].concat(files)
          else
            passed_libs[libname] = files
          end
        end

        file_name_with_lib_name = {}
        passed_libs.each do |lib_name, files|
          files.each { |f| file_name_with_lib_name[f] = lib_name }
        end

        passed_names = passed_libs.values.flatten
        pass = passed_names.size >= name_array.size

        # error keys are reported with spaces instead of underscores
        file_name_with_error_type = {}
        errors.each do |type, files|
          files.each do |f|
            file_name_with_error_type[f] ||= []
            file_name_with_error_type[f] << type.to_s.tr("_", " ")
          end
        end

        file_check = name_array.map do |name|
          {
            fileName: name,
            errors: file_name_with_error_type[name],
            libName: file_name_with_lib_name[name]
          }
        end

        { allPass: pass, files: file_check }
      end

      # Filter R1 raw sequences for non-specific primers.
      #
      # @param r1_sh [#dna_hash] SeqHash object with the R1 reads
      # @param forward_primer [String] forward primer; a block of "N"
      #   followed by word characters at the end is split into the N block
      #   and the primer proper, otherwise the whole string is the primer
      # @return [Hash] `{ r1_passed_seq: { name => seq }, forward_starting_number: Integer }`
      def filter_r1(r1_sh, forward_primer)
        primer_match = forward_primer.match(/(N+)(\w+)$/)
        if primer_match
          forward_n = primer_match[1].size
          forward_bio_primer = primer_match[2]
        else
          forward_n = 0
          forward_bio_primer = forward_primer
        end
        forward_bio_primer_size = forward_bio_primer.size
        forward_starting_number = forward_n + forward_bio_primer_size
        forward_primer_ref = forward_bio_primer.nt_parser

        r1_passed_seq = {}
        r1_sh.dna_hash.each do |name, seq|
          next unless general_filter(seq)
          primer_region_seq = seq[forward_n, forward_bio_primer_size]
          r1_passed_seq[remove_tag(name)] = seq if primer_region_seq =~ forward_primer_ref
        end

        { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
      end # end of filter_r1

      # Filter R2 raw sequences for non-specific primers.
      #
      # @param r2_sh [#dna_hash] SeqHash object with the R2 reads
      # @param cdna_primer [String] cDNA primer; must end with a block of
      #   "N" (the PID) followed by the primer proper
      # @return [Hash] `{ r2_passed_seq: { name => seq }, pid_length: Integer,
      #   reverse_starting_number: Integer }`
      # @raise [ArgumentError] if the primer has no block of "N" followed by
      #   word characters, so no PID length can be derived
      def filter_r2(r2_sh, cdna_primer)
        primer_match = cdna_primer.match(/(N+)(\w+)$/)
        # fail with a clear message instead of a NoMethodError on nil
        unless primer_match
          raise ArgumentError, "cDNA primer must contain a block of N (PID) followed by the primer sequence: #{cdna_primer.inspect}"
        end

        pid_length = primer_match[1].size
        cdna_bio_primer = primer_match[2]
        cdna_bio_primer_size = cdna_bio_primer.size
        reverse_starting_number = pid_length + cdna_bio_primer_size
        cdna_primer_ref = cdna_bio_primer.nt_parser

        r2_passed_seq = {}
        r2_sh.dna_hash.each do |name, seq|
          next unless general_filter(seq)
          primer_region_seq = seq[pid_length, cdna_bio_primer_size]
          r2_passed_seq[remove_tag(name)] = seq if primer_region_seq =~ cdna_primer_ref
        end

        { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
      end # end of filter_r2

      # Write a time-stamped error message to the log file handler, close
      # it, and abort with the same message.
      #
      # @param log [IO] open log file handler
      # @param infor [String] error message
      def log_and_abort(log, infor)
        log.puts Time.now.to_s + "\t" + infor
        log.close
        abort infor.red.bold
      end

      private

      # Decompress a `.gz` file in place and return the path of the result.
      #
      # @param indir [Dir, String] directory holding the file
      # @param f [String] file name
      # @return [String] path of the (decompressed) file
      def unzip_r(indir, f)
        r_file = File.join(indir, f)
        # only names that really end in ".gz" are compressed files
        return r_file unless f.end_with?(".gz")

        # argument-list form bypasses the shell, so paths with spaces or
        # shell metacharacters are passed to gzip unchanged
        system("gzip", "-d", r_file)
        File.join(indir, f.sub(/\.gz\z/, ""))
      end

      # Split a file name into a library name and upper-cased tags.
      # Only the part before the first "." is considered; it is split on "_".
      #
      # @param file_name [String]
      # @return [Hash] `{ libname: String, tag: Array<String> }`; when there is
      #   no "_", libname is "lib" and the single tag is the whole stem
      def parser_file_name(file_name)
        stem = file_name.to_s.split(".")[0].to_s
        t = stem.split("_")
        if t.size <= 1
          # also covers an empty stem (e.g. ".fastq"), which used to crash
          { libname: "lib", tag: [t[0].to_s.upcase] }
        else
          { libname: t[0], tag: t[1..-1].map(&:upcase) }
        end
      end

      # General quality filter for a raw read.
      #
      # @param seq [String]
      # @return [Boolean] false when the read should be discarded
      def general_filter(seq)
        # sequences with ambiguities except the 1st and last position removed
        return false if seq[1..-2] =~ /N/
        # a string of poly-A or poly-T indicates adaptor sequence
        return false if seq =~ /A{11}/ || seq =~ /T{11}/
        true
      end

      # Remove region info tags from the raw MiSeq sequence names.
      #
      # @param seq_name [String]
      # @return [String] the part before the first whitespace, or the name
      #   without its last two characters when it has no whitespace
      def remove_tag(seq_name)
        first_space = seq_name.index(/\s/)
        first_space ? seq_name[0...first_space] : seq_name[0..-3]
      end

    end # end of class << self

  end # end of TcsCore class

end # end of main module