viral_seq 1.7.1 → 1.8.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,193 @@
1
+ module ViralSeq
2
+
3
+ # class to generate recency report
4
+
5
+ class RecencyReport
6
+
7
+ # to generate the recency report in .pdf format.
8
+ # @param log [Hash] Hash from the json summary string of the SDRM report
9
+ # @param outfile [String] path to the output file
10
+ # @return [NilClass] .pdf file generated by the method. Return nil.
11
+
12
+ def self.generate(log, outfile)
13
+
14
+ recency_color = {
15
+ "recent" => "d42828",
16
+ "chronic" => "0666bf",
17
+ "indeterminant"=> "f78914",
18
+ "insufficient data" => "7d7b79"
19
+ }
20
+
21
+ dual_infection_color = {
22
+ "Yes" => "ffcc00",
23
+ "No" => "339900",
24
+ "insufficient data" => "7d7b79"
25
+ }
26
+
27
+ Prawn::Document.generate(outfile, margin: 75) do
28
+
29
+ def text_format(text1, text2)
30
+ [
31
+ { text: text1 + "\s" * (30 - text1.size), styles: [:bold], size: 14, font: "Courier"},
32
+ { text: text2, size: 14, styles: [:underline]}
33
+ ]
34
+ end
35
+
36
+ def text_format2(text1, text2, text3, text4)
37
+ text1 = text1.to_s
38
+ text2 = text2.to_s
39
+ text3 = text3.to_s
40
+ text4 = text4.to_s
41
+
42
+ [
43
+ { text: "\s\s\s" + text1 + "\s"*(11-text1.size) +
44
+ text2 + "\s"*(19-text2.size) +
45
+ text3 + "\s"*(11-text3.size) + text4,
46
+ size: 14,
47
+ font: "Courier"
48
+ }
49
+ ]
50
+ end
51
+
52
+ text("Quantitative Recency Report by MPID-NGS",
53
+ size: 18,
54
+ align: :center,
55
+ style: :bold
56
+ )
57
+
58
+ move_down 20
59
+
60
+ formatted_text(
61
+ text_format("Library ID:", log[:sample_id])
62
+ )
63
+
64
+ move_down 10
65
+
66
+ formatted_text(
67
+ text_format("ViralSeq Version:", ViralSeq::VERSION.to_s)
68
+
69
+ )
70
+
71
+ formatted_text(
72
+ text_format("TCS Version:", ViralSeq::TCS_VERSION.to_s)
73
+ )
74
+
75
+ formatted_text(
76
+ text_format("Processed Date", Time.now.strftime("%Y-%b-%d %H:%M"))
77
+ )
78
+
79
+ move_down 30
80
+
81
+ text("Summary of parameters",
82
+ size: 16,
83
+ style: :bold
84
+ )
85
+
86
+ move_down 20
87
+
88
+ formatted_text(
89
+ [
90
+ { text: "REGION" + "\s"*5 + "AVG. DIVERSITY" + "\s"*5 + "DIST20" + "\s"*5 + "DEPTH",
91
+ styles: [:bold],
92
+ size: 14,
93
+ font: "Courier"
94
+ },
95
+ ]
96
+ )
97
+
98
+ move_down 5
99
+
100
+ formatted_text(
101
+ text_format2("RT", log[:pi_RT], log[:dist20_RT], log[:tcs_RT])
102
+ )
103
+
104
+ formatted_text(
105
+ text_format2("V1V3", log[:pi_V1V3], log[:dist20_V1V3], log[:tcs_V1V3])
106
+ )
107
+
108
+ formatted_text(
109
+ text_format2("P17", log[:pi_P17], log[:dist20_P17], log[:tcs_P17])
110
+ )
111
+
112
+ move_down 30
113
+
114
+ formatted_text(
115
+ [
116
+ { text: "Prediction: ",
117
+ styles: [:bold],
118
+ size: 16,
119
+ },
120
+
121
+ { text: log[:recency].capitalize + " Infection",
122
+ styles: [:bold],
123
+ size: 16,
124
+ color: recency_color[log[:recency]]
125
+ },
126
+
127
+ { text: " (9-month cutoff)",
128
+ size: 14,
129
+ },
130
+ ]
131
+ )
132
+
133
+ move_down 20
134
+
135
+ formatted_text(
136
+ [
137
+ {
138
+ text: "Estimated Day Post Infection: ",
139
+ styles: [:bold],
140
+ size: 16
141
+ },
142
+
143
+ {
144
+ text: log[:dpi].to_s +
145
+ " (" + log[:dpi_lwr].to_s + "-" + log[:dpi_upr].to_s + ") Days",
146
+ styles: [:bold],
147
+ size: 16,
148
+ color: recency_color[log[:recency]]
149
+ }
150
+ ]
151
+ )
152
+
153
+ move_down 20
154
+
155
+ formatted_text(
156
+ [
157
+ {
158
+ text: "Possible multivariant Infection: ",
159
+ styles: [:bold],
160
+ size: 16,
161
+ },
162
+
163
+ {
164
+ text: log[:possible_dual_infection],
165
+ styles: [:bold],
166
+ size: 16,
167
+ color: dual_infection_color[log[:possible_dual_infection]]
168
+ }
169
+ ]
170
+ )
171
+
172
+ move_down 10
173
+
174
+ if log[:possible_dual_infection] == "Yes"
175
+
176
+ formatted_text(
177
+ [
178
+ {
179
+ text: "Warning: Days Post Infection prediction not reliable!",
180
+ styles: [:bold],
181
+ size: 14,
182
+ color: "ffcc00"
183
+ }
184
+ ]
185
+ )
186
+ end
187
+ end
188
+
189
+ end
190
+
191
+ end
192
+
193
+ end
@@ -0,0 +1,7 @@
1
+ module ViralSeq
2
+ # define the path to the root of the gem files.
3
+ # @return [String] string for the root path to the gem files.
4
+ def self.root
5
+ File.dirname __dir__
6
+ end
7
+ end
@@ -495,7 +495,7 @@ module ViralSeq
495
495
  # total G->A mutations at apobec3g/f positions.
496
496
  total = 0
497
497
 
498
- unless ref
498
+ unless ref
499
499
  # make consensus sequence for the input sequence hash
500
500
  ref = self.consensus
501
501
  end
@@ -571,7 +571,7 @@ module ViralSeq
571
571
  hm_hash.each do |k,_v|
572
572
  hm_seq_hash.dna_hash[k] = self.dna_hash[k]
573
573
  end
574
-
574
+
575
575
  hm_seq_hash.title = self.title + "_hypermut"
576
576
  hm_seq_hash.file = self.file
577
577
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
@@ -713,7 +713,7 @@ module ViralSeq
713
713
 
714
714
 
715
715
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
716
- # @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
716
+ # @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
717
717
  # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
718
718
  # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
719
719
 
@@ -729,7 +729,7 @@ module ViralSeq
729
729
  temp_aln = File.join(temp_dir, "_temp_muscle_aln")
730
730
  File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
731
731
  if path_to_muscle
732
- unless ViralSeq.check_muscle?(path_to_muscle)
732
+ unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
733
733
  File.unlink(temp_file)
734
734
  return nil
735
735
  end
@@ -87,7 +87,8 @@ module ViralSeq
87
87
  end
88
88
 
89
89
  # Pair-end join function for KNOWN overlap size.
90
- # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
90
+ # @param overlap [Integer] simple overlap value indicating how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
91
+ # overlap can also be an explicit [Hash] with keys :overlap_size, :r1_overlapped, :r2_overlapped, :before_overlap, :after_overlap (the last two are Hashes with :region, either :r1 or :r2, and :range)
91
92
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
92
93
  # @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
93
94
  # @example join paired-end sequences with different :diff cut-offs, overlap provided.
@@ -106,24 +107,64 @@ module ViralSeq
106
107
  # => [">pair1", ">pair2", ">pair3"]
107
108
 
108
109
  def join1(overlap = 0, diff = 0.0)
109
- seq_pair_hash = self.dna_hash
110
- raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
111
110
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
111
+
112
+ if overlap.is_a? Integer and overlap.zero?
113
+ overlap = {
114
+ overlap_size: 0,
115
+ r1_overlapped: 0...0,
116
+ r2_overlapped: 0...0,
117
+ before_overlap: {
118
+ region: :r1,
119
+ range: 0..-1,
120
+ } ,
121
+ after_overlap: {
122
+ region: :r2,
123
+ range: 0..-1
124
+ }
125
+ }
126
+ elsif overlap.is_a? Integer
127
+ overlap = {
128
+ overlap_size: overlap,
129
+ r1_overlapped: -overlap..-1,
130
+ r2_overlapped: 0..(overlap - 1),
131
+ before_overlap: {
132
+ region: :r1,
133
+ range: 0..(-overlap - 1),
134
+ } ,
135
+ after_overlap: {
136
+ region: :r2,
137
+ range: overlap..-1
138
+ }
139
+ }
140
+ end
141
+
142
+ seq_pair_hash = self.dna_hash
112
143
  joined_seq = {}
113
144
  seq_pair_hash.each do |seq_name,seq_pair|
114
145
  r1_seq = seq_pair[0]
115
146
  r2_seq = seq_pair[1]
116
- if overlap.zero?
117
- joined_sequence = r1_seq + r2_seq
118
- elsif diff.zero?
119
- if r1_seq[-overlap..-1] == r2_seq[0,overlap]
120
- joined_sequence= r1_seq + r2_seq[overlap..-1]
147
+
148
+ r1_overlap = r1_seq[overlap[:r1_overlapped]]
149
+ r2_overlap = r2_seq[overlap[:r2_overlapped]]
150
+
151
+ overlap_size = overlap[:overlap_size]
152
+
153
+ if (diff.zero? and r1_overlap == r2_overlap) or (!diff.zero? and r1_overlap.compare_with(r2_overlap) <= (overlap_size.abs * diff))
154
+ if overlap[:before_overlap][:region] == :r1
155
+ before_overlap_seq = r1_seq[overlap[:before_overlap][:range]]
156
+ elsif overlap[:before_overlap][:region] == :r2
157
+ before_overlap_seq = r2_seq[overlap[:before_overlap][:range]]
121
158
  end
122
- elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
123
- joined_sequence= r1_seq + r2_seq[overlap..-1]
124
- else
125
- next
159
+
160
+ if overlap[:after_overlap][:region] == :r1
161
+ after_overlap_seq = r1_seq[overlap[:after_overlap][:range]]
162
+ elsif overlap[:after_overlap][:region] == :r2
163
+ after_overlap_seq = r2_seq[overlap[:after_overlap][:range]]
164
+ end
165
+ joined_sequence = before_overlap_seq + r1_overlap + after_overlap_seq
126
166
  end
167
+
127
168
  joined_seq[seq_name] = joined_sequence if joined_sequence
128
169
  end
129
170
 
@@ -164,18 +205,35 @@ module ViralSeq
164
205
  elsif model == :indiv
165
206
  joined_seq = {}
166
207
  seq_pair_hash.each do |seq_name, seq_pair|
208
+ r1_seq = seq_pair[0]
209
+ r2_seq = seq_pair[1]
167
210
  overlap_list = []
168
- overlap_matrix(seq_pair[0], seq_pair[1]).each do |overlap1, diff_nt|
169
- cut_off_base = overlap1 * diff
211
+
212
+ overlap_matrix(r1_seq, r2_seq).each do |overlap1, diff_nt|
213
+ cut_off_base = overlap1[:overlap_size] * diff
170
214
  overlap_list << overlap1 if diff_nt <= cut_off_base
171
215
  end
216
+
172
217
  if overlap_list.empty?
173
- joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
218
+ joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
174
219
  else
175
- overlap = overlap_list.max
176
- joined_seq[seq_name] = seq_pair[0] + seq_pair[1][overlap..-1]
220
+ overlap_to_use = overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
221
+
222
+ if overlap_to_use[:before_overlap][:region] == :r1
223
+ before_overlap_seq = r1_seq[overlap_to_use[:before_overlap][:range]]
224
+ elsif overlap_to_use[:before_overlap][:region] == :r2
225
+ before_overlap_seq = r2_seq[overlap_to_use[:before_overlap][:range]]
226
+ end
227
+
228
+ if overlap_to_use[:after_overlap][:region] == :r1
229
+ after_overlap_seq = r1_seq[overlap_to_use[:after_overlap][:range]]
230
+ elsif overlap_to_use[:after_overlap][:region] == :r2
231
+ after_overlap_seq = r2_seq[overlap_to_use[:after_overlap][:range]]
232
+ end
233
+ joined_seq[seq_name] = before_overlap_seq + r1_seq[overlap_to_use[:r1_overlapped]] + after_overlap_seq
177
234
  end
178
235
  end
236
+
179
237
  joined_seq_hash = ViralSeq::SeqHash.new
180
238
  joined_seq_hash.dna_hash = joined_seq
181
239
  joined_seq_hash.title = self.title + "_joined"
@@ -197,35 +255,104 @@ module ViralSeq
197
255
  seq_pair_hash.each do |_seq_name, seq_pair|
198
256
  overlap_list = []
199
257
  matrix = overlap_matrix(seq_pair[0], seq_pair[1])
200
- matrix.each do |overlap, diff_nt|
258
+ matrix.each do |overlap_positions, diff_nt|
259
+ overlap = overlap_positions[:overlap_size].abs
201
260
  cut_off_base = overlap * diff
202
- overlap_list << overlap if diff_nt <= cut_off_base
261
+ overlap_list << overlap_positions if diff_nt <= cut_off_base
203
262
  end
263
+
204
264
  if overlap_list.empty?
205
- overlaps << 0
265
+ overlaps << {
266
+ overlap_size: 0,
267
+ r1_overlapped: 0...0,
268
+ r2_overlapped: 0...0,
269
+ before_overlap: {
270
+ region: :r1,
271
+ range: 0..-1,
272
+ } ,
273
+ after_overlap: {
274
+ region: :r2,
275
+ range: 0..-1
276
+ }
277
+ }
206
278
  else
207
- overlaps << overlap_list.max
279
+ overlaps << overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
208
280
  end
281
+
209
282
  end
210
283
  count_overlaps = overlaps.count_freq
211
284
  max_value = count_overlaps.values.max
212
285
  max_overlap_list = []
213
286
  count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
214
- max_overlap_list.max
287
+ max_overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
215
288
  end # end of determine_overlap_pid_pair
216
289
 
217
290
  # input a pair of sequences as Strings, return a Hash that maps each candidate overlap (itself a Hash) to the number of differing positions
218
291
  # {overlap_hash => number_of_different_positions, ...}, where each overlap_hash is an entry returned by overlap_list
219
292
  # (minimal overlap is set to 4.)
220
293
  def overlap_matrix(sequence1, sequence2)
221
- min_overlap = 4
222
- max_overlap = [sequence1.size, sequence2.size].min
294
+ list = overlap_list(sequence1.size, sequence2.size)
223
295
  matrix_hash = {}
296
+ list.each do |l|
297
+ range1 = l[:r1_overlapped]
298
+ range2 = l[:r2_overlapped]
299
+ matrix_hash[l] = sequence1[range1].compare_with(sequence2[range2])
300
+ end
301
+ matrix_hash
302
+ end
303
+
304
+ # given two [Integer] sequence lengths, return all possible overlapping ranges in an [Array]
305
+ def overlap_list(l1, l2)
306
+ return_list = []
307
+ min_overlap = 4
308
+ max_overlap = [l1, l2].min
309
+ diff = (l1 - l2).abs
310
+ max_reverse = l1/2
311
+
224
312
  (min_overlap..max_overlap).each do |overlap|
225
- matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
313
+ return_list<< {
314
+ overlap_size: overlap,
315
+ r1_overlapped: (l1-overlap)..(l1-1),
316
+ r2_overlapped: 0..(overlap -1),
317
+ before_overlap: {region: :r1, range: 0..(l1 - overlap - 1)},
318
+ after_overlap: {region: :r2, range: overlap..(l2-1)}
319
+ }
320
+ end
321
+
322
+ if l1 >= l2
323
+ (1..diff).each do |overlap|
324
+ return_list << {
325
+ overlap_size: max_overlap,
326
+ r1_overlapped: (diff - overlap)..(l1-1-overlap),
327
+ r2_overlapped: 0..(l2-1),
328
+ before_overlap: {region: :r1, range: 0...(diff - overlap)},
329
+ after_overlap: {region: :r1, range: (l1-overlap)...l1},
330
+ }
331
+ end
332
+ else
333
+ (1..diff).each do |overlap|
334
+ return_list << {
335
+ overlap_size: max_overlap,
336
+ r1_overlapped: 0..(l1-1),
337
+ r2_overlapped: overlap..(max_overlap + overlap - 1),
338
+ before_overlap: {region: :r2, range: 0...overlap},
339
+ after_overlap: {region: :r2, range: (max_overlap + overlap)...l2},
340
+ }
341
+ end
342
+ end
343
+
344
+ (max_reverse..(max_overlap-1)).reverse_each do |overlap|
345
+ return_list << {
346
+ overlap_size: overlap,
347
+ r1_overlapped: 0..(overlap -1),
348
+ r2_overlapped: (l2-overlap)..(l2-1),
349
+ before_overlap: {region: :r2, range: 0..(l2-overlap-1)},
350
+ after_overlap: {region: :r1, range: overlap..(l1-1)},
351
+ }
226
352
  end
227
- return matrix_hash
228
- end # end of overlap_matrix
353
+
354
+ return_list
355
+ end # end of overlap_list
229
356
 
230
357
  end # end of SeqHashPair
231
358
  end # end of ViralSeq