viral_seq 1.7.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
1
module ViralSeq

  # Class to generate the recency report.

  class RecencyReport

    # Generate the recency report in .pdf format.
    #
    # Values are laid out in fixed-width Courier columns. Padding is done with
    # String#ljust, so a value that is wider than its column is printed as-is
    # instead of raising ArgumentError from a negative repeat count.
    #
    # @param log [Hash] Hash from the json summary string of the SDRM report.
    #   Keys read here: :sample_id, :pi_RT, :dist20_RT, :tcs_RT, :pi_V1V3,
    #   :dist20_V1V3, :tcs_V1V3, :pi_P17, :dist20_P17, :tcs_P17, :recency,
    #   :dpi, :dpi_lwr, :dpi_upr and :possible_dual_infection.
    #   :recency must respond to #capitalize and the :dpi* values to #round.
    # @param outfile [String] path to the output file
    # @return [NilClass] documented as nil; the actual value is whatever
    #   Prawn::Document.generate returns (TODO confirm). The .pdf file is
    #   written to outfile as a side effect.

    def self.generate(log, outfile)

      # text color (hex RGB) for each recency prediction
      recency_color = {
        "recent" => "d42828",
        "chronic" => "0666bf",
        "indeterminant"=> "f78914",
        "insufficient data" => "7d7b79"
      }

      # text color (hex RGB) for each multivariant infection call
      dual_infection_color = {
        "Yes" => "ffcc00",
        "No" => "339900",
        "insufficient data" => "7d7b79"
      }

      # A "label   value" line: bold Courier label padded to 30 columns,
      # followed by the underlined value.
      label_line = lambda do |label, value|
        [
          { text: label.to_s.ljust(30), styles: [:bold], size: 14, font: "Courier" },
          { text: value.to_s, size: 14, styles: [:underline] }
        ]
      end

      # One row of the parameter table. Column widths (11, 19, 11) match the
      # header row printed under "Summary of parameters".
      region_line = lambda do |region, diversity, dist20, depth|
        row = "\s\s\s" +
              region.to_s.ljust(11) +
              diversity.to_s.ljust(19) +
              dist20.to_s.ljust(11) +
              depth.to_s
        [
          { text: row, size: 14, font: "Courier" }
        ]
      end

      # The block takes no argument, so Prawn evaluates it in the context of
      # the document; the locals above stay reachable through the closure.
      Prawn::Document.generate(outfile, margin: 75) do

        text("Quantitative Recency Report by MPID-NGS",
          size: 18,
          align: :center,
          style: :bold
        )

        move_down 20

        formatted_text(
          label_line.call("Library ID:", log[:sample_id])
        )

        move_down 10

        formatted_text(
          label_line.call("ViralSeq Version:", ViralSeq::VERSION.to_s)
        )

        formatted_text(
          label_line.call("TCS Version:", ViralSeq::TCS_VERSION.to_s)
        )

        formatted_text(
          label_line.call("Processed Date", Time.now.strftime("%Y-%b-%d %H:%M"))
        )

        move_down 30

        text("Summary of parameters",
          size: 16,
          style: :bold
        )

        move_down 20

        formatted_text(
          [
            { text: "REGION" + "\s"*5 + "AVG. DIVERSITY" + "\s"*5 + "DIST20" + "\s"*5 + "DEPTH",
              styles: [:bold],
              size: 14,
              font: "Courier"
            },
          ]
        )

        move_down 5

        formatted_text(
          region_line.call("RT", log[:pi_RT], log[:dist20_RT], log[:tcs_RT])
        )

        formatted_text(
          region_line.call("V1V3", log[:pi_V1V3], log[:dist20_V1V3], log[:tcs_V1V3])
        )

        formatted_text(
          region_line.call("P17", log[:pi_P17], log[:dist20_P17], log[:tcs_P17])
        )

        move_down 30

        formatted_text(
          [
            { text: "Prediction: ",
              styles: [:bold],
              size: 16,
            },

            { text: log[:recency].capitalize + " Infection",
              styles: [:bold],
              size: 16,
              color: recency_color[log[:recency]]
            },

            { text: " (9-month cutoff)",
              size: 14,
            },
          ]
        )

        move_down 20

        formatted_text(
          [
            {
              text: "Estimated Day Post Infection: ",
              styles: [:bold],
              size: 16
            },

            {
              text: log[:dpi].round(1).to_s +
                    " (" + log[:dpi_lwr].round(1).to_s + "-" + log[:dpi_upr].round(1).to_s + ") Days",
              styles: [:bold],
              size: 16,
              color: recency_color[log[:recency]]
            }
          ]
        )

        move_down 20

        formatted_text(
          [
            {
              text: "Possible multivariant Infection: ",
              styles: [:bold],
              size: 16,
            },

            {
              text: log[:possible_dual_infection],
              styles: [:bold],
              size: 16,
              color: dual_infection_color[log[:possible_dual_infection]]
            }
          ]
        )

        move_down 10

        # the warning line is only printed for a positive multivariant call
        if log[:possible_dual_infection] == "Yes"

          formatted_text(
            [
              {
                text: "Warning: Days Post Infection prediction not reliable!",
                styles: [:bold],
                size: 14,
                color: "ffcc00"
              }
            ]
          )
        end
      end

    end

  end

end
@@ -0,0 +1,7 @@
1
module ViralSeq
  # Path to the root of the gem files, i.e. the parent of the directory
  # that contains this source file.
  # @return [String] string for the root path to the gem files.
  def self.root
    this_dir = __dir__
    File.dirname(this_dir)
  end
end
@@ -495,7 +495,7 @@ module ViralSeq
495
495
  # total G->A mutations at apobec3g/f positions.
496
496
  total = 0
497
497
 
498
- unless ref
498
+ unless ref
499
499
  # make consensus sequence for the input sequence hash
500
500
  ref = self.consensus
501
501
  end
@@ -571,7 +571,7 @@ module ViralSeq
571
571
  hm_hash.each do |k,_v|
572
572
  hm_seq_hash.dna_hash[k] = self.dna_hash[k]
573
573
  end
574
-
574
+
575
575
  hm_seq_hash.title = self.title + "_hypermut"
576
576
  hm_seq_hash.file = self.file
577
577
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
@@ -713,7 +713,7 @@ module ViralSeq
713
713
 
714
714
 
715
715
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
716
- # @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
716
+ # @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
717
717
  # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
718
718
  # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
719
719
 
@@ -729,7 +729,7 @@ module ViralSeq
729
729
  temp_aln = File.join(temp_dir, "_temp_muscle_aln")
730
730
  File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
731
731
  if path_to_muscle
732
- unless ViralSeq.check_muscle?(path_to_muscle)
732
+ unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
733
733
  File.unlink(temp_file)
734
734
  return nil
735
735
  end
@@ -87,7 +87,8 @@ module ViralSeq
87
87
  end
88
88
 
89
89
  # Pair-end join function for KNOWN overlap size.
90
- # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
90
+ # @param overlap [Integer] simple overlap value indicating how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
91
+ # overlap can also be an explicit [Hash] object with the keys :overlap_size, :r1_overlapped, :r2_overlapped, :before_overlap, :after_overlap
91
92
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
92
93
  # @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
93
94
  # @example join paired-end sequences with different :diff cut-offs, overlap provided.
@@ -106,24 +107,64 @@ module ViralSeq
106
107
  # => [">pair1", ">pair2", ">pair3"]
107
108
 
108
109
  def join1(overlap = 0, diff = 0.0)
109
- seq_pair_hash = self.dna_hash
110
- raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
111
110
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
111
+
112
+ if overlap.is_a? Integer and overlap.zero?
113
+ overlap = {
114
+ overlap_size: 0,
115
+ r1_overlapped: 0...0,
116
+ r2_overlapped: 0...0,
117
+ before_overlap: {
118
+ region: :r1,
119
+ range: 0..-1,
120
+ } ,
121
+ after_overlap: {
122
+ region: :r2,
123
+ range: 0..-1
124
+ }
125
+ }
126
+ elsif overlap.is_a? Integer
127
+ overlap = {
128
+ overlap_size: overlap,
129
+ r1_overlapped: -overlap..-1,
130
+ r2_overlapped: 0..(overlap - 1),
131
+ before_overlap: {
132
+ region: :r1,
133
+ range: 0..(-overlap - 1),
134
+ } ,
135
+ after_overlap: {
136
+ region: :r2,
137
+ range: overlap..-1
138
+ }
139
+ }
140
+ end
141
+
142
+ seq_pair_hash = self.dna_hash
112
143
  joined_seq = {}
113
144
  seq_pair_hash.each do |seq_name,seq_pair|
114
145
  r1_seq = seq_pair[0]
115
146
  r2_seq = seq_pair[1]
116
- if overlap.zero?
117
- joined_sequence = r1_seq + r2_seq
118
- elsif diff.zero?
119
- if r1_seq[-overlap..-1] == r2_seq[0,overlap]
120
- joined_sequence= r1_seq + r2_seq[overlap..-1]
147
+
148
+ r1_overlap = r1_seq[overlap[:r1_overlapped]]
149
+ r2_overlap = r2_seq[overlap[:r2_overlapped]]
150
+
151
+ overlap_size = overlap[:overlap_size]
152
+
153
+ if (diff.zero? and r1_overlap == r2_overlap) or (!diff.zero? and r1_overlap.compare_with(r2_overlap) <= (overlap_size.abs * diff))
154
+ if overlap[:before_overlap][:region] == :r1
155
+ before_overlap_seq = r1_seq[overlap[:before_overlap][:range]]
156
+ elsif overlap[:before_overlap][:region] == :r2
157
+ before_overlap_seq = r2_seq[overlap[:before_overlap][:range]]
121
158
  end
122
- elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
123
- joined_sequence= r1_seq + r2_seq[overlap..-1]
124
- else
125
- next
159
+
160
+ if overlap[:after_overlap][:region] == :r1
161
+ after_overlap_seq = r1_seq[overlap[:after_overlap][:range]]
162
+ elsif overlap[:after_overlap][:region] == :r2
163
+ after_overlap_seq = r2_seq[overlap[:after_overlap][:range]]
164
+ end
165
+ joined_sequence = before_overlap_seq + r1_overlap + after_overlap_seq
126
166
  end
167
+
127
168
  joined_seq[seq_name] = joined_sequence if joined_sequence
128
169
  end
129
170
 
@@ -164,18 +205,35 @@ module ViralSeq
164
205
  elsif model == :indiv
165
206
  joined_seq = {}
166
207
  seq_pair_hash.each do |seq_name, seq_pair|
208
+ r1_seq = seq_pair[0]
209
+ r2_seq = seq_pair[1]
167
210
  overlap_list = []
168
- overlap_matrix(seq_pair[0], seq_pair[1]).each do |overlap1, diff_nt|
169
- cut_off_base = overlap1 * diff
211
+
212
+ overlap_matrix(r1_seq, r2_seq).each do |overlap1, diff_nt|
213
+ cut_off_base = overlap1[:overlap_size] * diff
170
214
  overlap_list << overlap1 if diff_nt <= cut_off_base
171
215
  end
216
+
172
217
  if overlap_list.empty?
173
- joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
218
+ joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
174
219
  else
175
- overlap = overlap_list.max
176
- joined_seq[seq_name] = seq_pair[0] + seq_pair[1][overlap..-1]
220
+ overlap_to_use = overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
221
+
222
+ if overlap_to_use[:before_overlap][:region] == :r1
223
+ before_overlap_seq = r1_seq[overlap_to_use[:before_overlap][:range]]
224
+ elsif overlap_to_use[:before_overlap][:region] == :r2
225
+ before_overlap_seq = r2_seq[overlap_to_use[:before_overlap][:range]]
226
+ end
227
+
228
+ if overlap_to_use[:after_overlap][:region] == :r1
229
+ after_overlap_seq = r1_seq[overlap_to_use[:after_overlap][:range]]
230
+ elsif overlap_to_use[:after_overlap][:region] == :r2
231
+ after_overlap_seq = r2_seq[overlap_to_use[:after_overlap][:range]]
232
+ end
233
+ joined_seq[seq_name] = before_overlap_seq + r1_seq[overlap_to_use[:r1_overlapped]] + after_overlap_seq
177
234
  end
178
235
  end
236
+
179
237
  joined_seq_hash = ViralSeq::SeqHash.new
180
238
  joined_seq_hash.dna_hash = joined_seq
181
239
  joined_seq_hash.title = self.title + "_joined"
@@ -197,35 +255,104 @@ module ViralSeq
197
255
  seq_pair_hash.each do |_seq_name, seq_pair|
198
256
  overlap_list = []
199
257
  matrix = overlap_matrix(seq_pair[0], seq_pair[1])
200
- matrix.each do |overlap, diff_nt|
258
+ matrix.each do |overlap_positions, diff_nt|
259
+ overlap = overlap_positions[:overlap_size].abs
201
260
  cut_off_base = overlap * diff
202
- overlap_list << overlap if diff_nt <= cut_off_base
261
+ overlap_list << overlap_positions if diff_nt <= cut_off_base
203
262
  end
263
+
204
264
  if overlap_list.empty?
205
- overlaps << 0
265
+ overlaps << {
266
+ overlap_size: 0,
267
+ r1_overlapped: 0...0,
268
+ r2_overlapped: 0...0,
269
+ before_overlap: {
270
+ region: :r1,
271
+ range: 0..-1,
272
+ } ,
273
+ after_overlap: {
274
+ region: :r2,
275
+ range: 0..-1
276
+ }
277
+ }
206
278
  else
207
- overlaps << overlap_list.max
279
+ overlaps << overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
208
280
  end
281
+
209
282
  end
210
283
  count_overlaps = overlaps.count_freq
211
284
  max_value = count_overlaps.values.max
212
285
  max_overlap_list = []
213
286
  count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
214
- max_overlap_list.max
287
+ max_overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
215
288
  end # end pf determine_overlap_pid_pair
216
289
 
217
290
# Score every candidate overlap between a pair of sequences.
# The candidates come from overlap_list, called with the two sequence lengths.
# @param sequence1 [String] the first sequence of the pair
# @param sequence2 [String] the second sequence of the pair
# @return [Hash] {overlap_descriptor => result of String#compare_with, ...}
#   where each key is one Hash returned by overlap_list and each value is the
#   comparison of the two overlapped slices (presumably the number of
#   different positions; confirm against String#compare_with).
def overlap_matrix(sequence1, sequence2)
  candidates = overlap_list(sequence1.size, sequence2.size)
  candidates.each_with_object({}) do |candidate, matrix|
    r1_slice = sequence1[candidate[:r1_overlapped]]
    r2_slice = sequence2[candidate[:r2_overlapped]]
    matrix[candidate] = r1_slice.compare_with(r2_slice)
  end
end
303
+
304
# Given the lengths of two sequences, return all possible overlapping ranges.
#
# Three families of candidates are produced, in this order:
#   1. the end of R1 overlaps the start of R2, for sizes from 4 up to the
#      length of the shorter sequence;
#   2. the shorter sequence lies entirely inside the longer one, one
#      candidate per possible offset;
#   3. the start of R1 overlaps the end of R2, for sizes from the shorter
#      length minus one down to l1 / 2.
#
# @param l1 [Integer] length of the R1 sequence
# @param l2 [Integer] length of the R2 sequence
# @return [Array<Hash>] one Hash per candidate with the keys
#   :overlap_size [Integer], :r1_overlapped [Range], :r2_overlapped [Range],
#   :before_overlap and :after_overlap (each {region: :r1 or :r2, range: Range}).
def overlap_list(l1, l2)
  return_list = []
  min_overlap = 4
  max_overlap = [l1, l2].min
  diff = (l1 - l2).abs
  max_reverse = l1 / 2

  # family 1: tail of R1 on head of R2
  (min_overlap..max_overlap).each do |overlap|
    return_list << {
      overlap_size: overlap,
      r1_overlapped: (l1 - overlap)..(l1 - 1),
      r2_overlapped: 0..(overlap - 1),
      # Exclusive range on purpose: when overlap == l1 nothing precedes the
      # overlap, and an inclusive 0..-1 would select the whole of R1.
      before_overlap: {region: :r1, range: 0...(l1 - overlap)},
      after_overlap: {region: :r2, range: overlap..(l2 - 1)}
    }
  end

  # family 2: the shorter sequence sits inside the longer one
  if l1 >= l2
    (1..diff).each do |overlap|
      return_list << {
        overlap_size: max_overlap,
        r1_overlapped: (diff - overlap)..(l1 - 1 - overlap),
        r2_overlapped: 0..(l2 - 1),
        before_overlap: {region: :r1, range: 0...(diff - overlap)},
        after_overlap: {region: :r1, range: (l1 - overlap)...l1},
      }
    end
  else
    (1..diff).each do |overlap|
      return_list << {
        overlap_size: max_overlap,
        r1_overlapped: 0..(l1 - 1),
        r2_overlapped: overlap..(max_overlap + overlap - 1),
        before_overlap: {region: :r2, range: 0...overlap},
        after_overlap: {region: :r2, range: (max_overlap + overlap)...l2},
      }
    end
  end

  # family 3: head of R1 on tail of R2
  (max_reverse..(max_overlap - 1)).reverse_each do |overlap|
    return_list << {
      overlap_size: overlap,
      r1_overlapped: 0..(overlap - 1),
      r2_overlapped: (l2 - overlap)..(l2 - 1),
      before_overlap: {region: :r2, range: 0..(l2 - overlap - 1)},
      after_overlap: {region: :r1, range: overlap..(l1 - 1)},
    }
  end

  return_list
end # end of overlap_list
229
356
 
230
357
  end # end of SeqHashPair
231
358
  end # end of ViralSeq