viral_seq 1.7.1 → 1.8.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +29 -24
- data/README.md +33 -18
- data/bin/tcs +39 -8
- data/bin/tcs_log +27 -16
- data/bin/tcs_sdrm +23 -14
- data/lib/viral_seq/R.rb +31 -0
- data/lib/viral_seq/constant.rb +0 -41
- data/lib/viral_seq/muscle.rb +1 -1
- data/lib/viral_seq/recency.rb +56 -2
- data/lib/viral_seq/recency_report.rb +193 -0
- data/lib/viral_seq/root.rb +7 -0
- data/lib/viral_seq/seq_hash.rb +4 -4
- data/lib/viral_seq/seq_hash_pair.rb +154 -27
- data/lib/viral_seq/tcs_dr.rb +168 -81
- data/lib/viral_seq/util/check_env.r +9 -0
- data/lib/viral_seq/util/recency_model/rt_only_fit.Rdata +0 -0
- data/lib/viral_seq/util/recency_model/rt_v1v3_fit.Rdata +0 -0
- data/lib/viral_seq/util/recency_model/v1v3_only_fit.Rdata +0 -0
- data/lib/viral_seq/util/sdrm_r.r +34 -0
- data/lib/viral_seq/version.rb +2 -2
- data/lib/viral_seq.rb +6 -0
- data/viral_seq.gemspec +2 -2
- metadata +17 -9
@@ -0,0 +1,193 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
|
3
|
+
# class to generate recency report
|
4
|
+
|
5
|
+
class RecencyReport
|
6
|
+
|
7
|
+
# to generate the recency report in .pdf format.
|
8
|
+
# @param log [Hash] Hash from the json summary string of the SDRM report
|
9
|
+
# @param outfile [String] path to the output file
|
10
|
+
# @return [NilClass] .pdf file generated by the method. Return nil.
|
11
|
+
|
12
|
+
def self.generate(log, outfile)
|
13
|
+
|
14
|
+
recency_color = {
|
15
|
+
"recent" => "d42828",
|
16
|
+
"chronic" => "0666bf",
|
17
|
+
"indeterminant"=> "f78914",
|
18
|
+
"insufficient data" => "7d7b79"
|
19
|
+
}
|
20
|
+
|
21
|
+
dual_infection_color = {
|
22
|
+
"Yes" => "ffcc00",
|
23
|
+
"No" => "339900",
|
24
|
+
"insufficient data" => "7d7b79"
|
25
|
+
}
|
26
|
+
|
27
|
+
Prawn::Document.generate(outfile, margin: 75) do
|
28
|
+
|
29
|
+
def text_format(text1, text2)
|
30
|
+
[
|
31
|
+
{ text: text1 + "\s" * (30 - text1.size), styles: [:bold], size: 14, font: "Courier"},
|
32
|
+
{ text: text2, size: 14, styles: [:underline]}
|
33
|
+
]
|
34
|
+
end
|
35
|
+
|
36
|
+
def text_format2(text1, text2, text3, text4)
|
37
|
+
text1 = text1.to_s
|
38
|
+
text2 = text2.to_s
|
39
|
+
text3 = text3.to_s
|
40
|
+
text4 = text4.to_s
|
41
|
+
|
42
|
+
[
|
43
|
+
{ text: "\s\s\s" + text1 + "\s"*(11-text1.size) +
|
44
|
+
text2 + "\s"*(19-text2.size) +
|
45
|
+
text3 + "\s"*(11-text3.size) + text4,
|
46
|
+
size: 14,
|
47
|
+
font: "Courier"
|
48
|
+
}
|
49
|
+
]
|
50
|
+
end
|
51
|
+
|
52
|
+
text("Quantitative Recency Report by MPID-NGS",
|
53
|
+
size: 18,
|
54
|
+
align: :center,
|
55
|
+
style: :bold
|
56
|
+
)
|
57
|
+
|
58
|
+
move_down 20
|
59
|
+
|
60
|
+
formatted_text(
|
61
|
+
text_format("Library ID:", log[:sample_id])
|
62
|
+
)
|
63
|
+
|
64
|
+
move_down 10
|
65
|
+
|
66
|
+
formatted_text(
|
67
|
+
text_format("ViralSeq Version:", ViralSeq::VERSION.to_s)
|
68
|
+
|
69
|
+
)
|
70
|
+
|
71
|
+
formatted_text(
|
72
|
+
text_format("TCS Version:", ViralSeq::TCS_VERSION.to_s)
|
73
|
+
)
|
74
|
+
|
75
|
+
formatted_text(
|
76
|
+
text_format("Processed Date", Time.now.strftime("%Y-%b-%d %H:%M"))
|
77
|
+
)
|
78
|
+
|
79
|
+
move_down 30
|
80
|
+
|
81
|
+
text("Summary of parameters",
|
82
|
+
size: 16,
|
83
|
+
style: :bold
|
84
|
+
)
|
85
|
+
|
86
|
+
move_down 20
|
87
|
+
|
88
|
+
formatted_text(
|
89
|
+
[
|
90
|
+
{ text: "REGION" + "\s"*5 + "AVG. DIVERSITY" + "\s"*5 + "DIST20" + "\s"*5 + "DEPTH",
|
91
|
+
styles: [:bold],
|
92
|
+
size: 14,
|
93
|
+
font: "Courier"
|
94
|
+
},
|
95
|
+
]
|
96
|
+
)
|
97
|
+
|
98
|
+
move_down 5
|
99
|
+
|
100
|
+
formatted_text(
|
101
|
+
text_format2("RT", log[:pi_RT], log[:dist20_RT], log[:tcs_RT])
|
102
|
+
)
|
103
|
+
|
104
|
+
formatted_text(
|
105
|
+
text_format2("V1V3", log[:pi_V1V3], log[:dist20_V1V3], log[:tcs_V1V3])
|
106
|
+
)
|
107
|
+
|
108
|
+
formatted_text(
|
109
|
+
text_format2("P17", log[:pi_P17], log[:dist20_P17], log[:tcs_P17])
|
110
|
+
)
|
111
|
+
|
112
|
+
move_down 30
|
113
|
+
|
114
|
+
formatted_text(
|
115
|
+
[
|
116
|
+
{ text: "Prediction: ",
|
117
|
+
styles: [:bold],
|
118
|
+
size: 16,
|
119
|
+
},
|
120
|
+
|
121
|
+
{ text: log[:recency].capitalize + " Infection",
|
122
|
+
styles: [:bold],
|
123
|
+
size: 16,
|
124
|
+
color: recency_color[log[:recency]]
|
125
|
+
},
|
126
|
+
|
127
|
+
{ text: " (9-month cutoff)",
|
128
|
+
size: 14,
|
129
|
+
},
|
130
|
+
]
|
131
|
+
)
|
132
|
+
|
133
|
+
move_down 20
|
134
|
+
|
135
|
+
formatted_text(
|
136
|
+
[
|
137
|
+
{
|
138
|
+
text: "Estimated Day Post Infection: ",
|
139
|
+
styles: [:bold],
|
140
|
+
size: 16
|
141
|
+
},
|
142
|
+
|
143
|
+
{
|
144
|
+
text: log[:dpi].to_s +
|
145
|
+
" (" + log[:dpi_lwr].to_s + "-" + log[:dpi_upr].to_s + ") Days",
|
146
|
+
styles: [:bold],
|
147
|
+
size: 16,
|
148
|
+
color: recency_color[log[:recency]]
|
149
|
+
}
|
150
|
+
]
|
151
|
+
)
|
152
|
+
|
153
|
+
move_down 20
|
154
|
+
|
155
|
+
formatted_text(
|
156
|
+
[
|
157
|
+
{
|
158
|
+
text: "Possible multivariant Infection: ",
|
159
|
+
styles: [:bold],
|
160
|
+
size: 16,
|
161
|
+
},
|
162
|
+
|
163
|
+
{
|
164
|
+
text: log[:possible_dual_infection],
|
165
|
+
styles: [:bold],
|
166
|
+
size: 16,
|
167
|
+
color: dual_infection_color[log[:possible_dual_infection]]
|
168
|
+
}
|
169
|
+
]
|
170
|
+
)
|
171
|
+
|
172
|
+
move_down 10
|
173
|
+
|
174
|
+
if log[:possible_dual_infection] == "Yes"
|
175
|
+
|
176
|
+
formatted_text(
|
177
|
+
[
|
178
|
+
{
|
179
|
+
text: "Warning: Days Post Infection prediction not reliable!",
|
180
|
+
styles: [:bold],
|
181
|
+
size: 14,
|
182
|
+
color: "ffcc00"
|
183
|
+
}
|
184
|
+
]
|
185
|
+
)
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
end
|
190
|
+
|
191
|
+
end
|
192
|
+
|
193
|
+
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -495,7 +495,7 @@ module ViralSeq
|
|
495
495
|
# total G->A mutations at apobec3g/f positions.
|
496
496
|
total = 0
|
497
497
|
|
498
|
-
unless ref
|
498
|
+
unless ref
|
499
499
|
# make consensus sequence for the input sequence hash
|
500
500
|
ref = self.consensus
|
501
501
|
end
|
@@ -571,7 +571,7 @@ module ViralSeq
|
|
571
571
|
hm_hash.each do |k,_v|
|
572
572
|
hm_seq_hash.dna_hash[k] = self.dna_hash[k]
|
573
573
|
end
|
574
|
-
|
574
|
+
|
575
575
|
hm_seq_hash.title = self.title + "_hypermut"
|
576
576
|
hm_seq_hash.file = self.file
|
577
577
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
@@ -713,7 +713,7 @@ module ViralSeq
|
|
713
713
|
|
714
714
|
|
715
715
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
716
|
-
# @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
|
716
|
+
# @param algorithm [Symbol], algorithm for MUSCLE5 only. Choose from :PPP or :Super5.
|
717
717
|
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
718
718
|
# @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
|
719
719
|
|
@@ -729,7 +729,7 @@ module ViralSeq
|
|
729
729
|
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
730
730
|
File.open(temp_file, 'w'){|f| seq_hash.each {|k,v| f.puts k; f.puts v}}
|
731
731
|
if path_to_muscle
|
732
|
-
unless ViralSeq.check_muscle?(path_to_muscle)
|
732
|
+
unless ViralSeq::Muscle.check_muscle?(path_to_muscle)
|
733
733
|
File.unlink(temp_file)
|
734
734
|
return nil
|
735
735
|
end
|
@@ -87,7 +87,8 @@ module ViralSeq
|
|
87
87
|
end
|
88
88
|
|
89
89
|
# Pair-end join function for KNOWN overlap size.
|
90
|
-
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
90
|
+
# @param overlap [Integer] simple overlap value indicating how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
91
|
+
# overlap can also be an explicit [Hash] object with the keys :overlap_size, :r1_overlapped, :r2_overlapped, :before_overlap, :after_overlap
|
91
92
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
92
93
|
# @return [ViralSeq::SeqHash] a SeqHash object of joined sequences.
|
93
94
|
# @example join paired-end sequences with different :diff cut-offs, overlap provided.
|
@@ -106,24 +107,64 @@ module ViralSeq
|
|
106
107
|
# => [">pair1", ">pair2", ">pair3"]
|
107
108
|
|
108
109
|
def join1(overlap = 0, diff = 0.0)
|
109
|
-
seq_pair_hash = self.dna_hash
|
110
|
-
raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
|
111
110
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
111
|
+
|
112
|
+
if overlap.is_a? Integer and overlap.zero?
|
113
|
+
overlap = {
|
114
|
+
overlap_size: 0,
|
115
|
+
r1_overlapped: 0...0,
|
116
|
+
r2_overlapped: 0...0,
|
117
|
+
before_overlap: {
|
118
|
+
region: :r1,
|
119
|
+
range: 0..-1,
|
120
|
+
} ,
|
121
|
+
after_overlap: {
|
122
|
+
region: :r2,
|
123
|
+
range: 0..-1
|
124
|
+
}
|
125
|
+
}
|
126
|
+
elsif overlap.is_a? Integer
|
127
|
+
overlap = {
|
128
|
+
overlap_size: overlap,
|
129
|
+
r1_overlapped: -overlap..-1,
|
130
|
+
r2_overlapped: 0..(overlap - 1),
|
131
|
+
before_overlap: {
|
132
|
+
region: :r1,
|
133
|
+
range: 0..(-overlap - 1),
|
134
|
+
} ,
|
135
|
+
after_overlap: {
|
136
|
+
region: :r2,
|
137
|
+
range: overlap..-1
|
138
|
+
}
|
139
|
+
}
|
140
|
+
end
|
141
|
+
|
142
|
+
seq_pair_hash = self.dna_hash
|
112
143
|
joined_seq = {}
|
113
144
|
seq_pair_hash.each do |seq_name,seq_pair|
|
114
145
|
r1_seq = seq_pair[0]
|
115
146
|
r2_seq = seq_pair[1]
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
147
|
+
|
148
|
+
r1_overlap = r1_seq[overlap[:r1_overlapped]]
|
149
|
+
r2_overlap = r2_seq[overlap[:r2_overlapped]]
|
150
|
+
|
151
|
+
overlap_size = overlap[:overlap_size]
|
152
|
+
|
153
|
+
if (diff.zero? and r1_overlap == r2_overlap) or (!diff.zero? and r1_overlap.compare_with(r2_overlap) <= (overlap_size.abs * diff))
|
154
|
+
if overlap[:before_overlap][:region] == :r1
|
155
|
+
before_overlap_seq = r1_seq[overlap[:before_overlap][:range]]
|
156
|
+
elsif overlap[:before_overlap][:region] == :r2
|
157
|
+
before_overlap_seq = r2_seq[overlap[:before_overlap][:range]]
|
121
158
|
end
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
159
|
+
|
160
|
+
if overlap[:after_overlap][:region] == :r1
|
161
|
+
after_overlap_seq = r1_seq[overlap[:after_overlap][:range]]
|
162
|
+
elsif overlap[:after_overlap][:region] == :r2
|
163
|
+
after_overlap_seq = r2_seq[overlap[:after_overlap][:range]]
|
164
|
+
end
|
165
|
+
joined_sequence = before_overlap_seq + r1_overlap + after_overlap_seq
|
126
166
|
end
|
167
|
+
|
127
168
|
joined_seq[seq_name] = joined_sequence if joined_sequence
|
128
169
|
end
|
129
170
|
|
@@ -164,18 +205,35 @@ module ViralSeq
|
|
164
205
|
elsif model == :indiv
|
165
206
|
joined_seq = {}
|
166
207
|
seq_pair_hash.each do |seq_name, seq_pair|
|
208
|
+
r1_seq = seq_pair[0]
|
209
|
+
r2_seq = seq_pair[1]
|
167
210
|
overlap_list = []
|
168
|
-
|
169
|
-
|
211
|
+
|
212
|
+
overlap_matrix(r1_seq, r2_seq).each do |overlap1, diff_nt|
|
213
|
+
cut_off_base = overlap1[:overlap_size] * diff
|
170
214
|
overlap_list << overlap1 if diff_nt <= cut_off_base
|
171
215
|
end
|
216
|
+
|
172
217
|
if overlap_list.empty?
|
173
|
-
joined_seq[seq_name]
|
218
|
+
joined_seq[seq_name] = seq_pair[0] + seq_pair[1]
|
174
219
|
else
|
175
|
-
|
176
|
-
|
220
|
+
overlap_to_use = overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
|
221
|
+
|
222
|
+
if overlap_to_use[:before_overlap][:region] == :r1
|
223
|
+
before_overlap_seq = r1_seq[overlap_to_use[:before_overlap][:range]]
|
224
|
+
elsif overlap_to_use[:before_overlap][:region] == :r2
|
225
|
+
before_overlap_seq = r2_seq[overlap_to_use[:before_overlap][:range]]
|
226
|
+
end
|
227
|
+
|
228
|
+
if overlap_to_use[:after_overlap][:region] == :r1
|
229
|
+
after_overlap_seq = r1_seq[overlap_to_use[:after_overlap][:range]]
|
230
|
+
elsif overlap_to_use[:after_overlap][:region] == :r2
|
231
|
+
after_overlap_seq = r2_seq[overlap_to_use[:after_overlap][:range]]
|
232
|
+
end
|
233
|
+
joined_seq[seq_name] = before_overlap_seq + r1_seq[overlap_to_use[:r1_overlapped]] + after_overlap_seq
|
177
234
|
end
|
178
235
|
end
|
236
|
+
|
179
237
|
joined_seq_hash = ViralSeq::SeqHash.new
|
180
238
|
joined_seq_hash.dna_hash = joined_seq
|
181
239
|
joined_seq_hash.title = self.title + "_joined"
|
@@ -197,35 +255,104 @@ module ViralSeq
|
|
197
255
|
seq_pair_hash.each do |_seq_name, seq_pair|
|
198
256
|
overlap_list = []
|
199
257
|
matrix = overlap_matrix(seq_pair[0], seq_pair[1])
|
200
|
-
matrix.each do |
|
258
|
+
matrix.each do |overlap_positions, diff_nt|
|
259
|
+
overlap = overlap_positions[:overlap_size].abs
|
201
260
|
cut_off_base = overlap * diff
|
202
|
-
overlap_list <<
|
261
|
+
overlap_list << overlap_positions if diff_nt <= cut_off_base
|
203
262
|
end
|
263
|
+
|
204
264
|
if overlap_list.empty?
|
205
|
-
overlaps <<
|
265
|
+
overlaps << {
|
266
|
+
overlap_size: 0,
|
267
|
+
r1_overlapped: 0...0,
|
268
|
+
r2_overlapped: 0...0,
|
269
|
+
before_overlap: {
|
270
|
+
region: :r1,
|
271
|
+
range: 0..-1,
|
272
|
+
} ,
|
273
|
+
after_overlap: {
|
274
|
+
region: :r2,
|
275
|
+
range: 0..-1
|
276
|
+
}
|
277
|
+
}
|
206
278
|
else
|
207
|
-
overlaps << overlap_list.
|
279
|
+
overlaps << overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
|
208
280
|
end
|
281
|
+
|
209
282
|
end
|
210
283
|
count_overlaps = overlaps.count_freq
|
211
284
|
max_value = count_overlaps.values.max
|
212
285
|
max_overlap_list = []
|
213
286
|
count_overlaps.each {|overlap, counts| max_overlap_list << overlap if counts == max_value}
|
214
|
-
max_overlap_list.
|
287
|
+
max_overlap_list.sort_by{|k| k[:overlap_size].abs}.reverse[0]
|
215
288
|
end # end pf determine_overlap_pid_pair
|
216
289
|
|
217
290
|
# Score every candidate overlap between a pair of sequences.
#
# Candidates come from #overlap_list, called with the two sequence lengths.
# For each candidate, the overlapped slice of each sequence is extracted and
# the two slices are compared with String#compare_with.
#
# @param sequence1 [String] first sequence of the pair
# @param sequence2 [String] second sequence of the pair
# @return [Hash] candidate overlap Hash => result of String#compare_with
#   (presumably the number of differing positions -- confirm against
#   String#compare_with)
def overlap_matrix(sequence1, sequence2)
  candidates = overlap_list(sequence1.size, sequence2.size)
  candidates.each_with_object({}) do |candidate, scores|
    r1_part = sequence1[candidate[:r1_overlapped]]
    r2_part = sequence2[candidate[:r2_overlapped]]
    scores[candidate] = r1_part.compare_with(r2_part)
  end
end
|
303
|
+
|
304
|
+
# Given the lengths of R1 and R2, return all possible overlapping
# arrangements of the two reads.
#
# Each arrangement is a Hash with the keys
#   :overlap_size   [Integer] number of overlapped bases
#   :r1_overlapped  [Range]   overlapped positions on R1
#   :r2_overlapped  [Range]   overlapped positions on R2
#   :before_overlap [Hash]    {region: :r1 or :r2, range: Range} to put before the overlap
#   :after_overlap  [Hash]    {region: :r1 or :r2, range: Range} to put after the overlap
#
# Three groups are listed, in this order:
# 1. forward overlaps, tail of R1 on head of R2, from 4 bases up to the
#    length of the shorter read;
# 2. full-length overlaps where the shorter read sits inside the longer one
#    at each possible offset;
# 3. reverse overlaps, head of R1 on tail of R2, from one base less than the
#    shorter read length down to half the length of R1.
#
# @param l1 [Integer] length of R1
# @param l2 [Integer] length of R2
# @return [Array<Hash>] list of overlap descriptions
def overlap_list(l1, l2)
  return_list = []
  min_overlap = 4
  max_overlap = [l1, l2].min
  diff = (l1 - l2).abs
  # Never go below 1: a zero-length reverse overlap would produce 0..-1
  # ranges, which select a whole read rather than nothing.
  max_reverse = [l1 / 2, 1].max

  (min_overlap..max_overlap).each do |overlap|
    # When the overlap covers all of R1 nothing precedes it. The inclusive
    # form would be 0..-1 here, i.e. the whole of R1, and the joined
    # sequence would contain R1 twice.
    before_range = overlap == l1 ? (0...0) : (0..(l1 - overlap - 1))
    return_list << {
      overlap_size: overlap,
      r1_overlapped: (l1 - overlap)..(l1 - 1),
      r2_overlapped: 0..(overlap - 1),
      before_overlap: { region: :r1, range: before_range },
      after_overlap: { region: :r2, range: overlap..(l2 - 1) }
    }
  end

  if l1 >= l2
    # R2 lies entirely inside R1, shifted by +overlap+ bases from R1's end.
    (1..diff).each do |overlap|
      return_list << {
        overlap_size: max_overlap,
        r1_overlapped: (diff - overlap)..(l1 - 1 - overlap),
        r2_overlapped: 0..(l2 - 1),
        before_overlap: { region: :r1, range: 0...(diff - overlap) },
        after_overlap: { region: :r1, range: (l1 - overlap)...l1 }
      }
    end
  else
    # R1 lies entirely inside R2, shifted by +overlap+ bases from R2's start.
    (1..diff).each do |overlap|
      return_list << {
        overlap_size: max_overlap,
        r1_overlapped: 0..(l1 - 1),
        r2_overlapped: overlap..(max_overlap + overlap - 1),
        before_overlap: { region: :r2, range: 0...overlap },
        after_overlap: { region: :r2, range: (max_overlap + overlap)...l2 }
      }
    end
  end

  (max_reverse..(max_overlap - 1)).reverse_each do |overlap|
    return_list << {
      overlap_size: overlap,
      r1_overlapped: 0..(overlap - 1),
      r2_overlapped: (l2 - overlap)..(l2 - 1),
      before_overlap: { region: :r2, range: 0..(l2 - overlap - 1) },
      after_overlap: { region: :r1, range: overlap..(l1 - 1) }
    }
  end

  return_list
end # end of overlap_list
|
229
356
|
|
230
357
|
end # end of SeqHashPair
|
231
358
|
end # end of ViralSeq
|