viral_seq 1.0.14 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/tcs_sdrm ADDED
@@ -0,0 +1,409 @@
1
+ #!/usr/bin/env ruby
2
+ # tcs/sdrm pipeline for HIV-1 drug resistance mutation and recency
3
+ #
4
+ # command example:
5
+ # $ tcs_sdrm libs_dir
6
+ #
7
+ # libs_dir file structure:
8
+ # libs_dir
9
+ # ├── lib1
10
+ # ├── lib1_RT
11
+ # ├── lib1_PR
12
+ # ├── lib1_IN
13
+ # ├── lib1_V1V3
14
+ # ├── lib2
15
+ # ├── lib2_RT
16
+ # ├── lib2_PR
17
+ # ├── lib2_IN
18
+ # ├── lib2_V1V3
19
+ # ├── ...
20
+ #
21
+ # output data in a new dir as 'libs_dir_SDRM'
22
+
23
+ require 'viral_seq'
24
+ require 'json'
25
+ require 'csv'
26
+ require 'fileutils'
27
+ require 'prawn'
28
+ require 'prawn/table'
29
+ require 'combine_pdf'
30
+
31
+ unless ARGV[0] && File.directory?(ARGV[0])
32
+ abort "No sequence data provided. `tcs_sdrm` pipeline aborted. "
33
+ end
34
+
35
+ begin
36
+ r_version = `R --version`.split("\n")[0]
37
+ r_check = `R -e '#{ViralSeq::R_SCRIPT_CHECK_PACKAGES}' > /dev/null 2>&1`
38
+ rescue Errno::ENOENT
39
+ abort '"R" is not installed. Install R at https://www.r-project.org/' +
40
+ "\n`tcs_sdrm` pipeline aborted."
41
+ end
42
+
43
# Builds the short text form of one substitution-report row, e.g.
# "D30N:1.5(0.84-2.46); ".
#
# The fields are read by position: data[3], data[2] and data[4] are
# concatenated as the mutation label (the substitution CSV header written by
# this script names those columns "wild type", "AA position" and "mutation"),
# and data[6], data[7], data[8] (named "percentage", "95% CI low",
# "95% CI high") are fractions that are rendered as percentages rounded to
# two decimals.
#
# @param data [Array<String>] one row of the substitution CSV, already split
#   on commas
# @return [String] "<label>:<pct>(<low>-<high>); " including the trailing
#   "; " separator so that successive results can be appended to each other
def abstract_line(data)
  # Fraction (String or Numeric) -> percent string rounded to 2 decimals.
  percent = ->(fraction) { (fraction.to_f * 100).round(2).to_s }

  "#{data[3]}#{data[2]}#{data[4]}:" \
    "#{percent.call(data[6])}(#{percent.call(data[7])}-#{percent.call(data[8])}); "
end
49
+
50
+ # run params
51
+ log = []
52
+
53
+ log << { time: Time.now }
54
+ log << { viral_seq_version: ViralSeq::VERSION }
55
+ log << { tcs_version: ViralSeq::TCS_VERSION }
56
+ log << { R_version: r_version}
57
+ sdrm_list = {}
58
+ sdrm_list[:nrti] = ViralSeq::DRMs.sdrm_json(:nrti)
59
+ sdrm_list[:nnrti] = ViralSeq::DRMs.sdrm_json(:nnrti)
60
+ sdrm_list[:hiv_pr] = ViralSeq::DRMs.sdrm_json(:hiv_pr)
61
+ sdrm_list[:hiv_in] = ViralSeq::DRMs.sdrm_json(:hiv_in)
62
+ log << { sdrm_list: sdrm_list }
63
+
64
+ # input dir
65
+ indir = ARGV[0]
66
+ libs = Dir[indir + "/*"]
67
+ log << { processed_libs: libs }
68
+
69
+ #output dir
70
+ outdir = indir + "_SDRM"
71
+ Dir.mkdir(outdir) unless File.directory?(outdir)
72
+
73
+ libs.each do |lib|
74
+
75
+ r_script = ViralSeq::R_SCRIPT.dup
76
+
77
+ next unless File.directory?(lib)
78
+
79
+ lib_name = File.basename(lib)
80
+ out_lib_dir = File.join(outdir, lib_name)
81
+ Dir.mkdir(out_lib_dir) unless File.directory?(out_lib_dir)
82
+
83
+ sub_seq_files = Dir[lib + "/*"]
84
+
85
+ seq_summary_file = File.join(out_lib_dir, (lib_name + "_summary.csv"))
86
+ seq_summary_out = File.open(seq_summary_file, "w")
87
+ seq_summary_out.puts 'Region,TCS,TCS with A3G/F hypermutation,TCS with stop codon,' +
88
+ 'TCS w/o hypermutation and stop codon,' +
89
+ 'Poisson cutoff for minority mutation (>=),Pi,Dist20'
90
+
91
+ point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
92
+ point_mutation_out = File.open(point_mutation_file, "w")
93
+ point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
94
+ "number,percentage,95% CI low, 95% CI high, notes"
95
+
96
+ linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
97
+ linkage_out = File.open(linkage_file, "w")
98
+ linkage_out.puts "region,TCS,mutation linkage,number," +
99
+ "percentage,95% CI low, 95% CI high, notes"
100
+
101
+ aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
102
+ aa_report_out = File.open(aa_report_file, "w")
103
+ aa_report_out.puts "region,ref.aa.positions,TCS.number," +
104
+ ViralSeq::AMINO_ACID_LIST.join(",")
105
+
106
+ summary_json_file = File.join(out_lib_dir, (lib_name + "_summary.json"))
107
+ summary_json_out = File.open(summary_json_file,"w")
108
+
109
+ filtered_seq_dir = File.join(out_lib_dir, (lib_name + "_filtered_seq"))
110
+ Dir.mkdir(filtered_seq_dir) unless File.directory?(filtered_seq_dir)
111
+
112
+ aln_seq_dir = File.join(out_lib_dir, (lib_name + "_aln_seq"))
113
+ Dir.mkdir(aln_seq_dir) unless File.directory?(aln_seq_dir)
114
+
115
+ point_mutation_list = []
116
+ linkage_list = []
117
+ aa_report_list = []
118
+ summary_hash = {}
119
+
120
+ sub_seq_files.each do |sub_seq|
121
+ seq_basename = File.basename(sub_seq)
122
+ seqs = ViralSeq::SeqHash.fa(sub_seq)
123
+ next if seqs.size < 3
124
+ if seq_basename =~ /V1V3/i
125
+ summary_hash[:V1V3] = "#{seqs.size.to_s},NA,NA,NA,NA"
126
+ FileUtils.cp(sub_seq, filtered_seq_dir)
127
+ elsif seq_basename =~ /PR/i
128
+ a3g_check = seqs.a3g
129
+ a3g_seqs = a3g_check[:a3g_seq]
130
+ a3g_filtered_seqs = a3g_check[:filtered_seq]
131
+ stop_codon_check = a3g_filtered_seqs.stop_codon
132
+ stop_codon_seqs = stop_codon_check[:with_stop_codon]
133
+ filtered_seqs = stop_codon_check[:without_stop_codon]
134
+ poisson_minority_cutoff = filtered_seqs.pm
135
+ summary_hash[:PR] = [
136
+ seqs.size.to_s,
137
+ a3g_seqs.size.to_s,
138
+ stop_codon_seqs.size.to_s,
139
+ filtered_seqs.size.to_s,
140
+ poisson_minority_cutoff.to_s
141
+ ].join(',')
142
+ next if filtered_seqs.size < 3
143
+ filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
144
+
145
+ sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
146
+ point_mutation_list += sdrm[0]
147
+ linkage_list += sdrm[1]
148
+ aa_report_list += sdrm[2]
149
+
150
+ elsif seq_basename =~/IN/i
151
+ a3g_check = seqs.a3g
152
+ a3g_seqs = a3g_check[:a3g_seq]
153
+ a3g_filtered_seqs = a3g_check[:filtered_seq]
154
+ stop_codon_check = a3g_filtered_seqs.stop_codon(2)
155
+ stop_codon_seqs = stop_codon_check[:with_stop_codon]
156
+ filtered_seqs = stop_codon_check[:without_stop_codon]
157
+ poisson_minority_cutoff = filtered_seqs.pm
158
+ summary_hash[:IN] = [
159
+ seqs.size.to_s,
160
+ a3g_seqs.size.to_s,
161
+ stop_codon_seqs.size.to_s,
162
+ filtered_seqs.size.to_s,
163
+ poisson_minority_cutoff.to_s
164
+ ].join(',')
165
+ next if filtered_seqs.size < 3
166
+ filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
167
+
168
+ sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
169
+ point_mutation_list += sdrm[0]
170
+ linkage_list += sdrm[1]
171
+ aa_report_list += sdrm[2]
172
+
173
+ elsif seq_basename =~/RT/i
174
+ rt_seq1 = {}
175
+ rt_seq2 = {}
176
+ seqs.dna_hash.each do |k,v|
177
+ rt_seq1[k] = v[0,267]
178
+ rt_seq2[k] = v[267..-1]
179
+ end
180
+ rt1 = ViralSeq::SeqHash.new(rt_seq1)
181
+ rt2 = ViralSeq::SeqHash.new(rt_seq2)
182
+ rt1_a3g = rt1.a3g
183
+ rt2_a3g = rt2.a3g
184
+ hypermut_seq_rt1 = rt1_a3g[:a3g_seq]
185
+ hypermut_seq_rt2 = rt2_a3g[:a3g_seq]
186
+ rt1_stop_codon = rt1.stop_codon(1)[:with_stop_codon]
187
+ rt2_stop_codon = rt2.stop_codon(2)[:with_stop_codon]
188
+ hypermut_seq_keys = (hypermut_seq_rt1.dna_hash.keys | hypermut_seq_rt2.dna_hash.keys)
189
+ stop_codon_seq_keys = (rt1_stop_codon.dna_hash.keys | rt2_stop_codon.dna_hash.keys)
190
+ reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
191
+ filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
192
+ poisson_minority_cutoff = filtered_seqs.pm
193
+ summary_hash[:RT] = [
194
+ seqs.size.to_s,
195
+ hypermut_seq_keys.size.to_s,
196
+ stop_codon_seq_keys.size.to_s,
197
+ filtered_seqs.size.to_s,
198
+ poisson_minority_cutoff.to_s
199
+ ].join(',')
200
+ next if filtered_seqs.size < 3
201
+ filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
202
+
203
+ sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
204
+ point_mutation_list += sdrm[0]
205
+ linkage_list += sdrm[1]
206
+ aa_report_list += sdrm[2]
207
+ end
208
+ end
209
+
210
+ point_mutation_list.each do |record|
211
+ point_mutation_out.puts record.join(",")
212
+ end
213
+ linkage_list.each do |record|
214
+ linkage_out.puts record.join(",")
215
+ end
216
+ aa_report_list.each do |record|
217
+ aa_report_out.puts record.join(",")
218
+ end
219
+
220
+ filtered_seq_files = Dir[filtered_seq_dir + "/*"]
221
+
222
+ out_r_csv = File.join(out_lib_dir, (lib_name + "_pi.csv"))
223
+ out_r_pdf = File.join(out_lib_dir, (lib_name + "_pi.pdf"))
224
+
225
+ if filtered_seq_files.size > 0
226
+ filtered_seq_files.each do |seq_file|
227
+ filtered_sh = ViralSeq::SeqHash.fa(seq_file)
228
+ next if filtered_sh.size < 3
229
+ aligned_sh = filtered_sh.random_select(1000).align
230
+ aligned_sh.write_nt_fa(File.join(aln_seq_dir, File.basename(seq_file)))
231
+ end
232
+
233
+ r_script.gsub!(/PATH_TO_FASTA/,aln_seq_dir)
234
+ File.unlink(out_r_csv) if File.exist?(out_r_csv)
235
+ File.unlink(out_r_pdf) if File.exist?(out_r_pdf)
236
+ r_script.gsub!(/OUTPUT_CSV/,out_r_csv)
237
+ r_script.gsub!(/OUTPUT_PDF/,out_r_pdf)
238
+ r_script_file = File.join(out_lib_dir, "/pi.R")
239
+ File.open(r_script_file,"w") {|line| line.puts r_script}
240
# Run the generated R script with both output streams discarded.
# The argv form of `system` bypasses the shell, so a library directory whose
# path contains spaces or shell metacharacters is passed to Rscript intact
# (the previous backtick string interpolated the path unquoted).
# The exit status is intentionally not inspected: the code that follows
# decides what to do based on whether the output CSV exists.
system("Rscript", r_script_file, out: File::NULL, err: File::NULL)
241
+ if File.exist?(out_r_csv)
242
+ pi_csv = File.readlines(out_r_csv)
243
# Append two extra columns (columns 2 and 3 of each R output row, rounded to
# 4 decimals) to the matching region's summary string. The summary CSV
# header written earlier names these columns "Pi" and "Dist20".
pi_csv.each do |line|
  data = line.chomp.split(",")
  # Blank line: nothing to attach.
  next if data.empty?

  # Region tag = last "_"-separated piece of column 1, stripped of non-word
  # characters (presumably a file label such as "lib1_RT" -> :RT; confirm
  # against the R script's output format).
  tag = data[0].to_s.split("_").last.to_s.gsub(/\W/, "").to_sym

  # A tag with no entry in summary_hash would make `nil + String` raise
  # NoMethodError and abort the whole run; skip such rows instead.
  next unless summary_hash.key?(tag)

  summary_hash[tag] += "," + data[1].to_f.round(4).to_s + "," + data[2].to_f.round(4).to_s
end
249
+ [:PR, :RT, :IN, :V1V3].each do |regions|
250
+ next unless summary_hash[regions]
251
+ seq_summary_out.puts regions.to_s + "," + summary_hash[regions]
252
+ end
253
+ File.unlink(out_r_csv)
254
+ end
255
+ File.unlink(r_script_file)
256
+ end
257
+
258
+ seq_summary_out.close
259
+ point_mutation_out.close
260
+ linkage_out.close
261
+ aa_report_out.close
262
+
263
+ summary_lines = File.readlines(seq_summary_file)
264
+ summary_lines.shift
265
+
266
+ tcs_PR = 0
267
+ tcs_RT = 0
268
+ tcs_IN = 0
269
+ tcs_V1V3 = 0
270
+ pi_RT = 0.0
271
+ pi_V1V3 = 0.0
272
+ dist20_RT = 0.0
273
+ dist20_V1V3 = 0.0
274
+ summary_lines.each do |line|
275
+ data = line.chomp.split(",")
276
+ if data[0] == "PR"
277
+ tcs_PR = data[4].to_i
278
+ elsif data[0] == "RT"
279
+ tcs_RT = data[4].to_i
280
+ pi_RT = data[6].to_f
281
+ dist20_RT = data[7].to_f
282
+ elsif data[0] == "IN"
283
+ tcs_IN = data[4].to_i
284
+ elsif data[0] == "V1V3"
285
+ tcs_V1V3 = data[1].to_i
286
+ pi_V1V3 = data[6].to_f
287
+ dist20_V1V3 = data[7].to_f
288
+ end
289
+ end
290
+
291
+ recency = ViralSeq::Recency.define(
292
+ tcs_RT: tcs_RT,
293
+ tcs_V1V3: tcs_V1V3,
294
+ pi_RT: pi_RT,
295
+ dist20_RT: dist20_RT,
296
+ pi_V1V3: pi_V1V3,
297
+ dist20_V1V3: dist20_V1V3
298
+ )
299
+
300
+ sdrm_lines = File.readlines(point_mutation_file)
301
+ sdrm_lines.shift
302
+ sdrm_PR = ""
303
+ sdrm_RT = ""
304
+ sdrm_IN = ""
305
+ sdrm_lines.each do |line|
306
+ data = line.chomp.split(",")
307
+ next if data[-1] == "*"
308
+ if data[0] == "PR"
309
+ sdrm_PR += abstract_line(data)
310
+ elsif data[0] =~ /NRTI/
311
+ sdrm_RT += abstract_line(data)
312
+ elsif data[0] == "IN"
313
+ sdrm_IN += abstract_line(data)
314
+ end
315
+ end
316
+
317
# Per-library summary that is serialized to <lib_name>_summary.json.
# Kept as a one-element array wrapping a hash, as before, so the JSON
# top-level shape is unchanged.
# pi_V1V3 is now included: it is parsed from the summary CSV and passed to
# ViralSeq::Recency.define alongside the other five metrics, but was the only
# one of them missing from this report.
summary_json = [
  {
    sample_id: lib_name,
    tcs_PR: tcs_PR,
    tcs_RT: tcs_RT,
    tcs_IN: tcs_IN,
    tcs_V1V3: tcs_V1V3,
    pi_RT: pi_RT,
    dist20_RT: dist20_RT,
    pi_V1V3: pi_V1V3,
    dist20_V1V3: dist20_V1V3,
    recency: recency,
    sdrm_PR: sdrm_PR,
    sdrm_RT: sdrm_RT,
    sdrm_IN: sdrm_IN
  }
]
331
+
332
+ summary_json_out.puts JSON.pretty_generate(summary_json)
333
+ summary_json_out.close
334
+
335
+ csvs = [
336
+ {
337
+ name: "summary",
338
+ title: "Summary",
339
+ file: seq_summary_file,
340
+ newPDF: "",
341
+ table_width: [65,55,110,110,110,110,60,60],
342
+ extra_text: ""
343
+ },
344
+ {
345
+ name: "substitution",
346
+ title: "Surveillance Drug Resistance Mutations",
347
+ file: point_mutation_file,
348
+ newPDF: "",
349
+ table_width: [65,55,85,80,60,65,85,85,85,45],
350
+ extra_text: "* Mutation below Poisson cut-off for minority mutations"
351
+ },
352
+ {
353
+ name: "linkage",
354
+ title: "Mutation Linkage",
355
+ file: linkage_file,
356
+ newPDF: "",
357
+ table_width: [55,50,250,60,80,80,80,45],
358
+ extra_text: "* Mutation below Poisson cut-off for minority mutations"
359
+ }
360
+ ]
361
+
362
# Render each report CSV as a landscape PDF containing a title, one table and
# an optional footnote, and record the PDF path in csv[:newPDF] so the pages
# can be merged afterwards.
csvs.each do |csv|
  file_name = File.join(out_lib_dir, (csv[:name] + ".pdf"))
  next unless File.exist?(csv[:file])

  Prawn::Document.generate(file_name, :page_layout => :landscape) do |pdf|
    pdf.text((File.basename(lib, ".*") + ': ' + csv[:title]),
             :size => 20,
             :align => :center,
             :style => :bold)
    pdf.move_down 20

    # CSV.read opens, parses and closes the file in one call; the previous
    # CSV.open(...).to_a never closed the handle it opened.
    table_data = CSV.read(csv[:file])
    header = table_data.first
    pdf.table(table_data,
              :header => header,
              :position => :center,
              :column_widths => csv[:table_width],
              :row_colors => ["B6B6B6", "FFFFFF"],
              :cell_style => {:align => :center, :size => 10}) do |table|
      # Emphasize the header row.
      table.row(0).style :font_style => :bold, :size => 12
    end
    pdf.move_down 5
    pdf.text(csv[:extra_text], :size => 8, :align => :justify)
  end
  csv[:newPDF] = file_name
end
386
+
387
+ pdf = CombinePDF.new
388
+ csvs.each do |csv|
389
+ pdf << CombinePDF.load(csv[:newPDF]) if File.exist?(csv[:newPDF])
390
+ end
391
+ pdf << CombinePDF.load(out_r_pdf) if File.exist?(out_r_pdf)
392
+
393
# Stamp every page of the combined report with a footer and page number.
# The version shown is ViralSeq::VERSION (the same constant recorded in the
# run log); the previously interpolated $sdrm_version_number global is never
# assigned in this script, so the footer rendered an empty version.
pdf.number_pages location: [:bottom_right],
                 number_format: "Swanstrom's lab HIV SDRM Pipeline, version #{ViralSeq::VERSION} by S.Z. and M.U.C. Page %s",
                 font_size: 6,
                 opacity: 0.5
397
+
398
+ pdf.save File.join(out_lib_dir, (lib_name + ".pdf"))
399
+
400
+ csvs.each do |csv|
401
+ File.unlink csv[:newPDF]
402
+ end
403
+ end
404
+
405
+ log_file = File.join(File.dirname(indir), "sdrm_log.json")
406
+
407
+ File.open(log_file, 'w') { |f| f.puts JSON.pretty_generate(log) }
408
+
409
+ FileUtils.touch(File.join(outdir, ".done"))
Binary file
data/docs/dr.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "platform_error_rate": 0.02,
3
+ "primer_pairs": [
4
+ {
5
+ "region": "RT",
6
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
7
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
8
+ "majority": 0.5,
9
+ "end_join": true,
10
+ "end_join_option": 1,
11
+ "overlap": 0,
12
+ "TCS_QC": true,
13
+ "ref_genome": "HXB2",
14
+ "ref_start": 2648,
15
+ "ref_end": 3257,
16
+ "indel": true,
17
+ "trim": false
18
+ },
19
+ {
20
+ "region": "PR",
21
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
22
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
23
+ "majority": 0.5,
24
+ "end_join": true,
25
+ "end_join_option": 3,
26
+ "TCS_QC": true,
27
+ "ref_genome": "HXB2",
28
+ "ref_start": 0,
29
+ "ref_end": 2591,
30
+ "indel": true,
31
+ "trim": true,
32
+ "trim_ref": "HXB2",
33
+ "trim_ref_start": 2253,
34
+ "trim_ref_end": 2549
35
+ },
36
+ {
37
+ "region": "IN",
38
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
39
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
40
+ "majority": 0.5,
41
+ "end_join": true,
42
+ "end_join_option": 3,
43
+ "overlap": 171,
44
+ "TCS_QC": true,
45
+ "ref_genome": "HXB2",
46
+ "ref_start": 4384,
47
+ "ref_end": 4751,
48
+ "indel": false,
49
+ "trim": false
50
+ },
51
+ {
52
+ "region": "V1V3",
53
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
54
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
55
+ "majority": 0.5,
56
+ "end_join": true,
57
+ "end_join_option": 1,
58
+ "overlap": 0,
59
+ "TCS_QC": true,
60
+ "ref_genome": "HXB2",
61
+ "ref_start": 6585,
62
+ "ref_end": 7208,
63
+ "indel": true,
64
+ "trim": false
65
+ }
66
+ ]
67
+ }