viral_seq 1.0.13 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/tcs_sdrm ADDED
@@ -0,0 +1,402 @@
1
+ #!/usr/bin/env ruby
2
+ # tcs/sdrm pipeline for HIV-1 drug resistance mutation and recency
3
+ #
4
+ # command example:
5
+ # $ tcs_sdrm libs_dir
6
+ #
7
+ # libs_dir file structure:
8
+ # libs_dir
9
+ # ├── lib1
10
+ # ├── lib1_RT
11
+ # ├── lib1_PR
12
+ # ├── lib1_IN
13
+ # ├── lib1_V1V3
14
+ # ├── lib2
15
+ # ├── lib2_RT
16
+ # ├── lib2_PR
17
+ # ├── lib2_IN
18
+ # ├── lib2_V1V3
19
+ # ├── ...
20
+ #
21
+ # output data in a new dir as 'libs_dir_SDRM'
22
+
23
+ require 'viral_seq'
24
+ require 'json'
25
+ require 'csv'
26
+ require 'fileutils'
27
+ require 'prawn'
28
+ require 'prawn/table'
29
+ require 'combine_pdf'
30
+
31
# Stop immediately unless the first command-line argument names an existing directory.
input_path = ARGV[0]
abort "No sequence data provided. `tcs_sdrm` pipeline aborted. " if input_path.nil? || !File.directory?(input_path)
34
+
35
# Condense one row of the substitution CSV into a short mutation summary.
#
# Field positions follow the substitution CSV header written by this script
# (region, TCS, AA position, wild type, mutation, number, percentage,
# 95% CI low, 95% CI high, notes).
#
# @param data [Array<String>] the comma-split fields of one row; indices 2, 3
#   and 4 must be strings, indices 6, 7 and 8 are converted with `to_f`
#   (a missing value therefore renders as 0.0)
# @return [String] "<wild type><position><mutation>:<pct>(<low>-<high>); "
#   where the three numbers are fractions scaled to percent and rounded to
#   two decimals
def abstract_line(data)
  mutation = data[3] + data[2] + data[4]
  percent, ci_low, ci_high = data.values_at(6, 7, 8).map { |value| (value.to_f * 100).round(2) }
  "#{mutation}:#{percent}(#{ci_low}-#{ci_high}); "
end
41
+
42
# Run parameters, collected into `log` and written out as JSON at the end of the run.
log = []

log << { time: Time.now }
log << { viral_seq_version: ViralSeq::VERSION }
log << { tcs_version: ViralSeq::TCS_VERSION }

# Only the first line of `R --version` is logged. Backticks raise
# Errno::ENOENT when the executable cannot be found; report that as a
# readable abort instead of a backtrace.
begin
  r_version = `R --version`.split("\n")[0]
rescue Errno::ENOENT
  abort "R is not installed or not on PATH. `tcs_sdrm` pipeline aborted. "
end
log << { R_version: r_version }

# SDRM definitions used for each drug class, logged for reproducibility.
sdrm_list = %i[nrti nnrti hiv_pr hiv_in].each_with_object({}) do |drug_class, list|
  list[drug_class] = ViralSeq::DRMs.sdrm_json(drug_class)
end
log << { sdrm_list: sdrm_list }

# Input dir. Trailing slashes are stripped so that "libs_dir/" yields the
# documented sibling output "libs_dir_SDRM" rather than "libs_dir/_SDRM"
# inside the input directory (a bare "/" is left untouched).
indir = ARGV[0].sub(%r{(?<=.)/+\z}, "")
libs = Dir[indir + "/*"]
log << { processed_libs: libs }

# Output dir, created next to the input dir.
outdir = indir + "_SDRM"
Dir.mkdir(outdir) unless File.directory?(outdir)
65
+
66
# Shared PR/IN filtering: remove sequences flagged by `a3g`, then those flagged
# by `stop_codon` (called with `stop_codon_args`, which may be empty).
# Returns [kept_sequences, a3g_count, stop_codon_count].
filter_a3g_and_stop_codon = lambda do |seqs, *stop_codon_args|
  a3g_check = seqs.a3g
  stop_codon_check = a3g_check[:filtered_seq].stop_codon(*stop_codon_args)
  [
    stop_codon_check[:without_stop_codon],
    a3g_check[:a3g_seq].size,
    stop_codon_check[:with_stop_codon].size
  ]
end

# Writes a header line followed by pre-formatted rows; the block form closes
# the file even if writing raises.
write_csv = lambda do |path, header, rows|
  File.open(path, "w") do |out|
    out.puts header
    rows.each { |row| out.puts row }
  end
end

# For every library directory under the input dir: filter the sequences of
# each region, call SDRMs, run the R diversity script, estimate recency and
# write the CSV / JSON / PDF reports into <outdir>/<lib_name>/.
libs.each do |lib|
  next unless File.directory?(lib)

  r_script = ViralSeq::R_SCRIPT.dup

  lib_name = File.basename(lib)
  out_lib_dir = File.join(outdir, lib_name)
  Dir.mkdir(out_lib_dir) unless File.directory?(out_lib_dir)

  sub_seq_files = Dir[lib + "/*"]

  seq_summary_file = File.join(out_lib_dir, lib_name + "_summary.csv")
  point_mutation_file = File.join(out_lib_dir, lib_name + "_substitution.csv")
  linkage_file = File.join(out_lib_dir, lib_name + "_linkage.csv")
  aa_report_file = File.join(out_lib_dir, lib_name + "_aa.csv")
  summary_json_file = File.join(out_lib_dir, lib_name + "_summary.json")

  filtered_seq_dir = File.join(out_lib_dir, lib_name + "_filtered_seq")
  Dir.mkdir(filtered_seq_dir) unless File.directory?(filtered_seq_dir)

  aln_seq_dir = File.join(out_lib_dir, lib_name + "_aln_seq")
  Dir.mkdir(aln_seq_dir) unless File.directory?(aln_seq_dir)

  point_mutation_list = []
  linkage_list = []
  aa_report_list = []
  summary_hash = {}

  sub_seq_files.each do |sub_seq|
    seq_basename = File.basename(sub_seq)
    seqs = ViralSeq::SeqHash.fa(sub_seq)
    next if seqs.size < 3

    # NOTE(review): the region is detected by case-insensitive substring match
    # in this fixed order, so a file name that merely contains "pr" or "in"
    # (e.g. inside the library name) is classified before RT is considered.
    case seq_basename
    when /V1V3/i
      # V1V3 is not filtered or screened for SDRMs; it is copied as-is.
      summary_hash[:V1V3] = "#{seqs.size},NA,NA,NA,NA"
      FileUtils.cp(sub_seq, filtered_seq_dir)
      next
    when /PR/i
      region = :PR
      sdrm_method = :sdrm_hiv_pr
      filtered_seqs, a3g_count, stop_codon_count = filter_a3g_and_stop_codon.call(seqs)
    when /IN/i
      region = :IN
      sdrm_method = :sdrm_hiv_in
      filtered_seqs, a3g_count, stop_codon_count = filter_a3g_and_stop_codon.call(seqs, 2)
    when /RT/i
      region = :RT
      sdrm_method = :sdrm_hiv_rt
      # Each RT sequence is split at offset 267; the two parts are screened
      # separately (stop_codon(1) and stop_codon(2)) and a sequence is
      # rejected when either part is flagged.
      rt_seq1 = {}
      rt_seq2 = {}
      seqs.dna_hash.each do |name, sequence|
        rt_seq1[name] = sequence[0, 267]
        rt_seq2[name] = sequence[267..-1]
      end
      rt1 = ViralSeq::SeqHash.new(rt_seq1)
      rt2 = ViralSeq::SeqHash.new(rt_seq2)
      hypermut_seq_rt1 = rt1.a3g[:a3g_seq]
      hypermut_seq_rt2 = rt2.a3g[:a3g_seq]
      rt1_stop_codon = rt1.stop_codon(1)[:with_stop_codon]
      rt2_stop_codon = rt2.stop_codon(2)[:with_stop_codon]
      hypermut_seq_keys = hypermut_seq_rt1.dna_hash.keys | hypermut_seq_rt2.dna_hash.keys
      stop_codon_seq_keys = rt1_stop_codon.dna_hash.keys | rt2_stop_codon.dna_hash.keys
      # Hash lookup instead of Array#include? keeps the rejection linear.
      rejected = (hypermut_seq_keys | stop_codon_seq_keys).each_with_object({}) { |key, seen| seen[key] = true }
      filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject { |name, _sequence| rejected.key?(name) })
      a3g_count = hypermut_seq_keys.size
      stop_codon_count = stop_codon_seq_keys.size
    else
      next
    end

    poisson_minority_cutoff = filtered_seqs.pm
    summary_hash[region] = [
      seqs.size,
      a3g_count,
      stop_codon_count,
      filtered_seqs.size,
      poisson_minority_cutoff
    ].join(',')
    next if filtered_seqs.size < 3

    filtered_seqs.write_nt_fa(File.join(filtered_seq_dir, seq_basename))

    # sdrm is indexed as [point mutations, linkages, amino-acid report].
    sdrm = filtered_seqs.public_send(sdrm_method, poisson_minority_cutoff)
    point_mutation_list += sdrm[0]
    linkage_list += sdrm[1]
    aa_report_list += sdrm[2]
  end

  point_mutation_rows = point_mutation_list.map { |record| record.join(",") }

  write_csv.call(
    point_mutation_file,
    "region,TCS,AA position,wild type,mutation," +
    "number,percentage,95% CI low, 95% CI high, notes",
    point_mutation_rows
  )
  write_csv.call(
    linkage_file,
    "region,TCS,mutation linkage,number," +
    "percentage,95% CI low, 95% CI high, notes",
    linkage_list.map { |record| record.join(",") }
  )
  write_csv.call(
    aa_report_file,
    "region,ref.aa.positions,TCS.number," +
    ViralSeq::AMINO_ACID_LIST.join(","),
    aa_report_list.map { |record| record.join(",") }
  )

  filtered_seq_files = Dir[filtered_seq_dir + "/*"]

  out_r_csv = File.join(out_lib_dir, lib_name + "_pi.csv")
  out_r_pdf = File.join(out_lib_dir, lib_name + "_pi.pdf")

  seq_summary_rows = []

  unless filtered_seq_files.empty?
    filtered_seq_files.each do |seq_file|
      filtered_sh = ViralSeq::SeqHash.fa(seq_file)
      next if filtered_sh.size < 3
      aligned_sh = filtered_sh.random_select(1000).align
      aligned_sh.write_nt_fa(File.join(aln_seq_dir, File.basename(seq_file)))
    end

    File.unlink(out_r_csv) if File.exist?(out_r_csv)
    File.unlink(out_r_pdf) if File.exist?(out_r_pdf)
    # Block form inserts the paths literally; the string form would treat
    # backslashes in a path as back-references.
    r_script.gsub!(/PATH_TO_FASTA/) { aln_seq_dir }
    r_script.gsub!(/OUTPUT_CSV/) { out_r_csv }
    r_script.gsub!(/OUTPUT_PDF/) { out_r_pdf }
    r_script_file = File.join(out_lib_dir, "pi.R")
    File.open(r_script_file, "w") { |out| out.puts r_script }
    # Argument-list form bypasses the shell, so paths containing spaces or
    # shell metacharacters are passed through intact. Output is discarded.
    system("Rscript", r_script_file, out: File::NULL, err: File::NULL)

    if File.exist?(out_r_csv)
      File.readlines(out_r_csv).each do |line|
        data = line.chomp.split(",")
        # The region tag is the last "_"-separated token of the first column.
        tag = data[0].to_s.split("_").last.to_s.gsub(/\W/, "").to_sym
        # Skip rows (header, blank line, stale file) with no summary entry
        # instead of failing on nil.
        next unless summary_hash.key?(tag)
        summary_hash[tag] += "," + data[1].to_f.round(4).to_s + "," + data[2].to_f.round(4).to_s
      end
      [:PR, :RT, :IN, :V1V3].each do |region|
        next unless summary_hash[region]
        seq_summary_rows << region.to_s + "," + summary_hash[region]
      end
      File.unlink(out_r_csv)
    end
    File.unlink(r_script_file)
  end

  write_csv.call(
    seq_summary_file,
    'Region,TCS,TCS with A3G/F hypermutation,TCS with stop codon,' +
    'TCS w/o hypermutation and stop codon,' +
    'Poisson cutoff for minority mutation (>=),Pi,Dist20',
    seq_summary_rows
  )

  # Pull the per-region numbers needed for recency back out of the summary rows.
  tcs_PR = 0
  tcs_RT = 0
  tcs_IN = 0
  tcs_V1V3 = 0
  pi_RT = 0.0
  pi_V1V3 = 0.0
  dist20_RT = 0.0
  dist20_V1V3 = 0.0
  seq_summary_rows.each do |row|
    data = row.split(",")
    case data[0]
    when "PR"
      tcs_PR = data[4].to_i
    when "RT"
      tcs_RT = data[4].to_i
      pi_RT = data[6].to_f
      dist20_RT = data[7].to_f
    when "IN"
      tcs_IN = data[4].to_i
    when "V1V3"
      # V1V3 rows carry "NA" in the filter columns, so the raw TCS count is used.
      tcs_V1V3 = data[1].to_i
      pi_V1V3 = data[6].to_f
      dist20_V1V3 = data[7].to_f
    end
  end

  recency = ViralSeq::Recency.define(
    tcs_RT: tcs_RT,
    tcs_V1V3: tcs_V1V3,
    pi_RT: pi_RT,
    dist20_RT: dist20_RT,
    pi_V1V3: pi_V1V3,
    dist20_V1V3: dist20_V1V3
  )

  # One-line SDRM summaries per region; rows whose last field is "*" are skipped.
  sdrm_PR = ""
  sdrm_RT = ""
  sdrm_IN = ""
  point_mutation_rows.each do |row|
    data = row.split(",")
    next if data[-1] == "*"
    if data[0] == "PR"
      sdrm_PR += abstract_line(data)
    elsif data[0] =~ /NRTI/
      sdrm_RT += abstract_line(data)
    elsif data[0] == "IN"
      sdrm_IN += abstract_line(data)
    end
  end

  # Kept as a one-element array wrapping the hash, as before. pi_V1V3 is now
  # included: it feeds the recency call above but was absent from the JSON.
  summary_json = [
    sample_id: lib_name,
    tcs_PR: tcs_PR,
    tcs_RT: tcs_RT,
    tcs_IN: tcs_IN,
    tcs_V1V3: tcs_V1V3,
    pi_RT: pi_RT,
    dist20_RT: dist20_RT,
    pi_V1V3: pi_V1V3,
    dist20_V1V3: dist20_V1V3,
    recency: recency,
    sdrm_PR: sdrm_PR,
    sdrm_RT: sdrm_RT,
    sdrm_IN: sdrm_IN
  ]

  File.open(summary_json_file, "w") { |out| out.puts JSON.pretty_generate(summary_json) }

  # One intermediate PDF per CSV report; they are merged and removed below.
  csvs = [
    {
      name: "summary",
      title: "Summary",
      file: seq_summary_file,
      newPDF: "",
      table_width: [65,55,110,110,110,110,60,60],
      extra_text: ""
    },
    {
      name: "substitution",
      title: "Surveillance Drug Resistance Mutations",
      file: point_mutation_file,
      newPDF: "",
      table_width: [65,55,85,80,60,65,85,85,85,45],
      extra_text: "* Mutation below Poisson cut-off for minority mutations"
    },
    {
      name: "linkage",
      title: "Mutation Linkage",
      file: linkage_file,
      newPDF: "",
      table_width: [55,50,250,60,80,80,80,45],
      extra_text: "* Mutation below Poisson cut-off for minority mutations"
    }
  ]

  csvs.each do |report|
    next unless File.exist?(report[:file])
    file_name = File.join(out_lib_dir, report[:name] + ".pdf")
    Prawn::Document.generate(file_name, :page_layout => :landscape) do |pdf|
      pdf.text(File.basename(lib, ".*") + ': ' + report[:title],
               :size => 20,
               :align => :center,
               :style => :bold)
      pdf.move_down 20
      # CSV.read closes the file; CSV.open(...).to_a left the handle open.
      table_data = CSV.read(report[:file])
      header = table_data.first
      pdf.table(table_data,
                :header => header,
                :position => :center,
                :column_widths => report[:table_width],
                :row_colors => ["B6B6B6", "FFFFFF"],
                :cell_style => {:align => :center, :size => 10}) do |table|
        table.row(0).style :font_style => :bold, :size => 12
      end
      pdf.move_down 5
      pdf.text(report[:extra_text], :size => 8, :align => :justify)
    end
    report[:newPDF] = file_name
  end

  combined_pdf = CombinePDF.new
  csvs.each do |report|
    combined_pdf << CombinePDF.load(report[:newPDF]) if File.exist?(report[:newPDF])
  end
  combined_pdf << CombinePDF.load(out_r_pdf) if File.exist?(out_r_pdf)

  # $sdrm_version_number is never assigned in this script, which left the
  # footer version blank; fall back to the gem version when it is unset.
  pipeline_version = $sdrm_version_number || ViralSeq::VERSION
  combined_pdf.number_pages location: [:bottom_right],
    number_format: "Swanstrom's lab HIV SDRM Pipeline, version #{pipeline_version} by S.Z. and M.U.C. Page %s",
    font_size: 6,
    opacity: 0.5

  combined_pdf.save File.join(out_lib_dir, lib_name + ".pdf")

  # Remove only the intermediate PDFs that were actually produced.
  csvs.each do |report|
    File.unlink(report[:newPDF]) if File.exist?(report[:newPDF])
  end
end
397
+
398
# Save the collected run log beside the input directory, then drop a ".done"
# marker into the output directory.
log_path = "#{indir}_sdrm_log.json"
File.open(log_path, 'w') do |io|
  io.puts(JSON.pretty_generate(log))
end

done_marker = File.join(outdir, ".done")
FileUtils.touch(done_marker)
Binary file
data/docs/dr.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "platform_error_rate": 0.02,
3
+ "primer_pairs": [
4
+ {
5
+ "region": "RT",
6
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
7
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
8
+ "majority": 0.5,
9
+ "end_join": true,
10
+ "end_join_option": 1,
11
+ "overlap": 0,
12
+ "TCS_QC": true,
13
+ "ref_genome": "HXB2",
14
+ "ref_start": 2648,
15
+ "ref_end": 3257,
16
+ "indel": true,
17
+ "trim": false
18
+ },
19
+ {
20
+ "region": "PR",
21
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
22
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
23
+ "majority": 0.5,
24
+ "end_join": true,
25
+ "end_join_option": 3,
26
+ "TCS_QC": true,
27
+ "ref_genome": "HXB2",
28
+ "ref_start": 0,
29
+ "ref_end": 2591,
30
+ "indel": true,
31
+ "trim": true,
32
+ "trim_ref": "HXB2",
33
+ "trim_ref_start": 2253,
34
+ "trim_ref_end": 2549
35
+ },
36
+ {
37
+ "region": "IN",
38
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
39
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
40
+ "majority": 0.5,
41
+ "end_join": true,
42
+ "end_join_option": 3,
43
+ "overlap": 171,
44
+ "TCS_QC": true,
45
+ "ref_genome": "HXB2",
46
+ "ref_start": 4384,
47
+ "ref_end": 4751,
48
+ "indel": false,
49
+ "trim": false
50
+ },
51
+ {
52
+ "region": "V1V3",
53
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
54
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
55
+ "majority": 0.5,
56
+ "end_join": true,
57
+ "end_join_option": 1,
58
+ "overlap": 0,
59
+ "TCS_QC": true,
60
+ "ref_genome": "HXB2",
61
+ "ref_start": 6585,
62
+ "ref_end": 7208,
63
+ "indel": true,
64
+ "trim": false
65
+ }
66
+ ]
67
+ }