bio-gadget 0.4.8 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,605 @@
1
+ require 'csv'
2
+ require 'fileutils'
3
+ require 'open3'
4
+ require 'parallel'
5
+ require 'thread'
6
+
7
+ require 'bio/gadget/strt/prepare_transcriptome.rb'
8
+
9
+ module Bio
10
+ class Gadget
11
+ class Strt < Bio::Gadget
12
+
13
# Shared option: name of the genome assembly (defaults to hg38).
OPT_GENOME = [
  :genome,
  { default: 'hg38', desc: 'Genome assembly' }
]

# Shared option: UMI length in nucleotides (defaults to 6).
OPT_UMI_LENGTH = [
  :umi_length,
  { banner: 'NT', default: 6, desc: 'Length of UMI', type: :numeric }
]
20
+
21
+ # strt:alignment
22
+
23
+ desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
24
+ long_desc <<-DESC
25
+ Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
26
+ DESC
27
+
28
+ method_option *OPT_BUFFER_SIZE
29
+ method_option *OPT_COREUTILS_PREFIX
30
+ method_option *OPT_GREP_PREFIX
31
+ method_option *OPT_PARALLEL
32
+
33
# Align every SEQDIR/*.fq.gz against the HISAT2 index REFDIR/ref, then
# derive per-base counts from every BAM found in MAPDIR.
#
# refdir - directory holding the HISAT2 index with basename "ref"
# seqdir - directory scanned for *.fq.gz read files
# mapdir - directory receiving <base>.bam, its index and <base>.bed.gz
def alignment(refdir, seqdir, mapdir)
  fastqs = Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz")
  fastqs.each do |fastq|
    sample = File.basename(fastq, '.fq.gz')
    STDERR.puts "#{`date`.strip}: Align #{sample}..."
    sorted_bam = "#{mapdir}/#{sample}.bam"
    stages = [
      "hisat2 --no-unal --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fastq}",
      # Drop SAM records whose NH tag is 2 or more.
      "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
      "samtools sort -@ #{options.parallel} -o #{sorted_bam}"
    ]
    pipeline(*stages)
    sh "samtools index #{sorted_bam}"
  end

  STDERR.puts "#{`date`.strip}: Count from all alignments."
  aligned = Dir.glob("#{File.expand_path(mapdir)}/*.bam")
  Parallel.map(aligned, in_threads: options.parallel) do |bam_path|
    counter = "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option}#{parallel_option(options)} #{bam_path}"
    writer = "pigz -c > #{mapdir}/#{File.basename(bam_path, '.bam')}.bed.gz"
    pipeline(counter, writer)
  end
end
55
+
56
+ # strt:build_index
57
+
58
+ desc 'build_index DIR', 'Build index for alignment'
59
+ long_desc <<-DESC
60
+ Build index for alignment of STRT reads, from the specified GENOME, TRANSCRIPTOME and VARIATION, at DIR.
61
+ DESC
62
+
63
+ method_option *OPT_COREUTILS_PREFIX
64
+ method_option *OPT_GENOME
65
+ method_option *OPT_GREP_PREFIX
66
+ method_option *OPT_PARALLEL
67
+
68
# Build the HISAT2 index (DIR/ref.*) from genome, spike-in, ribosome,
# transcriptome and variation data prepared by the `strt prepare_*`
# sub-commands.
#
# dir0 - output directory; created when missing.
#
# Every external command is checked: on failure the process exits with
# the status of the failing command instead of carrying on with
# incomplete inputs.
def build_index(dir0)
  dir = File.expand_path(dir0)
  FileUtils.mkdir_p(dir)

  STDERR.puts "#{`date`.strip}: Preparing data files..."

  # First pass: fetch everything; genome, spike-in and ribosome are also
  # converted, while variation and transcriptome are download-only here.
  fetch_cmds = [
    "strt prepare_genome#{coreutils_prefix_option}#{genome_option(options)} #{dir}",
    "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=only #{dir}",
    "strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=only #{dir}",
    "strt prepare_spikein#{coreutils_prefix_option} #{dir}",
    "strt prepare_ribosome#{coreutils_prefix_option}#{genome_option(options)} #{dir}"
  ]
  Parallel.map(fetch_cmds, in_threads: options.parallel) do |cmd|
    system(cmd) or exit $?.exitstatus
  end

  # Concatenate the three FASTA sources into the reference and index it.
  system("unpigz -c #{dir}/genome.fa.gz #{dir}/spikein.fa.gz #{dir}/ribosome.fa.gz > #{dir}/ref.fa") or exit $?.exitstatus
  system("samtools faidx #{dir}/ref.fa") or exit $?.exitstatus

  # Second pass: process the already-downloaded transcriptome/variation
  # files, which is deferred until ref.fa and genome.fa.gz exist.
  convert_cmds = [
    "strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=no #{dir}",
    "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=no #{dir}"
  ]
  Parallel.map(convert_cmds, in_threads: options.parallel) do |cmd|
    STDERR.puts cmd
    system(cmd) or exit $?.exitstatus
  end

  STDERR.puts "#{`date`.strip}: Building index..."

  system("hisat2-build -f -p #{options.parallel} --snp #{dir}/variation.snp --haplotype #{dir}/variation.haplotype --ss #{dir}/transcriptome.splice_sites --exon #{dir}/transcriptome.exons #{dir}/ref.fa #{dir}/ref") or exit $?.exitstatus
end
98
+
99
+ # strt:call_allele
100
+
101
+ desc 'call_allele CSV REFDIR MAPDIR', 'Call allele frequency'
102
+ long_desc <<-DESC
103
+ Call allele frequencies of multiple samples specified in a design CSV, based on alignment files at MAPDIR, and reference sequence 'ref.fa' and the index at REFDIR.
104
+ DESC
105
+
106
+ method_option *OPT_GENOME
107
+
108
# Call variants jointly over all samples of a design CSV and annotate
# them with ANNOVAR.
#
# csv    - design table; its "base" column names the samples
# refdir - directory containing ref.fa (also passed to table_annovar.pl)
# mapdir - directory containing <base>.bam for every sample
#
# Results are written next to the CSV as strt-call_allele.bcf and
# strt-call_allele.* (ANNOVAR output prefix).
def call_allele(csv, refdir, mapdir)
  design = CSV.table(csv)

  # samtools reads the list of BAM files from a file (-b); the block form
  # guarantees the handle is flushed and closed even if a write fails.
  bams = get_temporary_path('strt.call_allele', 'bams')
  File.open(bams, 'w') do |fp|
    design[:base].each { |bam| fp.puts "#{mapdir}/#{bam}.bam" }
  end

  csvdir = File.dirname(csv)
  bcf = "#{csvdir}/strt-call_allele.bcf"

  pipeline("samtools mpileup -u -t AD,ADF,ADR,DP -f #{refdir}/ref.fa -b #{bams}",
           "bcftools call --multiallelic-caller --variants-only --output-type u",
           "bcftools filter -s LowQual -e '%QUAL<20 || MIN(FORMAT/DP)<20' --output-type b > #{bcf}")
  pipeline("bcftools view #{bcf}",
           "table_annovar.pl - #{refdir} --buildver #{options.genome} --outfile #{csvdir}/strt-call_allele --remove --protocol refGene --operation g --nastring . --vcfinput")
end
125
+
126
+ # strt:count_per_base
127
+
128
+ desc 'count_per_base BAM', 'Count reads per base'
129
+ long_desc <<-DESC
130
+ Count reads per base, based on an alignment BAM.
131
+ DESC
132
+
133
+ method_option *OPT_BUFFER_SIZE
134
+ method_option *OPT_COREUTILS_PREFIX
135
+ method_option *OPT_PARALLEL
136
+
137
# Emit, for one BAM, a tab-separated table with one row per distinct
# (chromosome, 5'-end position, strand) and the number of alignments
# starting there.
#
# bam - path of the alignment file; its basename (without ".bam") is
#       written into the fourth output column.
def count_per_base(bam)
  stages = []
  stages << "bedtools bamtobed -i #{bam}"
  # Shrink each interval to its 5'-most base: [start, start+1) on the "+"
  # strand, [end-1, end) otherwise; keep chromosome and strand.
  stages << "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'"
  stages << "#{sort_command} -t '\t' -k 1,1 -k 2,2n"
  stages << "#{uniq_command(options)} -c"
  # After `uniq -c` field 0 is the count; reorder to
  # chromosome, start, end, sample name, count, strand.
  stages << "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'"
  pipeline(*stages)
end
147
+
148
+ # strt:count_per_region
149
+
150
+ desc 'count_per_region COUNT REG [REG ...]',
151
+ 'Count reads per region'
152
+ long_desc <<-DESC
153
+ Count reads per region for a sample. Read counts (a BED-format COUNT) within regions (BED-format REGions) are summed by the region names.
154
+ DESC
155
+
156
+ method_option *OPT_COREUTILS_PREFIX
157
+
158
# Sum the per-base counts of one sample over named regions and print
# the totals as CSV with the header "ID,COUNT".
#
# count    - BED-format per-base count file (passed to bedtools as -a)
# region0  - first BED-format region file
# regions0 - any further region files
def count_per_region(count, region0, *regions0)
  region_files = ([region0] + regions0).join(' ')
  intersect = "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{region_files}"
  # Keep column 5 and column 11 of the joined rows (read as count and
  # region name by the next stage).
  pick_columns = "#{cut_command} -f 5,11"
  sum_by_name = "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"ID,COUNT\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'"
  pipeline(intersect, pick_columns, sum_by_name)
end
166
+
167
+ # strt:prepare_genome
168
+
169
+ desc 'prepare_genome DIR', 'Prepare genome data'
170
+ long_desc <<-DESC
171
+ Prepare data files of the specified GENOME at DIR.
172
+ DESC
173
+
174
+ method_option *OPT_COREUTILS_PREFIX
175
+ method_option *OPT_DOWNLOAD
176
+ method_option *OPT_GENOME
177
+
178
# Download the UCSC chromFa tarball of the selected genome into DIR and
# turn it into DIR/genome.fa.gz.
#
# dir0 - target directory.
#
# options.download == 'no' skips the download, 'only' skips the
# conversion.
def prepare_genome(dir0)
  dir = File.expand_path(dir0)
  tgz = "#{dir}/#{options.genome}.chromFa.tar.gz"
  ucsc = "rsync://hgdownload.cse.ucsc.edu/goldenPath/#{options.genome}/bigZips"

  unless options.download == 'no'
    # Only the hg38 tarball is requested with the assembly name as prefix.
    remote =
      if options.genome == 'hg38'
        "#{ucsc}/#{options.genome}.chromFa.tar.gz"
      else
        "#{ucsc}/chromFa.tar.gz"
      end
    rsync_file(remote, tgz)
  end

  return if options.download == 'only'

  pipeline(
    "unpigz -c #{tgz}",
    # Archive members with "_" in their name are excluded.
    "#{options.coreutils_prefix}tar -xOf - --exclude \"*_*\"",
    # Join each record's sequence into one line, then re-wrap at 50 columns.
    "gawk 'BEGIN{p=\"\"} /^>/{ print p $1 } !/^>/{ printf $1; p=\"\\n\" } END{ print }'",
    "#{fold_command(options)} -w 50",
    "pigz -c > #{dir}/genome.fa.gz"
  )
end
199
+
200
+ # strt:prepare_reads
201
+
202
+ desc 'prepare_reads BASE MAP FQGZ ...', 'Prepare STRT reads'
203
+ long_desc <<-DESC
204
+ Prepare STRT reads from raw sequence files before alignment. After demultiplexing, it performs
205
+ (i) exclusion of redundant reads,
206
+ (ii) exclusion of noncanonical reads, which does not begin with template switching primer,
207
+ (iii) trimming from low-quality base,
208
+ (iv) trimming from sequence similar to HiSeq universal primer,
209
+ and (v) trimming of the template switching primer.
210
+
211
+ Mandatory parameters are (1) BASE; basename for demultiplexed and gzipped fastq files, (2) MAP; filename of comma-separated table between barcode and well, and (3) FQGZs; comma-separated filenames of raw sequences; each file is gzipped fastq. When MAP contains 'CAAAGT,A2' and BASE is '~/test', reads having CAAAGT-like barcode are in '~/test.A2.fq.gz' file after the preprocesses.
212
+ DESC
213
+
214
+ method_option *OPT_COREUTILS_PREFIX
215
+ method_option *OPT_GREP_PREFIX
216
+ method_option *OPT_PARALLEL
217
+ method_option *OPT_UMI_LENGTH
218
+
219
+ method_option :maximum_memory,
220
+ default: 50,
221
+ desc: 'Maximum memory usage in percent rate',
222
+ type: :numeric
223
+
224
+ method_option :minimum_length,
225
+ banner: 'NT',
226
+ default: 24,
227
+ desc: 'Minimum length after the preprocess',
228
+ type: :numeric
229
+
230
+ method_option :reads,
231
+ desc: 'Number of raw reads for the preprocess',
232
+ type: :numeric
233
+
234
+ # method_option :maximum_distance,
235
+ # default: 1,
236
+ # desc: 'Maximum distance between barcode and sequence',
237
+ # type: :numeric
238
+
239
# Demultiplex raw STRT reads and run the per-well clean-up chain.
#
# base   - output prefix; results are <base>.<well>.fq.gz plus
#          <base>.count.step1/2.csv and <base>.<well>.count.step3..8.csv
# map    - "barcode,well" table, one pair per line
# fqgz0, fqgzs0 - one or more gzipped FASTQ files with raw reads
#
# Raises RuntimeError when MAP holds no barcode or when a demultiplexing
# stage fails; exits with the command's status when `fq1l sum_counts`
# fails.
def prepare_reads(base, map, fqgz0, *fqgzs0)
  fqgzs = [fqgz0] + fqgzs0

  # File.foreach closes the map file for us; blank lines are ignored.
  bcs = {}
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    next if bc.nil? || bc.empty?
    bcs[bc] = well
  end
  raise "No barcode found in #{map}" if bcs.empty?

  # Shortest barcode length decides how many cycles are read as barcode.
  bcl = bcs.keys.map(&:length).min

  # UMI, then barcode, then the "GG" that must follow them.
  tso_pattern = '.'*options.umi_length + '.'*bcl + 'GG'

  #

  STDERR.puts "#{`date`.strip}: Demultiplexing each raw sequence files..."

  fqgz2csv0 = {}
  fqgz2csv1 = {}
  fqgz2base = {}
  fqgzs.each do |fqgz|
    fqgz2csv0[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2csv1[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2base[fqgz] = get_temporary_path('strt.preprocess', 'base', false)
  end

  Parallel.map(fqgz2csv0.keys, in_processes: options.parallel) do |fqgz|
    cmds = [
      "unpigz -c #{fqgz}",
      "#{fq1l_convert_command(options)}",
      "#{fq1l_count_command(options)} #{fqgz2csv0[fqgz]}",
      "fq1l match_5end#{grep_prefix_option(options)} #{tso_pattern}",
      "#{fq1l_count_command(options)} #{fqgz2csv1[fqgz]}",
      "fq1l annotate_index --first-cycle=#{options.umi_length+1} --last-cycle=#{options.umi_length+bcl}",
      "fq1l annotate_umi --first-cycle=1 --last-cycle=#{options.umi_length}",
      "fq1l sort_index#{coreutils_prefix_option}#{parallel_option(options)} --buffer-size=#{(options.maximum_memory/(fqgz2csv0.keys.size+1)).to_i}%",
      "fq1l demultiplex #{fqgz2base[fqgz]} #{map}"
    ]
    # Optional read limit, applied right after the conversion stage.
    cmds.insert(2, "#{head_command(options)} -n #{options.reads}") unless options.reads.nil?
    stats = Open3.pipeline(*cmds)
    stats.each_index do |i|
      # Signal 13 (SIGPIPE) is tolerated: it is what upstream stages get
      # when a downstream stage stops reading early.
      raise "Fail at process #{i}; #{stats[i]}; #{cmds[i]}" unless stats[i].success? || (stats[i].signaled? && stats[i].termsig == 13)
    end
  end

  system("fq1l sum_counts #{fqgz2csv0.values.join(' ')} > #{base}.count.step1.csv") or exit $?.exitstatus
  unlink_files(fqgz2csv0.values)

  system("fq1l sum_counts #{fqgz2csv1.values.join(' ')} > #{base}.count.step2.csv") or exit $?.exitstatus
  unlink_files(fqgz2csv1.values)

  #

  (bcs.values + ['NA']).each do |well|

    STDERR.puts "#{`date`.strip}: Finishing well #{well}..."

    tmpfqgzs = fqgz2base.values.map { |tmpbase| "#{tmpbase}.#{well}.fq.gz" }
    csvs = Array.new(6) { |i| "#{base}.#{well}.count.step#{i+3}.csv" }

    pipeline("unpigz -c #{tmpfqgzs.join(' ')}",
             "#{fq1l_convert_command(options)}",
             "#{fq1l_count_command(options)} #{csvs[0]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_duplicate",
             "#{fq1l_count_command(options)} #{csvs[1]}",
             "fq1l trim_3end_quality",
             "#{fq1l_count_command(options)} #{csvs[2]}",
             "fq1l trim_3end_primer#{coreutils_prefix_option}#{grep_prefix_option(options)}#{parallel_option(options)}",
             "#{fq1l_count_command(options)} #{csvs[3]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_degenerate",
             "#{fq1l_count_command(options)} #{csvs[4]}",
             "fq1l trim_5end --minimum-length=#{options.minimum_length} #{tso_pattern}+",
             "#{fq1l_count_command(options)} #{csvs[5]}",
             "fq1l restore#{coreutils_prefix_option}",
             "pigz -c > #{base}.#{well}.fq.gz")

    unlink_files(tmpfqgzs)

  end
end
323
+
324
+ # strt:prepare_ribosome
325
+
326
+ desc 'prepare_ribosome DIR', 'Prepare ribosome data'
327
+ long_desc <<-DESC
328
+ Prepare ribosome data files for the specified GENOME at DIR.
329
+ DESC
330
+
331
+ method_option *OPT_COREUTILS_PREFIX
332
+ method_option *OPT_DOWNLOAD
333
+ method_option *OPT_GENOME
334
+
335
# Produce DIR/ribosome.fa.gz for the selected genome.
#
# dir0 - target directory.
#
# For assemblies whose name starts with "hg" the record is fetched from
# NCBI E-utilities and stored under the name RIBO_U13369.1; for any other
# assembly the output is the gzipped result of a bare `echo`.
def prepare_ribosome(dir0)
  dir = File.expand_path(dir0)

  unless options.genome[0..1] == 'hg'
    return pipeline("echo", "pigz -c > #{dir}/ribosome.fa.gz")
  end

  unless options.download == 'no'
    download_file("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=555853&strand=1&rettype=fasta&retmode=text", "#{dir}/U13369.fa")
    system "pigz #{dir}/U13369.fa" or exit $?.exitstatus
  end

  return if options.download == 'only'

  pipeline(
    "unpigz -c #{dir}/U13369.fa.gz",
    # Replace the header and join the sequence into one line before folding.
    "gawk '/^>/{ print \">RIBO_U13369.1\" } !/^>/{ printf $1 } END{ print }'",
    "#{fold_command(options)} -w 50",
    "pigz -c > #{dir}/ribosome.fa.gz"
  )
end
354
+
355
+ # strt:prepare_spikein
356
+
357
+ desc 'prepare_spikein DIR', 'Prepare spikein data'
358
+ long_desc <<-DESC
359
+ Prepare spikein data files at DIR.
360
+ DESC
361
+
362
+ method_option *OPT_COREUTILS_PREFIX
363
+ method_option *OPT_DOWNLOAD
364
+
365
# Download the ERCC92 spike-in archive into DIR and convert its FASTA
# into DIR/spikein.fa.gz.
#
# dir0 - target directory.
#
# options.download == 'no' skips the download, 'only' skips the
# conversion.
def prepare_spikein(dir0)
  dir = File.expand_path(dir0)
  zip = "#{dir}/ERCC92.zip"

  unless options.download == 'no'
    download_file("https://tools.thermofisher.com/content/sfs/manuals/ERCC92.zip", zip)
  end

  return if options.download == 'only'

  extract = "unzip -cq #{zip} ERCC92.fa"
  # Prefix record names with RNA_SPIKE_, prepend "AATTC" to every sequence
  # (plus "GAGCTC" for ERCC-00130) and join each sequence into one line.
  rename = "gawk 'BEGIN{p=\"\"} /^>/{ print p \">RNA_SPIKE_\" substr($1, 2); printf(\"AATTC\" ($1 == \">ERCC-00130\" ? \"GAGCTC\" : \"\") ) } /^[ACGT]/{ printf $1; p=\"\\n\" } END{ print }'"
  wrap = "#{fold_command(options)} -w 50"
  compress = "pigz -c > #{dir}/spikein.fa.gz"
  pipeline(extract, rename, wrap, compress)
end
377
+
378
+ # strt:prepare_transcriptome
379
+
380
+ register(Bio::Gadget::StrtPrepareTranscriptome,
381
+ 'prepare_transcriptome',
382
+ 'prepare_transcriptome GENOME',
383
+ 'Prepare transcriptome data')
384
+
385
+ # strt:prepare_variation
386
+
387
+ desc 'prepare_variation DIR', 'Prepare variation data'
388
+ long_desc <<-DESC
389
+ Prepare genome variation data files for the specified GENOME dir based on common variations in dbSNP BUILD, at DIR.
390
+ DESC
391
+
392
+ method_option *OPT_COREUTILS_PREFIX
393
+ method_option *OPT_DOWNLOAD
394
+ method_option *OPT_GENOME
395
+
396
+ method_option :dbsnp,
397
+ banner: 'BUILD',
398
+ default: 146,
399
+ desc: 'Build number of dbSNP',
400
+ type: :numeric
401
+
402
# Download the UCSC snp<BUILD>Common table of the selected genome into
# DIR and extract DIR/variation.* from it with the HISAT2 helper script.
#
# dir0 - target directory; the extraction step reads genome.fa.gz from it.
#
# options.download == 'no' skips the download, 'only' skips the
# extraction.
def prepare_variation(dir0)
  dir = File.expand_path(dir0)
  snp = "#{dir}/#{options.genome}.snp#{options.dbsnp}Common.txt.gz"

  unless options.download == 'no'
    rsync_file("rsync://hgdownload.soe.ucsc.edu/goldenPath/#{options.genome}/database/snp#{options.dbsnp}Common.txt.gz", snp)
  end

  return if options.download == 'only'

  decompress = "unpigz -c #{dir}/genome.fa.gz"
  extract = "hisat2_extract_snps_haplotypes_UCSC.py - #{snp} #{dir}/variation"
  pipeline(decompress, extract)
end
413
+
414
+ # strt:qualify
415
+
416
+ desc 'qualify CSV REFDIR SEQDIR MAPDIR', 'Qualify samples'
417
+ long_desc <<-DESC
418
+ Qualify samples in a design CSV.
419
+ DESC
420
+
421
+ method_option *OPT_BUFFER_SIZE
422
+ method_option *OPT_COREUTILS_PREFIX
423
+ method_option *OPT_PARALLEL
424
+
425
# Print the design CSV extended with per-sample quality metrics.
#
# csv    - design table with a "BASE" column naming the samples
# refdir - directory with ribosome.bed.gz, spikein_whole.bed.gz,
#          spikein_5end.bed.gz and transcriptome.coding_{whole,5end}.bed.gz
# seqdir - directory with <base>.count.step8.csv
# mapdir - directory with <base>.bed.gz
#
# Columns are appended in this order: TOTAL_READS, MAPPED_READS,
# MAPPED_RATE, RIBOSOME_READS, SPIKEIN_READS, SPIKEIN_5END_READS,
# SPIKEIN_5END_RATE, RELATIVE_POLYA_RNAS, CODING_READS,
# CODING_5END_READS, CODING_5END_RATE, RELATIVE_MRNAS.
def qualify(csv, refdir, seqdir, mapdir)
  # Sums column 5 of whatever BED rows reach it.
  count_commands = ["#{cut_command} -f 5",
                    "ruby -e 'n=0; while gets; n+=$_.to_i; end; puts n'"]

  # Options are passed as keywords: a positional Hash is rejected by
  # CSV.read on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)
  bases = samples["BASE"]

  # Total count of rows of <base>.bed.gz overlapping REFDIR/<bed>,
  # one value per sample.
  overlap_counts = lambda do |bed|
    Parallel.map(bases, in_threads: options.parallel) do |base|
      pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/#{bed}",
                        *count_commands).to_i
    end
  end

  # Per-sample floating-point ratio of two already-computed columns.
  ratio = lambda do |numerator, denominator|
    samples.map { |row| row[numerator].to_f / row[denominator] }
  end

  samples["TOTAL_READS"] =
    Parallel.map(bases, in_threads: options.parallel) do |base|
      CSV.table("#{seqdir}/#{base}.count.step8.csv")[:reads].sum
    end

  samples["MAPPED_READS"] =
    Parallel.map(bases, in_threads: options.parallel) do |base|
      pipeline_readline("unpigz -c #{mapdir}/#{base}.bed.gz",
                        *count_commands).to_i
    end

  samples["MAPPED_RATE"] = ratio.call("MAPPED_READS", "TOTAL_READS")

  samples["RIBOSOME_READS"] = overlap_counts.call("ribosome.bed.gz")
  samples["SPIKEIN_READS"] = overlap_counts.call("spikein_whole.bed.gz")
  samples["SPIKEIN_5END_READS"] = overlap_counts.call("spikein_5end.bed.gz")

  samples["SPIKEIN_5END_RATE"] = ratio.call("SPIKEIN_5END_READS", "SPIKEIN_READS")

  # Mapped reads that are neither ribosomal nor spike-in, relative to
  # the spike-in 5'-end reads.
  samples["RELATIVE_POLYA_RNAS"] = samples.map do |row|
    (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
  end

  samples["CODING_READS"] = overlap_counts.call("transcriptome.coding_whole.bed.gz")
  samples["CODING_5END_READS"] = overlap_counts.call("transcriptome.coding_5end.bed.gz")

  samples["CODING_5END_RATE"] = ratio.call("CODING_5END_READS", "CODING_READS")
  samples["RELATIVE_MRNAS"] = ratio.call("CODING_5END_READS", "SPIKEIN_5END_READS")

  puts samples
end
513
+
514
+ # strt:quantify
515
+
516
+ desc 'quantify CSV REFDIR MAPDIR REGBASE',
517
+ 'Quantify samples'
518
+ long_desc <<-DESC
519
+ Count reads per region for multiple samples in CSV. Read counts (a BED-format COUNT) within regions (REFDIR/ribosome.bed.gz, REFDIR/spikein_5end.bed.gz & REGBASE.bed.gz) are summed by the region names. Additional columns in REGBASE.csv are attached as annotations.
520
+ DESC
521
+
522
+ method_option *OPT_COREUTILS_PREFIX
523
+ method_option *OPT_PARALLEL
524
+
525
# Print a CSV matrix of per-region read counts for all samples.
#
# csv     - design table with a "BASE" column naming the samples
# refdir  - directory with ribosome.bed.gz and spikein_5end.bed.gz
# mapdir  - directory with <base>.bed.gz per-base counts
# regbase - prefix of the region files REGBASE.bed.gz and REGBASE.csv;
#           every REGBASE.csv column except "ID" is copied as annotation
#
# Output columns: ID, the annotation columns, one "N|<base>" column per
# sample (count divided by that sample's summed RNA_SPIKE_* counts,
# times 1000) and one "R|<base>" column per sample (count as reported).
def quantify(csv, refdir, mapdir, regbase)
  # Options are passed as keywords: a positional Hash is rejected by
  # CSV.read on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)

  regions = CSV.read("#{regbase}.csv", headers: true, converters: :numeric)
  anns = regions.headers - ['ID']
  name2anns = {}
  regions.each { |row| name2anns[row['ID']] = row.values_at(*anns) }

  bases = samples["BASE"]
  header = ['ID'] + anns +
           bases.map { |base| "N|#{base}" } +
           bases.map { |base| "R|#{base}" }
  puts header.join(',')

  name2base2cnt = Hash.new { |hash, name| hash[name] = {} }
  base2spike = {}
  bases.each { |base| base2spike[base] = 0.0 }
  @locker = Mutex.new
  Parallel.map(bases, in_threads: options.parallel) do |base|
    cmd = "strt count_per_region#{coreutils_prefix_option} #{mapdir}/#{base}.bed.gz #{refdir}/ribosome.bed.gz #{refdir}/spikein_5end.bed.gz #{regbase}.bed.gz"
    # Block form closes the pipe even when parsing raises.
    IO.popen(cmd) do |fp|
      fp.gets # skip the "ID,COUNT" header line
      fp.each do |line|
        name, cnt = line.strip.split(',')
        # Both hashes are shared between the worker threads.
        @locker.synchronize do
          name2base2cnt[name][base] = cnt
          base2spike[base] += cnt.to_f if name =~ /^RNA_SPIKE_/
        end
      end
    end
  end

  name2base2cnt.each do |name, base2cnt|
    annotations = name2anns.key?(name) ? name2anns[name] : Array.new(anns.length, 'NA')
    normalized = bases.map do |base|
      base2cnt.key?(base) ? base2cnt[base].to_f/base2spike[base]*1000 : 0
    end
    raw = bases.map { |base| base2cnt.key?(base) ? base2cnt[base] : 0 }
    puts(([name] + annotations + normalized + raw).join(','))
  end
end
573
+
574
+ #
575
+
576
+ no_commands do
577
+
578
# Render the --download flag (with a leading space) for a sub-command
# line, taking the value from options.download.
def download_option(options)
  mode = options.download
  " --download=#{mode}"
end
581
+
582
# Render the --genome flag (with a leading space) for a sub-command
# line, taking the value from options.genome.
def genome_option(options)
  assembly = options.genome
  " --genome=#{assembly}"
end
585
+
586
# Run the commands as a shell-style pipeline and return the first line
# of its output with surrounding whitespace removed.
#
# cmds - command strings, connected stdout-to-stdin in the given order.
#
# Returns "" when the pipeline prints nothing (so callers' `.to_i`
# yields 0 instead of a NoMethodError on nil).
def pipeline_readline(*cmds)
  fp, ths = Open3.pipeline_r(*cmds)
  begin
    fp.gets.to_s.strip
  ensure
    # Close our end first, then wait for the final stage to finish.
    fp.close
    ths[-1].join
  end
end
593
+
594
# Copy REMOTE to LOCAL with `rsync -a`; on failure the process exits
# with rsync's status.
#
# remote - rsync source (URL or path)
# local  - destination path
#
# Arguments are handed to rsync directly instead of through a shell, so
# spaces or shell metacharacters in either path are passed verbatim.
def rsync_file(remote, local)
  system('rsync', '-a', remote.to_s, local.to_s) or exit $?.exitstatus
end
597
+
598
+ end
599
+
600
+ end
601
+ end
602
+ end
603
+
604
+ require 'bio/gadget/strt/count.rb'
605
+ require 'bio/gadget/strt/depth.rb'