bio-gadget 0.4.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,605 @@
1
+ require 'csv'
2
+ require 'fileutils'
3
+ require 'open3'
4
+ require 'parallel'
5
+ require 'thread'
6
+
7
+ require 'bio/gadget/strt/prepare_transcriptome.rb'
8
+
9
+ module Bio
10
+ class Gadget
11
+ class Strt < Bio::Gadget
12
+
13
# Shared `--genome` option (name + Thor settings), splatted into
# `method_option` by the commands that depend on a genome assembly.
OPT_GENOME = [
  :genome,
  { default: 'hg38', desc: 'Genome assembly' }
]

# Shared `--umi-length` option (name + Thor settings), splatted into
# `method_option` by the read-preparation command.
OPT_UMI_LENGTH = [
  :umi_length,
  { banner: 'NT', default: 6, desc: 'Length of UMI', type: :numeric }
]
20
+
21
+ # strt:alignment
22
+
23
desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
long_desc <<-DESC
  Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
DESC

method_option *OPT_BUFFER_SIZE
method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL

# Align every SEQDIR/*.fq.gz with hisat2 into MAPDIR/<name>.bam (sorted and
# indexed), then run `strt count_per_base` on every MAPDIR/*.bam and store
# the result as MAPDIR/<name>.bed.gz.
def alignment(refdir, seqdir, mapdir)
  threads = options.parallel

  # Step 1: one alignment pipeline per FASTQ file, run sequentially.
  fastqs = Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz")
  fastqs.each do |fqgz|
    sample = File.basename(fqgz, '.fq.gz')
    STDERR.puts "#{`date`.strip}: Align #{sample}..."
    bam = "#{mapdir}/#{sample}.bam"

    aligner = "hisat2 --no-unal --rna-strandness F --dta-cufflinks -p #{threads} -x #{refdir}/ref -U #{fqgz}"
    # Drops every SAM line whose NH tag is 2 or greater.
    nh_filter = "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'"
    sorter = "samtools sort -@ #{threads} -o #{bam}"

    pipeline(aligner, nh_filter, sorter)
    sh "samtools index #{bam}"
  end

  # Step 2: per-base counting over all BAM files found in MAPDIR.
  STDERR.puts "#{`date`.strip}: Count from all alignments."
  bams = Dir.glob("#{File.expand_path(mapdir)}/*.bam")
  Parallel.map(bams, in_threads: threads) do |bam|
    counter = "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option}#{parallel_option(options)} #{bam}"
    compressor = "pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz"
    pipeline(counter, compressor)
  end
end
55
+
56
+ # strt:build_index
57
+
58
desc 'build_index DIR', 'Build index for alignment'
long_desc <<-DESC
  Build index for alignment of STRT reads, from the specified GENOME, TRANSCRIPTOME and VARIATION, at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GENOME
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL

# Prepare all reference data under DIR and build the hisat2 index DIR/ref.
#
# Every external command is checked: when one fails, the process exits
# with that command's exit status instead of carrying on with missing or
# truncated inputs.
#
# @param dir0 [String] output directory (created when missing)
def build_index(dir0)

  dir = File.expand_path(dir0)
  FileUtils.mkdir_p(dir)

  # Run a shell command and abort with its exit status on failure.
  run = lambda { |cmd| system(cmd) or exit($?.exitstatus) }

  coreutils = coreutils_prefix_option
  genome = genome_option(options)
  transcriptome = "strt prepare_transcriptome #{options.genome}#{coreutils}#{grep_prefix_option(options)}"
  variation = "strt prepare_variation#{coreutils}#{genome}"

  STDERR.puts "#{`date`.strip}: Preparing data files..."

  # Phase 1: independent preparations; variation and transcriptome only
  # download here (--download=only).
  Parallel.map(
    ["strt prepare_genome#{coreutils}#{genome} #{dir}",
     "#{variation} --download=only #{dir}",
     "#{transcriptome} --download=only #{dir}",
     "strt prepare_spikein#{coreutils} #{dir}",
     "strt prepare_ribosome#{coreutils}#{genome} #{dir}"],
    in_threads: options.parallel) do |cmd|
    run.call(cmd)
  end

  # Concatenate genome, spike-in and ribosome sequences into one indexed
  # reference FASTA.
  run.call("unpigz -c #{dir}/genome.fa.gz #{dir}/spikein.fa.gz #{dir}/ribosome.fa.gz > #{dir}/ref.fa")
  run.call("samtools faidx #{dir}/ref.fa")

  # Phase 2: process the files downloaded in phase 1 (--download=no); run
  # only after ref.fa and its index have been written.
  Parallel.map(
    ["#{transcriptome} --download=no #{dir}",
     "#{variation} --download=no #{dir}"],
    in_threads: options.parallel) do |cmd|
    STDERR.puts cmd
    run.call(cmd)
  end

  STDERR.puts "#{`date`.strip}: Building index..."

  run.call("hisat2-build -f -p #{options.parallel} --snp #{dir}/variation.snp --haplotype #{dir}/variation.haplotype --ss #{dir}/transcriptome.splice_sites --exon #{dir}/transcriptome.exons #{dir}/ref.fa #{dir}/ref")

end
98
+
99
+ # strt:call_allele
100
+
101
desc 'call_allele CSV REFDIR MAPDIR', 'Call allele frequency'
long_desc <<-DESC
  Call allele frequencies of multiple samples specified in a design CSV, based on alignment files at MAPDIR, and reference sequence 'ref.fa' and the index at REFDIR.
DESC

method_option *OPT_GENOME

# Call variants over all samples of a design CSV and annotate them.
#
# Writes <dir of CSV>/strt-call_allele.bcf and lets table_annovar.pl write
# its outputs with the prefix <dir of CSV>/strt-call_allele.
#
# @param csv [String] design CSV; its `base` column names the samples
# @param refdir [String] directory holding ref.fa (also passed to
#   table_annovar.pl as its database directory)
# @param mapdir [String] directory holding <base>.bam for every sample
def call_allele(csv, refdir, mapdir)

  design = CSV.table(csv)

  # List of BAM paths for `samtools mpileup -b`. The block form closes the
  # file even when writing fails.
  bams = get_temporary_path('strt.call_allele', 'bams')
  File.open(bams, 'w') do |fp|
    design[:base].each { |bam| fp.puts "#{mapdir}/#{bam}.bam" }
  end

  csvdir = File.dirname(csv)
  bcf = "#{csvdir}/strt-call_allele.bcf"

  pipeline("samtools mpileup -u -t AD,ADF,ADR,DP -f #{refdir}/ref.fa -b #{bams}",
           "bcftools call --multiallelic-caller --variants-only --output-type u",
           "bcftools filter -s LowQual -e '%QUAL<20 || MIN(FORMAT/DP)<20' --output-type b > #{bcf}")
  pipeline("bcftools view #{bcf}",
           "table_annovar.pl - #{refdir} --buildver #{options.genome} --outfile #{csvdir}/strt-call_allele --remove --protocol refGene --operation g --nastring . --vcfinput")

end
125
+
126
+ # strt:count_per_base
127
+
128
desc 'count_per_base BAM', 'Count reads per base'
long_desc <<-DESC
  Count reads per base , based on an alignment BAM.
DESC

method_option *OPT_BUFFER_SIZE
method_option *OPT_COREUTILS_PREFIX
method_option *OPT_PARALLEL

# Run a pipeline that converts BAM to BED, reduces each read to the single
# base at its 5' end, and counts identical positions.
def count_per_base(bam)
  sample = File.basename(bam, '.bam')

  to_bed = "bedtools bamtobed -i #{bam}"
  # "+" strand: [start, start+1); otherwise: [end-1, end). Emits
  # chrom, start, end, strand.
  to_5end = "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'"
  sorter = "#{sort_command} -t '\t' -k 1,1 -k 2,2n"
  counter = "#{uniq_command(options)} -c"
  # Rearranges "count chrom start end strand" into
  # chrom, start, end, sample name, count, strand.
  to_count_bed = "ruby -anle 'puts ($F[1..3]+[\"#{sample}\", $F[0], $F[4]]).join(\"\t\")'"

  pipeline(to_bed, to_5end, sorter, counter, to_count_bed)
end
147
+
148
+ # strt:count_per_region
149
+
150
desc 'count_per_region COUNT REG [REG ...]',
     'Count reads per region'
long_desc <<-DESC
  Count reads per region for a sample. Read counts (a BED-format COUNT) within regions (BED-format REGions) are summed by the region names.
DESC

method_option *OPT_COREUTILS_PREFIX

# Intersect a per-base count BED with one or more region BEDs and print an
# "ID,COUNT" table with the counts summed per region name.
def count_per_region(count, region0, *regions0)
  region_files = [region0, *regions0].join(' ')

  intersect = "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{region_files}"
  # Fields 5 and 11 of the intersect output are consumed downstream as
  # (count, name).
  pick_fields = "#{cut_command} -f 5,11"
  summarize = "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"ID,COUNT\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'"

  pipeline(intersect, pick_fields, summarize)
end
166
+
167
+ # strt:prepare_genome
168
+
169
desc 'prepare_genome DIR', 'Prepare genome data'
long_desc <<-DESC
  Prepare data files of the specified GENOME at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_DOWNLOAD
method_option *OPT_GENOME

# Fetch the UCSC chromFa tarball for the selected genome (unless
# --download=no) and convert it to DIR/genome.fa.gz (unless
# --download=only).
def prepare_genome(dir0)
  dir = File.expand_path(dir0)
  assembly = options.genome
  tgz = "#{dir}/#{assembly}.chromFa.tar.gz"
  ucsc = "rsync://hgdownload.cse.ucsc.edu/goldenPath/#{assembly}/bigZips"

  unless options.download == 'no'
    # Only hg38 uses the assembly-prefixed archive name here.
    archive = assembly == 'hg38' ? "#{assembly}.chromFa.tar.gz" : 'chromFa.tar.gz'
    rsync_file("#{ucsc}/#{archive}", tgz)
  end

  unless options.download == 'only'
    # Archive members matching "*_*" are excluded from extraction.
    extract = "#{options.coreutils_prefix}tar -xOf - --exclude \"*_*\""
    # Join each record's sequence into one line; it is re-wrapped below.
    unwrap = "gawk 'BEGIN{p=\"\"} /^>/{ print p $1 } !/^>/{ printf $1; p=\"\\n\" } END{ print }'"
    rewrap = "#{fold_command(options)} -w 50"

    pipeline("unpigz -c #{tgz}", extract, unwrap, rewrap, "pigz -c > #{dir}/genome.fa.gz")
  end
end
199
+
200
+ # strt:prepare_reads
201
+
202
desc 'prepare_reads BASE MAP FQGZ ...', 'Prepare STRT reads'
long_desc <<-DESC
  Prepare STRT reads from raw sequence files before alignment. After demultiplexing, it performs
  (i) exclusion of redundant reads,
  (ii) exclusion of noncanonical reads, which does not begin with template switching primer,
  (iii) trimming from low-quality base,
  (iv) trimming from sequence similar to HiSeq universal primer,
  and (v) trimming of the template switching primer.

  Mandatory parameters are (1) BASE; basename for demultiplexed and gzipped fastq files, (2) MAP; filename of comma-separated table between barcode and well, and (3) FQGZs; comma-separated filenames of raw sequences; each file is gzipped fastq. When MAP contains 'CAAAGT,A2' and BASE is '~/test', reads having CAAAGT-like barcode are in '~/test.A2.fq.gz' file after the preprocesses.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL
method_option *OPT_UMI_LENGTH

method_option :maximum_memory,
              default: 50,
              desc: 'Maximum memory usage in percent rate',
              type: :numeric

method_option :minimum_length,
              banner: 'NT',
              default: 24,
              desc: 'Minimum length after the preprocess',
              type: :numeric

method_option :reads,
              desc: 'Number of raw reads for the preprocess',
              type: :numeric

# method_option :maximum_distance,
#               default: 1,
#               desc: 'Maximum distance between barcode and sequence',
#               type: :numeric

# Demultiplex raw reads and run the per-well preprocessing pipeline.
#
# Outputs: BASE.count.step1.csv and BASE.count.step2.csv (summed over all
# input files), and for every well (plus 'NA') BASE.<well>.fq.gz together
# with BASE.<well>.count.step3.csv .. step8.csv.
#
# @param base [String] prefix of all output files
# @param map [String] "barcode,well" table, one pair per line
# @param fqgz0 [String] first gzipped FASTQ file
# @param fqgzs0 [Array<String>] further gzipped FASTQ files
# @raise [ArgumentError] when MAP contains no barcode
# @raise [RuntimeError] when a stage of a demultiplexing pipeline fails
def prepare_reads(base, map, fqgz0, *fqgzs0)

  fqgzs = [fqgz0] + fqgzs0

  # barcode => well. The block form closes the map file; blank lines are
  # skipped instead of producing a nil barcode.
  bcs = {}
  File.open(map, 'r') do |fp|
    fp.each_line do |line|
      bc, well = line.rstrip.split(',')
      next if bc.nil?
      bcs[bc] = well
    end
  end
  raise ArgumentError, "No barcode found in #{map}" if bcs.empty?

  # The shortest barcode length defines the barcode cycles.
  bcl = bcs.keys.map(&:length).min

  # 5'-end pattern: UMI bases, barcode bases, then 'GG'.
  tso_pattern = '.'*options.umi_length + '.'*bcl + 'GG'

  #

  STDERR.puts "#{`date`.strip}: Demultiplexing each raw sequence files..."

  # Per-input temporary paths: two count tables and a demultiplex prefix.
  fqgz2csv0 = {}
  fqgz2csv1 = {}
  fqgz2base = {}
  fqgzs.each do |fqgz|
    fqgz2csv0[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2csv1[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2base[fqgz] = get_temporary_path('strt.preprocess', 'base', false)
  end

  Parallel.map(fqgz2csv0.keys, in_processes: options.parallel) do |fqgz|
    cmds = [
      "unpigz -c #{fqgz}",
      "#{fq1l_convert_command(options)}",
      "#{fq1l_count_command(options)} #{fqgz2csv0[fqgz]}",
      "fq1l match_5end#{grep_prefix_option(options)} #{tso_pattern}",
      "#{fq1l_count_command(options)} #{fqgz2csv1[fqgz]}",
      "fq1l annotate_index --first-cycle=#{options.umi_length+1} --last-cycle=#{options.umi_length+bcl}",
      "fq1l annotate_umi --first-cycle=1 --last-cycle=#{options.umi_length}",
      "fq1l sort_index#{coreutils_prefix_option}#{parallel_option(options)} --buffer-size=#{(options.maximum_memory/(fqgz2csv0.keys.size+1)).to_i}%",
      "fq1l demultiplex #{fqgz2base[fqgz]} #{map}"
    ]
    # With --reads, truncate the stream right after the conversion stage.
    cmds.insert(2, "#{head_command(options)} -n #{options.reads}") unless options.reads.nil?
    stats = Open3.pipeline(*cmds)
    stats.each_index do |i|
      # Termination by signal 13 (SIGPIPE) is tolerated - presumably for the
      # stages upstream of `head`, which stops reading early.
      raise "Fail at process #{i}; #{stats[i]}; #{cmds[i]}" unless stats[i].success? || (stats[i].signaled? && stats[i].termsig == 13)
    end
  end

  system "fq1l sum_counts #{fqgz2csv0.values.join(' ')} > #{base}.count.step1.csv"
  unlink_files(fqgz2csv0.values)

  system "fq1l sum_counts #{fqgz2csv1.values.join(' ')} > #{base}.count.step2.csv"
  unlink_files(fqgz2csv1.values)

  #

  (bcs.values + ['NA']).each do |well|

    STDERR.puts "#{`date`.strip}: Finishing well #{well}..."

    # Block parameter named so that it does not shadow the BASE argument.
    tmpfqgzs = fqgz2base.values.map { |tmpbase| "#{tmpbase}.#{well}.fq.gz" }
    csvs = Array.new(6) { |i| "#{base}.#{well}.count.step#{i+3}.csv" }

    pipeline("unpigz -c #{tmpfqgzs.join(' ')}",
             "#{fq1l_convert_command(options)}",
             "#{fq1l_count_command(options)} #{csvs[0]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_duplicate",
             "#{fq1l_count_command(options)} #{csvs[1]}",
             "fq1l trim_3end_quality",
             "#{fq1l_count_command(options)} #{csvs[2]}",
             "fq1l trim_3end_primer#{coreutils_prefix_option}#{grep_prefix_option(options)}#{parallel_option(options)}",
             "#{fq1l_count_command(options)} #{csvs[3]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_degenerate",
             "#{fq1l_count_command(options)} #{csvs[4]}",
             "fq1l trim_5end --minimum-length=#{options.minimum_length} #{tso_pattern}+",
             "#{fq1l_count_command(options)} #{csvs[5]}",
             "fq1l restore#{coreutils_prefix_option}",
             "pigz -c > #{base}.#{well}.fq.gz")

    unlink_files(tmpfqgzs)

  end

end
323
+
324
+ # strt:prepare_ribosome
325
+
326
desc 'prepare_ribosome DIR', 'Prepare ribosome data'
long_desc <<-DESC
  Prepare ribosome data files for the specified GENOME at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_DOWNLOAD
method_option *OPT_GENOME

# Write DIR/ribosome.fa.gz. For genomes whose name starts with "hg" it is
# built from the downloaded U13369 record; for any other genome the file
# holds only the output of `echo`.
def prepare_ribosome(dir0)
  dir = File.expand_path(dir0)
  fasta = "#{dir}/U13369.fa"

  if options.genome.start_with?('hg')
    unless options.download == 'no'
      download_file("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=555853&strand=1&rettype=fasta&retmode=text", fasta)
      exit $?.exitstatus unless system("pigz #{fasta}")
    end

    unless options.download == 'only'
      # Replace the header by a fixed name and join the sequence lines.
      rename = "gawk '/^>/{ print \">RIBO_U13369.1\" } !/^>/{ printf $1 } END{ print }'"
      rewrap = "#{fold_command(options)} -w 50"
      pipeline("unpigz -c #{fasta}.gz", rename, rewrap, "pigz -c > #{dir}/ribosome.fa.gz")
    end
  else
    pipeline("echo", "pigz -c > #{dir}/ribosome.fa.gz")
  end
end
354
+
355
+ # strt:prepare_spikein
356
+
357
desc 'prepare_spikein DIR', 'Prepare spikein data'
long_desc <<-DESC
  Prepare spikein data files at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_DOWNLOAD

# Download ERCC92.zip (unless --download=no) and convert its ERCC92.fa into
# DIR/spikein.fa.gz (unless --download=only).
def prepare_spikein(dir0)
  dir = File.expand_path(dir0)
  zip = "#{dir}/ERCC92.zip"

  unless options.download == 'no'
    download_file("https://tools.thermofisher.com/content/sfs/manuals/ERCC92.zip", zip)
  end

  unless options.download == 'only'
    # Prefix every header with RNA_SPIKE_, prepend AATTC (AATTCGAGCTC for
    # ERCC-00130) to each sequence, and join the sequence lines.
    rename = "gawk 'BEGIN{p=\"\"} /^>/{ print p \">RNA_SPIKE_\" substr($1, 2); printf(\"AATTC\" ($1 == \">ERCC-00130\" ? \"GAGCTC\" : \"\") ) } /^[ACGT]/{ printf $1; p=\"\\n\" } END{ print }'"
    rewrap = "#{fold_command(options)} -w 50"

    pipeline("unzip -cq #{zip} ERCC92.fa", rename, rewrap, "pigz -c > #{dir}/spikein.fa.gz")
  end
end
377
+
378
+ # strt:prepare_transcriptome
379
+
380
# Expose Bio::Gadget::StrtPrepareTranscriptome as the
# `prepare_transcriptome` subcommand.
register(
  Bio::Gadget::StrtPrepareTranscriptome,
  'prepare_transcriptome',
  'prepare_transcriptome GENOME',
  'Prepare transcriptome data'
)
384
+
385
+ # strt:prepare_variation
386
+
387
desc 'prepare_variation DIR', 'Prepare variation data'
long_desc <<-DESC
  Prepare genome variation data files for the specified GENOME dir based on common variations in dbSNP BUILD, at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_DOWNLOAD
method_option *OPT_GENOME

method_option :dbsnp,
              banner: 'BUILD',
              default: 146,
              desc: 'Build number of dbSNP',
              type: :numeric

# Fetch the UCSC snp<BUILD>Common table (unless --download=no) and feed it,
# together with DIR/genome.fa.gz, to hisat2_extract_snps_haplotypes_UCSC.py
# with the output prefix DIR/variation (unless --download=only).
def prepare_variation(dir0)
  dir = File.expand_path(dir0)
  table = "snp#{options.dbsnp}Common.txt.gz"
  snp = "#{dir}/#{options.genome}.#{table}"

  unless options.download == 'no'
    rsync_file("rsync://hgdownload.soe.ucsc.edu/goldenPath/#{options.genome}/database/#{table}", snp)
  end

  unless options.download == 'only'
    extractor = "hisat2_extract_snps_haplotypes_UCSC.py - #{snp} #{dir}/variation"
    pipeline("unpigz -c #{dir}/genome.fa.gz", extractor)
  end
end
413
+
414
+ # strt:qualify
415
+
416
desc 'qualify CSV REFDIR SEQDIR MAPDIR', 'Qualify samples'
long_desc <<-DESC
  Qualify samples in a design CSV.
DESC

method_option *OPT_BUFFER_SIZE
method_option *OPT_COREUTILS_PREFIX
method_option *OPT_PARALLEL

# Print the design CSV extended with per-sample quality columns.
#
# Columns are appended in this order: TOTAL_READS, MAPPED_READS,
# MAPPED_RATE, RIBOSOME_READS, SPIKEIN_READS, SPIKEIN_5END_READS,
# SPIKEIN_5END_RATE, RELATIVE_POLYA_RNAS, CODING_READS, CODING_5END_READS,
# CODING_5END_RATE, RELATIVE_MRNAS.
#
# @param csv [String] design CSV with a BASE column
# @param refdir [String] directory holding the region BED files
# @param seqdir [String] directory holding <BASE>.count.step8.csv
# @param mapdir [String] directory holding <BASE>.bed.gz
def qualify(csv, refdir, seqdir, mapdir)

  threads = options.parallel

  # Tail of every counting pipeline: sum column 5 of the BED stream.
  count_commands = ["#{cut_command} -f 5",
                    "ruby -e 'n=0; while gets; n+=$_.to_i; end; puts n'"]

  # Keyword options: a positional options hash is rejected on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)
  bases = samples["BASE"]

  # Per-sample sum of the counts that overlap REFDIR/<bed> on the same strand.
  overlap_counts = lambda do |bed|
    Parallel.map(bases, in_threads: threads) do |base|
      pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/#{bed}",
                        *count_commands).to_i
    end
  end

  # Per-sample floating-point ratio of two already computed columns.
  ratios = lambda do |numerator, denominator|
    samples.map { |row| row[numerator].to_f / row[denominator] }
  end

  samples["TOTAL_READS"] =
    Parallel.map(bases, in_threads: threads) do |base|
      stat = CSV.table("#{seqdir}/#{base}.count.step8.csv")
      stat[:reads].reduce(0, :+)
    end

  samples["MAPPED_READS"] =
    Parallel.map(bases, in_threads: threads) do |base|
      pipeline_readline("unpigz -c #{mapdir}/#{base}.bed.gz",
                        *count_commands).to_i
    end

  samples["MAPPED_RATE"] = ratios.call("MAPPED_READS", "TOTAL_READS")

  samples["RIBOSOME_READS"] = overlap_counts.call("ribosome.bed.gz")
  samples["SPIKEIN_READS"] = overlap_counts.call("spikein_whole.bed.gz")
  samples["SPIKEIN_5END_READS"] = overlap_counts.call("spikein_5end.bed.gz")

  samples["SPIKEIN_5END_RATE"] = ratios.call("SPIKEIN_5END_READS", "SPIKEIN_READS")

  # Mapped reads that are neither ribosomal nor spike-in, relative to the
  # spike-in 5'-end reads.
  samples["RELATIVE_POLYA_RNAS"] = samples.map do |row|
    (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
  end

  samples["CODING_READS"] = overlap_counts.call("transcriptome.coding_whole.bed.gz")
  samples["CODING_5END_READS"] = overlap_counts.call("transcriptome.coding_5end.bed.gz")

  samples["CODING_5END_RATE"] = ratios.call("CODING_5END_READS", "CODING_READS")
  samples["RELATIVE_MRNAS"] = ratios.call("CODING_5END_READS", "SPIKEIN_5END_READS")

  puts samples

end
513
+
514
+ # strt:quantify
515
+
516
desc 'quantify CSV REFDIR MAPDIR REGBASE',
     'Quantify samples'
long_desc <<-DESC
  Count reads per region for multiple samples in CSV. Read counts (a BED-format COUNT) within regions (REFDIR/ribosome.bed.gz, REFDIR/spikein_5end.bed.gz & REGBASE.bed.gz) are summed by the region names. Additional columns in REGBASE.csv are attached as annotations.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_PARALLEL

# Print a CSV table of per-region read counts for all samples.
#
# Columns: ID, the annotation columns of REGBASE.csv, one "N|<base>" column
# per sample (count / sum of RNA_SPIKE_* counts of that sample * 1000) and
# one "R|<base>" column per sample (count as reported).
#
# @param csv [String] design CSV with a BASE column
# @param refdir [String] directory with ribosome.bed.gz, spikein_5end.bed.gz
# @param mapdir [String] directory with <BASE>.bed.gz
# @param regbase [String] prefix of the region files (.bed.gz and .csv)
def quantify(csv, refdir, mapdir, regbase)

  # Keyword options: a positional options hash is rejected on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)

  annotations = CSV.read("#{regbase}.csv", headers: true, converters: :numeric)
  anns = annotations.headers - ['ID']
  name2anns = {}
  annotations.each { |row| name2anns[row['ID']] = row.values_at(*anns) }

  bases = samples["BASE"]
  puts((['ID'] + anns +
        bases.map { |base| "N|#{base}" } +
        bases.map { |base| "R|#{base}" }).join(','))

  name2base2cnt = {}
  base2spike = {}
  locker = Mutex.new # guards both hashes, which all worker threads update
  Parallel.map(bases, in_threads: options.parallel) do |base|
    locker.synchronize { base2spike[base] = 0.0 }
    cmd = "strt count_per_region#{coreutils_prefix_option} #{mapdir}/#{base}.bed.gz #{refdir}/ribosome.bed.gz #{refdir}/spikein_5end.bed.gz #{regbase}.bed.gz"
    # The block form closes the pipe even when parsing raises.
    IO.popen(cmd) do |fp|
      fp.gets # skip the header line
      fp.each_line do |line|
        name, cnt = line.strip.split(',')
        locker.synchronize do
          (name2base2cnt[name] ||= {})[base] = cnt
          base2spike[base] += cnt.to_f if name =~ /^RNA_SPIKE_/
        end
      end
    end
  end

  name2base2cnt.each do |name, base2cnt|
    ann_values = name2anns.key?(name) ? name2anns[name] : Array.new(anns.length, 'NA')
    normalized = bases.map do |base|
      base2cnt.key?(base) ? base2cnt[base].to_f/base2spike[base]*1000 : 0
    end
    raw = bases.map { |base| base2cnt.key?(base) ? base2cnt[base] : 0 }
    puts(([name] + ann_values + normalized + raw).join(','))
  end

end
573
+
574
+ #
575
+
576
+ no_commands do
577
+
578
# Command-line fragment (with a leading space) that passes the current
# --download value on to a subcommand.
def download_option(options)
  ' --download=' + options.download.to_s
end
581
+
582
# Command-line fragment (with a leading space) that passes the current
# --genome value on to a subcommand.
def genome_option(options)
  ' --genome=' + options.genome.to_s
end
585
+
586
# Run a shell pipeline and return the first line of its output, stripped.
#
# @param cmds [Array<String>] commands, connected stdout-to-stdin in order
# @return [String] first output line without surrounding whitespace, or ""
#   when the pipeline prints nothing
def pipeline_readline(*cmds)
  fp, ths = Open3.pipeline_r(*cmds)
  begin
    # gets returns nil on empty output; to_s turns that into "".
    line = fp.gets.to_s.strip
  ensure
    # Always release the pipe and wait for the last command.
    fp.close
    ths[-1].join
  end
  line
end
593
+
594
# Copy REMOTE to LOCAL with `rsync -a`; exit with rsync's status on failure.
#
# The arguments are passed to rsync directly (no shell), so paths that
# contain spaces or shell metacharacters are handled as given.
#
# @param remote [String] rsync source
# @param local [String] destination path
def rsync_file(remote, local)
  system('rsync', '-a', remote, local) or exit $?.exitstatus
end
597
+
598
+ end
599
+
600
+ end
601
+ end
602
+ end
603
+
604
+ require 'bio/gadget/strt/count.rb'
605
+ require 'bio/gadget/strt/depth.rb'