bio-gadget 0.4.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
@@ -0,0 +1,605 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'open3'
|
4
|
+
require 'parallel'
|
5
|
+
require 'thread'
|
6
|
+
|
7
|
+
require 'bio/gadget/strt/prepare_transcriptome.rb'
|
8
|
+
|
9
|
+
module Bio
|
10
|
+
class Gadget
|
11
|
+
class Strt < Bio::Gadget
|
12
|
+
|
13
|
+
OPT_GENOME = [ :genome, { :default => 'hg38',
|
14
|
+
:desc => 'Genome assembly' } ]
|
15
|
+
|
16
|
+
OPT_UMI_LENGTH = [ :umi_length, { :banner => 'NT',
|
17
|
+
:default => 6,
|
18
|
+
:desc => 'Length of UMI',
|
19
|
+
:type => :numeric } ]
|
20
|
+
|
21
|
+
# strt:alignment
|
22
|
+
|
23
|
+
desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
|
24
|
+
long_desc <<-DESC
|
25
|
+
Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
|
26
|
+
DESC
|
27
|
+
|
28
|
+
method_option *OPT_BUFFER_SIZE
|
29
|
+
method_option *OPT_COREUTILS_PREFIX
|
30
|
+
method_option *OPT_GREP_PREFIX
|
31
|
+
method_option *OPT_PARALLEL
|
32
|
+
|
33
|
+
# Align every SEQDIR/*.fq.gz against the index at REFDIR/ref, then count
# reads per base for every MAPDIR/*.bam.
#
# refdir - directory holding the index; passed to hisat2 as "#{refdir}/ref"
# seqdir - directory scanned for *.fq.gz inputs
# mapdir - directory receiving <sample>.bam and <sample>.bed.gz
def alignment(refdir, seqdir, mapdir)
  fastqs = Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz")
  fastqs.each do |fqgz|
    sample = File.basename(fqgz, '.fq.gz')
    STDERR.puts "#{`date`.strip}: Align #{sample}..."
    sorted_bam = "#{mapdir}/#{sample}.bam"
    align_steps = [
      "hisat2 --no-unal --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fqgz}",
      # -v drops every line whose NH:i tag is 2 or more
      "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
      "samtools sort -@ #{options.parallel} -o #{sorted_bam}"
    ]
    pipeline(*align_steps)
    sh "samtools index #{sorted_bam}"
  end

  STDERR.puts "#{`date`.strip}: Count from all alignments."
  bams = Dir.glob("#{File.expand_path(mapdir)}/*.bam")
  Parallel.map(bams, in_threads: options.parallel) do |bam|
    pipeline(
      "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option}#{parallel_option(options)} #{bam}",
      "pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz")
  end
end
|
55
|
+
|
56
|
+
# strt:build_index
|
57
|
+
|
58
|
+
desc 'build_index DIR', 'Build index for alignment'
long_desc <<-DESC
  Build index for alignment of STRT reads, from the specified GENOME, TRANSCRIPTOME and VARIATION, at DIR.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GENOME
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL

# Prepare all reference data under DIR and build the HISAT2 index DIR/ref.
#
# dir0 - target directory (created if missing)
#
# Every external command is checked: on failure the process exits with
# that command's status instead of continuing with incomplete files.
def build_index(dir0)

  dir = File.expand_path(dir0)
  FileUtils.mkdir_p(dir)

  # Run one shell command; exit with its status (1 if it has none, e.g.
  # when killed by a signal) when it does not succeed.
  run = lambda { |cmd| system(cmd) || exit($?.exitstatus || 1) }

  STDERR.puts "#{`date`.strip}: Preparing data files..."

  # First pass: variation and transcriptome are download-only here; they
  # are processed in the second pass, after ref.fa has been written.
  Parallel.map(
    ["strt prepare_genome#{coreutils_prefix_option}#{genome_option(options)} #{dir}",
     "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=only #{dir}",
     "strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=only #{dir}",
     "strt prepare_spikein#{coreutils_prefix_option} #{dir}",
     "strt prepare_ribosome#{coreutils_prefix_option}#{genome_option(options)} #{dir}"], in_threads: options.parallel) do |cmd|
    run.call(cmd)
  end

  run.call("unpigz -c #{dir}/genome.fa.gz #{dir}/spikein.fa.gz #{dir}/ribosome.fa.gz > #{dir}/ref.fa")
  run.call("samtools faidx #{dir}/ref.fa")

  Parallel.map(["strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=no #{dir}",
                "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=no #{dir}"], in_threads: options.parallel) do |cmd|
    STDERR.puts cmd
    run.call(cmd)
  end

  STDERR.puts "#{`date`.strip}: Building index..."

  run.call("hisat2-build -f -p #{options.parallel} --snp #{dir}/variation.snp --haplotype #{dir}/variation.haplotype --ss #{dir}/transcriptome.splice_sites --exon #{dir}/transcriptome.exons #{dir}/ref.fa #{dir}/ref")

end
|
98
|
+
|
99
|
+
# strt:call_allele
|
100
|
+
|
101
|
+
desc 'call_allele CSV REFDIR MAPDIR', 'Call allele frequency'
long_desc <<-DESC
  Call allele frequencies of multiple samples specified in a design CSV, based on alignment files at MAPDIR, and reference sequence 'ref.fa' and the index at REFDIR.
DESC

method_option *OPT_GENOME

# Call variants across all samples of a design CSV and annotate them.
#
# csv    - design table; its "base" column names the samples
# refdir - directory with ref.fa (also passed to table_annovar.pl)
# mapdir - directory with <base>.bam alignments
#
# Writes strt-call_allele.bcf and the table_annovar.pl outputs
# (prefix strt-call_allele) next to the CSV.
def call_allele(csv, refdir, mapdir)

  design = CSV.table(csv)
  bams = get_temporary_path('strt.call_allele', 'bams')
  # Block form closes the list file even if writing raises.
  File.open(bams, 'w') do |fp|
    design[:base].each { |bam| fp.puts "#{mapdir}/#{bam}.bam" }
  end
  csvdir = File.dirname(csv)
  bcf = "#{csvdir}/strt-call_allele.bcf"

  pipeline("samtools mpileup -u -t AD,ADF,ADR,DP -f #{refdir}/ref.fa -b #{bams}",
           "bcftools call --multiallelic-caller --variants-only --output-type u",
           "bcftools filter -s LowQual -e '%QUAL<20 || MIN(FORMAT/DP)<20' --output-type b > #{bcf}")
  pipeline("bcftools view #{bcf}",
           "table_annovar.pl - #{refdir} --buildver #{options.genome} --outfile #{csvdir}/strt-call_allele --remove --protocol refGene --operation g --nastring . --vcfinput")

end
|
125
|
+
|
126
|
+
# strt:count_per_base
|
127
|
+
|
128
|
+
desc 'count_per_base BAM', 'Count reads per base'
|
129
|
+
long_desc <<-DESC
|
130
|
+
Count reads per base , based on an alignment BAM.
|
131
|
+
DESC
|
132
|
+
|
133
|
+
method_option *OPT_BUFFER_SIZE
|
134
|
+
method_option *OPT_COREUTILS_PREFIX
|
135
|
+
method_option *OPT_PARALLEL
|
136
|
+
|
137
|
+
# Print per-position read counts for one BAM as tab-separated lines.
#
# bam - path of the alignment; its basename (without ".bam") is written
#       into the fourth output column
#
# Steps: bamtobed -> keep a one-base interval per read (start..start+1
# when column 6 is "+", otherwise end-1..end) -> sort -> uniq -c ->
# reorder to: col1, col2, col3, sample, count, col6.
def count_per_base(bam)
  sample = File.basename(bam, '.bam')
  steps = [
    "bedtools bamtobed -i #{bam}",
    "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
    "#{sort_command} -t '\t' -k 1,1 -k 2,2n",
    "#{uniq_command(options)} -c",
    "ruby -anle 'puts ($F[1..3]+[\"#{sample}\", $F[0], $F[4]]).join(\"\t\")'"
  ]
  pipeline(*steps)
end
|
147
|
+
|
148
|
+
# strt:count_per_region
|
149
|
+
|
150
|
+
desc 'count_per_region COUNT REG [REG ...]',
|
151
|
+
'Count reads per region'
|
152
|
+
long_desc <<-DESC
|
153
|
+
Count reads per region for a sample. Read counts (a BED-format COUNT) within regions (BED-format REGions) are summed by the region names.
|
154
|
+
DESC
|
155
|
+
|
156
|
+
method_option *OPT_COREUTILS_PREFIX
|
157
|
+
|
158
|
+
# Sum the counts of COUNT that fall into the given regions, per region
# name, and print them as CSV with the header "ID,COUNT".
#
# count    - BED file handed to `bedtools intersect -a`
# region0  - first region BED file
# regions0 - further region BED files
#
# After the intersect, fields 5 and 11 of each joined line are kept and
# the first is accumulated under the second.
def count_per_region(count, region0, *regions0)
  region_files = [region0, *regions0].join(' ')
  steps = [
    "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{region_files}",
    "#{cut_command} -f 5,11",
    "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"ID,COUNT\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'"
  ]
  pipeline(*steps)
end
|
166
|
+
|
167
|
+
# strt:prepare_genome
|
168
|
+
|
169
|
+
desc 'prepare_genome DIR', 'Prepare genome data'
|
170
|
+
long_desc <<-DESC
|
171
|
+
Prepare data files of the specified GENOME at DIR.
|
172
|
+
DESC
|
173
|
+
|
174
|
+
method_option *OPT_COREUTILS_PREFIX
|
175
|
+
method_option *OPT_DOWNLOAD
|
176
|
+
method_option *OPT_GENOME
|
177
|
+
|
178
|
+
# Fetch the chromosome FASTA tarball of the selected genome from UCSC and
# convert it into DIR/genome.fa.gz.
#
# dir0 - target directory
#
# options.download: 'no' skips the rsync, 'only' skips the conversion.
def prepare_genome(dir0)
  dir = File.expand_path(dir0)
  tgz = "#{dir}/#{options.genome}.chromFa.tar.gz"
  ucsc = "rsync://hgdownload.cse.ucsc.edu/goldenPath/#{options.genome}/bigZips"

  unless options.download == 'no'
    # hg38 uses a genome-prefixed tarball name; other genomes use the plain name.
    remote_name = options.genome == 'hg38' ? "#{options.genome}.chromFa.tar.gz" : 'chromFa.tar.gz'
    rsync_file("#{ucsc}/#{remote_name}", tgz)
  end

  return if options.download == 'only'

  # Members with "_" in their names are excluded; each sequence is joined
  # onto one line by gawk and re-wrapped at 50 columns.
  pipeline("unpigz -c #{tgz}",
           "#{options.coreutils_prefix}tar -xOf - --exclude \"*_*\"",
           "gawk 'BEGIN{p=\"\"} /^>/{ print p $1 } !/^>/{ printf $1; p=\"\\n\" } END{ print }'",
           "#{fold_command(options)} -w 50",
           "pigz -c > #{dir}/genome.fa.gz")
end
|
199
|
+
|
200
|
+
# strt:prepare_reads
|
201
|
+
|
202
|
+
desc 'prepare_reads BASE MAP FQGZ ...', 'Prepare STRT reads'
long_desc <<-DESC
  Prepare STRT reads from raw sequence files before alignment. After demultiplexing, it performs
  (i) exclusion of redundant reads,
  (ii) exclusion of noncanonical reads, which do not begin with template switching primer,
  (iii) trimming from low-quality base,
  (iv) trimming from sequence similar to HiSeq universal primer,
  and (v) trimming of the template switching primer.

  Mandatory parameters are (1) BASE; basename for demultiplexed and gzipped fastq files, (2) MAP; filename of comma-separated table between barcode and well, and (3) FQGZs; comma-separated filenames of raw sequences; each file is gzipped fastq. When MAP contains 'CAAAGT,A2' and BASE is '~/test', reads having CAAAGT-like barcode are in '~/test.A2.fq.gz' file after the preprocesses.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL
method_option *OPT_UMI_LENGTH

method_option :maximum_memory,
              default: 50,
              desc: 'Maximum memory usage in percent rate',
              type: :numeric

method_option :minimum_length,
              banner: 'NT',
              default: 24,
              desc: 'Minimum length after the preprocess',
              type: :numeric

method_option :reads,
              desc: 'Number of raw reads for the preprocess',
              type: :numeric

# method_option :maximum_distance,
#               default: 1,
#               desc: 'Maximum distance between barcode and sequence',
#               type: :numeric

# Demultiplex raw reads by barcode, then clean each well's reads.
#
# base   - output prefix; results are written as <base>.<well>.fq.gz and
#          <base>[.<well>].count.stepN.csv
# map    - text file with one "BARCODE,WELL" pair per line
# fqgz0  - first gzipped FASTQ file
# fqgzs0 - further gzipped FASTQ files
#
# Raises ArgumentError when MAP contains no barcode, and RuntimeError
# when a demultiplexing process fails.
def prepare_reads(base, map, fqgz0, *fqgzs0)

  fqgzs = [fqgz0] + fqgzs0

  # barcode => well; File.foreach closes MAP when done, and blank lines
  # are skipped instead of producing a nil barcode.
  bcs = {}
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    next if bc.nil? || bc.empty?
    bcs[bc] = well
  end
  raise ArgumentError, "No barcode found in #{map}" if bcs.empty?

  # Length of the shortest barcode.
  bcl = bcs.keys.map(&:length).min

  # UMI positions + barcode positions, followed by 'GG'.
  tso_pattern = '.'*options.umi_length + '.'*bcl + 'GG'

  #

  STDERR.puts "#{`date`.strip}: Demultiplexing each raw sequence files..."

  fqgz2csv0 = {}
  fqgz2csv1 = {}
  fqgz2base = {}
  fqgzs.each do |fqgz|
    fqgz2csv0[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2csv1[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
    fqgz2base[fqgz] = get_temporary_path('strt.preprocess', 'base', false)
  end

  Parallel.map(fqgz2csv0.keys, in_processes: options.parallel) do |fqgz|
    cmds = [
      "unpigz -c #{fqgz}",
      "#{fq1l_convert_command(options)}",
      "#{fq1l_count_command(options)} #{fqgz2csv0[fqgz]}",
      "fq1l match_5end#{grep_prefix_option(options)} #{tso_pattern}",
      "#{fq1l_count_command(options)} #{fqgz2csv1[fqgz]}",
      "fq1l annotate_index --first-cycle=#{options.umi_length+1} --last-cycle=#{options.umi_length+bcl}",
      "fq1l annotate_umi --first-cycle=1 --last-cycle=#{options.umi_length}",
      "fq1l sort_index#{coreutils_prefix_option}#{parallel_option(options)} --buffer-size=#{(options.maximum_memory/(fqgz2csv0.keys.size+1)).to_i}%",
      "fq1l demultiplex #{fqgz2base[fqgz]} #{map}"
    ]
    # Optional read limit, placed right after the conversion step.
    cmds.insert(2, "#{head_command(options)} -n #{options.reads}") unless options.reads.nil?
    stats = Open3.pipeline(*cmds)
    stats.each_index do |i|
      # Termination by signal 13 is tolerated (presumably SIGPIPE from an
      # upstream step after `head` stops reading -- confirm).
      raise "Fail at process #{i}; #{stats[i]}; #{cmds[i]}" unless stats[i].success? || (stats[i].signaled? && stats[i].termsig == 13)
    end
  end

  # Stop before deleting the per-file counts if they could not be summed.
  system("fq1l sum_counts #{fqgz2csv0.values.join(' ')} > #{base}.count.step1.csv") || exit($?.exitstatus || 1)
  unlink_files(fqgz2csv0.values)

  system("fq1l sum_counts #{fqgz2csv1.values.join(' ')} > #{base}.count.step2.csv") || exit($?.exitstatus || 1)
  unlink_files(fqgz2csv1.values)

  #

  (bcs.values + ['NA']).each do |well|

    STDERR.puts "#{`date`.strip}: Finishing well #{well}..."

    # Block parameter named differently from the method's `base` argument.
    tmpfqgzs = fqgz2base.values.map { |tmpbase| "#{tmpbase}.#{well}.fq.gz" }
    # Count files for steps 3..8.
    csvs = Array.new(6) { |i| "#{base}.#{well}.count.step#{i+3}.csv" }

    pipeline("unpigz -c #{tmpfqgzs.join(' ')}",
             "#{fq1l_convert_command(options)}",
             "#{fq1l_count_command(options)} #{csvs[0]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_duplicate",
             "#{fq1l_count_command(options)} #{csvs[1]}",
             "fq1l trim_3end_quality",
             "#{fq1l_count_command(options)} #{csvs[2]}",
             "fq1l trim_3end_primer#{coreutils_prefix_option}#{grep_prefix_option(options)}#{parallel_option(options)}",
             "#{fq1l_count_command(options)} #{csvs[3]}",
             "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
             "fq1l exclude_degenerate",
             "#{fq1l_count_command(options)} #{csvs[4]}",
             "fq1l trim_5end --minimum-length=#{options.minimum_length} #{tso_pattern}+",
             "#{fq1l_count_command(options)} #{csvs[5]}",
             "fq1l restore#{coreutils_prefix_option}",
             "pigz -c > #{base}.#{well}.fq.gz")

    unlink_files(tmpfqgzs)

  end

end
|
323
|
+
|
324
|
+
# strt:prepare_ribosome
|
325
|
+
|
326
|
+
desc 'prepare_ribosome DIR', 'Prepare ribosome data'
|
327
|
+
long_desc <<-DESC
|
328
|
+
Prepare ribosome data files for the specified GENOME at DIR.
|
329
|
+
DESC
|
330
|
+
|
331
|
+
method_option *OPT_COREUTILS_PREFIX
|
332
|
+
method_option *OPT_DOWNLOAD
|
333
|
+
method_option *OPT_GENOME
|
334
|
+
|
335
|
+
# Write DIR/ribosome.fa.gz for the selected genome.
#
# dir0 - target directory
#
# Genomes whose name starts with "hg" get the U13369 sequence fetched
# from NCBI, renamed to RIBO_U13369.1 and wrapped at 50 columns; any
# other genome gets the gzipped output of a bare `echo`.
# options.download: 'no' skips the fetch, 'only' skips the conversion.
def prepare_ribosome(dir0)
  dir = File.expand_path(dir0)

  return pipeline("echo", "pigz -c > #{dir}/ribosome.fa.gz") unless options.genome[0..1] == 'hg'

  unless options.download == 'no'
    download_file("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=555853&strand=1&rettype=fasta&retmode=text", "#{dir}/U13369.fa")
    system("pigz #{dir}/U13369.fa") || exit($?.exitstatus)
  end

  return if options.download == 'only'

  pipeline("unpigz -c #{dir}/U13369.fa.gz",
           "gawk '/^>/{ print \">RIBO_U13369.1\" } !/^>/{ printf $1 } END{ print }'",
           "#{fold_command(options)} -w 50",
           "pigz -c > #{dir}/ribosome.fa.gz")
end
|
354
|
+
|
355
|
+
# strt:prepare_spikein
|
356
|
+
|
357
|
+
desc 'prepare_spikein DIR', 'Prepare spikein data'
|
358
|
+
long_desc <<-DESC
|
359
|
+
Prepare spikein data files at DIR.
|
360
|
+
DESC
|
361
|
+
|
362
|
+
method_option *OPT_COREUTILS_PREFIX
|
363
|
+
method_option *OPT_DOWNLOAD
|
364
|
+
|
365
|
+
# Download ERCC92.zip and convert ERCC92.fa into DIR/spikein.fa.gz.
#
# dir0 - target directory
#
# Sequence names are prefixed with RNA_SPIKE_, every sequence gets
# "AATTC" prepended (plus "GAGCTC" for ERCC-00130), and lines are
# wrapped at 50 columns.
# options.download: 'no' skips the download, 'only' skips the conversion.
def prepare_spikein(dir0)
  dir = File.expand_path(dir0)
  zip = "#{dir}/ERCC92.zip"

  unless options.download == 'no'
    download_file("https://tools.thermofisher.com/content/sfs/manuals/ERCC92.zip", zip)
  end

  return if options.download == 'only'

  pipeline("unzip -cq #{zip} ERCC92.fa",
           "gawk 'BEGIN{p=\"\"} /^>/{ print p \">RNA_SPIKE_\" substr($1, 2); printf(\"AATTC\" ($1 == \">ERCC-00130\" ? \"GAGCTC\" : \"\") ) } /^[ACGT]/{ printf $1; p=\"\\n\" } END{ print }'",
           "#{fold_command(options)} -w 50",
           "pigz -c > #{dir}/spikein.fa.gz")
end
|
377
|
+
|
378
|
+
# strt:prepare_transcriptome
|
379
|
+
|
380
|
+
register(Bio::Gadget::StrtPrepareTranscriptome,
|
381
|
+
'prepare_transcriptome',
|
382
|
+
'prepare_transcriptome GENOME',
|
383
|
+
'Prepare transcriptome data')
|
384
|
+
|
385
|
+
# strt:prepare_variation
|
386
|
+
|
387
|
+
desc 'prepare_variation DIR', 'Prepare variation data'
|
388
|
+
long_desc <<-DESC
|
389
|
+
Prepare genome variation data files for the specified GENOME dir based on common variations in dbSNP BUILD, at DIR.
|
390
|
+
DESC
|
391
|
+
|
392
|
+
method_option *OPT_COREUTILS_PREFIX
|
393
|
+
method_option *OPT_DOWNLOAD
|
394
|
+
method_option *OPT_GENOME
|
395
|
+
|
396
|
+
method_option :dbsnp,
|
397
|
+
banner: 'BUILD',
|
398
|
+
default: 146,
|
399
|
+
desc: 'Build number of dbSNP',
|
400
|
+
type: :numeric
|
401
|
+
|
402
|
+
# Fetch the UCSC snp<BUILD>Common table for the selected genome and
# turn it into DIR/variation.* files.
#
# dir0 - target directory; DIR/genome.fa.gz must already exist for the
#        extraction step
#
# options.download: 'no' skips the rsync, 'only' skips the extraction.
def prepare_variation(dir0)
  dir = File.expand_path(dir0)
  snp = "#{dir}/#{options.genome}.snp#{options.dbsnp}Common.txt.gz"

  unless options.download == 'no'
    rsync_file("rsync://hgdownload.soe.ucsc.edu/goldenPath/#{options.genome}/database/snp#{options.dbsnp}Common.txt.gz", snp)
  end

  return if options.download == 'only'

  pipeline("unpigz -c #{dir}/genome.fa.gz",
           "hisat2_extract_snps_haplotypes_UCSC.py - #{snp} #{dir}/variation")
end
|
413
|
+
|
414
|
+
# strt:qualify
|
415
|
+
|
416
|
+
desc 'qualify CSV REFDIR SEQDIR MAPDIR', 'Qualify samples'
long_desc <<-DESC
  Qualify samples in a design CSV.
DESC

method_option *OPT_BUFFER_SIZE
method_option *OPT_COREUTILS_PREFIX
method_option *OPT_PARALLEL

# Print the design CSV extended with per-sample quality columns.
#
# csv    - design table with a "BASE" column naming the samples
# refdir - directory with ribosome/spikein/transcriptome *.bed.gz regions
# seqdir - directory with <BASE>.count.step8.csv read counts
# mapdir - directory with <BASE>.bed.gz per-base counts
#
# Added columns, in order: TOTAL_READS, MAPPED_READS, MAPPED_RATE,
# RIBOSOME_READS, SPIKEIN_READS, SPIKEIN_5END_READS, SPIKEIN_5END_RATE,
# RELATIVE_POLYA_RNAS, CODING_READS, CODING_5END_READS, CODING_5END_RATE,
# RELATIVE_MRNAS. Rates are float divisions, so a zero denominator gives
# Infinity or NaN rather than an error.
def qualify(csv, refdir, seqdir, mapdir)

  # Tail of every counting pipeline: sum BED column 5.
  count_commands = ["#{cut_command} -f 5",
                    "ruby -e 'n=0; while gets; n+=$_.to_i; end; puts n'"]

  # Options go as keywords: a positional Hash is rejected by the
  # keyword-argument CSV API used on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)
  bases = samples["BASE"]

  # For each sample, run the command built by +first_command+ followed
  # by +count_commands+, and return the resulting sums as integers.
  sum_counts = lambda do |first_command|
    Parallel.map(bases, in_threads: options.parallel) do |base|
      pipeline_readline(first_command.call(base), *count_commands).to_i
    end
  end

  # Per-sample count of reads overlapping REFDIR/<bed> on the same strand.
  overlap = lambda do |bed|
    sum_counts.call(lambda { |base|
      "bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/#{bed}"
    })
  end

  # Per-sample float ratio of two existing columns.
  ratio = lambda do |numerator, denominator|
    samples.map { |row| row[numerator].to_f / row[denominator] }
  end

  samples["TOTAL_READS"] =
    Parallel.map(bases, in_threads: options.parallel) do |base|
      stat = CSV.table("#{seqdir}/#{base}.count.step8.csv")
      stat[:reads].inject(0) { |total, reads| total + reads }
    end

  samples["MAPPED_READS"] =
    sum_counts.call(lambda { |base| "unpigz -c #{mapdir}/#{base}.bed.gz" })

  samples["MAPPED_RATE"] = ratio.call("MAPPED_READS", "TOTAL_READS")

  samples["RIBOSOME_READS"] = overlap.call("ribosome.bed.gz")
  samples["SPIKEIN_READS"] = overlap.call("spikein_whole.bed.gz")
  samples["SPIKEIN_5END_READS"] = overlap.call("spikein_5end.bed.gz")

  samples["SPIKEIN_5END_RATE"] = ratio.call("SPIKEIN_5END_READS", "SPIKEIN_READS")

  samples["RELATIVE_POLYA_RNAS"] = samples.map do |row|
    (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
  end

  samples["CODING_READS"] = overlap.call("transcriptome.coding_whole.bed.gz")
  samples["CODING_5END_READS"] = overlap.call("transcriptome.coding_5end.bed.gz")

  samples["CODING_5END_RATE"] = ratio.call("CODING_5END_READS", "CODING_READS")

  samples["RELATIVE_MRNAS"] = ratio.call("CODING_5END_READS", "SPIKEIN_5END_READS")

  puts samples

end
|
513
|
+
|
514
|
+
# strt:quantify
|
515
|
+
|
516
|
+
desc 'quantify CSV REFDIR MAPDIR REGBASE',
     'Quantify samples'
long_desc <<-DESC
  Count reads per region for multiple samples in CSV. Read counts (a BED-format COUNT) within regions (REFDIR/ribosome.bed.gz, REFDIR/spikein_5end.bed.gz & REGBASE.bed.gz) are summed by the region names. Additional columns in REGBASE.csv are attached as annotations.
DESC

method_option *OPT_COREUTILS_PREFIX
method_option *OPT_PARALLEL

# Print a CSV matrix of per-region read counts for every sample.
#
# csv     - design table with a "BASE" column naming the samples
# refdir  - directory with ribosome.bed.gz and spikein_5end.bed.gz
# mapdir  - directory with <BASE>.bed.gz per-base counts
# regbase - prefix of the region files: <regbase>.bed.gz, and
#           <regbase>.csv with an "ID" column plus annotation columns
#
# Output columns: ID, annotations, "N|<BASE>" (count divided by the
# sample's summed RNA_SPIKE_* counts, times 1000), "R|<BASE>" (raw
# count). A sample without spike-in counts divides by 0.0 and so prints
# Infinity/NaN.
def quantify(csv, refdir, mapdir, regbase)

  # Options go as keywords: a positional Hash is rejected by the
  # keyword-argument CSV API used on Ruby 3.
  samples = CSV.read(csv, headers: true, converters: :numeric)

  annotation = CSV.read("#{regbase}.csv", headers: true, converters: :numeric)
  anns = annotation.headers - ['ID']
  name2anns = {}
  annotation.each { |row| name2anns[row['ID']] = row.values_at(*anns) }

  bases = samples["BASE"]
  puts((['ID'] + anns +
        bases.map { |base| "N|#{base}" } +
        bases.map { |base| "R|#{base}" }).join(','))

  name2base2cnt = {}
  base2spike = {}
  # Initialise before the worker threads start, so that all later writes
  # to the shared hashes happen under the mutex.
  bases.each { |base| base2spike[base] = 0.0 }
  lock = Mutex.new
  Parallel.map(bases, in_threads: options.parallel) do |base|
    command = "strt count_per_region#{coreutils_prefix_option} #{mapdir}/#{base}.bed.gz #{refdir}/ribosome.bed.gz #{refdir}/spikein_5end.bed.gz #{regbase}.bed.gz"
    # Block form closes the pipe even if parsing raises.
    IO.popen(command) do |fp|
      fp.gets # skip the first output line (header)
      fp.each do |line|
        name, cnt = line.strip.split(',')
        lock.synchronize do
          (name2base2cnt[name] ||= {})[base] = cnt
          base2spike[base] += cnt.to_f if name =~ /^RNA_SPIKE_/
        end
      end
    end
  end

  name2base2cnt.each do |name, base2cnt|
    annotations = name2anns.key?(name) ? name2anns[name] : Array.new(anns.length, 'NA')
    normalized = bases.map do |base|
      base2cnt.key?(base) ? base2cnt[base].to_f/base2spike[base]*1000 : 0
    end
    raw = bases.map { |base| base2cnt.key?(base) ? base2cnt[base] : 0 }
    puts(([name] + annotations + normalized + raw).join(','))
  end

end
|
573
|
+
|
574
|
+
#
|
575
|
+
|
576
|
+
no_commands do
|
577
|
+
|
578
|
+
# Command-line fragment that forwards the current --download setting
# (with a leading space) to a child `strt` command.
def download_option(options)
  format(' --download=%s', options.download)
end
|
581
|
+
|
582
|
+
# Command-line fragment that forwards the current --genome setting
# (with a leading space) to a child `strt` command.
def genome_option(options)
  format(' --genome=%s', options.genome)
end
|
585
|
+
|
586
|
+
# Run +cmds+ as a shell pipeline and return the first line of its output
# with surrounding whitespace stripped.
#
# cmds - commands as accepted by Open3.pipeline_r
#
# Returns "" when the pipeline prints nothing (previously this raised
# NoMethodError on nil). The pipe is closed and the last process is
# waited for even if reading raises.
def pipeline_readline(*cmds)
  fp, ths = Open3.pipeline_r(*cmds)
  begin
    fp.gets.to_s.strip
  ensure
    fp.close
    last = ths[-1]
    last.join if last
  end
end
|
593
|
+
|
594
|
+
# Copy +remote+ to +local+ with `rsync -a`; exit with rsync's status
# when the copy fails.
#
# remote - rsync source (URL or path)
# local  - destination path
#
# Arguments are passed as an argument vector, not through a shell, so
# paths with spaces or shell metacharacters are handled literally.
# Falls back to status 1 when rsync has no exit status (killed by a
# signal).
def rsync_file(remote, local)
  system('rsync', '-a', remote, local) || exit($?.exitstatus || 1)
end
|
597
|
+
|
598
|
+
end
|
599
|
+
|
600
|
+
end
|
601
|
+
end
|
602
|
+
end
|
603
|
+
|
604
|
+
require 'bio/gadget/strt/count.rb'
|
605
|
+
require 'bio/gadget/strt/depth.rb'
|