bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
data/lib/bio/gadget/strt.rb
@@ -0,0 +1,605 @@
require 'csv'
require 'fileutils'
require 'open3'
require 'parallel'
require 'thread'

require 'bio/gadget/strt/prepare_transcriptome.rb'

module Bio
  class Gadget
    class Strt < Bio::Gadget

      OPT_GENOME = [ :genome, { :default => 'hg38',
                                :desc => 'Genome assembly' } ]

      OPT_UMI_LENGTH = [ :umi_length, { :banner => 'NT',
                                        :default => 6,
                                        :desc => 'Length of UMI',
                                        :type => :numeric } ]

      # strt:alignment

      desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
      long_desc <<-DESC
        Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
      DESC

      method_option *OPT_BUFFER_SIZE
      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_GREP_PREFIX
      method_option *OPT_PARALLEL

      def alignment(refdir, seqdir, mapdir)

        Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz").each do |fqgz|
          base = File.basename(fqgz, '.fq.gz')
          STDERR.puts "#{`date`.strip}: Align #{base}..."
          bam = "#{mapdir}/#{base}.bam"
          pipeline(
            "hisat2 --no-unal --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fqgz}",
            "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
            "samtools sort -@ #{options.parallel} -o #{bam}")
          sh "samtools index #{bam}"
        end

        STDERR.puts "#{`date`.strip}: Count from all alignments."
        Parallel.map(Dir.glob("#{File.expand_path(mapdir)}/*.bam"),
                     in_threads: options.parallel) do |bam|
          pipeline(
            "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option}#{parallel_option(options)} #{bam}",
            "pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz")
        end

      end

      # strt:build_index

      desc 'build_index DIR', 'Build index for alignment'
      long_desc <<-DESC
        Build index for alignment of STRT reads, from the specified GENOME, TRANSCRIPTOME and VARIATION, at DIR.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_GENOME
      method_option *OPT_GREP_PREFIX
      method_option *OPT_PARALLEL

      def build_index(dir0)

        dir = File.expand_path(dir0)
        FileUtils.mkdir_p(dir)

        STDERR.puts "#{`date`.strip}: Preparing data files..."

        Parallel.map(
          ["strt prepare_genome#{coreutils_prefix_option}#{genome_option(options)} #{dir}",
           "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=only #{dir}",
           "strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=only #{dir}",
           "strt prepare_spikein#{coreutils_prefix_option} #{dir}",
           "strt prepare_ribosome#{coreutils_prefix_option}#{genome_option(options)} #{dir}"], in_threads: options.parallel) do |cmd|
          system cmd or exit $?.exitstatus
        end

        system "unpigz -c #{dir}/genome.fa.gz #{dir}/spikein.fa.gz #{dir}/ribosome.fa.gz > #{dir}/ref.fa"
        system "samtools faidx #{dir}/ref.fa"

        Parallel.map(["strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=no #{dir}",
                      "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=no #{dir}"], in_threads: options.parallel) do |cmd|
          STDERR.puts cmd
          system cmd or exit $?.exitstatus
        end

        STDERR.puts "#{`date`.strip}: Building index..."

        system "hisat2-build -f -p #{options.parallel} --snp #{dir}/variation.snp --haplotype #{dir}/variation.haplotype --ss #{dir}/transcriptome.splice_sites --exon #{dir}/transcriptome.exons #{dir}/ref.fa #{dir}/ref"

      end

      # strt:call_allele

      desc 'call_allele CSV REFDIR MAPDIR', 'Call allele frequency'
      long_desc <<-DESC
        Call allele frequencies of multiple samples specified in a design CSV, based on alignment files at MAPDIR, and reference sequence 'ref.fa' and the index at REFDIR.
      DESC

      method_option *OPT_GENOME

      def call_allele(csv, refdir, mapdir)

        design = CSV.table(csv)
        bams = get_temporary_path('strt.call_allele', 'bams')
        fp = open(bams, 'w')
        design[:base].each {|bam| fp.puts "#{mapdir}/#{bam}.bam" }
        fp.close
        csvdir = File.dirname(csv)
        bcf = "#{csvdir}/strt-call_allele.bcf"

        pipeline("samtools mpileup -u -t AD,ADF,ADR,DP -f #{refdir}/ref.fa -b #{bams}",
                 "bcftools call --multiallelic-caller --variants-only --output-type u",
                 "bcftools filter -s LowQual -e '%QUAL<20 || MIN(FORMAT/DP)<20' --output-type b > #{bcf}")
        pipeline("bcftools view #{bcf}",
                 "table_annovar.pl - #{refdir} --buildver #{options.genome} --outfile #{csvdir}/strt-call_allele --remove --protocol refGene --operation g --nastring . --vcfinput")

      end

      # strt:count_per_base

      desc 'count_per_base BAM', 'Count reads per base'
      long_desc <<-DESC
        Count reads per base, based on an alignment BAM.
      DESC

      method_option *OPT_BUFFER_SIZE
      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_PARALLEL

      def count_per_base(bam)

        pipeline(
          "bedtools bamtobed -i #{bam}",
          "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
          "#{sort_command} -t '\t' -k 1,1 -k 2,2n",
          "#{uniq_command(options)} -c",
          "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'")

      end

      # strt:count_per_region

      desc 'count_per_region COUNT REG [REG ...]',
           'Count reads per region'
      long_desc <<-DESC
        Count reads per region for a sample. Read counts (a BED-format COUNT) within regions (BED-format REGions) are summed by the region names.
      DESC

      method_option *OPT_COREUTILS_PREFIX

      def count_per_region(count, region0, *regions0)

        pipeline(
          "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{([region0]+regions0).join(' ')}",
          "#{cut_command} -f 5,11",
          "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"ID,COUNT\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'")

      end

      # strt:prepare_genome

      desc 'prepare_genome DIR', 'Prepare genome data'
      long_desc <<-DESC
        Prepare data files of the specified GENOME at DIR.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_DOWNLOAD
      method_option *OPT_GENOME

      def prepare_genome(dir0)

        dir = File.expand_path(dir0)
        tgz = "#{dir}/#{options.genome}.chromFa.tar.gz"
        ucsc = "rsync://hgdownload.cse.ucsc.edu/goldenPath/#{options.genome}/bigZips"

        if options.download != 'no'
          if options.genome == 'hg38'
            rsync_file("#{ucsc}/#{options.genome}.chromFa.tar.gz", tgz)
          else
            rsync_file("#{ucsc}/chromFa.tar.gz", tgz)
          end
        end
        pipeline("unpigz -c #{tgz}",
                 "#{options.coreutils_prefix}tar -xOf - --exclude \"*_*\"",
                 "gawk 'BEGIN{p=\"\"} /^>/{ print p $1 } !/^>/{ printf $1; p=\"\\n\" } END{ print }'",
                 "#{fold_command(options)} -w 50",
                 "pigz -c > #{dir}/genome.fa.gz"
                ) if options.download != 'only'

      end

      # strt:prepare_reads

      desc 'prepare_reads BASE MAP FQGZ ...', 'Prepare STRT reads'
      long_desc <<-DESC
        Prepare STRT reads from raw sequence files before alignment. After demultiplexing, it performs
        (i) exclusion of redundant reads,
        (ii) exclusion of noncanonical reads, which do not begin with the template switching primer,
        (iii) trimming from low-quality base,
        (iv) trimming from sequence similar to HiSeq universal primer,
        and (v) trimming of the template switching primer.

        Mandatory parameters are (1) BASE; basename for demultiplexed and gzipped fastq files, (2) MAP; filename of comma-separated table between barcode and well, and (3) FQGZs; comma-separated filenames of raw sequences; each file is gzipped fastq. When MAP contains 'CAAAGT,A2' and BASE is '~/test', reads having CAAAGT-like barcode are in '~/test.A2.fq.gz' file after the preprocesses.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_GREP_PREFIX
      method_option *OPT_PARALLEL
      method_option *OPT_UMI_LENGTH

      method_option :maximum_memory,
                    default: 50,
                    desc: 'Maximum memory usage in percent rate',
                    type: :numeric

      method_option :minimum_length,
                    banner: 'NT',
                    default: 24,
                    desc: 'Minimum length after the preprocess',
                    type: :numeric

      method_option :reads,
                    desc: 'Number of raw reads for the preprocess',
                    type: :numeric

      # method_option :maximum_distance,
      #               default: 1,
      #               desc: 'Maximum distance between barcode and sequence',
      #               type: :numeric

      def prepare_reads(base, map, fqgz0, *fqgzs0)

        fqgzs = [fqgz0] + fqgzs0

        bcs = Hash.new
        open(map, 'r').each do |line|
          bc, well = line.rstrip.split(',')
          bcs[bc] = well
        end

        bcl = bcs.keys.map!{|key| key.length}.sort.uniq[0]

        tso_pattern = '.'*options.umi_length + '.'*bcl + 'GG'

        #

        STDERR.puts "#{`date`.strip}: Demultiplexing each raw sequence file..."

        fqgz2csv0 = Hash.new
        fqgz2csv1 = Hash.new
        fqgz2base = Hash.new
        fqgzs.each do |fqgz|
          fqgz2csv0[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
          fqgz2csv1[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
          fqgz2base[fqgz] = get_temporary_path('strt.preprocess', 'base', false)
        end

        Parallel.map(fqgz2csv0.keys, in_processes: options.parallel) do |fqgz|
          cmds = [
            "unpigz -c #{fqgz}",
            "#{fq1l_convert_command(options)}",
            "#{fq1l_count_command(options)} #{fqgz2csv0[fqgz]}",
            "fq1l match_5end#{grep_prefix_option(options)} #{tso_pattern}",
            "#{fq1l_count_command(options)} #{fqgz2csv1[fqgz]}",
            "fq1l annotate_index --first-cycle=#{options.umi_length+1} --last-cycle=#{options.umi_length+bcl}",
            "fq1l annotate_umi --first-cycle=1 --last-cycle=#{options.umi_length}",
            "fq1l sort_index#{coreutils_prefix_option}#{parallel_option(options)} --buffer-size=#{(options.maximum_memory/(fqgz2csv0.keys.size+1)).to_i}%",
            "fq1l demultiplex #{fqgz2base[fqgz]} #{map}"
          ]
          cmds.insert(2, "#{head_command(options)} -n #{options.reads}") unless options.reads.nil?
          stats = Open3.pipeline(*cmds)
          stats.each_index do |i|
            raise "Fail at process #{i}; #{stats[i]}; #{cmds[i]}" unless stats[i].success? || (stats[i].signaled? && stats[i].termsig == 13)
          end
        end

        system "fq1l sum_counts #{fqgz2csv0.values.join(' ')} > #{base}.count.step1.csv"
        unlink_files(fqgz2csv0.values)

        system "fq1l sum_counts #{fqgz2csv1.values.join(' ')} > #{base}.count.step2.csv"
        unlink_files(fqgz2csv1.values)

        #

        (bcs.values + ['NA']).each do |well|

          STDERR.puts "#{`date`.strip}: Finishing well #{well}..."

          tmpfqgzs = fqgz2base.values.map {|base| "#{base}.#{well}.fq.gz"}
          csvs = Array.new(6) {|i| "#{base}.#{well}.count.step#{i+3}.csv"}

          pipeline("unpigz -c #{tmpfqgzs.join(' ')}",
                   "#{fq1l_convert_command(options)}",
                   "#{fq1l_count_command(options)} #{csvs[0]}",
                   "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
                   "fq1l exclude_duplicate",
                   "#{fq1l_count_command(options)} #{csvs[1]}",
                   "fq1l trim_3end_quality",
                   "#{fq1l_count_command(options)} #{csvs[2]}",
                   "fq1l trim_3end_primer#{coreutils_prefix_option}#{grep_prefix_option(options)}#{parallel_option(options)}",
                   "#{fq1l_count_command(options)} #{csvs[3]}",
                   "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
                   "fq1l exclude_degenerate",
                   "#{fq1l_count_command(options)} #{csvs[4]}",
                   "fq1l trim_5end --minimum-length=#{options.minimum_length} #{tso_pattern}+",
                   "#{fq1l_count_command(options)} #{csvs[5]}",
                   "fq1l restore#{coreutils_prefix_option}",
                   "pigz -c > #{base}.#{well}.fq.gz")

          unlink_files(tmpfqgzs)

        end

      end

      # strt:prepare_ribosome

      desc 'prepare_ribosome DIR', 'Prepare ribosome data'
      long_desc <<-DESC
        Prepare ribosome data files for the specified GENOME at DIR.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_DOWNLOAD
      method_option *OPT_GENOME

      def prepare_ribosome(dir0)

        dir = File.expand_path(dir0)

        if options.genome[0..1] == 'hg'
          if options.download != 'no'
            download_file("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=555853&strand=1&rettype=fasta&retmode=text", "#{dir}/U13369.fa")
            system "pigz #{dir}/U13369.fa" or exit $?.exitstatus
          end
          pipeline("unpigz -c #{dir}/U13369.fa.gz",
                   "gawk '/^>/{ print \">RIBO_U13369.1\" } !/^>/{ printf $1 } END{ print }'",
                   "#{fold_command(options)} -w 50",
                   "pigz -c > #{dir}/ribosome.fa.gz"
                  ) if options.download != 'only'
        else
          pipeline("echo", "pigz -c > #{dir}/ribosome.fa.gz")
        end

      end

      # strt:prepare_spikein

      desc 'prepare_spikein DIR', 'Prepare spikein data'
      long_desc <<-DESC
        Prepare spikein data files at DIR.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_DOWNLOAD

      def prepare_spikein(dir0)

        dir = File.expand_path(dir0)
        zip = "#{dir}/ERCC92.zip"

        download_file("https://tools.thermofisher.com/content/sfs/manuals/ERCC92.zip", zip) if options.download != 'no'
        pipeline("unzip -cq #{zip} ERCC92.fa",
                 "gawk 'BEGIN{p=\"\"} /^>/{ print p \">RNA_SPIKE_\" substr($1, 2); printf(\"AATTC\" ($1 == \">ERCC-00130\" ? \"GAGCTC\" : \"\") ) } /^[ACGT]/{ printf $1; p=\"\\n\" } END{ print }'",
                 "#{fold_command(options)} -w 50",
                 "pigz -c > #{dir}/spikein.fa.gz") if options.download != 'only'

      end

      # strt:prepare_transcriptome

      register(Bio::Gadget::StrtPrepareTranscriptome,
               'prepare_transcriptome',
               'prepare_transcriptome GENOME',
               'Prepare transcriptome data')

      # strt:prepare_variation

      desc 'prepare_variation DIR', 'Prepare variation data'
      long_desc <<-DESC
        Prepare genome variation data files for the specified GENOME, based on common variations in dbSNP BUILD, at DIR.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_DOWNLOAD
      method_option *OPT_GENOME

      method_option :dbsnp,
                    banner: 'BUILD',
                    default: 146,
                    desc: 'Build number of dbSNP',
                    type: :numeric

      def prepare_variation(dir0)

        dir = File.expand_path(dir0)
        snp = "#{dir}/#{options.genome}.snp#{options.dbsnp}Common.txt.gz"

        rsync_file("rsync://hgdownload.soe.ucsc.edu/goldenPath/#{options.genome}/database/snp#{options.dbsnp}Common.txt.gz", snp) if options.download != 'no'
        pipeline("unpigz -c #{dir}/genome.fa.gz",
                 "hisat2_extract_snps_haplotypes_UCSC.py - #{snp} #{dir}/variation"
                ) if options.download != 'only'

      end

      # strt:qualify

      desc 'qualify CSV REFDIR SEQDIR MAPDIR', 'Qualify samples'
      long_desc <<-DESC
        Qualify samples in a design CSV.
      DESC

      method_option *OPT_BUFFER_SIZE
      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_PARALLEL

      def qualify(csv, refdir, seqdir, mapdir)

        count_commands = ["#{cut_command} -f 5",
                          "ruby -e 'n=0; while gets; n+=$_.to_i; end; puts n'"]

        samples = CSV.read(csv, {
                             headers: true,
                             converters: :numeric
                           })
        bases = samples["BASE"]

        samples["TOTAL_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            stat = CSV.table("#{seqdir}/#{base}.count.step8.csv")
            n = 0
            stat[:reads].each {|i| n += i }
            n
          end

        samples["MAPPED_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("unpigz -c #{mapdir}/#{base}.bed.gz",
                              *count_commands).to_i
          end

        tmp = Array.new
        samples.each do |row|
          tmp << row["MAPPED_READS"].to_f / row["TOTAL_READS"]
        end
        samples["MAPPED_RATE"] = tmp

        samples["RIBOSOME_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/ribosome.bed.gz",
                              *count_commands).to_i
          end

        samples["SPIKEIN_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/spikein_whole.bed.gz",
                              *count_commands).to_i
          end

        samples["SPIKEIN_5END_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/spikein_5end.bed.gz",
                              *count_commands).to_i
          end

        tmp = Array.new
        samples.each do |row|
          tmp << row["SPIKEIN_5END_READS"].to_f / row["SPIKEIN_READS"]
        end
        samples["SPIKEIN_5END_RATE"] = tmp

        tmp = Array.new
        samples.each do |row|
          tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
        end
        samples["RELATIVE_POLYA_RNAS"] = tmp

        samples["CODING_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/transcriptome.coding_whole.bed.gz",
                              *count_commands).to_i
          end

        samples["CODING_5END_READS"] =
          Parallel.map(bases, in_threads: options.parallel) do |base|
            pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/transcriptome.coding_5end.bed.gz",
                              *count_commands).to_i
          end

        tmp = Array.new
        samples.each do |row|
          tmp << row["CODING_5END_READS"].to_f / row["CODING_READS"]
        end
        samples["CODING_5END_RATE"] = tmp

        tmp = Array.new
        samples.each do |row|
          tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_5END_READS"]
        end
        samples["RELATIVE_MRNAS"] = tmp

        puts samples

      end

      # strt:quantify

      desc 'quantify CSV REFDIR MAPDIR REGBASE',
           'Quantify samples'
      long_desc <<-DESC
        Count reads per region for multiple samples in CSV. Read counts (a BED-format COUNT) within regions (REFDIR/ribosome.bed.gz, REFDIR/spikein_5end.bed.gz & REGBASE.bed.gz) are summed by the region names. Additional columns in REGBASE.csv are attached as annotations.
      DESC

      method_option *OPT_COREUTILS_PREFIX
      method_option *OPT_PARALLEL

      def quantify(csv, refdir, mapdir, regbase)

        samples = CSV.read(csv, { headers: true, converters: :numeric })

        tmp = CSV.read("#{regbase}.csv", { headers:true, converters: :numeric })
        anns = tmp.headers; anns.delete('ID')
        name2anns = Hash.new
        tmp.each {|row| name2anns[row['ID']] = row.values_at(*anns)}

        bases = samples["BASE"]
        puts ( ['ID'] + anns +
               bases.map {|base| "N|#{base}"} +
               bases.map {|base| "R|#{base}"} ).join(',')

        name2base2cnt = Hash.new
        base2spike = Hash.new
        @locker = Mutex.new
        Parallel.map(samples["BASE"], in_threads: options.parallel) do |base|
          base2spike[base] = 0.0
          fp = open("| strt count_per_region#{coreutils_prefix_option} #{mapdir}/#{base}.bed.gz #{refdir}/ribosome.bed.gz #{refdir}/spikein_5end.bed.gz #{regbase}.bed.gz")
          fp.gets
          fp.each do |line|
            name, cnt = line.strip.split /,/
            @locker.synchronize do
              if !name2base2cnt.key?(name)
                name2base2cnt[name] = Hash.new
                name2base2cnt[name][base] = Hash.new
              elsif !name2base2cnt[name].key?(base)
                name2base2cnt[name][base] = Hash.new
              end
              name2base2cnt[name][base] = cnt
              base2spike[base] += cnt.to_f if name =~ /^RNA_SPIKE_/
            end
          end
          fp.close
        end

        name2base2cnt.each do |name, base2cnt|
          puts ( [name] +
                 (name2anns.key?(name) ?
                    name2anns[name] : Array.new(anns.length, 'NA')) +
                 bases.map {|base| base2cnt.key?(base) ?
                              base2cnt[base].to_f/base2spike[base]*1000 : 0} +
                 bases.map {|base| base2cnt.key?(base) ? base2cnt[base] : 0}
               ).join(',')
        end

      end

      #

      no_commands do

        def download_option(options)
          " --download=#{options.download}"
        end

        def genome_option(options)
          " --genome=#{options.genome}"
        end

        def pipeline_readline(*cmds)
          fp, ths = Open3.pipeline_r(*cmds)
          line = fp.gets.strip
          fp.close
          ths[-1].join
          line
        end

        def rsync_file(remote, local)
          system "rsync -a #{remote} #{local}" or exit $?.exitstatus
        end

      end

    end
  end
end

require 'bio/gadget/strt/count.rb'
require 'bio/gadget/strt/depth.rb'
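
The prepare_reads description above gives one concrete mapping: with MAP containing 'CAAAGT,A2' and BASE '~/test', reads carrying a CAAAGT-like barcode end up in '~/test.A2.fq.gz'. A minimal sketch of that invocation follows, assuming hypothetical input names (barcodes.csv and lane1_R1.fq.gz are placeholders, not files shipped with the gem):

  # demultiplex_sketch.rb -- illustrative only; mirrors the 'CAAAGT,A2' example
  # from the prepare_reads long_desc above.
  File.write('barcodes.csv', "CAAAGT,A2\n")   # barcode,well map as described above

  # Reads with a CAAAGT-like barcode should land in ~/test.A2.fq.gz; the loop
  # over (bcs.values + ['NA']) above also writes an 'NA' output for the rest.
  cmd = "strt prepare_reads #{File.expand_path('~/test')} barcodes.csv lane1_R1.fq.gz"
  system(cmd) or abort "prepare_reads failed: #{cmd}"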
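
Read together, the task descriptions above imply an order of operations for one STRT run: build_index, then prepare_reads, then alignment, then qualify and quantify, which both print CSV to standard output. The sketch below chains the subcommands under those assumptions; refdir, seqdir, mapdir, design.csv and regions.bed.gz/regions.csv are placeholder names, and design.csv is assumed to carry the BASE column that qualify and quantify read.

  # workflow_sketch.rb -- illustrative chaining of the 0.5.0 subcommands;
  # every path below is a placeholder.
  def run(cmd)
    STDERR.puts cmd
    system(cmd) or abort "failed: #{cmd}"
  end

  run 'strt build_index --genome=hg38 refdir'                           # HISAT2 index at refdir/ref.*.ht2
  run 'strt prepare_reads seqdir/run1 barcodes.csv lane1_R1.fq.gz'      # demultiplexed seqdir/run1.*.fq.gz
  run 'strt alignment refdir seqdir mapdir'                             # mapdir/*.bam and per-base 5'-end mapdir/*.bed.gz
  run 'strt qualify design.csv refdir seqdir mapdir > qc.csv'           # per-sample QC table
  run 'strt quantify design.csv refdir mapdir regions > expression.csv' # N|BASE (spike-in normalized) and R|BASE (raw) columns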