bio-gadget 0.4.8 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,53 @@
1
+ require 'parallel'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class Strt < Bio::Gadget
6
+
7
desc 'count SMP BASE BED [BED ...]',
     "Count 5'-ends at BASE in each region defined by BEDs"

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_PARALLEL)

# Print a CSV table of per-sample counts for every region ID found in the
# given BED files.
#
# smp  - path of a comma-separated sample sheet whose header row contains
#        the columns 'NAME' and '5pBEDs'; the latter holds a
#        ';'-separated list of BED name stems.
# base - prefix put in front of every stem; the file actually read is
#        "#{base}#{stem}.5p.bed.gz".
# bed0, beds - one or more region BED files; they are sorted together
#        into one temporary file that is used as the `-a` input of
#        `bedtools intersect`.
#
# Output (STDOUT): a header line "ID,R|<name>,..." followed by one line
# per region ID, with 0 for samples that reported no count for that ID.
def count(smp, base, bed0, *beds)
  coreutils = options.coreutils_prefix

  # File.open (not Kernel#open) so that a path starting with '|' can never
  # be run as a command; the block form closes the handle even on error.
  smps = {}
  File.open(smp) do |fp|
    header = fp.gets.rstrip.split(',')
    idx_name = header.index('NAME')
    idx_beds = header.index('5pBEDs')
    fp.each do |line|
      cols = line.rstrip.split(',')
      next if cols.empty? # tolerate blank lines in the sample sheet
      smps[cols[idx_name]] = cols[idx_beds].split(';')
    end
  end

  tmpfile = get_temporary_path('strt.count', 'bed')
  system "#{coreutils}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bed0} #{beds.join(' ')} > #{tmpfile}"

  # counts[id][sample name] = count; filled from several threads, hence
  # every write goes through the mutex.
  counts = Hash.new { |hash, id| hash[id] = {} }
  mutex = Mutex.new
  Parallel.each(smps.keys, in_threads: options.parallel) do |name|
    bed5ps = smps[name].map { |bed| "#{base}#{bed}.5p.bed.gz" }
    command = "bedtools intersect -nonamecheck -wa -wb -s -sorted -a #{tmpfile} -b #{bed5ps.join(' ')} | #{coreutils}cut -f 4,10 | #{coreutils}sort -u | #{coreutils}cut -f 1 | #{coreutils}uniq -c"
    # The block form of IO.popen closes the pipe once it has been drained.
    IO.popen(command) do |io|
      io.each_line do |line|
        # `uniq -c` prints "<count> <id>"
        cnt, id = line.strip.split(' ')
        mutex.synchronize { counts[id][name] = cnt.to_i }
      end
    end
  end

  names = smps.keys.sort
  puts((['ID'] + names.map { |name| "R|#{name}" }).join(','))
  counts.each do |id, name2count|
    puts(([id] + names.map { |name| name2count.fetch(name, 0) }).join(','))
  end
end
50
+
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,124 @@
1
+ require 'parallel'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class Strt < Bio::Gadget
6
+
7
# Option definitions as [name, settings] pairs, meant to be splatted into
# `method_option`.  The arrays and hashes are deliberately left unfrozen.

# Length of the barcode, in nucleotides.
OPT_LENGTH_BARCODE = [
  :length_barcode,
  {
    banner: 'NT',
    default: 6,
    desc: 'Length of barcode',
    type: :numeric
  }
]

# Length of the polyG gap, in nucleotides.
OPT_LENGTH_GAP = [
  :length_gap,
  {
    banner: 'NT',
    default: 3,
    desc: 'Length of gap (polyG)',
    type: :numeric
  }
]

# Minimum length that must remain after preprocessing, in nucleotides.
OPT_LENGTH_MINIMUM = [
  :length_minimum,
  {
    banner: 'NT',
    default: 25,
    desc: 'Minimum length after preprocess',
    type: :numeric
  }
]

# Quality characters that are treated as low quality.
OPT_LOW_QUALITIES = [
  :low_qualities,
  {
    banner: 'CHARACTERS',
    default: '!"#',
    desc: 'Low quality characters',
    type: :string
  }
]
27
+
28
desc 'depth FQGZ [FQGZ ...]',
     'Count nonredundant reads according to the sequencing depths'

method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_GREP_PREFIX)

method_option(*OPT_LENGTH_BARCODE)
method_option(*OPT_LENGTH_GAP)
method_option(*OPT_LENGTH_MINIMUM)
method_option(*OPT_UMI_LENGTH)
method_option(*OPT_LOW_QUALITIES)

method_option :tss,
              default: false,
              desc: 'Check number of TSSs, instead of STRT reads',
              type: :boolean

# For each of twelve draws, print one "raw,nr" line to STDOUT.
#
# Every gzipped FASTQ is first converted (in parallel threads) with
# `fq1l convert` into its own temporary file.  For draw = 1..12 the
# concatenated temporary files are piped through
# `fq1l to <draw> <12 - draw>`; a tee into a FIFO lets a separate `wc -l`
# count the lines at that point ("raw"), while the rest of the pipeline
# filters the stream and ends in another `wc -l` ("nr").  With --tss an
# extra `fq1l mt5 | cut | sort -u` stage runs before that final `wc -l`.
# NOTE(review): the exact meaning of the `fq1l to` arguments is not visible
# here - confirm against the fq1l tool.
def depth(fqgz, *fqgzs0)
  # configure_depth returns eleven values; the barcode, gap and UMI
  # lengths are not needed by this method.
  buf_opt, cu_opt, grep_opt, par_opt,
    _barcode_len, _gap_len, min_len, proc_len, _umi_len,
    pattern, cu_prefix = configure_depth(options)

  fqgzs = [fqgz] + fqgzs0
  tmpfiles = fqgzs.map { get_temporary_path('strt.depth', 'fq1l') }

  tsscmd = if options.tss
             "fq1l mt5 --minimum-length=#{min_len} #{pattern}+ | #{cu_prefix}cut -f 2 | #{sortCommand(options)} -u |"
           else
             ''
           end

  Parallel.each((0...fqgzs.length).to_a, in_threads: options.parallel) do |idx|
    system "gunzip -c #{fqgzs[idx]} | fq1l convert #{cu_opt} > #{tmpfiles[idx]}"
  end

  (1..12).each do |draw|
    fifo = get_fifo('strt.depth', 'fq1l')
    # Reader on the FIFO must be started before the writer pipeline.
    raw_counter = open("| #{cu_prefix}wc -l #{fifo}")
    # The heredoc stays at column 0: Kernel#open only treats the string as
    # a command when its very first character is '|'.
    nr_counter = open(<<CMD
| LC_ALL=C cat #{tmpfiles.join(' ')} \
| fq1l to #{draw} #{12-draw} \
| #{tee_command(options)} #{fifo} \
| fq1l nr #{buf_opt} #{cu_opt} #{par_opt} \
| fq1l m5 #{grep_opt} #{pattern} \
| fq1l m5 #{grep_opt} --invert-match '[^\\t]*N' \
| fq1l qt3 --low-qualities='#{options.low_qualities}' --minimum-length=#{proc_len} \
| fq1l pt3 --primer=AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG --minimum-length=#{proc_len} #{cu_opt} #{grep_opt} \
| fq1l nr #{buf_opt} --degenerated-mode #{cu_opt} #{par_opt} \
| #{tsscmd} #{cu_prefix}wc -l
CMD
    )
    # `wc -l FILE` prints "<count> <file>"; keep only the count.
    raw = raw_counter.gets.strip.split(/\s+/)[0]
    raw_counter.close
    nr = nr_counter.gets.strip
    nr_counter.close
    puts [raw, nr].join(',')
  end
end
96
+
97
+ no_commands do
98
+
99
# Translate the command options into the pieces used to build the `depth`
# shell pipelines.
#
# options - object answering key?('buffer_size') and the readers
#           buffer_size, coreutils_prefix, grep_prefix, parallel,
#           umi_length, length_barcode, length_gap and length_minimum.
#
# Returns an 11-element Array:
#   [0]  '--buffer-size=...' or '' when no buffer size was given
#   [1]  '--coreutils-prefix=...' or '' when the prefix is empty
#   [2]  '--grep-prefix=...' or '' when the prefix is empty
#   [3]  '--parallel=...'
#   [4]  barcode length
#   [5]  gap length
#   [6]  minimum length
#   [7]  minimum + UMI + barcode + gap length
#   [8]  UMI length
#   [9]  match pattern: one '.' per UMI and barcode base followed by
#        (gap length - 1) 'G's
#   [10] the raw coreutils prefix
def configure_depth(options)
  umi_length = options.umi_length
  barcode_length = options.length_barcode
  gap_length = options.length_gap
  minimum_length = options.length_minimum
  coreutils_prefix = options.coreutils_prefix
  grep_prefix = options.grep_prefix

  # String#* rejects a negative count, so a gap length of 0 must not turn
  # into 'G' * -1.
  poly_g = 'G' * [gap_length - 1, 0].max

  [
    # Interpolation (rather than String#+) also accepts a numeric size.
    options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : '',
    coreutils_prefix == '' ? '' : "--coreutils-prefix=#{coreutils_prefix}",
    grep_prefix == '' ? '' : "--grep-prefix=#{grep_prefix}",
    "--parallel=#{options.parallel}",
    barcode_length,
    gap_length,
    minimum_length,
    minimum_length + umi_length + barcode_length + gap_length,
    umi_length,
    "#{'.' * umi_length}#{'.' * barcode_length}#{poly_g}",
    coreutils_prefix
  ]
end
119
+
120
+ end
121
+
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,230 @@
1
+ require 'open3'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class StrtPrepareTranscriptome < Bio::Gadget
6
+
7
+ package_name :prepare_transcriptome
8
+
9
+ #
10
+
11
desc 'hg38 DIR', 'GRCh38/hg38 - human'
long_desc <<-DESC
  Prepare transcriptome data files based on GENCODE gene annotation RELEASE for GRCh38/hg38 at DIR, where it has 'ref.fa.fai' genome index file.
DESC

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_DOWNLOAD)
method_option(*OPT_GREP_PREFIX)

method_option :gencode,
              banner: 'RELEASE',
              default: 25,
              desc: 'Release number of GENCODE',
              type: :numeric

# Download (unless --download=no) the GENCODE annotation GTF of the selected
# release into DIR and, unless --download=only, derive the transcriptome
# files from it:
#   * transcriptome.splice_sites / transcriptome.exons (hisat2 helpers)
#   * ribosome.bed, spikein_whole.bed, spikein_5end.bed from the RIBO_* and
#     RNA_SPIKE_* entries of DIR/ref.fa.fai
#   * transcriptome.{coding,other}_*.bed; a transcript counts as "coding"
#     when the GTF has a start_codon record for it
#   * transcriptome.coding_5end / coding_whole (.bed and .csv) via
#     merge_bed_by_gene
#   * hg38_refGene.txt and hg38_refGeneMrna.fa
# All *.bed files in DIR are compressed with pigz before the refGene step.
#
# dir0 - directory holding 'ref.fa' and 'ref.fa.fai'; outputs go there too.
def hg38(dir0)
  dir = File.expand_path(dir0)
  release = options.gencode
  gtf = "#{dir}/hg38.gencode.v#{release}.annotation.gtf.gz"

  if options.download != 'no'
    # The release directory must follow --gencode as well; with a fixed
    # 'release_25' any other release number asks for a file that does not
    # exist in that directory.
    download_file(
      "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_#{release}/gencode.v#{release}.annotation.gtf.gz",
      gtf
    )
  end

  if options.download != 'only'

    pipeline("unpigz -c #{gtf}",
             "hisat2_extract_splice_sites.py - > #{dir}/transcriptome.splice_sites")

    pipeline("unpigz -c #{gtf}",
             "hisat2_extract_exons.py - > #{dir}/transcriptome.exons")

    # Ribosomal and spike-in regions straight from the genome index.
    fp_ribosome = open_bed_w("#{dir}/ribosome.bed")
    fp_whole = open_bed_w("#{dir}/spikein_whole.bed")
    fp_5end = open_bed_w("#{dir}/spikein_5end.bed")
    # File.foreach closes the index file when the iteration is over.
    File.foreach("#{dir}/ref.fa.fai") do |line|
      acc, len = line.rstrip.split
      if acc =~ /^RIBO_/
        fp_ribosome.puts [acc, 0, len, acc, 0, '+'].join("\t")
        fp_ribosome.puts [acc, 0, len, acc, 0, '-'].join("\t")
      end
      if acc =~ /^RNA_SPIKE_/
        fp_whole.puts [acc, 0, len, acc, 0, '+'].join("\t")
        fp_5end.puts [acc, 0, 50, acc, 0, '+'].join("\t")
      end
    end
    fp_ribosome.close
    fp_whole.close
    fp_5end.close

    # transcript_id => start-codon coordinate facing the 5' side
    # (GTF column 4 on '+', column 5 on '-'; 1-based).
    atgs = {}
    regex_transcript_id = /transcript_id "([^"]+)"/
    regex_gene_id = /gene_id "([^"]+)"/
    regex_gene_name = /gene_name "([^"]+)"/
    regex_exon_number = /exon_number (\d+)/
    Open3.pipeline_r(
      "unpigz -c #{gtf}",
      "#{grep_command} '\tstart_codon\t'") do |fp, _threads|
      fp.each do |line|
        cols = line.rstrip.split(/\t/)
        atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
          cols[cols[6] == '+' ? 3 : 4].to_i
      end
      fp.close
    end

    bed_coding_exon = "#{dir}/transcriptome.coding_exon.bed"
    bed_coding_5utr = "#{dir}/transcriptome.coding_5utr.bed"
    bed_coding_promoter = "#{dir}/transcriptome.coding_promoter.bed"

    fp_coding_gene = open_bed_w("#{dir}/transcriptome.coding_gene.bed")
    fp_coding_exon = open_bed_w(bed_coding_exon)
    fp_coding_5utr = open_bed_w(bed_coding_5utr)
    fp_coding_promoter = open_bed_w(bed_coding_promoter)
    fp_other_gene = open_bed_w("#{dir}/transcriptome.other_gene.bed")
    fp_other_exon = open_bed_w("#{dir}/transcriptome.other_exon.bed")
    fp_other_1st_exon = open_bed_w("#{dir}/transcriptome.other_1st_exon.bed")
    fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")
    Open3.pipeline_r(
      "unpigz -c #{gtf}",
      "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, _threads|
      fp.each do |line|
        cols = line.rstrip.split(/\t/)
        ann = cols[8]
        transcript_id = regex_transcript_id.match(ann).to_a[1]
        gene_id = regex_gene_id.match(ann).to_a[1]
        gene_name = regex_gene_name.match(ann).to_a[1]
        # nil.to_i is 0 when the record carries no exon_number attribute
        exon_number = regex_exon_number.match(ann).to_a[1].to_i
        chr = cols[0]
        left = cols[3].to_i
        right = cols[4].to_i
        str = cols[6]
        acc = "#{gene_name}|#{gene_id}|#{transcript_id}"
        # GTF is 1-based inclusive, BED 0-based half-open: start - 1.
        exon = [chr, left-1, right, acc, 0, str].join("\t")

        if cols[2] == 'transcript'
          if atgs.key?(transcript_id)
            fp_coding_gene.puts exon
          else
            fp_other_gene.puts exon
          end
          next
        end

        if atgs.key?(transcript_id)
          fp_coding_exon.puts exon
          atg = atgs[transcript_id]
          if (str == '+' && right < atg) || (str == '-' && atg < left)
            # exon lies entirely on the 5' side of the start codon
            fp_coding_5utr.puts exon
          elsif str == '+' && left < atg && atg <= right
            # exon contains the start codon: keep the part before it
            fp_coding_5utr.puts [chr, left-1, atg-1, acc, 0, '+'].join("\t")
          elsif str == '-' && left <= atg && atg < right
            fp_coding_5utr.puts [chr, atg, right, acc, 0, '-'].join("\t")
          end
          if exon_number == 1
            # window directly upstream of the first exon
            if str == '+'
              fp_coding_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
            else
              fp_coding_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
            end
          end
        else
          fp_other_exon.puts exon
          if exon_number == 1
            fp_other_1st_exon.puts exon
            if str == '+'
              fp_other_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
            else
              fp_other_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
            end
          end
        end
      end
      fp.close
    end
    fp_coding_gene.close
    fp_coding_exon.close
    fp_coding_5utr.close
    fp_coding_promoter.close
    fp_other_gene.close
    fp_other_exon.close
    fp_other_1st_exon.close
    fp_other_promoter.close

    merge_bed_by_gene("Coding5end",
                      "5UTR and the proximal upstream of protein coding genes based on GENCODE version #{release} for GRCh38/hg38",
                      "#{dir}/transcriptome.coding_5end",
                      bed_coding_promoter, bed_coding_5utr)
    merge_bed_by_gene("CodingWhole",
                      "Exon and the proximal upstream of protein coding genes based on GENCODE version #{release} for GRCh38/hg38",
                      "#{dir}/transcriptome.coding_whole",
                      bed_coding_promoter, bed_coding_exon)

    sh "pigz #{dir}/*.bed"

    genePred = "#{dir}/hg38_refGene.txt"
    pipeline(
      "unpigz -c #{gtf}",
      "#{grep_command} 'tag \"basic\"'",
      "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
    sh "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}"

  end
end
175
+
176
+ no_commands do
177
+
178
# Open a writable pipe into `bedtools sort -i stdin` whose standard output
# is redirected to the file `bed`.
#
# bed - path of the BED file to create.
#
# Returns the IO to write unsorted BED lines to; closing it waits for
# bedtools to finish.
def open_bed_w(bed)
  # Array command + `out:` redirection: no shell is involved, so a path with
  # spaces or shell metacharacters is passed through untouched, and the
  # deprecated Kernel#open("| cmd") form is avoided.
  IO.popen(['bedtools', 'sort', '-i', 'stdin'], 'w', out: bed)
end
181
+
182
# Merge the records of several BED files gene by gene.
#
# The name column of the inputs is split on '|'; its first part becomes the
# gene symbol and the remaining parts are carried along as an extra column.
# After sorting by symbol, the records of each symbol go through their own
# `bedtools sort | bedtools merge -s` run whose output is appended to one
# temporary BED.
#
# tname  - track name written into the track line of "#{base}.bed"
# tdesc  - track description written into the same line
# base   - output path stem; "#{base}.csv" (ID,ACCESSIONS) and
#          "#{base}.bed" are written
# inbeds - paths of the input BED files
def merge_bed_by_gene(tname, tdesc, base, *inbeds)
  merged_path = get_temporary_path('strt.prepare_transcriptome', 'bed')
  csv = open("#{base}.csv", 'w')
  csv.puts "ID,ACCESSIONS"

  Open3.pipeline_r(
    "#{cat_command} #{inbeds.join(' ')}",
    "ruby -anle 'as=$F[3].split /\\|/; puts ($F[0..2]+[as[0]]+$F[4..-1]+[as[1..-1].join(\"|\")]).join(\"\t\")'",
    "#{sort_command} -k 4,4 -k 1,1 -k 2,2n"
  ) do |sorted, _threads|
    current = ''
    transcripts_by_gene = {}
    writer = nil
    writer_threads = nil

    # Finish the merge pipeline of the symbol collected so far and write its
    # annotation line; does nothing before the first symbol was seen.
    finish_gene = lambda do
      unless writer.nil?
        writer.close
        # wait for the last stage so its output is complete before the next
        # pipeline appends to the same file
        writer_threads[-1].join
        summary = transcripts_by_gene.map { |gene, transcripts| "#{gene}:#{transcripts.join(';')}" }
        csv.puts "#{current},#{summary.join('|')}"
        transcripts_by_gene = {}
      end
    end

    sorted.each do |record|
      chr, left, right, sym, score, str, accs = record.rstrip.split(/\t/)
      gene_acc, transcript_acc = accs.split(/\|/)

      unless current == sym
        finish_gene.call
        writer, writer_threads = Open3.pipeline_w(
          "bedtools sort -i stdin",
          "bedtools merge -s -c 4 -o distinct >> #{merged_path}"
        )
        current = sym
      end

      writer.puts [chr, left, right, sym, score, str].join("\t")
      known = (transcripts_by_gene[gene_acc] ||= [])
      known << transcript_acc unless known.include?(transcript_acc)
    end

    finish_gene.call
  end

  csv.close
  sh "echo 'track name=#{tname} description=\"#{tdesc}\" visibility=3 colorByStrand=\"38,139,210 203,75,22\"' > #{base}.bed"
  pipeline("ruby -anle 'puts ($F.values_at(0, 1, 2) + [$F[4], 0, $F[3]]).join(\"\t\")' < #{merged_path}",
           "bedtools sort -i stdin >> #{base}.bed")
end
226
+ end
227
+
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,135 @@
1
+ require 'bio'
2
+ require 'parallel'
3
+
4
+ module Bio
5
+ class Gadgets < Bio::Gadget
6
+
7
desc 'find PATTERN [NAME]',
     'Find fragments matching with regexp PATTERN from FASTA-format STDIN'

method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)

method_option :ignore_case,
              default: true,
              desc: 'Fold lower case to upper case characters',
              type: :boolean

# Search every FASTA entry read from STDIN for PATTERN on both strands and
# print the hits as sorted BED lines.
#
# pattern - regular expression source; compiled with the ignore-case flag
#           taken from --ignore-case
# name0   - value for the BED name column; the pattern itself when empty
#
# One child process is forked per entry (at most --parallel are left
# running) and writes its hits through `sort` into its own temporary file;
# the files are finally combined with `sort --merge` and removed.
def find(pattern, name0 = '')
  buffer_opt = if options.key?('buffer_size')
                 '--buffer-size=' + options.buffer_size
               else
                 ''
               end
  sort_cmd = "#{options.coreutils_prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4"
  re = Regexp.new("(#{pattern})", options.ignore_case)
  label = name0 == '' ? pattern : name0

  pids = []
  bed_paths = []
  fasta = Bio::FlatFile.open(Bio::FastaFormat, STDIN)
  fasta.each do |entry|
    bed_path = get_temporary_path('find', 'bed', false)
    bed_paths << bed_path
    acc = entry.entry_id
    seq = entry.seq
    pids << Process.fork do
      out = open("| #{sort_cmd} #{buffer_opt} > #{bed_path}", 'w')

      # Forward strand.  The next search starts one base after the start of
      # the previous hit, so overlapping hits are reported as well.
      offset = 0
      while (hit = re.match(seq, offset))
        out.puts [acc, hit.begin(1), hit.end(1), label, '0', '+'].join("\t")
        offset = hit.begin(1) + 1
      end

      # Reverse strand: search the reverse complement and map the
      # coordinates back onto the forward strand.
      revcomp = seq.reverse.tr('acgtACGT', 'tgcaTGCA')
      total = revcomp.length
      offset = 0
      while (hit = re.match(revcomp, offset))
        out.puts [acc, total - hit.end(1), total - hit.begin(1), label, '0', '-'].join("\t")
        offset = hit.begin(1) + 1
      end

      out.close
    end
    # Wait for a child to finish once the limit is reached.
    pids.delete(Process.wait) while pids.length == options.parallel
  end
  fasta.close
  pids.delete(Process.wait) until pids.empty?

  system "#{sort_cmd} --merge #{buffer_opt} #{bed_paths.join(' ')}"
  unlink_files(bed_paths)
end
69
+
70
+ #
71
+
72
desc 'gap BEDgz1 BEDgz2',
     "Calculate gap distances from 5'-end of fragments 1 to 3'-end of fragments 2"

method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)

method_option :minimum_gap,
              default: -10000,
              desc: 'Minimum gap distance to be reported',
              type: :numeric

method_option :maximum_gap,
              default: 2500,
              desc: 'Maximum gap distance to be reported',
              type: :numeric

# Print "name1,name2,distance" lines for every pair of a fragment from
# BEDgz1 and a fragment from BEDgz2 on the same chromosome whose distance
# satisfies minimum_gap <= distance < maximum_gap.
#
# For a '+' fragment 1 the distance is (end of fragment 2) - (start of
# fragment 1) + 1; for any other strand it is (end of fragment 1) -
# (start of fragment 2) + 1.
#
# bedgz1, bedgz2 - gzipped BED files.
#
# Chromosomes are processed in parallel processes, each writing to its own
# temporary CSV; the CSVs are then concatenated to STDOUT and removed.
def gap(bedgz1, bedgz2)
  coreutils = options.coreutils_prefix

  # The block form of IO.popen closes the pipe after reading.
  chrs = IO.popen("unpigz -c #{bedgz1} #{bedgz2} | #{coreutils}cut -f 1 | #{coreutils}uniq | #{coreutils}sort -u") do |io|
    io.each_line.map(&:rstrip).uniq
  end

  max = options.maximum_gap
  min = options.minimum_gap
  sep = /\t/
  tmpfiles = {}
  chrs.each do |chr|
    tmpfiles[chr] = get_temporary_path('gap', 'csv', false)
  end

  Parallel.each(chrs, in_processes: options.parallel) do |chr|
    # Fragments 2 of this chromosome as [start, end, name, ...]
    bed2 = IO.popen("gunzip -c #{bedgz2} | grep '^#{chr}\t'") do |io|
      io.each_line.map do |line|
        _chr, *cols = line.rstrip.split(sep)
        cols[0] = cols[0].to_i
        cols[1] = cols[1].to_i
        cols
      end
    end

    File.open(tmpfiles[chr], 'w') do |out|
      IO.popen("gunzip -c #{bedgz1} | grep '^#{chr}\t'") do |io|
        io.each_line do |line|
          _chr, start, stop, name, _score, str = line.rstrip.split(sep)
          forward = str == '+'
          # parse the coordinates once per fragment 1, not once per pair
          start_pos = start.to_i
          stop_pos = stop.to_i
          bed2.each do |bed|
            dist = forward ? bed[1] - start_pos + 1 : stop_pos - bed[0] + 1
            out.puts [name, bed[2], dist].join(',') if min <= dist && dist < max
          end
        end
      end
    end
  end

  paths = tmpfiles.values
  # `cat` without file arguments would read STDIN and block.
  system "cat #{paths.join(' ')}" unless paths.empty?
  # Hand over the paths themselves (as `find` does), not the chr => path Hash.
  unlink_files(paths)
end
133
+
134
+ end
135
+ end