bio-gadget 0.4.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ require 'parallel'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class Strt < Bio::Gadget
6
+
7
desc 'count SMP BASE BED [BED ...]',
     "Count 5'-ends at BASE in each region defined by BEDs"

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_PARALLEL)

# Count, for every sample, the distinct entries overlapping each region and
# print the result to STDOUT as a CSV table (one row per region ID, one
# "R|<sample>" column per sample, 0 where a sample has no hit).
#
# smp  - path of a comma-separated sample sheet. Its header row must contain
#        the columns 'NAME' and '5pBEDs'; '5pBEDs' is a ';'-separated list.
# base - string prepended to every '5pBEDs' item; the file that is read is
#        "<base><item>.5p.bed.gz".
# bed0, beds - one or more BED files with the regions; they are concatenated
#        and sorted into a temporary file before intersecting.
#
# Raises ArgumentError when the sample sheet has no usable header.
#
# NOTE(review): `cut -f 4,10` presumably assumes 6-column region BEDs, so that
# field 10 is the name column of the 5'-end BED -- confirm against the inputs.
# NOTE(review): rows are printed in the order regions were first seen by the
# worker threads, so row order can differ between runs.
def count(smp, base, bed0, *beds)
  c_prefix = options.coreutils_prefix

  # Sample name => list of 5'-end BED basenames.
  samples = {}
  File.open(smp) do |fp|
    header = fp.gets.to_s.rstrip.split(',')
    idx_name = header.index('NAME')
    idx_beds = header.index('5pBEDs')
    if idx_name.nil? || idx_beds.nil?
      raise ArgumentError, "#{smp}: header must contain 'NAME' and '5pBEDs' columns"
    end
    fp.each do |line|
      cols = line.rstrip.split(',')
      next if cols.empty? # tolerate blank (e.g. trailing) lines
      samples[cols[idx_name]] = cols[idx_beds].split(';')
    end
  end

  tmpfile = get_temporary_path('strt.count', 'bed')
  system "#{c_prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bed0} #{beds.join(' ')} > #{tmpfile}"

  # Region ID => { sample name => count }; shared by all workers, hence the mutex.
  counts = {}
  mutex = Mutex.new
  Parallel.each(samples.keys, in_threads: options.parallel) do |name|
    bed5ps = samples[name].map { |bed| "#{base}#{bed}.5p.bed.gz" }
    command = "bedtools intersect -nonamecheck -wa -wb -s -sorted -a #{tmpfile} -b #{bed5ps.join(' ')} | #{c_prefix}cut -f 4,10 | #{c_prefix}sort -u | #{c_prefix}cut -f 1 | #{c_prefix}uniq -c"
    # The block form closes the pipe and reaps the child when done.
    IO.popen(command) do |io|
      io.each_line do |line|
        # `uniq -c` prints "<count> <id>"; limit the split so that an ID
        # containing blanks is kept whole.
        cnt, id = line.strip.split(' ', 2)
        mutex.synchronize { (counts[id] ||= {})[name] = cnt.to_i }
      end
    end
  end

  names = samples.keys.sort
  puts((['ID'] + names.map { |name| "R|#{name}" }).join(','))
  counts.each do |id, name2count|
    puts(([id] + names.map { |name| name2count.fetch(name, 0) }).join(','))
  end
end
50
+
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,124 @@
1
+ require 'parallel'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class Strt < Bio::Gadget
6
+
7
# Option specifications in the form [name, settings]; each one is splatted
# into `method_option` by the commands that accept it.

# Barcode length, in nucleotides.
OPT_LENGTH_BARCODE = [
  :length_barcode,
  { banner: 'NT', default: 6, desc: 'Length of barcode', type: :numeric }
]

# Length of the gap (polyG), in nucleotides.
OPT_LENGTH_GAP = [
  :length_gap,
  { banner: 'NT', default: 3, desc: 'Length of gap (polyG)', type: :numeric }
]

# Minimum length that must remain after preprocessing, in nucleotides.
OPT_LENGTH_MINIMUM = [
  :length_minimum,
  { banner: 'NT', default: 25, desc: 'Minimum length after preprocess', type: :numeric }
]

# Quality characters that are treated as low quality.
OPT_LOW_QUALITIES = [
  :low_qualities,
  { banner: 'CHARACTERS', default: '!"#', desc: 'Low quality characters', type: :string }
]
27
+
28
desc 'depth FQGZ [FQGZ ...]',
     'Count nonredundant reads according to the sequencing depths'

method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_GREP_PREFIX)

method_option(*OPT_LENGTH_BARCODE)
method_option(*OPT_LENGTH_GAP)
method_option(*OPT_LENGTH_MINIMUM)
method_option(*OPT_UMI_LENGTH)
method_option(*OPT_LOW_QUALITIES)

method_option :tss,
              default: false,
              desc: 'Check number of TSSs, instead of STRT reads',
              type: :boolean

# For draw = 1..12, run the preprocessing pipeline on `fq1l to <draw>
# <12 - draw>` of the converted input and print one "raw,nr" CSV line:
# raw is the line count right after the `fq1l to` stage (tapped through a
# FIFO), nr is the line count at the end of the pipeline.
#
# fqgz, fqgzs0 - one or more gzip-compressed FASTQ files; each is converted
#                once with `fq1l convert` into its own temporary file.
#
# With --tss the final count is taken after an extra
# `fq1l mt5 | cut -f 2 | sort -u` stage instead.
#
# NOTE(review): `sortCommand(options)` is only evaluated with --tss and is
# not defined in this file -- confirm the helper exists under that name.
def depth(fqgz, *fqgzs0)
  buffer_opt, coreutils_opt, grep_opt, parallel_opt,
    _barcode_len, _gap_len, min_len, preprocess_len, _umi_len,
    match_pattern, coreutils_prefix = configure_depth(options)

  inputs = [fqgz] + fqgzs0
  converted = inputs.map { get_temporary_path('strt.depth', 'fq1l') }
  tss_stage =
    if options.tss
      "fq1l mt5 --minimum-length=#{min_len} #{match_pattern}+ | #{coreutils_prefix}cut -f 2 | #{sortCommand(options)} -u |"
    else
      ''
    end

  Parallel.each((0...inputs.length).to_a, in_threads: options.parallel) do |i|
    system "gunzip -c #{inputs[i]} | fq1l convert #{coreutils_opt} > #{converted[i]}"
  end

  (1..12).each do |draw|
    fifo = get_fifo('strt.depth', 'fq1l')
    # The FIFO reader is started first; the `tee` stage below feeds it.
    raw_io = open("| #{coreutils_prefix}wc -l #{fifo}")
    nr_io = open(<<CMD)
| LC_ALL=C cat #{converted.join(' ')} \
| fq1l to #{draw} #{12-draw} \
| #{tee_command(options)} #{fifo} \
| fq1l nr #{buffer_opt} #{coreutils_opt} #{parallel_opt} \
| fq1l m5 #{grep_opt} #{match_pattern} \
| fq1l m5 #{grep_opt} --invert-match '[^\\t]*N' \
| fq1l qt3 --low-qualities='#{options.low_qualities}' --minimum-length=#{preprocess_len} \
| fq1l pt3 --primer=AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG --minimum-length=#{preprocess_len} #{coreutils_opt} #{grep_opt} \
| fq1l nr #{buffer_opt} --degenerated-mode #{coreutils_opt} #{parallel_opt} \
| #{tss_stage} #{coreutils_prefix}wc -l
CMD
    # `wc -l FILE` prints "<count> <file>"; only the count is wanted.
    raw = raw_io.gets.strip.split(/\s+/).first
    raw_io.close
    nr = nr_io.gets.strip
    nr_io.close
    puts [raw, nr].join(',')
  end
end
96
+
97
+ no_commands do
98
+
99
# Turn the command options into the values the `depth` pipeline is built from.
#
# options - object answering key?('buffer_size'), buffer_size,
#           coreutils_prefix, grep_prefix, parallel, umi_length,
#           length_barcode, length_gap and length_minimum.
#
# Returns an Array of 11 elements, in this order:
#   0  "--buffer-size=..." argument, or '' when no buffer size was given
#   1  "--coreutils-prefix=..." argument, or '' when the prefix is empty
#   2  "--grep-prefix=..." argument, or '' when the prefix is empty
#   3  "--parallel=..." argument
#   4  barcode length
#   5  gap length
#   6  minimum length
#   7  minimum + UMI + barcode + gap length
#   8  UMI length
#   9  match pattern: one '.' per UMI and barcode base, then gap-1 'G's
#   10 the bare coreutils prefix
def configure_depth(options)
  umi_len = options.umi_length
  barcode_len = options.length_barcode
  gap_len = options.length_gap
  min_len = options.length_minimum
  coreutils_prefix = options.coreutils_prefix

  [
    # Interpolation (rather than String#+) also accepts a non-String size.
    options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : '',
    coreutils_prefix == '' ? '' : "--coreutils-prefix=#{coreutils_prefix}",
    options.grep_prefix == '' ? '' : "--grep-prefix=#{options.grep_prefix}",
    "--parallel=#{options.parallel}",
    barcode_len,
    gap_len,
    min_len,
    min_len + umi_len + barcode_len + gap_len,
    umi_len,
    "#{'.' * umi_len}#{'.' * barcode_len}#{'G' * (gap_len - 1)}",
    coreutils_prefix
  ]
end
119
+
120
+ end
121
+
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,230 @@
1
+ require 'open3'
2
+
3
+ module Bio
4
+ class Gadget
5
+ class StrtPrepareTranscriptome < Bio::Gadget
6
+
7
+ package_name :prepare_transcriptome
8
+
9
+ #
10
+
11
desc 'hg38 DIR', 'GRCh38/hg38 - human'
long_desc <<-DESC
Prepare transcriptome data files based on GENCODE gene annotation RELEASE for GRCh38/hg38 at DIR, where it has 'ref.fa.fai' genome index file.
DESC

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_DOWNLOAD)
method_option(*OPT_GREP_PREFIX)

method_option :gencode,
              banner: 'RELEASE',
              default: 25,
              desc: 'Release number of GENCODE',
              type: :numeric

# Build the hg38 transcriptome support files inside DIR.
#
# dir0 - directory (expanded to an absolute path) that already holds
#        'ref.fa.fai'; 'ref.fa' is also read by the last step.
#
# Steps:
#   1. Unless --download is 'no', fetch the GENCODE annotation GTF of the
#      requested release. When --download is 'only', stop here.
#   2. Extract splice sites and exons with the hisat2 helper scripts.
#   3. Write ribosome and spike-in BEDs from 'ref.fa.fai' (RIBO_* and
#      RNA_SPIKE_* entries; the spike-in 5'-end is fixed at 0-50).
#   4. Split transcripts/exons into "coding" (the transcript has a
#      start_codon record) and "other", writing gene, exon, 5'UTR /
#      first-exon and 500-nt upstream promoter BEDs.
#   5. Merge the coding BEDs per gene, compress all BEDs with pigz, and
#      produce the refGene-style genePred table and mRNA FASTA.
#
# NOTE(review): promoter intervals are not clipped to the contig, so a first
# exon within 500 nt of a contig end yields out-of-range coordinates.
def hg38(dir0)
  dir = File.expand_path(dir0)
  release = options.gencode
  gtf = "#{dir}/hg38.gencode.v#{release}.annotation.gtf.gz"

  if options.download != 'no'
    # The release directory must follow --gencode as well; it used to be
    # fixed at release_25, which broke every other release.
    download_file("ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_#{release}/gencode.v#{release}.annotation.gtf.gz",
                  gtf)
  end

  return if options.download == 'only'

  pipeline("unpigz -c #{gtf}",
           "hisat2_extract_splice_sites.py - > #{dir}/transcriptome.splice_sites")

  pipeline("unpigz -c #{gtf}",
           "hisat2_extract_exons.py - > #{dir}/transcriptome.exons")

  fp_ribosome = open_bed_w("#{dir}/ribosome.bed")
  fp_whole = open_bed_w("#{dir}/spikein_whole.bed")
  fp_5end = open_bed_w("#{dir}/spikein_5end.bed")
  File.foreach("#{dir}/ref.fa.fai") do |line|
    acc, len = line.split
    if acc =~ /^RIBO_/
      fp_ribosome.puts [acc, 0, len, acc, 0, '+'].join("\t")
      fp_ribosome.puts [acc, 0, len, acc, 0, '-'].join("\t")
    end
    if acc =~ /^RNA_SPIKE_/
      fp_whole.puts [acc, 0, len, acc, 0, '+'].join("\t")
      fp_5end.puts [acc, 0, 50, acc, 0, '+'].join("\t")
    end
  end
  [fp_ribosome, fp_whole, fp_5end].each(&:close)

  regex_transcript_id = /transcript_id "([^"]+)"/
  regex_gene_id = /gene_id "([^"]+)"/
  regex_gene_name = /gene_name "([^"]+)"/
  regex_exon_number = /exon_number (\d+)/

  # Transcript ID => 1-based position of the start codon's first base
  # (GTF start column on '+', end column otherwise).
  atgs = {}
  Open3.pipeline_r(
    "unpigz -c #{gtf}",
    "#{grep_command} '\tstart_codon\t'") do |fp, _threads|
    fp.each do |line|
      cols = line.rstrip.split("\t")
      atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
        cols[cols[6] == '+' ? 3 : 4].to_i
    end
  end

  # One sorted BED writer per output category, opened in a fixed order.
  categories = %i[coding_gene coding_exon coding_5utr coding_promoter
                  other_gene other_exon other_1st_exon other_promoter]
  bed_paths = {}
  out = {}
  categories.each do |category|
    bed_paths[category] = "#{dir}/transcriptome.#{category}.bed"
    out[category] = open_bed_w(bed_paths[category])
  end

  Open3.pipeline_r(
    "unpigz -c #{gtf}",
    "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, _threads|
    fp.each do |line|
      cols = line.rstrip.split("\t")
      ann = cols[8]
      transcript_id = regex_transcript_id.match(ann).to_a[1]
      gene_id = regex_gene_id.match(ann).to_a[1]
      gene_name = regex_gene_name.match(ann).to_a[1]
      # Transcript records carry no exon_number; nil.to_i makes that 0.
      exon_number = regex_exon_number.match(ann).to_a[1].to_i
      chr = cols[0]
      left = cols[3].to_i
      right = cols[4].to_i
      str = cols[6]
      acc = "#{gene_name}|#{gene_id}|#{transcript_id}"
      # GTF is 1-based inclusive, BED is 0-based half-open.
      exon = [chr, left - 1, right, acc, 0, str].join("\t")
      coding = atgs.key?(transcript_id)

      if cols[2] == 'transcript'
        out[coding ? :coding_gene : :other_gene].puts exon
        next
      end

      # 500 nt immediately upstream of the first exon on its strand.
      promoter =
        if str == '+'
          [chr, left - 500, left - 1, acc, 0, '+'].join("\t")
        else
          [chr, right, right + 499, acc, 0, '-'].join("\t")
        end

      if coding
        out[:coding_exon].puts exon
        atg = atgs[transcript_id]
        if (str == '+' && right < atg) || (str == '-' && atg < left)
          # Exon lies entirely upstream of the start codon.
          out[:coding_5utr].puts exon
        elsif str == '+' && left < atg && atg <= right
          # Exon contains the start codon: keep only the part before it.
          out[:coding_5utr].puts [chr, left - 1, atg - 1, acc, 0, '+'].join("\t")
        elsif str == '-' && left <= atg && atg < right
          out[:coding_5utr].puts [chr, atg, right, acc, 0, '-'].join("\t")
        end
        out[:coding_promoter].puts promoter if exon_number == 1
      else
        out[:other_exon].puts exon
        if exon_number == 1
          out[:other_1st_exon].puts exon
          out[:other_promoter].puts promoter
        end
      end
    end
  end
  out.each_value(&:close)

  merge_bed_by_gene("Coding5end",
                    "5UTR and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                    "#{dir}/transcriptome.coding_5end",
                    bed_paths[:coding_promoter], bed_paths[:coding_5utr])
  merge_bed_by_gene("CodingWhole",
                    "Exon and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                    "#{dir}/transcriptome.coding_whole",
                    bed_paths[:coding_promoter], bed_paths[:coding_exon])

  sh "pigz #{dir}/*.bed"

  genePred = "#{dir}/hg38_refGene.txt"
  pipeline(
    "unpigz -c #{gtf}",
    "#{grep_command} 'tag \"basic\"'",
    "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
  sh "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}"
end
175
+
176
+ no_commands do
177
+
178
# Open a writable pipe to `bedtools sort -i stdin` whose output is saved to
# the file +bed+ (created or truncated, like a shell `>` redirect).
#
# bed - path of the BED file to write.
#
# Returns the IO to write unsorted BED lines to; the caller must close it.
# bedtools is started directly, without a shell, so the path is used
# verbatim even when it contains blanks or shell metacharacters.
def open_bed_w(bed)
  IO.popen(['bedtools', 'sort', '-i', 'stdin'], 'w', out: [bed, 'w', 0o666])
end
181
+
182
# Merge the intervals of +inbeds+ gene by gene and write two files:
# "<base>.csv" (gene symbol and its accessions) and "<base>.bed" (a track
# line followed by the merged intervals).
#
# tname  - track name for the BED track line.
# tdesc  - track description for the BED track line.
# base   - output path without extension.
# inbeds - input BED files whose name column is "symbol|gene|transcript".
#
# Input lines are regrouped by symbol; every group is piped through its own
# `bedtools sort | bedtools merge` that appends to a temporary BED.
def merge_bed_by_gene(tname, tdesc, base, *inbeds)
  merged_bed = get_temporary_path('strt.prepare_transcriptome', 'bed')
  csv = File.open("#{base}.csv", 'w')
  csv.puts "ID,ACCESSIONS"
  Open3.pipeline_r(
    "#{cat_command} #{inbeds.join(' ')}",
    "ruby -anle 'as=$F[3].split /\\|/; puts ($F[0..2]+[as[0]]+$F[4..-1]+[as[1..-1].join(\"|\")]).join(\"\t\")'",
    "#{sort_command} -k 4,4 -k 1,1 -k 2,2n") do |sorted, _waiters|
    current = ''
    transcripts_by_gene = {}
    merger = nil
    merger_threads = nil

    # Finish the running group: wait for its merge to be appended, then
    # record "symbol,gene:tx;tx|gene:tx" in the CSV.
    finish_group = lambda do
      merger.close
      merger_threads[-1].join
      accessions = transcripts_by_gene.map { |gene, txs| "#{gene}:#{txs.join(';')}" }
      csv.puts "#{current},#{accessions.join('|')}"
    end

    sorted.each do |line|
      chr, left, right, sym, score, str, accs = line.rstrip.split(/\t/)
      gene_acc, transcript_acc = accs.split(/\|/)
      if current != sym
        unless merger.nil?
          finish_group.call
          transcripts_by_gene = {}
        end
        merger, merger_threads = Open3.pipeline_w(
          "bedtools sort -i stdin",
          "bedtools merge -s -c 4 -o distinct >> #{merged_bed}")
        current = sym
      end
      merger.puts [chr, left, right, sym, score, str].join("\t")
      known = (transcripts_by_gene[gene_acc] ||= [])
      known << transcript_acc unless known.include?(transcript_acc)
    end
    finish_group.call unless merger.nil?
  end
  csv.close
  sh "echo 'track name=#{tname} description=\"#{tdesc}\" visibility=3 colorByStrand=\"38,139,210 203,75,22\"' > #{base}.bed"
  pipeline("ruby -anle 'puts ($F.values_at(0, 1, 2) + [$F[4], 0, $F[3]]).join(\"\t\")' < #{merged_bed}",
           "bedtools sort -i stdin >> #{base}.bed")
end
226
+ end
227
+
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,135 @@
1
+ require 'bio'
2
+ require 'parallel'
3
+
4
+ module Bio
5
+ class Gadgets < Bio::Gadget
6
+
7
desc 'find PATTERN [NAME]',
     'Find fragments matching with regexp PATTERN from FASTA-format STDIN'

method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)

method_option :ignore_case,
              default: true,
              desc: 'Fold lower case to upper case characters',
              type: :boolean

# Report every match of PATTERN, on both strands, as sorted BED on STDOUT.
#
# pattern - regular expression source; the whole pattern is the reported
#           fragment.
# name0   - value for the BED name column; defaults to the pattern itself.
#
# Each FASTA entry read from STDIN is scanned in a forked child that writes
# its own sorted temporary BED; at most --parallel children run at a time.
# The per-entry files are finally merged with `sort --merge` and removed.
# Overlapping matches are reported, because the search restarts one base
# after the start of the previous match.
def find(pattern, name0 = '')
  # Interpolation (rather than String#+) also accepts a non-String size.
  b_size = options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : ''
  c_prefix = options.coreutils_prefix
  re = Regexp.new("(#{pattern})", options.ignore_case)
  name = name0 == '' ? pattern : name0
  sort_keys = "#{c_prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4"

  # Call emit.(begin, end) for every, possibly overlapping, match in seq.
  scan = lambda do |seq, emit|
    match = re.match(seq, 0)
    until match.nil?
      emit.call(match.begin(1), match.end(1))
      match = re.match(seq, match.begin(1) + 1)
    end
  end

  pids = []
  tmpfiles = []
  ff = Bio::FlatFile.open(Bio::FastaFormat, STDIN)
  ff.each do |entry|
    tmpfile = get_temporary_path('find', 'bed', false)
    tmpfiles << tmpfile
    acc = entry.entry_id
    seq = entry.seq
    pids << Process.fork do
      # The block form closes the pipe and waits for sort before the child exits.
      IO.popen("#{sort_keys} #{b_size} > #{tmpfile}", 'w') do |fp|
        scan.call(seq, lambda { |from, to|
          fp.puts [acc, from, to, name, '0', '+'].join("\t")
        })
        # Minus strand: scan the reverse complement and map the
        # coordinates back onto the forward strand.
        revcomp = seq.reverse.tr('acgtACGT', 'tgcaTGCA')
        len = revcomp.length
        scan.call(revcomp, lambda { |from, to|
          fp.puts [acc, len - to, len - from, name, '0', '-'].join("\t")
        })
      end
    end
    # Throttle: block until a child exits once the limit is reached.
    pids.delete(Process.wait) while pids.length == options.parallel
  end
  ff.close
  pids.delete(Process.wait) until pids.empty?

  system "#{sort_keys} --merge #{b_size} #{tmpfiles.join(' ')}"
  unlink_files(tmpfiles)
end
69
+
70
+ #
71
+
72
desc 'gap BEDgz1 BEDgz2',
     "Calculate gap distances from 5'-end of fragments 1 to 3'-end of fragments 2"

method_option(*OPT_PARALLEL)
method_option(*OPT_COREUTILS_PREFIX)

method_option :minimum_gap,
              default: -10000,
              desc: 'Minimum gap distance to be reported',
              type: :numeric

method_option :maximum_gap,
              default: 2500,
              desc: 'Maximum gap distance to be reported',
              type: :numeric

# Print "name1,name2,distance" CSV lines for every pair of a fragment from
# BEDgz1 and a fragment from BEDgz2 on the same chromosome whose gap is
# within [--minimum-gap, --maximum-gap).
#
# bedgz1 - gzip-compressed BED of the fragments whose 5'-end is used.
# bedgz2 - gzip-compressed BED of the fragments whose 3'-end is used.
#
# The gap is `end2 - start1 + 1` for a '+' fragment 1 and
# `end1 - start2 + 1` otherwise. Chromosomes are processed in parallel
# worker processes, each writing its own temporary CSV; the files are then
# concatenated to STDOUT and removed.
#
# NOTE(review): the chromosome name is spliced into a grep regexp unescaped,
# so names with regexp metacharacters (e.g. '.') match loosely.
def gap(bedgz1, bedgz2)
  c_prefix = options.coreutils_prefix

  chrs = IO.popen("unpigz -c #{bedgz1} #{bedgz2} | #{c_prefix}cut -f 1 | #{c_prefix}uniq | #{c_prefix}sort -u") do |io|
    io.each_line.map(&:rstrip).uniq
  end

  max = options.maximum_gap
  min = options.minimum_gap
  tmpfiles = {}
  chrs.each { |chr| tmpfiles[chr] = get_temporary_path('gap', 'csv', false) }

  Parallel.each(chrs, in_processes: options.parallel) do |chr|
    # Fragments 2 on this chromosome as [start, stop, name, ...].
    fragments2 = IO.popen("gunzip -c #{bedgz2} | grep '^#{chr}\t'") do |io|
      io.each_line.map do |line|
        _chr, start, stop, *rest = line.rstrip.split("\t")
        [start.to_i, stop.to_i, *rest]
      end
    end

    File.open(tmpfiles[chr], 'w') do |fp|
      IO.popen("gunzip -c #{bedgz1} | grep '^#{chr}\t'") do |io|
        io.each_line do |line|
          _chr, start, stop, name, _score, str = line.rstrip.split("\t")
          forward = str == '+'
          start1 = start.to_i
          stop1 = stop.to_i
          fragments2.each do |bed|
            dist = forward ? bed[1] - start1 + 1 : stop1 - bed[0] + 1
            fp.puts [name, bed[2], dist].join(',') if min <= dist && dist < max
          end
        end
      end
    end
  end

  # Without any chromosome a bare `cat` would sit waiting on STDIN.
  system "cat #{tmpfiles.values.join(' ')}" unless tmpfiles.empty?
  # Pass the paths, not the chromosome => path Hash.
  unlink_files(tmpfiles.values)
end
133
+
134
+ end
135
+ end