bio-gadget 0.4.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'parallel'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
class Gadget
|
5
|
+
class Strt < Bio::Gadget
|
6
|
+
|
7
|
+
desc 'count SMP BASE BED [BED ...]',
|
8
|
+
"Count 5'-ends at BASE in each region defined by BEDs"
|
9
|
+
|
10
|
+
method_option *OPT_COREUTILS_PREFIX
|
11
|
+
method_option *OPT_PARALLEL
|
12
|
+
|
13
|
+
# Print a CSV table of 5'-end counts per region and sample to STDOUT.
#
# smp   - path of a CSV file whose header contains the columns 'NAME' and
#         '5pBEDs'; the latter holds ';'-separated BED name stems.
# base  - string prepended to every stem; the file actually read is
#         "#{base}#{stem}.5p.bed.gz".
# bed0, beds - one or more region BED files; they are concatenated and
#         sorted into a temporary file before intersection.
#
# For every sample, `bedtools intersect` is run against the sorted regions
# and the distinct (column 4, column 10) pairs are counted per column-4 ID.
# Samples without hits for an ID are reported as 0.
def count(smp, base, bed0, *beds)
  prefix = options.coreutils_prefix

  # File.open (not Kernel#open) so a path starting with '|' is never run as
  # a command, and the block form closes the handle even on error.
  samples = {}
  File.open(smp) do |fp|
    header = fp.gets.rstrip.split(',')
    name_idx = header.index('NAME')
    beds_idx = header.index('5pBEDs')
    fp.each do |line|
      cols = line.rstrip.split(',')
      samples[cols[name_idx]] = cols[beds_idx].split(';')
    end
  end

  regions = get_temporary_path('strt.count', 'bed')
  system "#{prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bed0} #{beds.join(' ')} > #{regions}"

  counts = {}
  lock = Mutex.new
  Parallel.each(samples.keys, in_threads: options.parallel) do |name|
    bed5ps = samples[name].map { |bed| "#{base}#{bed}.5p.bed.gz" }
    command = "bedtools intersect -nonamecheck -wa -wb -s -sorted -a #{regions} -b #{bed5ps.join(' ')} | #{prefix}cut -f 4,10 | #{prefix}sort -u | #{prefix}cut -f 1 | #{prefix}uniq -c"
    # The block form closes the pipe and reaps the child once all lines are read.
    IO.popen(command) do |io|
      io.each_line do |line|
        cnt, id = line.strip.split(' ')
        # counts is shared between worker threads.
        lock.synchronize { (counts[id] ||= {})[name] = cnt.to_i }
      end
    end
  end

  names = samples.keys.sort
  puts((['ID'] + names.map { |name| "R|#{name}" }).join(','))
  counts.each do |id, by_sample|
    puts(([id] + names.map { |name| by_sample.fetch(name, 0) }).join(','))
  end
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'parallel'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
class Gadget
|
5
|
+
class Strt < Bio::Gadget
|
6
|
+
|
7
|
+
# Each constant is a [name, settings] pair that is splatted into
# `method_option` further down in this class.

# --length-barcode: barcode length (banner 'NT'), default 6.
OPT_LENGTH_BARCODE = [
  :length_barcode,
  { banner: 'NT', default: 6, desc: 'Length of barcode', type: :numeric }
]

# --length-gap: length of the polyG gap, default 3.
OPT_LENGTH_GAP = [
  :length_gap,
  { banner: 'NT', default: 3, desc: 'Length of gap (polyG)', type: :numeric }
]

# --length-minimum: minimum read length kept after preprocessing, default 25.
OPT_LENGTH_MINIMUM = [
  :length_minimum,
  { banner: 'NT', default: 25, desc: 'Minimum length after preprocess', type: :numeric }
]

# --low-qualities: quality characters treated as low quality, default '!"#'.
OPT_LOW_QUALITIES = [
  :low_qualities,
  { banner: 'CHARACTERS', default: '!"#', desc: 'Low quality characters', type: :string }
]
|
27
|
+
|
28
|
+
desc 'depth FQGZ [FQGZ ...]',
|
29
|
+
'Count nonredundant reads according to the sequencing depths'
|
30
|
+
|
31
|
+
method_option *OPT_BUFFER_SIZE
|
32
|
+
method_option *OPT_PARALLEL
|
33
|
+
method_option *OPT_COREUTILS_PREFIX
|
34
|
+
method_option *OPT_GREP_PREFIX
|
35
|
+
|
36
|
+
method_option *OPT_LENGTH_BARCODE
|
37
|
+
method_option *OPT_LENGTH_GAP
|
38
|
+
method_option *OPT_LENGTH_MINIMUM
|
39
|
+
method_option *OPT_UMI_LENGTH
|
40
|
+
method_option *OPT_LOW_QUALITIES
|
41
|
+
|
42
|
+
method_option :tss,
|
43
|
+
default: false,
|
44
|
+
desc: 'Check number of TSSs, instead of STRT reads',
|
45
|
+
type: :boolean
|
46
|
+
|
47
|
+
# Print twelve "raw,nonredundant" CSV lines, one per draw (1..12).
#
# fqgz, fqgzs0 - one or more gzip-compressed FASTQ files.
#
# Every input is first converted with `fq1l convert` into its own temporary
# file (in parallel threads). For each draw the converted files are
# concatenated and passed through `fq1l to DRAW (12-DRAW)`; a tee into a
# FIFO lets a separate `wc -l` count the lines at that point ("raw"), while
# the remainder of the pipeline filters the reads and counts what is left
# ("nr"). With --tss an extra `fq1l mt5 | cut | sort -u` stage is inserted
# before the final count.
def depth(fqgz, *fqgzs0)
  # Slots prefixed with '_' are returned by configure_depth but unused here.
  buffer_opt, coreutils_opt, grep_opt, parallel_opt,
    _barcode_len, _gap_len, min_len, preprocess_min_len, _umi_len,
    pattern, coreutils_prefix = configure_depth(options)

  inputs = [fqgz] + fqgzs0
  converted = inputs.map { get_temporary_path('strt.depth', 'fq1l') }
  tss_filter =
    if options.tss
      "fq1l mt5 --minimum-length=#{min_len} #{pattern}+ | #{coreutils_prefix}cut -f 2 | #{sortCommand(options)} -u |"
    else
      ''
    end

  Parallel.each((0...inputs.length).to_a, in_threads: options.parallel) do |idx|
    system "gunzip -c #{inputs[idx]} | fq1l convert #{coreutils_opt} > #{converted[idx]}"
  end

  1.upto(12) do |draw|
    fifo = get_fifo('strt.depth', 'fq1l')
    # Reader of the FIFO is started before the pipeline that writes to it.
    raw_pipe = open("| #{coreutils_prefix}wc -l #{fifo}")
    # Trailing backslashes join the heredoc into a single shell command line;
    # the leading '|' makes Kernel#open spawn it as a subprocess.
    command = <<CMD
| LC_ALL=C cat #{converted.join(' ')} \
| fq1l to #{draw} #{12-draw} \
| #{tee_command(options)} #{fifo} \
| fq1l nr #{buffer_opt} #{coreutils_opt} #{parallel_opt} \
| fq1l m5 #{grep_opt} #{pattern} \
| fq1l m5 #{grep_opt} --invert-match '[^\\t]*N' \
| fq1l qt3 --low-qualities='#{options.low_qualities}' --minimum-length=#{preprocess_min_len} \
| fq1l pt3 --primer=AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG --minimum-length=#{preprocess_min_len} #{coreutils_opt} #{grep_opt} \
| fq1l nr #{buffer_opt} --degenerated-mode #{coreutils_opt} #{parallel_opt} \
| #{tss_filter} #{coreutils_prefix}wc -l
CMD
    nr_pipe = open(command)

    # `wc -l FILE` prints "COUNT FILE"; keep only the count.
    raw = raw_pipe.gets.strip.split(/\s+/)[0]
    raw_pipe.close
    nr = nr_pipe.gets.strip
    nr_pipe.close
    puts [raw, nr].join(',')
  end
end
|
96
|
+
|
97
|
+
no_commands do
|
98
|
+
|
99
|
+
# Translate the command options into the values `depth` needs.
#
# options - object responding to key?, buffer_size, coreutils_prefix,
#           grep_prefix, parallel, umi_length, length_barcode, length_gap
#           and length_minimum.
#
# Returns an 11-element Array, in this order:
#   0  "--buffer-size=…" or '' when no 'buffer_size' key is present
#   1  "--coreutils-prefix=…" or '' when the prefix is empty
#   2  "--grep-prefix=…" or '' when the prefix is empty
#   3  "--parallel=…"
#   4  barcode length
#   5  gap length
#   6  minimum length
#   7  minimum length + UMI length + barcode length + gap length
#   8  UMI length
#   9  match pattern: '.' for every UMI and barcode base followed by
#      (gap length - 1) 'G's
#   10 the bare coreutils prefix
def configure_depth(options)
  umi_len = options.umi_length
  barcode_len = options.length_barcode
  gap_len = options.length_gap
  min_len = options.length_minimum
  coreutils_prefix = options.coreutils_prefix
  grep_prefix = options.grep_prefix

  [
    # Interpolation, like the sibling options, so a non-String buffer size
    # cannot raise TypeError from String#+.
    options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : '',
    coreutils_prefix == '' ? '' : "--coreutils-prefix=#{coreutils_prefix}",
    grep_prefix == '' ? '' : "--grep-prefix=#{grep_prefix}",
    "--parallel=#{options.parallel}",
    barcode_len,
    gap_len,
    min_len,
    min_len + umi_len + barcode_len + gap_len,
    umi_len,
    "#{'.' * umi_len}#{'.' * barcode_len}#{'G' * (gap_len - 1)}",
    coreutils_prefix
  ]
end
|
119
|
+
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,230 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
class Gadget
|
5
|
+
class StrtPrepareTranscriptome < Bio::Gadget
|
6
|
+
|
7
|
+
package_name :prepare_transcriptome
|
8
|
+
|
9
|
+
#
|
10
|
+
|
11
|
+
desc 'hg38 DIR', 'GRCh38/hg38 - human'
|
12
|
+
long_desc <<-DESC
|
13
|
+
Prepare transcriptome data files based on GENCODE gene annotation RELEASE for GRCh38/h38 at DIR, where it has 'ref.fa.fai' genome index file.
|
14
|
+
DESC
|
15
|
+
|
16
|
+
method_option *OPT_COREUTILS_PREFIX
|
17
|
+
method_option *OPT_DOWNLOAD
|
18
|
+
method_option *OPT_GREP_PREFIX
|
19
|
+
|
20
|
+
method_option :gencode,
|
21
|
+
banner: 'RELEASE',
|
22
|
+
default: 25,
|
23
|
+
desc: 'Release number of GENCODE',
|
24
|
+
type: :numeric
|
25
|
+
|
26
|
+
# Prepare transcriptome annotation files for GRCh38/hg38 inside DIR.
#
# dir0 - directory that receives all output; it must already contain
#        'ref.fa.fai' (read here) and 'ref.fa' (passed to
#        retrieve_seq_from_fasta.pl).
#
# Steps, controlled by options.download ('no' skips the download, 'only'
# stops right after it):
#   1. download the GENCODE annotation GTF of release options.gencode;
#   2. extract splice sites and exons with the hisat2 helper scripts;
#   3. write BED files for ribosomal (RIBO_*) and spike-in (RNA_SPIKE_*)
#      sequences listed in ref.fa.fai;
#   4. split GTF transcripts/exons into "coding" (transcript has a
#      start_codon record) and "other", writing gene, exon, 5'UTR,
#      first-exon and promoter BED files;
#   5. merge promoter + 5'UTR and promoter + exon per gene, compress all
#      BED files, and build the refGene-style files via gtfToGenePred.
def hg38(dir0)
  dir = File.expand_path(dir0)
  release = options.gencode
  gtf = "#{dir}/hg38.gencode.v#{release}.annotation.gtf.gz"

  if options.download != 'no'
    # Both the release directory and the file name follow the requested
    # release; a fixed 'release_25' directory only works for release 25.
    download_file("ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_#{release}/gencode.v#{release}.annotation.gtf.gz",
                  gtf)
  end

  return if options.download == 'only'

  pipeline("unpigz -c #{gtf}",
           "hisat2_extract_splice_sites.py - > #{dir}/transcriptome.splice_sites")

  pipeline("unpigz -c #{gtf}",
           "hisat2_extract_exons.py - > #{dir}/transcriptome.exons")

  # --- ribosome / spike-in regions taken from the genome index ---
  fp_ribosome = open_bed_w("#{dir}/ribosome.bed")
  fp_whole = open_bed_w("#{dir}/spikein_whole.bed")
  fp_5end = open_bed_w("#{dir}/spikein_5end.bed")
  # File.foreach closes the index file when iteration ends.
  File.foreach("#{dir}/ref.fa.fai") do |line|
    acc, len = line.rstrip.split
    if acc =~ /^RIBO_/
      fp_ribosome.puts [acc, 0, len, acc, 0, '+'].join("\t")
      fp_ribosome.puts [acc, 0, len, acc, 0, '-'].join("\t")
    end
    if acc =~ /^RNA_SPIKE_/
      fp_whole.puts [acc, 0, len, acc, 0, '+'].join("\t")
      fp_5end.puts [acc, 0, 50, acc, 0, '+'].join("\t")
    end
  end
  fp_ribosome.close
  fp_whole.close
  fp_5end.close

  # --- transcript_id => coordinate of the start codon edge ---
  # '+' strand keeps the GTF start column, '-' strand the GTF end column.
  atgs = {}
  regex_transcript_id = /transcript_id "([^"]+)"/
  regex_gene_id = /gene_id "([^"]+)"/
  regex_gene_name = /gene_name "([^"]+)"/
  regex_exon_number = /exon_number (\d+)/
  Open3.pipeline_r(
    "unpigz -c #{gtf}",
    "#{grep_command} '\tstart_codon\t'") do |fp, _threads|
    fp.each do |line|
      cols = line.rstrip.split(/\t/)
      atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
        cols[cols[6] == '+' ? 3 : 4].to_i
    end
    fp.close
  end

  bed_coding_exon = "#{dir}/transcriptome.coding_exon.bed"
  bed_coding_5utr = "#{dir}/transcriptome.coding_5utr.bed"
  bed_coding_promoter = "#{dir}/transcriptome.coding_promoter.bed"

  fp_coding_gene = open_bed_w("#{dir}/transcriptome.coding_gene.bed")
  fp_coding_exon = open_bed_w(bed_coding_exon)
  fp_coding_5utr = open_bed_w(bed_coding_5utr)
  fp_coding_promoter = open_bed_w(bed_coding_promoter)
  fp_other_gene = open_bed_w("#{dir}/transcriptome.other_gene.bed")
  fp_other_exon = open_bed_w("#{dir}/transcriptome.other_exon.bed")
  fp_other_1st_exon = open_bed_w("#{dir}/transcriptome.other_1st_exon.bed")
  fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")

  # BED line for the region directly upstream of a first exon.
  # A local lambda (not a method) so no extra command is exposed by the class.
  promoter_line = lambda do |chr, left, right, acc, str|
    if str == '+'
      [chr, left - 500, left - 1, acc, 0, '+'].join("\t")
    else
      [chr, right, right + 499, acc, 0, '-'].join("\t")
    end
  end

  Open3.pipeline_r(
    "unpigz -c #{gtf}",
    "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, _threads|
    fp.each do |line|
      cols = line.rstrip.split(/\t/)
      ann = cols[8]
      transcript_id = regex_transcript_id.match(ann).to_a[1]
      gene_id = regex_gene_id.match(ann).to_a[1]
      gene_name = regex_gene_name.match(ann).to_a[1]
      # Records without exon_number yield 0 here.
      exon_number = regex_exon_number.match(ann).to_a[1].to_i
      chr = cols[0]
      left = cols[3].to_i
      right = cols[4].to_i
      str = cols[6]
      acc = "#{gene_name}|#{gene_id}|#{transcript_id}"
      # GTF start is 1-based, BED start is 0-based.
      exon = [chr, left - 1, right, acc, 0, str].join("\t")
      coding = atgs.key?(transcript_id)

      if cols[2] == 'transcript'
        (coding ? fp_coding_gene : fp_other_gene).puts exon
        next
      end

      if coding
        fp_coding_exon.puts exon
        atg = atgs[transcript_id]
        if (str == '+' && right < atg) || (str == '-' && atg < left)
          # Exon lies entirely before the start codon.
          fp_coding_5utr.puts exon
        elsif str == '+' && left < atg && atg <= right
          # Exon contains the start codon: keep only the part before it.
          fp_coding_5utr.puts [chr, left - 1, atg - 1, acc, 0, '+'].join("\t")
        elsif str == '-' && left <= atg && atg < right
          fp_coding_5utr.puts [chr, atg, right, acc, 0, '-'].join("\t")
        end
        fp_coding_promoter.puts promoter_line.call(chr, left, right, acc, str) if exon_number == 1
      else
        fp_other_exon.puts exon
        if exon_number == 1
          fp_other_1st_exon.puts exon
          fp_other_promoter.puts promoter_line.call(chr, left, right, acc, str)
        end
      end
    end
    fp.close
  end
  fp_coding_gene.close
  fp_coding_exon.close
  fp_coding_5utr.close
  fp_coding_promoter.close
  fp_other_gene.close
  fp_other_exon.close
  fp_other_1st_exon.close
  fp_other_promoter.close

  merge_bed_by_gene("Coding5end",
                    "5UTR and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                    "#{dir}/transcriptome.coding_5end",
                    bed_coding_promoter, bed_coding_5utr)
  merge_bed_by_gene("CodingWhole",
                    "Exon and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                    "#{dir}/transcriptome.coding_whole",
                    bed_coding_promoter, bed_coding_exon)

  sh "pigz #{dir}/*.bed"

  genePred = "#{dir}/hg38_refGene.txt"
  pipeline(
    "unpigz -c #{gtf}",
    "#{grep_command} 'tag \"basic\"'",
    "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
  sh "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}"
end
|
175
|
+
|
176
|
+
no_commands do
|
177
|
+
|
178
|
+
# Open a writable pipe into `bedtools sort`, whose output is redirected to
# the file +bed+.
#
# bed - output path, interpolated into the shell command as-is.
#
# Returns the IO to write to; the caller must close it.
#
# IO.popen replaces Kernel#open with a leading '|', which is deprecated
# for spawning subprocesses since Ruby 3.3.
def open_bed_w(bed)
  IO.popen("bedtools sort -i stdin > #{bed}", 'w')
end
|
181
|
+
|
182
|
+
# Merge the regions of several BED files gene by gene.
#
# tname  - track name written into the track header of "#{base}.bed".
# tdesc  - track description written into the same header.
# base   - output path stem; "#{base}.csv" and "#{base}.bed" are written.
# inbeds - input BED files whose 4th column is "SYMBOL|GENE_ACC|TRANSCRIPT_ACC"
#          (the inline ruby filter splits that column on '|').
#
# The inputs are concatenated, the symbol is moved into column 4 with the
# remaining accessions appended as a last column, and the lines are sorted
# by symbol. Each run of equal symbols is piped through its own
# `bedtools sort | bedtools merge` appending to a temporary BED file, and
# one CSV row "SYMBOL,GENE:TX;TX|GENE:TX" is written per symbol.
def merge_bed_by_gene(tname, tdesc, base, *inbeds)
  merged_bed = get_temporary_path('strt.prepare_transcriptome', 'bed')
  csv = open("#{base}.csv", 'w')
  csv.puts "ID,ACCESSIONS"
  Open3.pipeline_r(
    "#{cat_command} #{inbeds.join(' ')}",
    "ruby -anle 'as=$F[3].split /\\|/; puts ($F[0..2]+[as[0]]+$F[4..-1]+[as[1..-1].join(\"|\")]).join(\"\t\")'",
    "#{sort_command} -k 4,4 -k 1,1 -k 2,2n") do |sorted, _waiters|
    current_symbol = ''
    transcripts_by_gene = Hash.new
    writer = nil
    writer_threads = nil

    # Finish the merge pipeline of the current symbol and emit its CSV row.
    finish_group = lambda do
      writer.close
      # Wait for `bedtools merge` so appends to merged_bed stay in order.
      writer_threads[-1].join
      summary = transcripts_by_gene.map { |gene, txs| "#{gene}:#{txs.join(';')}" }.join('|')
      csv.puts "#{current_symbol},#{summary}"
    end

    sorted.each do |line|
      chr, left, right, sym, score, str, accs = line.rstrip.split(/\t/)
      gacc, tacc = accs.split(/\|/)
      if current_symbol != sym
        unless writer.nil?
          finish_group.call
          transcripts_by_gene = Hash.new
        end
        writer, writer_threads = Open3.pipeline_w(
          "bedtools sort -i stdin",
          "bedtools merge -s -c 4 -o distinct >> #{merged_bed}")
        current_symbol = sym
      end
      writer.puts [chr, left, right, sym, score, str].join("\t")
      transcripts_by_gene[gacc] = Array.new unless transcripts_by_gene.key?(gacc)
      transcripts_by_gene[gacc] << tacc unless transcripts_by_gene[gacc].include?(tacc)
    end

    # Flush the last symbol, if any line was read at all.
    finish_group.call unless writer.nil?
  end
  csv.close
  sh "echo 'track name=#{tname} description=\"#{tdesc}\" visibility=3 colorByStrand=\"38,139,210 203,75,22\"' > #{base}.bed"
  pipeline("ruby -anle 'puts ($F.values_at(0, 1, 2) + [$F[4], 0, $F[3]]).join(\"\t\")' < #{merged_bed}",
           "bedtools sort -i stdin >> #{base}.bed")
end
|
226
|
+
end
|
227
|
+
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
data/lib/bio/gadgets.rb
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
class Gadgets < Bio::Gadget
|
6
|
+
|
7
|
+
desc 'find PATTERN [NAME]',
|
8
|
+
'Find fragments matching with regexp PATTERN from FASTA-format STDIN'
|
9
|
+
|
10
|
+
method_option *OPT_BUFFER_SIZE
|
11
|
+
method_option *OPT_PARALLEL
|
12
|
+
method_option *OPT_COREUTILS_PREFIX
|
13
|
+
|
14
|
+
method_option :ignore_case,
|
15
|
+
default: true,
|
16
|
+
desc: 'Fold lower case to upper case characters',
|
17
|
+
type: :boolean
|
18
|
+
|
19
|
+
# Print, as sorted BED lines on STDOUT, every fragment of the FASTA
# sequences read from STDIN that matches PATTERN on either strand.
#
# pattern - regular expression source; it is wrapped in a capture group.
# name0   - value for the BED name column; defaults to the pattern itself.
#
# One child process is forked per FASTA entry (at most options.parallel at
# a time). Each child scans the sequence and its reverse complement,
# including overlapping matches, and writes through `sort` into its own
# temporary file; the parent finally merge-sorts and removes those files.
def find(pattern, name0 = '')
  # Interpolation instead of String#+ so a non-String buffer size cannot
  # raise TypeError.
  buffer_opt = options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : ''
  prefix = options.coreutils_prefix
  regexp = Regexp.new("(#{pattern})", options.ignore_case)
  name = name0 == '' ? pattern : name0

  # Calls the given block with [begin, end) of every match in sequence.
  # The search restarts one base after each match start, so overlapping
  # matches are reported too. A lambda (not a method) so the command class
  # does not gain an extra public method.
  scan = lambda do |sequence, &emit|
    hit = regexp.match(sequence, 0)
    until hit.nil?
      emit.call(hit.begin(1), hit.end(1))
      hit = regexp.match(sequence, hit.begin(1) + 1)
    end
  end

  pids = []
  tmpfiles = []
  ff = Bio::FlatFile.open(Bio::FastaFormat, STDIN)
  ff.each do |entry|
    tmpfiles << tmpfile = get_temporary_path('find', 'bed', false)
    acc = entry.entry_id
    seq = entry.seq
    pids << Process.fork do
      # The block form closes the pipe and waits for sort to finish.
      IO.popen("#{prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{buffer_opt} > #{tmpfile}", 'w') do |out|
        scan.call(seq) do |from, to|
          out.puts [acc, from, to, name, '0', '+'].join("\t")
        end
        # Minus strand: scan the reverse complement and map the
        # coordinates back onto the forward strand.
        revcomp = seq.reverse.tr('acgtACGT', 'tgcaTGCA')
        len = revcomp.length
        scan.call(revcomp) do |from, to|
          out.puts [acc, len - to, len - from, name, '0', '-'].join("\t")
        end
      end
    end
    # Throttle: wait for a child to exit once the limit is reached.
    while pids.length == options.parallel
      pids.delete(Process.wait)
    end
  end
  ff.close
  pids.delete(Process.wait) until pids.empty?

  system "#{prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 --merge #{buffer_opt} #{tmpfiles.join(' ')}"
  unlink_files(tmpfiles)
end
|
69
|
+
|
70
|
+
#
|
71
|
+
|
72
|
+
desc 'gap BEDgz1 BEDgz2',
|
73
|
+
"Calculate gap distances from 5'-end of fragments 1 to 3'-end of fragments 2"
|
74
|
+
|
75
|
+
method_option *OPT_PARALLEL
|
76
|
+
method_option *OPT_COREUTILS_PREFIX
|
77
|
+
|
78
|
+
method_option :minimum_gap,
|
79
|
+
default: -10000,
|
80
|
+
desc: 'Minimum gap distans to be reported',
|
81
|
+
type: :numeric
|
82
|
+
|
83
|
+
method_option :maximum_gap,
|
84
|
+
default: 2500,
|
85
|
+
desc: 'Maximum gap distans to be reported',
|
86
|
+
type: :numeric
|
87
|
+
|
88
|
+
# Print "NAME1,NAME2,DISTANCE" CSV lines on STDOUT for every pair of a
# fragment in BEDgz1 and a fragment in BEDgz2 on the same chromosome whose
# distance satisfies minimum_gap <= distance < maximum_gap.
#
# bedgz1, bedgz2 - gzip-compressed BED files.
#
# The distance is computed as
#   '+' fragments of BEDgz1:  end(fragment 2)   - start(fragment 1) + 1
#   other fragments:          end(fragment 1)   - start(fragment 2) + 1
# Chromosomes are processed in parallel processes, each writing its own
# temporary CSV file; the files are concatenated to STDOUT and removed.
def gap(bedgz1, bedgz2)
  prefix = options.coreutils_prefix

  # Block forms of IO.popen close the pipes and reap the children.
  chrs = IO.popen("unpigz -c #{bedgz1} #{bedgz2} | #{prefix}cut -f 1 | #{prefix}uniq | #{prefix}sort -u") do |io|
    io.each_line.map(&:rstrip).uniq
  end

  max = options.maximum_gap
  min = options.minimum_gap
  sep = /\t/
  tmpfiles = {}
  chrs.each do |chr|
    tmpfiles[chr] = get_temporary_path('gap', 'csv', false)
  end

  Parallel.each(chrs, in_processes: options.parallel) do |chr|
    # All fragments of BEDgz2 on this chromosome: [start, end, name, ...].
    targets = IO.popen("gunzip -c #{bedgz2} | grep '^#{chr}\t'") do |io|
      io.each_line.map do |line|
        _chr, start, stop, *rest = line.rstrip.split(sep)
        [start.to_i, stop.to_i, *rest]
      end
    end

    File.open(tmpfiles[chr], 'w') do |out|
      IO.popen("gunzip -c #{bedgz1} | grep '^#{chr}\t'") do |io|
        io.each_line do |line|
          _chr, start, stop, name, _score, str = line.rstrip.split(sep)
          plus = str == '+'
          from = start.to_i
          to = stop.to_i
          targets.each do |bed|
            dist = plus ? bed[1] - from + 1 : to - bed[0] + 1
            out.puts [name, bed[2], dist].join(',') if min <= dist && dist < max
          end
        end
      end
    end
  end

  paths = tmpfiles.values
  # With no chromosomes a bare `cat` would block reading STDIN.
  system "cat #{paths.join(' ')}" unless paths.empty?
  # Pass the path list (an Array, as `find` does), not the chr => path Hash.
  unlink_files(paths)
end
|
133
|
+
|
134
|
+
end
|
135
|
+
end
|