bio-gadget 0.4.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
data/lib/bio-gadget/femrg.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'femrg [GTF...]', 'Extract and merge overlapping first exons.'
|
5
|
-
|
6
|
-
# Extract the first exon of every transcript from GTF input and merge
# overlapping first exons into clusters. Each cluster is printed to stdout
# as a GTF 'transcript' record plus an 'exon' record named FE0, FE1, ...
#
# gtf_files - zero or more GTF paths; '-' (or no path at all) reads $stdin.
#
# Only lines containing a tab-delimited 'exon' feature are used. For each
# transcript_id the kept exon is the one with the lowest exon_number on the
# '+' strand and the highest exon_number on the '-' strand. Clusters are
# built per chromosome and strand.
#
# The input is read directly in Ruby rather than through a `grep | cut`
# shell pipeline, so paths containing spaces or shell metacharacters work.
# A missing file raises Errno::ENOENT.
def femrg(*gtf_files)
  # chr => strand => transcript_id => [start, stop, exon_number]
  first_exons = Hash.new do |by_chr, chr|
    by_chr[chr] = Hash.new { |by_str, str| by_str[str] = {} }
  end

  sources = gtf_files.empty? ? ['-'] : gtf_files
  sources.each do |path|
    input = path == '-' ? $stdin : File.open(path)
    begin
      input.each_line do |line|
        next unless line.include?("\texon\t")

        # Columns: seqname, start, end, strand, attributes.
        chr, sstart, sstop, str, attr = line.rstrip.split(/\t/).values_at(0, 3, 4, 6, 8)
        transcripts = first_exons[chr][str]
        id = attr.match(/transcript_id \"([^\"]+)\"/).to_a[1]
        en = attr.match(/exon_number \"(\d+)\"/).to_a[1].to_i
        known = transcripts[id]
        if known.nil? || (str == '+' && en < known[2]) || (str == '-' && known[2] < en)
          transcripts[id] = [sstart.to_i, sstop.to_i, en]
        end
      end
    ensure
      input.close unless input.equal?($stdin)
    end
  end

  idx = 0 # running cluster number, shared across chromosomes and strands
  first_exons.each do |chr, by_strand|
    by_strand.each do |str, transcripts|
      ordered = transcripts.keys.sort { |a, b| transcripts[a][0] <=> transcripts[b][0] }

      # Sweep the exons in start order, opening a new cluster whenever the
      # next exon begins after the current cluster ends.
      clusters = []
      members = []
      start = stop = nil
      ordered.each do |tid|
        nstart, nstop, = transcripts[tid]
        if members.empty?
          members, start, stop = [tid], nstart, nstop
        elsif stop < nstart
          clusters.push([members, start, stop])
          members, start, stop = [tid], nstart, nstop
        else
          members.push(tid)
          start, stop = [start, stop, nstart, nstop].minmax
        end
      end
      clusters.push([members, start, stop])

      clusters.each do |ids, first, last|
        attr = "gene_id \"FE#{idx}\"; transcript_id \"FE#{idx}\"; "
        puts([chr, 'bio-gadget:femrg', 'transcript', first, last, 1000, str, '.', attr].join("\t"))
        puts([chr, 'bio-gadget:femrg', 'exon', first, last, 1000, str, '.',
              "#{attr}exon_number \"1\"; member_ids \"#{ids.join('|')}\""].join("\t"))
        idx += 1
      end
    end
  end
end
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
data/lib/bio-gadget/fqxz.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
require 'parallel'
|
2
|
-
require 'pathname'
|
3
|
-
|
4
|
-
module Bio
|
5
|
-
class Gadget < Thor
|
6
|
-
|
7
|
-
namespace :bio
|
8
|
-
|
9
|
-
desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
|
10
|
-
# Recompress every *.fq, *.fq.gz and *.fq.bz2 file in the current directory
# into a matching *.fq.xz file, processing the files through Parallel.map.
#
# A file is skipped when its *.fq.xz target already exists. For each
# processed file, stderr of the compression and the output of the
# subsequent `xz -t` integrity test are collected in "<target>.log".
#
# File names are shell-escaped before being placed in the command lines,
# so names containing spaces or shell metacharacters are handled.
def fqxz
  require 'shellwords' # provides Shellwords.escape

  Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) do |fqfilename|
    xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
    next if xzfilename.exist?

    decompressor = case fqfilename.extname
                   when '.gz' then 'gunzip -c'
                   when '.bz2' then 'bunzip2 -c'
                   else 'cat'
                   end
    source = Shellwords.escape(fqfilename.to_s)
    target = Shellwords.escape(xzfilename.to_s)
    log = Shellwords.escape("#{xzfilename}.log")

    puts "compressing #{xzfilename}..."
    system "#{decompressor} #{source} | xz -z -e -c > #{target} 2> #{log}"
    system "xz -t #{target} >> #{log} 2>&1"
  end
end
|
28
|
-
|
29
|
-
end
|
30
|
-
end
|
data/lib/bio-gadget/peak.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
require 'mkfifo'
|
2
|
-
require 'parallel'
|
3
|
-
|
4
|
-
module Bio
|
5
|
-
class Gadget < Thor
|
6
|
-
|
7
|
-
desc 'peak WIG1,WIG2,... [GTF]', <<DESC
|
8
|
-
Find peak within each exon from (gzipped) variableStep wigs by a majority vote. It will read from a standard input if no GTF option.
|
9
|
-
DESC
|
10
|
-
# Find, for every exon in a GTF, the peak position voted for by a set of
# gzipped variableStep wig files, and print "oId<TAB>exon_number<TAB>peak"
# for each exon (peak is empty when no wig has a value inside the exon).
#
# wigs - comma-separated list of gzipped wig paths.
# gtf  - GTF path; defaults to standard input.
#
# Each wig votes for its highest-valued position inside the exon; the
# position with the most votes wins. Ties are broken towards the lowest
# position on the '+' strand and the highest position otherwise.
#
# External commands are started with argument arrays instead of shell
# strings, so paths are never interpreted by a shell. Every wig gets its
# own slot, even when the same path is listed more than once.
def peak(wigs, gtf="/dev/stdin")
  # wig index => chromosome => position => value
  nchrpos2val = {}
  wigs.split(/,/).each_with_index do |wig, n|
    chrpos2val = nchrpos2val[n] = {}
    chr = ''
    IO.popen(['zcat', wig]) do |fp|
      fp.each_line do |line|
        next if line.start_with?('track')

        cols = line.rstrip.split(/\s+/)
        if cols[0] == 'variableStep'
          chr = cols[1].match(/chrom=(\S+)$/).to_a[1]
          chrpos2val[chr] ||= {}
        else
          chrpos2val[chr][cols[0].to_i] = cols[1].to_f
        end
      end
    end
  end

  # chromosome => [[strand, oId, exon_number, start, stop], ...]
  chr2exon = {}
  IO.popen(['grep', 'exon', gtf]) do |fp|
    fp.each_line do |line|
      cols = line.rstrip.split(/\t/)
      oid = cols[8].match(/oId \"([^\"]+)/).to_a[1]
      oid = "#{cols[0]}.#{oid}" if cols[0] =~ /^RNA_SPIKE_/
      exn = cols[8].match(/exon_number \"(\d+)\"/).to_a[1]
      (chr2exon[cols[0]] ||= []).push([cols[6], oid, exn, cols[3].to_i, cols[4].to_i])
    end
  end

  # Key with the highest score; ties go to the lowest key on '+' and to the
  # highest key otherwise.
  best = lambda do |score, strand|
    score.keys.min do |a, b|
      if score[b] == score[a]
        strand == '+' ? (a <=> b) : (b <=> a)
      else
        score[b] <=> score[a]
      end
    end
  end

  # NOTE(review): mytemppath is defined outside this block; it presumably
  # returns an unused temporary path -- confirm who removes the FIFO.
  fifopath = mytemppath('fifo-')
  File.mkfifo(fifopath)

  # A child `cat` copies everything written to the FIFO to stdout.
  Kernel.fork { exec 'cat', fifopath }

  fp = File.open(fifopath, 'w')
  begin
    Parallel.each(chr2exon.keys,
                  :in_processes => Parallel.processor_count) do |chr|
      chr2exon[chr].each do |str, oid, exn, start, stop|
        votes = Hash.new(0)
        nchrpos2val.each_value do |chrpos2val|
          pos2val = chrpos2val[chr]
          next if pos2val.nil?

          in_exon = pos2val.select { |pos, _val| start <= pos && pos <= stop }
          next if in_exon.empty?

          votes[best.call(in_exon, str)] += 1
        end
        peak = votes.empty? ? '' : best.call(votes, str)
        fp.syswrite([oid, exn, peak].join("\t") + "\n")
      end
    end
  ensure
    # Closing the writer lets the `cat` child see EOF, also on errors.
    fp.close
    Process.waitall
  end
end
|
92
|
-
|
93
|
-
end
|
94
|
-
end
|
data/lib/bio-gadget/qvstat.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
namespace :bio
|
5
|
-
|
6
|
-
desc 'qvstat QUAL', 'statistics of quality values in *.qual file'
|
7
|
-
# Count, per quality value and per read position, how often the value
# occurs in a *.qual file, and write the table to a *.qvstat file.
#
# qualfile - path of the quality file; lines starting with '#' or '>' are
#            skipped, all other lines are split on whitespace.
#
# Output has one line per quality value (ordered numerically): the value
# followed by its counts for position 0, 1, 2, ... separated by spaces.
# Positions where the value was never seen are printed as empty fields.
#
# The output name replaces a trailing ".qual" or ".qual.gz" by ".qvstat".
# For any other input name ".qvstat" is appended, so the input file is
# never overwritten by its own statistics.
def qvstat(qualfile)
  stat = Hash.new { |counts, qv| counts[qv] = [] }
  myopen(qualfile) do |fp|
    fp.each do |line|
      next if line.start_with?('#', '>')

      line.rstrip.split.each_with_index do |qv, i|
        stat[qv][i] = (stat[qv][i] || 0) + 1
      end
    end
  end

  statfile = qualfile.sub(/\.qual(\.gz)?$/, '.qvstat')
  statfile = "#{qualfile}.qvstat" if statfile == qualfile

  File.open(statfile, 'w') do |out|
    stat.keys.sort { |a, b| a.to_i <=> b.to_i }.each do |qv|
      out.puts "#{qv} #{stat[qv].join(' ')}"
    end
  end
end
|
32
|
-
|
33
|
-
end
|
34
|
-
end
|
data/lib/bio-gadget/rgt2mtx.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'rgt2mtx [RGT]', <<DESC
|
5
|
-
Convert cuffdiff read group tracking file (*.read_group_tracking) into tab-separated matrix. If no given name of tracking file, it reads from standard input.
|
6
|
-
DESC
|
7
|
-
option 'gtf', :aliases => '-g', :type => :string, :desc => 'GTF to revert old transcript_id (oId) renamed by cuffcompare. Moreover, "RNA_SPIKE_*" chromosome names will be inserted for the transcripts aligned on the forward strand of spike-in sequences.'
|
8
|
-
option 'sample', :aliases => '-s', :type => :string, :desc => 'Mapping from condition/replicate to sample ID for the column names of output matrix. Tab-separated text with three columns of condition, replicate and the sample ID.'
|
9
|
-
# Convert a cuffdiff read group tracking file into a tab-separated matrix
# on stdout: one header row, then one row per tracking id with one column
# per "condition|replicate" pair (columns in sorted order).
#
# rgt - path of the tracking file; defaults to standard input. Its first
#       line is skipped and the remaining lines are sorted with sort(1).
#
# Options read from `options`:
#   'gtf'    - GTF used to map transcript_id back to oId; ids on
#              RNA_SPIKE_* chromosomes are prefixed with the chromosome.
#   'sample' - three-column table (condition, replicate, sample id) used
#              to label the header columns.
#
# The header is printed before the first data row, including when the
# input holds a single tracking id. Values are cleared after each row so a
# missing cell prints empty instead of repeating the previous id's value.
# Empty input prints nothing.
def rgt2mtx(rgt="/dev/stdin")
  require 'shellwords' # provides Shellwords.escape

  tid2oid = {}
  unless options['gtf'].nil?
    File.foreach(options['gtf']) do |line|
      cols = line.rstrip.split(/\t/)
      tid = cols[8].match(/transcript_id \"([^\"]+)/).to_a[1]
      oid = cols[8].match(/oId \"([^\"]+)/).to_a[1]
      oid = "#{cols[0]}.#{oid}" if cols[0] =~ /^RNA_SPIKE_/
      tid2oid[tid] = oid
    end
  end

  cr2sid = {}
  unless options['sample'].nil?
    File.foreach(options['sample']) do |line|
      c, r, sid = line.rstrip.split(/\t/)
      cr2sid["#{c}|#{r}"] = sid
    end
  end

  id = nil
  header = true
  raws = {} # "condition|replicate" => value for the current id

  # Print the row collected for the current id (preceded by the header the
  # first time), then blank the values while keeping the column set.
  emit_row = lambda do
    columns = raws.keys.sort
    if header
      labels = options['sample'].nil? ? columns : columns.map { |cr| cr2sid[cr] }
      puts(([''] + labels).join("\t"))
      header = false
    end
    puts(([tid2oid.fetch(id, id)] + columns.map { |k| raws[k] }).join("\t"))
    columns.each { |k| raws[k] = nil }
  end

  IO.popen("tail -n +2 #{Shellwords.escape(rgt)} | sort -k 1") do |sorted|
    sorted.each_line do |line|
      cols = line.rstrip.split(/\t/)
      id = cols[0] if id.nil?
      if id != cols[0]
        emit_row.call
        id = cols[0]
      end
      raws["#{cols[1]}|#{cols[2]}"] = cols[3]
    end
  end
  emit_row.call unless id.nil?
end
|
58
|
-
|
59
|
-
end
|
60
|
-
end
|
data/lib/bio-gadget/version.rb
DELETED
data/lib/bio-gadget/wig5p.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'wig5p BAM', <<DESC
|
5
|
-
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
|
6
|
-
DESC
|
7
|
-
option 'reverse', :aliases => '-r', :type => :boolean, :default => false
|
8
|
-
option 'name', :aliases => '-n', :type => :string, :default => ' '
|
9
|
-
# Convert BAM alignments into a variableStep wiggle track on stdout,
# counting alignments at their 5'-end on one strand.
#
# bam - path of the BAM file.
#
# Options read from `options`:
#   'reverse' - when true, use '-' strand alignments (XS:A:-), take the
#               BED end column as position and emit negative signals.
#   'name'    - track name placed in the track line.
#
# Each alignment contributes 1 / (number of alignments of the same read
# name). Counts are scaled by 2234.8 / (number of distinct read names
# aligned to RNA_SPIKE_* with XS:A:+); the command aborts when there are
# none. Requires samtools and bamToBed on the PATH.
#
# Paths are shell-escaped before being placed in the command lines, and
# every pipe is closed once it has been read.
def wig5p(bam)
  require 'shellwords' # provides Shellwords.escape

  reverse = options['reverse']
  str = reverse ? '-' : '+'
  bam_arg = Shellwords.escape(bam)

  spike_reads = IO.popen("samtools view #{bam_arg} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u") do |io|
    io.each_line.count
  end
  abort "No spike-in reads." if spike_reads.zero?
  spike = 2234.8 / spike_reads

  # read name => number of alignments reported for that read
  acc2dups = {}
  IO.popen("samtools view #{bam_arg} | cut -f 1 | sort | uniq -c") do |io|
    io.each_line do |line|
      dups, acc = line.strip.split(/\s/)
      acc2dups[acc] = dups.to_f
    end
  end

  # NOTE(review): mytemppath is defined outside this block; it presumably
  # returns a temporary path ending in the given suffix -- confirm.
  tmpbam = mytemppath('.bam')
  tmpbam_arg = Shellwords.escape(tmpbam)
  abort unless system("samtools view -h #{bam_arg} | grep -E 'XS:A:\\#{str}|^@SQ' | samtools view -S -b - > #{tmpbam_arg}")

  # chromosome => position => weighted count
  cnts = Hash.new { |by_chr, chr| by_chr[chr] = Hash.new(0.0) }
  IO.popen("bamToBed -i #{tmpbam_arg} | cut -f 1,#{reverse ? 3 : 2},4") do |io|
    io.each_line do |line|
      chr, poss, acc = line.rstrip.split(/\t/)
      cnts[chr][poss.to_i] += 1.0 / acc2dups[acc]
    end
  end

  # "visibility" is the track-line attribute name; it was misspelled before.
  puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visibility=full maxHeightPixels=128:64:16 color=#{reverse ? '0,128,255' : '255,128,0'}"

  # BED starts are 0-based, so forward positions are shifted by one.
  offset = reverse ? 0 : 1
  signal = reverse ? -spike : spike
  cnts.each do |chr, posvals|
    puts "variableStep chrom=#{chr}"
    posvals.keys.sort.each { |pos| puts [pos + offset, signal * posvals[pos]].join("\t") }
  end
end
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
data/lib/bio-gadget/wigchr.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
namespace :bio
|
5
|
-
|
6
|
-
desc 'wigchr WIG CHR', 'extract wiggle track on specified chromosome'
|
7
|
-
# Print the part of a wiggle file that belongs to one chromosome.
#
# wigfile - path of the wiggle file, opened through myopen.
# chr     - chromosome name, matched literally against "chrom=<chr>"
#           followed by whitespace on fixedStep/variableStep lines.
#
# Declaration lines and the data lines below them are printed only for the
# requested chromosome. Every other line (track lines, comments, ...) is
# printed unchanged.
#
# The chromosome name is regex-escaped, so "chr1.1" does not also select
# "chr1x1". Data lines starting with a sign or a decimal point (negative
# fixedStep values) are filtered like any other data line.
def wigchr(wigfile, chr)
  declaration = /^(fixed|variable)Step/
  wanted = /chrom=#{Regexp.escape(chr)}\s/
  data_line = /^[-+.\d]/

  target = false
  myopen(wigfile) do |fp|
    fp.each do |line|
      if declaration =~ line
        target = !(wanted =~ line).nil?
        puts line if target
      elsif data_line =~ line
        puts line if target
      else
        puts line
      end
    end
  end
end
|
26
|
-
|
27
|
-
end
|
28
|
-
end
|