bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
data/lib/bio-gadget/femrg.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'femrg [GTF...]', 'Extract and merge overlapping first exons.'
|
5
|
-
|
6
|
-
# Extract the first exon of every transcript from GTF input and merge
# overlapping first exons, per chromosome and strand, into "FE<n>" records.
#
# Only lines containing "<TAB>exon<TAB>" are considered.  For each merged
# cluster one `transcript` line and one `exon` line are printed to standard
# output; the exon line lists the contributing transcript ids in its
# `member_ids` attribute.
#
# @param gtf_files [Array<String>] GTF paths; when empty, grep is started
#   without file operands and therefore reads this process's standard input
# @return [void]
def femrg(*gtf_files)
  # chromosome => strand => transcript_id => [start, stop, exon_number]
  first_exons = {}

  # Argument-vector form of popen: no shell is involved, so paths with
  # spaces or metacharacters reach grep verbatim.  (The former
  # `gtf_files.nil?` test could never be true for a splat parameter.)
  IO.popen(['grep', '-h', "\texon\t", *gtf_files]) do |fp|
    fp.each_line do |line|
      # Columns 1,4,5,7,9 of a GTF record: seqname, start, end, strand, attributes.
      chr, sstart, sstop, str, attr = line.rstrip.split(/\t/).values_at(0, 3, 4, 6, 8)
      transcripts = ((first_exons[chr] ||= {})[str] ||= {})
      id = attr.match(/transcript_id \"([^\"]+)\"/).to_a[1]
      en = attr.match(/exon_number \"(\d+)\"/).to_a[1].to_i
      known = transcripts[id]
      # Keep the lowest exon_number on '+' and the highest on '-'.
      if known.nil? ||
         (str == '+' && en < known[2]) ||
         (str == '-' && known[2] < en)
        transcripts[id] = [sstart.to_i, sstop.to_i, en]
      end
    end
  end

  idx = 0
  first_exons.each do |chr, by_strand|
    by_strand.each do |str, transcripts|
      ids = transcripts.keys.sort { |a, b| transcripts[a][0] <=> transcripts[b][0] }

      # Sweep the start-sorted exons, growing the current cluster while the
      # next exon starts at or before the cluster's current stop.
      clusters = []
      members = [ids.shift]
      start, stop = transcripts[members.first]
      ids.each do |nid|
        nstart, nstop = transcripts[nid]
        if stop < nstart
          clusters.push([members, start, stop])
          members = [nid]
          start = nstart
          stop = nstop
        else
          members.push(nid)
          start, stop = [start, stop, nstart, nstop].minmax
        end
      end
      clusters.push([members, start, stop])

      clusters.each do |cluster_members, cstart, cstop|
        attr = "gene_id \"FE#{idx}\"; transcript_id \"FE#{idx}\"; "
        puts [chr, 'bio-gadget:femrg', 'transcript', cstart, cstop, 1000, str, '.', attr].join("\t")
        puts [chr, 'bio-gadget:femrg', 'exon', cstart, cstop, 1000, str, '.', "#{attr}exon_number \"1\"; member_ids \"#{cluster_members.join('|')}\""].join("\t")
        idx += 1
      end
    end
  end
end
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
data/lib/bio-gadget/fqxz.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
require 'parallel'
require 'pathname'
require 'shellwords'
|
3
|
-
|
4
|
-
module Bio
|
5
|
-
class Gadget < Thor
|
6
|
-
|
7
|
-
namespace :bio
|
8
|
-
|
9
|
-
desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
|
10
|
-
# (Re)compress every `*.fq`, `*.fq.gz` and `*.fq.bz2` file in the current
# directory into `<name>.fq.xz`, one Parallel worker item per input file.
#
# Inputs whose `.fq.xz` counterpart already exists are skipped.  xz's stderr
# and the output of the subsequent `xz -t` integrity test are collected in
# `<name>.fq.xz.log`.
#
# @return [Array] the per-file results collected by Parallel.map
def fqxz
  Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) do |fqfilename|
    xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
    next if xzfilename.exist?

    decompressor = case fqfilename.extname
                   when '.gz'  then 'gunzip -c'
                   when '.bz2' then 'bunzip2 -c'
                   else 'cat'
                   end

    # File names are spliced into a shell pipeline, so escape them; otherwise
    # a name containing spaces or shell metacharacters breaks the command.
    src = Shellwords.escape(fqfilename.to_s)
    dst = Shellwords.escape(xzfilename.to_s)

    puts "compressing #{xzfilename}..."
    system "#{decompressor} #{src} | xz -z -e -c > #{dst} 2> #{dst}.log"
    system "xz -t #{dst} >> #{dst}.log 2>&1"
  end
end
|
28
|
-
|
29
|
-
end
|
30
|
-
end
|
data/lib/bio-gadget/peak.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
require 'mkfifo'
|
2
|
-
require 'parallel'
|
3
|
-
|
4
|
-
module Bio
|
5
|
-
class Gadget < Thor
|
6
|
-
|
7
|
-
desc 'peak WIG1,WIG2,... [GTF]', <<DESC
|
8
|
-
Find peak within each exon from (gzipped) variableStep wigs by a majority vote. It will read from a standard input if no GTF option.
|
9
|
-
DESC
|
10
|
-
# Find, for every exon line of a GTF, the peak position among several
# gzipped variableStep wiggle tracks by majority vote.
#
# Each track votes for the in-exon position with its highest value; ties are
# broken towards the lowest position on '+' and the highest otherwise.  The
# position with most votes (same tie-break) is written as
# "oId<TAB>exon_number<TAB>position" to a FIFO that a forked `cat` copies to
# standard output; the position is empty when no track has signal in the exon.
#
# @param wigs [String] comma-separated paths of gzipped wiggle files
# @param gtf [String] GTF path (defaults to standard input)
# @return [void]
def peak(wigs, gtf="/dev/stdin")
  # track index => chromosome => position => value
  nchrpos2val = {}
  # each_with_index gives every listed track its own slot, even when the
  # same path is listed twice (index(wig) made duplicates share one slot).
  wigs.split(/,/).each_with_index do |wig, n|
    nchrpos2val[n] = {}
    chr = ''
    IO.popen(['zcat', wig]) do |fp|
      fp.each_line do |line|
        next if line.start_with?('track')
        cols = line.rstrip.split(/\s+/)
        if cols[0] == 'variableStep'
          chr = cols[1].match(/chrom=(\S+)$/).to_a[1]
          nchrpos2val[n][chr] ||= {}
        else
          nchrpos2val[n][chr][cols[0].to_i] = cols[1].to_f
        end
      end
    end
  end

  # chromosome => [[strand, oId, exon_number, start, stop], ...]
  chr2exon = {}
  IO.popen(['grep', 'exon', gtf]) do |fp|
    fp.each_line do |line|
      cols = line.rstrip.split(/\t/)
      oid = cols[8].match(/oId \"([^\"]+)/).to_a[1]
      oid = "#{cols[0]}.#{oid}" if cols[0] =~ /^RNA_SPIKE_/
      exn = cols[8].match(/exon_number \"(\d+)\"/).to_a[1]
      (chr2exon[cols[0]] ||= []).push([cols[6], oid, exn, cols[3].to_i, cols[4].to_i])
    end
  end

  # Worker processes all write to one FIFO; a forked `cat` drains it to stdout.
  fifopath = mytemppath('fifo-')
  File.mkfifo(fifopath)
  Kernel.fork { exec 'cat', fifopath }

  File.open(fifopath, 'w') do |fp|
    Parallel.each(chr2exon.keys,
                  :in_processes => Parallel.processor_count) do |chr|
      chr2exon[chr].each do |str, oid, exn, start, stop|
        # Highest count/value first; ties resolved by strand-aware position.
        rank = lambda do |scores|
          scores.keys.min_by { |pos| [-scores[pos], str == '+' ? pos : -pos] }
        end

        votes = Hash.new(0)
        nchrpos2val.each_value do |chrpos2val|
          pos2val = chrpos2val[chr]
          next if pos2val.nil?
          in_exon = pos2val.select { |pos, _val| start <= pos && pos <= stop }
          next if in_exon.empty?
          votes[rank.call(in_exon)] += 1
        end

        summit = votes.empty? ? '' : rank.call(votes)
        fp.syswrite([oid, exn, summit].join("\t") + "\n")
      end
    end
  end

  Process.waitall
end
|
92
|
-
|
93
|
-
end
|
94
|
-
end
|
data/lib/bio-gadget/qvstat.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
namespace :bio
|
5
|
-
|
6
|
-
desc 'qvstat QUAL', 'statistics of quality values in *.qual file'
|
7
|
-
# Tabulate how often each quality value occurs at each column of a *.qual
# file and write the table next to the input.
#
# Lines starting with '#' or '>' are skipped.  Every output line is
# "<quality value> <count at column 0> <count at column 1> ...", ordered by
# the integer value of the quality value.  Columns at which a value never
# occurred before its last occurrence are left empty, as before.
#
# The output name replaces a trailing ".qual" or ".qual.gz" with ".qvstat".
# For any other input name ".qvstat" is appended, so the input file is never
# overwritten.
#
# @param qualfile [String] path handed to `myopen`
# @return [void]
def qvstat(qualfile)
  # quality value (as written) => per-column occurrence counts
  stat = {}
  myopen(qualfile) do |fp|
    fp.each do |line|
      next if line.start_with?('#', '>')
      line.split.each_with_index do |qv, i|
        counts = (stat[qv] ||= [])
        counts[i] = counts[i].to_i + 1
      end
    end
  end

  # Dots are escaped and the match is anchored at the very end of the name.
  statfile = qualfile.sub(/\.qual(\.gz)?\z/, '.qvstat')
  statfile = "#{qualfile}.qvstat" if statfile == qualfile

  File.open(statfile, 'w') do |out|
    stat.keys.sort { |a, b| a.to_i <=> b.to_i }.each do |qv|
      out.puts "#{qv} #{stat[qv].join(' ')}"
    end
  end
end
|
32
|
-
|
33
|
-
end
|
34
|
-
end
|
data/lib/bio-gadget/rgt2mtx.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'rgt2mtx [RGT]', <<DESC
|
5
|
-
Convert cuffdiff read group tracking file (*.read_group_tracking) into tab-separated matrix. If no given name of tracking file, it reads from standard input.
|
6
|
-
DESC
|
7
|
-
option 'gtf', :aliases => '-g', :type => :string, :desc => 'GTF to revert old transcript_id (oId) renamed by cuffcompare. Moreover, "RNA_SPIKE_*" chromosome names will be inserted for the transcripts aligned on the forward strand of spike-in sequences.'
|
8
|
-
option 'sample', :aliases => '-s', :type => :string, :desc => 'Mapping from condition/replicate to sample ID for the column names of output matrix. Tab-separated text with three columns of condition, replicate and the sample ID.'
|
9
|
-
# Convert a cuffdiff read-group-tracking table into a tab-separated matrix
# with one row per tracking id and one column per "condition|replicate".
#
# The first input line (column header) is dropped and the rest is sorted by
# `sort -k 1` so that rows of one tracking id are adjacent.  Column 4 of each
# row is used as the cell value.  A header row is printed before the first
# data row, including when the input holds only one tracking id; nothing is
# printed for input without data rows.
#
# Reads `options['gtf']` (optional: maps transcript_id to oId, prefixed with
# the chromosome name for RNA_SPIKE_* references) and `options['sample']`
# (optional: tab-separated condition, replicate, sample id used as column
# labels).
#
# @param rgt [String] tracking file path (defaults to standard input)
# @return [void]
def rgt2mtx(rgt="/dev/stdin")
  tid2oid = {}
  unless options['gtf'].nil?
    File.foreach(options['gtf']) do |line|
      cols = line.rstrip.split(/\t/)
      tid = cols[8].match(/transcript_id \"([^\"]+)/).to_a[1]
      oid = cols[8].match(/oId \"([^\"]+)/).to_a[1]
      oid = "#{cols[0]}.#{oid}" if cols[0] =~ /^RNA_SPIKE_/
      tid2oid[tid] = oid
    end
  end

  cr2sid = {}
  unless options['sample'].nil?
    File.foreach(options['sample']) do |line|
      c, r, sid = line.rstrip.split(/\t/)
      cr2sid["#{c}|#{r}"] = sid
    end
  end

  # "condition|replicate" => value for the id currently being collected.
  # Deliberately not reset between ids, matching the previous behaviour.
  raws = {}
  header_pending = true
  emit_row = lambda do |row_id|
    columns = raws.keys.sort
    if header_pending
      labels = options['sample'].nil? ? columns : columns.map { |cr| cr2sid[cr] }
      header = [''] + labels
      puts header.join("\t")
      header_pending = false
    end
    row = [tid2oid.fetch(row_id, row_id)] + raws.values_at(*columns)
    puts row.join("\t")
  end

  id = nil
  # tail | sort without a shell: sort's stdin is tail's output pipe.
  IO.popen(['tail', '-n', '+2', rgt]) do |body|
    IO.popen(['sort', '-k', '1'], :in => body) do |sorted|
      sorted.each_line do |line|
        cols = line.rstrip.split(/\t/)
        id = cols[0] if id.nil?
        if id != cols[0]
          emit_row.call(id)
          id = cols[0]
        end
        raws["#{cols[1]}|#{cols[2]}"] = cols[3]
      end
    end
  end
  emit_row.call(id) unless id.nil?
end
|
58
|
-
|
59
|
-
end
|
60
|
-
end
|
data/lib/bio-gadget/version.rb
DELETED
data/lib/bio-gadget/wig5p.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'wig5p BAM', <<DESC
|
5
|
-
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
|
6
|
-
DESC
|
7
|
-
option 'reverse', :aliases => '-r', :type => :boolean, :default => false
|
8
|
-
option 'name', :aliases => '-n', :type => :string, :default => ' '
|
9
|
-
# Convert BAM alignments into a variableStep wiggle track that counts the
# 5'-ends of alignments on one strand, scaled by the number of spike-in reads.
#
# Steps, all through external tools (samtools, grep, cut, sort, uniq,
# bamToBed):
# 1. count distinct read names aligned to RNA_SPIKE_* with tag XS:A:+;
#    abort when there are none; scale factor is 2234.8 / that count
# 2. count how many alignment lines each read name has
# 3. keep alignments of the requested strand (XS:A:+ or XS:A:-) in a
#    temporary BAM obtained from `mytemppath`
# 4. add 1 / (alignment lines of the read) at each alignment's start column
#    (forward) or end column (reverse) of the bamToBed output
#
# Reads `options['reverse']` (strand selection, sign and colour of the
# track) and `options['name']` (track name).
#
# NOTE(review): `bam` and the temporary path are still interpolated into
# shell command lines unescaped; paths with spaces will not work.
#
# @param bam [String] path of the BAM file
# @return [void]
def wig5p(bam)
  reverse = options['reverse']
  str = reverse ? '-' : '+'

  # Block form of popen so every pipe is closed and its child reaped.
  spike_reads = IO.popen("samtools view #{bam} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u") do |fp|
    fp.each_line.count
  end
  abort "No spike-in reads." if spike_reads.zero?
  spike = 2234.8 / spike_reads

  # read name => number of alignment lines carrying that name
  acc2dups = {}
  IO.popen("samtools view #{bam} | cut -f 1 | sort | uniq -c") do |fp|
    fp.each_line do |line|
      dups, acc = line.strip.split(/\s/)
      acc2dups[acc] = dups.to_f
    end
  end

  tmpbam = mytemppath('.bam')
  abort unless system("samtools view -h #{bam} | grep -E 'XS:A:\\#{str}|^@SQ' | samtools view -S -b - > #{tmpbam}")

  # chromosome => position => accumulated weight
  cnts = {}
  IO.popen("bamToBed -i #{tmpbam} | cut -f 1,#{reverse ? 3 : 2},4") do |fp|
    fp.each_line do |line|
      chr, poss, acc = line.rstrip.split(/\t/)
      by_pos = (cnts[chr] ||= {})
      pos = poss.to_i
      by_pos[pos] = by_pos.fetch(pos, 0.0) + 1.0 / acc2dups[acc]
    end
  end

  # "visibility" is the UCSC track-line attribute (it was misspelled
  # "visivility", so the setting had no effect).
  puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visibility=full maxHeightPixels=128:64:16 color=#{reverse ? '0,128,255' : '255,128,0'}"

  offset = reverse ? 0 : 1
  signal = reverse ? -spike : spike
  cnts.each do |chr, posvals|
    puts "variableStep chrom=#{chr}"
    posvals.keys.sort.each { |pos| puts [pos + offset, signal * posvals[pos]].join("\t") }
  end
end
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
data/lib/bio-gadget/wigchr.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
namespace :bio
|
5
|
-
|
6
|
-
desc 'wigchr WIG CHR', 'extract wiggle track on specified chromosome'
|
7
|
-
# Print the part of a wiggle file that belongs to one chromosome.
#
# A `fixedStep`/`variableStep` declaration line is printed, and its data
# lines with it, only when it names `chr` (followed by whitespace).  Lines
# that are neither declarations nor numeric data lines (e.g. `track` lines)
# are always printed.
#
# @param wigfile [String] path handed to `myopen`
# @param chr [String] chromosome name, matched literally
# @return [void]
def wigchr(wigfile, chr)
  # Escape the name so characters such as '.' match only themselves, and
  # build the pattern once instead of once per input line.
  declaration = /chrom=#{Regexp.escape(chr)}\s/
  target = false
  myopen(wigfile) do |fp|
    fp.each do |line|
      if line =~ /^(fixed|variable)Step/
        target = (declaration =~ line) ? true : false
        puts line if target
      elsif line =~ /^[-+]?\.?\d/
        # Numeric data line; a leading sign or decimal point is accepted so
        # that e.g. negative fixedStep values are filtered like other data.
        puts line if target
      else
        puts line
      end
    end
  end
end
|
26
|
-
|
27
|
-
end
|
28
|
-
end
|