bio-gadget 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.org +12 -9
- data/lib/bio-gadget.rb +3 -3
- data/lib/bio-gadget/rgt2mtx.rb +33 -0
- data/lib/bio-gadget/version.rb +1 -1
- data/lib/bio-gadget/wig5p.rb +51 -0
- metadata +4 -3
- data/lib/bio-gadget/gtfann.rb +0 -93
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a8b08f5a256d452f6399fe4484d6a18811bc3b58
|
4
|
+
data.tar.gz: 532b1903529b32c549eb321e363d0dc8e8efaf27
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23f150133d857f16a9ff0a326201fb044ec167d51fcf5addc23cb445ef219b75ff84038d457c3325fe076ce024b91d3afa2ffa33dcbba3c33401b9c75dfaf0bb
|
7
|
+
data.tar.gz: edbfc69fcfa8f7db2003236fc8d7b01046a051955907ad45bb2939096b389d6c3a6f19c79d95dbf3212ed320a20c6f9bf477a5f57e3ce9a542ccedd52ab4e4e5
|
data/README.org
CHANGED
@@ -8,20 +8,23 @@
|
|
8
8
|
|
9
9
|
* Usage
|
10
10
|
|
11
|
-
To check all commands in this package
|
11
|
+
To check all commands and their usage in this package
|
12
12
|
|
13
13
|
: bio-gadget help
|
14
|
-
: bio-gadget list
|
15
14
|
|
16
15
|
Currently available commands are
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
17
|
+
- dedup :: Deduplicate fastq (via STDIN)
|
18
|
+
- demlt :: Demultiplex fastq by barcodes
|
19
|
+
- fqxz :: Automatic (re)compression of *.fq(.gz|.bz2) files
|
20
|
+
- qvstat :: Statistics of quality values in *.qual file
|
21
|
+
- rgt2mtx :: Convert cuffdiff read group tracking file into tab-separated matrix
|
22
|
+
- wig5p :: Convert bam-format alignments into wig-format table
|
23
|
+
- wigchr :: Extract wiggle track on specified chromosome
|
24
|
+
|
25
|
+
** Memo :noexport:
|
26
|
+
- gtfensembl :: Create gtf of ENSEMBL Genes (a.k.a. ensGene.gtf) from XX files of UCSC annotation database; .txt.gz of
|
27
|
+
- gtfucsc :: Create gtf of UCSC Genes from four (knownIsoforms.txt.gz, kgXref.txt.gz, kgTxInfo.txt.gz and knownGene.txt.gz) files of UCSC annotation database. The output gtf contains gene_id, transcript_id and gene_name attributes, so it is convenient to use as a gene/transcript annotation file for tophat/cufflinks etc.
|
25
28
|
|
26
29
|
* Contributing
|
27
30
|
|
data/lib/bio-gadget.rb
CHANGED
@@ -2,8 +2,9 @@ require 'bio-gadget/version'
|
|
2
2
|
require 'bio-gadget/dedup'
|
3
3
|
require 'bio-gadget/demlt'
|
4
4
|
require 'bio-gadget/fqxz'
|
5
|
-
require 'bio-gadget/gtfann'
|
6
5
|
require 'bio-gadget/qvstat'
|
6
|
+
require 'bio-gadget/rgt2mtx'
|
7
|
+
require 'bio-gadget/wig5p'
|
7
8
|
require 'bio-gadget/wigchr'
|
8
9
|
|
9
10
|
require 'tempfile'
|
@@ -11,8 +12,6 @@ require 'tempfile'
|
|
11
12
|
module Bio
|
12
13
|
class Gadget < Thor
|
13
14
|
|
14
|
-
namespace :bio
|
15
|
-
|
16
15
|
private
|
17
16
|
|
18
17
|
def myopen(file, &block)
|
@@ -38,5 +37,6 @@ module Bio
|
|
38
37
|
END {
|
39
38
|
@@mytemppaths.each { |path| File.unlink(path) if File.exist?(path) }
|
40
39
|
}
|
40
|
+
|
41
41
|
end
|
42
42
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Bio
|
2
|
+
class Gadget < Thor
|
3
|
+
|
4
|
+
desc 'rgt2mtx [RGT]', <<DESC
|
5
|
+
Convert cuffdiff read group tracking file into tab-separated matrix. If no given name of tracking file, it reads from standard input.
|
6
|
+
DESC
|
7
|
+
def rgt2mtx(rgt="/dev/stdin")
|
8
|
+
|
9
|
+
id = nil
|
10
|
+
header = true
|
11
|
+
raws = Hash.new
|
12
|
+
open("| tail -n +2 #{rgt} | sort -k 1").each { |line|
|
13
|
+
cols = line.rstrip.split(/\t/)
|
14
|
+
id = cols[0] if id.nil?
|
15
|
+
if id != cols[0]
|
16
|
+
if header
|
17
|
+
puts (['tracking_id'] + raws.keys.sort).join("\t")
|
18
|
+
header = false
|
19
|
+
end
|
20
|
+
tmp = [id]
|
21
|
+
raws.keys.sort.each { |k| tmp.push(raws[k]) }
|
22
|
+
puts tmp.join("\t")
|
23
|
+
id = cols[0]
|
24
|
+
end
|
25
|
+
raws["#{cols[1]}|#{cols[2]}"] = cols[3]
|
26
|
+
}
|
27
|
+
tmp = [id]
|
28
|
+
raws.keys.sort.each { |k| tmp.push(raws[k]) }
|
29
|
+
puts tmp.join("\t")
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
data/lib/bio-gadget/version.rb
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
module Bio
|
2
|
+
class Gadget < Thor
|
3
|
+
|
4
|
+
desc 'wig5p BAM', <<DESC
|
5
|
+
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
|
6
|
+
DESC
|
7
|
+
option 'reverse', :aliases => '-r', :type => :boolean, :default => false
|
8
|
+
option 'name', :aliases => '-n', :type => :string, :default => ' '
|
9
|
+
def wig5p(bam)
|
10
|
+
|
11
|
+
str = options['reverse'] ? '-' : '+'
|
12
|
+
|
13
|
+
tmp = 0.0
|
14
|
+
open("| samtools view #{bam} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u").each { |line| tmp += 1.0 }
|
15
|
+
abort "No spike-in reads." if tmp == 0.0
|
16
|
+
spike = 2234.8/tmp
|
17
|
+
|
18
|
+
acc2dups = Hash.new
|
19
|
+
open("| samtools view #{bam} | cut -f 1 | sort | uniq -c").each { |line|
|
20
|
+
dups, acc = line.strip.split(/\s/)
|
21
|
+
acc2dups[acc] = dups.to_f
|
22
|
+
}
|
23
|
+
|
24
|
+
cnts = Hash.new
|
25
|
+
tmpbam = mytemppath('.bam')
|
26
|
+
abort unless system("samtools view -h #{bam} | grep -E 'XS:A:\\#{str}|^@SQ' | samtools view -S -b - > #{tmpbam}")
|
27
|
+
|
28
|
+
open("| bamToBed -i #{tmpbam} | cut -f 1,#{options['reverse'] ? 3 : 2},4").each { |line|
|
29
|
+
chr, poss, acc = line.rstrip.split(/\t/)
|
30
|
+
cnts[chr] = Hash.new if !cnts.key?(chr)
|
31
|
+
pos = poss.to_i
|
32
|
+
if !cnts[chr].key?(pos)
|
33
|
+
cnts[chr][pos] = 1.0/acc2dups[acc]
|
34
|
+
else
|
35
|
+
cnts[chr][pos] = cnts[chr][pos]+1.0/acc2dups[acc]
|
36
|
+
end
|
37
|
+
}
|
38
|
+
|
39
|
+
puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visivility=full maxHeightPixels=128:64:16 color=#{options['reverse'] ? '0,128,255' : '255,128,0'}"
|
40
|
+
|
41
|
+
offset = options['reverse'] ? 0 : 1
|
42
|
+
signal = options['reverse'] ? -spike : spike
|
43
|
+
cnts.each { |chr, posvals|
|
44
|
+
puts "variableStep chrom=#{chr}"
|
45
|
+
posvals.keys.sort.each { |pos| puts [pos+offset, signal*posvals[pos]].join("\t") }
|
46
|
+
}
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-gadget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shintaro Katayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -100,9 +100,10 @@ files:
|
|
100
100
|
- lib/bio-gadget/dedup.rb
|
101
101
|
- lib/bio-gadget/demlt.rb
|
102
102
|
- lib/bio-gadget/fqxz.rb
|
103
|
-
- lib/bio-gadget/gtfann.rb
|
104
103
|
- lib/bio-gadget/qvstat.rb
|
104
|
+
- lib/bio-gadget/rgt2mtx.rb
|
105
105
|
- lib/bio-gadget/version.rb
|
106
|
+
- lib/bio-gadget/wig5p.rb
|
106
107
|
- lib/bio-gadget/wigchr.rb
|
107
108
|
homepage: https://github.com/shka/ruby-bio-gadget
|
108
109
|
licenses: []
|
data/lib/bio-gadget/gtfann.rb
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'ucscann ISOFORMSGZ XREFGZ [KNOWNGENE]', 'create knownGene.gtf'
|
5
|
-
def ucscann(isoforms, xref, knowngene='/dev/stdin')
|
6
|
-
|
7
|
-
acc2id = Hash.new
|
8
|
-
open("| gunzip -c #{isoforms}").each { |line|
|
9
|
-
id, acc = line.rstrip.split(/\t/)
|
10
|
-
acc2id[acc] = id
|
11
|
-
}
|
12
|
-
|
13
|
-
acc2sym = Hash.new
|
14
|
-
open("| gunzip -c #{xref} | cut -f 1,5").each { |line|
|
15
|
-
acc, sym = line.rstrip.split(/\t/)
|
16
|
-
acc2sym[acc] = sym
|
17
|
-
}
|
18
|
-
|
19
|
-
open("| sort -k1,1 -k4,4n", 'w') { |fp|
|
20
|
-
open(knowngene).each { |line|
|
21
|
-
cols = line.rstrip.split(/\t/)
|
22
|
-
acc, chr, str = cols.values_at(0, 1, 2)
|
23
|
-
cs = cols[5].to_i
|
24
|
-
ce = cols[6].to_i
|
25
|
-
lefts = cols[8].split(/,/)
|
26
|
-
rights = cols[9].split(/,/)
|
27
|
-
prop = "gene_id \"#{acc2id[acc]}\"; transcript_id \"#{acc}\""
|
28
|
-
prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
|
29
|
-
lefts.each_index { |i|
|
30
|
-
next if lefts[i].nil?
|
31
|
-
l = lefts[i].to_i
|
32
|
-
r = rights[i].to_i
|
33
|
-
fp.puts [chr, 'knownGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
|
34
|
-
}
|
35
|
-
if cs != ce
|
36
|
-
fp.puts [chr, 'knownGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
|
37
|
-
lefts.each_index { |i|
|
38
|
-
next if lefts[i].nil?
|
39
|
-
l = lefts[i].to_i
|
40
|
-
next if ce-1 < l
|
41
|
-
r = rights[i].to_i
|
42
|
-
next if r-1 < cs
|
43
|
-
fp.puts [chr, 'knownGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
|
44
|
-
}
|
45
|
-
end
|
46
|
-
}
|
47
|
-
}
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
desc 'ensann GENENAMEGZ [ENSGENE]', 'create ensGene.gtf'
|
52
|
-
def ensann(genename, ensgene='/dev/stdin')
|
53
|
-
|
54
|
-
acc2sym = Hash.new
|
55
|
-
open("| gunzip -c #{genename}").each { |line|
|
56
|
-
acc, sym = line.rstrip.split(/\t/)
|
57
|
-
acc2sym[acc] = sym
|
58
|
-
}
|
59
|
-
|
60
|
-
open("| sort -k1,1 -k4,4n", 'w') { |fp|
|
61
|
-
open(ensgene).each { |line|
|
62
|
-
cols = line.rstrip.split(/\t/)
|
63
|
-
acc, chr, str, id = cols.values_at(1, 2, 3, 12)
|
64
|
-
cs = cols[6].to_i
|
65
|
-
ce = cols[7].to_i
|
66
|
-
lefts = cols[9].split(/,/)
|
67
|
-
rights = cols[10].split(/,/)
|
68
|
-
prop = "gene_id \"#{id}\"; transcript_id \"#{acc}\""
|
69
|
-
prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
|
70
|
-
lefts.each_index { |i|
|
71
|
-
next if lefts[i].nil?
|
72
|
-
l = lefts[i].to_i
|
73
|
-
r = rights[i].to_i
|
74
|
-
fp.puts [chr, 'ensGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
|
75
|
-
}
|
76
|
-
if cs != ce
|
77
|
-
fp.puts [chr, 'ensGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
|
78
|
-
lefts.each_index { |i|
|
79
|
-
next if lefts[i].nil?
|
80
|
-
l = lefts[i].to_i
|
81
|
-
next if ce-1 < l
|
82
|
-
r = rights[i].to_i
|
83
|
-
next if r-1 < cs
|
84
|
-
fp.puts [chr, 'ensGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
|
85
|
-
}
|
86
|
-
end
|
87
|
-
}
|
88
|
-
}
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
93
|
-
end
|