bio-gadget 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.org +12 -9
- data/lib/bio-gadget.rb +3 -3
- data/lib/bio-gadget/rgt2mtx.rb +33 -0
- data/lib/bio-gadget/version.rb +1 -1
- data/lib/bio-gadget/wig5p.rb +51 -0
- metadata +4 -3
- data/lib/bio-gadget/gtfann.rb +0 -93
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a8b08f5a256d452f6399fe4484d6a18811bc3b58
|
4
|
+
data.tar.gz: 532b1903529b32c549eb321e363d0dc8e8efaf27
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23f150133d857f16a9ff0a326201fb044ec167d51fcf5addc23cb445ef219b75ff84038d457c3325fe076ce024b91d3afa2ffa33dcbba3c33401b9c75dfaf0bb
|
7
|
+
data.tar.gz: edbfc69fcfa8f7db2003236fc8d7b01046a051955907ad45bb2939096b389d6c3a6f19c79d95dbf3212ed320a20c6f9bf477a5f57e3ce9a542ccedd52ab4e4e5
|
data/README.org
CHANGED
@@ -8,20 +8,23 @@
|
|
8
8
|
|
9
9
|
* Usage
|
10
10
|
|
11
|
-
To check all commands in this package
|
11
|
+
To list all commands in this package and their usage
|
12
12
|
|
13
13
|
: bio-gadget help
|
14
|
-
: bio-gadget list
|
15
14
|
|
16
15
|
Currently available commands are
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
17
|
+
- dedup :: Deduplicate fastq (via STDIN)
|
18
|
+
- demlt :: Demultiplex fastq by barcodes
|
19
|
+
- fqxz :: automatic (re)compression of *.fq(.gz|.bz2) files
|
20
|
+
- qvstat :: Statistics of quality values in *.qual file
|
21
|
+
- rgt2mtx :: Convert cuffdiff read group tracking file into tab-separated matrix
|
22
|
+
- wig5p :: Convert bam-format alignments into wig-format table
|
23
|
+
- wigchr :: Extract wiggle track on specified chromosome
|
24
|
+
|
25
|
+
** Memo :noexport:
|
26
|
+
- gtfensembl :: Create gtf of ENSEMBL Genes (a.k.a. ensGene.gtf) from XX files of UCSC annotation database; .txt.gz of
|
27
|
+
- gtfucsc :: Create gtf of UCSC Genes from four files (knownIsoforms.txt.gz, kgXref.txt.gz, kgTxInfo.txt.gz and knownGene.txt.gz) of the UCSC annotation database. The output gtf contains gene_id, transcript_id and gene_name attributes, so it is convenient to use as a gene/transcript annotation file for tophat/cufflinks etc.
|
25
28
|
|
26
29
|
* Contributing
|
27
30
|
|
data/lib/bio-gadget.rb
CHANGED
@@ -2,8 +2,9 @@ require 'bio-gadget/version'
|
|
2
2
|
require 'bio-gadget/dedup'
|
3
3
|
require 'bio-gadget/demlt'
|
4
4
|
require 'bio-gadget/fqxz'
|
5
|
-
require 'bio-gadget/gtfann'
|
6
5
|
require 'bio-gadget/qvstat'
|
6
|
+
require 'bio-gadget/rgt2mtx'
|
7
|
+
require 'bio-gadget/wig5p'
|
7
8
|
require 'bio-gadget/wigchr'
|
8
9
|
|
9
10
|
require 'tempfile'
|
@@ -11,8 +12,6 @@ require 'tempfile'
|
|
11
12
|
module Bio
|
12
13
|
class Gadget < Thor
|
13
14
|
|
14
|
-
namespace :bio
|
15
|
-
|
16
15
|
private
|
17
16
|
|
18
17
|
def myopen(file, &block)
|
@@ -38,5 +37,6 @@ module Bio
|
|
38
37
|
END {
|
39
38
|
@@mytemppaths.each { |path| File.unlink(path) if File.exist?(path) }
|
40
39
|
}
|
40
|
+
|
41
41
|
end
|
42
42
|
end
|
data/lib/bio-gadget/rgt2mtx.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
module Bio
|
2
|
+
class Gadget < Thor
|
3
|
+
|
4
|
+
desc 'rgt2mtx [RGT]', <<DESC
|
5
|
+
Convert cuffdiff read group tracking file into tab-separated matrix. If no given name of tracking file, it reads from standard input.
|
6
|
+
DESC
|
7
|
+
def rgt2mtx(rgt="/dev/stdin")
|
8
|
+
|
9
|
+
id = nil
|
10
|
+
header = true
|
11
|
+
raws = Hash.new
|
12
|
+
open("| tail -n +2 #{rgt} | sort -k 1").each { |line|
|
13
|
+
cols = line.rstrip.split(/\t/)
|
14
|
+
id = cols[0] if id.nil?
|
15
|
+
if id != cols[0]
|
16
|
+
if header
|
17
|
+
puts (['tracking_id'] + raws.keys.sort).join("\t")
|
18
|
+
header = false
|
19
|
+
end
|
20
|
+
tmp = [id]
|
21
|
+
raws.keys.sort.each { |k| tmp.push(raws[k]) }
|
22
|
+
puts tmp.join("\t")
|
23
|
+
id = cols[0]
|
24
|
+
end
|
25
|
+
raws["#{cols[1]}|#{cols[2]}"] = cols[3]
|
26
|
+
}
|
27
|
+
tmp = [id]
|
28
|
+
raws.keys.sort.each { |k| tmp.push(raws[k]) }
|
29
|
+
puts tmp.join("\t")
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
data/lib/bio-gadget/version.rb
CHANGED
data/lib/bio-gadget/wig5p.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module Bio
|
2
|
+
class Gadget < Thor
|
3
|
+
|
4
|
+
desc 'wig5p BAM', <<DESC
|
5
|
+
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
|
6
|
+
DESC
|
7
|
+
option 'reverse', :aliases => '-r', :type => :boolean, :default => false
|
8
|
+
option 'name', :aliases => '-n', :type => :string, :default => ' '
|
9
|
+
def wig5p(bam)
|
10
|
+
|
11
|
+
str = options['reverse'] ? '-' : '+'
|
12
|
+
|
13
|
+
tmp = 0.0
|
14
|
+
open("| samtools view #{bam} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u").each { |line| tmp += 1.0 }
|
15
|
+
abort "No spike-in reads." if tmp == 0.0
|
16
|
+
spike = 2234.8/tmp
|
17
|
+
|
18
|
+
acc2dups = Hash.new
|
19
|
+
open("| samtools view #{bam} | cut -f 1 | sort | uniq -c").each { |line|
|
20
|
+
dups, acc = line.strip.split(/\s/)
|
21
|
+
acc2dups[acc] = dups.to_f
|
22
|
+
}
|
23
|
+
|
24
|
+
cnts = Hash.new
|
25
|
+
tmpbam = mytemppath('.bam')
|
26
|
+
abort unless system("samtools view -h #{bam} | grep -E 'XS:A:\\#{str}|^@SQ' | samtools view -S -b - > #{tmpbam}")
|
27
|
+
|
28
|
+
open("| bamToBed -i #{tmpbam} | cut -f 1,#{options['reverse'] ? 3 : 2},4").each { |line|
|
29
|
+
chr, poss, acc = line.rstrip.split(/\t/)
|
30
|
+
cnts[chr] = Hash.new if !cnts.key?(chr)
|
31
|
+
pos = poss.to_i
|
32
|
+
if !cnts[chr].key?(pos)
|
33
|
+
cnts[chr][pos] = 1.0/acc2dups[acc]
|
34
|
+
else
|
35
|
+
cnts[chr][pos] = cnts[chr][pos]+1.0/acc2dups[acc]
|
36
|
+
end
|
37
|
+
}
|
38
|
+
|
39
|
+
puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visivility=full maxHeightPixels=128:64:16 color=#{options['reverse'] ? '0,128,255' : '255,128,0'}"
|
40
|
+
|
41
|
+
offset = options['reverse'] ? 0 : 1
|
42
|
+
signal = options['reverse'] ? -spike : spike
|
43
|
+
cnts.each { |chr, posvals|
|
44
|
+
puts "variableStep chrom=#{chr}"
|
45
|
+
posvals.keys.sort.each { |pos| puts [pos+offset, signal*posvals[pos]].join("\t") }
|
46
|
+
}
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-gadget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shintaro Katayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -100,9 +100,10 @@ files:
|
|
100
100
|
- lib/bio-gadget/dedup.rb
|
101
101
|
- lib/bio-gadget/demlt.rb
|
102
102
|
- lib/bio-gadget/fqxz.rb
|
103
|
-
- lib/bio-gadget/gtfann.rb
|
104
103
|
- lib/bio-gadget/qvstat.rb
|
104
|
+
- lib/bio-gadget/rgt2mtx.rb
|
105
105
|
- lib/bio-gadget/version.rb
|
106
|
+
- lib/bio-gadget/wig5p.rb
|
106
107
|
- lib/bio-gadget/wigchr.rb
|
107
108
|
homepage: https://github.com/shka/ruby-bio-gadget
|
108
109
|
licenses: []
|
data/lib/bio-gadget/gtfann.rb
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
module Bio
|
2
|
-
class Gadget < Thor
|
3
|
-
|
4
|
-
desc 'ucscann ISOFORMSGZ XREFGZ [KNOWNGENE]', 'create knownGene.gtf'
|
5
|
-
def ucscann(isoforms, xref, knowngene='/dev/stdin')
|
6
|
-
|
7
|
-
acc2id = Hash.new
|
8
|
-
open("| gunzip -c #{isoforms}").each { |line|
|
9
|
-
id, acc = line.rstrip.split(/\t/)
|
10
|
-
acc2id[acc] = id
|
11
|
-
}
|
12
|
-
|
13
|
-
acc2sym = Hash.new
|
14
|
-
open("| gunzip -c #{xref} | cut -f 1,5").each { |line|
|
15
|
-
acc, sym = line.rstrip.split(/\t/)
|
16
|
-
acc2sym[acc] = sym
|
17
|
-
}
|
18
|
-
|
19
|
-
open("| sort -k1,1 -k4,4n", 'w') { |fp|
|
20
|
-
open(knowngene).each { |line|
|
21
|
-
cols = line.rstrip.split(/\t/)
|
22
|
-
acc, chr, str = cols.values_at(0, 1, 2)
|
23
|
-
cs = cols[5].to_i
|
24
|
-
ce = cols[6].to_i
|
25
|
-
lefts = cols[8].split(/,/)
|
26
|
-
rights = cols[9].split(/,/)
|
27
|
-
prop = "gene_id \"#{acc2id[acc]}\"; transcript_id \"#{acc}\""
|
28
|
-
prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
|
29
|
-
lefts.each_index { |i|
|
30
|
-
next if lefts[i].nil?
|
31
|
-
l = lefts[i].to_i
|
32
|
-
r = rights[i].to_i
|
33
|
-
fp.puts [chr, 'knownGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
|
34
|
-
}
|
35
|
-
if cs != ce
|
36
|
-
fp.puts [chr, 'knownGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
|
37
|
-
lefts.each_index { |i|
|
38
|
-
next if lefts[i].nil?
|
39
|
-
l = lefts[i].to_i
|
40
|
-
next if ce-1 < l
|
41
|
-
r = rights[i].to_i
|
42
|
-
next if r-1 < cs
|
43
|
-
fp.puts [chr, 'knownGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
|
44
|
-
}
|
45
|
-
end
|
46
|
-
}
|
47
|
-
}
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
desc 'ensann GENENAMEGZ [ENSGENE]', 'create ensGene.gtf'
|
52
|
-
def ensann(genename, ensgene='/dev/stdin')
|
53
|
-
|
54
|
-
acc2sym = Hash.new
|
55
|
-
open("| gunzip -c #{genename}").each { |line|
|
56
|
-
acc, sym = line.rstrip.split(/\t/)
|
57
|
-
acc2sym[acc] = sym
|
58
|
-
}
|
59
|
-
|
60
|
-
open("| sort -k1,1 -k4,4n", 'w') { |fp|
|
61
|
-
open(ensgene).each { |line|
|
62
|
-
cols = line.rstrip.split(/\t/)
|
63
|
-
acc, chr, str, id = cols.values_at(1, 2, 3, 12)
|
64
|
-
cs = cols[6].to_i
|
65
|
-
ce = cols[7].to_i
|
66
|
-
lefts = cols[9].split(/,/)
|
67
|
-
rights = cols[10].split(/,/)
|
68
|
-
prop = "gene_id \"#{id}\"; transcript_id \"#{acc}\""
|
69
|
-
prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
|
70
|
-
lefts.each_index { |i|
|
71
|
-
next if lefts[i].nil?
|
72
|
-
l = lefts[i].to_i
|
73
|
-
r = rights[i].to_i
|
74
|
-
fp.puts [chr, 'ensGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
|
75
|
-
}
|
76
|
-
if cs != ce
|
77
|
-
fp.puts [chr, 'ensGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
|
78
|
-
lefts.each_index { |i|
|
79
|
-
next if lefts[i].nil?
|
80
|
-
l = lefts[i].to_i
|
81
|
-
next if ce-1 < l
|
82
|
-
r = rights[i].to_i
|
83
|
-
next if r-1 < cs
|
84
|
-
fp.puts [chr, 'ensGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
|
85
|
-
}
|
86
|
-
end
|
87
|
-
}
|
88
|
-
}
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
93
|
-
end
|