bio-gadget 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aaeb43dbf9adba74b5741250dab6e1764421a4c4
4
- data.tar.gz: f45ff460eeee4aee51b7888f39592dac24075898
3
+ metadata.gz: a8b08f5a256d452f6399fe4484d6a18811bc3b58
4
+ data.tar.gz: 532b1903529b32c549eb321e363d0dc8e8efaf27
5
5
  SHA512:
6
- metadata.gz: 0f7dbebd237bb629a0e1d758b6e024427586d54a7cf4ccac59270b34e9c2dec374ee0d5bf81c88465eaabf531a520c712876349d8e72db3793fd9560c45cceee
7
- data.tar.gz: 4d39fb3ee8a65513bbea18750e435b5f5d54166c1e3f92920f6905e83248c7e0a39361382660ca80253572b466b0458d60b6cf4e3891971f02e34ef83292bed8
6
+ metadata.gz: 23f150133d857f16a9ff0a326201fb044ec167d51fcf5addc23cb445ef219b75ff84038d457c3325fe076ce024b91d3afa2ffa33dcbba3c33401b9c75dfaf0bb
7
+ data.tar.gz: edbfc69fcfa8f7db2003236fc8d7b01046a051955907ad45bb2939096b389d6c3a6f19c79d95dbf3212ed320a20c6f9bf477a5f57e3ce9a542ccedd52ab4e4e5
data/README.org CHANGED
@@ -8,20 +8,23 @@
8
8
 
9
9
  * Usage
10
10
 
11
- To check all commands in this package,
11
+ To list all available commands and their usage in this package, run
12
12
 
13
13
  : bio-gadget help
14
- : bio-gadget list
15
14
 
16
15
  Currently available commands are
17
16
 
18
- : bio-gadget dedup # deduplicate fastq (via STDIN)
19
- : bio-gadget demlt BARCODE [FASTQ] # demultiplex fastq by barcodes
20
- : bio-gadget ensann GENENAMEGZ [ENSGENE] # create ensGene.gtf
21
- : bio-gadget fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
22
- : bio-gadget qvstat QUAL # statistics of quality values in *.qual file
23
- : bio-gadget ucscann ISOFORMSGZ XREFGZ [KNOWNGENE] # create knownGene.gtf
24
- : bio-gadget wigchr WIG CHR # extract wiggle track on specified chromosome
17
+ - dedup :: Deduplicate fastq (via STDIN)
18
+ - demlt :: Demultiplex fastq by barcodes
19
+ - fqxz :: Automatic (re)compression of *.fq(.gz|.bz2) files
20
+ - qvstat :: Statistics of quality values in *.qual file
21
+ - rgt2mtx :: Convert cuffdiff read group tracking file into tab-separated matrix
22
+ - wig5p :: Convert bam-format alignments into wig-format table
23
+ - wigchr :: Extract wiggle track on specified chromosome
24
+
25
+ ** Memo :noexport:
26
+ - gtfensembl :: Create gtf of ENSEMBL Genes (a.k.a. ensGene.gtf) from XX files of UCSC annotation database; .txt.gz of
27
+ - gtfucsc :: Create gtf of UCSC Genes from four files (knownIsoforms.txt.gz, kgXref.txt.gz, kgTxInfo.txt.gz and knownGene.txt.gz) of the UCSC annotation database. The output gtf contains gene_id, transcript_id and gene_name attributes, so it is convenient to use as a gene/transcript annotation file for tophat/cufflinks, etc.
25
28
 
26
29
  * Contributing
27
30
 
data/lib/bio-gadget.rb CHANGED
@@ -2,8 +2,9 @@ require 'bio-gadget/version'
2
2
  require 'bio-gadget/dedup'
3
3
  require 'bio-gadget/demlt'
4
4
  require 'bio-gadget/fqxz'
5
- require 'bio-gadget/gtfann'
6
5
  require 'bio-gadget/qvstat'
6
+ require 'bio-gadget/rgt2mtx'
7
+ require 'bio-gadget/wig5p'
7
8
  require 'bio-gadget/wigchr'
8
9
 
9
10
  require 'tempfile'
@@ -11,8 +12,6 @@ require 'tempfile'
11
12
  module Bio
12
13
  class Gadget < Thor
13
14
 
14
- namespace :bio
15
-
16
15
  private
17
16
 
18
17
  def myopen(file, &block)
@@ -38,5 +37,6 @@ module Bio
38
37
  END {
39
38
  @@mytemppaths.each { |path| File.unlink(path) if File.exist?(path) }
40
39
  }
40
+
41
41
  end
42
42
  end
@@ -0,0 +1,33 @@
require 'shellwords'

module Bio
  class Gadget < Thor

    desc 'rgt2mtx [RGT]', <<DESC
Convert cuffdiff read group tracking file into tab-separated matrix. If no given name of tracking file, it reads from standard input.
DESC
    # Pivot a tracking file (one line per tracking id and sample) into a
    # matrix with one row per tracking id, written to standard output.
    #
    # The first input line is skipped as a header and the remainder is sorted
    # so that all lines of one tracking id are adjacent. Column 1 is the
    # tracking id, columns 2 and 3 are joined as "col2|col3" to form the
    # matrix column name, and column 4 is the reported value.
    # NOTE(review): presumably columns 2-4 are condition, replicate and
    # raw_frags of cuffdiff's read_group_tracking format -- confirm.
    #
    # The matrix columns are fixed by the first tracking id; a later id that
    # lacks one of them gets an empty cell instead of inheriting the previous
    # id's value. Empty input produces no output.
    #
    # @param rgt [String] path of the tracking file (default: standard input)
    def rgt2mtx(rgt="/dev/stdin")
      columns = nil # sorted matrix column names, decided at the first flush
      id = nil      # tracking id currently being accumulated
      raws = {}     # "col2|col3" => value for the current tracking id

      # Emit the accumulated row, preceded by the header the first time.
      # A lambda (not a method) so Thor does not expose it as a command.
      flush = lambda do
        if columns.nil?
          columns = raws.keys.sort
          puts((['tracking_id'] + columns).join("\t"))
        end
        puts(([id] + columns.map { |k| raws[k] }).join("\t"))
      end

      # Block form of IO.popen closes the pipe; the file name is escaped
      # because it is interpolated into a shell command line.
      IO.popen("tail -n +2 #{Shellwords.escape(rgt)} | sort -k 1") do |io|
        io.each_line do |line|
          cols = line.rstrip.split(/\t/)
          next if cols.empty?
          if id != cols[0]
            flush.call unless id.nil?
            id = cols[0]
            raws = {} # start the new row clean so stale values cannot leak
          end
          raws["#{cols[1]}|#{cols[2]}"] = cols[3]
        end
      end

      # The last tracking id has no following id to trigger its flush.
      flush.call unless id.nil?
    end

  end
end
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.3.1"
6
+ VERSION = "0.4.0"
7
7
 
8
8
  end
9
9
  end
@@ -0,0 +1,51 @@
require 'shellwords'

module Bio
  class Gadget < Thor

    desc 'wig5p BAM', <<DESC
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
DESC
    option 'reverse', :aliases => '-r', :type => :boolean, :default => false
    option 'name', :aliases => '-n', :type => :string, :default => ' '
    # Write a variableStep wiggle track of 5'-end counts for one strand of
    # a BAM file to standard output.
    #
    # Options: 'reverse' selects alignments tagged XS:A:- (default XS:A:+)
    # and negates the signal; 'name' is the track name.
    #
    # Aborts with "No spike-in reads." when no read matching RNA_SPIKE_ and
    # XS:A:+ is found, and aborts when the strand-filtering pipeline fails.
    #
    # @param bam [String] path of the BAM file to convert
    def wig5p(bam)
      reverse = options['reverse']
      str = reverse ? '-' : '+'
      # The path is interpolated into shell command lines below.
      bam_arg = Shellwords.escape(bam)

      # Number of distinct read names among spike-in alignments.
      spikes = IO.popen("samtools view #{bam_arg} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u") { |io|
        io.each_line.count
      }
      abort "No spike-in reads." if spikes == 0
      # NOTE(review): 2234.8 is presumably the expected amount of spike-in
      # molecules used as the normalization target -- confirm.
      spike = 2234.8 / spikes

      # Read name => number of SAM records carrying that name, used to give
      # each record of a multiply-reported read a fractional weight.
      acc2dups = {}
      IO.popen("samtools view #{bam_arg} | cut -f 1 | sort | uniq -c") do |io|
        io.each_line do |line|
          dups, acc = line.strip.split(/\s/)
          acc2dups[acc] = dups.to_f
        end
      end

      # Keep only the @SQ header lines and the records of the selected strand.
      tmpbam = mytemppath('.bam')
      tmpbam_arg = Shellwords.escape(tmpbam)
      abort unless system("samtools view -h #{bam_arg} | grep -E 'XS:A:\\#{str}|^@SQ' | samtools view -S -b - > #{tmpbam_arg}")

      # chromosome => { position => weighted count }. BED column 2 (start) is
      # used for the forward strand, column 3 (end) for the reverse strand.
      cnts = Hash.new { |h, chr| h[chr] = Hash.new(0.0) }
      IO.popen("bamToBed -i #{tmpbam_arg} | cut -f 1,#{reverse ? 3 : 2},4") do |io|
        io.each_line do |line|
          chr, poss, acc = line.rstrip.split(/\t/)
          cnts[chr][poss.to_i] += 1.0 / acc2dups[acc]
        end
      end

      # 'visibility' is the attribute name the UCSC track line defines.
      puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visibility=full maxHeightPixels=128:64:16 color=#{reverse ? '0,128,255' : '255,128,0'}"

      # BED starts are 0-based and wiggle positions 1-based, so the forward
      # strand needs +1; BED ends already equal the 1-based last base.
      offset = reverse ? 0 : 1
      signal = reverse ? -spike : spike
      cnts.each do |chr, posvals|
        puts "variableStep chrom=#{chr}"
        posvals.keys.sort.each { |pos| puts [pos + offset, signal * posvals[pos]].join("\t") }
      end
    end

  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shintaro Katayama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-17 00:00:00.000000000 Z
11
+ date: 2013-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -100,9 +100,10 @@ files:
100
100
  - lib/bio-gadget/dedup.rb
101
101
  - lib/bio-gadget/demlt.rb
102
102
  - lib/bio-gadget/fqxz.rb
103
- - lib/bio-gadget/gtfann.rb
104
103
  - lib/bio-gadget/qvstat.rb
104
+ - lib/bio-gadget/rgt2mtx.rb
105
105
  - lib/bio-gadget/version.rb
106
+ - lib/bio-gadget/wig5p.rb
106
107
  - lib/bio-gadget/wigchr.rb
107
108
  homepage: https://github.com/shka/ruby-bio-gadget
108
109
  licenses: []
@@ -1,93 +0,0 @@
1
- module Bio
2
- class Gadget < Thor
3
-
4
- desc 'ucscann ISOFORMSGZ XREFGZ [KNOWNGENE]', 'create knownGene.gtf'
5
- def ucscann(isoforms, xref, knowngene='/dev/stdin')
6
-
7
- acc2id = Hash.new
8
- open("| gunzip -c #{isoforms}").each { |line|
9
- id, acc = line.rstrip.split(/\t/)
10
- acc2id[acc] = id
11
- }
12
-
13
- acc2sym = Hash.new
14
- open("| gunzip -c #{xref} | cut -f 1,5").each { |line|
15
- acc, sym = line.rstrip.split(/\t/)
16
- acc2sym[acc] = sym
17
- }
18
-
19
- open("| sort -k1,1 -k4,4n", 'w') { |fp|
20
- open(knowngene).each { |line|
21
- cols = line.rstrip.split(/\t/)
22
- acc, chr, str = cols.values_at(0, 1, 2)
23
- cs = cols[5].to_i
24
- ce = cols[6].to_i
25
- lefts = cols[8].split(/,/)
26
- rights = cols[9].split(/,/)
27
- prop = "gene_id \"#{acc2id[acc]}\"; transcript_id \"#{acc}\""
28
- prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
29
- lefts.each_index { |i|
30
- next if lefts[i].nil?
31
- l = lefts[i].to_i
32
- r = rights[i].to_i
33
- fp.puts [chr, 'knownGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
34
- }
35
- if cs != ce
36
- fp.puts [chr, 'knownGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
37
- lefts.each_index { |i|
38
- next if lefts[i].nil?
39
- l = lefts[i].to_i
40
- next if ce-1 < l
41
- r = rights[i].to_i
42
- next if r-1 < cs
43
- fp.puts [chr, 'knownGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
44
- }
45
- end
46
- }
47
- }
48
-
49
- end
50
-
51
- desc 'ensann GENENAMEGZ [ENSGENE]', 'create ensGene.gtf'
52
- def ensann(genename, ensgene='/dev/stdin')
53
-
54
- acc2sym = Hash.new
55
- open("| gunzip -c #{genename}").each { |line|
56
- acc, sym = line.rstrip.split(/\t/)
57
- acc2sym[acc] = sym
58
- }
59
-
60
- open("| sort -k1,1 -k4,4n", 'w') { |fp|
61
- open(ensgene).each { |line|
62
- cols = line.rstrip.split(/\t/)
63
- acc, chr, str, id = cols.values_at(1, 2, 3, 12)
64
- cs = cols[6].to_i
65
- ce = cols[7].to_i
66
- lefts = cols[9].split(/,/)
67
- rights = cols[10].split(/,/)
68
- prop = "gene_id \"#{id}\"; transcript_id \"#{acc}\""
69
- prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
70
- lefts.each_index { |i|
71
- next if lefts[i].nil?
72
- l = lefts[i].to_i
73
- r = rights[i].to_i
74
- fp.puts [chr, 'ensGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
75
- }
76
- if cs != ce
77
- fp.puts [chr, 'ensGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
78
- lefts.each_index { |i|
79
- next if lefts[i].nil?
80
- l = lefts[i].to_i
81
- next if ce-1 < l
82
- r = rights[i].to_i
83
- next if r-1 < cs
84
- fp.puts [chr, 'ensGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
85
- }
86
- end
87
- }
88
- }
89
-
90
- end
91
-
92
- end
93
- end