bio-gadget 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aaeb43dbf9adba74b5741250dab6e1764421a4c4
4
- data.tar.gz: f45ff460eeee4aee51b7888f39592dac24075898
3
+ metadata.gz: a8b08f5a256d452f6399fe4484d6a18811bc3b58
4
+ data.tar.gz: 532b1903529b32c549eb321e363d0dc8e8efaf27
5
5
  SHA512:
6
- metadata.gz: 0f7dbebd237bb629a0e1d758b6e024427586d54a7cf4ccac59270b34e9c2dec374ee0d5bf81c88465eaabf531a520c712876349d8e72db3793fd9560c45cceee
7
- data.tar.gz: 4d39fb3ee8a65513bbea18750e435b5f5d54166c1e3f92920f6905e83248c7e0a39361382660ca80253572b466b0458d60b6cf4e3891971f02e34ef83292bed8
6
+ metadata.gz: 23f150133d857f16a9ff0a326201fb044ec167d51fcf5addc23cb445ef219b75ff84038d457c3325fe076ce024b91d3afa2ffa33dcbba3c33401b9c75dfaf0bb
7
+ data.tar.gz: edbfc69fcfa8f7db2003236fc8d7b01046a051955907ad45bb2939096b389d6c3a6f19c79d95dbf3212ed320a20c6f9bf477a5f57e3ce9a542ccedd52ab4e4e5
data/README.org CHANGED
@@ -8,20 +8,23 @@
8
8
 
9
9
  * Usage
10
10
 
11
- To check all commands in this package,
11
+ To check all commands and their usage in this package,
12
12
 
13
13
  : bio-gadget help
14
- : bio-gadget list
15
14
 
16
15
  Currently available commands are
17
16
 
18
- : bio-gadget dedup # deduplicate fastq (via STDIN)
19
- : bio-gadget demlt BARCODE [FASTQ] # demultiplex fastq by barcodes
20
- : bio-gadget ensann GENENAMEGZ [ENSGENE] # create ensGene.gtf
21
- : bio-gadget fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
22
- : bio-gadget qvstat QUAL # statistics of quality values in *.qual file
23
- : bio-gadget ucscann ISOFORMSGZ XREFGZ [KNOWNGENE] # create knownGene.gtf
24
- : bio-gadget wigchr WIG CHR # extract wiggle track on specified chromosome
17
+ - dedup :: Deduplicate fastq (via STDIN)
18
+ - demlt :: Demultiplex fastq by barcodes
19
+ - fqxz :: Automatic (re)compression of *.fq(.gz|.bz2) files
20
+ - qvstat :: Statistics of quality values in *.qual file
21
+ - rgt2mtx :: Convert cuffdiff read group tracking file into tab-separated matrix
22
+ - wig5p :: Convert bam-format alignments into wig-format table
23
+ - wigchr :: Extract wiggle track on specified chromosome
24
+
25
+ ** Memo :noexport:
26
+ - gtfensembl :: Create gtf of ENSEMBL Genes (a.k.a. ensGene.gtf) from XX files of UCSC annotation database; .txt.gz of
27
+ - gtfucsc :: Create gtf of UCSC Genes from four (knownIsoforms.txt.gz, kgXref.txt.gz, kgTxInfo.txt.gz and knownGene.txt.gz) files of UCSC annotation database. The output gtf contains gene_id, transcript_id and gene_name attributes, so it is convenient to use as a gene/transcript annotation file for tophat/cufflinks etc.
25
28
 
26
29
  * Contributing
27
30
 
data/lib/bio-gadget.rb CHANGED
@@ -2,8 +2,9 @@ require 'bio-gadget/version'
2
2
  require 'bio-gadget/dedup'
3
3
  require 'bio-gadget/demlt'
4
4
  require 'bio-gadget/fqxz'
5
- require 'bio-gadget/gtfann'
6
5
  require 'bio-gadget/qvstat'
6
+ require 'bio-gadget/rgt2mtx'
7
+ require 'bio-gadget/wig5p'
7
8
  require 'bio-gadget/wigchr'
8
9
 
9
10
  require 'tempfile'
@@ -11,8 +12,6 @@ require 'tempfile'
11
12
  module Bio
12
13
  class Gadget < Thor
13
14
 
14
- namespace :bio
15
-
16
15
  private
17
16
 
18
17
  def myopen(file, &block)
@@ -38,5 +37,6 @@ module Bio
38
37
  END {
39
38
  @@mytemppaths.each { |path| File.unlink(path) if File.exist?(path) }
40
39
  }
40
+
41
41
  end
42
42
  end
@@ -0,0 +1,33 @@
module Bio
  class Gadget < Thor

    desc 'rgt2mtx [RGT]', <<DESC
Convert cuffdiff read group tracking file into tab-separated matrix. If no given name of tracking file, it reads from standard input.
DESC
    # Prints a tab-separated matrix, built from a tracking table, to standard
    # output.
    #
    # The first input line is skipped as a header. From every following line
    # the first four tab-separated fields are used: field 1 is the row id,
    # fields 2 and 3 are joined as "field2|field3" to form the column name,
    # and field 4 is the cell value. When the same id/column pair occurs more
    # than once, the last value read wins.
    #
    # Output is one header row ("tracking_id" followed by the sorted union of
    # all column names) and then one row per id, ids sorted bytewise. A cell
    # that never appeared for an id is printed as an empty field. Nothing is
    # printed when the input has no data rows.
    #
    # The whole table is held in memory so that the header can list every
    # column and rows are grouped by id without relying on an external,
    # locale-dependent `sort`.
    #
    # @param rgt [String] path of the tracking file (defaults to /dev/stdin)
    # @return [void] the return value is not meaningful
    def rgt2mtx(rgt="/dev/stdin")
      rows = {}
      File.foreach(rgt).with_index do |line, lineno|
        next if lineno.zero? # header line of the tracking file
        cols = line.rstrip.split(/\t/)
        next if cols.empty? # blank line, no id to file the value under
        (rows[cols[0]] ||= {})["#{cols[1]}|#{cols[2]}"] = cols[3]
      end
      return if rows.empty?

      columns = rows.values.flat_map(&:keys).uniq.sort
      puts((['tracking_id'] + columns).join("\t"))
      rows.keys.sort.each do |id|
        cells = rows[id]
        # A missing cell is nil here and becomes an empty field in the join.
        puts(([id] + columns.map { |name| cells[name] }).join("\t"))
      end
    end

  end
end
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.3.1"
6
+ VERSION = "0.4.0"
7
7
 
8
8
  end
9
9
  end
@@ -0,0 +1,51 @@
require 'shellwords'

module Bio
  class Gadget < Thor

    desc 'wig5p BAM', <<DESC
Convert bam-format alignments into wig-format table. Count is at 5'-end of alignments. Moreover the counts are normalized when there are alignments to references named 'RNA_SPIKE_*'. This procedure requires samtools and bamToBed in BEDTools.
DESC
    option 'reverse', :aliases => '-r', :type => :boolean, :default => false
    option 'name', :aliases => '-n', :type => :string, :default => ' '
    # Prints a variableStep wiggle track of alignment 5'-end counts to
    # standard output.
    #
    # Steps:
    # 1. Count distinct read names on lines of `samtools view` that contain
    #    "RNA_SPIKE_" and "XS:A:+"; abort when there are none. The scaling
    #    factor is 2234.8 divided by that count.
    #    NOTE(review): the meaning of 2234.8 is not stated anywhere in this
    #    file -- presumably an expected spike-in amount; confirm.
    # 2. Count how many alignment lines each read name has, so that every
    #    alignment contributes 1/(number of alignments of its read).
    # 3. Write the alignments tagged with the selected strand (XS:A:+ or,
    #    with --reverse, XS:A:-) plus the @SQ header lines into a temporary
    #    BAM, convert it with bamToBed, and accumulate the weights per
    #    chromosome and position (BED start, or BED end with --reverse).
    # 4. Print the track line and one variableStep section per chromosome.
    #    Forward positions are BED start + 1; reverse positions are the BED
    #    end unchanged and their values are negated.
    #
    # External commands are run through the shell, so file names are
    # shell-escaped before interpolation, and every pipe is opened with the
    # block form of IO.popen so it is closed as soon as it has been read.
    #
    # @param bam [String] path of the BAM file to convert
    # @return [void] the return value is not meaningful
    def wig5p(bam)
      reverse = options['reverse']
      str = reverse ? '-' : '+'
      quoted_bam = Shellwords.escape(bam)

      spike_reads = IO.popen("samtools view #{quoted_bam} | grep RNA_SPIKE_ | grep 'XS:A:+' | cut -f 1 | sort -u") { |io| io.each_line.count }
      abort "No spike-in reads." if spike_reads == 0
      spike = 2234.8 / spike_reads

      # `uniq -c` prints "<count> <name>" with leading padding, hence strip.
      acc2dups = {}
      IO.popen("samtools view #{quoted_bam} | cut -f 1 | sort | uniq -c") do |io|
        io.each_line do |line|
          dups, acc = line.strip.split(/\s/)
          acc2dups[acc] = dups.to_f
        end
      end

      tmpbam = mytemppath('.bam')
      quoted_tmpbam = Shellwords.escape(tmpbam)
      # The strand character sits in a bracket expression so that it is taken
      # literally by grep -E without depending on backslash escapes.
      abort unless system("samtools view -h #{quoted_bam} | grep -E 'XS:A:[#{str}]|^@SQ' | samtools view -S -b - > #{quoted_tmpbam}")

      cnts = Hash.new { |by_chr, chr| by_chr[chr] = Hash.new(0.0) }
      IO.popen("bamToBed -i #{quoted_tmpbam} | cut -f 1,#{reverse ? 3 : 2},4") do |io|
        io.each_line do |line|
          chr, poss, acc = line.rstrip.split(/\t/)
          # NOTE(review): assumes the BED name column equals the read name
          # counted above; a name missing from acc2dups raises here -- confirm
          # for the kinds of BAM this is used with.
          cnts[chr][poss.to_i] += 1.0 / acc2dups[acc]
        end
      end

      puts "track type=wiggle_0 name=\"#{options['name']}\" description=\" \" alwaysZero=on visibility=full maxHeightPixels=128:64:16 color=#{reverse ? '0,128,255' : '255,128,0'}"

      offset = reverse ? 0 : 1
      signal = reverse ? -spike : spike
      cnts.each do |chr, posvals|
        puts "variableStep chrom=#{chr}"
        posvals.keys.sort.each { |pos| puts [pos + offset, signal * posvals[pos]].join("\t") }
      end
    end

  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shintaro Katayama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-17 00:00:00.000000000 Z
11
+ date: 2013-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -100,9 +100,10 @@ files:
100
100
  - lib/bio-gadget/dedup.rb
101
101
  - lib/bio-gadget/demlt.rb
102
102
  - lib/bio-gadget/fqxz.rb
103
- - lib/bio-gadget/gtfann.rb
104
103
  - lib/bio-gadget/qvstat.rb
104
+ - lib/bio-gadget/rgt2mtx.rb
105
105
  - lib/bio-gadget/version.rb
106
+ - lib/bio-gadget/wig5p.rb
106
107
  - lib/bio-gadget/wigchr.rb
107
108
  homepage: https://github.com/shka/ruby-bio-gadget
108
109
  licenses: []
@@ -1,93 +0,0 @@
1
- module Bio
2
- class Gadget < Thor
3
-
4
- desc 'ucscann ISOFORMSGZ XREFGZ [KNOWNGENE]', 'create knownGene.gtf'
5
- def ucscann(isoforms, xref, knowngene='/dev/stdin')
6
-
7
- acc2id = Hash.new
8
- open("| gunzip -c #{isoforms}").each { |line|
9
- id, acc = line.rstrip.split(/\t/)
10
- acc2id[acc] = id
11
- }
12
-
13
- acc2sym = Hash.new
14
- open("| gunzip -c #{xref} | cut -f 1,5").each { |line|
15
- acc, sym = line.rstrip.split(/\t/)
16
- acc2sym[acc] = sym
17
- }
18
-
19
- open("| sort -k1,1 -k4,4n", 'w') { |fp|
20
- open(knowngene).each { |line|
21
- cols = line.rstrip.split(/\t/)
22
- acc, chr, str = cols.values_at(0, 1, 2)
23
- cs = cols[5].to_i
24
- ce = cols[6].to_i
25
- lefts = cols[8].split(/,/)
26
- rights = cols[9].split(/,/)
27
- prop = "gene_id \"#{acc2id[acc]}\"; transcript_id \"#{acc}\""
28
- prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
29
- lefts.each_index { |i|
30
- next if lefts[i].nil?
31
- l = lefts[i].to_i
32
- r = rights[i].to_i
33
- fp.puts [chr, 'knownGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
34
- }
35
- if cs != ce
36
- fp.puts [chr, 'knownGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
37
- lefts.each_index { |i|
38
- next if lefts[i].nil?
39
- l = lefts[i].to_i
40
- next if ce-1 < l
41
- r = rights[i].to_i
42
- next if r-1 < cs
43
- fp.puts [chr, 'knownGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
44
- }
45
- end
46
- }
47
- }
48
-
49
- end
50
-
51
- desc 'ensann GENENAMEGZ [ENSGENE]', 'create ensGene.gtf'
52
- def ensann(genename, ensgene='/dev/stdin')
53
-
54
- acc2sym = Hash.new
55
- open("| gunzip -c #{genename}").each { |line|
56
- acc, sym = line.rstrip.split(/\t/)
57
- acc2sym[acc] = sym
58
- }
59
-
60
- open("| sort -k1,1 -k4,4n", 'w') { |fp|
61
- open(ensgene).each { |line|
62
- cols = line.rstrip.split(/\t/)
63
- acc, chr, str, id = cols.values_at(1, 2, 3, 12)
64
- cs = cols[6].to_i
65
- ce = cols[7].to_i
66
- lefts = cols[9].split(/,/)
67
- rights = cols[10].split(/,/)
68
- prop = "gene_id \"#{id}\"; transcript_id \"#{acc}\""
69
- prop += "; gene_name \"#{acc2sym[acc]}\"" if acc2sym.has_key?(acc)
70
- lefts.each_index { |i|
71
- next if lefts[i].nil?
72
- l = lefts[i].to_i
73
- r = rights[i].to_i
74
- fp.puts [chr, 'ensGene', 'exon', l+1, r, 0, str, '.', prop].join("\t")
75
- }
76
- if cs != ce
77
- fp.puts [chr, 'ensGene', 'start_codon', (str == '+' ? cs+1 : ce-2), (str == '+' ? cs+3 : ce), 0, str, '.', prop].join("\t")
78
- lefts.each_index { |i|
79
- next if lefts[i].nil?
80
- l = lefts[i].to_i
81
- next if ce-1 < l
82
- r = rights[i].to_i
83
- next if r-1 < cs
84
- fp.puts [chr, 'ensGene', 'CDS', (cs < l ? l : cs), (r < ce ? r : ce), 0, str, '.', prop].join("\t")
85
- }
86
- end
87
- }
88
- }
89
-
90
- end
91
-
92
- end
93
- end