bio-gadget 0.1.4 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.org CHANGED
@@ -17,8 +17,9 @@ Currently available commands are
17
17
 
18
18
  : bio
19
19
  : ---
20
+ : gthor bio:dedup # deduplicate fastq (via STDIN)
20
21
  : gthor bio:demlt BC POS # demultiplex fastq (via STDIN) by barcodes
21
- : gthor bio:fqlzma # automatic (re)compression of *.fq(.gz|.bz2) files
22
+ : gthor bio:fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
22
23
  : gthor bio:qvstat QUAL # statistics of quality values in *.qual file
23
24
  : gthor bio:wigchr WIG CHR # extract wiggle track on specified chromosome
24
25
 
data/lib/bio-gadget.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'bio-gadget/version'
2
+ require 'bio-gadget/dedup'
2
3
  require 'bio-gadget/demlt'
3
- require 'bio-gadget/fqlzma'
4
+ require 'bio-gadget/fqxz'
4
5
  require 'bio-gadget/qvstat'
5
6
  require 'bio-gadget/wigchr'
6
7
 
data/lib/bio-gadget/dedup.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'bio-faster'
2
+ require 'parallel'
3
+
4
+ module Bio
5
+ class Gadget < Thor
6
+ namespace :bio
7
+
# Thor command `dedup`: reads FASTQ records from STDIN (via Bio::Faster) and
# prints records back to STDOUT in four-line FASTQ form after passing them
# through an external `sort | cut | uniq` shell pipeline run from a forked child.
# NOTE(review): no Process.wait follows the fork in this method, so the
# child's exit status is never checked here.
8
+ desc 'dedup', 'deduplicate fastq (via STDIN)'
9
+ def dedup
10
+
# Pipe that carries the pipeline's output: the child side writes to p1out,
# the parent reads from p1in.
11
+ p1in, p1out = IO.pipe
12
+
13
+ fork {
# Child: drop the unused read end and point $stdout at the pipe, so that the
# shell pipeline opened below writes its output into the pipe.
14
+ p1in.close
15
+ $stdout.reopen(p1out)
# Pipeline stages, as written in the command string:
#   sort -k 1 -r   reverse sort on the key that starts at field 1
#   -S N%          sort buffer size, N = 100 / (processor count + 1)
#   -T $TMPDIR     directory for sort's temporary files
#   cut -f 2-      drop the first (sort key) column
#   uniq -f 2      skip the first two fields (seqid, qvs) when comparing
#                  adjacent lines, i.e. collapse runs with the same seq
# NOTE(review): if TMPDIR is unset, `-T` is left without a directory
# argument and sort would presumably fail — confirm or supply a fallback.
# NOTE(review): '%2d' pads single-digit percentages with a space; shell word
# splitting appears to make that harmless.
# NOTE(review): uniq keeps the first line of each run, so after the reverse
# sort the surviving duplicate is presumably the one whose quality string
# sorts highest — confirm this is the intended tie-break.
16
+ open("| sort -k 1 -r -S #{sprintf('%2d', 100/(Parallel.processor_count+1))}% -T $TMPDIR | cut -f 2- | uniq -f 2", 'w') { |fp|
17
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
# One tab-separated line per record: sort key (seq followed by qvs), then
# seqid, qvs, seq. The key column is removed again by `cut -f 2-`.
18
+ fp.puts "#{seq}#{qvs}\t#{seqid}\t#{qvs}\t#{seq}"
19
+ end
20
+ }
21
+ }
22
+
# Parent: close its copy of the write end so the read loop below can reach
# end-of-file once the child side has finished writing.
23
+ p1out.close
24
+
# Re-emit each surviving line as a FASTQ record.
# NOTE(review): `split` with no argument splits on any whitespace, so this
# three-way assignment assumes seqid, qvs and seq contain no spaces — TODO
# confirm for read headers that carry a description.
25
+ p1in.each_line { |line|
26
+ seqid, qvs, seq = line.rstrip.split
27
+ puts "@#{seqid}\n#{seq}\n+\n#{qvs}"
28
+ }
29
+
30
+ end
31
+
32
+ end
33
+ end
data/lib/bio-gadget/demlt.rb CHANGED
@@ -8,7 +8,7 @@ module Bio
8
8
  namespace :bio
9
9
 
10
10
  desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
- option :destdir, :type => :string, :default => '.'
11
+ option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
12
12
  def demlt(bcfile, tmpofs)
13
13
 
14
14
  ofs = tmpofs.to_i
@@ -35,8 +35,8 @@ module Bio
35
35
  q = SizedQueue.new(100000)
36
36
  t = Thread.new(well, q) do |well, q|
37
37
  tc = Thread.current
38
- tc[:file] = "#{options[:destdir]}/#{well}.fq.gz"
39
- fp = open("| gzip -c > #{tc[:file]}", 'w')
38
+ tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
39
+ fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
40
40
  tc[:read] = 0
41
41
  while vals = q.shift
42
42
  if vals == ""
@@ -52,19 +52,28 @@ module Bio
52
52
  ts.push(t)
53
53
  }
54
54
 
55
+ rq = Queue.new
56
+ Thread.new(rq) {
57
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
58
+ rq.push([seqid, seq, qvs])
59
+ end
60
+ rq.push('')
61
+ }
62
+
55
63
  seqs = Array.new
56
- Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
57
- seqs.push([seqid, seq, qvs])
58
- if seqs.size == 100000 * Parallel.processor_count
64
+ while vals = rq.shift
65
+ if vals != ""
66
+ seqs.push(vals)
67
+ end
68
+ if vals == "" || seqs.size == 100000 * Parallel.processor_count
59
69
  parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
60
70
  seqs = Array.new
61
71
  end
72
+ if vals == ""
73
+ qs.each { |q| q.push('') }
74
+ break
75
+ end
62
76
  end
63
- if seqs.size > 0
64
- parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
65
- end
66
-
67
- qs.each { |q| q.push('') }
68
77
  ts.each { |t| t.join }
69
78
 
70
79
  total = 0
@@ -104,9 +113,9 @@ module Bio
104
113
  dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
105
114
  if dists[0][1] < dists[1][1] && dists[0][1] < 2
106
115
  idx = dists[0][0]
107
- qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
116
+ qs[idx].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
108
117
  else
109
- qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
118
+ qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
110
119
  end
111
120
  end
112
121
 
data/lib/bio-gadget/{fqlzma.rb → fqxz.rb} RENAMED
@@ -6,11 +6,11 @@ module Bio
6
6
 
7
7
  namespace :bio
8
8
 
9
- desc 'fqlzma', 'automatic (re)compression of *.fq(.gz|.bz2) files'
10
- def fqlzma
9
+ desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
10
+ def fqxz
11
11
  Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) { |fqfilename|
12
- lzmafilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.lzma')
13
- if !lzmafilename.exist?
12
+ xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
13
+ if !xzfilename.exist?
14
14
  case fqfilename.extname
15
15
  when '.gz'
16
16
  decompressor = 'gunzip -c'
@@ -19,9 +19,9 @@ module Bio
19
19
  else
20
20
  decompressor = 'cat'
21
21
  end
22
- puts "compressing #{lzmafilename}..."
23
- system "#{decompressor} #{fqfilename} | lzma -c > #{lzmafilename} 2> #{lzmafilename}.log"
24
- system "lzma -t #{lzmafilename} >> #{lzmafilename}.log 2>&1"
22
+ puts "compressing #{xzfilename}..."
23
+ system "#{decompressor} #{fqfilename} | xz -z -e -c > #{xzfilename} 2> #{xzfilename}.log"
24
+ system "xz -t #{xzfilename} >> #{xzfilename}.log 2>&1"
25
25
  end
26
26
  }
27
27
  end
data/lib/bio-gadget/version.rb CHANGED
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.1.4"
6
+ VERSION = "0.2.1"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-07 00:00:00.000000000 Z
12
+ date: 2012-12-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor
@@ -90,8 +90,9 @@ files:
90
90
  - Rakefile
91
91
  - bio-gadget.gemspec
92
92
  - lib/bio-gadget.rb
93
+ - lib/bio-gadget/dedup.rb
93
94
  - lib/bio-gadget/demlt.rb
94
- - lib/bio-gadget/fqlzma.rb
95
+ - lib/bio-gadget/fqxz.rb
95
96
  - lib/bio-gadget/qvstat.rb
96
97
  - lib/bio-gadget/version.rb
97
98
  - lib/bio-gadget/wigchr.rb