bio-gadget 0.1.4 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/README.org CHANGED
@@ -17,8 +17,9 @@ Currently available commands are
17
17
 
18
18
  : bio
19
19
  : ---
20
+ : gthor bio:dedup # deduplicate fastq (via STDIN)
20
21
  : gthor bio:demlt BC POS # demultiplex fastq (via STDIN) by barcodes
21
- : gthor bio:fqlzma # automatic (re)compression of *.fq(.gz|.bz2) files
22
+ : gthor bio:fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
22
23
  : gthor bio:qvstat QUAL # statistics of quality values in *.qual file
23
24
  : gthor bio:wigchr WIG CHR # extract wiggle track on specified chromosome
24
25
 
data/lib/bio-gadget.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'bio-gadget/version'
2
+ require 'bio-gadget/dedup'
2
3
  require 'bio-gadget/demlt'
3
- require 'bio-gadget/fqlzma'
4
+ require 'bio-gadget/fqxz'
4
5
  require 'bio-gadget/qvstat'
5
6
  require 'bio-gadget/wigchr'
6
7
 
@@ -0,0 +1,33 @@
1
+ require 'bio-faster'
2
+ require 'parallel'
3
+
4
+ module Bio
5
+ class Gadget < Thor
6
+ namespace :bio
7
+
8
+ desc 'dedup', 'deduplicate fastq (via STDIN)'
9
+ def dedup
10
+
11
+ p1in, p1out = IO.pipe
12
+
13
+ fork {
14
+ p1in.close
15
+ $stdout.reopen(p1out)
16
+ open("| sort -k 1 -r -S #{sprintf('%2d', 100/(Parallel.processor_count+1))}% -T $TMPDIR | cut -f 2- | uniq -f 2", 'w') { |fp|
17
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
18
+ fp.puts "#{seq}#{qvs}\t#{seqid}\t#{qvs}\t#{seq}"
19
+ end
20
+ }
21
+ }
22
+
23
+ p1out.close
24
+
25
+ p1in.each_line { |line|
26
+ seqid, qvs, seq = line.rstrip.split
27
+ puts "@#{seqid}\n#{seq}\n+\n#{qvs}"
28
+ }
29
+
30
+ end
31
+
32
+ end
33
+ end
@@ -8,7 +8,7 @@ module Bio
8
8
  namespace :bio
9
9
 
10
10
  desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
- option :destdir, :type => :string, :default => '.'
11
+ option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
12
12
  def demlt(bcfile, tmpofs)
13
13
 
14
14
  ofs = tmpofs.to_i
@@ -35,8 +35,8 @@ module Bio
35
35
  q = SizedQueue.new(100000)
36
36
  t = Thread.new(well, q) do |well, q|
37
37
  tc = Thread.current
38
- tc[:file] = "#{options[:destdir]}/#{well}.fq.gz"
39
- fp = open("| gzip -c > #{tc[:file]}", 'w')
38
+ tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
39
+ fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
40
40
  tc[:read] = 0
41
41
  while vals = q.shift
42
42
  if vals == ""
@@ -52,19 +52,28 @@ module Bio
52
52
  ts.push(t)
53
53
  }
54
54
 
55
+ rq = Queue.new
56
+ Thread.new(rq) {
57
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
58
+ rq.push([seqid, seq, qvs])
59
+ end
60
+ rq.push('')
61
+ }
62
+
55
63
  seqs = Array.new
56
- Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
57
- seqs.push([seqid, seq, qvs])
58
- if seqs.size == 100000 * Parallel.processor_count
64
+ while vals = rq.shift
65
+ if vals != ""
66
+ seqs.push(vals)
67
+ end
68
+ if vals == "" || seqs.size == 100000 * Parallel.processor_count
59
69
  parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
60
70
  seqs = Array.new
61
71
  end
72
+ if vals == ""
73
+ qs.each { |q| q.push('') }
74
+ break
75
+ end
62
76
  end
63
- if seqs.size > 0
64
- parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
65
- end
66
-
67
- qs.each { |q| q.push('') }
68
77
  ts.each { |t| t.join }
69
78
 
70
79
  total = 0
@@ -104,9 +113,9 @@ module Bio
104
113
  dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
105
114
  if dists[0][1] < dists[1][1] && dists[0][1] < 2
106
115
  idx = dists[0][0]
107
- qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
116
+ qs[idx].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
108
117
  else
109
- qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
118
+ qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
110
119
  end
111
120
  end
112
121
 
@@ -6,11 +6,11 @@ module Bio
6
6
 
7
7
  namespace :bio
8
8
 
9
- desc 'fqlzma', 'automatic (re)compression of *.fq(.gz|.bz2) files'
10
- def fqlzma
9
+ desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
10
+ def fqxz
11
11
  Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) { |fqfilename|
12
- lzmafilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.lzma')
13
- if !lzmafilename.exist?
12
+ xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
13
+ if !xzfilename.exist?
14
14
  case fqfilename.extname
15
15
  when '.gz'
16
16
  decompressor = 'gunzip -c'
@@ -19,9 +19,9 @@ module Bio
19
19
  else
20
20
  decompressor = 'cat'
21
21
  end
22
- puts "compressing #{lzmafilename}..."
23
- system "#{decompressor} #{fqfilename} | lzma -c > #{lzmafilename} 2> #{lzmafilename}.log"
24
- system "lzma -t #{lzmafilename} >> #{lzmafilename}.log 2>&1"
22
+ puts "compressing #{xzfilename}..."
23
+ system "#{decompressor} #{fqfilename} | xz -z -e -c > #{xzfilename} 2> #{xzfilename}.log"
24
+ system "xz -t #{xzfilename} >> #{xzfilename}.log 2>&1"
25
25
  end
26
26
  }
27
27
  end
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.1.4"
6
+ VERSION = "0.2.1"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-07 00:00:00.000000000 Z
12
+ date: 2012-12-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor
@@ -90,8 +90,9 @@ files:
90
90
  - Rakefile
91
91
  - bio-gadget.gemspec
92
92
  - lib/bio-gadget.rb
93
+ - lib/bio-gadget/dedup.rb
93
94
  - lib/bio-gadget/demlt.rb
94
- - lib/bio-gadget/fqlzma.rb
95
+ - lib/bio-gadget/fqxz.rb
95
96
  - lib/bio-gadget/qvstat.rb
96
97
  - lib/bio-gadget/version.rb
97
98
  - lib/bio-gadget/wigchr.rb