bio-gadget 0.1.4 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.org +2 -1
- data/lib/bio-gadget.rb +2 -1
- data/lib/bio-gadget/dedup.rb +33 -0
- data/lib/bio-gadget/demlt.rb +22 -13
- data/lib/bio-gadget/{fqlzma.rb → fqxz.rb} +7 -7
- data/lib/bio-gadget/version.rb +1 -1
- metadata +4 -3
data/README.org
CHANGED
@@ -17,8 +17,9 @@ Currently available commands are
|
|
17
17
|
|
18
18
|
: bio
|
19
19
|
: ---
|
20
|
+
: gthor bio:dedup # deduplicate fastq (via STDIN)
|
20
21
|
: gthor bio:demlt BC POS # demultiplex fastq (via STDIN) by barcodes
|
21
|
-
: gthor bio:
|
22
|
+
: gthor bio:fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
|
22
23
|
: gthor bio:qvstat QUAL # statistics of quality values in *.qual file
|
23
24
|
: gthor bio:wigchr WIG CHR # extract wiggle track on specified chromosome
|
24
25
|
|
data/lib/bio-gadget.rb
CHANGED
data/lib/bio-gadget/dedup.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'bio-faster'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
class Gadget < Thor
|
6
|
+
namespace :bio
|
7
|
+
|
8
|
+
desc 'dedup', 'deduplicate fastq (via STDIN)'
|
9
|
+
def dedup
|
10
|
+
|
11
|
+
p1in, p1out = IO.pipe
|
12
|
+
|
13
|
+
fork {
|
14
|
+
p1in.close
|
15
|
+
$stdout.reopen(p1out)
|
16
|
+
open("| sort -k 1 -r -S #{sprintf('%2d', 100/(Parallel.processor_count+1))}% -T $TMPDIR | cut -f 2- | uniq -f 2", 'w') { |fp|
|
17
|
+
Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
|
18
|
+
fp.puts "#{seq}#{qvs}\t#{seqid}\t#{qvs}\t#{seq}"
|
19
|
+
end
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
p1out.close
|
24
|
+
|
25
|
+
p1in.each_line { |line|
|
26
|
+
seqid, qvs, seq = line.rstrip.split
|
27
|
+
puts "@#{seqid}\n#{seq}\n+\n#{qvs}"
|
28
|
+
}
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
data/lib/bio-gadget/demlt.rb
CHANGED
@@ -8,7 +8,7 @@ module Bio
|
|
8
8
|
namespace :bio
|
9
9
|
|
10
10
|
desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
|
11
|
-
option
|
11
|
+
option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
|
12
12
|
def demlt(bcfile, tmpofs)
|
13
13
|
|
14
14
|
ofs = tmpofs.to_i
|
@@ -35,8 +35,8 @@ module Bio
|
|
35
35
|
q = SizedQueue.new(100000)
|
36
36
|
t = Thread.new(well, q) do |well, q|
|
37
37
|
tc = Thread.current
|
38
|
-
tc[:file] = "#{options[
|
39
|
-
fp = open("|
|
38
|
+
tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
|
39
|
+
fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
|
40
40
|
tc[:read] = 0
|
41
41
|
while vals = q.shift
|
42
42
|
if vals == ""
|
@@ -52,19 +52,28 @@ module Bio
|
|
52
52
|
ts.push(t)
|
53
53
|
}
|
54
54
|
|
55
|
+
rq = Queue.new
|
56
|
+
Thread.new(rq) {
|
57
|
+
Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
|
58
|
+
rq.push([seqid, seq, qvs])
|
59
|
+
end
|
60
|
+
rq.push('')
|
61
|
+
}
|
62
|
+
|
55
63
|
seqs = Array.new
|
56
|
-
|
57
|
-
|
58
|
-
|
64
|
+
while vals = rq.shift
|
65
|
+
if vals != ""
|
66
|
+
seqs.push(vals)
|
67
|
+
end
|
68
|
+
if vals == "" || seqs.size == 100000 * Parallel.processor_count
|
59
69
|
parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
|
60
70
|
seqs = Array.new
|
61
71
|
end
|
72
|
+
if vals == ""
|
73
|
+
qs.each { |q| q.push('') }
|
74
|
+
break
|
75
|
+
end
|
62
76
|
end
|
63
|
-
if seqs.size > 0
|
64
|
-
parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
|
65
|
-
end
|
66
|
-
|
67
|
-
qs.each { |q| q.push('') }
|
68
77
|
ts.each { |t| t.join }
|
69
78
|
|
70
79
|
total = 0
|
@@ -104,9 +113,9 @@ module Bio
|
|
104
113
|
dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
|
105
114
|
if dists[0][1] < dists[1][1] && dists[0][1] < 2
|
106
115
|
idx = dists[0][0]
|
107
|
-
qs[idx].push("
|
116
|
+
qs[idx].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
|
108
117
|
else
|
109
|
-
qs[-1].push("
|
118
|
+
qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
|
110
119
|
end
|
111
120
|
end
|
112
121
|
|
data/lib/bio-gadget/{fqlzma.rb → fqxz.rb}
CHANGED
@@ -6,11 +6,11 @@ module Bio
|
|
6
6
|
|
7
7
|
namespace :bio
|
8
8
|
|
9
|
-
desc '
|
10
|
-
def
|
9
|
+
desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
|
10
|
+
def fqxz
|
11
11
|
Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) { |fqfilename|
|
12
|
-
|
13
|
-
if !
|
12
|
+
xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
|
13
|
+
if !xzfilename.exist?
|
14
14
|
case fqfilename.extname
|
15
15
|
when '.gz'
|
16
16
|
decompressor = 'gunzip -c'
|
@@ -19,9 +19,9 @@ module Bio
|
|
19
19
|
else
|
20
20
|
decompressor = 'cat'
|
21
21
|
end
|
22
|
-
puts "compressing #{
|
23
|
-
system "#{decompressor} #{fqfilename} |
|
24
|
-
system "
|
22
|
+
puts "compressing #{xzfilename}..."
|
23
|
+
system "#{decompressor} #{fqfilename} | xz -z -e -c > #{xzfilename} 2> #{xzfilename}.log"
|
24
|
+
system "xz -t #{xzfilename} >> #{xzfilename}.log 2>&1"
|
25
25
|
end
|
26
26
|
}
|
27
27
|
end
|
data/lib/bio-gadget/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-gadget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: gthor
|
@@ -90,8 +90,9 @@ files:
|
|
90
90
|
- Rakefile
|
91
91
|
- bio-gadget.gemspec
|
92
92
|
- lib/bio-gadget.rb
|
93
|
+
- lib/bio-gadget/dedup.rb
|
93
94
|
- lib/bio-gadget/demlt.rb
|
94
|
-
- lib/bio-gadget/fqlzma.rb
|
95
|
+
- lib/bio-gadget/fqxz.rb
|
95
96
|
- lib/bio-gadget/qvstat.rb
|
96
97
|
- lib/bio-gadget/version.rb
|
97
98
|
- lib/bio-gadget/wigchr.rb
|