bio-gadget 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.org +2 -1
- data/lib/bio-gadget.rb +2 -1
- data/lib/bio-gadget/dedup.rb +33 -0
- data/lib/bio-gadget/demlt.rb +22 -13
- data/lib/bio-gadget/{fqlzma.rb → fqxz.rb} +7 -7
- data/lib/bio-gadget/version.rb +1 -1
- metadata +4 -3
data/README.org
CHANGED
|
@@ -17,8 +17,9 @@ Currently available commands are
|
|
|
17
17
|
|
|
18
18
|
: bio
|
|
19
19
|
: ---
|
|
20
|
+
: gthor bio:dedup # deduplicate fastq (via STDIN)
|
|
20
21
|
: gthor bio:demlt BC POS # demultiplex fastq (via STDIN) by barcodes
|
|
21
|
-
: gthor bio:
|
|
22
|
+
: gthor bio:fqxz # automatic (re)compression of *.fq(.gz|.bz2) files
|
|
22
23
|
: gthor bio:qvstat QUAL # statistics of quality values in *.qual file
|
|
23
24
|
: gthor bio:wigchr WIG CHR # extract wiggle track on specified chromosome
|
|
24
25
|
|
data/lib/bio-gadget.rb
CHANGED
data/lib/bio-gadget/dedup.rb
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
require 'bio-faster'
|
|
2
|
+
require 'parallel'
|
|
3
|
+
|
|
4
|
+
module Bio
|
|
5
|
+
class Gadget < Thor
|
|
6
|
+
namespace :bio
|
|
7
|
+
|
|
8
|
+
desc 'dedup', 'deduplicate fastq (via STDIN)'
|
|
9
|
+
def dedup
|
|
10
|
+
|
|
11
|
+
p1in, p1out = IO.pipe
|
|
12
|
+
|
|
13
|
+
fork {
|
|
14
|
+
p1in.close
|
|
15
|
+
$stdout.reopen(p1out)
|
|
16
|
+
open("| sort -k 1 -r -S #{sprintf('%2d', 100/(Parallel.processor_count+1))}% -T $TMPDIR | cut -f 2- | uniq -f 2", 'w') { |fp|
|
|
17
|
+
Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
|
|
18
|
+
fp.puts "#{seq}#{qvs}\t#{seqid}\t#{qvs}\t#{seq}"
|
|
19
|
+
end
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
p1out.close
|
|
24
|
+
|
|
25
|
+
p1in.each_line { |line|
|
|
26
|
+
seqid, qvs, seq = line.rstrip.split
|
|
27
|
+
puts "@#{seqid}\n#{seq}\n+\n#{qvs}"
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
end
|
|
33
|
+
end
|
data/lib/bio-gadget/demlt.rb
CHANGED
|
@@ -8,7 +8,7 @@ module Bio
|
|
|
8
8
|
namespace :bio
|
|
9
9
|
|
|
10
10
|
desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
|
|
11
|
-
option
|
|
11
|
+
option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
|
|
12
12
|
def demlt(bcfile, tmpofs)
|
|
13
13
|
|
|
14
14
|
ofs = tmpofs.to_i
|
|
@@ -35,8 +35,8 @@ module Bio
|
|
|
35
35
|
q = SizedQueue.new(100000)
|
|
36
36
|
t = Thread.new(well, q) do |well, q|
|
|
37
37
|
tc = Thread.current
|
|
38
|
-
tc[:file] = "#{options[
|
|
39
|
-
fp = open("|
|
|
38
|
+
tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
|
|
39
|
+
fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
|
|
40
40
|
tc[:read] = 0
|
|
41
41
|
while vals = q.shift
|
|
42
42
|
if vals == ""
|
|
@@ -52,19 +52,28 @@ module Bio
|
|
|
52
52
|
ts.push(t)
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
rq = Queue.new
|
|
56
|
+
Thread.new(rq) {
|
|
57
|
+
Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
|
|
58
|
+
rq.push([seqid, seq, qvs])
|
|
59
|
+
end
|
|
60
|
+
rq.push('')
|
|
61
|
+
}
|
|
62
|
+
|
|
55
63
|
seqs = Array.new
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
64
|
+
while vals = rq.shift
|
|
65
|
+
if vals != ""
|
|
66
|
+
seqs.push(vals)
|
|
67
|
+
end
|
|
68
|
+
if vals == "" || seqs.size == 100000 * Parallel.processor_count
|
|
59
69
|
parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
|
|
60
70
|
seqs = Array.new
|
|
61
71
|
end
|
|
72
|
+
if vals == ""
|
|
73
|
+
qs.each { |q| q.push('') }
|
|
74
|
+
break
|
|
75
|
+
end
|
|
62
76
|
end
|
|
63
|
-
if seqs.size > 0
|
|
64
|
-
parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
qs.each { |q| q.push('') }
|
|
68
77
|
ts.each { |t| t.join }
|
|
69
78
|
|
|
70
79
|
total = 0
|
|
@@ -104,9 +113,9 @@ module Bio
|
|
|
104
113
|
dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
|
|
105
114
|
if dists[0][1] < dists[1][1] && dists[0][1] < 2
|
|
106
115
|
idx = dists[0][0]
|
|
107
|
-
qs[idx].push("
|
|
116
|
+
qs[idx].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
|
|
108
117
|
else
|
|
109
|
-
qs[-1].push("
|
|
118
|
+
qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
|
|
110
119
|
end
|
|
111
120
|
end
|
|
112
121
|
|
|
data/lib/bio-gadget/{fqlzma.rb → fqxz.rb}
CHANGED
@@ -6,11 +6,11 @@ module Bio
|
|
|
6
6
|
|
|
7
7
|
namespace :bio
|
|
8
8
|
|
|
9
|
-
desc '
|
|
10
|
-
def
|
|
9
|
+
desc 'fqxz', 'automatic (re)compression of *.fq(.gz|.bz2) files'
|
|
10
|
+
def fqxz
|
|
11
11
|
Parallel.map(Pathname.glob('*.fq{.gz,.bz2,}')) { |fqfilename|
|
|
12
|
-
|
|
13
|
-
if !
|
|
12
|
+
xzfilename = fqfilename.sub(/\.fq(\.(gz|bz2))*$/, '.fq.xz')
|
|
13
|
+
if !xzfilename.exist?
|
|
14
14
|
case fqfilename.extname
|
|
15
15
|
when '.gz'
|
|
16
16
|
decompressor = 'gunzip -c'
|
|
@@ -19,9 +19,9 @@ module Bio
|
|
|
19
19
|
else
|
|
20
20
|
decompressor = 'cat'
|
|
21
21
|
end
|
|
22
|
-
puts "compressing #{
|
|
23
|
-
system "#{decompressor} #{fqfilename} |
|
|
24
|
-
system "
|
|
22
|
+
puts "compressing #{xzfilename}..."
|
|
23
|
+
system "#{decompressor} #{fqfilename} | xz -z -e -c > #{xzfilename} 2> #{xzfilename}.log"
|
|
24
|
+
system "xz -t #{xzfilename} >> #{xzfilename}.log 2>&1"
|
|
25
25
|
end
|
|
26
26
|
}
|
|
27
27
|
end
|
data/lib/bio-gadget/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bio-gadget
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.4
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-12-
|
|
12
|
+
date: 2012-12-09 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: gthor
|
|
@@ -90,8 +90,9 @@ files:
|
|
|
90
90
|
- Rakefile
|
|
91
91
|
- bio-gadget.gemspec
|
|
92
92
|
- lib/bio-gadget.rb
|
|
93
|
+
- lib/bio-gadget/dedup.rb
|
|
93
94
|
- lib/bio-gadget/demlt.rb
|
|
94
|
-
- lib/bio-gadget/fqlzma.rb
|
|
95
|
+
- lib/bio-gadget/fqxz.rb
|
|
95
96
|
- lib/bio-gadget/qvstat.rb
|
|
96
97
|
- lib/bio-gadget/version.rb
|
|
97
98
|
- lib/bio-gadget/wigchr.rb
|