bio-gadget 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/bio-gadget.gemspec +1 -0
- data/lib/bio-gadget.rb +9 -0
- data/lib/bio-gadget/demlt.rb +135 -49
- data/lib/bio-gadget/version.rb +1 -1
- metadata +18 -2
data/bio-gadget.gemspec
CHANGED
data/lib/bio-gadget.rb
CHANGED
@@ -5,6 +5,8 @@ require 'bio-gadget/fqxz'
|
|
5
5
|
require 'bio-gadget/qvstat'
|
6
6
|
require 'bio-gadget/wigchr'
|
7
7
|
|
8
|
+
require 'tempfile'
|
9
|
+
|
8
10
|
module Bio
|
9
11
|
class Gadget < Thor
|
10
12
|
|
@@ -22,5 +24,12 @@ module Bio
|
|
22
24
|
end
|
23
25
|
end
|
24
26
|
|
27
|
+
def mytempfile(basename, tmpdir = Dir::tmpdir)
|
28
|
+
fp = Tempfile.open(basename, tmpdir)
|
29
|
+
path = fp.path
|
30
|
+
fp.close #!
|
31
|
+
path
|
32
|
+
end
|
33
|
+
|
25
34
|
end
|
26
35
|
end
|
data/lib/bio-gadget/demlt.rb
CHANGED
@@ -1,17 +1,18 @@
|
|
1
1
|
require 'bio-faster'
|
2
2
|
require 'levenshtein'
|
3
|
+
require 'mkfifo'
|
3
4
|
require 'parallel'
|
4
|
-
require 'thread'
|
5
5
|
|
6
6
|
module Bio
|
7
7
|
class Gadget < Thor
|
8
8
|
namespace :bio
|
9
9
|
|
10
|
-
desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
|
11
|
-
option 'output-dir', :type => :string, :default => '.'
|
12
|
-
def demlt(bcfile, tmpofs)
|
10
|
+
desc 'demlt BC POS LEN', 'demultiplex fastq (via STDIN) by barcodes'
|
11
|
+
option 'output-dir', :aliases => '-o', :type => :string, :default => '.'
|
12
|
+
def demlt(bcfile, tmpofs, tmplen)
|
13
13
|
|
14
14
|
ofs = tmpofs.to_i
|
15
|
+
len = tmplen.to_i
|
15
16
|
|
16
17
|
wells = Array.new
|
17
18
|
bcs = Array.new
|
@@ -25,62 +26,147 @@ module Bio
|
|
25
26
|
|
26
27
|
bclens.uniq!
|
27
28
|
if bclens.size != 1
|
28
|
-
raise 'Inconsistent barcode
|
29
|
+
raise 'Inconsistent barcode sequence lengths'
|
29
30
|
end
|
30
31
|
bclen = bclens[0]
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
33
|
+
procs = Parallel.processor_count
|
34
|
+
|
35
|
+
fifo1paths = Array.new
|
36
|
+
procs.times { |i| fifo1paths.push(mytempfile('fifo1-')) }
|
37
|
+
pid = Kernel.fork {
|
38
|
+
fifo1s = Array.new
|
39
|
+
fifo1paths.each { |fifo1path| fifo1s.push(open(fifo1path, 'w+')) }
|
40
|
+
total = 0
|
41
|
+
Bio::Faster.new(:stdin).each_record(:quality => :raw) do |vals|
|
42
|
+
fifo1 = fifo1s[total % procs]
|
43
|
+
fifo1.puts(vals.join("\t"))
|
44
|
+
fifo1.flush
|
45
|
+
total += 1
|
46
|
+
end
|
47
|
+
fifo1s.each { |fifo1| fifo1.puts('*'); fifo1.close }
|
48
|
+
Kernel.exit!
|
49
|
+
}
|
50
|
+
|
51
|
+
fifo1paths.each { |fifo1path|
|
52
|
+
until File.exist?(fifo1path)
|
53
|
+
sleep 1
|
50
54
|
end
|
51
|
-
qs.push(q)
|
52
|
-
ts.push(t)
|
53
55
|
}
|
54
56
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
57
|
+
fifo2paths = Array.new
|
58
|
+
procs.times { |i|
|
59
|
+
fifo2path = mytempfile('fifo2-')
|
60
|
+
fifo2paths.push(fifo2path)
|
61
|
+
pid = Kernel.fork {
|
62
|
+
open(fifo2path, 'w+') { |fifo2|
|
63
|
+
fifo1 = open(fifo1paths[i], 'r+')
|
64
|
+
while true
|
65
|
+
line = fifo1.gets
|
66
|
+
if line.nil?
|
67
|
+
sleep 1
|
68
|
+
elsif line == "*\n"
|
69
|
+
break
|
70
|
+
else
|
71
|
+
seqid, seq, qvs = line.rstrip.split(/\t/)
|
72
|
+
tmpdists = Hash.new
|
73
|
+
bcs.each_index { |bcidx|
|
74
|
+
tmpdists[bcidx] = Levenshtein.distance(bcs[bcidx], seq[ofs, bclen])
|
75
|
+
}
|
76
|
+
dists = tmpdists.sort { |a, b| a[1] <=> b[1] }
|
77
|
+
bc = dists[0][1] < 2 && dists[0][1] < dists[1][1] ? dists[0][0] : -1
|
78
|
+
fifo2.puts("#{bc}\t#{seqid}\t#{seq}\t#{qvs}")
|
79
|
+
fifo2.flush
|
80
|
+
end
|
81
|
+
end
|
82
|
+
fifo1.close
|
83
|
+
fifo2.puts('*')
|
84
|
+
}
|
85
|
+
Kernel.exit!
|
59
86
|
}
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
87
|
+
}
|
88
|
+
|
89
|
+
fifo2paths.each { |fifo2path|
|
90
|
+
until File.exist?(fifo2path)
|
91
|
+
sleep 1
|
65
92
|
end
|
66
|
-
|
67
|
-
end
|
93
|
+
}
|
68
94
|
|
69
|
-
|
70
|
-
ts.each { |t| t.join }
|
95
|
+
tmpwells = wells + ['other']
|
71
96
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
97
|
+
fifo3paths = Array.new
|
98
|
+
tmpwells.each_index { |i| fifo3paths.push(mytempfile('fifo3-')) }
|
99
|
+
pid = Kernel.fork {
|
100
|
+
fifo2s = Array.new
|
101
|
+
fifo2paths.each { |fifo2path| fifo2s.push(open(fifo2path, 'r+')) }
|
102
|
+
fifo2done = Hash.new
|
103
|
+
fifo3s = Array.new
|
104
|
+
fifo3paths.each { |fifo3path| fifo3s.push(open(fifo3path, 'w+')) }
|
105
|
+
fifo2s.cycle { |fifo2|
|
106
|
+
unless fifo2done.key?(fifo2)
|
107
|
+
line = fifo2.gets
|
108
|
+
if line.nil?
|
109
|
+
sleep 1
|
110
|
+
elsif line == "*\n"
|
111
|
+
# puts("#{fifo2} eof.")
|
112
|
+
fifo2done[fifo2] = ''
|
113
|
+
else
|
114
|
+
bcs, seqid, seq, qvs = line.rstrip.split(/\t/)
|
115
|
+
fifo3 = fifo3s[bcs.to_i]
|
116
|
+
fifo3.puts([seqid, seq, qvs].join("\t"))
|
117
|
+
fifo3.flush
|
118
|
+
end
|
119
|
+
end
|
120
|
+
if fifo2done.size == fifo2s.size
|
121
|
+
break
|
122
|
+
end
|
123
|
+
}
|
124
|
+
fifo2s.each { |fifo2| fifo2.close }
|
125
|
+
fifo3s.each { |fifo3| fifo3.puts('*'); fifo3.close }
|
126
|
+
Kernel.exit!
|
127
|
+
}
|
128
|
+
|
129
|
+
fifo3paths.each { |fifo3path|
|
130
|
+
until File.exist?(fifo3path)
|
131
|
+
sleep 1
|
132
|
+
end
|
133
|
+
}
|
134
|
+
|
135
|
+
tmpwells.each_index { |i|
|
136
|
+
well = tmpwells[i]
|
137
|
+
outpath = "#{options['output-dir']}/#{well}.fq.xz"
|
138
|
+
pid = Kernel.fork {
|
139
|
+
left = ofs+bclen
|
140
|
+
right = ofs+bclen+len-1
|
141
|
+
preprocess = ofs > 0 ? <<"DEDUPandFORMAT"
|
142
|
+
| ruby -F'\\t' -anle 'f1=$F[1][0..#{right}];f2=$F[2][0..#{right}];puts([f1+f2, $F[0], f2, f1].join("\\t"))' \\
|
143
|
+
| sort -k 1 -r | cut -f 2- | uniq -f 2 \\
|
144
|
+
| ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[2][#{left}..-1], "+", $F[1][#{left}..-1]].join("\\n"))' \\
|
145
|
+
DEDUPandFORMAT
|
146
|
+
: <<"FORMAT"
|
147
|
+
| ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[1][#{left}..-1], "+", $F[2][#{left}..-1].rstrip].join("\\n"))' \\
|
148
|
+
FORMAT
|
149
|
+
preprocess += "| xz -z -c -e > #{outpath}"
|
150
|
+
open(preprocess, 'w') { |fp|
|
151
|
+
fifo3 = open(fifo3paths[i], 'r+')
|
152
|
+
while true
|
153
|
+
line = fifo3.gets
|
154
|
+
if line.nil?
|
155
|
+
sleep 1
|
156
|
+
elsif line == "*\n"
|
157
|
+
break
|
158
|
+
else
|
159
|
+
fp.puts(line)
|
160
|
+
fp.flush
|
161
|
+
end
|
162
|
+
end
|
163
|
+
fifo3.close
|
164
|
+
}
|
165
|
+
Kernel.exit!
|
166
|
+
}
|
78
167
|
}
|
79
|
-
|
80
|
-
|
81
|
-
puts "Other\t#{r}\t#{t[:file]}"
|
82
|
-
puts '===='
|
83
|
-
puts "Total\t#{total+r}"
|
168
|
+
|
169
|
+
Process.waitall
|
84
170
|
|
85
171
|
end
|
86
172
|
|
data/lib/bio-gadget/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-gadget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: gthor
|
@@ -75,6 +75,22 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: mkfifo
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
78
94
|
description: Gadgets for bioinformatics
|
79
95
|
email:
|
80
96
|
- shintaro.katayama@gmail.com
|