bio-gadget 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
data/bio-gadget.gemspec CHANGED
@@ -18,4 +18,5 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'parallel'
19
19
  gem.add_dependency 'levenshtein-ffi'
20
20
  gem.add_dependency 'bio-faster'
21
+ gem.add_dependency 'mkfifo'
21
22
  end
data/lib/bio-gadget.rb CHANGED
@@ -5,6 +5,8 @@ require 'bio-gadget/fqxz'
5
5
  require 'bio-gadget/qvstat'
6
6
  require 'bio-gadget/wigchr'
7
7
 
8
+ require 'tempfile'
9
+
8
10
  module Bio
9
11
  class Gadget < Thor
10
12
 
@@ -22,5 +24,12 @@ module Bio
22
24
  end
23
25
  end
24
26
 
27
+ def mytempfile(basename, tmpdir = Dir::tmpdir)
# Hands back a temporary-file path as a String: opens a Tempfile whose name
# is derived from `basename` inside `tmpdir` (default: Dir::tmpdir), records
# its path, closes the handle, and returns the recorded path.
28
+ fp = Tempfile.open(basename, tmpdir)
29
+ path = fp.path
# NOTE(review): this is plain `close`, not `close!`, so the file is not
# unlinked here. The Tempfile object is unreferenced once this method
# returns, so the file may presumably be removed whenever that object is
# finalized -- confirm the callers tolerate the path disappearing.
30
+ fp.close #!
31
+ path
32
+ end
33
+
25
34
  end
26
35
  end
data/lib/bio-gadget/demlt.rb CHANGED
@@ -1,17 +1,18 @@
1
1
  require 'bio-faster'
2
2
  require 'levenshtein'
3
+ require 'mkfifo'
3
4
  require 'parallel'
4
- require 'thread'
5
5
 
6
6
  module Bio
7
7
  class Gadget < Thor
8
8
  namespace :bio
9
9
 
10
- desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
- option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
12
- def demlt(bcfile, tmpofs)
10
+ desc 'demlt BC POS LEN', 'demultiplex fastq (via STDIN) by barcodes'
11
+ option 'output-dir', :aliases => '-o', :type => :string, :default => '.'
12
+ def demlt(bcfile, tmpofs, tmplen)
13
13
 
14
14
  ofs = tmpofs.to_i
15
+ len = tmplen.to_i
15
16
 
16
17
  wells = Array.new
17
18
  bcs = Array.new
@@ -25,62 +26,147 @@ module Bio
25
26
 
26
27
  bclens.uniq!
27
28
  if bclens.size != 1
28
- raise 'Inconsistent barcode sequences'
29
+ raise 'Inconsistent barcode sequence lengths'
29
30
  end
30
31
  bclen = bclens[0]
31
32
 
32
- ts = Array.new
33
- qs = Array.new
34
- (wells + ['other']).each { |well|
35
- q = Queue.new
36
- t = Thread.new(well, q) do |well, q|
37
- tc = Thread.current
38
- tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
39
- fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
40
- tc[:read] = 0
41
- while vals = q.shift
42
- if vals == ""
43
- break
44
- else
45
- fp.puts(vals)
46
- tc[:read] = tc[:read] + 1
47
- end
48
- end
49
- fp.close()
33
+ procs = Parallel.processor_count
34
+
35
+ fifo1paths = Array.new
36
+ procs.times { |i| fifo1paths.push(mytempfile('fifo1-')) }
37
+ pid = Kernel.fork {
38
+ fifo1s = Array.new
39
+ fifo1paths.each { |fifo1path| fifo1s.push(open(fifo1path, 'w+')) }
40
+ total = 0
41
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |vals|
42
+ fifo1 = fifo1s[total % procs]
43
+ fifo1.puts(vals.join("\t"))
44
+ fifo1.flush
45
+ total += 1
46
+ end
47
+ fifo1s.each { |fifo1| fifo1.puts('*'); fifo1.close }
48
+ Kernel.exit!
49
+ }
50
+
51
+ fifo1paths.each { |fifo1path|
52
+ until File.exist?(fifo1path)
53
+ sleep 1
50
54
  end
51
- qs.push(q)
52
- ts.push(t)
53
55
  }
54
56
 
55
- Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
56
- tmpdists = Hash.new
57
- bcs.each_index { |bcidx|
58
- tmpdists[bcidx] = Levenshtein.distance(bcs[bcidx], seq[ofs, bclen])
57
+ fifo2paths = Array.new
58
+ procs.times { |i|
59
+ fifo2path = mytempfile('fifo2-')
60
+ fifo2paths.push(fifo2path)
61
+ pid = Kernel.fork {
62
+ open(fifo2path, 'w+') { |fifo2|
63
+ fifo1 = open(fifo1paths[i], 'r+')
64
+ while true
65
+ line = fifo1.gets
66
+ if line.nil?
67
+ sleep 1
68
+ elsif line == "*\n"
69
+ break
70
+ else
71
+ seqid, seq, qvs = line.rstrip.split(/\t/)
72
+ tmpdists = Hash.new
73
+ bcs.each_index { |bcidx|
74
+ tmpdists[bcidx] = Levenshtein.distance(bcs[bcidx], seq[ofs, bclen])
75
+ }
76
+ dists = tmpdists.sort { |a, b| a[1] <=> b[1] }
77
+ bc = dists[0][1] < 2 && dists[0][1] < dists[1][1] ? dists[0][0] : -1
78
+ fifo2.puts("#{bc}\t#{seqid}\t#{seq}\t#{qvs}")
79
+ fifo2.flush
80
+ end
81
+ end
82
+ fifo1.close
83
+ fifo2.puts('*')
84
+ }
85
+ Kernel.exit!
59
86
  }
60
- dists = tmpdists.sort { |a, b| a[1] <=> b[1] }
61
- if dists[0][1] < 2 && dists[0][1] < dists[1][1]
62
- qs[dists[0][0]].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
63
- else
64
- qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
87
+ }
88
+
89
+ fifo2paths.each { |fifo2path|
90
+ until File.exist?(fifo2path)
91
+ sleep 1
65
92
  end
66
- Thread.pass
67
- end
93
+ }
68
94
 
69
- qs.each { |q| q.push('') }
70
- ts.each { |t| t.join }
95
+ tmpwells = wells + ['other']
71
96
 
72
- total = 0
73
- bcs.each_index { |i|
74
- t = ts[i]
75
- r = t[:read]
76
- puts "#{bcs[i]}\t#{r}\t#{t[:file]}"
77
- total = total+r
97
+ fifo3paths = Array.new
98
+ tmpwells.each_index { |i| fifo3paths.push(mytempfile('fifo3-')) }
99
+ pid = Kernel.fork {
100
+ fifo2s = Array.new
101
+ fifo2paths.each { |fifo2path| fifo2s.push(open(fifo2path, 'r+')) }
102
+ fifo2done = Hash.new
103
+ fifo3s = Array.new
104
+ fifo3paths.each { |fifo3path| fifo3s.push(open(fifo3path, 'w+')) }
105
+ fifo2s.cycle { |fifo2|
106
+ unless fifo2done.key?(fifo2)
107
+ line = fifo2.gets
108
+ if line.nil?
109
+ sleep 1
110
+ elsif line == "*\n"
111
+ # puts("#{fifo2} eof.")
112
+ fifo2done[fifo2] = ''
113
+ else
114
+ bcs, seqid, seq, qvs = line.rstrip.split(/\t/)
115
+ fifo3 = fifo3s[bcs.to_i]
116
+ fifo3.puts([seqid, seq, qvs].join("\t"))
117
+ fifo3.flush
118
+ end
119
+ end
120
+ if fifo2done.size == fifo2s.size
121
+ break
122
+ end
123
+ }
124
+ fifo2s.each { |fifo2| fifo2.close }
125
+ fifo3s.each { |fifo3| fifo3.puts('*'); fifo3.close }
126
+ Kernel.exit!
127
+ }
128
+
129
+ fifo3paths.each { |fifo3path|
130
+ until File.exist?(fifo3path)
131
+ sleep 1
132
+ end
133
+ }
134
+
135
+ tmpwells.each_index { |i|
136
+ well = tmpwells[i]
137
+ outpath = "#{options['output-dir']}/#{well}.fq.xz"
138
+ pid = Kernel.fork {
139
+ left = ofs+bclen
140
+ right = ofs+bclen+len-1
141
+ preprocess = ofs > 0 ? <<"DEDUPandFORMAT"
142
+ | ruby -F'\\t' -anle 'f1=$F[1][0..#{right}];f2=$F[2][0..#{right}];puts([f1+f2, $F[0], f2, f1].join("\\t"))' \\
143
+ | sort -k 1 -r | cut -f 2- | uniq -f 2 \\
144
+ | ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[2][#{left}..-1], "+", $F[1][#{left}..-1]].join("\\n"))' \\
145
+ DEDUPandFORMAT
146
+ : <<"FORMAT"
147
+ | ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[1][#{left}..-1], "+", $F[2][#{left}..-1].rstrip].join("\\n"))' \\
148
+ FORMAT
149
+ preprocess += "| xz -z -c -e > #{outpath}"
150
+ open(preprocess, 'w') { |fp|
151
+ fifo3 = open(fifo3paths[i], 'r+')
152
+ while true
153
+ line = fifo3.gets
154
+ if line.nil?
155
+ sleep 1
156
+ elsif line == "*\n"
157
+ break
158
+ else
159
+ fp.puts(line)
160
+ fp.flush
161
+ end
162
+ end
163
+ fifo3.close
164
+ }
165
+ Kernel.exit!
166
+ }
78
167
  }
79
- t = ts[-1]
80
- r = t[:read]
81
- puts "Other\t#{r}\t#{t[:file]}"
82
- puts '===='
83
- puts "Total\t#{total+r}"
168
+
169
+ Process.waitall
84
170
 
85
171
  end
86
172
 
data/lib/bio-gadget/version.rb CHANGED
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.2.3"
6
+ VERSION = "0.2.4"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-21 00:00:00.000000000 Z
12
+ date: 2013-01-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor
@@ -75,6 +75,22 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mkfifo
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
78
94
  description: Gadgets for bioinformatics
79
95
  email:
80
96
  - shintaro.katayama@gmail.com