bio-gadget 0.2.3 → 0.2.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/bio-gadget.gemspec CHANGED
@@ -18,4 +18,5 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'parallel'
19
19
  gem.add_dependency 'levenshtein-ffi'
20
20
  gem.add_dependency 'bio-faster'
21
+ gem.add_dependency 'mkfifo'
21
22
  end
data/lib/bio-gadget.rb CHANGED
@@ -5,6 +5,8 @@ require 'bio-gadget/fqxz'
5
5
  require 'bio-gadget/qvstat'
6
6
  require 'bio-gadget/wigchr'
7
7
 
8
+ require 'tempfile'
9
+
8
10
  module Bio
9
11
  class Gadget < Thor
10
12
 
@@ -22,5 +24,12 @@ module Bio
22
24
  end
23
25
  end
24
26
 
27
+ def mytempfile(basename, tmpdir = Dir::tmpdir)
28
+ fp = Tempfile.open(basename, tmpdir)
29
+ path = fp.path
30
+ fp.close #!
31
+ path
32
+ end
33
+
25
34
  end
26
35
  end
@@ -1,17 +1,18 @@
1
1
  require 'bio-faster'
2
2
  require 'levenshtein'
3
+ require 'mkfifo'
3
4
  require 'parallel'
4
- require 'thread'
5
5
 
6
6
  module Bio
7
7
  class Gadget < Thor
8
8
  namespace :bio
9
9
 
10
- desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
- option 'output-dir', :type => :string, :default => '.', :aliases => '-o'
12
- def demlt(bcfile, tmpofs)
10
+ desc 'demlt BC POS LEN', 'demultiplex fastq (via STDIN) by barcodes'
11
+ option 'output-dir', :aliases => '-o', :type => :string, :default => '.'
12
+ def demlt(bcfile, tmpofs, tmplen)
13
13
 
14
14
  ofs = tmpofs.to_i
15
+ len = tmplen.to_i
15
16
 
16
17
  wells = Array.new
17
18
  bcs = Array.new
@@ -25,62 +26,147 @@ module Bio
25
26
 
26
27
  bclens.uniq!
27
28
  if bclens.size != 1
28
- raise 'Inconsistent barcode sequences'
29
+ raise 'Inconsistent barcode sequence lengths'
29
30
  end
30
31
  bclen = bclens[0]
31
32
 
32
- ts = Array.new
33
- qs = Array.new
34
- (wells + ['other']).each { |well|
35
- q = Queue.new
36
- t = Thread.new(well, q) do |well, q|
37
- tc = Thread.current
38
- tc[:file] = "#{options['output-dir']}/#{well}.fq.xz"
39
- fp = open("| xz -z -c -e > #{tc[:file]}", 'w')
40
- tc[:read] = 0
41
- while vals = q.shift
42
- if vals == ""
43
- break
44
- else
45
- fp.puts(vals)
46
- tc[:read] = tc[:read] + 1
47
- end
48
- end
49
- fp.close()
33
+ procs = Parallel.processor_count
34
+
35
+ fifo1paths = Array.new
36
+ procs.times { |i| fifo1paths.push(mytempfile('fifo1-')) }
37
+ pid = Kernel.fork {
38
+ fifo1s = Array.new
39
+ fifo1paths.each { |fifo1path| fifo1s.push(open(fifo1path, 'w+')) }
40
+ total = 0
41
+ Bio::Faster.new(:stdin).each_record(:quality => :raw) do |vals|
42
+ fifo1 = fifo1s[total % procs]
43
+ fifo1.puts(vals.join("\t"))
44
+ fifo1.flush
45
+ total += 1
46
+ end
47
+ fifo1s.each { |fifo1| fifo1.puts('*'); fifo1.close }
48
+ Kernel.exit!
49
+ }
50
+
51
+ fifo1paths.each { |fifo1path|
52
+ until File.exist?(fifo1path)
53
+ sleep 1
50
54
  end
51
- qs.push(q)
52
- ts.push(t)
53
55
  }
54
56
 
55
- Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
56
- tmpdists = Hash.new
57
- bcs.each_index { |bcidx|
58
- tmpdists[bcidx] = Levenshtein.distance(bcs[bcidx], seq[ofs, bclen])
57
+ fifo2paths = Array.new
58
+ procs.times { |i|
59
+ fifo2path = mytempfile('fifo2-')
60
+ fifo2paths.push(fifo2path)
61
+ pid = Kernel.fork {
62
+ open(fifo2path, 'w+') { |fifo2|
63
+ fifo1 = open(fifo1paths[i], 'r+')
64
+ while true
65
+ line = fifo1.gets
66
+ if line.nil?
67
+ sleep 1
68
+ elsif line == "*\n"
69
+ break
70
+ else
71
+ seqid, seq, qvs = line.rstrip.split(/\t/)
72
+ tmpdists = Hash.new
73
+ bcs.each_index { |bcidx|
74
+ tmpdists[bcidx] = Levenshtein.distance(bcs[bcidx], seq[ofs, bclen])
75
+ }
76
+ dists = tmpdists.sort { |a, b| a[1] <=> b[1] }
77
+ bc = dists[0][1] < 2 && dists[0][1] < dists[1][1] ? dists[0][0] : -1
78
+ fifo2.puts("#{bc}\t#{seqid}\t#{seq}\t#{qvs}")
79
+ fifo2.flush
80
+ end
81
+ end
82
+ fifo1.close
83
+ fifo2.puts('*')
84
+ }
85
+ Kernel.exit!
59
86
  }
60
- dists = tmpdists.sort { |a, b| a[1] <=> b[1] }
61
- if dists[0][1] < 2 && dists[0][1] < dists[1][1]
62
- qs[dists[0][0]].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
63
- else
64
- qs[-1].push("@#{seqid}\n#{seq}\n+\n#{qvs}")
87
+ }
88
+
89
+ fifo2paths.each { |fifo2path|
90
+ until File.exist?(fifo2path)
91
+ sleep 1
65
92
  end
66
- Thread.pass
67
- end
93
+ }
68
94
 
69
- qs.each { |q| q.push('') }
70
- ts.each { |t| t.join }
95
+ tmpwells = wells + ['other']
71
96
 
72
- total = 0
73
- bcs.each_index { |i|
74
- t = ts[i]
75
- r = t[:read]
76
- puts "#{bcs[i]}\t#{r}\t#{t[:file]}"
77
- total = total+r
97
+ fifo3paths = Array.new
98
+ tmpwells.each_index { |i| fifo3paths.push(mytempfile('fifo3-')) }
99
+ pid = Kernel.fork {
100
+ fifo2s = Array.new
101
+ fifo2paths.each { |fifo2path| fifo2s.push(open(fifo2path, 'r+')) }
102
+ fifo2done = Hash.new
103
+ fifo3s = Array.new
104
+ fifo3paths.each { |fifo3path| fifo3s.push(open(fifo3path, 'w+')) }
105
+ fifo2s.cycle { |fifo2|
106
+ unless fifo2done.key?(fifo2)
107
+ line = fifo2.gets
108
+ if line.nil?
109
+ sleep 1
110
+ elsif line == "*\n"
111
+ # puts("#{fifo2} eof.")
112
+ fifo2done[fifo2] = ''
113
+ else
114
+ bcs, seqid, seq, qvs = line.rstrip.split(/\t/)
115
+ fifo3 = fifo3s[bcs.to_i]
116
+ fifo3.puts([seqid, seq, qvs].join("\t"))
117
+ fifo3.flush
118
+ end
119
+ end
120
+ if fifo2done.size == fifo2s.size
121
+ break
122
+ end
123
+ }
124
+ fifo2s.each { |fifo2| fifo2.close }
125
+ fifo3s.each { |fifo3| fifo3.puts('*'); fifo3.close }
126
+ Kernel.exit!
127
+ }
128
+
129
+ fifo3paths.each { |fifo3path|
130
+ until File.exist?(fifo3path)
131
+ sleep 1
132
+ end
133
+ }
134
+
135
+ tmpwells.each_index { |i|
136
+ well = tmpwells[i]
137
+ outpath = "#{options['output-dir']}/#{well}.fq.xz"
138
+ pid = Kernel.fork {
139
+ left = ofs+bclen
140
+ right = ofs+bclen+len-1
141
+ preprocess = ofs > 0 ? <<"DEDUPandFORMAT"
142
+ | ruby -F'\\t' -anle 'f1=$F[1][0..#{right}];f2=$F[2][0..#{right}];puts([f1+f2, $F[0], f2, f1].join("\\t"))' \\
143
+ | sort -k 1 -r | cut -f 2- | uniq -f 2 \\
144
+ | ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[2][#{left}..-1], "+", $F[1][#{left}..-1]].join("\\n"))' \\
145
+ DEDUPandFORMAT
146
+ : <<"FORMAT"
147
+ | ruby -F'\\t' -anle 'puts(["@"+$F[0], $F[1][#{left}..-1], "+", $F[2][#{left}..-1].rstrip].join("\\n"))' \\
148
+ FORMAT
149
+ preprocess += "| xz -z -c -e > #{outpath}"
150
+ open(preprocess, 'w') { |fp|
151
+ fifo3 = open(fifo3paths[i], 'r+')
152
+ while true
153
+ line = fifo3.gets
154
+ if line.nil?
155
+ sleep 1
156
+ elsif line == "*\n"
157
+ break
158
+ else
159
+ fp.puts(line)
160
+ fp.flush
161
+ end
162
+ end
163
+ fifo3.close
164
+ }
165
+ Kernel.exit!
166
+ }
78
167
  }
79
- t = ts[-1]
80
- r = t[:read]
81
- puts "Other\t#{r}\t#{t[:file]}"
82
- puts '===='
83
- puts "Total\t#{total+r}"
168
+
169
+ Process.waitall
84
170
 
85
171
  end
86
172
 
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.2.3"
6
+ VERSION = "0.2.4"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-21 00:00:00.000000000 Z
12
+ date: 2013-01-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor
@@ -75,6 +75,22 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mkfifo
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
78
94
  description: Gadgets for bioinformatics
79
95
  email:
80
96
  - shintaro.katayama@gmail.com