bio-gadget 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,6 @@
1
1
  require 'bio-faster'
2
2
  require 'levenshtein'
3
+ require 'parallel'
3
4
  require 'thread'
4
5
 
5
6
  module Bio
@@ -7,6 +8,7 @@ module Bio
7
8
  namespace :bio
8
9
 
9
10
  desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
+ option :destdir, :type => :string, :default => '.'
10
12
  def demlt(bcfile, tmpofs)
11
13
 
12
14
  ofs = tmpofs.to_i
@@ -30,14 +32,18 @@ module Bio
30
32
  ts = Array.new
31
33
  qs = Array.new
32
34
  (wells + ['other']).each { |well|
33
- q = Queue.new
34
- t = Thread.new(well, q) do |well|
35
- fp = open("| gzip -c > #{well}.fq.gz", 'w')
35
+ q = SizedQueue.new(100000)
36
+ t = Thread.new(well, q) do |well, q|
37
+ tc = Thread.current
38
+ tc[:file] = "#{options[:destdir]}/#{well}.fq.gz"
39
+ fp = open("| gzip -c > #{tc[:file]}", 'w')
40
+ tc[:read] = 0
36
41
  while vals = q.shift
37
42
  if vals == ""
38
43
  break
39
44
  else
40
45
  fp.puts(vals)
46
+ tc[:read] = tc[:read] + 1
41
47
  end
42
48
  end
43
49
  fp.close()
@@ -46,37 +52,65 @@ module Bio
46
52
  ts.push(t)
47
53
  }
48
54
 
49
- reads = Array.new(bcs.size+1, 0)
50
- tmpdist = Hash.new
55
+ seqs = Array.new
51
56
  Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
52
- seqbc = seq[ofs, bclen]
53
- bcs.each_index do |i|
54
- tmpdist[i] = Levenshtein.distance(bcs[i], seqbc)
55
- end
56
- dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
57
- if dists[0][1] < dists[1][1] && dists[0][1] < 2
58
- idx = dists[0][0]
59
- qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
60
- reads[idx] = reads[idx]+1
61
- else
62
- qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
63
- reads[-1] = reads[-1]+1
57
+ seqs.push([seqid, seq, qvs])
58
+ if seqs.size == 100000 * Parallel.processor_count
59
+ parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
60
+ seqs = Array.new
64
61
  end
65
62
  end
63
+ if seqs.size > 0
64
+ parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
65
+ end
66
66
 
67
67
  qs.each { |q| q.push('') }
68
68
  ts.each { |t| t.join }
69
69
 
70
70
  total = 0
71
71
  bcs.each_index { |i|
72
- r = reads[i]
73
- puts "#{bcs[i]}\t#{r}\t#{wells[i]}.fq.gz"
72
+ t = ts[i]
73
+ r = t[:read]
74
+ puts "#{bcs[i]}\t#{r}\t#{t[:file]}"
74
75
  total = total+r
75
76
  }
76
- puts "Other\t#{reads[-1]}\tother.fq.gz"
77
+ t = ts[-1]
78
+ r = t[:read]
79
+ puts "Other\t#{r}\t#{t[:file]}"
77
80
  puts '===='
78
- puts "Total\t#{total+reads[-1]}"
81
+ puts "Total\t#{total+r}"
82
+
83
+ end
84
+
85
+ protected
86
+
87
+ def parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
88
+
89
+ tmpdists = Parallel.map_with_index(bcs, :in_processes => Parallel.processor_count) do |bc, bcidx|
90
+ tmpdist = Array.new
91
+ seqs.each_index do |seqidx|
92
+ seqbc = seqs[seqidx][1][ofs, bclen]
93
+ tmpdist.push(Levenshtein.distance(bc, seqbc))
94
+ end
95
+ tmpdist
96
+ end
97
+
98
+ tmpdist = Hash.new
99
+ seqs.each_index do |seqidx|
100
+ seqid, seq, qvs = seqs[seqidx]
101
+ bcs.each_index do |bcidx|
102
+ tmpdist[bcidx] = tmpdists[bcidx][seqidx]
103
+ end
104
+ dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
105
+ if dists[0][1] < dists[1][1] && dists[0][1] < 2
106
+ idx = dists[0][0]
107
+ qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
108
+ else
109
+ qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
110
+ end
111
+ end
79
112
 
80
113
  end
114
+
81
115
  end
82
116
  end
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.1.3"
6
+ VERSION = "0.1.4"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-06 00:00:00.000000000 Z
12
+ date: 2012-12-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor