bio-gadget 0.1.3 → 0.1.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  require 'bio-faster'
2
2
  require 'levenshtein'
3
+ require 'parallel'
3
4
  require 'thread'
4
5
 
5
6
  module Bio
@@ -7,6 +8,7 @@ module Bio
7
8
  namespace :bio
8
9
 
9
10
  desc 'demlt BC POS', 'demultiplex fastq (via STDIN) by barcodes'
11
+ option :destdir, :type => :string, :default => '.'
10
12
  def demlt(bcfile, tmpofs)
11
13
 
12
14
  ofs = tmpofs.to_i
@@ -30,14 +32,18 @@ module Bio
30
32
  ts = Array.new
31
33
  qs = Array.new
32
34
  (wells + ['other']).each { |well|
33
- q = Queue.new
34
- t = Thread.new(well, q) do |well|
35
- fp = open("| gzip -c > #{well}.fq.gz", 'w')
35
+ q = SizedQueue.new(100000)
36
+ t = Thread.new(well, q) do |well, q|
37
+ tc = Thread.current
38
+ tc[:file] = "#{options[:destdir]}/#{well}.fq.gz"
39
+ fp = open("| gzip -c > #{tc[:file]}", 'w')
40
+ tc[:read] = 0
36
41
  while vals = q.shift
37
42
  if vals == ""
38
43
  break
39
44
  else
40
45
  fp.puts(vals)
46
+ tc[:read] = tc[:read] + 1
41
47
  end
42
48
  end
43
49
  fp.close()
@@ -46,37 +52,65 @@ module Bio
46
52
  ts.push(t)
47
53
  }
48
54
 
49
- reads = Array.new(bcs.size+1, 0)
50
- tmpdist = Hash.new
55
+ seqs = Array.new
51
56
  Bio::Faster.new(:stdin).each_record(:quality => :raw) do |seqid, seq, qvs|
52
- seqbc = seq[ofs, bclen]
53
- bcs.each_index do |i|
54
- tmpdist[i] = Levenshtein.distance(bcs[i], seqbc)
55
- end
56
- dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
57
- if dists[0][1] < dists[1][1] && dists[0][1] < 2
58
- idx = dists[0][0]
59
- qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
60
- reads[idx] = reads[idx]+1
61
- else
62
- qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
63
- reads[-1] = reads[-1]+1
57
+ seqs.push([seqid, seq, qvs])
58
+ if seqs.size == 100000 * Parallel.processor_count
59
+ parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
60
+ seqs = Array.new
64
61
  end
65
62
  end
63
+ if seqs.size > 0
64
+ parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
65
+ end
66
66
 
67
67
  qs.each { |q| q.push('') }
68
68
  ts.each { |t| t.join }
69
69
 
70
70
  total = 0
71
71
  bcs.each_index { |i|
72
- r = reads[i]
73
- puts "#{bcs[i]}\t#{r}\t#{wells[i]}.fq.gz"
72
+ t = ts[i]
73
+ r = t[:read]
74
+ puts "#{bcs[i]}\t#{r}\t#{t[:file]}"
74
75
  total = total+r
75
76
  }
76
- puts "Other\t#{reads[-1]}\tother.fq.gz"
77
+ t = ts[-1]
78
+ r = t[:read]
79
+ puts "Other\t#{r}\t#{t[:file]}"
77
80
  puts '===='
78
- puts "Total\t#{total+reads[-1]}"
81
+ puts "Total\t#{total+r}"
82
+
83
+ end
84
+
85
+ protected
86
+
87
+ def parallel_Levenshtein(seqs, bcs, ofs, bclen, qs)
88
+
89
+ tmpdists = Parallel.map_with_index(bcs, :in_processes => Parallel.processor_count) do |bc, bcidx|
90
+ tmpdist = Array.new
91
+ seqs.each_index do |seqidx|
92
+ seqbc = seqs[seqidx][1][ofs, bclen]
93
+ tmpdist.push(Levenshtein.distance(bc, seqbc))
94
+ end
95
+ tmpdist
96
+ end
97
+
98
+ tmpdist = Hash.new
99
+ seqs.each_index do |seqidx|
100
+ seqid, seq, qvs = seqs[seqidx]
101
+ bcs.each_index do |bcidx|
102
+ tmpdist[bcidx] = tmpdists[bcidx][seqidx]
103
+ end
104
+ dists = tmpdist.sort { |a, b| a[1] <=> b[1] }
105
+ if dists[0][1] < dists[1][1] && dists[0][1] < 2
106
+ idx = dists[0][0]
107
+ qs[idx].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
108
+ else
109
+ qs[-1].push(">#{seqid}\n#{seq}\n+\n#{qvs}")
110
+ end
111
+ end
79
112
 
80
113
  end
114
+
81
115
  end
82
116
  end
@@ -3,7 +3,7 @@ require 'thor'
3
3
  module Bio
4
4
  class Gadget < Thor
5
5
 
6
- VERSION = "0.1.3"
6
+ VERSION = "0.1.4"
7
7
 
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gadget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-06 00:00:00.000000000 Z
12
+ date: 2012-12-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gthor