seqtrimnext 2.0.51 → 2.0.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/History.txt +7 -0
  2. data/Manifest.txt +3 -3
  3. data/README.rdoc +18 -3
  4. data/Rakefile +2 -1
  5. data/bin/parse_params.rb +5 -1
  6. data/bin/seqtrimnext +53 -21
  7. data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
  8. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
  9. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
  10. data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
  11. data/lib/seqtrimnext/classes/params.rb +109 -123
  12. data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
  13. data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
  14. data/lib/seqtrimnext/classes/sequence.rb +2 -2
  15. data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
  16. data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
  17. data/lib/seqtrimnext/plugins/plugin.rb +42 -12
  18. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
  19. data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
  20. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
  21. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
  22. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
  23. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
  24. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
  25. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
  26. data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
  27. data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
  28. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
  29. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
  30. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
  31. data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
  32. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
  33. data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
  34. data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
  35. data/lib/seqtrimnext/templates/amplicons.txt +1 -8
  36. data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
  37. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
  38. data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
  39. data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
  40. data/lib/seqtrimnext/templates/only_quality.txt +24 -0
  41. data/lib/seqtrimnext/templates/sanger.txt +25 -0
  42. data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
  43. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
  44. data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
  45. data/lib/seqtrimnext.rb +1 -1
  46. metadata +20 -7
  47. data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
  48. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -12,245 +12,94 @@ require "plugin"
12
12
  class PluginLowQuality < Plugin
13
13
 
14
14
 
15
-
16
- def create_sum_window(qual,ini,index_window_end)
17
-
18
- # puts "--------index w #{index_window_end}"
19
- sum=[]
20
- i=ini
21
- # puts "#{i} #{index_window_end}"
22
- while (i<=index_window_end) # initialize sum
23
- sum[i]=0
24
- i += 1
15
+
16
+ def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2)
17
+
18
+ rstart=nil
19
+ rend=nil
20
+
21
+ i=from_pos
22
+
23
+ good_q=0
24
+
25
+ # skip good values
26
+ while (i< quals.length) && (quals[i]>=min_value)
27
+ i +=1
28
+ end
29
+
30
+ # now we have found a bad quality, or end of sequence
31
+ if i < quals.length
32
+ rstart=i
33
+ len=0
34
+
35
+ # puts " - [#{rstart},#{len}]"
36
+
37
+ # keep growing the low quality region until more than max_good_quals consecutive good quality bases are found
38
+ begin
39
+ q=quals[i]
40
+
41
+ if q<min_value
42
+ len += 1
43
+ # puts "BAD #{q}<#{min_value}"
44
+ len += good_q
45
+ good_q=0
46
+ else
47
+ good_q+=1
48
+ end
49
+ # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
50
+
51
+ i+=1
52
+ end while (i < quals.length) && (good_q <= max_good_quals)
53
+
54
+ rend = rstart + len -1
55
+ # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
56
+ end
57
+
58
+ return [rstart,rend]
59
+ end
60
+
61
+ # A region is valid if it starts at position 0, ends at the last position of the sequence, or is big enough
62
+ def valid_low_qual_region?(quals,rstart,rend,min_region_size)
63
+ # puts [rstart,rend,0,quals.length,(rend-rstart+1)].join(';')
64
+ # res =((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
65
+ # if res
66
+ # puts "VALID"
67
+ # end
68
+ return ((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
69
+ end
70
+
71
+
72
+ def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2)
73
+
74
+ # the initial region is the whole array
75
+ left=0
76
+ right=quals.length-1
77
+ # puts quals.map{|e| ("%2d" % e.to_s)}.join(' ')
78
+
79
+ # puts "[#{left},#{right}]"
80
+
81
+ i = 0
82
+
83
+ from_pos=0
84
+ regions =[]
85
+
86
+ # get all new regions
87
+ begin
88
+ rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals)
89
+ if !rstart.nil?
90
+ from_pos= rend+1
91
+
92
+ if valid_low_qual_region?(quals,rstart,rend,min_region_size)
93
+ regions << [rstart,rend]
25
94
  end
26
- # puts " contenido de sum" + sum.join.to_s + " i index_window_end window #{i} #{index_window_end} #{@window}"
27
-
28
- i=ini
29
- while (i<ini+@window)
30
-
31
- sum[ini] += qual[i]
32
- i+=1
33
- end
34
-
35
-
36
- i=ini+1
37
-
38
- while (i<=index_window_end)
39
-
40
- sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1]
41
- i+=1
42
-
43
- end
44
-
45
- # puts '2____' + sum.join(',') + 'pos sum' + ini.to_s
46
-
47
- return sum
48
-
49
- end
50
-
51
- def find_bounds_high_quality(sum,ini,index_window_end)
52
-
53
- new_start = -1
54
- new_end = -1
55
-
56
- # puts " ini #{ini} iwe #{index_window_end}"
57
- # puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} "
58
- if (ini>index_window_end)
59
- temp_start= ini
60
- # new_start, new_end = temp_start, index_window_end
61
- new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana
62
- # new_start, new_end = index_window_end, index_window_end
63
- end
64
- # puts " temp_start #{temp_start}" if (ini>index_window_end)
65
- temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1
66
-
67
- i=ini+1
68
- while (i<=index_window_end)
69
- if (sum[i]>=@cut_off)
70
- if (temp_start<0)
71
- temp_start=i #just in!
72
- # puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}"
73
- end
74
-
75
- else
76
- # puts "sum #{sum[i]} < cut off "
77
- if(temp_start>=0) #just out!
78
- # puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}"
79
- if (((i-1-temp_start)>=(new_end-new_start)))
80
- new_start,new_end=temp_start,i-1
81
- # puts "just out ---- new start,new_end = #{temp_start}, #{i-1} index_window_end = #{index_window_end}"
82
- end
83
- temp_start= -1
84
- end
85
- end
86
- i+=1
87
-
88
-
89
- end
90
- # puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
91
-
92
- if (temp_start != -1) # finished while ok
93
- # puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}"
94
- if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence
95
- new_start, new_end = temp_start, index_window_end #-1
96
- end
97
- end
98
-
99
- # puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
100
-
101
- # puts " newstart #{new_start} newend #{new_end}"
102
-
103
- return new_start,new_end
104
-
105
-
106
- end
107
-
108
- def cut_fine_bounds_short(qual,new_start,new_end)
109
-
110
- i=0
111
- # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
112
- while (i<@window)
113
- if (qual[new_start+i]>=@low)
114
- break
115
- end
116
- i+=1
117
- end
118
- new_start +=i
119
- # puts "#{new_start} ***********"
120
-
121
- i=@window -1
122
- while (i>=0)
123
- if (qual[new_end+i]>=@low)
124
- break
125
- end
126
- i-=1
127
- end
128
- new_end += i
129
- # puts "6a new_start #{new_start} new-end #{new_end}"
130
-
131
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short"
132
- return new_start, new_end
133
-
134
- end
135
-
136
-
137
- # cuts fine the high quality bounds
138
- def cut_fine_bounds(qual,new_start,new_end)
139
- # puts " ççççççççççççççç #{new_start+@window} >= #{new_end} "
140
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
141
- # cut it fine
142
-
143
- one_ok = 0
144
-
145
- i=@window-1
146
- # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
147
- while (i>=0)
148
- if (qual[new_start+i] < @low)
149
- break if one_ok
150
- else
151
- one_ok = 1
152
- end
153
- i-=1
154
- end
155
- new_start += i+1
156
- oneOk = 0
157
- i=0
158
- while (i<@window)
159
- if (qual[new_end+i] < @low)
160
- break if oneOk
161
- else
162
- oneOk = 1
163
- end
164
- i+=1
165
- end
166
- new_end += i-1
167
- # puts "6b new_start #{new_start} new-end #{new_end}"
168
-
169
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
170
- return new_start, new_end
171
-
172
- end
173
-
174
- def find_high_quality(qual,ini=0)
175
-
176
- # puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',') + 'size' + qual.size.to_s
177
-
178
- update=false
179
- # if @window>qual.length-ini #search in the last window although has a low size
180
- # @window=qual.length-ini
181
- # # puts ' UPDATE WINDOW Y CUT OFF ' + @window.to_s
182
- # @cut_off=@window*@low
183
- # update=true
184
- # end
185
-
186
- if (ini==0 or update)
187
- #index_window_start = ini
188
- @index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1;
189
- #TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada
190
-
191
-
192
- @sum = create_sum_window(qual,ini,@index_window_end)
193
- # puts "SUMA #{@sum.join(' ')}"
194
- end
195
-
196
- new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end)
197
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
198
-
199
- if (new_start>=0)
200
- if (new_start+@window >= new_end)
201
- # puts "cfs"
202
- new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end)
203
- # puts "cfs"
204
-
205
- else
206
- # puts "cf"
207
- new_start, new_end = cut_fine_bounds(qual,new_start,new_end)
208
- # puts "cf"
209
- end
210
- end
211
-
212
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
213
-
214
- return new_start,new_end #+1
215
-
216
-
217
95
  end
96
+ end while !rstart.nil?
97
+
98
+ return regions
99
+
100
+ end
218
101
 
219
-
220
- def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
221
-
222
- action_size = p_begin-1
223
- if action_size>=(@window/2)
224
-
225
-
226
- # puts "action_SIZE1 #{action_size} > #{@window/2}"
227
-
228
- if ( (p_begin>0) && (action_size>0) ) #if there is action before the high qual part
229
- # it's created an action before of the high quality part
230
- a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
231
- # puts " new low qual start: #{start} = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}"
232
- actions.push a
233
- end
234
- end
235
- end
236
-
237
- def add_action_after_high_qual(p_begin,p_end,actions,seq)
238
-
239
- action_size = seq.insert_end-p_end
240
- if action_size>=(@window/2)
241
-
242
-
243
- # puts "action_SIZE2 #{action_size} > #{@window/2}"
244
-
245
- if ((p_end<seq.seq_fasta.size-1) && (action_size>0) ) #if there is action before the high qual part
246
- # it's created an action before of the high quality part
247
- a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
248
-
249
- actions.push a
250
- end
251
- end
252
- end
253
-
102
+
254
103
 
255
104
 
256
105
 
@@ -266,100 +115,42 @@ class PluginLowQuality < Plugin
266
115
  # Finally mark, with an action, the after part to the High Quality Subsequence like a low quality part
267
116
  #-----------------------------------------------------------------
268
117
 
269
- def execute(seqs)
270
- seqs.each do |s|
271
- exec_seq(s)
272
- end
273
- end
274
-
275
-
276
- def exec_seq(seq)
118
+ def exec_seq(seq,blast_query)
277
119
 
278
120
  if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
279
- $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
121
+ $LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
280
122
  elsif (seq.seq_qual.size>0)
281
- $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
282
-
283
- @low=@params.get_param('min_quality').to_i
284
-
285
- if @params.get_param('window_width').to_i>seq.seq_fasta.length
286
- @window=seq.seq_fasta.length
287
-
288
- else
289
- @window=@params.get_param('window_width').to_i
290
-
291
- end
292
- @cut_off=@window*@low
293
-
294
- type='ActionLowQuality'
295
- low_qual=0
296
- actions=[]
297
-
298
- p_begin,p_end =0,-1 # positions from high quality bounds
299
-
300
- # @stats[:low_qual]={}
301
- # @stats['low_qual']={}
302
-
303
-
304
- while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size) )
305
-
306
-
307
- p_begin_old,p_end_old= p_begin, p_end
308
- p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)
309
- # entra=(p_begin>0) or (p_end_old<0)
310
- #
311
- # puts "high ini fin #{p_begin} #{p_end} ini-old fin-old #{p_begin_old} #{p_end_old} __ ___ ___ ___1"
312
-
313
- if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and the low quality part has enough size
314
- # it's created an action before of the high quality part
315
- add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1)
316
- # puts "low1 ini fin #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}"
317
- low_qual = p_begin-1-p_end_old-1 + 1
318
-
319
- add_stats('low_qual',low_qual)
320
- # @stats[:low_qual]={low_qual => 1}
321
-
322
- end
323
-
324
- # puts "-----ññññ----- high quality #{p_begin} #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}"
325
-
326
- end
327
-
328
- # puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}"
329
- if ((p_begin>=0) && (p_end+1<seq.seq_qual.size)) #if we have found the high quality part
330
123
 
331
- # it's created an action after of the high quality part
332
- add_action_after_high_qual(p_begin,p_end,actions,seq)
333
- # puts "low2 ini fin #{p_end+1} #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1-p_end-1+1}"
334
- low_qual = seq.seq_fasta.size-1 - p_end-seq.insert_start-1 + 1
335
- # if @stats[:low_qual][low_qual].nil?
336
- # @stats[:low_qual][low_qual] = 0
337
- # end
338
- # @stats[:low_qual][low_qual] += 1
339
- add_stats('low_qual',low_qual)
340
- # @stats[:low_qual]={low_qual => 1}
341
- end
342
-
343
- # puts "-----ññññ----- high quality #{p_begin} #{p_end}"
344
-
345
-
346
- if p_end<0 and p_end_old #add action low qual to all the part
347
- a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
348
- # puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}"
349
- low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1
350
-
351
- # if @stats[:low_qual][low_qual].nil?
352
- # @stats[:low_qual][low_qual] = 0
353
- # end
354
- # @stats[:low_qual][low_qual] += 1
355
- add_stats('low_qual',low_qual)
356
- # @stats[:low_qual]={'low_qual' => 1}
124
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
125
+
126
+ min_quality=@params.get_param('min_quality').to_i
127
+ min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i
128
+ max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i
129
+
130
+ type='ActionLowQuality'
131
+ actions=[]
132
+
133
+ regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases)
357
134
 
358
- actions.push a
359
- end
360
-
361
- # puts "------- ADDING ACTIONs LOW QUAL #{actions.size}"
362
- seq.add_actions(actions)
135
+ regions.each do |r|
136
+ low_qual_size=r.last-r.first+1
137
+
138
+ # puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}"
139
+
140
+
141
+ add_stats('low_qual',low_qual_size)
142
+
143
+
144
+ # create action
145
+ a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence
146
+ actions.push a
147
+
148
+
149
+
150
+ end
151
+
152
+ # add the low quality actions to the sequence
153
+ seq.add_actions(actions)
363
154
  end
364
155
 
365
156
  end
@@ -379,16 +170,21 @@ class PluginLowQuality < Plugin
379
170
  default_value = 20
380
171
  params.check_param(errors,'min_quality','Integer',default_value,comment)
381
172
 
382
- comment='Quality window for scanning low quality segments'
173
+ comment='Quality window for scanning low quality segments'
383
174
  default_value = 15
384
175
  params.check_param(errors,'window_width','Integer',default_value,comment)
385
176
 
386
-
177
+
178
+ comment='Minimum length of a bad quality segment inside the sequence'
179
+ default_value = 8
180
+ params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment)
181
+
182
+
183
+ comment='Maximum consecutive good-quality bases between two bad quality regions'
184
+ default_value = 2
185
+ params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment)
387
186
 
388
187
  return errors
389
188
  end
390
189
 
391
-
392
- private :find_high_quality
393
-
394
190
  end
@@ -14,17 +14,6 @@ class PluginMids < Plugin
14
14
  MAX_MID_ERRORS = 2
15
15
  #MIN_MID_SIZE = 7 # very important, don't touch
16
16
  # DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
17
-
18
-
19
-
20
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
21
- def execute(seqs)
22
- blasts= do_blasts(seqs)
23
-
24
- seqs.each_with_index do |s,i|
25
- exec_seq(s,blasts.querys[i])
26
- end
27
- end
28
17
 
29
18
  def do_blasts(seqs)
30
19
  # find MIDS with less results than max_target_seqs value
@@ -86,16 +86,7 @@ class PluginShortInsert < Plugin
86
86
  return sub_inserts
87
87
  end
88
88
 
89
- #Begins the plugin1's execution to warn if the inserted is so short
90
- def execute(seqs)
91
- seqs.each do |s|
92
- exec_seq(s)
93
- end
94
- end
95
-
96
-
97
- def exec_seq(seq)
98
-
89
+ def exec_seq(seq,blast_query)
99
90
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
100
91
  # puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
101
92
 
@@ -33,21 +33,17 @@ class PluginUserContaminants < Plugin
33
33
  return res
34
34
  end
35
35
 
36
- #Begins the plugin1's execution to warn that there are classify in the sequence "seq"
37
- def execute(seqs)
38
- blasts= do_blasts(seqs)
39
-
40
- seqs.each_with_index do |s,i|
41
- exec_seq(s,blasts.querys[i])
42
- end
36
+ def can_execute?
37
+ return !@params.get_param('user_contaminant_db').empty?
43
38
  end
44
39
 
40
+
45
41
  def do_blasts(seqs)
46
42
 
47
43
  # TODO - Culling limit = 2 because blast fails with this command when it is given cl=1 and dust=no
48
44
  # and a low complexity sequence as input
49
45
 
50
- blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_classify')} -perc_identity #{@params.get_param('blast_percent_classify')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
46
+ blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_user_contaminant')} -perc_identity #{@params.get_param('blast_percent_user_contaminant')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
51
47
 
52
48
  $LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
53
49
 
@@ -72,42 +68,55 @@ class PluginUserContaminants < Plugin
72
68
 
73
69
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence"
74
70
 
75
- type = "ActionClassify"
71
+ type = "ActionUserContaminant"
76
72
 
77
73
  classify={}
74
+ contaminants=[]
78
75
 
79
- # classify_ids=[]
76
+
77
+ merge_hits(blast_query.hits,contaminants,nil,false)
78
+
79
+ begin
80
+ contaminants2=contaminants
81
+ contaminants = [] #second round to save contaminants without overlap
82
+ merge_hits(contaminants2,contaminants,nil,false)
83
+ end until (contaminants2.count == contaminants.count)
84
+
85
+ contaminants.sort {|c1,c2| (c1.q_end - c1.q_beg + 1)<=>(c2.q_end - c2.q_beg + 1)}
80
86
 
81
- classify=sum_hits_by_id(blast_query.hits)
87
+ # classify=sum_hits_by_id(contaminants.hits)
82
88
 
83
89
  actions=[]
84
- classify_size=0
90
+ # classify_size=0
85
91
 
86
- min_cont_size=@params.get_param('min_classify_hit_size').to_i
92
+ min_cont_size=@params.get_param('min_user_contaminant_size').to_i
87
93
 
88
- biggest_classify = classify.sort {|c1,c2| c1[1]<=>c2[1]}
94
+ # biggest_classify = contaminants.sort {|c1,c2| c1[1]<=>c2[1]}
89
95
 
90
- if !biggest_classify.empty?
96
+ if !contaminants.empty?
91
97
 
92
- definition,classify_size = biggest_classify.last
98
+ # definition,classify_size = biggest_classify.last
93
99
 
100
+ biggest_contaminant=contaminants.last
101
+ hit_size=(biggest_contaminant.q_end - biggest_contaminant.q_beg + 1)
94
102
 
95
- a = seq.new_action(-1,-1,type) # adds the correspondent action to the sequence
103
+ a = seq.new_action(biggest_contaminant.q_beg,biggest_contaminant.q_end,type) # adds the correspondent action to the sequence
96
104
 
97
- a.message = definition
105
+ a.message = biggest_contaminant.definition
98
106
 
99
- a.tag_id = definition.gsub(' ','_')
107
+ seq.add_comment("Contaminated: #{biggest_contaminant.definition}")
108
+
109
+ a.tag_id = biggest_contaminant.definition.gsub(' ','_')
100
110
 
101
111
  # a.found_definition = c.definition # save the classify definitions, each separately
102
112
 
103
113
  #save to this file
104
- seq.add_file_tag(1, a.tag_id, :file)
105
-
114
+ seq.add_file_tag(0, 'with_user_contaminant', :both, 10)
106
115
 
107
116
  actions.push a
108
-
109
- add_stats('classify_size',classify_size)
110
- add_stats('classify_ids',definition)
117
+
118
+ add_stats('user_contaminant_size',hit_size)
119
+ add_stats('user_contaminant_ids',biggest_contaminant.definition)
111
120
 
112
121
  seq.add_actions(actions)
113
122
  end
@@ -121,21 +130,20 @@ class PluginUserContaminants < Plugin
121
130
 
122
131
  comment='Blast E-value used as cut-off when searching for contaminations'
123
132
  default_value = 1e-10
124
- params.check_param(errors,'blast_evalue_classify','Float',default_value,comment)
133
+ params.check_param(errors,'blast_evalue_user_contaminant','Float',default_value,comment)
125
134
 
126
- comment='Minimum required identity (%) for a reliable classify'
135
+ comment='Minimum required identity (%) for a reliable user contaminant match'
127
136
  default_value = 85
128
- params.check_param(errors,'blast_percent_classify','Integer',default_value,comment)
137
+ params.check_param(errors,'blast_percent_user_contaminant','Integer',default_value,comment)
129
138
 
130
- comment='Minimum hit size (nt) for considering to classify'
139
+ comment='Minimum hit size (nt) for considering for user contaminant'
131
140
  default_value = 30 # era 40
132
- params.check_param(errors,'min_classify_hit_size','Integer',default_value,comment)
141
+ params.check_param(errors,'min_user_contaminant_size','Integer',default_value,comment)
133
142
 
134
- comment='Path for classify database'
135
- default_value = File.join($FORMATTED_DB_PATH,'classify.fasta')
143
+ comment='Path for user contaminant database'
144
+ default_value = "" #File.join($FORMATTED_DB_PATH,'user_contaminant.fasta')
136
145
  params.check_param(errors,'user_contaminant_db','DB',default_value,comment)
137
146
 
138
-
139
147
  return errors
140
148
  end
141
149
 
@@ -25,15 +25,6 @@ class PluginVectors < Plugin
25
25
  return ((linkers.count>=1) && (vector_beg+seq.insert_start>=linkers[0].start_pos) && (vector_end+seq.insert_start<=linkers[0].end_pos))
26
26
  end
27
27
 
28
- #Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
29
- def execute(seqs)
30
- blasts= do_blasts(seqs)
31
-
32
- seqs.each_with_index do |s,i|
33
- exec_seq(s,blasts.querys[i])
34
- end
35
- end
36
-
37
28
  def do_blasts(seqs)
38
29
  # find MIDS with less results than max_target_seqs value
39
30
  blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors