seqtrimnext 2.0.51 → 2.0.52

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/History.txt +7 -0
  2. data/Manifest.txt +3 -3
  3. data/README.rdoc +18 -3
  4. data/Rakefile +2 -1
  5. data/bin/parse_params.rb +5 -1
  6. data/bin/seqtrimnext +53 -21
  7. data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
  8. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
  9. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
  10. data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
  11. data/lib/seqtrimnext/classes/params.rb +109 -123
  12. data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
  13. data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
  14. data/lib/seqtrimnext/classes/sequence.rb +2 -2
  15. data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
  16. data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
  17. data/lib/seqtrimnext/plugins/plugin.rb +42 -12
  18. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
  19. data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
  20. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
  21. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
  22. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
  23. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
  24. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
  25. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
  26. data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
  27. data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
  28. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
  29. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
  30. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
  31. data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
  32. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
  33. data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
  34. data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
  35. data/lib/seqtrimnext/templates/amplicons.txt +1 -8
  36. data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
  37. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
  38. data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
  39. data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
  40. data/lib/seqtrimnext/templates/only_quality.txt +24 -0
  41. data/lib/seqtrimnext/templates/sanger.txt +25 -0
  42. data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
  43. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
  44. data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
  45. data/lib/seqtrimnext.rb +1 -1
  46. metadata +20 -7
  47. data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
  48. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -12,245 +12,94 @@ require "plugin"
12
12
  class PluginLowQuality < Plugin
13
13
 
14
14
 
15
-
16
- def create_sum_window(qual,ini,index_window_end)
17
-
18
- # puts "--------index w #{index_window_end}"
19
- sum=[]
20
- i=ini
21
- # puts "#{i} #{index_window_end}"
22
- while (i<=index_window_end) # initialize sum
23
- sum[i]=0
24
- i += 1
15
+
16
+ def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2)
17
+
18
+ rstart=nil
19
+ rend=nil
20
+
21
+ i=from_pos
22
+
23
+ good_q=0
24
+
25
+ # skip good values
26
+ while (i< quals.length) && (quals[i]>=min_value)
27
+ i +=1
28
+ end
29
+
30
+ # now we have found a bad quality, or end of sequence
31
+ if i < quals.length
32
+ rstart=i
33
+ len=0
34
+
35
+ # puts " - [#{rstart},#{len}]"
36
+
37
+ # keep growing the low-quality region until more than max_good_quals consecutive good-quality bases are found
38
+ begin
39
+ q=quals[i]
40
+
41
+ if q<min_value
42
+ len += 1
43
+ # puts "BAD #{q}<#{min_value}"
44
+ len += good_q
45
+ good_q=0
46
+ else
47
+ good_q+=1
48
+ end
49
+ # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
50
+
51
+ i+=1
52
+ end while (i < quals.length) && (good_q <= max_good_quals)
53
+
54
+ rend = rstart + len -1
55
+ # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
56
+ end
57
+
58
+ return [rstart,rend]
59
+ end
60
+
61
+ # A region is valid if it starts at 0, ends at the last position (quals.length-1), or is big enough
62
+ def valid_low_qual_region?(quals,rstart,rend,min_region_size)
63
+ # puts [rstart,rend,0,quals.length,(rend-rstart+1)].join(';')
64
+ # res =((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
65
+ # if res
66
+ # puts "VALID"
67
+ # end
68
+ return ((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
69
+ end
70
+
71
+
72
+ def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2)
73
+
74
+ # the initial region is the whole array
75
+ left=0
76
+ right=quals.length-1
77
+ # puts quals.map{|e| ("%2d" % e.to_s)}.join(' ')
78
+
79
+ # puts "[#{left},#{right}]"
80
+
81
+ i = 0
82
+
83
+ from_pos=0
84
+ regions =[]
85
+
86
+ # get all new regions
87
+ begin
88
+ rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals)
89
+ if !rstart.nil?
90
+ from_pos= rend+1
91
+
92
+ if valid_low_qual_region?(quals,rstart,rend,min_region_size)
93
+ regions << [rstart,rend]
25
94
  end
26
- # puts " contenido de sum" + sum.join.to_s + " i index_window_end window #{i} #{index_window_end} #{@window}"
27
-
28
- i=ini
29
- while (i<ini+@window)
30
-
31
- sum[ini] += qual[i]
32
- i+=1
33
- end
34
-
35
-
36
- i=ini+1
37
-
38
- while (i<=index_window_end)
39
-
40
- sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1]
41
- i+=1
42
-
43
- end
44
-
45
- # puts '2____' + sum.join(',') + 'pos sum' + ini.to_s
46
-
47
- return sum
48
-
49
- end
50
-
51
- def find_bounds_high_quality(sum,ini,index_window_end)
52
-
53
- new_start = -1
54
- new_end = -1
55
-
56
- # puts " ini #{ini} iwe #{index_window_end}"
57
- # puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} "
58
- if (ini>index_window_end)
59
- temp_start= ini
60
- # new_start, new_end = temp_start, index_window_end
61
- new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana
62
- # new_start, new_end = index_window_end, index_window_end
63
- end
64
- # puts " temp_start #{temp_start}" if (ini>index_window_end)
65
- temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1
66
-
67
- i=ini+1
68
- while (i<=index_window_end)
69
- if (sum[i]>=@cut_off)
70
- if (temp_start<0)
71
- temp_start=i #just in!
72
- # puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}"
73
- end
74
-
75
- else
76
- # puts "sum #{sum[i]} < cut off "
77
- if(temp_start>=0) #just out!
78
- # puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}"
79
- if (((i-1-temp_start)>=(new_end-new_start)))
80
- new_start,new_end=temp_start,i-1
81
- # puts "just out ---- new start,new_end = #{temp_start}, #{i-1} index_window_end = #{index_window_end}"
82
- end
83
- temp_start= -1
84
- end
85
- end
86
- i+=1
87
-
88
-
89
- end
90
- # puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
91
-
92
- if (temp_start != -1) # finished while ok
93
- # puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}"
94
- if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence
95
- new_start, new_end = temp_start, index_window_end #-1
96
- end
97
- end
98
-
99
- # puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
100
-
101
- # puts " newstart #{new_start} newend #{new_end}"
102
-
103
- return new_start,new_end
104
-
105
-
106
- end
107
-
108
- def cut_fine_bounds_short(qual,new_start,new_end)
109
-
110
- i=0
111
- # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
112
- while (i<@window)
113
- if (qual[new_start+i]>=@low)
114
- break
115
- end
116
- i+=1
117
- end
118
- new_start +=i
119
- # puts "#{new_start} ***********"
120
-
121
- i=@window -1
122
- while (i>=0)
123
- if (qual[new_end+i]>=@low)
124
- break
125
- end
126
- i-=1
127
- end
128
- new_end += i
129
- # puts "6a new_start #{new_start} new-end #{new_end}"
130
-
131
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short"
132
- return new_start, new_end
133
-
134
- end
135
-
136
-
137
- # cuts fine the high quality bounds
138
- def cut_fine_bounds(qual,new_start,new_end)
139
- # puts " ççççççççççççççç #{new_start+@window} >= #{new_end} "
140
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
141
- # cut it fine
142
-
143
- one_ok = 0
144
-
145
- i=@window-1
146
- # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
147
- while (i>=0)
148
- if (qual[new_start+i] < @low)
149
- break if one_ok
150
- else
151
- one_ok = 1
152
- end
153
- i-=1
154
- end
155
- new_start += i+1
156
- oneOk = 0
157
- i=0
158
- while (i<@window)
159
- if (qual[new_end+i] < @low)
160
- break if oneOk
161
- else
162
- oneOk = 1
163
- end
164
- i+=1
165
- end
166
- new_end += i-1
167
- # puts "6b new_start #{new_start} new-end #{new_end}"
168
-
169
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
170
- return new_start, new_end
171
-
172
- end
173
-
174
- def find_high_quality(qual,ini=0)
175
-
176
- # puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',') + 'size' + qual.size.to_s
177
-
178
- update=false
179
- # if @window>qual.length-ini #search in the last window although has a low size
180
- # @window=qual.length-ini
181
- # # puts ' UPDATE WINDOW Y CUT OFF ' + @window.to_s
182
- # @cut_off=@window*@low
183
- # update=true
184
- # end
185
-
186
- if (ini==0 or update)
187
- #index_window_start = ini
188
- @index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1;
189
- #TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada
190
-
191
-
192
- @sum = create_sum_window(qual,ini,@index_window_end)
193
- # puts "SUMA #{@sum.join(' ')}"
194
- end
195
-
196
- new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end)
197
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
198
-
199
- if (new_start>=0)
200
- if (new_start+@window >= new_end)
201
- # puts "cfs"
202
- new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end)
203
- # puts "cfs"
204
-
205
- else
206
- # puts "cf"
207
- new_start, new_end = cut_fine_bounds(qual,new_start,new_end)
208
- # puts "cf"
209
- end
210
- end
211
-
212
- # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
213
-
214
- return new_start,new_end #+1
215
-
216
-
217
95
  end
96
+ end while !rstart.nil?
97
+
98
+ return regions
99
+
100
+ end
218
101
 
219
-
220
- def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
221
-
222
- action_size = p_begin-1
223
- if action_size>=(@window/2)
224
-
225
-
226
- # puts "action_SIZE1 #{action_size} > #{@window/2}"
227
-
228
- if ( (p_begin>0) && (action_size>0) ) #if there is action before the high qual part
229
- # it's created an action before of the high quality part
230
- a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
231
- # puts " new low qual start: #{start} = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}"
232
- actions.push a
233
- end
234
- end
235
- end
236
-
237
- def add_action_after_high_qual(p_begin,p_end,actions,seq)
238
-
239
- action_size = seq.insert_end-p_end
240
- if action_size>=(@window/2)
241
-
242
-
243
- # puts "action_SIZE2 #{action_size} > #{@window/2}"
244
-
245
- if ((p_end<seq.seq_fasta.size-1) && (action_size>0) ) #if there is action before the high qual part
246
- # it's created an action before of the high quality part
247
- a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
248
-
249
- actions.push a
250
- end
251
- end
252
- end
253
-
102
+
254
103
 
255
104
 
256
105
 
@@ -266,100 +115,42 @@ class PluginLowQuality < Plugin
266
115
  # Finally mark, with an action, the part after the High Quality Subsequence as a low quality part
267
116
  #-----------------------------------------------------------------
268
117
 
269
- def execute(seqs)
270
- seqs.each do |s|
271
- exec_seq(s)
272
- end
273
- end
274
-
275
-
276
- def exec_seq(seq)
118
+ def exec_seq(seq,blast_query)
277
119
 
278
120
  if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
279
- $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
121
+ $LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
280
122
  elsif (seq.seq_qual.size>0)
281
- $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
282
-
283
- @low=@params.get_param('min_quality').to_i
284
-
285
- if @params.get_param('window_width').to_i>seq.seq_fasta.length
286
- @window=seq.seq_fasta.length
287
-
288
- else
289
- @window=@params.get_param('window_width').to_i
290
-
291
- end
292
- @cut_off=@window*@low
293
-
294
- type='ActionLowQuality'
295
- low_qual=0
296
- actions=[]
297
-
298
- p_begin,p_end =0,-1 # positions from high quality bounds
299
-
300
- # @stats[:low_qual]={}
301
- # @stats['low_qual']={}
302
-
303
-
304
- while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size) )
305
-
306
-
307
- p_begin_old,p_end_old= p_begin, p_end
308
- p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)
309
- # entra=(p_begin>0) or (p_end_old<0)
310
- #
311
- # puts "high ini fin #{p_begin} #{p_end} ini-old fin-old #{p_begin_old} #{p_end_old} __ ___ ___ ___1"
312
-
313
- if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and the low quality part has enough size
314
- # it's created an action before of the high quality part
315
- add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1)
316
- # puts "low1 ini fin #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}"
317
- low_qual = p_begin-1-p_end_old-1 + 1
318
-
319
- add_stats('low_qual',low_qual)
320
- # @stats[:low_qual]={low_qual => 1}
321
-
322
- end
323
-
324
- # puts "-----ññññ----- high quality #{p_begin} #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}"
325
-
326
- end
327
-
328
- # puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}"
329
- if ((p_begin>=0) && (p_end+1<seq.seq_qual.size)) #if we have found the high quality part
330
123
 
331
- # it's created an action after of the high quality part
332
- add_action_after_high_qual(p_begin,p_end,actions,seq)
333
- # puts "low2 ini fin #{p_end+1} #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1-p_end-1+1}"
334
- low_qual = seq.seq_fasta.size-1 - p_end-seq.insert_start-1 + 1
335
- # if @stats[:low_qual][low_qual].nil?
336
- # @stats[:low_qual][low_qual] = 0
337
- # end
338
- # @stats[:low_qual][low_qual] += 1
339
- add_stats('low_qual',low_qual)
340
- # @stats[:low_qual]={low_qual => 1}
341
- end
342
-
343
- # puts "-----ññññ----- high quality #{p_begin} #{p_end}"
344
-
345
-
346
- if p_end<0 and p_end_old #add action low qual to all the part
347
- a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
348
- # puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}"
349
- low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1
350
-
351
- # if @stats[:low_qual][low_qual].nil?
352
- # @stats[:low_qual][low_qual] = 0
353
- # end
354
- # @stats[:low_qual][low_qual] += 1
355
- add_stats('low_qual',low_qual)
356
- # @stats[:low_qual]={'low_qual' => 1}
124
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
125
+
126
+ min_quality=@params.get_param('min_quality').to_i
127
+ min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i
128
+ max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i
129
+
130
+ type='ActionLowQuality'
131
+ actions=[]
132
+
133
+ regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases)
357
134
 
358
- actions.push a
359
- end
360
-
361
- # puts "------- ADDING ACTIONs LOW QUAL #{actions.size}"
362
- seq.add_actions(actions)
135
+ regions.each do |r|
136
+ low_qual_size=r.last-r.first+1
137
+
138
+ # puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}"
139
+
140
+
141
+ add_stats('low_qual',low_qual_size)
142
+
143
+
144
+ # create action
145
+ a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence
146
+ actions.push a
147
+
148
+
149
+
150
+ end
151
+
152
+ # add the low quality actions to the sequence
153
+ seq.add_actions(actions)
363
154
  end
364
155
 
365
156
  end
@@ -379,16 +170,21 @@ class PluginLowQuality < Plugin
379
170
  default_value = 20
380
171
  params.check_param(errors,'min_quality','Integer',default_value,comment)
381
172
 
382
- comment='Quality window for scanning low quality segments'
173
+ comment='Quality window for scanning low quality segments'
383
174
  default_value = 15
384
175
  params.check_param(errors,'window_width','Integer',default_value,comment)
385
176
 
386
-
177
+
178
+ comment='Minimum length of a bad quality segment inside the sequence'
179
+ default_value = 8
180
+ params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment)
181
+
182
+
183
+ comment='Maximum consecutive good-quality bases between two bad quality regions'
184
+ default_value = 2
185
+ params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment)
387
186
 
388
187
  return errors
389
188
  end
390
189
 
391
-
392
- private :find_high_quality
393
-
394
190
  end
@@ -14,17 +14,6 @@ class PluginMids < Plugin
14
14
  MAX_MID_ERRORS = 2
15
15
  #MIN_MID_SIZE = 7 # very important, don't touch
16
16
  # DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
17
-
18
-
19
-
20
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
21
- def execute(seqs)
22
- blasts= do_blasts(seqs)
23
-
24
- seqs.each_with_index do |s,i|
25
- exec_seq(s,blasts.querys[i])
26
- end
27
- end
28
17
 
29
18
  def do_blasts(seqs)
30
19
  # find MIDS with less results than max_target_seqs value
@@ -86,16 +86,7 @@ class PluginShortInsert < Plugin
86
86
  return sub_inserts
87
87
  end
88
88
 
89
- #Begins the plugin1's execution to warn if the inserted is so short
90
- def execute(seqs)
91
- seqs.each do |s|
92
- exec_seq(s)
93
- end
94
- end
95
-
96
-
97
- def exec_seq(seq)
98
-
89
+ def exec_seq(seq,blast_query)
99
90
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
100
91
  # puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
101
92
 
@@ -33,21 +33,17 @@ class PluginUserContaminants < Plugin
33
33
  return res
34
34
  end
35
35
 
36
- #Begins the plugin1's execution to warn that there are classify in the sequence "seq"
37
- def execute(seqs)
38
- blasts= do_blasts(seqs)
39
-
40
- seqs.each_with_index do |s,i|
41
- exec_seq(s,blasts.querys[i])
42
- end
36
+ def can_execute?
37
+ return !@params.get_param('user_contaminant_db').empty?
43
38
  end
44
39
 
40
+
45
41
  def do_blasts(seqs)
46
42
 
47
43
  # TODO - Culling limit = 2 porque el blast falla con este comando cuando se le pasa cl=1 y dust=no
48
44
  # y una secuencia de baja complejidad como entrada
49
45
 
50
- blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_classify')} -perc_identity #{@params.get_param('blast_percent_classify')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
46
+ blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_user_contaminant')} -perc_identity #{@params.get_param('blast_percent_user_contaminant')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
51
47
 
52
48
  $LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
53
49
 
@@ -72,42 +68,55 @@ class PluginUserContaminants < Plugin
72
68
 
73
69
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence"
74
70
 
75
- type = "ActionClassify"
71
+ type = "ActionUserContaminant"
76
72
 
77
73
  classify={}
74
+ contaminants=[]
78
75
 
79
- # classify_ids=[]
76
+
77
+ merge_hits(blast_query.hits,contaminants,nil,false)
78
+
79
+ begin
80
+ contaminants2=contaminants
81
+ contaminants = [] #second round to save contaminants without overlap
82
+ merge_hits(contaminants2,contaminants,nil,false)
83
+ end until (contaminants2.count == contaminants.count)
84
+
85
+ contaminants.sort {|c1,c2| (c1.q_end - c1.q_beg + 1)<=>(c2.q_end - c2.q_beg + 1)}
80
86
 
81
- classify=sum_hits_by_id(blast_query.hits)
87
+ # classify=sum_hits_by_id(contaminants.hits)
82
88
 
83
89
  actions=[]
84
- classify_size=0
90
+ # classify_size=0
85
91
 
86
- min_cont_size=@params.get_param('min_classify_hit_size').to_i
92
+ min_cont_size=@params.get_param('min_user_contaminant_size').to_i
87
93
 
88
- biggest_classify = classify.sort {|c1,c2| c1[1]<=>c2[1]}
94
+ # biggest_classify = contaminants.sort {|c1,c2| c1[1]<=>c2[1]}
89
95
 
90
- if !biggest_classify.empty?
96
+ if !contaminants.empty?
91
97
 
92
- definition,classify_size = biggest_classify.last
98
+ # definition,classify_size = biggest_classify.last
93
99
 
100
+ biggest_contaminant=contaminants.last
101
+ hit_size=(biggest_contaminant.q_end - biggest_contaminant.q_beg + 1)
94
102
 
95
- a = seq.new_action(-1,-1,type) # adds the correspondent action to the sequence
103
+ a = seq.new_action(biggest_contaminant.q_beg,biggest_contaminant.q_end,type) # adds the correspondent action to the sequence
96
104
 
97
- a.message = definition
105
+ a.message = biggest_contaminant.definition
98
106
 
99
- a.tag_id = definition.gsub(' ','_')
107
+ seq.add_comment("Contaminated: #{biggest_contaminant.definition}")
108
+
109
+ a.tag_id = biggest_contaminant.definition.gsub(' ','_')
100
110
 
101
111
  # a.found_definition = c.definition # save the classify definitions, each separately
102
112
 
103
113
  #save to this file
104
- seq.add_file_tag(1, a.tag_id, :file)
105
-
114
+ seq.add_file_tag(0, 'with_user_contaminant', :both, 10)
106
115
 
107
116
  actions.push a
108
-
109
- add_stats('classify_size',classify_size)
110
- add_stats('classify_ids',definition)
117
+
118
+ add_stats('user_contaminant_size',hit_size)
119
+ add_stats('user_contaminant_ids',biggest_contaminant.definition)
111
120
 
112
121
  seq.add_actions(actions)
113
122
  end
@@ -121,21 +130,20 @@ class PluginUserContaminants < Plugin
121
130
 
122
131
  comment='Blast E-value used as cut-off when searching for contaminations'
123
132
  default_value = 1e-10
124
- params.check_param(errors,'blast_evalue_classify','Float',default_value,comment)
133
+ params.check_param(errors,'blast_evalue_user_contaminant','Float',default_value,comment)
125
134
 
126
- comment='Minimum required identity (%) for a reliable classify'
135
+ comment='Minimum required identity (%) for a reliable user contaminant match'
127
136
  default_value = 85
128
- params.check_param(errors,'blast_percent_classify','Integer',default_value,comment)
137
+ params.check_param(errors,'blast_percent_user_contaminant','Integer',default_value,comment)
129
138
 
130
- comment='Minimum hit size (nt) for considering to classify'
139
+ comment='Minimum hit size (nt) for considering for user contaminant'
131
140
  default_value = 30 # era 40
132
- params.check_param(errors,'min_classify_hit_size','Integer',default_value,comment)
141
+ params.check_param(errors,'min_user_contaminant_size','Integer',default_value,comment)
133
142
 
134
- comment='Path for classify database'
135
- default_value = File.join($FORMATTED_DB_PATH,'classify.fasta')
143
+ comment='Path for user contaminant database'
144
+ default_value = "" #File.join($FORMATTED_DB_PATH,'user_contaminant.fasta')
136
145
  params.check_param(errors,'user_contaminant_db','DB',default_value,comment)
137
146
 
138
-
139
147
  return errors
140
148
  end
141
149
 
@@ -25,15 +25,6 @@ class PluginVectors < Plugin
25
25
  return ((linkers.count>=1) && (vector_beg+seq.insert_start>=linkers[0].start_pos) && (vector_end+seq.insert_start<=linkers[0].end_pos))
26
26
  end
27
27
 
28
- #Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
29
- def execute(seqs)
30
- blasts= do_blasts(seqs)
31
-
32
- seqs.each_with_index do |s,i|
33
- exec_seq(s,blasts.querys[i])
34
- end
35
- end
36
-
37
28
  def do_blasts(seqs)
38
29
  # find MIDS with less results than max_target_seqs value
39
30
  blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors