seqtrimnext 2.0.51 → 2.0.52
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -12,245 +12,94 @@ require "plugin"
|
|
12
12
|
class PluginLowQuality < Plugin
|
13
13
|
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
15
|
+
|
16
|
+
# Finds the next run of low-quality bases in +quals+, scanning from +from_pos+.
#
# A base is "bad" when its quality is below +min_value+. The region starts at
# the first bad base at or after +from_pos+ and keeps growing across short
# stretches of good bases: up to +max_good_quals+ consecutive good bases are
# absorbed into the region when another bad base follows them. Trailing good
# bases are never counted, so the region always ends on a bad base.
#
# quals          - Array of numeric quality values, one per base.
# from_pos       - Index at which to start scanning. Negative values are
#                  treated as 0 so they cannot wrap around the array through
#                  Ruby's negative indexing.
# min_value      - Minimum quality for a base to count as good.
# max_good_quals - Longest run of good bases tolerated inside a region.
#
# Returns [rstart, rend] (inclusive indexes), or [nil, nil] when there is no
# bad base at or after +from_pos+.
def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2)
  i = from_pos < 0 ? 0 : from_pos

  # skip good values
  i += 1 while (i < quals.length) && (quals[i] >= min_value)

  # reached the end of the sequence without finding a bad quality
  return [nil, nil] if i >= quals.length

  rstart = i
  len = 0
  good_q = 0

  # Keep growing the region until more than max_good_quals good bases in a
  # row are found. The body must run at least once (quals[rstart] is bad),
  # hence loop/break instead of a pre-checked while.
  loop do
    if quals[i] < min_value
      # a bad base extends the region over itself and any pending good bases
      len += good_q + 1
      good_q = 0
    else
      good_q += 1
    end

    i += 1
    break unless (i < quals.length) && (good_q <= max_good_quals)
  end

  [rstart, rstart + len - 1]
end
|
60
|
+
|
61
|
+
# Tells whether a low-quality region is worth reporting.
#
# A region is valid if it starts at index 0, ends at the last index of
# +quals+, or spans at least +min_region_size+ bases.
#
# quals           - Array of quality values the region was found in.
# rstart          - Inclusive start index of the region.
# rend            - Inclusive end index of the region.
# min_region_size - Minimum length required for a region that touches
#                   neither end of the sequence.
#
# Returns true or false.
def valid_low_qual_region?(quals,rstart,rend,min_region_size)
  # regions touching either end of the sequence are always accepted
  return true if rstart == 0 || rend == quals.length - 1

  (rend - rstart + 1) >= min_region_size
end
|
70
|
+
|
71
|
+
|
72
|
+
def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2)
|
73
|
+
|
74
|
+
# the initial region is the whole array
|
75
|
+
left=0
|
76
|
+
right=quals.length-1
|
77
|
+
# puts quals.map{|e| ("%2d" % e.to_s)}.join(' ')
|
78
|
+
|
79
|
+
# puts "[#{left},#{right}]"
|
80
|
+
|
81
|
+
i = 0
|
82
|
+
|
83
|
+
from_pos=0
|
84
|
+
regions =[]
|
85
|
+
|
86
|
+
# get all new regions
|
87
|
+
begin
|
88
|
+
rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals)
|
89
|
+
if !rstart.nil?
|
90
|
+
from_pos= rend+1
|
91
|
+
|
92
|
+
if valid_low_qual_region?(quals,rstart,rend,min_region_size)
|
93
|
+
regions << [rstart,rend]
|
25
94
|
end
|
26
|
-
# puts " contenido de sum" + sum.join.to_s + " i index_window_end window #{i} #{index_window_end} #{@window}"
|
27
|
-
|
28
|
-
i=ini
|
29
|
-
while (i<ini+@window)
|
30
|
-
|
31
|
-
sum[ini] += qual[i]
|
32
|
-
i+=1
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
|
-
i=ini+1
|
37
|
-
|
38
|
-
while (i<=index_window_end)
|
39
|
-
|
40
|
-
sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1]
|
41
|
-
i+=1
|
42
|
-
|
43
|
-
end
|
44
|
-
|
45
|
-
# puts '2____' + sum.join(',') + 'pos sum' + ini.to_s
|
46
|
-
|
47
|
-
return sum
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
def find_bounds_high_quality(sum,ini,index_window_end)
|
52
|
-
|
53
|
-
new_start = -1
|
54
|
-
new_end = -1
|
55
|
-
|
56
|
-
# puts " ini #{ini} iwe #{index_window_end}"
|
57
|
-
# puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} "
|
58
|
-
if (ini>index_window_end)
|
59
|
-
temp_start= ini
|
60
|
-
# new_start, new_end = temp_start, index_window_end
|
61
|
-
new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana
|
62
|
-
# new_start, new_end = index_window_end, index_window_end
|
63
|
-
end
|
64
|
-
# puts " temp_start #{temp_start}" if (ini>index_window_end)
|
65
|
-
temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1
|
66
|
-
|
67
|
-
i=ini+1
|
68
|
-
while (i<=index_window_end)
|
69
|
-
if (sum[i]>=@cut_off)
|
70
|
-
if (temp_start<0)
|
71
|
-
temp_start=i #just in!
|
72
|
-
# puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}"
|
73
|
-
end
|
74
|
-
|
75
|
-
else
|
76
|
-
# puts "sum #{sum[i]} < cut off "
|
77
|
-
if(temp_start>=0) #just out!
|
78
|
-
# puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}"
|
79
|
-
if (((i-1-temp_start)>=(new_end-new_start)))
|
80
|
-
new_start,new_end=temp_start,i-1
|
81
|
-
# puts "just out ---- new start,new_end = #{temp_start}, #{i-1} index_window_end = #{index_window_end}"
|
82
|
-
end
|
83
|
-
temp_start= -1
|
84
|
-
end
|
85
|
-
end
|
86
|
-
i+=1
|
87
|
-
|
88
|
-
|
89
|
-
end
|
90
|
-
# puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
|
91
|
-
|
92
|
-
if (temp_start != -1) # finished while ok
|
93
|
-
# puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}"
|
94
|
-
if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence
|
95
|
-
new_start, new_end = temp_start, index_window_end #-1
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
# puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
|
100
|
-
|
101
|
-
# puts " newstart #{new_start} newend #{new_end}"
|
102
|
-
|
103
|
-
return new_start,new_end
|
104
|
-
|
105
|
-
|
106
|
-
end
|
107
|
-
|
108
|
-
def cut_fine_bounds_short(qual,new_start,new_end)
|
109
|
-
|
110
|
-
i=0
|
111
|
-
# puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
|
112
|
-
while (i<@window)
|
113
|
-
if (qual[new_start+i]>=@low)
|
114
|
-
break
|
115
|
-
end
|
116
|
-
i+=1
|
117
|
-
end
|
118
|
-
new_start +=i
|
119
|
-
# puts "#{new_start} ***********"
|
120
|
-
|
121
|
-
i=@window -1
|
122
|
-
while (i>=0)
|
123
|
-
if (qual[new_end+i]>=@low)
|
124
|
-
break
|
125
|
-
end
|
126
|
-
i-=1
|
127
|
-
end
|
128
|
-
new_end += i
|
129
|
-
# puts "6a new_start #{new_start} new-end #{new_end}"
|
130
|
-
|
131
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short"
|
132
|
-
return new_start, new_end
|
133
|
-
|
134
|
-
end
|
135
|
-
|
136
|
-
|
137
|
-
# cuts fine the high quality bounds
|
138
|
-
def cut_fine_bounds(qual,new_start,new_end)
|
139
|
-
# puts " ççççççççççççççç #{new_start+@window} >= #{new_end} "
|
140
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
|
141
|
-
# cut it fine
|
142
|
-
|
143
|
-
one_ok = 0
|
144
|
-
|
145
|
-
i=@window-1
|
146
|
-
# puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
|
147
|
-
while (i>=0)
|
148
|
-
if (qual[new_start+i] < @low)
|
149
|
-
break if one_ok
|
150
|
-
else
|
151
|
-
one_ok = 1
|
152
|
-
end
|
153
|
-
i-=1
|
154
|
-
end
|
155
|
-
new_start += i+1
|
156
|
-
oneOk = 0
|
157
|
-
i=0
|
158
|
-
while (i<@window)
|
159
|
-
if (qual[new_end+i] < @low)
|
160
|
-
break if oneOk
|
161
|
-
else
|
162
|
-
oneOk = 1
|
163
|
-
end
|
164
|
-
i+=1
|
165
|
-
end
|
166
|
-
new_end += i-1
|
167
|
-
# puts "6b new_start #{new_start} new-end #{new_end}"
|
168
|
-
|
169
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
|
170
|
-
return new_start, new_end
|
171
|
-
|
172
|
-
end
|
173
|
-
|
174
|
-
def find_high_quality(qual,ini=0)
|
175
|
-
|
176
|
-
# puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',') + 'size' + qual.size.to_s
|
177
|
-
|
178
|
-
update=false
|
179
|
-
# if @window>qual.length-ini #search in the last window although has a low size
|
180
|
-
# @window=qual.length-ini
|
181
|
-
# # puts ' UPDATE WINDOW Y CUT OFF ' + @window.to_s
|
182
|
-
# @cut_off=@window*@low
|
183
|
-
# update=true
|
184
|
-
# end
|
185
|
-
|
186
|
-
if (ini==0 or update)
|
187
|
-
#index_window_start = ini
|
188
|
-
@index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1;
|
189
|
-
#TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada
|
190
|
-
|
191
|
-
|
192
|
-
@sum = create_sum_window(qual,ini,@index_window_end)
|
193
|
-
# puts "SUMA #{@sum.join(' ')}"
|
194
|
-
end
|
195
|
-
|
196
|
-
new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end)
|
197
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
|
198
|
-
|
199
|
-
if (new_start>=0)
|
200
|
-
if (new_start+@window >= new_end)
|
201
|
-
# puts "cfs"
|
202
|
-
new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end)
|
203
|
-
# puts "cfs"
|
204
|
-
|
205
|
-
else
|
206
|
-
# puts "cf"
|
207
|
-
new_start, new_end = cut_fine_bounds(qual,new_start,new_end)
|
208
|
-
# puts "cf"
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
|
213
|
-
|
214
|
-
return new_start,new_end #+1
|
215
|
-
|
216
|
-
|
217
95
|
end
|
96
|
+
end while !rstart.nil?
|
97
|
+
|
98
|
+
return regions
|
99
|
+
|
100
|
+
end
|
218
101
|
|
219
|
-
|
220
|
-
def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
|
221
|
-
|
222
|
-
action_size = p_begin-1
|
223
|
-
if action_size>=(@window/2)
|
224
|
-
|
225
|
-
|
226
|
-
# puts "action_SIZE1 #{action_size} > #{@window/2}"
|
227
|
-
|
228
|
-
if ( (p_begin>0) && (action_size>0) ) #if there is action before the high qual part
|
229
|
-
# it's created an action before of the high quality part
|
230
|
-
a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
231
|
-
# puts " new low qual start: #{start} = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}"
|
232
|
-
actions.push a
|
233
|
-
end
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
def add_action_after_high_qual(p_begin,p_end,actions,seq)
|
238
|
-
|
239
|
-
action_size = seq.insert_end-p_end
|
240
|
-
if action_size>=(@window/2)
|
241
|
-
|
242
|
-
|
243
|
-
# puts "action_SIZE2 #{action_size} > #{@window/2}"
|
244
|
-
|
245
|
-
if ((p_end<seq.seq_fasta.size-1) && (action_size>0) ) #if there is action before the high qual part
|
246
|
-
# it's created an action before of the high quality part
|
247
|
-
a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
248
|
-
|
249
|
-
actions.push a
|
250
|
-
end
|
251
|
-
end
|
252
|
-
end
|
253
|
-
|
102
|
+
|
254
103
|
|
255
104
|
|
256
105
|
|
@@ -266,100 +115,42 @@ class PluginLowQuality < Plugin
|
|
266
115
|
# Finally, mark with an action the part after the high-quality subsequence as a low-quality part
|
267
116
|
#-----------------------------------------------------------------
|
268
117
|
|
269
|
-
|
270
|
-
seqs.each do |s|
|
271
|
-
exec_seq(s)
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
|
276
|
-
def exec_seq(seq)
|
118
|
+
def exec_seq(seq,blast_query)
|
277
119
|
|
278
120
|
if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
|
279
|
-
$LOG.
|
121
|
+
$LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
|
280
122
|
elsif (seq.seq_qual.size>0)
|
281
|
-
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
282
|
-
|
283
|
-
@low=@params.get_param('min_quality').to_i
|
284
|
-
|
285
|
-
if @params.get_param('window_width').to_i>seq.seq_fasta.length
|
286
|
-
@window=seq.seq_fasta.length
|
287
|
-
|
288
|
-
else
|
289
|
-
@window=@params.get_param('window_width').to_i
|
290
|
-
|
291
|
-
end
|
292
|
-
@cut_off=@window*@low
|
293
|
-
|
294
|
-
type='ActionLowQuality'
|
295
|
-
low_qual=0
|
296
|
-
actions=[]
|
297
|
-
|
298
|
-
p_begin,p_end =0,-1 # positions from high quality bounds
|
299
|
-
|
300
|
-
# @stats[:low_qual]={}
|
301
|
-
# @stats['low_qual']={}
|
302
|
-
|
303
|
-
|
304
|
-
while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size) )
|
305
|
-
|
306
|
-
|
307
|
-
p_begin_old,p_end_old= p_begin, p_end
|
308
|
-
p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)
|
309
|
-
# entra=(p_begin>0) or (p_end_old<0)
|
310
|
-
#
|
311
|
-
# puts "high ini fin #{p_begin} #{p_end} ini-old fin-old #{p_begin_old} #{p_end_old} __ ___ ___ ___1"
|
312
|
-
|
313
|
-
if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and the low quality part has enough size
|
314
|
-
# it's created an action before of the high quality part
|
315
|
-
add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1)
|
316
|
-
# puts "low1 ini fin #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}"
|
317
|
-
low_qual = p_begin-1-p_end_old-1 + 1
|
318
|
-
|
319
|
-
add_stats('low_qual',low_qual)
|
320
|
-
# @stats[:low_qual]={low_qual => 1}
|
321
|
-
|
322
|
-
end
|
323
|
-
|
324
|
-
# puts "-----ññññ----- high quality #{p_begin} #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}"
|
325
|
-
|
326
|
-
end
|
327
|
-
|
328
|
-
# puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}"
|
329
|
-
if ((p_begin>=0) && (p_end+1<seq.seq_qual.size)) #if we have found the high quality part
|
330
123
|
|
331
|
-
#
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
end
|
342
|
-
|
343
|
-
# puts "-----ññññ----- high quality #{p_begin} #{p_end}"
|
344
|
-
|
345
|
-
|
346
|
-
if p_end<0 and p_end_old #add action low qual to all the part
|
347
|
-
a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
348
|
-
# puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}"
|
349
|
-
low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1
|
350
|
-
|
351
|
-
# if @stats[:low_qual][low_qual].nil?
|
352
|
-
# @stats[:low_qual][low_qual] = 0
|
353
|
-
# end
|
354
|
-
# @stats[:low_qual][low_qual] += 1
|
355
|
-
add_stats('low_qual',low_qual)
|
356
|
-
# @stats[:low_qual]={'low_qual' => 1}
|
124
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
125
|
+
|
126
|
+
min_quality=@params.get_param('min_quality').to_i
|
127
|
+
min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i
|
128
|
+
max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i
|
129
|
+
|
130
|
+
type='ActionLowQuality'
|
131
|
+
actions=[]
|
132
|
+
|
133
|
+
regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases)
|
357
134
|
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
135
|
+
regions.each do |r|
|
136
|
+
low_qual_size=r.last-r.first+1
|
137
|
+
|
138
|
+
# puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}"
|
139
|
+
|
140
|
+
|
141
|
+
add_stats('low_qual',low_qual_size)
|
142
|
+
|
143
|
+
|
144
|
+
# create action
|
145
|
+
a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence
|
146
|
+
actions.push a
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
end
|
151
|
+
|
152
|
+
# add quals
|
153
|
+
seq.add_actions(actions)
|
363
154
|
end
|
364
155
|
|
365
156
|
end
|
@@ -379,16 +170,21 @@ class PluginLowQuality < Plugin
|
|
379
170
|
default_value = 20
|
380
171
|
params.check_param(errors,'min_quality','Integer',default_value,comment)
|
381
172
|
|
382
|
-
|
173
|
+
comment='Quality window for scanning low quality segments'
|
383
174
|
default_value = 15
|
384
175
|
params.check_param(errors,'window_width','Integer',default_value,comment)
|
385
176
|
|
386
|
-
|
177
|
+
|
178
|
+
comment='Minimum length of a bad quality segment inside the sequence'
|
179
|
+
default_value = 8
|
180
|
+
params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment)
|
181
|
+
|
182
|
+
|
183
|
+
comment='Maximum consecutive good-quality bases between two bad quality regions'
|
184
|
+
default_value = 2
|
185
|
+
params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment)
|
387
186
|
|
388
187
|
return errors
|
389
188
|
end
|
390
189
|
|
391
|
-
|
392
|
-
private :find_high_quality
|
393
|
-
|
394
190
|
end
|
@@ -14,17 +14,6 @@ class PluginMids < Plugin
|
|
14
14
|
MAX_MID_ERRORS = 2
|
15
15
|
#MIN_MID_SIZE = 7 # very important, don't touch
|
16
16
|
# DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
21
|
-
def execute(seqs)
|
22
|
-
blasts= do_blasts(seqs)
|
23
|
-
|
24
|
-
seqs.each_with_index do |s,i|
|
25
|
-
exec_seq(s,blasts.querys[i])
|
26
|
-
end
|
27
|
-
end
|
28
17
|
|
29
18
|
def do_blasts(seqs)
|
30
19
|
# find MIDS with less results than max_target_seqs value
|
@@ -86,16 +86,7 @@ class PluginShortInsert < Plugin
|
|
86
86
|
return sub_inserts
|
87
87
|
end
|
88
88
|
|
89
|
-
|
90
|
-
def execute(seqs)
|
91
|
-
seqs.each do |s|
|
92
|
-
exec_seq(s)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
def exec_seq(seq)
|
98
|
-
|
89
|
+
def exec_seq(seq,blast_query)
|
99
90
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
100
91
|
# puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
|
101
92
|
|
@@ -33,21 +33,17 @@ class PluginUserContaminants < Plugin
|
|
33
33
|
return res
|
34
34
|
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
blasts= do_blasts(seqs)
|
39
|
-
|
40
|
-
seqs.each_with_index do |s,i|
|
41
|
-
exec_seq(s,blasts.querys[i])
|
42
|
-
end
|
36
|
+
# Tells whether this plugin has anything to do: it can only run when the
# 'user_contaminant_db' parameter is set to a non-empty value.
#
# A nil value is treated like an empty one instead of raising NoMethodError.
# NOTE(review): whether get_param can actually return nil is not visible
# here - confirm against Params#get_param.
#
# Returns true or false.
def can_execute?
  db = @params.get_param('user_contaminant_db')

  !(db.nil? || db.empty?)
end
|
44
39
|
|
40
|
+
|
45
41
|
def do_blasts(seqs)
|
46
42
|
|
47
43
|
# TODO - Culling limit = 2 porque el blast falla con este comando cuando se le pasa cl=1 y dust=no
|
48
44
|
# y una secuencia de baja complejidad como entrada
|
49
45
|
|
50
|
-
blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('
|
46
|
+
blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_user_contaminant')} -perc_identity #{@params.get_param('blast_percent_user_contaminant')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
|
51
47
|
|
52
48
|
$LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
|
53
49
|
|
@@ -72,42 +68,55 @@ class PluginUserContaminants < Plugin
|
|
72
68
|
|
73
69
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence"
|
74
70
|
|
75
|
-
type = "
|
71
|
+
type = "ActionUserContaminant"
|
76
72
|
|
77
73
|
classify={}
|
74
|
+
contaminants=[]
|
78
75
|
|
79
|
-
|
76
|
+
|
77
|
+
merge_hits(blast_query.hits,contaminants,nil,false)
|
78
|
+
|
79
|
+
begin
|
80
|
+
contaminants2=contaminants
|
81
|
+
contaminants = [] #second round to save contaminants without overlap
|
82
|
+
merge_hits(contaminants2,contaminants,nil,false)
|
83
|
+
end until (contaminants2.count == contaminants.count)
|
84
|
+
|
85
|
+
contaminants.sort {|c1,c2| (c1.q_end - c1.q_beg + 1)<=>(c2.q_end - c2.q_beg + 1)}
|
80
86
|
|
81
|
-
classify=sum_hits_by_id(
|
87
|
+
# classify=sum_hits_by_id(contaminants.hits)
|
82
88
|
|
83
89
|
actions=[]
|
84
|
-
classify_size=0
|
90
|
+
# classify_size=0
|
85
91
|
|
86
|
-
min_cont_size=@params.get_param('
|
92
|
+
min_cont_size=@params.get_param('min_user_contaminant_size').to_i
|
87
93
|
|
88
|
-
biggest_classify =
|
94
|
+
# biggest_classify = contaminants.sort {|c1,c2| c1[1]<=>c2[1]}
|
89
95
|
|
90
|
-
if !
|
96
|
+
if !contaminants.empty?
|
91
97
|
|
92
|
-
definition,classify_size = biggest_classify.last
|
98
|
+
# definition,classify_size = biggest_classify.last
|
93
99
|
|
100
|
+
biggest_contaminant=contaminants.last
|
101
|
+
hit_size=(biggest_contaminant.q_end - biggest_contaminant.q_beg + 1)
|
94
102
|
|
95
|
-
a = seq.new_action(
|
103
|
+
a = seq.new_action(biggest_contaminant.q_beg,biggest_contaminant.q_end,type) # adds the correspondent action to the sequence
|
96
104
|
|
97
|
-
a.message = definition
|
105
|
+
a.message = biggest_contaminant.definition
|
98
106
|
|
99
|
-
|
107
|
+
seq.add_comment("Contaminated: #{biggest_contaminant.definition}")
|
108
|
+
|
109
|
+
a.tag_id = biggest_contaminant.definition.gsub(' ','_')
|
100
110
|
|
101
111
|
# a.found_definition = c.definition # save the classify definitions, each separately
|
102
112
|
|
103
113
|
#save to this file
|
104
|
-
seq.add_file_tag(
|
105
|
-
|
114
|
+
seq.add_file_tag(0, 'with_user_contaminant', :both, 10)
|
106
115
|
|
107
116
|
actions.push a
|
108
|
-
|
109
|
-
add_stats('
|
110
|
-
add_stats('
|
117
|
+
|
118
|
+
add_stats('user_contaminant_size',hit_size)
|
119
|
+
add_stats('user_contaminant_ids',biggest_contaminant.definition)
|
111
120
|
|
112
121
|
seq.add_actions(actions)
|
113
122
|
end
|
@@ -121,21 +130,20 @@ class PluginUserContaminants < Plugin
|
|
121
130
|
|
122
131
|
comment='Blast E-value used as cut-off when searching for contaminations'
|
123
132
|
default_value = 1e-10
|
124
|
-
params.check_param(errors,'
|
133
|
+
params.check_param(errors,'blast_evalue_user_contaminant','Float',default_value,comment)
|
125
134
|
|
126
|
-
comment='Minimum required identity (%) for a reliable
|
135
|
+
comment='Minimum required identity (%) for a reliable user contaminant match'
|
127
136
|
default_value = 85
|
128
|
-
params.check_param(errors,'
|
137
|
+
params.check_param(errors,'blast_percent_user_contaminant','Integer',default_value,comment)
|
129
138
|
|
130
|
-
comment='Minimum hit size (nt) for considering
|
139
|
+
comment='Minimum hit size (nt) for considering for user contaminant'
|
131
140
|
default_value = 30 # era 40
|
132
|
-
params.check_param(errors,'
|
141
|
+
params.check_param(errors,'min_user_contaminant_size','Integer',default_value,comment)
|
133
142
|
|
134
|
-
comment='Path for
|
135
|
-
default_value = File.join($FORMATTED_DB_PATH,'
|
143
|
+
comment='Path for user contaminant database'
|
144
|
+
default_value = "" #File.join($FORMATTED_DB_PATH,'user_contaminant.fasta')
|
136
145
|
params.check_param(errors,'user_contaminant_db','DB',default_value,comment)
|
137
146
|
|
138
|
-
|
139
147
|
return errors
|
140
148
|
end
|
141
149
|
|
@@ -25,15 +25,6 @@ class PluginVectors < Plugin
|
|
25
25
|
return ((linkers.count>=1) && (vector_beg+seq.insert_start>=linkers[0].start_pos) && (vector_end+seq.insert_start<=linkers[0].end_pos))
|
26
26
|
end
|
27
27
|
|
28
|
-
#Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
|
29
|
-
def execute(seqs)
|
30
|
-
blasts= do_blasts(seqs)
|
31
|
-
|
32
|
-
seqs.each_with_index do |s,i|
|
33
|
-
exec_seq(s,blasts.querys[i])
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
28
|
def do_blasts(seqs)
|
38
29
|
# find MIDS with less results than max_target_seqs value
|
39
30
|
blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors
|