seqtrimnext 2.0.51 → 2.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -12,245 +12,94 @@ require "plugin"
|
|
12
12
|
class PluginLowQuality < Plugin
|
13
13
|
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
15
|
+
|
16
|
+
def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2)
|
17
|
+
|
18
|
+
rstart=nil
|
19
|
+
rend=nil
|
20
|
+
|
21
|
+
i=from_pos
|
22
|
+
|
23
|
+
good_q=0
|
24
|
+
|
25
|
+
# skip good values
|
26
|
+
while (i< quals.length) && (quals[i]>=min_value)
|
27
|
+
i +=1
|
28
|
+
end
|
29
|
+
|
30
|
+
# now we have found a bad quality, or end of sequence
|
31
|
+
if i < quals.length
|
32
|
+
rstart=i
|
33
|
+
len=0
|
34
|
+
|
35
|
+
# puts " - [#{rstart},#{len}]"
|
36
|
+
|
37
|
+
# keep growing the low-quality region until more than max_good_quals consecutive good-quality bases are found
|
38
|
+
begin
|
39
|
+
q=quals[i]
|
40
|
+
|
41
|
+
if q<min_value
|
42
|
+
len += 1
|
43
|
+
# puts "BAD #{q}<#{min_value}"
|
44
|
+
len += good_q
|
45
|
+
good_q=0
|
46
|
+
else
|
47
|
+
good_q+=1
|
48
|
+
end
|
49
|
+
# puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
|
50
|
+
|
51
|
+
i+=1
|
52
|
+
end while (i < quals.length) && (good_q <= max_good_quals)
|
53
|
+
|
54
|
+
rend = rstart + len -1
|
55
|
+
# puts "#{q} - q[#{rstart},#{rend}], #{good_q}"
|
56
|
+
end
|
57
|
+
|
58
|
+
return [rstart,rend]
|
59
|
+
end
|
60
|
+
|
61
|
+
# A region is valid if it starts at position 0, ends at the last position (quals.length-1), or is big enough (at least min_region_size bases)
|
62
|
+
def valid_low_qual_region?(quals,rstart,rend,min_region_size)
|
63
|
+
# puts [rstart,rend,0,quals.length,(rend-rstart+1)].join(';')
|
64
|
+
# res =((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
|
65
|
+
# if res
|
66
|
+
# puts "VALID"
|
67
|
+
# end
|
68
|
+
return ((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2)
|
73
|
+
|
74
|
+
# the initial region is the whole array
|
75
|
+
left=0
|
76
|
+
right=quals.length-1
|
77
|
+
# puts quals.map{|e| ("%2d" % e.to_s)}.join(' ')
|
78
|
+
|
79
|
+
# puts "[#{left},#{right}]"
|
80
|
+
|
81
|
+
i = 0
|
82
|
+
|
83
|
+
from_pos=0
|
84
|
+
regions =[]
|
85
|
+
|
86
|
+
# get all new regions
|
87
|
+
begin
|
88
|
+
rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals)
|
89
|
+
if !rstart.nil?
|
90
|
+
from_pos= rend+1
|
91
|
+
|
92
|
+
if valid_low_qual_region?(quals,rstart,rend,min_region_size)
|
93
|
+
regions << [rstart,rend]
|
25
94
|
end
|
26
|
-
# puts " contenido de sum" + sum.join.to_s + " i index_window_end window #{i} #{index_window_end} #{@window}"
|
27
|
-
|
28
|
-
i=ini
|
29
|
-
while (i<ini+@window)
|
30
|
-
|
31
|
-
sum[ini] += qual[i]
|
32
|
-
i+=1
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
|
-
i=ini+1
|
37
|
-
|
38
|
-
while (i<=index_window_end)
|
39
|
-
|
40
|
-
sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1]
|
41
|
-
i+=1
|
42
|
-
|
43
|
-
end
|
44
|
-
|
45
|
-
# puts '2____' + sum.join(',') + 'pos sum' + ini.to_s
|
46
|
-
|
47
|
-
return sum
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
def find_bounds_high_quality(sum,ini,index_window_end)
|
52
|
-
|
53
|
-
new_start = -1
|
54
|
-
new_end = -1
|
55
|
-
|
56
|
-
# puts " ini #{ini} iwe #{index_window_end}"
|
57
|
-
# puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} "
|
58
|
-
if (ini>index_window_end)
|
59
|
-
temp_start= ini
|
60
|
-
# new_start, new_end = temp_start, index_window_end
|
61
|
-
new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana
|
62
|
-
# new_start, new_end = index_window_end, index_window_end
|
63
|
-
end
|
64
|
-
# puts " temp_start #{temp_start}" if (ini>index_window_end)
|
65
|
-
temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1
|
66
|
-
|
67
|
-
i=ini+1
|
68
|
-
while (i<=index_window_end)
|
69
|
-
if (sum[i]>=@cut_off)
|
70
|
-
if (temp_start<0)
|
71
|
-
temp_start=i #just in!
|
72
|
-
# puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}"
|
73
|
-
end
|
74
|
-
|
75
|
-
else
|
76
|
-
# puts "sum #{sum[i]} < cut off "
|
77
|
-
if(temp_start>=0) #just out!
|
78
|
-
# puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}"
|
79
|
-
if (((i-1-temp_start)>=(new_end-new_start)))
|
80
|
-
new_start,new_end=temp_start,i-1
|
81
|
-
# puts "just out ---- new start,new_end = #{temp_start}, #{i-1} index_window_end = #{index_window_end}"
|
82
|
-
end
|
83
|
-
temp_start= -1
|
84
|
-
end
|
85
|
-
end
|
86
|
-
i+=1
|
87
|
-
|
88
|
-
|
89
|
-
end
|
90
|
-
# puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
|
91
|
-
|
92
|
-
if (temp_start != -1) # finished while ok
|
93
|
-
# puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}"
|
94
|
-
if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence
|
95
|
-
new_start, new_end = temp_start, index_window_end #-1
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
# puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"
|
100
|
-
|
101
|
-
# puts " newstart #{new_start} newend #{new_end}"
|
102
|
-
|
103
|
-
return new_start,new_end
|
104
|
-
|
105
|
-
|
106
|
-
end
|
107
|
-
|
108
|
-
def cut_fine_bounds_short(qual,new_start,new_end)
|
109
|
-
|
110
|
-
i=0
|
111
|
-
# puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
|
112
|
-
while (i<@window)
|
113
|
-
if (qual[new_start+i]>=@low)
|
114
|
-
break
|
115
|
-
end
|
116
|
-
i+=1
|
117
|
-
end
|
118
|
-
new_start +=i
|
119
|
-
# puts "#{new_start} ***********"
|
120
|
-
|
121
|
-
i=@window -1
|
122
|
-
while (i>=0)
|
123
|
-
if (qual[new_end+i]>=@low)
|
124
|
-
break
|
125
|
-
end
|
126
|
-
i-=1
|
127
|
-
end
|
128
|
-
new_end += i
|
129
|
-
# puts "6a new_start #{new_start} new-end #{new_end}"
|
130
|
-
|
131
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short"
|
132
|
-
return new_start, new_end
|
133
|
-
|
134
|
-
end
|
135
|
-
|
136
|
-
|
137
|
-
# cuts fine the high quality bounds
|
138
|
-
def cut_fine_bounds(qual,new_start,new_end)
|
139
|
-
# puts " ççççççççççççççç #{new_start+@window} >= #{new_end} "
|
140
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
|
141
|
-
# cut it fine
|
142
|
-
|
143
|
-
one_ok = 0
|
144
|
-
|
145
|
-
i=@window-1
|
146
|
-
# puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
|
147
|
-
while (i>=0)
|
148
|
-
if (qual[new_start+i] < @low)
|
149
|
-
break if one_ok
|
150
|
-
else
|
151
|
-
one_ok = 1
|
152
|
-
end
|
153
|
-
i-=1
|
154
|
-
end
|
155
|
-
new_start += i+1
|
156
|
-
oneOk = 0
|
157
|
-
i=0
|
158
|
-
while (i<@window)
|
159
|
-
if (qual[new_end+i] < @low)
|
160
|
-
break if oneOk
|
161
|
-
else
|
162
|
-
oneOk = 1
|
163
|
-
end
|
164
|
-
i+=1
|
165
|
-
end
|
166
|
-
new_end += i-1
|
167
|
-
# puts "6b new_start #{new_start} new-end #{new_end}"
|
168
|
-
|
169
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
|
170
|
-
return new_start, new_end
|
171
|
-
|
172
|
-
end
|
173
|
-
|
174
|
-
def find_high_quality(qual,ini=0)
|
175
|
-
|
176
|
-
# puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',') + 'size' + qual.size.to_s
|
177
|
-
|
178
|
-
update=false
|
179
|
-
# if @window>qual.length-ini #search in the last window although has a low size
|
180
|
-
# @window=qual.length-ini
|
181
|
-
# # puts ' UPDATE WINDOW Y CUT OFF ' + @window.to_s
|
182
|
-
# @cut_off=@window*@low
|
183
|
-
# update=true
|
184
|
-
# end
|
185
|
-
|
186
|
-
if (ini==0 or update)
|
187
|
-
#index_window_start = ini
|
188
|
-
@index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1;
|
189
|
-
#TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada
|
190
|
-
|
191
|
-
|
192
|
-
@sum = create_sum_window(qual,ini,@index_window_end)
|
193
|
-
# puts "SUMA #{@sum.join(' ')}"
|
194
|
-
end
|
195
|
-
|
196
|
-
new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end)
|
197
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
|
198
|
-
|
199
|
-
if (new_start>=0)
|
200
|
-
if (new_start+@window >= new_end)
|
201
|
-
# puts "cfs"
|
202
|
-
new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end)
|
203
|
-
# puts "cfs"
|
204
|
-
|
205
|
-
else
|
206
|
-
# puts "cf"
|
207
|
-
new_start, new_end = cut_fine_bounds(qual,new_start,new_end)
|
208
|
-
# puts "cf"
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
# puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
|
213
|
-
|
214
|
-
return new_start,new_end #+1
|
215
|
-
|
216
|
-
|
217
95
|
end
|
96
|
+
end while !rstart.nil?
|
97
|
+
|
98
|
+
return regions
|
99
|
+
|
100
|
+
end
|
218
101
|
|
219
|
-
|
220
|
-
def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
|
221
|
-
|
222
|
-
action_size = p_begin-1
|
223
|
-
if action_size>=(@window/2)
|
224
|
-
|
225
|
-
|
226
|
-
# puts "action_SIZE1 #{action_size} > #{@window/2}"
|
227
|
-
|
228
|
-
if ( (p_begin>0) && (action_size>0) ) #if there is action before the high qual part
|
229
|
-
# it's created an action before of the high quality part
|
230
|
-
a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
231
|
-
# puts " new low qual start: #{start} = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}"
|
232
|
-
actions.push a
|
233
|
-
end
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
def add_action_after_high_qual(p_begin,p_end,actions,seq)
|
238
|
-
|
239
|
-
action_size = seq.insert_end-p_end
|
240
|
-
if action_size>=(@window/2)
|
241
|
-
|
242
|
-
|
243
|
-
# puts "action_SIZE2 #{action_size} > #{@window/2}"
|
244
|
-
|
245
|
-
if ((p_end<seq.seq_fasta.size-1) && (action_size>0) ) #if there is action before the high qual part
|
246
|
-
# it's created an action before of the high quality part
|
247
|
-
a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
248
|
-
|
249
|
-
actions.push a
|
250
|
-
end
|
251
|
-
end
|
252
|
-
end
|
253
|
-
|
102
|
+
|
254
103
|
|
255
104
|
|
256
105
|
|
@@ -266,100 +115,42 @@ class PluginLowQuality < Plugin
|
|
266
115
|
# Finally, mark with an action the part after the high-quality subsequence as a low-quality part
|
267
116
|
#-----------------------------------------------------------------
|
268
117
|
|
269
|
-
|
270
|
-
seqs.each do |s|
|
271
|
-
exec_seq(s)
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
|
276
|
-
def exec_seq(seq)
|
118
|
+
def exec_seq(seq,blast_query)
|
277
119
|
|
278
120
|
if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
|
279
|
-
$LOG.
|
121
|
+
$LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
|
280
122
|
elsif (seq.seq_qual.size>0)
|
281
|
-
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
282
|
-
|
283
|
-
@low=@params.get_param('min_quality').to_i
|
284
|
-
|
285
|
-
if @params.get_param('window_width').to_i>seq.seq_fasta.length
|
286
|
-
@window=seq.seq_fasta.length
|
287
|
-
|
288
|
-
else
|
289
|
-
@window=@params.get_param('window_width').to_i
|
290
|
-
|
291
|
-
end
|
292
|
-
@cut_off=@window*@low
|
293
|
-
|
294
|
-
type='ActionLowQuality'
|
295
|
-
low_qual=0
|
296
|
-
actions=[]
|
297
|
-
|
298
|
-
p_begin,p_end =0,-1 # positions from high quality bounds
|
299
|
-
|
300
|
-
# @stats[:low_qual]={}
|
301
|
-
# @stats['low_qual']={}
|
302
|
-
|
303
|
-
|
304
|
-
while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size) )
|
305
|
-
|
306
|
-
|
307
|
-
p_begin_old,p_end_old= p_begin, p_end
|
308
|
-
p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)
|
309
|
-
# entra=(p_begin>0) or (p_end_old<0)
|
310
|
-
#
|
311
|
-
# puts "high ini fin #{p_begin} #{p_end} ini-old fin-old #{p_begin_old} #{p_end_old} __ ___ ___ ___1"
|
312
|
-
|
313
|
-
if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and the low quality part has enough size
|
314
|
-
# it's created an action before of the high quality part
|
315
|
-
add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1)
|
316
|
-
# puts "low1 ini fin #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}"
|
317
|
-
low_qual = p_begin-1-p_end_old-1 + 1
|
318
|
-
|
319
|
-
add_stats('low_qual',low_qual)
|
320
|
-
# @stats[:low_qual]={low_qual => 1}
|
321
|
-
|
322
|
-
end
|
323
|
-
|
324
|
-
# puts "-----ññññ----- high quality #{p_begin} #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}"
|
325
|
-
|
326
|
-
end
|
327
|
-
|
328
|
-
# puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}"
|
329
|
-
if ((p_begin>=0) && (p_end+1<seq.seq_qual.size)) #if we have found the high quality part
|
330
123
|
|
331
|
-
#
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
end
|
342
|
-
|
343
|
-
# puts "-----ññññ----- high quality #{p_begin} #{p_end}"
|
344
|
-
|
345
|
-
|
346
|
-
if p_end<0 and p_end_old #add action low qual to all the part
|
347
|
-
a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
|
348
|
-
# puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}"
|
349
|
-
low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1
|
350
|
-
|
351
|
-
# if @stats[:low_qual][low_qual].nil?
|
352
|
-
# @stats[:low_qual][low_qual] = 0
|
353
|
-
# end
|
354
|
-
# @stats[:low_qual][low_qual] += 1
|
355
|
-
add_stats('low_qual',low_qual)
|
356
|
-
# @stats[:low_qual]={'low_qual' => 1}
|
124
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
125
|
+
|
126
|
+
min_quality=@params.get_param('min_quality').to_i
|
127
|
+
min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i
|
128
|
+
max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i
|
129
|
+
|
130
|
+
type='ActionLowQuality'
|
131
|
+
actions=[]
|
132
|
+
|
133
|
+
regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases)
|
357
134
|
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
135
|
+
regions.each do |r|
|
136
|
+
low_qual_size=r.last-r.first+1
|
137
|
+
|
138
|
+
# puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}"
|
139
|
+
|
140
|
+
|
141
|
+
add_stats('low_qual',low_qual_size)
|
142
|
+
|
143
|
+
|
144
|
+
# create action
|
145
|
+
a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence
|
146
|
+
actions.push a
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
end
|
151
|
+
|
152
|
+
# add the low-quality actions to the sequence
|
153
|
+
seq.add_actions(actions)
|
363
154
|
end
|
364
155
|
|
365
156
|
end
|
@@ -379,16 +170,21 @@ class PluginLowQuality < Plugin
|
|
379
170
|
default_value = 20
|
380
171
|
params.check_param(errors,'min_quality','Integer',default_value,comment)
|
381
172
|
|
382
|
-
|
173
|
+
comment='Quality window for scanning low quality segments'
|
383
174
|
default_value = 15
|
384
175
|
params.check_param(errors,'window_width','Integer',default_value,comment)
|
385
176
|
|
386
|
-
|
177
|
+
|
178
|
+
comment='Minimum length of a bad quality segment inside the sequence'
|
179
|
+
default_value = 8
|
180
|
+
params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment)
|
181
|
+
|
182
|
+
|
183
|
+
comment='Maximum consecutive good-quality bases between two bad quality regions'
|
184
|
+
default_value = 2
|
185
|
+
params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment)
|
387
186
|
|
388
187
|
return errors
|
389
188
|
end
|
390
189
|
|
391
|
-
|
392
|
-
private :find_high_quality
|
393
|
-
|
394
190
|
end
|
@@ -14,17 +14,6 @@ class PluginMids < Plugin
|
|
14
14
|
MAX_MID_ERRORS = 2
|
15
15
|
#MIN_MID_SIZE = 7 # very important, don't touch
|
16
16
|
# DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
21
|
-
def execute(seqs)
|
22
|
-
blasts= do_blasts(seqs)
|
23
|
-
|
24
|
-
seqs.each_with_index do |s,i|
|
25
|
-
exec_seq(s,blasts.querys[i])
|
26
|
-
end
|
27
|
-
end
|
28
17
|
|
29
18
|
def do_blasts(seqs)
|
30
19
|
# find MIDS with less results than max_target_seqs value
|
@@ -86,16 +86,7 @@ class PluginShortInsert < Plugin
|
|
86
86
|
return sub_inserts
|
87
87
|
end
|
88
88
|
|
89
|
-
|
90
|
-
def execute(seqs)
|
91
|
-
seqs.each do |s|
|
92
|
-
exec_seq(s)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
def exec_seq(seq)
|
98
|
-
|
89
|
+
def exec_seq(seq,blast_query)
|
99
90
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
100
91
|
# puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
|
101
92
|
|
@@ -33,21 +33,17 @@ class PluginUserContaminants < Plugin
|
|
33
33
|
return res
|
34
34
|
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
blasts= do_blasts(seqs)
|
39
|
-
|
40
|
-
seqs.each_with_index do |s,i|
|
41
|
-
exec_seq(s,blasts.querys[i])
|
42
|
-
end
|
36
|
+
def can_execute?
|
37
|
+
return !@params.get_param('user_contaminant_db').empty?
|
43
38
|
end
|
44
39
|
|
40
|
+
|
45
41
|
def do_blasts(seqs)
|
46
42
|
|
47
43
|
# TODO - Culling limit = 2 because blast fails with this command when it is given cl=1 and dust=no
|
48
44
|
# and a low-complexity sequence as input
|
49
45
|
|
50
|
-
blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('
|
46
|
+
blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_user_contaminant')} -perc_identity #{@params.get_param('blast_percent_user_contaminant')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
|
51
47
|
|
52
48
|
$LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
|
53
49
|
|
@@ -72,42 +68,55 @@ class PluginUserContaminants < Plugin
|
|
72
68
|
|
73
69
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence"
|
74
70
|
|
75
|
-
type = "
|
71
|
+
type = "ActionUserContaminant"
|
76
72
|
|
77
73
|
classify={}
|
74
|
+
contaminants=[]
|
78
75
|
|
79
|
-
|
76
|
+
|
77
|
+
merge_hits(blast_query.hits,contaminants,nil,false)
|
78
|
+
|
79
|
+
begin
|
80
|
+
contaminants2=contaminants
|
81
|
+
contaminants = [] #second round to save contaminants without overlap
|
82
|
+
merge_hits(contaminants2,contaminants,nil,false)
|
83
|
+
end until (contaminants2.count == contaminants.count)
|
84
|
+
|
85
|
+
contaminants.sort {|c1,c2| (c1.q_end - c1.q_beg + 1)<=>(c2.q_end - c2.q_beg + 1)}
|
80
86
|
|
81
|
-
classify=sum_hits_by_id(
|
87
|
+
# classify=sum_hits_by_id(contaminants.hits)
|
82
88
|
|
83
89
|
actions=[]
|
84
|
-
classify_size=0
|
90
|
+
# classify_size=0
|
85
91
|
|
86
|
-
min_cont_size=@params.get_param('
|
92
|
+
min_cont_size=@params.get_param('min_user_contaminant_size').to_i
|
87
93
|
|
88
|
-
biggest_classify =
|
94
|
+
# biggest_classify = contaminants.sort {|c1,c2| c1[1]<=>c2[1]}
|
89
95
|
|
90
|
-
if !
|
96
|
+
if !contaminants.empty?
|
91
97
|
|
92
|
-
definition,classify_size = biggest_classify.last
|
98
|
+
# definition,classify_size = biggest_classify.last
|
93
99
|
|
100
|
+
biggest_contaminant=contaminants.last
|
101
|
+
hit_size=(biggest_contaminant.q_end - biggest_contaminant.q_beg + 1)
|
94
102
|
|
95
|
-
a = seq.new_action(
|
103
|
+
a = seq.new_action(biggest_contaminant.q_beg,biggest_contaminant.q_end,type) # adds the correspondent action to the sequence
|
96
104
|
|
97
|
-
a.message = definition
|
105
|
+
a.message = biggest_contaminant.definition
|
98
106
|
|
99
|
-
|
107
|
+
seq.add_comment("Contaminated: #{biggest_contaminant.definition}")
|
108
|
+
|
109
|
+
a.tag_id = biggest_contaminant.definition.gsub(' ','_')
|
100
110
|
|
101
111
|
# a.found_definition = c.definition # save the classify definitions, each separately
|
102
112
|
|
103
113
|
#save to this file
|
104
|
-
seq.add_file_tag(
|
105
|
-
|
114
|
+
seq.add_file_tag(0, 'with_user_contaminant', :both, 10)
|
106
115
|
|
107
116
|
actions.push a
|
108
|
-
|
109
|
-
add_stats('
|
110
|
-
add_stats('
|
117
|
+
|
118
|
+
add_stats('user_contaminant_size',hit_size)
|
119
|
+
add_stats('user_contaminant_ids',biggest_contaminant.definition)
|
111
120
|
|
112
121
|
seq.add_actions(actions)
|
113
122
|
end
|
@@ -121,21 +130,20 @@ class PluginUserContaminants < Plugin
|
|
121
130
|
|
122
131
|
comment='Blast E-value used as cut-off when searching for contaminations'
|
123
132
|
default_value = 1e-10
|
124
|
-
params.check_param(errors,'
|
133
|
+
params.check_param(errors,'blast_evalue_user_contaminant','Float',default_value,comment)
|
125
134
|
|
126
|
-
comment='Minimum required identity (%) for a reliable
|
135
|
+
comment='Minimum required identity (%) for a reliable user contaminant match'
|
127
136
|
default_value = 85
|
128
|
-
params.check_param(errors,'
|
137
|
+
params.check_param(errors,'blast_percent_user_contaminant','Integer',default_value,comment)
|
129
138
|
|
130
|
-
comment='Minimum hit size (nt) for considering
|
139
|
+
comment='Minimum hit size (nt) for considering for user contaminant'
|
131
140
|
default_value = 30 # era 40
|
132
|
-
params.check_param(errors,'
|
141
|
+
params.check_param(errors,'min_user_contaminant_size','Integer',default_value,comment)
|
133
142
|
|
134
|
-
comment='Path for
|
135
|
-
default_value = File.join($FORMATTED_DB_PATH,'
|
143
|
+
comment='Path for user contaminant database'
|
144
|
+
default_value = "" #File.join($FORMATTED_DB_PATH,'user_contaminant.fasta')
|
136
145
|
params.check_param(errors,'user_contaminant_db','DB',default_value,comment)
|
137
146
|
|
138
|
-
|
139
147
|
return errors
|
140
148
|
end
|
141
149
|
|
@@ -25,15 +25,6 @@ class PluginVectors < Plugin
|
|
25
25
|
return ((linkers.count>=1) && (vector_beg+seq.insert_start>=linkers[0].start_pos) && (vector_end+seq.insert_start<=linkers[0].end_pos))
|
26
26
|
end
|
27
27
|
|
28
|
-
#Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
|
29
|
-
def execute(seqs)
|
30
|
-
blasts= do_blasts(seqs)
|
31
|
-
|
32
|
-
seqs.each_with_index do |s,i|
|
33
|
-
exec_seq(s,blasts.querys[i])
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
28
|
def do_blasts(seqs)
|
38
29
|
# find MIDS with less results than max_target_seqs value
|
39
30
|
blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors
|