seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,394 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginLowQuality. See the main method called execute.
|
7
|
+
|
8
|
+
#
|
9
|
+
# Inherit: Plugin
|
10
|
+
########################################################
|
11
|
+
|
12
|
+
class PluginLowQuality < Plugin
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
# Builds the table of sliding-window quality sums.
#
# qual             - array of per-base quality values.
# ini              - index of the first window to compute.
# index_window_end - index of the last window to compute.
#
# Returns an array in which position k (ini <= k <= index_window_end) holds
# qual[k] + ... + qual[k + @window - 1]. Positions below ini are left unset.
def create_sum_window(qual,ini,index_window_end)
  totals = []

  # every window slot starts at zero
  (ini..index_window_end).each { |pos| totals[pos] = 0 }

  # the first window is accumulated base by base
  (ini...(ini + @window)).each { |pos| totals[ini] += qual[pos] }

  # each following window is derived from the previous one: drop the base
  # that left the window and add the one that entered it
  ((ini + 1)..index_window_end).each do |pos|
    totals[pos] = totals[pos - 1] - qual[pos - 1] + qual[pos + @window - 1]
  end

  totals
end
|
50
|
+
|
51
|
+
# Finds the longest run of consecutive windows whose sum reaches @cut_off.
#
# sum              - table of window sums, indexed by window start.
# ini              - first window index to inspect.
# index_window_end - last window index to inspect.
#
# Returns [start, end] of the chosen run in window indexes. start is -1 when
# no window reaches @cut_off. On ties the later run wins, because the
# comparisons use >=.
def find_bounds_high_quality(sum,ini,index_window_end)
  best_start = -1
  best_end = -1

  # We are already past the last window: report index_window_end as the end
  # so the caller can tell "ran past the final window" apart from
  # "no high quality found".
  best_end = index_window_end if ini > index_window_end

  run_start = (ini <= index_window_end && sum[ini] >= @cut_off) ? ini : -1

  ((ini + 1)..index_window_end).each do |pos|
    if sum[pos] >= @cut_off
      run_start = pos if run_start < 0 # just entered a good run
    elsif run_start >= 0 # just left a good run
      if (pos - 1 - run_start) >= (best_end - best_start)
        best_start, best_end = run_start, pos - 1
      end
      run_start = -1
    end
  end

  # a run that is still open when the scan finishes reaches the last window
  if run_start != -1 && (index_window_end - run_start) >= (best_end - best_start)
    best_start, best_end = run_start, index_window_end
  end

  return best_start, best_end
end
|
107
|
+
|
108
|
+
# Refines the bounds of a short high-quality region (called when
# new_start + @window >= new_end).
#
# The start moves forward to the first base of its window whose quality is
# >= @low (by a full @window if there is none). The end becomes new_end plus
# the offset of the last base of its window whose quality is >= @low
# (new_end - 1 if there is none).
#
# Returns [new_start, new_end].
def cut_fine_bounds_short(qual,new_start,new_end)
  lead = (0...@window).find { |k| qual[new_start + k] >= @low } || @window
  tail = (@window - 1).downto(0).find { |k| qual[new_end + k] >= @low } || -1

  return new_start + lead, new_end + tail
end
|
135
|
+
|
136
|
+
|
137
|
+
# cuts fine the high quality bounds
|
138
|
+
# Refines the bounds of a high-quality region down to base resolution.
#
# qual      - array of per-base quality values.
# new_start - index of the first high-quality window.
# new_end   - index of the last high-quality window.
#
# The first window is scanned from its right edge to the left, and the last
# window from its left edge to the right. Each scan stops at the first base
# below @low that is found AFTER at least one base >= @low has been seen.
# The bound is placed just inside that base.
#
# The "seen a good base" flags are real booleans. They used to be 0/1
# integers, and 0 is truthy in Ruby, so the scan stopped at the very first
# low-quality base whether or not a good one had been seen.
#
# Returns [new_start, new_end] as base positions.
def cut_fine_bounds(qual,new_start,new_end)
  # --- start bound: walk the first window right-to-left
  seen_ok = false
  i = @window - 1
  while i >= 0
    if qual[new_start + i] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    i -= 1
  end
  new_start += i + 1 # i is -1 when the whole window was walked

  # --- end bound: walk the last window left-to-right
  seen_ok = false
  i = 0
  while i < @window
    if qual[new_end + i] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    i += 1
  end
  new_end += i - 1 # i is @window when the whole window was walked

  return new_start, new_end
end
|
173
|
+
|
174
|
+
# Locates the next high-quality region of qual, searching from window ini.
#
# The window sums are built only when ini == 0 and are kept in @sum and
# @index_window_end, so later calls with ini > 0 reuse them.
#
# Returns [start, end] in base positions. start is -1 when no high-quality
# window is found.
def find_high_quality(qual,ini=0)
  if ini == 0
    # Do not subtract 1 here, or the last nucleotide of the sequence is lost.
    # TODO: in Juan's seqtrim, iwe (called index_window_end here) loses
    # 2 nucleotides of the last computed window.
    @index_window_end = qual.size - @window
    @sum = create_sum_window(qual, ini, @index_window_end)
  end

  hq_start, hq_end = find_bounds_high_quality(@sum, ini, @index_window_end)

  # nothing found: hand the raw bounds back untouched
  return hq_start, hq_end if hq_start < 0

  # window indexes -> base positions; short regions get their own trimming
  if hq_start + @window >= hq_end
    cut_fine_bounds_short(qual, hq_start, hq_end)
  else
    cut_fine_bounds(qual, hq_start, hq_end)
  end
end
|
218
|
+
|
219
|
+
|
220
|
+
# Adds an ActionLowQuality covering start..p_begin-1 (the stretch before a
# high-quality region) to actions, provided p_begin - 1 is positive and at
# least half a window long.
#
# p_end is accepted but not used.
def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
  low_len = p_begin - 1

  return if low_len < (@window / 2)
  return unless p_begin > 0 && low_len > 0

  actions.push(seq.new_action(start, p_begin - 1, "ActionLowQuality"))
end
|
236
|
+
|
237
|
+
# Adds an ActionLowQuality for the stretch that follows a high-quality
# region, from p_end - seq.insert_start + 1 to the last base of
# seq.seq_fasta. Nothing is added unless seq.insert_end - p_end is positive
# and at least half a window long, and p_end is before the last base.
#
# p_begin is accepted but not used.
def add_action_after_high_qual(p_begin,p_end,actions,seq)
  low_len = seq.insert_end - p_end

  return if low_len < (@window / 2)
  return unless p_end < seq.seq_fasta.size - 1 && low_len > 0

  first_low = p_end - seq.insert_start + 1
  last_low = seq.seq_fasta.size - 1
  actions.push(seq.new_action(first_low, last_low, "ActionLowQuality"))
end
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
|
257
|
+
|
258
|
+
######################################################################
|
259
|
+
#---------------------------------------------------------------------
|
260
|
+
|
261
|
+
# Begins the plugin's execution with the sequence "seq"
|
262
|
+
# Creates an action by each subsequence with low quality to eliminate it
|
263
|
+
# A subsequence has low quality if (the sum of all its qualities < subsequence_size*min_quality, 20 by default)
|
264
|
+
# Creates the qualities windows from the sequence, looks for the subsequence with high quality
|
265
|
+
# and mark, with an action, the before part to the High Quality Subsequence like a low quality part
|
266
|
+
# Finally mark, with an action, the after part to the High Quality Subsequence like a low quality part
|
267
|
+
#-----------------------------------------------------------------
|
268
|
+
|
269
|
+
# Runs the low-quality check on every sequence of seqs, one at a time.
def execute(seqs)
  seqs.each { |one_seq| exec_seq(one_seq) }
end
|
274
|
+
|
275
|
+
|
276
|
+
# Marks the low-quality stretches of one sequence with ActionLowQuality.
#
# Reads 'min_quality' and 'window_width' from @params, then repeatedly asks
# find_high_quality for the next high-quality region. The stretch between
# the previous region and the new one becomes a low-quality action when it
# is at least half a window long. If no high-quality region is found
# (p_end < 0), everything after the previous region is marked.
#
# Sequences without qualities are skipped. Only PluginLowQuality itself logs
# an error for them. Previously a subclass instance fell through to
# seq.seq_qual.size and raised NoMethodError on nil.
def exec_seq(seq)
  if seq.seq_qual.nil?
    if self.class.to_s=='PluginLowQuality'
      $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
    end
    return
  end
  return unless seq.seq_qual.size>0

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"

  @low=@params.get_param('min_quality').to_i

  # the window can never be wider than the sequence itself
  @window=[@params.get_param('window_width').to_i, seq.seq_fasta.length].min
  @cut_off=@window*@low

  actions=[]
  p_begin,p_end = 0,-1 # bounds of the last high-quality region found

  while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size))
    p_end_old = p_end
    p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)

    # a high-quality region was found and the gap before it is big enough
    if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2))
      add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1)
      add_stats('low_qual',p_begin-1-p_end_old-1 + 1)
    end
  end

  # NOTE(review): this is the same condition the loop above just exited on,
  # so this branch cannot run as written. Kept as is; the intended
  # "low quality after the last region" rule needs confirming.
  if ((p_begin>=0) && (p_end+1<seq.seq_qual.size))
    add_action_after_high_qual(p_begin,p_end,actions,seq)
    add_stats('low_qual',seq.seq_fasta.size-1 - p_end-seq.insert_start-1 + 1)
  end

  # no high-quality region in the last search: mark the whole remainder.
  # p_end_old is always set here, since the loop runs at least once.
  if p_end<0
    a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality")
    add_stats('low_qual',seq.seq_fasta.size-1 - p_end_old-1 + 1)
    actions.push a
  end

  seq.add_actions(actions)
end
|
366
|
+
|
367
|
+
#-----------------------------------------------------------------
|
368
|
+
|
369
|
+
|
370
|
+
######################################################################
|
371
|
+
#---------------------------------------------------------------------
|
372
|
+
|
373
|
+
#Returns an array with the errors due to parameters are missing
|
374
|
+
# Declares the parameters this plugin needs (name, type, default, comment)
# through params.check_param.
#
# Returns the errors array that was passed to params.check_param.
def self.check_params(params)
  errors=[]

  [
    ['min_quality',  'Integer', 20, 'Minimum quality value for every nucleotide'],
    ['window_width', 'Integer', 15, 'Quality window for scanning low quality segments']
  ].each do |name, type, default_value, comment|
    params.check_param(errors,name,type,default_value,comment)
  end

  errors
end
|
390
|
+
|
391
|
+
|
392
|
+
private :find_high_quality
|
393
|
+
|
394
|
+
end
|
@@ -0,0 +1,231 @@
|
|
1
|
+
require "plugin"
|
2
|
+
require 'recover_mid'
|
3
|
+
include RecoverMid
|
4
|
+
|
5
|
+
########################################################
|
6
|
+
# Author: Almudena Bocinos Rioboo
|
7
|
+
#
|
8
|
+
# Defines the main methods that are necessary to execute PluginMids
|
9
|
+
# Inherit: Plugin
|
10
|
+
########################################################
|
11
|
+
|
12
|
+
class PluginMids < Plugin
|
13
|
+
SIZE_SEARCH_MID=20
|
14
|
+
MAX_MID_ERRORS = 2
|
15
|
+
#MIN_MID_SIZE = 7 # very important, don't touch
|
16
|
+
# DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
#Begins the plugin's execution to look for MIDs in the sequences "seqs"
|
21
|
+
# Blasts the whole batch once, then processes each sequence together with
# the blast query found at the same position in the results.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index { |one_seq, idx| exec_seq(one_seq, blast_results.querys[idx]) }
end
|
28
|
+
|
29
|
+
# Runs one batch blast (blastn-short against the 'mids_db' database) over
# the leading bases 0..SIZE_SEARCH_MID of every sequence.
#
# Returns whatever blast.do_blast returns for the batch.
def do_blasts(seqs)
  # -max_target_seqs keeps the number of MID hits per query small
  db_option = "-db #{@params.get_param('mids_db')}"
  search_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 "
  blast = BatchBlast.new(db_option,'blastn',search_options) #get mids
  $LOG.info('BLAST:'+blast.get_blast_cmd)

  # FASTA input as alternating header / sequence entries
  fastas = []
  seqs.each do |seq|
    fastas << (">"+seq.seq_name) << seq.seq_fasta[0..SIZE_SEARCH_MID]
  end

  blast.do_blast(fastas)
end
|
49
|
+
|
50
|
+
|
51
|
+
# Looks for a MID at the beginning of one sequence and records the result.
#
# seq         - the sequence being processed.
# blast_query - the blast result for this sequence; its query_id must match
#               seq.seq_name.
#
# The first non-reversed hit is used (or the first hit if all are reversed).
# It is discarded when it is reversed, has more than MAX_MID_ERRORS
# gaps+mismatches, starts before position 3, or starts too far from the
# beginning of the read. A hit at least db_mid_size-1 long yields an
# ActionMid, preceded by an ActionKey unless the sequence already has one.
# A hit at least db_mid_size-3 long goes through recover_mid first. When no
# MID is accepted and there is no key yet, a 4-base ActionKey (0..3) is added.
#
# Raises RuntimeError when the blast query and the sequence names differ.
def exec_seq(seq,blast_query)
  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"

  actions=[]
  file_tag='no_MID'
  key_size=0
  mid_size=0
  mid=nil
  mid_found = false
  key_already_found=!seq.get_actions(ActionKey).empty?

  if !blast_query.hits.empty?
    # prefer the first non-reversed hit; fall back to the first hit
    mid = blast_query.hits.find { |hit| !hit.reversed } || blast_query.hits[0]

    mid_size=mid.q_end-mid.q_beg+1

    db_mid=@params.get_mid(mid.subject_id)
    db_mid_size = db_mid.size # MID size as stored in the DB

    mid_initial_pos=mid.q_beg-mid.s_beg

    sequencing_key=@params.get_param('sequencing_key')
    has_full_key = sequencing_key ? !seq.seq_fasta.index(sequencing_key).nil? : false

    if mid.reversed
      # discard mid
    elsif (mid.gaps+mid.mismatches > MAX_MID_ERRORS)
      # discard mid: too many errors and gaps
    elsif (mid.q_beg<3)
      # discard mid: it starts below position 3
    elsif (has_full_key && (mid_initial_pos >=6))
      # discard mid
    elsif (!has_full_key && (mid_initial_pos >=7))
      # discard mid
    elsif (mid_size >= db_mid_size-1)
      # MID found with enough size: create key and mid
      key_size=mid.q_beg

      if key_size>0 && !key_already_found
        actions.push seq.new_action(0,mid.q_beg-1,"ActionKey")
      end

      a = seq.new_action(mid.q_beg,mid.q_end,"ActionMid")
      a.message = mid.subject_id
      a.tag_id = mid.subject_id
      file_tag = mid.subject_id
      actions.push a

      mid_found = true

    elsif (mid_size >= db_mid_size-3)
      # To recover a MID it must start or end in one edge
      if (mid.s_beg==0) || (mid.s_end==mid_size)

        new_q_beg, new_q_end, recovered_size,recovered_mid = recover_mid(mid, db_mid, seq.seq_fasta[0..SIZE_SEARCH_MID])

        # report the threshold that is really applied below (db_mid_size-1)
        $LOG.debug("Recover mid: #{recovered_mid} valid (#{recovered_size} >= #{db_mid_size-1}) = #{recovered_size>=db_mid_size-1}, #{seq.seq_fasta[new_q_beg..new_q_end]}")

        if recovered_size >= db_mid_size-1
          mid_size = recovered_size
          key_size=new_q_beg

          $LOG.debug "RECOVER OUTPUT: #{new_q_beg} #{new_q_end} #{recovered_size}"

          if key_size>0 && !key_already_found
            actions.push seq.new_action(0,new_q_beg-1,"ActionKey")
          end

          a = seq.new_action(new_q_beg,new_q_end,"ActionMid")
          a.message = "Recovered " + mid.subject_id
          a.tag_id = mid.subject_id
          file_tag = mid.subject_id
          actions.push a
          add_stats('recovered_mid_id',mid.subject_id)

          mid_found = true
        end
      end
    end
  end

  if !mid_found && !key_already_found # MID not found, take only the key
    mid_size=0
    key_size=4
    actions.push seq.new_action(0,3,'ActionKey')
  end

  seq.add_actions(actions)

  seq.add_file_tag(1, file_tag, :both)

  if (mid_found) # a MID was accepted, possibly with up to MAX_MID_ERRORS errors
    add_stats('mid_id',mid.subject_id)
    add_stats('mid_id','total')

    # MID size count by ID
    add_stats(mid.subject_id,mid_size)

    if (mid.gaps+mid.mismatches > 0)
      add_stats('mid_with_errors',mid.gaps+mid.mismatches)
    end
  end

  if !key_already_found
    add_stats('key_size',key_size)
    add_stats('mid_size',mid_size)
  end
end
|
210
|
+
|
211
|
+
#Returns an array with the errors due to parameters are missing
|
212
|
+
# Declares the parameters this plugin needs (name, type, default, comment)
# through params.check_param.
#
# Returns the errors array that was passed to params.check_param.
def self.check_params(params)
  errors=[]

  params.check_param(errors,'blast_evalue_mids','Float',1e-10,
                     'Blast E-value used as cut-off when searching for MIDs')

  params.check_param(errors,'blast_percent_mids','Integer',95,
                     'Minimum required identity (%) for a reliable MID')

  mids_db_default = File.join($FORMATTED_DB_PATH,'mids.fasta')
  params.check_param(errors,'mids_db','DB',mids_db_default,'Path for MID database')

  errors
end
|
229
|
+
|
230
|
+
|
231
|
+
end
|