seqtrimnext 2.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
require "plugin"
|
|
2
|
+
|
|
3
|
+
########################################################
|
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
|
5
|
+
#
|
|
6
|
+
# Defines the main methods that are necessary to execute PluginLowQuality. See the main method called execute.
|
|
7
|
+
|
|
8
|
+
#
|
|
9
|
+
# Inherit: Plugin
|
|
10
|
+
########################################################
|
|
11
|
+
|
|
12
|
+
class PluginLowQuality < Plugin
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Builds the sliding-window quality sums used to locate high quality regions.
#
# sum[k] holds qual[k] + ... + qual[k + @window - 1] for every k in
# ini..index_window_end. Positions below ini are left unset (nil).
#
# qual             - Array of per-base quality values.
# ini              - index of the first window to compute.
# index_window_end - index of the last window to compute.
#
# Returns the Array of window sums. It is empty when ini > index_window_end
# (no complete window fits), instead of failing on a nil entry.
def create_sum_window(qual, ini, index_window_end)
  sum = []
  return sum if ini > index_window_end

  # The first window is added up explicitly...
  sum[ini] = (ini...(ini + @window)).inject(0) { |acc, pos| acc + qual[pos] }

  # ...every following one is derived from its predecessor: drop the base
  # that leaves the window and add the one that enters it.
  (ini + 1).upto(index_window_end) do |pos|
    sum[pos] = sum[pos - 1] - qual[pos - 1] + qual[pos + @window - 1]
  end

  sum
end
|
|
50
|
+
|
|
51
|
+
# Finds the longest run of consecutive windows whose sum reaches @cut_off.
#
# sum              - Array of window sums (as built by create_sum_window).
# ini              - index of the first window to inspect.
# index_window_end - index of the last window to inspect.
#
# Returns [start, end] (window indexes) of the longest run; when two runs have
# the same length the later one wins. start is -1 when no run was found. When
# ini is already beyond index_window_end the result is [-1, index_window_end],
# so the caller can tell "past the last window" from "no high quality found".
def find_bounds_high_quality(sum, ini, index_window_end)
  best_start = -1
  best_end = ini > index_window_end ? index_window_end : -1

  # run_start is the first window of the run currently being followed (-1: none)
  run_start = (ini <= index_window_end && sum[ini] >= @cut_off) ? ini : -1

  (ini + 1).upto(index_window_end) do |pos|
    if sum[pos] >= @cut_off
      run_start = pos if run_start < 0 # just entered a run
    elsif run_start >= 0
      # just left a run: keep it if it is at least as long as the best so far
      if (pos - 1 - run_start) >= (best_end - best_start)
        best_start, best_end = run_start, pos - 1
      end
      run_start = -1
    end
  end

  # a run still open here extends up to the last window
  if run_start != -1 && (index_window_end - run_start) >= (best_end - best_start)
    best_start, best_end = run_start, index_window_end
  end

  [best_start, best_end]
end
|
|
107
|
+
|
|
108
|
+
# Refines the bounds of a short high quality region.
#
# The start is moved forward to the first base, within @window positions of
# new_start, whose quality is >= @low (or by a whole @window when there is
# none). The end is moved to the last base, within @window positions of
# new_end, whose quality is >= @low (or back by one when there is none).
#
# Returns [new_start, new_end] after the adjustment.
def cut_fine_bounds_short(qual, new_start, new_end)
  head_shift = (0...@window).find { |off| qual[new_start + off] >= @low } || @window
  tail_shift = (@window - 1).downto(0).find { |off| qual[new_end + off] >= @low } || -1

  [new_start + head_shift, new_end + tail_shift]
end
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# cuts fine the high quality bounds
|
|
138
|
+
# Refines the bounds of a high quality region that spans more than one window.
#
# Start: the first window (new_start .. new_start + @window - 1) is walked
# backwards; once a base with quality >= @low has been seen, the walk stops at
# the next base below @low and the region starts right after it.
# End: the last window (new_end .. new_end + @window - 1) is walked forwards
# with the same rule, and the region ends right before the stopping base.
#
# The "a good base has been seen" flags are real booleans. They used to be
# initialised to 0, which is truthy in Ruby, so the walk stopped at the very
# first low quality base even when no good base had been seen yet.
#
# Returns [new_start, new_end] after the adjustment.
def cut_fine_bounds(qual, new_start, new_end)
  seen_ok = false
  offset = @window - 1
  while offset >= 0
    if qual[new_start + offset] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    offset -= 1
  end
  new_start += offset + 1

  seen_ok = false
  offset = 0
  while offset < @window
    if qual[new_end + offset] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    offset += 1
  end
  new_end += offset - 1

  [new_start, new_end]
end
|
|
173
|
+
|
|
174
|
+
# Looks for the next high quality region of qual, starting at window ini.
#
# The window sums are (re)built only when ini is 0; later calls reuse the
# @sum and @index_window_end stored by that first call.
#
# Returns [start, end] of the region once refined by cut_fine_bounds_short or
# cut_fine_bounds, or the raw result of find_bounds_high_quality when its
# start is negative (nothing found).
def find_high_quality(qual, ini = 0)
  if ini == 0
    # the window size is not decremented here, otherwise the last nucleotide
    # of the sequence would be lost
    @index_window_end = qual.size - @window
    @sum = create_sum_window(qual, ini, @index_window_end)
  end

  hq_start, hq_end = find_bounds_high_quality(@sum, ini, @index_window_end)
  return [hq_start, hq_end] if hq_start < 0

  refined_start, refined_end =
    if hq_start + @window >= hq_end
      cut_fine_bounds_short(qual, hq_start, hq_end)
    else
      cut_fine_bounds(qual, hq_start, hq_end)
    end

  [refined_start, refined_end]
end
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# Pushes an ActionLowQuality covering start..p_begin-1 (the part placed before
# a high quality region) onto actions.
#
# Nothing is added unless p_begin is positive and p_begin - 1 is both positive
# and at least half of @window.
# NOTE(review): the size test uses p_begin - 1 and ignores start - confirm that
# this is intended when start > 0.
#
# p_end is accepted but not used here.
def add_action_before_high_qual(p_begin, p_end, actions, seq, start)
  last_low_pos = p_begin - 1
  return if last_low_pos < (@window / 2)
  return unless p_begin > 0 && last_low_pos > 0

  actions.push(seq.new_action(start, last_low_pos, "ActionLowQuality"))
end
|
|
236
|
+
|
|
237
|
+
# Pushes an ActionLowQuality covering the part placed after a high quality
# region (from p_end - seq.insert_start + 1 to the last base of seq.seq_fasta)
# onto actions.
#
# Nothing is added unless seq.insert_end - p_end is positive and at least half
# of @window, and p_end is before the last base.
#
# p_begin is accepted but not used here.
def add_action_after_high_qual(p_begin, p_end, actions, seq)
  trailing_size = seq.insert_end - p_end
  return if trailing_size < (@window / 2)

  last_pos = seq.seq_fasta.size - 1
  return unless p_end < last_pos && trailing_size > 0

  actions.push(seq.new_action(p_end - seq.insert_start + 1, last_pos, "ActionLowQuality"))
end
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
######################################################################
|
|
259
|
+
#---------------------------------------------------------------------
|
|
260
|
+
|
|
261
|
+
# Begins the plugin's execution with the sequence "seq"
|
|
262
|
+
# Creates an action by each subsequence with low quality to eliminate it
|
|
263
|
+
# A subsequence has low quality if (the sum of all its qualities < subsequence_size*min_quality; min_quality is 20 by default)
|
|
264
|
+
# Creates the qualities windows from the sequence, looks for the subsequence with high quality
|
|
265
|
+
# and mark, with an action, the before part to the High Quality Subsequence like a low quality part
|
|
266
|
+
# Finally mark, with an action, the after part to the High Quality Subsequence like a low quality part
|
|
267
|
+
#-----------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
# Runs the low quality check (exec_seq) on every sequence of seqs.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# Marks the low quality parts of seq with ActionLowQuality actions.
#
# The quality values are scanned with a sliding window of 'window_width'
# bases (shortened to the sequence length when the sequence is shorter); a
# window is good when its sum reaches window * 'min_quality'. Each high
# quality region is located with find_high_quality, and the stretch that
# precedes it is marked when it is at least half a window long. When the last
# search finds nothing, everything after the previous region is marked.
#
# An error is logged, and nothing else is done, when the plugin is exactly
# PluginLowQuality and seq has no qualities.
def exec_seq(seq)
  if (self.class.to_s == 'PluginLowQuality') && seq.seq_qual.nil?
    $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
  elsif seq.seq_qual.size > 0
    $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"

    @low = @params.get_param('min_quality').to_i
    # the window can never be wider than the sequence itself
    @window = [@params.get_param('window_width').to_i, seq.seq_fasta.length].min
    @cut_off = @window * @low

    actions = []
    p_begin, p_end = 0, -1 # bounds of the last high quality region found
    p_end_old = nil

    while (p_begin >= 0) && (p_end + 1 < seq.seq_qual.size)
      p_end_old = p_end
      p_begin, p_end = find_high_quality(seq.seq_qual, p_end + 1)

      # a high quality region was found and the low quality part before it is big enough
      if (p_begin > 0) && (p_begin - p_end_old - 1 >= @window / 2)
        add_action_before_high_qual(p_begin, p_end, actions, seq, p_end_old + 1)
        add_stats('low_qual', p_begin - p_end_old - 1)
      end
    end

    # NOTE(review): this condition is the same as the loop condition above, so
    # it is always false here and the branch never runs. Kept unchanged until
    # the intended handling of the trailing part is confirmed.
    if (p_begin >= 0) && (p_end + 1 < seq.seq_qual.size)
      add_action_after_high_qual(p_begin, p_end, actions, seq)
      add_stats('low_qual', seq.seq_fasta.size - 1 - p_end - seq.insert_start)
    end

    # the last search found nothing: the rest of the sequence is low quality
    if p_end < 0
      a = seq.new_action(p_end_old + 1, seq.seq_fasta.size - 1, "ActionLowQuality")
      add_stats('low_qual', seq.seq_fasta.size - 1 - p_end_old)
      actions.push a
    end

    seq.add_actions(actions)
  end
end
|
|
366
|
+
|
|
367
|
+
#-----------------------------------------------------------------
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
######################################################################
|
|
371
|
+
#---------------------------------------------------------------------
|
|
372
|
+
|
|
373
|
+
# Returns an array with the errors caused by missing or invalid parameters
|
|
374
|
+
# Checks the parameters used by this plugin through params.check_param:
# 'min_quality' (Integer, default 20) and 'window_width' (Integer, default 15).
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  params.check_param(errors, 'min_quality', 'Integer', 20,
                     'Minimum quality value for every nucleotide')
  params.check_param(errors, 'window_width', 'Integer', 15,
                     'Quality window for scanning low quality segments')

  errors
end
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
private :find_high_quality
|
|
393
|
+
|
|
394
|
+
end
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
require "plugin"
|
|
2
|
+
require 'recover_mid'
|
|
3
|
+
include RecoverMid
|
|
4
|
+
|
|
5
|
+
########################################################
|
|
6
|
+
# Author: Almudena Bocinos Rioboo
|
|
7
|
+
#
|
|
8
|
+
# Defines the main methods that are necessary to execute PluginMids
|
|
9
|
+
# Inherit: Plugin
|
|
10
|
+
########################################################
|
|
11
|
+
|
|
12
|
+
class PluginMids < Plugin
|
|
13
|
+
SIZE_SEARCH_MID=20
|
|
14
|
+
MAX_MID_ERRORS = 2
|
|
15
|
+
#MIN_MID_SIZE = 7 # very important, don't touch
|
|
16
|
+
# DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Begins the plugin's execution: blasts the beginning of every sequence against the MIDs database and processes each sequence with its blast result
|
|
21
|
+
# Blasts all the sequences at once (do_blasts) and then processes every
# sequence together with the blast query found at the same position.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index { |sequence, idx| exec_seq(sequence, blast_results.querys[idx]) }
end
|
|
28
|
+
|
|
29
|
+
# Runs one blastn-short search of the beginning of every sequence against the
# database given by the 'mids_db' parameter.
#
# For each sequence a ">name" line and the bases 0..SIZE_SEARCH_MID
# (inclusive range) of seq_fasta are sent to blast.
#
# Returns whatever BatchBlast#do_blast returns for that input.
def do_blasts(seqs)
  # find MIDS with less results than max_target_seqs value
  db_option = "-db #{@params.get_param('mids_db')}"
  search_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 "
  blast = BatchBlast.new(db_option, 'blastn', search_options)
  $LOG.info('BLAST:' + blast.get_blast_cmd)

  fastas = seqs.inject([]) do |lines, seq|
    lines << (">" + seq.seq_name) << seq.seq_fasta[0..SIZE_SEARCH_MID]
  end

  blast.do_blast(fastas)
end
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Looks for a MID at the beginning of seq using its blast result and adds the
# matching ActionKey / ActionMid actions, the file tag and the stats.
#
# The first non reversed hit is used (the first hit when all are reversed).
# The hit is discarded when it is reversed, has more than MAX_MID_ERRORS gaps
# plus mismatches, starts before position 3, or its projected initial position
# (q_beg - s_beg) is >= 6 when the sequencing key is present in the sequence
# (>= 7 otherwise).
# A hit covering at least db_mid_size - 1 bases is accepted as it is; a hit
# covering at least db_mid_size - 3 bases that touches one edge of the MID is
# passed to recover_mid and accepted when the recovered size reaches
# db_mid_size - 1.
# When no MID is accepted and no key action exists yet, the first four bases
# (0..3) are marked as key.
#
# Raises RuntimeError when blast_query does not belong to seq.
def exec_seq(seq, blast_query)
  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"

  actions = []
  file_tag = 'no_MID'
  key_size = 0
  mid_size = 0
  mid = nil
  mid_found = false
  key_already_found = !seq.get_actions(ActionKey).empty?

  unless blast_query.hits.empty?
    # take the first non-reversed hit, falling back to the first hit
    mid = blast_query.hits.find { |hit| !hit.reversed } || blast_query.hits[0]

    mid_size = mid.q_end - mid.q_beg + 1
    db_mid = @params.get_mid(mid.subject_id)
    db_mid_size = db_mid.size # MID's size in the DB
    mid_initial_pos = mid.q_beg - mid.s_beg

    sequencing_key = @params.get_param('sequencing_key')
    has_full_key = sequencing_key ? !seq.seq_fasta.index(sequencing_key).nil? : false

    discarded = mid.reversed ||
                (mid.gaps + mid.mismatches > MAX_MID_ERRORS) ||
                (mid.q_beg < 3) ||
                (mid_initial_pos >= (has_full_key ? 6 : 7))

    unless discarded
      mid_beg = nil
      mid_end = nil
      recovered = false

      if mid_size >= db_mid_size - 1
        # the hit is big enough by itself
        mid_beg, mid_end = mid.q_beg, mid.q_end
      elsif mid_size >= db_mid_size - 3 && ((mid.s_beg == 0) || (mid.s_end == mid_size))
        # To recover a MID it must start or end in one edge
        new_q_beg, new_q_end, recovered_size, recovered_mid = recover_mid(mid, db_mid, seq.seq_fasta[0..SIZE_SEARCH_MID])

        # the log reports the threshold that is really applied below
        min_valid_size = db_mid_size - 1
        $LOG.debug("Recover mid: #{recovered_mid} valid (#{recovered_size} >= #{min_valid_size}) = #{recovered_size >= min_valid_size}, #{seq.seq_fasta[new_q_beg..new_q_end]}")

        if recovered_size >= min_valid_size
          $LOG.debug "RECOVER OUTPUT: #{new_q_beg} #{new_q_end} #{recovered_size}"
          mid_size = recovered_size
          mid_beg, mid_end = new_q_beg, new_q_end
          recovered = true
        end
      end

      unless mid_beg.nil?
        # everything placed before the MID is the key
        key_size = mid_beg
        if key_size > 0 && !key_already_found
          actions.push seq.new_action(0, mid_beg - 1, "ActionKey")
        end

        a = seq.new_action(mid_beg, mid_end, "ActionMid")
        a.message = recovered ? "Recovered " + mid.subject_id : mid.subject_id
        a.tag_id = mid.subject_id
        file_tag = mid.subject_id
        actions.push a

        add_stats('recovered_mid_id', mid.subject_id) if recovered
        mid_found = true
      end
    end
  end

  if !mid_found && !key_already_found # MID not found, take only the key
    mid_size = 0
    key_size = 4
    actions.push seq.new_action(0, 3, 'ActionKey')
  end

  seq.add_actions(actions)
  seq.add_file_tag(1, file_tag, :both)

  if mid_found
    add_stats('mid_id', mid.subject_id)
    add_stats('mid_id', 'total')

    # save MID count by ID
    add_stats(mid.subject_id, mid_size)

    mid_error_count = mid.gaps + mid.mismatches
    add_stats('mid_with_errors', mid_error_count) if mid_error_count > 0
  end

  if !key_already_found
    add_stats('key_size', key_size)
    add_stats('mid_size', mid_size)
  end
end
|
|
210
|
+
|
|
211
|
+
# Returns an array with the errors caused by missing or invalid parameters
|
|
212
|
+
# Checks the parameters used by this plugin through params.check_param:
# 'blast_evalue_mids' (Float, default 1e-10), 'blast_percent_mids' (Integer,
# default 95) and 'mids_db' (DB, default mids.fasta inside $FORMATTED_DB_PATH).
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  params.check_param(errors, 'blast_evalue_mids', 'Float', 1e-10,
                     'Blast E-value used as cut-off when searching for MIDs')
  params.check_param(errors, 'blast_percent_mids', 'Integer', 95,
                     'Minimum required identity (%) for a reliable MID')

  mids_db_path = File.join($FORMATTED_DB_PATH, 'mids.fasta')
  params.check_param(errors, 'mids_db', 'DB', mids_db_path, 'Path for MID database')

  errors
end
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
end
|