seqtrimnext 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,394 @@
1
+ require "plugin"
2
+
3
+ ########################################################
4
+ # Author: Almudena Bocinos Rioboo
5
+ #
6
+ # Defines the main methods that are necessary to execute PluginLowQuality. See the main method called execute.
7
+
8
+ #
9
+ # Inherit: Plugin
10
+ ########################################################
11
+
12
+ class PluginLowQuality < Plugin
13
+
14
+
15
+
16
# Builds the table of sliding-window quality sums.
#
# qual             - Array with one quality value per base.
# ini              - Index of the first window to compute.
# index_window_end - Index of the last window to compute.
#
# Returns an Array in which element i (ini <= i <= index_window_end) holds
# the sum of the @window qualities starting at base i. Positions below ini
# are left as nil.
def create_sum_window(qual, ini, index_window_end)
  sums = []
  (ini..index_window_end).each { |pos| sums[pos] = 0 }

  # The first window is added up base by base.
  (ini...(ini + @window)).each { |pos| sums[ini] += qual[pos] }

  # Every following window reuses the previous sum: subtract the base that
  # left the window and add the one that entered it.
  ((ini + 1)..index_window_end).each do |pos|
    sums[pos] = sums[pos - 1] - qual[pos - 1] + qual[pos + @window - 1]
  end

  return sums
end
50
+
51
# Looks for the longest run of consecutive windows whose sum reaches @cut_off.
#
# sum              - Array of window sums, indexed by window start position.
# ini              - Index of the first window to inspect.
# index_window_end - Index of the last window to inspect.
#
# Returns [first_window, last_window] of the chosen run. When two runs have
# the same length the later one is kept (the comparisons use >=).
# Returns [-1, -1] when no window reaches the cut-off, and
# [-1, index_window_end] when ini is already past the last window, so the
# caller can tell "ran out of windows" apart from "no high quality found".
def find_bounds_high_quality(sum, ini, index_window_end)
  best_first = -1
  best_last = ini > index_window_end ? index_window_end : -1

  # Start of the run currently being followed, or -1 when outside a run.
  run_first = (ini <= index_window_end && sum[ini] >= @cut_off) ? ini : -1

  ((ini + 1)..index_window_end).each do |pos|
    if sum[pos] >= @cut_off
      run_first = pos if run_first < 0 # just entered a run
    elsif run_first >= 0
      # Just left a run: keep it if it is at least as long as the best one.
      if (pos - 1 - run_first) >= (best_last - best_first)
        best_first, best_last = run_first, pos - 1
      end
      run_first = -1
    end
  end

  # A run still open here extends up to the last window.
  if run_first != -1 && (index_window_end - run_first) >= (best_last - best_first)
    best_first, best_last = run_first, index_window_end
  end

  return best_first, best_last
end
107
+
108
# Refines the bounds of a short high quality region.
#
# qual      - Array with one quality value per base.
# new_start - Index of the first window of the region.
# new_end   - Index of the last window of the region.
#
# Returns [start, end]: start is moved forward to the first base of the first
# window whose quality reaches @low, and end is moved to the last base of the
# last window whose quality reaches @low.
def cut_fine_bounds_short(qual, new_start, new_end)
  # Skip low quality bases at the head of the first window.
  lead = 0
  lead += 1 until lead >= @window || qual[new_start + lead] >= @low
  new_start += lead

  # Walk back from the tail of the last window to its last good base.
  tail = @window - 1
  tail -= 1 until tail < 0 || qual[new_end + tail] >= @low
  new_end += tail

  return new_start, new_end
end
135
+
136
+
137
+ # cuts fine the high quality bounds
138
# Refines the bounds of a high quality region that spans several windows.
#
# qual      - Array with one quality value per base.
# new_start - Index of the first window of the region.
# new_end   - Index of the last window of the region.
#
# The first window is scanned from its last base backwards: low quality bases
# are skipped until a good one (>= @low) has been seen, and the scan then
# stops at the next low base; the region starts right after it. The last
# window is scanned forwards in the mirrored way.
#
# Returns [start, end] as base positions.
def cut_fine_bounds(qual, new_start, new_end)
  # The "seen a good base" flags must be real booleans: in Ruby 0 is truthy,
  # so an integer flag initialised to 0 would stop the scan at the very first
  # low base and throw away the good bases of the whole edge window.
  seen_good = false
  i = @window - 1
  while i >= 0
    if qual[new_start + i] < @low
      break if seen_good
    else
      seen_good = true
    end
    i -= 1
  end
  new_start += i + 1

  seen_good = false
  i = 0
  while i < @window
    if qual[new_end + i] < @low
      break if seen_good
    else
      seen_good = true
    end
    i += 1
  end
  new_end += i - 1

  return new_start, new_end
end
173
+
174
# Finds the next high quality region of a sequence, starting at base ini.
#
# qual - Array with one quality value per base.
# ini  - Base position from which to search (default 0).
#
# The window sums are computed only when ini is 0 and are kept in @sum
# (together with @index_window_end) for the following calls on the same
# sequence.
#
# Returns [start, end] of the region; start is negative when
# find_bounds_high_quality found no region.
def find_high_quality(qual, ini = 0)
  if ini == 0
    # Do not subtract 1 here, or the last nucleotide of the sequence is lost.
    # TODO: in Juan's seqtrim, iwe (called index_window_end here) loses
    # 2 nucleotides of the last computed window.
    @index_window_end = qual.size - @window
    @sum = create_sum_window(qual, ini, @index_window_end)
  end

  hq_start, hq_end = find_bounds_high_quality(@sum, ini, @index_window_end)

  if hq_start >= 0
    if hq_start + @window >= hq_end
      hq_start, hq_end = cut_fine_bounds_short(qual, hq_start, hq_end)
    else
      hq_start, hq_end = cut_fine_bounds(qual, hq_start, hq_end)
    end
  end

  return hq_start, hq_end
end
218
+
219
+
220
# Adds an ActionLowQuality covering the part that precedes a high quality
# region.
#
# p_begin - First base of the high quality region.
# p_end   - Last base of the high quality region (not used here).
# actions - Array that receives the new action.
# seq     - Sequence on which the action is created.
# start   - First base of the low quality part.
#
# Nothing is added when p_begin - 1 is smaller than half a window or is not
# positive.
def add_action_before_high_qual(p_begin, p_end, actions, seq, start)
  low_span = p_begin - 1
  return if low_span < (@window / 2)
  return unless p_begin > 0 && low_span > 0

  low_action = seq.new_action(start, p_begin - 1, "ActionLowQuality")
  actions.push low_action
end
236
+
237
# Adds an ActionLowQuality covering the part that follows a high quality
# region, up to the last base of the sequence.
#
# p_begin - First base of the high quality region (not used here).
# p_end   - Last base of the high quality region.
# actions - Array that receives the new action.
# seq     - Sequence on which the action is created.
#
# Nothing is added when seq.insert_end - p_end is smaller than half a window
# or is not positive, or when p_end is already the last base.
def add_action_after_high_qual(p_begin, p_end, actions, seq)
  low_span = seq.insert_end - p_end
  return if low_span < (@window / 2)

  last_base = seq.seq_fasta.size - 1
  return unless p_end < last_base && low_span > 0

  low_action = seq.new_action(p_end - seq.insert_start + 1, last_base, "ActionLowQuality")
  actions.push low_action
end
253
+
254
+
255
+
256
+
257
+
258
+ ######################################################################
259
+ #---------------------------------------------------------------------
260
+
261
+ # Begins the plugin's execution with the sequence "seq"
262
+ # Creates an action by each subsequence with low quality to eliminate it
263
+ # A subsequence has low quality if (the sum of all its qualities < subsequence_size*20)
264
+ # Creates the qualities windows from the sequence, looks for the subsequence with high quality
265
+ # and marks, with an action, the part before the High Quality Subsequence as a low quality part
266
+ # Finally marks, with an action, the part after the High Quality Subsequence as a low quality part
267
+ #-----------------------------------------------------------------
268
+
269
# Runs the low quality check on every sequence of the batch.
#
# seqs - Collection of sequences; each one is handed to exec_seq.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
274
+
275
+
276
# Marks the low quality parts of one sequence with ActionLowQuality actions.
#
# seq - Sequence to check; its seq_qual, seq_fasta, seq_name and insert_start
#       are read, and the resulting actions are added to it.
#
# The window width is the 'window_width' parameter, limited to the sequence
# length, and a window counts as high quality when its sum reaches
# window * 'min_quality'. High quality regions are searched one after
# another; the gap before each region is marked when it is at least half a
# window long. When a search finds no region at all, everything from the end
# of the previous region to the end of the sequence is marked.
def exec_seq(seq)
  plugin_name = self.class.to_s

  if plugin_name == 'PluginLowQuality' && seq.seq_qual.nil?
    $LOG.error " Quality File haven't been provided. It's impossible to execute " + plugin_name
  elsif seq.seq_qual.size > 0
    $LOG.info "[#{plugin_name}, seq: #{seq.seq_name}]: checking low quality of the sequence"

    @low = @params.get_param('min_quality').to_i

    requested_width = @params.get_param('window_width').to_i
    @window = requested_width > seq.seq_fasta.length ? seq.seq_fasta.length : requested_width
    @cut_off = @window * @low

    marked = []
    hq_begin, hq_end = 0, -1 # bounds of the last high quality region found
    prev_end = nil

    while hq_begin >= 0 && hq_end + 1 < seq.seq_qual.size
      prev_end = hq_end
      hq_begin, hq_end = find_high_quality(seq.seq_qual, hq_end + 1)

      # Bases between the previous region and the one just found.
      gap = hq_begin - prev_end - 1
      if hq_begin > 0 && gap >= @window / 2
        add_action_before_high_qual(hq_begin, hq_end, marked, seq, prev_end + 1)
        add_stats('low_qual', gap)
      end
    end

    # NOTE(review): this is the same condition that keeps the loop above
    # running, so this branch looks unreachable once the loop has ended -
    # confirm before relying on it.
    if hq_begin >= 0 && hq_end + 1 < seq.seq_qual.size
      add_action_after_high_qual(hq_begin, hq_end, marked, seq)
      add_stats('low_qual', seq.seq_fasta.size - 1 - hq_end - seq.insert_start - 1 + 1)
    end

    # No region found by the last search: mark everything that is left.
    if hq_end < 0 and prev_end
      whole_tail = seq.new_action(prev_end + 1, seq.seq_fasta.size - 1, "ActionLowQuality")
      add_stats('low_qual', seq.seq_fasta.size - 1 - prev_end - 1 + 1)
      marked.push whole_tail
    end

    seq.add_actions(marked)
  end
end
366
+
367
+ #-----------------------------------------------------------------
368
+
369
+
370
+ ######################################################################
371
+ #---------------------------------------------------------------------
372
+
373
+ #Returns an array with the errors due to parameters are missing
374
# Checks the parameters used by this plugin.
#
# params - Object whose check_param is called once per parameter with the
#          error list, the name, the type, the default value and a comment.
#
# Returns the Array of errors filled in by params.check_param.
def self.check_params(params)
  errors = []

  [
    ['min_quality',  'Integer', 20, 'Minimum quality value for every nucleotide'],
    ['window_width', 'Integer', 15, 'Quality window for scanning low quality segments']
  ].each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  return errors
end
390
+
391
+
392
+ private :find_high_quality
393
+
394
+ end
@@ -0,0 +1,231 @@
1
+ require "plugin"
2
+ require 'recover_mid'
3
+ include RecoverMid
4
+
5
+ ########################################################
6
+ # Author: Almudena Bocinos Rioboo
7
+ #
8
+ # Defines the main methods that are necessary to execute PluginMids
9
+ # Inherit: Plugin
10
+ ########################################################
11
+
12
+ class PluginMids < Plugin
13
+ SIZE_SEARCH_MID=20
14
+ MAX_MID_ERRORS = 2
15
+ #MIN_MID_SIZE = 7 # very important, don't touch
16
+ # DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
17
+
18
+
19
+
20
+ # Begins the plugin's execution: blasts the batch of sequences against the MID database and processes each sequence with its blast result
21
# Runs one batch blast for all the sequences and then processes every
# sequence together with the blast query found at the same position.
#
# seqs - Collection of sequences to process.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index do |sequence, position|
    exec_seq(sequence, blast_results.querys[position])
  end
end
28
+
29
# Blasts the beginning of every sequence against the MID database.
#
# seqs - Collection of sequences; seq_name and seq_fasta are read from each.
#
# Returns whatever BatchBlast#do_blast returns for the generated FASTA lines.
def do_blasts(seqs)
  # find MIDS with less results than max_target_seqs value
  db_option = "-db #{@params.get_param('mids_db')}"
  extra_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 "
  blast = BatchBlast.new(db_option, 'blastn', extra_options)
  $LOG.info('BLAST:'+blast.get_blast_cmd)

  # Two lines per sequence: the FASTA header and the head of the sequence
  # (0..SIZE_SEARCH_MID is an inclusive range).
  fasta_lines = []
  seqs.each do |seq|
    fasta_lines << ">"+seq.seq_name << seq.seq_fasta[0..SIZE_SEARCH_MID]
  end

  return blast.do_blast(fasta_lines)
end
49
+
50
+
51
# Looks for a MID at the beginning of one sequence and adds the matching
# ActionKey / ActionMid actions, the file tag and the statistics.
#
# seq         - Sequence being processed.
# blast_query - Blast result for this sequence; its query_id must be equal
#               to seq.seq_name.
#
# Raises RuntimeError when the blast result belongs to another sequence.
def exec_seq(seq, blast_query)
  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"

  actions = []
  file_tag = 'no_MID'

  key_size = 0
  mid_size = 0

  key_already_found = !seq.get_actions(ActionKey).empty?
  mid_found = false

  if !blast_query.hits.empty?
    # Take the first non-reversed hit; fall back to the first hit when all of
    # them are reversed (it is discarded below).
    mid = blast_query.hits.find { |hit| !hit.reversed } || blast_query.hits[0]

    mid_size = mid.q_end - mid.q_beg + 1

    db_mid = @params.get_mid(mid.subject_id)
    db_mid_size = db_mid.size # MID size as stored in the database

    mid_initial_pos = mid.q_beg - mid.s_beg

    sequencing_key = @params.get_param('sequencing_key')
    has_full_key = sequencing_key ? !seq.seq_fasta.index(sequencing_key).nil? : false

    if mid.reversed
      # discard mid
    elsif (mid.gaps + mid.mismatches > MAX_MID_ERRORS)
      # discard mid: too many gaps and mismatches
    elsif (mid.q_beg < 3)
      # discard mid: it starts below position 3
    elsif (has_full_key && (mid_initial_pos >= 6))
      # discard mid
    elsif (!has_full_key && (mid_initial_pos >= 7))
      # discard mid
    elsif (mid_size >= db_mid_size - 1)
      # MID found with enough size: create the key and the mid
      key_size = mid.q_beg

      # Create an ActionKey before the ActionMid
      if key_size > 0 && !key_already_found
        actions.push seq.new_action(0, mid.q_beg - 1, "ActionKey")
      end

      # Create an ActionMid
      a = seq.new_action(mid.q_beg, mid.q_end, "ActionMid")
      a.message = mid.subject_id
      a.tag_id = mid.subject_id
      file_tag = mid.subject_id
      actions.push a

      mid_found = true

    elsif (mid_size >= db_mid_size - 3)
      # To recover a MID it must start or end in one edge
      if (mid.s_beg == 0) || (mid.s_end == mid_size)

        new_q_beg, new_q_end, recovered_size, recovered_mid = recover_mid(mid, db_mid, seq.seq_fasta[0..SIZE_SEARCH_MID])

        # Report the same threshold that the check below really applies.
        $LOG.debug("Recover mid: #{recovered_mid} valid (#{recovered_size} >= #{db_mid_size-1}) = #{recovered_size>=db_mid_size-1}, #{seq.seq_fasta[new_q_beg..new_q_end]}")

        if recovered_size >= db_mid_size - 1
          mid_size = recovered_size

          # The recovered MID is long enough: create the key and the mid
          key_size = new_q_beg

          $LOG.debug "RECOVER OUTPUT: #{new_q_beg} #{new_q_end} #{recovered_size}"

          # Create an ActionKey before the ActionMid
          if key_size > 0 && !key_already_found
            actions.push seq.new_action(0, new_q_beg - 1, "ActionKey")
          end

          # Create an ActionMid for the recovered mid
          a = seq.new_action(new_q_beg, new_q_end, "ActionMid")
          a.message = "Recovered " + mid.subject_id
          a.tag_id = mid.subject_id
          file_tag = mid.subject_id
          actions.push a
          add_stats('recovered_mid_id', mid.subject_id)

          mid_found = true
        end
      end
    end
  end

  if !mid_found && !key_already_found # MID not found, take only the key
    mid_size = 0
    key_size = 4
    actions.push seq.new_action(0, 3, 'ActionKey')
  end

  # Add actions
  seq.add_actions(actions)

  seq.add_file_tag(1, file_tag, :both)

  if mid_found
    add_stats('mid_id', mid.subject_id)
    add_stats('mid_id', 'total')

    # save MID count by ID
    add_stats(mid.subject_id, mid_size)

    if (mid.gaps + mid.mismatches > 0)
      add_stats('mid_with_errors', mid.gaps + mid.mismatches)
    end
  end

  if !key_already_found
    add_stats('key_size', key_size)
    add_stats('mid_size', mid_size)
  end
end
210
+
211
+ #Returns an array with the errors due to parameters are missing
212
# Checks the parameters used by this plugin.
#
# params - Object whose check_param is called once per parameter with the
#          error list, the name, the type, the default value and a comment.
#
# Returns the Array of errors filled in by params.check_param.
def self.check_params(params)
  errors = []

  params.check_param(errors, 'blast_evalue_mids', 'Float', 1e-10,
                     'Blast E-value used as cut-off when searching for MIDs')

  params.check_param(errors, 'blast_percent_mids', 'Integer', 95,
                     'Minimum required identity (%) for a reliable MID')

  params.check_param(errors, 'mids_db', 'DB',
                     File.join($FORMATTED_DB_PATH, 'mids.fasta'),
                     'Path for MID database')

  return errors
end
229
+
230
+
231
+ end