seqtrimnext 2.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,394 @@
1
+ require "plugin"
2
+
3
+ ########################################################
4
+ # Author: Almudena Bocinos Rioboo
5
+ #
6
+ # Defines the main methods that are necessary to execute PluginLowQuality. See the main method called execute.
7
+
8
+ #
9
+ # Inherit: Plugin
10
+ ########################################################
11
+
12
+ class PluginLowQuality < Plugin
13
+
14
+
15
+
16
# Builds the sliding-window quality sums used to locate high quality regions.
#
# Reads @window (window width, in positions).
#
# qual:: array with one quality value per position
# ini:: index of the first window to compute
# index_window_end:: index of the last window to compute
#
# Returns an array where element i (ini <= i <= index_window_end) holds the
# sum of the @window quality values starting at position i. Elements below
# +ini+ are left as nil.
def create_sum_window(qual, ini, index_window_end)
  sum = []

  # start every requested window at zero
  (ini..index_window_end).each { |pos| sum[pos] = 0 }

  # the first window is summed explicitly ...
  (ini...(ini + @window)).each { |pos| sum[ini] += qual[pos] }

  # ... every following one is derived from its predecessor: drop the value
  # that left the window and add the one that entered it
  ((ini + 1)..index_window_end).each do |pos|
    sum[pos] = sum[pos - 1] - qual[pos - 1] + qual[pos + @window - 1]
  end

  sum
end
50
+
51
# Finds the longest run of consecutive windows whose sum reaches @cut_off.
#
# sum:: window sums, indexed by window start position
# ini:: first window index to inspect
# index_window_end:: last window index to inspect
#
# Returns [start, end] (window indexes) of the selected run. When two runs
# have the same length the later one wins. start is -1 when no run exists.
# When +ini+ is already past +index_window_end+ the result is
# [-1, index_window_end], so the caller can tell "ran past the last window"
# apart from "no high quality found" ([-1, -1]).
def find_bounds_high_quality(sum, ini, index_window_end)
  best_start = -1
  best_end = -1

  # the search begins beyond the last window: report the last window index
  best_end = index_window_end if ini > index_window_end

  # run_start is the first window of the run currently open, or -1
  run_start = (ini <= index_window_end && sum[ini] >= @cut_off) ? ini : -1

  ((ini + 1)..index_window_end).each do |pos|
    if sum[pos] >= @cut_off
      run_start = pos if run_start < 0 # a run has just started
      next
    end

    next if run_start < 0 # still outside any run

    # the run has just finished at pos-1: keep it if it is at least as long
    # as the best one seen so far
    if (pos - 1 - run_start) >= (best_end - best_start)
      best_start, best_end = run_start, pos - 1
    end
    run_start = -1
  end

  # a run that is still open reaches the last window
  if run_start != -1 && (index_window_end - run_start) >= (best_end - best_start)
    best_start, best_end = run_start, index_window_end
  end

  return best_start, best_end
end
107
+
108
# Refines the bounds of a high quality region to nucleotide positions.
# find_high_quality uses this variant when new_start + @window >= new_end.
#
# qual:: array with one quality value per position
# new_start:: index of the first window of the region
# new_end:: index of the last window of the region
#
# The start is moved forward to the first position of the first window whose
# quality is >= @low (or past that window when there is none). The end is
# moved to the last position of the last window whose quality is >= @low
# (or to new_end - 1 when there is none).
#
# Returns [new_start, new_end].
def cut_fine_bounds_short(qual, new_start, new_end)
  # offset of the first acceptable value inside the first window
  lead = (0...@window).find { |offset| qual[new_start + offset] >= @low } || @window

  # offset of the last acceptable value inside the last window
  tail = (@window - 1).downto(0).find { |offset| qual[new_end + offset] >= @low } || -1

  return new_start + lead, new_end + tail
end
135
+
136
+
137
+ # cuts fine the high quality bounds
138
# Refines the bounds of a high quality region to nucleotide positions.
# find_high_quality uses this variant when the region is longer than one
# window (new_start + @window < new_end).
#
# qual:: array with one quality value per position
# new_start:: index of the first window of the region
# new_end:: index of the last window of the region
#
# The first window is scanned from right to left and the last window from
# left to right. In both scans low values (< @low) are skipped until the
# first acceptable value has been seen; the scan then stops at the next low
# value, which becomes the (excluded) border of the region.
#
# The flags used to be initialised with 0, which is truthy in Ruby, so the
# scans stopped at the first low value even if no acceptable value had been
# seen yet. Real booleans restore the "stop only after one good value" rule.
#
# Returns [new_start, new_end].
def cut_fine_bounds(qual, new_start, new_end)
  # --- left border: walk the first window backwards
  seen_ok = false
  offset = @window - 1
  while offset >= 0
    if qual[new_start + offset] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    offset -= 1
  end
  new_start += offset + 1

  # --- right border: walk the last window forwards
  seen_ok = false
  offset = 0
  while offset < @window
    if qual[new_end + offset] < @low
      break if seen_ok
    else
      seen_ok = true
    end
    offset += 1
  end
  new_end += offset - 1

  return new_start, new_end
end
173
+
174
# Looks for the next high quality region of +qual+, starting at window +ini+.
#
# On the first scan of a sequence (ini == 0) the window sums are computed and
# cached in @sum / @index_window_end; later scans of the same sequence reuse
# them. @index_window_end is qual.size - @window.
#
# Returns [start, end] of the region found, refined by cut_fine_bounds_short
# or cut_fine_bounds. When find_bounds_high_quality reports a negative start
# its result is returned unchanged.
def find_high_quality(qual, ini = 0)
  if ini == 0
    @index_window_end = qual.size - @window
    @sum = create_sum_window(qual, ini, @index_window_end)
  end

  region_start, region_end = find_bounds_high_quality(@sum, ini, @index_window_end)

  # no region found: nothing to refine
  return region_start, region_end if region_start < 0

  if region_start + @window >= region_end
    region_start, region_end = cut_fine_bounds_short(qual, region_start, region_end)
  else
    region_start, region_end = cut_fine_bounds(qual, region_start, region_end)
  end

  return region_start, region_end
end
218
+
219
+
220
# Creates an ActionLowQuality for the stretch that precedes a high quality
# region and appends it to +actions+.
#
# p_begin:: first position of the high quality region
# p_end:: last position of the high quality region (not used here)
# actions:: array that receives the new action
# seq:: sequence object; must respond to new_action(start, end, type)
# start:: first position of the low quality stretch
#
# The action covers start..p_begin-1. It is only created when p_begin - 1 is
# positive and at least half a window wide.
# NOTE(review): the size test uses p_begin - 1 and does not take +start+
# into account - confirm this is intended.
def add_action_before_high_qual(p_begin, p_end, actions, seq, start)
  action_size = p_begin - 1

  # too small to be worth an action
  return if action_size < (@window / 2)
  return unless p_begin > 0 && action_size > 0

  low_quality_action = seq.new_action(start, p_begin - 1, "ActionLowQuality")
  actions.push low_quality_action
end
236
+
237
# Creates an ActionLowQuality for the stretch that follows a high quality
# region and appends it to +actions+.
#
# p_begin:: first position of the high quality region (not used here)
# p_end:: last position of the high quality region
# actions:: array that receives the new action
# seq:: sequence object; must respond to insert_start, insert_end, seq_fasta
#       and new_action(start, end, type)
#
# The action covers p_end - seq.insert_start + 1 up to the last position of
# seq.seq_fasta. It is only created when seq.insert_end - p_end is positive
# and at least half a window wide, and p_end is before the last position.
def add_action_after_high_qual(p_begin, p_end, actions, seq)
  action_size = seq.insert_end - p_end

  # too small to be worth an action
  return if action_size < (@window / 2)

  last_position = seq.seq_fasta.size - 1
  return unless p_end < last_position && action_size > 0

  low_quality_action = seq.new_action(p_end - seq.insert_start + 1, last_position, "ActionLowQuality")
  actions.push low_quality_action
end
253
+
254
+
255
+
256
+
257
+
258
+ ######################################################################
259
+ #---------------------------------------------------------------------
260
+
261
+ # Begins the plugin's execution with the sequence "seq"
262
+ # Creates an action by each subsequence with low quality to eliminate it
263
+ # A subsequence has low quality if (the sum of all its qualities < subsequence_size*20)
264
+ # Creates the qualities windows from the sequence, looks for the subsequence with high quality
265
+ # and mark, with an action, the before part to the High Quality Subsequence like a low quality part
266
+ # Finally mark, with an action, the after part to the High Quality Subsequence like a low quality part
267
+ #-----------------------------------------------------------------
268
+
269
# Runs the low quality scan (exec_seq) on every sequence of +seqs+.
#
# Returns whatever seqs.each returns.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
274
+
275
+
276
# Scans the qualities of +seq+ and marks its low quality stretches with
# ActionLowQuality actions.
#
# seq:: sequence object; must respond to seq_qual, seq_fasta, seq_name,
#       insert_start, new_action and add_actions
#
# Parameters read from @params: 'min_quality' (stored in @low) and
# 'window_width' (stored in @window, limited to the sequence length).
# @cut_off is @window * @low.
#
# High quality regions are searched repeatedly with find_high_quality. A low
# quality stretch found before a region gets an action when it is at least
# half a window wide. When a search ends with a negative end position, the
# rest of the sequence is marked as low quality. Every marked stretch is
# also recorded with add_stats('low_qual', size).
#
# Without qualities nothing is done. Only PluginLowQuality itself logs that
# as an error. Earlier, any other class reaching this method with nil
# qualities failed with NoMethodError on nil.size.
def exec_seq(seq)
  if seq.seq_qual.nil?
    if self.class.to_s == 'PluginLowQuality'
      $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
    end
    return
  end

  # empty quality list: nothing to scan
  return unless seq.seq_qual.size > 0

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"

  @low = @params.get_param('min_quality').to_i

  # the window can never be wider than the sequence itself
  @window = [@params.get_param('window_width').to_i, seq.seq_fasta.length].min
  @cut_off = @window * @low

  actions = []

  p_begin, p_end = 0, -1 # positions from high quality bounds
  p_end_old = nil        # end of the previous high quality region

  while (p_begin >= 0) && (p_end + 1 < seq.seq_qual.size)
    p_end_old = p_end
    p_begin, p_end = find_high_quality(seq.seq_qual, p_end + 1)

    # a high quality part was found and the low quality part before it has
    # enough size
    if (p_begin > 0) && (p_begin - p_end_old - 1 >= @window / 2)
      add_action_before_high_qual(p_begin, p_end, actions, seq, p_end_old + 1)
      low_qual = p_begin - 1 - p_end_old - 1 + 1
      add_stats('low_qual', low_qual)
    end
  end

  # NOTE(review): this is the negation of the loop's exit condition, so as
  # long as seq.seq_qual keeps its size during the loop this branch cannot
  # run. Kept as found; confirm whether a trailing action was intended.
  if (p_begin >= 0) && (p_end + 1 < seq.seq_qual.size)
    add_action_after_high_qual(p_begin, p_end, actions, seq)
    low_qual = seq.seq_fasta.size - 1 - p_end - seq.insert_start - 1 + 1
    add_stats('low_qual', low_qual)
  end

  # the last search found nothing: everything after the previous region is
  # low quality
  if p_end < 0 and p_end_old
    a = seq.new_action(p_end_old + 1, seq.seq_fasta.size - 1, "ActionLowQuality")
    low_qual = seq.seq_fasta.size - 1 - p_end_old - 1 + 1
    add_stats('low_qual', low_qual)
    actions.push a
  end

  seq.add_actions(actions)
end
366
+
367
+ #-----------------------------------------------------------------
368
+
369
+
370
+ ######################################################################
371
+ #---------------------------------------------------------------------
372
+
373
+ #Returns an array with the errors due to parameters are missing
374
# Declares the parameters used by this plugin through params.check_param.
#
# params:: parameter object; must respond to
#          check_param(errors, name, type, default_value, comment)
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  # name, type, default value, comment
  [
    ['min_quality',  'Integer', 20, 'Minimum quality value for every nucleotide'],
    ['window_width', 'Integer', 15, 'Quality window for scanning low quality segments']
  ].each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  errors
end
390
+
391
+
392
+ private :find_high_quality
393
+
394
+ end
@@ -0,0 +1,231 @@
1
+ require "plugin"
2
+ require 'recover_mid'
3
+ include RecoverMid
4
+
5
+ ########################################################
6
+ # Author: Almudena Bocinos Rioboo
7
+ #
8
+ # Defines the main methods that are necessary to execute PluginMids
9
+ # Inherit: Plugin
10
+ ########################################################
11
+
12
+ class PluginMids < Plugin
13
+ SIZE_SEARCH_MID=20
14
+ MAX_MID_ERRORS = 2
15
+ #MIN_MID_SIZE = 7 # very important, don't touch
16
+ # DB_MID_SIZE = 10 # DONE read formatted db and save the mid sizes
17
+
18
+
19
+
20
+ # Begins the plugin's execution: blasts the start of every sequence in "seqs" against the MID database and processes each sequence with its blast result
21
# Blasts all the sequences in one batch (do_blasts) and then processes each
# sequence together with the blast query found at the same position.
#
# Returns whatever seqs.each_with_index returns.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index do |sequence, position|
    exec_seq(sequence, blast_results.querys[position])
  end
end
28
+
29
# Runs one batch blastn-short search with the beginning of every sequence
# against the database given by the 'mids_db' parameter.
#
# seqs:: sequences; each must respond to seq_name and seq_fasta
#
# For every sequence a FASTA header line (">" + name) and the slice
# seq_fasta[0..SIZE_SEARCH_MID] are sent to the blast.
#
# Returns the object returned by BatchBlast#do_blast.
def do_blasts(seqs)
  # find MIDS with less results than max_target_seqs value
  db_option = "-db #{@params.get_param('mids_db')}"
  extra_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 "

  blast = BatchBlast.new(db_option, 'blastn', extra_options)
  $LOG.info('BLAST:' + blast.get_blast_cmd)

  # two entries per sequence: header and searched fragment
  fastas = seqs.flat_map do |seq|
    [">" + seq.seq_name, seq.seq_fasta[0..SIZE_SEARCH_MID]]
  end

  blast.do_blast(fastas)
end
49
+
50
+
51
# Looks for a MID at the beginning of +seq+ using its blast result and adds
# the corresponding ActionKey / ActionMid actions.
#
# seq:: sequence object; must respond to seq_name, seq_fasta, get_actions,
#       new_action, add_actions and add_file_tag
# blast_query:: blast result for this sequence (query_id, hits)
#
# Raises a RuntimeError when +blast_query+ does not belong to +seq+.
#
# The first hit is used, or the first non reversed hit when the first one is
# reversed. The hit is discarded when it is reversed, has more than
# MAX_MID_ERRORS gaps + mismatches, starts before query position 3, or when
# q_beg - s_beg is >= 6 (full sequencing key present) or >= 7 (otherwise).
# A hit that is at most one position shorter than the database MID is
# accepted directly. A hit up to three positions shorter is handed to
# recover_mid and accepted when the recovered MID is long enough.
# When no MID is accepted and no key was found before, positions 0..3 are
# marked as key.
#
# The sequence is tagged with the MID id (or 'no_MID') and statistics are
# recorded with add_stats.
def exec_seq(seq, blast_query)
  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"

  actions = []
  file_tag = 'no_MID'

  key_size = 0
  mid_size = 0

  key_already_found = !seq.get_actions(ActionKey).empty?
  mid_found = false

  if !blast_query.hits.empty? # mid found
    # select the first non reversed hit, or the first hit when all are reversed
    mid = blast_query.hits.find { |hit| !hit.reversed } || blast_query.hits[0]

    mid_size = mid.q_end - mid.q_beg + 1

    db_mid = @params.get_mid(mid.subject_id)
    db_mid_size = db_mid.size # get mid's size from DB

    mid_initial_pos = mid.q_beg - mid.s_beg
    has_full_key = false
    if @params.get_param('sequencing_key')
      has_full_key = !seq.seq_fasta.index(@params.get_param('sequencing_key')).nil?
    end

    discard_mid = mid.reversed ||
                  (mid.gaps + mid.mismatches > MAX_MID_ERRORS) || # too many errors and gaps
                  (mid.q_beg < 3) ||                              # found mid starts below 3
                  (has_full_key && (mid_initial_pos >= 6)) ||
                  (!has_full_key && (mid_initial_pos >= 7))

    if discard_mid
      # discard mid
    elsif (mid_size >= db_mid_size - 1) # MID found and its size is enough, THEN create key and mid
      key_beg, key_end = [0, mid.q_beg - 1]
      key_size = mid.q_beg

      # Create an ActionKey before the ActionMid
      if key_size > 0 && !key_already_found
        a = seq.new_action(key_beg, key_end, "ActionKey")
        actions.push a
      end

      # Create an ActionMid
      a = seq.new_action(mid.q_beg, mid.q_end, "ActionMid")
      a.message = mid.subject_id
      a.tag_id = mid.subject_id
      file_tag = mid.subject_id
      actions.push a

      mid_found = true

    elsif (mid_size >= db_mid_size - 3)
      # To recover a MID it must start or end in one edge
      # NOTE(review): s_end is compared with the matched size (mid_size), not
      # with the database MID size - confirm this is the intended edge test.
      if (mid.s_beg == 0) || (mid.s_end == mid_size)
        new_q_beg, new_q_end, recovered_size, recovered_mid = recover_mid(mid, db_mid, seq.seq_fasta[0..SIZE_SEARCH_MID])

        # the message reports the threshold that is really applied below
        # (it used to print a fixed 10-1)
        min_valid_size = db_mid_size - 1
        $LOG.debug("Recover mid: #{recovered_mid} valid (#{recovered_size} >= #{min_valid_size}) = #{recovered_size >= min_valid_size}, #{seq.seq_fasta[new_q_beg..new_q_end]}")

        if recovered_size >= min_valid_size
          mid_size = recovered_size

          # the recovered MID is long enough: create an action key and mid
          key_beg, key_end = [0, new_q_beg - 1]
          key_size = new_q_beg

          $LOG.debug "RECOVER OUTPUT: #{new_q_beg} #{new_q_end} #{recovered_size}"

          # Create an ActionKey before the ActionMid
          if key_size > 0 && !key_already_found
            a = seq.new_action(key_beg, key_end, "ActionKey")
            actions.push a
          end

          # Create an ActionMid to a recovered mid
          a = seq.new_action(new_q_beg, new_q_end, "ActionMid")
          a.message = "Recovered " + mid.subject_id
          a.tag_id = mid.subject_id
          file_tag = mid.subject_id
          actions.push a
          add_stats('recovered_mid_id', mid.subject_id)

          mid_found = true
        end
      end
    end
  end

  if !mid_found && !key_already_found # MID not found, take only the key
    mid_size = 0
    key_beg, key_end = [0, 3]
    key_size = 4
    a = seq.new_action(key_beg, key_end, 'ActionKey')
    actions.push a
  end

  # Add actions
  seq.add_actions(actions)

  seq.add_file_tag(1, file_tag, :both)

  if (mid_found)
    add_stats('mid_id', mid.subject_id)
    add_stats('mid_id', 'total')

    # save MID count by ID
    add_stats(mid.subject_id, mid_size)

    if (mid.gaps + mid.mismatches > 0)
      add_stats('mid_with_errors', mid.gaps + mid.mismatches)
    end
  end

  if !key_already_found
    add_stats('key_size', key_size)
    add_stats('mid_size', mid_size)
  end
end
210
+
211
+ #Returns an array with the errors due to parameters are missing
212
# Declares the parameters used by this plugin through params.check_param.
#
# params:: parameter object; must respond to
#          check_param(errors, name, type, default_value, comment)
#
# The default MID database is 'mids.fasta' inside $FORMATTED_DB_PATH.
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  params.check_param(errors, 'blast_evalue_mids', 'Float', 1e-10,
                     'Blast E-value used as cut-off when searching for MIDs')

  params.check_param(errors, 'blast_percent_mids', 'Integer', 95,
                     'Minimum required identity (%) for a reliable MID')

  params.check_param(errors, 'mids_db', 'DB', File.join($FORMATTED_DB_PATH, 'mids.fasta'),
                     'Path for MID database')

  errors
end
229
+
230
+
231
+ end