seqtrimnext 2.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,393 @@
1
+ ########################################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ #
4
+ # Defines the main methods that are necessary to execute PluginFindPolyATs
5
+
6
+ #
7
+ # Inherit: Plugin
8
+ ########################################################
9
+
10
+ require "plugin"
11
+ require "global_match"
12
+
13
# Looks up the first already-collected region that overlaps the span
# mi_start..mi_end. The overlap decision itself is delegated to overlapX?,
# which is defined outside this file.
#
# polys    - Array of Hashes holding at least the keys 'begin' and 'end'.
# mi_start - start coordinate of the candidate span.
# mi_end   - end coordinate of the candidate span.
#
# Returns the first overlapping Hash, or nil when no region overlaps.
def overlap(polys, mi_start, mi_end)
  polys.find do |region|
    overlapX?(mi_start, mi_end, region['begin'], region['end'])
  end
end
21
+
22
+ # MAX_RUBBISH = 3
23
# Thresholds used by PluginFindPolyAt.

# A poly-T whose start is at or before this position is treated as being at
# the left end of the sequence.
MAX_POLY_T_FROM_LEFT = 4

# Minimum number of T/N bases for a poly-T that starts near (but not at) the
# left end.
MIN_TN_COUNT = 15

# A poly-A whose end lies within this many bases of the sequence end is
# treated as being at the right end.
MAX_POLY_A_FROM_RIGHT = 10

# Minimum size of a poly-A located away from the right end for it to be cut.
MIN_MIDDLE_POLY_A_SIZE = 35

# Minimum number of T/N bases for a poly-T located away from the left end.
MIN_MIDDLE_POLY_T_SIZE = 35

# Maximum distance allowed between the start of a dust region and the end of
# the previously accepted one for it to count as poly-T induced.
MAX_DUST_DISTANCE_FROM_POLYT = 30
29
+
30
+ class PluginFindPolyAt < Plugin
31
+
32
+ # Looks for runs of the given bases; the minimum run size is hard-coded (minn = 4), the param poly_at_length is not read here
33
# Collects the regions of seq.seq_fasta that are made of the bases in +ta+.
#
# A hit is two runs of at least two +ta+ bases separated by at most two
# arbitrary characters (matched case-insensitively). A hit that overlaps a
# region collected earlier stretches that region up to the hit's end instead
# of opening a new region.
#
# ta  - String of bases inserted into a regexp character class ('AN', 'TN').
# seq - sequence object; only seq_fasta is read here.
#
# Returns an Array of Hashes with the keys 'begin', 'end' (inclusive) and
# 'found' (the corresponding slice of seq.seq_fasta).
def find_polys(ta, seq)
  minn = 4
  half = minn / 2
  pattern = /(([#{ta}]{#{half},})(.{0,2})([#{ta}]{#{half},}))/i

  # NOTE(review): global_match is a Regexp extension from "global_match";
  # the meaning of its second argument is not visible here.
  hits = pattern.global_match(seq.seq_fasta, 3)

  polys = []
  hits.each do |hit|
    m_start = hit.match.begin(0) + hit.offset
    m_end = hit.match.end(0) + hit.offset - 1

    region = overlap(polys, m_start, m_end)
    if region
      # Overlapping hit: extend the existing region.
      region['end'] = m_end
    else
      region = { 'begin' => m_start, 'end' => m_end }
      polys.push region
    end
    region['found'] = seq.seq_fasta.slice(region['begin'], region['end'] - region['begin'] + 1)
  end

  polys
end
105
+
106
+
107
# Detects poly-A stretches in +seq+ and registers an 'ActionPolyA' for each
# stretch that has to be cut.
#
# A stretch is cut when its percentage of A/N bases reaches the param
# poly_a_percent and either
# * it ends within MAX_POLY_A_FROM_RIGHT bases of the sequence end and is at
#   least poly_a_length long, or
# * it ends farther from the sequence end and is at least
#   MIN_MIDDLE_POLY_A_SIZE long.
#
# seq - sequence object; seq_fasta, new_action and add_actions are used.
#
# Returns nil when nothing was cut, otherwise an empty Array (historical
# return value, kept for compatibility; it carries no information).
def find_polyA(seq)
  poly_base = 'AN'
  type = 'ActionPolyA'
  actions = []

  min_length = @params.get_param('poly_a_length').to_i
  min_percent = @params.get_param('poly_a_percent').to_i

  # Process the stretches from right to left.
  find_polys(poly_base, seq).reverse_each do |poly|
    poly_size = poly['end'] - poly['begin'] + 1

    # The two cases are mutually exclusive. The original second test used
    # `<<` (a bit shift, always truthy) instead of `<`, so near-right
    # stretches could be cut as if they were in the middle of the sequence.
    near_right = poly['end'] >= seq.seq_fasta.length - MAX_POLY_A_FROM_RIGHT
    required_size = near_right ? min_length : MIN_MIDDLE_POLY_A_SIZE

    if poly_size >= required_size && base_percent(poly, poly_base) >= min_percent
      a = seq.new_action(poly['begin'], poly['end'], type)
      a.right_action = true # mark as right action to get the left insert
      actions.push a

      add_stats(near_right ? 'poly_a_size' : 'in_middle_poly_a_size', poly_size)
    end

    if poly['found'].length > min_length
      add_stats("poly_#{poly_base}_base_percents", "#{poly['found'].length} #{base_percent(poly, poly_base)}")
    end
  end

  return if actions.empty?

  add_stats('seqs_with_polyA', 1)
  seq.add_actions(actions)
  []
end
161
+
162
# Detects poly-T stretches in +seq+, registers the matching actions and then
# looks for low-complexity (dust) regions in the sequence.
#
# Only stretches that reach the params poly_t_length and poly_t_percent are
# considered. For those:
# * while no action has been registered yet, a stretch starting at position
#   0, or starting at or before MAX_POLY_T_FROM_LEFT with at least
#   MIN_TN_COUNT T/N bases, yields an 'ActionPolyT';
# * a stretch starting after MAX_POLY_T_FROM_LEFT with at least
#   MIN_MIDDLE_POLY_T_SIZE T/N bases yields an 'ActionUnexpectedPolyT'.
# Either case marks the sequence as reversed.
#
# When at least one action was registered, DustMasker is run on seq.seq_fasta
# and the sequence is rejected if the dust regions add up to more than 30
# bases.
# NOTE(review): presumably seq.add_actions changes what seq.seq_fasta returns
# so that dust is looked for in the remaining insert -- confirm.
#
# seq - sequence object (seq_fasta, seq_name, new_action, add_actions and the
#       seq_reversed / seq_rejected writers are used).
def find_polyT(seq)
  poly_base = 'TN'
  type = 'ActionPolyT'
  actions = []
  check_for_dust = nil

  find_polys('TN', seq).each do |poly|
    poly_size = poly['end'] - poly['begin'] + 1

    # Skip stretches below the configured length / percent limits.
    next unless (poly_size >= @params.get_param('poly_t_length').to_i) &&
                (base_percent(poly, poly_base) >= @params.get_param('poly_t_percent').to_i)

    action_type =
      if poly['begin'] > MAX_POLY_T_FROM_LEFT
        'ActionUnexpectedPolyT' if base_count(poly, 'TN') >= MIN_MIDDLE_POLY_T_SIZE
      elsif actions.empty? && (poly['begin'] == 0 || base_count(poly, 'TN') >= MIN_TN_COUNT)
        type
      end

    if action_type
      seq.seq_reversed = true
      a = seq.new_action(poly['begin'], poly['end'], action_type)
      a.left_action = true
      actions.push a
      check_for_dust = poly
      add_stats('unexpected_poly_t_count', poly_size) if action_type == 'ActionUnexpectedPolyT'
    end

    add_stats('poly_t_size', poly_size)
  end

  return if actions.empty?

  add_stats('seqs_with_polyT', 1)
  seq.add_actions(actions)

  return unless check_for_dust && !seq.seq_fasta.nil? && !seq.seq_fasta.empty?

  dust_actions = []
  dust_masker = DustMasker.new()
  dust_poly_size = check_for_dust['end'] - check_for_dust['begin'] + 1
  found_dust = dust_masker.do_dust([">" + seq.seq_name, seq.seq_fasta])

  total_dust = 0
  last_dust_start = 0

  unless found_dust.empty?
    found_dust[0].dust.each do |dust|
      start = dust[0]
      stop = dust[1]
      dust_size = stop - start + 1
      total_dust += dust_size

      # Small dust regions are only counted, never turned into actions.
      next unless dust_size > 10

      # Dust must be close to the previous induced region (or to the start
      # of the sequence) to be considered induced by the poly-T.
      if start < last_dust_start + MAX_DUST_DISTANCE_FROM_POLYT
        last_dust_start = stop
        dust_type = 'ActionInducedLowComplexity'
      else
        dust_type = 'ActionLowComplexity'
      end
      dust_actions.push seq.new_action(start, stop, dust_type)
    end
  end

  if dust_actions.empty?
    add_stats('poly_t_no_dust', dust_poly_size)
  else
    add_stats('poly_t_dust', dust_poly_size)
    seq.add_actions(dust_actions)
  end

  # Reject sequences whose total dust is greater than 30.
  if total_dust > 30
    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'low complexity by polyt'
    add_stats('induced_low_complexity', total_dust)
  end
end
311
+
312
+
313
+
314
# Runs the plugin on every sequence of +seqs+ by delegating to exec_seq.
#
# seqs - collection of sequence objects responding to each.
#
# Returns whatever seqs.each returns.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
319
+
320
+
321
# Processes one sequence: logs the step, then searches for poly-T first and
# poly-A afterwards.
#
# seq - sequence object handed over to find_polyT and find_polyA.
def exec_seq(seq)
  log_line = "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
  $LOG.info log_line

  find_polyT(seq)
  find_polyA(seq)
end
328
+
329
+ ######################################################################
330
+ #---------------------------------------------------------------------
331
+
332
+
333
# Computes which percentage of poly['found'] consists of the bases listed in
# +poly_base+, counting both lower and upper case occurrences.
#
# poly      - Hash whose 'found' entry is the String to inspect.
# poly_base - String with the bases to count (e.g. 'AN').
#
# Returns the percentage as a Float.
def base_percent(poly, poly_base)
  segment = poly['found']
  wanted = poly_base.downcase + poly_base.upcase
  (segment.count(wanted).to_f / segment.size.to_f) * 100
end
346
+
347
# Counts how many characters of poly['found'] are bases listed in
# +poly_base+, in either lower or upper case.
#
# poly      - Hash whose 'found' entry is the String to inspect.
# poly_base - String with the bases to count (e.g. 'TN').
#
# Returns the number of matching characters as an Integer.
def base_count(poly, poly_base)
  wanted = poly_base.downcase + poly_base.upcase
  poly['found'].count(wanted)
end
357
+
358
+
359
+ ######################################################################
360
+ #---------------------------------------------------------------------
361
+
362
+ #Returns an array with the errors due to parameters are missing
363
# Checks the parameters this plugin needs, registering each one with its
# type, default value and description through params.check_param.
#
# params - parameter store responding to
#          check_param(errors, name, type, default_value, comment).
#
# Returns the errors Array that was handed to params.check_param.
def self.check_params(params)
  errors = []

  # name, default value, description
  expected = [
    ['poly_a_length', 6, 'Minimum length of a poly-A'],
    ['poly_a_percent', 75, 'Minimum percent of As in a sequence segment to be considered a poly-A'],
    ['poly_t_length', 15, 'Minimum length of a poly-T'],
    ['poly_t_percent', 75, 'Minimum percent of Ts in a sequence segment to be considered a poly-T']
  ]

  expected.each do |name, default_value, comment|
    params.check_param(errors, name, 'Integer', default_value, comment)
  end

  errors
end
387
+
388
+
389
+
390
+
391
+ private :overlap
392
+
393
+ end
@@ -0,0 +1,101 @@
1
+ require "plugin"
2
+
3
+ require "make_blast_db"
4
+ ########################################################
5
+ # Author: Almudena Bocinos Rioboo
6
+ #
7
+ # Defines the main methods that are necessary to execute PluginIgnoreRepeated
8
+ # Inherit: Plugin
9
+ ########################################################
10
+
11
+ class PluginIgnoreRepeated < Plugin
12
+
13
+ SIZE_SEARCH_IN_IGNORE=15
14
+
15
+ # Begins the plugin's execution: flags the repeated sequences so that all but one of them are rejected
16
# Runs the plugin on every sequence of +seqs+ by delegating to exec_seq.
#
# seqs - collection of sequence objects responding to each.
#
# Returns whatever seqs.each returns.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
21
+
22
+
23
# Blasts a fragment of +seq+ against the (truncated) input file and rejects
# the sequence as repeated when the alphabetically first long-enough hit is a
# different sequence.
#
# The fragment is seq.seq_fasta[piro_repeated_start, piro_repeated_length].
# A hit is considered only when its alignment length is greater than
# piro_repeated_length - 2.
#
# seq - sequence object (seq_name, seq_fasta, new_action, add_actions and the
#       seq_rejected / seq_rejected_by_message / seq_repeated writers are used).
#
# Returns whatever seq.add_actions returns.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"

  fasta_input = @params.get_param('truncated_input_file')
  blast_options = " -task blastn-short -searchsp #{SIZE_SEARCH_IN_IGNORE} -evalue #{@params.get_param('blast_evalue_ignore_repeated')} -perc_identity #{@params.get_param('blast_percent_ignore_repeated')}"
  blast = BatchBlast.new("-db #{fasta_input}", 'blastn', blast_options)

  p_start = @params.get_param('piro_repeated_start').to_i
  p_length = @params.get_param('piro_repeated_length').to_i

  blast_table_results = blast.do_blast(seq.seq_fasta[p_start, p_length])

  type = "ActionIgnoreRepeated"
  actions = []

  blast_table_results.querys.each do |query|
    # NOTE(review): this tests query.size, not query.hits.size -- confirm
    # which one is intended.
    next unless query.size > 1

    names = query.hits.map { |hit| hit.subject_id if hit.align_len > (p_length - 2) }.compact
    next if names.empty?

    # The alphabetically first name is the copy that is kept.
    first_name = names.sort.first
    next if first_name == seq.seq_name

    a = seq.new_action(0, 0, type)
    a.message = seq.seq_name + ' equal to ' + first_name
    actions.push a

    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'repeated'
    seq.seq_repeated = true

    add_stats('rejected_seqs', 'rejected_seqs_by_repe')
  end

  seq.add_actions(actions)
end
85
+
86
+
87
+
88
+ #Returns an array with the errors due to parameters are missing
89
# Checks that the parameters this plugin reads are present with the expected
# type, delegating every check to self.check_param.
#
# params - parameter store handed over to self.check_param.
#
# Returns the errors Array that was handed to self.check_param.
def self.check_params(params)
  errors = []

  # self.check_param(errors,params,'fasta_file_input','String')
  required = [
    ['blast_evalue_ignore_repeated', 'Float'],
    ['blast_percent_ignore_repeated', 'Integer'],
    ['piro_repeated_start', 'Integer'],
    ['piro_repeated_length', 'Integer']
  ]

  required.each do |name, type|
    self.check_param(errors, params, name, type)
  end

  errors
end
99
+
100
+
101
+ end
@@ -0,0 +1,199 @@
1
+ ########################################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ #
4
+ # Defines the main methods that are necessary to execute PluginIndeterminations
5
+ # Inherit: Plugin
6
+ ########################################################
7
+ require "plugin"
8
+ require "global_match"
9
+
10
+
11
+ class PluginIndeterminations < Plugin
12
+
13
+
14
# Looks up the first already-collected region that overlaps the span
# mi_start..mi_end. The overlap decision itself is delegated to overlapX?,
# which is defined outside this file.
#
# polys    - Array of Hashes holding at least the keys 'begin' and 'end'.
# mi_start - start coordinate of the candidate span.
# mi_end   - end coordinate of the candidate span.
#
# Returns the first overlapping Hash, or nil when no region overlaps.
def overlap(polys, mi_start, mi_end)
  polys.find do |region|
    overlapX?(mi_start, mi_end, region['begin'], region['end'])
  end
end
22
+
23
+ MAX_RUBBISH = 3
24
+
25
+ # Scans the sequence "seq" for stretches of indeterminations (N)
26
+
27
+ # Groups runs of N separated by at most three other bases; the thresholds come from the poly_n_* params
28
# Finds stretches of indeterminations in seq.seq_fasta and appends an
# 'ActionIndetermination' to +actions+ for each stretch that has to be cut.
#
# A hit is two runs of +ta+ separated by at most three arbitrary characters
# (matched case-insensitively); overlapping hits are merged into one region.
# For every region:
# * when poly_near_end says it is close to the right end, it is cut;
# * otherwise it is cut only if check_poly_length and check_poly_percent
#   both accept it, and then the whole sequence is also rejected.
#
# ta      - regexp fragment describing one indetermination (e.g. '[N]').
# seq     - sequence object (seq_fasta, new_action and the seq_rejected /
#           seq_rejected_by_message writers are used).
# actions - Array that receives the new actions (modified in place).
#
# Returns the Array of region Hashes ('begin', 'end', 'found').
def find_polys(ta, seq, actions)
  type = 'ActionIndetermination'
  poly_base = 'N'
  pattern = /((#{ta}{1,})(.{0,3})(#{ta}{1,}))/i

  # NOTE(review): global_match is a Regexp extension from "global_match";
  # the meaning of its second argument is not visible here.
  polys = []
  pattern.global_match(seq.seq_fasta, 3).each do |hit|
    m_start = hit.match.begin(0) + hit.offset
    m_end = hit.match.end(0) + hit.offset - 1

    region = overlap(polys, m_start, m_end)
    if region
      # Overlapping hit: extend the existing region.
      region['end'] = m_end
    else
      region = { 'begin' => m_start, 'end' => m_end }
      polys.push region
    end
    region['found'] = seq.seq_fasta.slice(region['begin'], region['end'] - region['begin'] + 1)
  end

  polys.each do |poly|
    near_end = poly_near_end(poly['end'], seq.seq_fasta)
    next unless near_end ||
                (check_poly_length(poly['begin'], poly['end']) && check_poly_percent(poly, poly_base))

    a = seq.new_action(poly['begin'], poly['end'], type)
    a.right_action = true
    actions.push a

    # Indeterminations away from the right end invalidate the sequence.
    unless near_end
      seq.seq_rejected = true
      seq.seq_rejected_by_message = 'Indeterminations in middle of sequence'
    end

    add_stats('size', poly['end'] - poly['begin'] + 1)
  end
end
116
+
117
+
118
# Tells whether the region poly_start..poly_end (both inclusive) is at least
# as long as the param poly_n_length.
#
# poly_start - first position of the region.
# poly_end   - last position of the region.
#
# Returns true or false.
def check_poly_length(poly_start, poly_end)
  region_size = 1 + (poly_end - poly_start)
  region_size >= @params.get_param('poly_n_length').to_i
end
122
+
123
# Tells whether the share of +poly_base+ characters (lower or upper case) in
# poly['found'] reaches the param poly_n_percent.
#
# poly      - Hash whose 'found' entry is the String to inspect.
# poly_base - String with the bases to count (e.g. 'N').
#
# Returns true or false.
def check_poly_percent(poly, poly_base)
  segment = poly['found']
  hits = segment.count(poly_base.downcase + poly_base.upcase)
  percent = (hits.to_f / segment.size.to_f) * 100
  percent >= @params.get_param('poly_n_percent').to_i
end
133
+
134
# Tells whether position +pos+ lies within poly_n_max_to_end bases of the end
# of +seq_fasta+.
#
# pos       - position to test (callers pass the end of a region).
# seq_fasta - the sequence String; only its length is used.
#
# Returns true or false.
def poly_near_end(pos, seq_fasta)
  max_to_end = @params.get_param('poly_n_max_to_end').to_i
  pos >= (seq_fasta.length - max_to_end)
end
141
+
142
+
143
+ # Begins the plugin's execution: processes every sequence in "seqs"
144
# Runs the plugin on every sequence of +seqs+ by delegating to exec_seq.
#
# seqs - collection of sequence objects responding to each.
#
# Returns whatever seqs.each returns.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
149
+
150
+
151
# Processes one sequence: first registers the run of N/n characters that
# ends the sequence (if any), then lets find_polys look for further
# indetermination stretches.
#
# seq - sequence object (seq_name, seq_fasta, new_action and add_actions are
#       used here).
#
# Returns whatever the final seq.add_actions call returns.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"

  # Simple indeterminations at the very end of the sequence.
  trailing = seq.seq_fasta.match(/[nN]+$/)

  unless trailing.nil?
    found = trailing[0].length

    # NOTE(review): the end coordinate is seq_fasta.length, one past the last
    # index, while other actions in this file use inclusive ends -- confirm
    # that new_action tolerates this.
    a = seq.new_action(seq.seq_fasta.length - found, seq.seq_fasta.length, 'ActionIndetermination')
    a.right_action = true

    seq.add_actions([a])
    add_stats('indetermination_size', found)
  end

  actions = []
  find_polys('[N]', seq, actions)
  seq.add_actions(actions)
end
178
+
179
+ #Returns an array with the errors due to parameters are missing
180
# Checks the parameters this plugin needs, registering each one with its
# type, default value and description through params.check_param.
#
# params - parameter store responding to
#          check_param(errors, name, type, default_value, comment).
#
# Returns the errors Array that was handed to params.check_param.
def self.check_params(params)
  errors = []

  # name, default value, description
  expected = [
    ['poly_n_length', 15, 'Minimum number of Ns within the sequence to be rejected by having an internal segment of indeterminations. Indeterminations at the end of the sequence will be removed regardless of their size and without rejecting the sequence'],
    ['poly_n_percent', 80, 'Minimum percent of Ns in a segment to be considered a valid indetermination'],
    ['poly_n_max_to_end', 15, 'Maximum distance to the end of the sequence to be considered an internal segment']
  ]

  expected.each do |name, default_value, comment|
    params.check_param(errors, name, 'Integer', default_value, comment)
  end

  errors
end
197
+
198
+
199
+ end