seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
########################################################
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
3
|
+
#
|
4
|
+
# Defines the main methods that are necessary to execute PluginFindPolyATs
|
5
|
+
|
6
|
+
#
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
require "plugin"
|
11
|
+
require "global_match"
|
12
|
+
|
13
|
+
# Returns the first region of +polys+ that overlaps the span
# mi_start..mi_end, or nil when none does.
#
# polys    - Array of Hashes carrying 'begin' and 'end' positions.
# mi_start - start position of the candidate span.
# mi_end   - end position of the candidate span.
#
# The overlap test itself is delegated to overlapX?, which is defined
# outside this file.
def overlap(polys, mi_start, mi_end)
  polys.find do |region|
    overlapX?(mi_start, mi_end, region['begin'], region['end'])
  end
end
|
21
|
+
|
22
|
+
# MAX_RUBBISH = 3
|
23
|
+
# Highest 'begin' position at which find_polyT still treats a polyT as lying at the left end of the sequence; polyTs starting beyond it are handled as unexpected polyTs.
MAX_POLY_T_FROM_LEFT = 4
|
24
|
+
# Minimum number of T/N bases (as counted by base_count) a polyT starting at position 1..MAX_POLY_T_FROM_LEFT must contain to be accepted as a left polyT.
MIN_TN_COUNT=15
|
25
|
+
# A polyA counts as "near the right end" in find_polyA when its 'end' is >= sequence length minus this value.
MAX_POLY_A_FROM_RIGHT = 10
|
26
|
+
# Minimum size required by find_polyA for a polyA that is not near the right end of the sequence.
MIN_MIDDLE_POLY_A_SIZE = 35
|
27
|
+
# Minimum number of T/N bases (as counted by base_count) required by find_polyT for a polyT starting beyond MAX_POLY_T_FROM_LEFT.
MIN_MIDDLE_POLY_T_SIZE = 35
|
28
|
+
# In find_polyT a dust region is labelled induced low complexity only when its start is less than the end of the previously accepted induced region (initially 0) plus this value.
MAX_DUST_DISTANCE_FROM_POLYT=30
|
29
|
+
|
30
|
+
class PluginFindPolyAt < Plugin
|
31
|
+
|
32
|
+
# Collects candidate homopolymer regions of seq.seq_fasta for the base set +ta+.
#
# A candidate is two runs of at least two bases taken from +ta+, separated by
# at most two arbitrary bases (matching is case-insensitive). A candidate that
# overlaps an already collected region (as decided by #overlap) extends that
# region up to the candidate's end; otherwise it starts a new region.
#
# ta  - String of bases placed inside a regexp character class ('AN', 'TN').
# seq - sequence object; only seq.seq_fasta is read here.
#
# Returns an Array of Hashes with keys 'begin' and 'end' (inclusive positions
# in seq.seq_fasta) and 'found' (the substring they cover). No length or
# percentage threshold is applied here; callers filter the regions.
def find_polys(ta, seq)
  min_run = 2
  pattern = /(([#{ta}]{#{min_run},})(.{0,2})([#{ta}]{#{min_run},}))/i
  fasta = seq.seq_fasta
  polys = []

  # NOTE(review): global_match comes from global_match.rb; the meaning of its
  # second argument (3) is not visible from this file.
  pattern.global_match(fasta, 3).each do |hit|
    m_start = hit.match.begin(0) + hit.offset
    m_end   = hit.match.end(0) + hit.offset - 1 # inclusive end

    if (region = overlap(polys, m_start, m_end))
      # grow the existing region up to the end of this match
      region['end'] = m_end
    else
      region = { 'begin' => m_start, 'end' => m_end }
      polys.push region
    end
    region['found'] = fasta.slice(region['begin'], region['end'] - region['begin'] + 1)
  end

  polys
end
|
105
|
+
|
106
|
+
|
107
|
+
# Looks for polyA regions in +seq+ and registers an 'ActionPolyA' for each
# region that passes the thresholds.
#
# Regions come from find_polys('AN', seq) and are visited from right to left.
# A region is accepted when its A/N percentage reaches param poly_a_percent and
#   * it ends within MAX_POLY_A_FROM_RIGHT of the sequence end and its size
#     reaches param poly_a_length (stat 'poly_a_size'), or
#   * it ends before that zone and its size reaches MIN_MIDDLE_POLY_A_SIZE
#     (stat 'in_middle_poly_a_size').
#
# The original second test read `poly['end'] << ...` (a left shift, always
# truthy) instead of `<`, so the "not near right" condition was never checked.
#
# seq - sequence object providing seq_fasta, new_action and add_actions.
def find_polyA(seq)
  poly_base = 'AN'
  type = 'ActionPolyA'
  min_length  = @params.get_param('poly_a_length').to_i
  min_percent = @params.get_param('poly_a_percent').to_i
  actions = []

  # walk the regions from right to left
  find_polys(poly_base, seq).reverse_each do |poly|
    poly_size = poly['end'] - poly['begin'] + 1
    percent = base_percent(poly, poly_base)
    near_right = poly['end'] >= seq.seq_fasta.length - MAX_POLY_A_FROM_RIGHT

    required_size, stat_name =
      if near_right
        [min_length, 'poly_a_size']
      else
        [MIN_MIDDLE_POLY_A_SIZE, 'in_middle_poly_a_size']
      end

    if poly_size >= required_size && percent >= min_percent
      action = seq.new_action(poly['begin'], poly['end'], type)
      action.right_action = true # mark as right action to get the left insert
      actions.push action
      add_stats(stat_name, poly_size)
    end

    # percentage statistics are recorded for every region longer than the
    # minimum, whether or not it was accepted
    if poly['found'].length > min_length
      add_stats("poly_#{poly_base}_base_percents", "#{poly['found'].length} #{percent}")
    end
  end

  return if actions.empty?

  add_stats('seqs_with_polyA', 1)
  seq.add_actions(actions)
end
|
161
|
+
|
162
|
+
# Looks for polyT regions in +seq+, registers the matching actions and then
# checks the remaining sequence for low-complexity (dust) regions.
#
# Regions come from find_polys('TN', seq) and are visited left to right. Only
# regions whose size reaches param poly_t_length and whose T/N percentage
# reaches param poly_t_percent are considered; every such region adds a
# 'poly_t_size' stat, whether or not an action is created for it. Whenever an
# action is created the sequence is flagged as reversed.
#
# seq - sequence object providing seq_fasta, seq_name, new_action,
#       add_actions and the seq_reversed / seq_rejected* writers.
def find_polyT(seq)
  poly_base = 'TN'
  min_length  = @params.get_param('poly_t_length').to_i
  min_percent = @params.get_param('poly_t_percent').to_i
  actions = []
  check_for_dust = nil # last polyT that produced an action

  find_polys(poly_base, seq).each do |poly|
    poly_size = poly['end'] - poly['begin'] + 1
    next unless poly_size >= min_length && base_percent(poly, poly_base) >= min_percent

    type = poly_t_action_type(poly, actions.empty?)
    if type
      seq.seq_reversed = true
      action = seq.new_action(poly['begin'], poly['end'], type)
      action.left_action = true
      actions.push action
      check_for_dust = poly
      add_stats('unexpected_poly_t_count', poly_size) if type == 'ActionUnexpectedPolyT'
    end

    add_stats('poly_t_size', poly_size)
  end

  return if actions.empty?

  add_stats('seqs_with_polyT', 1)
  seq.add_actions(actions)
  check_dust_after_poly_t(seq, check_for_dust)
end

# Decides which action type (if any) a polyT region that already passed the
# length/percent thresholds produces.
#
# poly       - region Hash ('begin', 'end', 'found').
# first_poly - true while no action has been created yet for this sequence.
#
# Returns 'ActionPolyT', 'ActionUnexpectedPolyT' or nil.
def poly_t_action_type(poly, first_poly)
  start = poly['begin']
  if first_poly &&
     (start == 0 || (start <= MAX_POLY_T_FROM_LEFT && base_count(poly, 'TN') >= MIN_TN_COUNT))
    'ActionPolyT'
  elsif start > MAX_POLY_T_FROM_LEFT && base_count(poly, 'TN') >= MIN_MIDDLE_POLY_T_SIZE
    'ActionUnexpectedPolyT'
  end
end
private :poly_t_action_type

# Runs DustMasker over the current seq.seq_fasta (read after the polyT actions
# were added) and registers low-complexity actions.
#
# Dust regions larger than 10 become 'ActionInducedLowComplexity' when they
# start within MAX_DUST_DISTANCE_FROM_POLYT of the previous induced region's
# end (initially position 0), otherwise 'ActionLowComplexity'. The sequence is
# rejected when the total dust size (all regions, any size) exceeds 30.
#
# seq  - sequence object (see find_polyT).
# poly - polyT region whose size is reported in the dust stats; nothing is
#        done when it is nil or the sequence is nil/empty.
def check_dust_after_poly_t(seq, poly)
  return if poly.nil? || seq.seq_fasta.nil? || seq.seq_fasta.empty?

  dust_poly_size = poly['end'] - poly['begin'] + 1
  found_dust = DustMasker.new.do_dust([">" + seq.seq_name, seq.seq_fasta])

  actions = []
  total_dust = 0
  last_induced_end = 0 # NOTE(review): proximity is measured from 0, not from the polyT position; confirm this is intended

  unless found_dust.empty?
    found_dust[0].dust.each do |dust|
      start = dust[0]
      stop = dust[1]
      dust_size = stop - start + 1
      total_dust += dust_size
      next unless dust_size > 10

      # dust must be big enough and near the previous one to be an induced one
      if start < last_induced_end + MAX_DUST_DISTANCE_FROM_POLYT
        last_induced_end = stop
        actions.push seq.new_action(start, stop, 'ActionInducedLowComplexity')
      else
        actions.push seq.new_action(start, stop, 'ActionLowComplexity')
      end
    end
  end

  if actions.empty?
    add_stats('poly_t_no_dust', dust_poly_size)
  else
    add_stats('poly_t_dust', dust_poly_size)
    seq.add_actions(actions)
  end

  # reject sequences if total dust is greater than 30
  return unless total_dust > 30

  seq.seq_rejected = true
  seq.seq_rejected_by_message = 'low complexity by polyt'
  add_stats('induced_low_complexity', total_dust)
end
private :check_dust_after_poly_t
|
311
|
+
|
312
|
+
|
313
|
+
|
314
|
+
# Runs the plugin on every sequence of +seqs+ by calling exec_seq on each one,
# in order.
#
# seqs - collection of sequence objects responding to each.
def execute(seqs)
  seqs.each { |seq| exec_seq(seq) }
end
|
319
|
+
|
320
|
+
|
321
|
+
# Processes one sequence: logs the step, then searches for polyT regions and
# afterwards for polyA regions.
#
# The previous log text referred to a <poly_at_length> parameter, which this
# plugin does not declare (check_params declares poly_a_length and
# poly_t_length), so the message now names the real parameters.
#
# seq - sequence object; seq.seq_name is used for the log line.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for polyA/polyT stretches in the sequence with the lengths indicated by the params <poly_a_length> and <poly_t_length>"

  find_polyT(seq)
  find_polyA(seq)
end
|
328
|
+
|
329
|
+
######################################################################
|
330
|
+
#---------------------------------------------------------------------
|
331
|
+
|
332
|
+
|
333
|
+
# Percentage (0.0..100.0) of characters in poly['found'] that belong to
# +poly_base+, counted case-insensitively.
#
# poly      - region Hash; only its 'found' String is read.
# poly_base - String listing the bases to count (e.g. 'AN'); it is handed to
#             String#count, so its characters form a character set.
#
# Returns a Float. An empty 'found' yields 0.0 (previously 0.0/0.0 => NaN).
def base_percent(poly, poly_base)
  found = poly['found']
  return 0.0 if found.empty?

  matching = found.count(poly_base.downcase + poly_base.upcase)
  (matching.to_f / found.size.to_f) * 100
end
|
346
|
+
|
347
|
+
# Number of characters in poly['found'] that belong to +poly_base+, counted
# case-insensitively.
#
# poly      - region Hash; only its 'found' String is read.
# poly_base - String listing the bases to count (e.g. 'TN'); it is handed to
#             String#count, so its characters form a character set.
#
# Returns an Integer.
def base_count(poly, poly_base)
  wanted = poly_base.downcase + poly_base.upcase
  poly['found'].count(wanted)
end
|
357
|
+
|
358
|
+
|
359
|
+
######################################################################
|
360
|
+
#---------------------------------------------------------------------
|
361
|
+
|
362
|
+
# Declares the parameters used by this plugin and returns the list of errors
# collected while checking them.
#
# Each parameter is passed to params.check_param together with its type
# ('Integer'), a default value and a descriptive comment.
#
# params - parameter store responding to check_param(errors, name, type,
#          default_value, comment).
#
# Returns the errors Array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  [
    ['poly_a_length',  6,  'Minimum length of a poly-A'],
    ['poly_a_percent', 75, 'Minimum percent of As in a sequence segment to be considered a poly-A'],
    ['poly_t_length',  15, 'Minimum length of a poly-T'],
    ['poly_t_percent', 75, 'Minimum percent of Ts in a sequence segment to be considered a poly-T']
  ].each do |name, default_value, comment|
    params.check_param(errors, name, 'Integer', default_value, comment)
  end

  errors
end
|
387
|
+
|
388
|
+
|
389
|
+
|
390
|
+
|
391
|
+
private :overlap
|
392
|
+
|
393
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
require "make_blast_db"
|
4
|
+
########################################################
|
5
|
+
# Author: Almudena Bocinos Rioboo
|
6
|
+
#
|
7
|
+
# Defines the main methods that are necessary to execute PluginIgnoreRepeated
|
8
|
+
# Inherit: Plugin
|
9
|
+
########################################################
|
10
|
+
|
11
|
+
class PluginIgnoreRepeated < Plugin
|
12
|
+
|
13
|
+
# Value interpolated after the blast option -searchsp when exec_seq builds the BatchBlast arguments.
SIZE_SEARCH_IN_IGNORE=15
|
14
|
+
|
15
|
+
# Runs the plugin on every sequence of +seqs+ by calling exec_seq on each one,
# in order. exec_seq flags sequences that are repeated in the input.
#
# seqs - collection of sequence objects responding to each.
def execute(seqs)
  seqs.each { |seq| exec_seq(seq) }
end
|
21
|
+
|
22
|
+
|
23
|
+
# Blasts a fragment of +seq+ against the input file and rejects the sequence
# when another sequence with an equal fragment sorts before it.
#
# The fragment is seq.seq_fasta[piro_repeated_start, piro_repeated_length].
# For every blast query holding more than one entry, the subject ids of the
# hits whose alignment length is greater than piro_repeated_length - 2 are
# collected; if the smallest of those ids differs from seq.seq_name the
# sequence gets an 'ActionIgnoreRepeated' action and is marked as rejected and
# repeated. Because only the sequence whose name sorts first is kept, exactly
# one member of a group of repeated sequences survives.
#
# seq - sequence object providing seq_name, seq_fasta, new_action,
#       add_actions and the seq_rejected* / seq_repeated writers.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"

  fasta_input = @params.get_param('truncated_input_file')

  blast = BatchBlast.new("-db #{fasta_input}" ,'blastn'," -task blastn-short -searchsp #{SIZE_SEARCH_IN_IGNORE} -evalue #{@params.get_param('blast_evalue_ignore_repeated')} -perc_identity #{@params.get_param('blast_percent_ignore_repeated')}")

  p_start  = @params.get_param('piro_repeated_start').to_i
  p_length = @params.get_param('piro_repeated_length').to_i

  blast_table_results = blast.do_blast(seq.seq_fasta[p_start, p_length])

  type = "ActionIgnoreRepeated"
  actions = []

  blast_table_results.querys.each do |query|
    next unless query.size > 1

    # ids of the hits that align over (almost) the whole fragment
    long_hits = query.hits.select { |hit| hit.align_len > (p_length - 2) }
    first_name = long_hits.map { |hit| hit.subject_id }.min
    next if first_name.nil?
    next if first_name == seq.seq_name # this sequence is the one that is kept

    action = seq.new_action(0, 0, type)
    action.message = seq.seq_name + ' equal to ' + first_name
    actions.push action

    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'repeated'
    seq.seq_repeated = true

    add_stats('rejected_seqs', 'rejected_seqs_by_repe')
  end

  seq.add_actions(actions)
end
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
# Checks the parameters required by this plugin and returns the list of
# errors collected while checking them.
#
# NOTE(review): unlike the other plugins in this gem, which call
# params.check_param(errors, name, type, default, comment), this one calls a
# class-level check_param(errors, params, name, type) without defaults; that
# helper is not defined in this file - confirm it exists on Plugin.
#
# params - parameter store handed through to check_param.
#
# Returns the errors Array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  [
    ['blast_evalue_ignore_repeated',  'Float'],
    ['blast_percent_ignore_repeated', 'Integer'],
    ['piro_repeated_start',           'Integer'],
    ['piro_repeated_length',          'Integer']
  ].each do |name, type|
    self.check_param(errors, params, name, type)
  end

  errors
end
|
99
|
+
|
100
|
+
|
101
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
########################################################
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
3
|
+
#
|
4
|
+
# Defines the main methods that are necessary to execute PluginIndeterminations
|
5
|
+
# Inherit: Plugin
|
6
|
+
########################################################
|
7
|
+
require "plugin"
|
8
|
+
require "global_match"
|
9
|
+
|
10
|
+
|
11
|
+
class PluginIndeterminations < Plugin
|
12
|
+
|
13
|
+
|
14
|
+
# Returns the first region of +polys+ that overlaps the span
# mi_start..mi_end, or nil when none does.
#
# polys    - Array of Hashes carrying 'begin' and 'end' positions.
# mi_start - start position of the candidate span.
# mi_end   - end position of the candidate span.
#
# The overlap test itself is delegated to overlapX?, which is defined
# outside this file.
def overlap(polys, mi_start, mi_end)
  polys.find do |region|
    overlapX?(mi_start, mi_end, region['begin'], region['end'])
  end
end
|
22
|
+
|
23
|
+
# NOTE(review): not referenced anywhere in the visible part of this class; confirm whether it is still needed.
MAX_RUBBISH = 3
|
24
|
+
|
25
|
+
# Finds regions of indeterminations in seq.seq_fasta and appends an
# 'ActionIndetermination' to +actions+ for each region that qualifies.
#
# A candidate is one or more +ta+ characters, up to three arbitrary bases, and
# one or more +ta+ characters again (case-insensitive). Candidates overlapping
# an already collected region (as decided by #overlap) extend that region.
#
# For each resulting region:
#   * near the end of the sequence (poly_near_end): an action is added;
#   * otherwise, if it passes check_poly_length and check_poly_percent: an
#     action is added and the sequence is rejected with the message
#     'Indeterminations in middle of sequence'.
# In both cases the region size is recorded under the 'size' stat.
#
# ta      - regexp fragment matching one indetermination base (callers in this
#           file pass '[N]').
# seq     - sequence object providing seq_fasta, new_action and the
#           seq_rejected* writers.
# actions - Array that receives the created actions (mutated in place).
def find_polys(ta, seq, actions)
  pattern = /((#{ta}{1,})(.{0,3})(#{ta}{1,}))/i
  type = 'ActionIndetermination'
  poly_base = 'N'
  fasta = seq.seq_fasta
  polys = []

  # NOTE(review): global_match comes from global_match.rb; the meaning of its
  # second argument (3) is not visible from this file.
  pattern.global_match(fasta, 3).each do |hit|
    m_start = hit.match.begin(0) + hit.offset
    m_end   = hit.match.end(0) + hit.offset - 1 # inclusive end

    if (region = overlap(polys, m_start, m_end))
      # grow the existing region up to the end of this match
      region['end'] = m_end
    else
      region = { 'begin' => m_start, 'end' => m_end }
      polys.push region
    end
    region['found'] = fasta.slice(region['begin'], region['end'] - region['begin'] + 1)
  end

  polys.each do |poly|
    near_end = poly_near_end(poly['end'], seq.seq_fasta)
    next unless near_end ||
                (check_poly_length(poly['begin'], poly['end']) && check_poly_percent(poly, poly_base))

    action = seq.new_action(poly['begin'], poly['end'], type)
    action.right_action = true
    actions.push action

    unless near_end
      # an internal block of indeterminations invalidates the whole sequence
      seq.seq_rejected = true
      seq.seq_rejected_by_message = 'Indeterminations in middle of sequence'
    end

    add_stats('size', poly['end'] - poly['begin'] + 1)
  end
end
|
116
|
+
|
117
|
+
|
118
|
+
# Tells whether the region poly_start..poly_end (both inclusive) is at least
# as long as the param poly_n_length.
#
# Returns true or false.
def check_poly_length(poly_start, poly_end)
  region_size = poly_end - poly_start + 1
  region_size >= @params.get_param('poly_n_length').to_i
end
|
122
|
+
|
123
|
+
# Tells whether the share of +poly_base+ characters in poly['found'] (counted
# case-insensitively) reaches the param poly_n_percent.
#
# poly      - region Hash; only its 'found' String is read.
# poly_base - String listing the bases to count; it is handed to String#count,
#             so its characters form a character set.
#
# Returns true or false.
def check_poly_percent(poly, poly_base)
  found = poly['found']
  matching = found.count(poly_base.downcase + poly_base.upcase)
  percent = (matching.to_f / found.size.to_f) * 100

  percent >= @params.get_param('poly_n_percent').to_i
end
|
133
|
+
|
134
|
+
# Tells whether position +pos+ lies within param poly_n_max_to_end of the end
# of +seq_fasta+, i.e. pos >= seq_fasta.length - poly_n_max_to_end.
#
# pos       - position to test (callers pass a region's 'end').
# seq_fasta - the sequence String; only its length is used.
#
# Returns true or false.
def poly_near_end(pos, seq_fasta)
  max_to_end = @params.get_param('poly_n_max_to_end').to_i
  pos >= seq_fasta.length - max_to_end
end
|
141
|
+
|
142
|
+
|
143
|
+
# Runs the plugin on every sequence of +seqs+ by calling exec_seq on each one,
# in order. exec_seq registers the indeterminations (Ns) found in a sequence.
#
# seqs - collection of sequence objects responding to each.
def execute(seqs)
  seqs.each { |seq| exec_seq(seq) }
end
|
149
|
+
|
150
|
+
|
151
|
+
# Registers the indeterminations of one sequence in two steps:
#
# 1. A run of n/N at the end of seq.seq_fasta becomes an
#    'ActionIndetermination' that is added to the sequence immediately, and its
#    length is recorded under the 'indetermination_size' stat.
# 2. find_polys('[N]', ...) then collects the remaining indetermination
#    regions, which are added in a second add_actions call (made even when no
#    region was found).
#
# seq - sequence object providing seq_name, seq_fasta, new_action and
#       add_actions.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"

  # find simple indeterminations at end of sequence
  trailing = seq.seq_fasta.match(/[nN]+$/)

  if trailing
    found = trailing[0].length

    # NOTE(review): the end passed here is seq_fasta.length, one past the last
    # index, while find_polys passes inclusive ends - confirm what new_action expects.
    action = seq.new_action(seq.seq_fasta.length - found, seq.seq_fasta.length, 'ActionIndetermination')
    action.right_action = true

    seq.add_actions([action])
    add_stats('indetermination_size', found)
  end

  actions = []
  find_polys('[N]', seq, actions)
  seq.add_actions(actions)
end
|
178
|
+
|
179
|
+
# Declares the parameters used by this plugin and returns the list of errors
# collected while checking them.
#
# Each parameter is passed to params.check_param together with its type
# ('Integer'), a default value and a descriptive comment.
#
# params - parameter store responding to check_param(errors, name, type,
#          default_value, comment).
#
# Returns the errors Array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  [
    ['poly_n_length', 15,
     'Minimum number of Ns within the sequence to be rejected by having an internal segment of indeterminations. Indeterminations at the end of the sequence will be removed regardless of their size and without rejecting the sequence'],
    ['poly_n_percent', 80,
     'Minimum percent of Ns in a segment to be considered a valid indetermination'],
    ['poly_n_max_to_end', 15,
     'Maximum distance to the end of the sequence to be considered an internal segment']
  ].each do |name, default_value, comment|
    params.check_param(errors, name, 'Integer', default_value, comment)
  end

  errors
end
|
197
|
+
|
198
|
+
|
199
|
+
end
|