seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,246 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
|
4
|
+
########################################################
|
5
|
+
# Author: Almudena Bocinos Rioboo
|
6
|
+
#
|
7
|
+
# Defines the main methods that are necessary to execute PluginRemAditArtifacts
|
8
|
+
|
9
|
+
#
|
10
|
+
# Inherit: Plugin
|
11
|
+
########################################################
|
12
|
+
|
13
|
+
class PluginRemAditArtifacts < Plugin
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
# Begins the PluginRemAditArtifacts execution with the sequences "seqs"
|
18
|
+
# Returns a list with start of polyA or polyT seq or 0 if not found
|
19
|
+
# start of a possible second polyAT what was found in the second search, since it looks for both
|
20
|
+
# Uses the param polyA_length to look for at least that number of contiguous A's
|
21
|
+
# Runs the artifact removal over a batch of sequences.
#
# seqs - collection of sequence objects; each element is passed to exec_seq.
#
# Returns the value of seqs.each.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
|
26
|
+
|
27
|
+
|
28
|
+
# Trims terminal artifacts from one sequence and, when anything was trimmed,
# registers a single 'ActionRemAditArtifacts' action on it.
#
# Steps:
#   1. repeated GCGGGG / CCCCGC hexamers are removed from both ends;
#   2. when the 'is_cDNA' param is 'true', runs of more than three T/A are
#      removed from the ends (which letter goes with which end depends on the
#      'is_forward' param), repeating until a pass changes nothing.
#
# The string returned by seq.seq_fasta is edited in place (slice!).
#
# seq - sequence object answering seq_name, seq_fasta, new_action, add_actions.
#
# Returns the result of seq.add_actions, or nil when nothing was trimmed.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing artifacts into the sequence"

  bases = seq.seq_fasta
  left = 0
  right = bases.size - 1
  initial_left = left
  initial_right = right

  # Hexamer artifacts at the start of the string.
  while bases =~ /^(GCGGGG|CCCCGC)/i
    left += 6
    bases.slice!(0, 6)
  end

  # Hexamer artifacts at the end of the string.
  while bases =~ /(GCGGGG|CCCCGC)$/i
    right -= 6
    bases.slice!(-6, 6)
  end

  forward = (@params.get_param('is_forward') == 'true')
  cdna = (@params.get_param('is_cDNA') == 'true')

  seen_left = 0
  seen_right = 0

  # Keep going until a full pass leaves both bounds untouched.
  until seen_left == left && seen_right == right
    seen_left = left
    seen_right = right
    next unless cdna

    if forward
      head_run = bases[/^(T+)/i, 1]
      if head_run && head_run.length > 3
        bases.slice!(0, head_run.length)
        left += head_run.length
      end

      tail_run = bases[/(A+)$/i, 1]
      if tail_run && tail_run.length > 3
        bases.slice!(-tail_run.length, tail_run.length)
        right -= tail_run.length
      end
    else # backward read
      tail_run = bases[/(T+)$/i, 1]
      if tail_run && tail_run.length > 3
        bases.slice!(-tail_run.length, tail_run.length)
        right -= tail_run.length
      end

      head_run = bases[/^(A+)/i, 1]
      if head_run && head_run.length > 3
        bases.slice!(0, head_run.length)
        left += head_run.length
      end
    end
  end

  left_moved = (left >= 0) && (left > initial_left)
  right_moved = (right >= 0) && (right < initial_right)

  if left_moved || right_moved
    seq.add_actions([seq.new_action(left, right, 'ActionRemAditArtifacts')])
  end
end
|
117
|
+
######################################################################
|
118
|
+
#---------------------------------------------------------------------
|
119
|
+
# Older single-sequence variant of exec_seq: trims terminal artifacts and
# records them through seq.add_action instead of new_action/add_actions.
#
# Fix over the previous version: 'is_forward' and 'is_cDNA' were used as raw
# get_param values, so the string 'false' was truthy and the poly-T / poly-A
# trimming always ran in the forward-cDNA branch. They are now compared with
# 'true', as exec_seq does (to_s keeps a real boolean true working as well).
#
# The string returned by seq.seq_fasta is edited in place (slice!).
#
# seq - sequence object answering seq_fasta and add_action(first, last, type).
#
# Returns the result of seq.add_action, or nil when nothing was trimmed.
def execute_old(seq)
  seq2 = seq.seq_fasta
  first = 0
  last = seq2.size - 1
  old_first = first
  old_last = last

  # Hexamer artifacts at the start of the string.
  while seq2 =~ /^(GCGGGG|CCCCGC)/i
    first += 6
    seq2.slice!(0..5)
  end

  # Hexamer artifacts at the end of the string.
  while seq2 =~ /(GCGGGG|CCCCGC)$/i
    last -= 6
    seq2.slice!(seq2.size - 6..seq2.size - 1)
  end

  is_forward = @params.get_param('is_forward').to_s == 'true'
  is_cDNA = @params.get_param('is_cDNA').to_s == 'true'

  previous_first, previous_last = 0, 0

  # Repeat until a pass leaves both bounds unchanged.
  until (previous_first == first) && (previous_last == last)
    previous_first, previous_last = first, last
    next unless is_cDNA

    if is_forward
      nTs = 0
      nTs = $1.length if seq2 =~ /^(T+)/i
      if nTs > 3
        seq2.slice!(0..nTs - 1)
        first += nTs
      end

      nAs = 0
      nAs = $1.length if seq2 =~ /(A+)$/i
      if nAs > 3
        seq2.slice!(seq2.size - nAs..seq2.size - 1)
        last -= nAs
      end
    else # backward read
      nTs = 0
      nTs = $1.length if seq2 =~ /(T+)$/i
      if nTs > 3
        seq2.slice!(seq2.size - nTs..seq2.size - 1)
        last -= nTs
      end

      nAs = 0
      nAs = $1.length if seq2 =~ /^(A+)/i
      if nAs > 3
        seq2.slice!(0..nAs - 1)
        first += nAs
      end
    end
  end

  if ((first >= 0) && (first > old_first)) || ((last >= 0) && (last < old_last))
    seq.add_action(first, last, 'ActionRemAditArtifacts')
  end
end
|
216
|
+
|
217
|
+
|
218
|
+
|
219
|
+
######################################################################
|
220
|
+
#---------------------------------------------------------------------
|
221
|
+
|
222
|
+
#Returns an array with the errors due to parameters are missing
|
223
|
+
# Parameter validation hook for this plugin.
#
# No parameter is checked here (the earlier checks are disabled), so the
# list of errors is always empty.
#
# params - parameters object (not used).
#
# Returns an empty Array.
def self.check_params(params)
  Array.new
end
|
240
|
+
|
241
|
+
|
242
|
+
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
end
|
@@ -0,0 +1,244 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginShortInsert
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
class PluginShortInsert < Plugin
|
11
|
+
|
12
|
+
# Removes from sub_inserts the parts covered by each item.
#
# For every [start, end] pair in sub_inserts that an item touches, the pair is
# deleted and the uncovered pieces to the left and/or right of the item are
# appended in its place. sub_inserts is modified in place.
#
# Fix over the previous version: the pairs were deleted from sub_inserts while
# `each` was walking that same array, so the pair following a deleted one was
# skipped; an item overlapping two sub-inserts only cut the first. The inner
# loop now walks a snapshot. Pieces appended during a pass never overlap the
# current item, so they do not need to be revisited for it.
#
# items       - objects answering start_pos and end_pos.
# sub_inserts - Array of [start, end] pairs.
#
# Returns the value of items.each.
def cut_by(items,sub_inserts)
  items.each do |item|
    sub_inserts.dup.each do |sub_i|
      # Skip snapshot entries already removed (e.g. duplicated pairs).
      next unless sub_inserts.include?(sub_i)

      delete = false

      if (item.start_pos <= sub_i[0]) && (item.end_pos >= sub_i[1])
        # The item covers the whole sub-insert: nothing survives.
        delete = true

      elsif (item.end_pos >= sub_i[0]) && (item.end_pos + 1 <= sub_i[1])
        # The item ends inside the sub-insert: keep the piece after it ...
        sub_inserts.push [item.end_pos + 1, sub_i[1]]
        delete = true

        # ... and the piece before it, when there is one.
        if item.start_pos - 1 >= sub_i[0]
          sub_inserts.push [sub_i[0], item.start_pos - 1]
        end

      elsif (item.start_pos - 1 >= sub_i[0]) && (item.start_pos <= sub_i[1])
        # The item starts inside the sub-insert: keep the piece before it.
        sub_inserts.push [sub_i[0], item.start_pos - 1]
        delete = true
      end

      sub_inserts.delete [sub_i[0], sub_i[1]] if delete
    end
  end
end
|
64
|
+
|
65
|
+
#select the best subinsert, when there is not a linker
|
66
|
+
# Picks the longest sub-insert.
#
# sub_inserts - Array of [start, end] pairs.
#
# Pairs whose length (end - start + 1) is not positive are never chosen, and
# on a tie the pair that appears first wins.
#
# Returns a new Array holding the chosen pair, or an empty Array when no pair
# qualifies.
def select_the_best(sub_inserts)
  span = lambda { |bounds| bounds[1] - bounds[0] + 1 }

  winner = sub_inserts.inject(nil) do |best, candidate|
    threshold = best.nil? ? 0 : span.call(best)
    span.call(candidate) > threshold ? candidate : best
  end

  winner.nil? ? [] : [winner]
end
|
88
|
+
|
89
|
+
# Begins the plugin's execution to warn if the insert is too short
|
90
|
+
# Runs the insert-size check over a batch of sequences.
#
# seqs - collection of sequence objects; each element is passed to exec_seq.
#
# Returns the value of seqs.each.
def execute(seqs)
  seqs.each { |sequence| exec_seq(sequence) }
end
|
95
|
+
|
96
|
+
|
97
|
+
# Classifies the insert of one sequence as empty, short or valid.
#
# The current insert (seq.insert_start..seq.insert_end) is first cut by the
# sequence's ActionLowQuality actions (cut_by) and the longest remaining piece
# is kept (select_the_best). Then exactly one action is added:
#
#   * 'ActionEmptyInsert' - nothing is left; the sequence is rejected;
#   * 'ActionShortInsert' - the piece is shorter than the
#                           'min_insert_size_trimmed' param; rejected too;
#   * 'ActionInsert'      - otherwise.
#
# Action positions are expressed relative to seq.insert_start.
#
# seq - sequence object.
#
# Returns the result of seq.add_actions.
def exec_seq(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"

  # Bounds that stand for an empty insert; replaced below when a piece survives.
  p_beg, p_end = 0, -1
  size_min_insert = nil

  if seq.seq_fasta.size > 0
    # Actions that do not cut by themselves are applied here.
    sub_inserts = [[seq.insert_start, seq.insert_end]]
    cut_by(seq.get_actions(ActionLowQuality), sub_inserts)

    unless sub_inserts.empty?
      best = select_the_best(sub_inserts)[0]
      p_beg, p_end = best[0], best[1]
      size_min_insert = @params.get_param('min_insert_size_trimmed').to_i
    end
  end

  insert_size = p_end - p_beg + 1
  actions = []

  if insert_size <= 0
    actions << seq.new_action(0, 0, "ActionEmptyInsert")
    add_stats('short_inserts', 0)

    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'empty insert'
  elsif insert_size < size_min_insert
    a_beg = p_beg - seq.insert_start
    a_end = p_end - seq.insert_start

    actions << seq.new_action(a_beg, a_end, "ActionShortInsert")
    add_stats('short_inserts', a_end - a_beg + 1)

    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'short insert'
  else
    a_beg = p_beg - seq.insert_start
    a_end = p_end - seq.insert_start

    actions << seq.new_action(a_beg, a_end, "ActionInsert")
    add_stats('inserts', a_end - a_beg + 1)
  end

  seq.add_actions(actions)
end
|
182
|
+
|
183
|
+
|
184
|
+
# Begins the plugin's execution to warn if the insert is too short (variant that does not cut by low-quality actions)
|
185
|
+
# Classifies the insert of one sequence using seq.insert_bounds as is, i.e.
# without cutting it by low-quality actions first.
#
# Adds exactly one action starting at position 0:
#   * 'ActionEmptyInsert' (0..0) when the insert length is not positive;
#   * 'ActionShortInsert' when it is below the 'min_insert_size_trimmed' param;
#   * 'ActionInsert' otherwise.
#
# Unlike exec_seq, this variant neither rejects the sequence nor adds stats.
#
# seq - sequence object answering seq_name, insert_bounds, new_action and
#       add_actions.
#
# Returns the result of seq.add_actions.
def execute_no_cut_quality(seq)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"

  # Size of the current insert.
  p_beg, p_end = seq.insert_bounds
  size_min_insert = @params.get_param('min_insert_size_trimmed').to_i
  insert_size = p_end - p_beg + 1

  type, last_pos =
    if insert_size <= 0
      ["ActionEmptyInsert", 0]
    elsif insert_size < size_min_insert
      ["ActionShortInsert", insert_size - 1]
    else
      ["ActionInsert", insert_size - 1]
    end

  seq.add_actions([seq.new_action(0, last_pos, type)])
end
|
225
|
+
|
226
|
+
#Returns an array with the errors due to parameters are missing
|
227
|
+
# Validates the parameters this plugin needs.
#
# Only 'min_insert_size_trimmed' (declared as 'Integer') is checked, through
# the class-level check_param helper, which receives the error list.
#
# params - parameters object handed over to check_param.
#
# Returns the Array of errors passed to check_param.
def self.check_params(params)
  problems = []
  self.check_param(problems, params, 'min_insert_size_trimmed', 'Integer')
  problems
end
|
242
|
+
|
243
|
+
|
244
|
+
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginVectors
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
class PluginVectors < Plugin
|
11
|
+
|
12
|
+
# MIN_VECTOR_SIZE=30
|
13
|
+
# MAX_TO_EXTREME=(MIN_VECTOR_SIZE/2).to_i
|
14
|
+
MAX_TARGETS_SEQS=20 #MAXIMUM NUMBER OF DIFFERENT ALIGNED SEQUENCES TO KEEP FROM BLAST DATABASE
|
15
|
+
|
16
|
+
# Tells whether a hit lies close to either end of the current sequence.
#
# c               - hit answering q_beg and q_end.
# seq             - sequence object answering seq_fasta.
# min_vector_size - number; half of it (truncated) is the allowed distance.
#
# Returns true when q_beg is less than that distance from position 0, or
# q_end plus that distance reaches the last index of seq.seq_fasta; false
# otherwise.
def near_to_extrem(c,seq,min_vector_size)
  margin = (min_vector_size / 2).to_i

  return true if c.q_beg - margin < 0

  last_index = seq.seq_fasta.size - 1
  c.q_end + margin >= last_index
end
|
20
|
+
|
21
|
+
# Tells whether a vector range falls completely inside the first linker
# action of the sequence.
#
# vector_beg, vector_end - vector bounds; seq.insert_start is added to both
#                          before comparing them with the linker positions.
# seq                    - sequence object answering get_actions and
#                          insert_start.
#
# Returns false when the sequence has no ActionLinker action; otherwise true
# only if the shifted range lies within start_pos..end_pos of the first one.
def all_vector_in_linker(vector_beg,vector_end,seq)
  linkers = seq.get_actions(ActionLinker)
  return false unless linkers.count >= 1

  first_linker = linkers[0]
  starts_inside = (vector_beg + seq.insert_start >= first_linker.start_pos)
  starts_inside && (vector_end + seq.insert_start <= first_linker.end_pos)
end
|
27
|
+
|
28
|
+
#Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
|
29
|
+
# Looks for vectors in a batch of sequences.
#
# All sequences are blasted in one go (do_blasts); afterwards each sequence is
# processed by exec_seq together with the query found at the same index in the
# blast result.
#
# seqs - indexable collection of sequence objects.
#
# Returns the value of seqs.each_with_index.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index do |sequence, position|
    exec_seq(sequence, blast_results.querys[position])
  end
end
|
36
|
+
|
37
|
+
# Blasts every sequence of the batch against the vectors database.
#
# The blastn command is built from the 'vectors_db', 'blast_evalue_vectors'
# and 'blast_percent_vectors' params and logged. The input is an Array with
# two entries per sequence: a ">name" header line and the current fasta.
#
# seqs - collection of sequence objects answering seq_name and seq_fasta.
#
# Returns whatever BatchBlast#do_blast returns for the :xml format.
def do_blasts(seqs)
  database_option = "-db #{@params.get_param('vectors_db')}"
  extra_options = " -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1"

  blast = BatchBlast.new(database_option, 'blastn', extra_options) # get vectors

  $LOG.info('BLAST:' + blast.get_blast_cmd)

  fastas = []
  seqs.each do |sequence|
    fastas.push(">" + sequence.seq_name, sequence.seq_fasta)
  end

  blast.do_blast(fastas, :xml)
end
|
58
|
+
|
59
|
+
|
60
|
+
# Turns the vector hits of one sequence into actions.
#
# The hits of blast_query are merged (merge_hits) until the number of merged
# vectors stops changing. A merged vector is reported when it is near an end
# of the sequence (near_to_extrem with a fixed size of 10) or at least
# 'min_vector_seq_presence' long, and, when the 'next_generation_sequences'
# param is 'true', it is not inside a linker / multiple-linker action.
#
# A reported vector that is not near an end (checked this time with
# 'min_vector_seq_presence') becomes an 'ActionUnexpectedVector' and the
# sequence is rejected; otherwise it becomes an 'ActionVectors'.
#
# Changes over the previous version:
#   * the action type is chosen per vector; it used to be set once before the
#     loop, so after one unexpected vector every later vector of the same
#     sequence was labelled 'ActionUnexpectedVector' as well;
#   * the inner stats loop no longer reuses the name of the outer block
#     variable;
#   * the loop-invariant 'next_generation_sequences' lookup is done once;
#   * the unused vectors_size local and the empty name-mismatch `if` are gone
#     (its raise was already disabled, so a mismatch is still tolerated).
#
# NOTE(review): check_params declares 'max_vector_to_end' but this method uses
# the literal 10 for the first proximity test - confirm which is intended.
#
# seq         - sequence object.
# blast_query - blast result for this sequence; only its hits are read.
#
# Returns the result of seq.add_actions.
def exec_seq(seq,blast_query)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for vectors into the sequence "

  vectors = []
  vectors_ids = []

  # First round: keep vectors without overlap and collect their ids.
  merge_hits(blast_query.hits, vectors, vectors_ids)

  # Further rounds until merging no longer changes the number of vectors.
  begin
    vectors2 = vectors
    vectors = []
    merge_hits(vectors2, vectors)
  end until (vectors2.count == vectors.count)

  actions = []
  min_vector_size = @params.get_param('min_vector_seq_presence').to_i
  piro_on = @params.get_param('next_generation_sequences')

  vectors.each do |v|
    vector_size = v.q_end - v.q_beg + 1

    next unless near_to_extrem(v, seq, 10) || (vector_size >= min_vector_size)

    # With piro on, vectors contained in detected linkers are ignored.
    outside_linkers = (piro_on == 'true') &&
                      !seq.range_inside_action_type?(v.q_beg, v.q_end, ActionLinker) &&
                      !seq.range_inside_action_type?(v.q_beg, v.q_end, ActionMultipleLinker)
    next unless outside_linkers || (piro_on == 'false')

    type = 'ActionVectors'

    # A vector that is not at an extreme is an unexpected vector.
    if !near_to_extrem(v, seq, min_vector_size)
      type = 'ActionUnexpectedVector'

      seq.seq_rejected = true
      seq.seq_rejected_by_message = 'unexpected vector'

      add_stats('rejected', 'unexpected_vector')
    end

    a = seq.new_action(v.q_beg, v.q_end, type)
    a.message = v.definition
    a.found_definition = vectors_ids
    a.reversed = v.reversed
    a.cut = false if (piro_on == 'true') # vectors don't cut when piro is on

    actions.push a

    add_stats('vector_size', vector_size)
    vectors_ids.each do |vector_id|
      add_stats('vectors_ids', vector_id)
    end
  end

  seq.add_actions(actions)
end
|
156
|
+
|
157
|
+
#Returns an array with the errors due to parameters are missing
|
158
|
+
# Declares and validates the parameters used by this plugin.
#
# Each parameter is passed to params.check_param with its name, type, default
# value and a descriptive comment; check_param also receives the error list.
# The database default is built from $FORMATTED_DB_PATH.
#
# params - parameters object answering check_param.
#
# Returns the Array of errors handed to check_param.
def self.check_params(params)
  errors = []

  # name, type, default value, comment
  definitions = [
    ['blast_evalue_vectors', 'Float', 1e-1,
     'Blast E-value used as cut-off when searching for vector fragments'],
    ['blast_percent_vectors', 'Integer', 90,
     'Minimum required identity (%) for a reliable vector fragment'],
    ['max_vector_to_end', 'Integer', 8,
     'Correct sequences could contain vectors only close to the read end (not within the sequence). The following variable indicates the number of nucleotides from the 5\' or 3\' end that are allowed for considering a vector fragment located at the end. Otherwise, the vector fragment will be qualified as internal and the sequence will be rejected'],
    ['min_vector_seq_presence', 'Integer', 50,
     'If a vector fragment is qualified as internal, the fragment should be long enough to be sure that it is a true vector fragment. This is the minimum length of a vector fragment that enables sequence rejection by an internal, unexpected vector']
  ]

  definitions.each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  # The database path is resolved last, after the checks above.
  vectors_db_default = File.join($FORMATTED_DB_PATH, 'vectors.fasta')
  params.check_param(errors, 'vectors_db', 'DB', vectors_db_default, 'Vectors database path')

  errors
end
|
187
|
+
|
188
|
+
|
189
|
+
|
190
|
+
|
191
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters to extract Amplicons
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
plugin_list = PluginLowHighSize,PluginKey,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginLowQuality,PluginAmplicons
|
6
|
+
|
7
|
+
# do not remove cloned sequences
|
8
|
+
remove_clonality=false
|
9
|
+
|
10
|
+
# sequences with different keys (barcodes) are saved to separate folders
|
11
|
+
use_independent_folder_for_each_key=true
|
12
|
+
|
13
|
+
# remove amplicons whose number of sequences is less than or equal to the indicated value
|
14
|
+
|
15
|
+
minimal_repetitions_for_amplicons=1
|
16
|
+
|
@@ -0,0 +1,5 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters GENOMICS WITH POSSIBLE LINKER
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginContaminants,PluginVectors,PluginLowQuality
|