seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,221 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginAmplicons
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
class PluginAmplicons < Plugin
|
11
|
+
|
12
|
+
# adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
|
13
|
+
# MIN_PRIMER_SIZE = 5
|
14
|
+
# MIN_FAR_ADAPTER_SIZE = 13
|
15
|
+
# MIN_LEFT_ADAPTER_SIZE = 9
|
16
|
+
# Begins the plugin's execution: looks for the amplicon primers in every sequence of "seqs"
|
17
|
+
# Entry point for a batch of sequences.
#
# Calls do_blasts once for the whole batch and then hands every sequence,
# together with the blast query found at the same position, to exec_seq.
#
# seqs:: collection of sequences to process
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each.with_index do |sequence, position|
    exec_seq(sequence, blast_results.querys[position])
  end
end
|
24
|
+
|
25
|
+
# Blasts the whole batch against the primers database.
#
# The command is built from the 'primers_db' and 'blast_percent_primers'
# parameters and is written to the log before running.
#
# seqs:: sequences responding to seq_name and seq_fasta
#
# Returns the value given back by BatchBlast#do_blast.
#
# NOTE(review): 'blast_evalue_primers' is declared in check_params but is not
# added to the blast options here - confirm whether that is intentional.
def do_blasts(seqs)
  db_option = "-db #{@params.get_param('primers_db')}"
  search_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_primers')}"

  blast = BatchBlast.new(db_option, 'blastn', search_options)
  $LOG.info('BLAST:' + blast.get_blast_cmd)

  # Input is passed as a flat list: a ">name" entry followed by its sequence.
  fasta_entries = seqs.flat_map { |seq| [">" + seq.seq_name, seq.seq_fasta] }

  blast.do_blast(fasta_entries)
end
|
45
|
+
|
46
|
+
|
47
|
+
# Looks for the amplicon primer pair of one sequence using its blast hits.
#
# seq::         sequence being processed
# blast_query:: blast result for seq; its query_id must be equal to seq.seq_name
#
# Hits shorter than the 'min_primer_size' parameter are discarded. The sequence
# is rejected when fewer than two hits remain, or when it contains an 'N'.
# Otherwise the two longest hits, ordered by q_beg, produce an ActionLeftPrimer
# and an ActionRightPrimer, and both primer ids are added as file tags.
#
# Raises RuntimeError when blast_query is missing or belongs to another sequence.
def exec_seq(seq, blast_query)
  # A missing result would otherwise surface as an opaque NoMethodError on nil.
  raise "Blast result not found for seq: #{seq.seq_name}" if blast_query.nil?

  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"

  min_primer_size = @params.get_param('min_primer_size').to_i

  # Hits are not merged, since the id of every hit is important.
  # Keep the hits that are long enough, longest first.
  primers = blast_query.hits.select { |primer| primer.size >= min_primer_size }
  primers = primers.sort { |p1, p2| p1.size <=> p2.size }.reverse

  if primers.count < 2
    # reject sequences with less than two primers
    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'Primer pair not found'
    add_stats('rejected', 'primers_not_found')
  elsif seq.seq_fasta.index('N')
    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'At least one N found'
    add_stats('rejected', 'one_n_found')
  else
    # Take the two longest primers and order them by their start in the query.
    left_primer, right_primer = primers[0..1].sort { |p1, p2| p1.q_beg <=> p2.q_beg }

    left_action = seq.new_action(left_primer.q_beg, left_primer.q_end, 'ActionLeftPrimer')
    left_action.message = left_primer.subject_id
    left_action.tag_id = left_primer.subject_id
    left_action.reversed = left_primer.reversed
    left_action.left_action = true

    add_stats('primer_size', left_primer.size)
    add_stats('primer_id', left_primer.subject_id)

    right_action = seq.new_action(right_primer.q_beg, right_primer.q_end, 'ActionRightPrimer')
    right_action.message = right_primer.subject_id
    right_action.reversed = right_primer.reversed
    right_action.tag_id = right_primer.subject_id
    right_action.right_action = true

    add_stats('primer_size', right_primer.size)
    add_stats('primer_id', right_primer.subject_id)

    seq.add_file_tag(2, left_primer.subject_id, :file)
    seq.add_file_tag(2, right_primer.subject_id, :file)

    seq.add_actions([left_action, right_action])
    add_stats('sequences_with_primers', 'count')
  end
end
|
145
|
+
|
146
|
+
# Returns an array with the errors found while checking the plugin's parameters
|
147
|
+
# Declares the parameters used by this plugin and validates them.
#
# Every parameter is passed to params.check_param together with its expected
# type, its default value and a descriptive comment.
#
# params:: object responding to check_param(errors, name, type, default, comment)
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  # name, type, default value, comment ('primers_db' has no default)
  definitions = [
    ['blast_evalue_primers', 'Float', 1, 'Blast E-value used as cut-off when searching for primers'],
    ['blast_percent_primers', 'Integer', 95, 'Minimum required identity (%) for a reliable primer'],
    ['min_primer_size', 'Integer', 15, 'Minimun primer size'],
    ['primers_db', 'DB', nil, 'Path for primers database']
  ]

  definitions.each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  errors
end
|
170
|
+
|
171
|
+
# def self.get_graph_title(plugin_name,stats_name)
|
172
|
+
# case stats_name
|
173
|
+
# when 'adapter_type'
|
174
|
+
# 'AB adapters by type'
|
175
|
+
# when 'adapter_size'
|
176
|
+
# 'AB adapters by size'
|
177
|
+
# end
|
178
|
+
# end
|
179
|
+
#
|
180
|
+
# def self.get_graph_filename(plugin_name,stats_name)
|
181
|
+
# return stats_name
|
182
|
+
#
|
183
|
+
# # case stats_name
|
184
|
+
# # when 'adapter_type'
|
185
|
+
# # 'AB adapters by type'
|
186
|
+
# # when 'adapter_size'
|
187
|
+
# # 'AB adapters by size'
|
188
|
+
# # end
|
189
|
+
# end
|
190
|
+
#
|
191
|
+
# def self.valid_graphs
|
192
|
+
# return ['adapter_type']
|
193
|
+
# end
|
194
|
+
|
195
|
+
# def self.plot_setup(stats_value,stats_name,x,y,init_stats,plot)
|
196
|
+
#
|
197
|
+
# # puts "============== #{stats_name}"
|
198
|
+
#
|
199
|
+
# # puts stats_name
|
200
|
+
# case stats_name
|
201
|
+
#
|
202
|
+
# when 'primer_size'
|
203
|
+
# plot.x_label= "Length"
|
204
|
+
# plot.y_label= "Count"
|
205
|
+
# # plot.x_range="[0:#{init_stats['biggest_sequence_size'].to_i}]"
|
206
|
+
# plot.x_range="[0:200]"
|
207
|
+
# puts x.class
|
208
|
+
# plot.add_x(x)
|
209
|
+
# plot.add_y(y)
|
210
|
+
#
|
211
|
+
# plot.do_graph
|
212
|
+
#
|
213
|
+
# return true
|
214
|
+
# else
|
215
|
+
# return false
|
216
|
+
# end
|
217
|
+
#
|
218
|
+
# end
|
219
|
+
|
220
|
+
|
221
|
+
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
require "make_blast_db"
|
4
|
+
########################################################
|
5
|
+
# Author: Almudena Bocinos Rioboo
|
6
|
+
#
|
7
|
+
# Defines the main methods that are necessary to execute PluginContaminants
|
8
|
+
# Inherit: Plugin
|
9
|
+
########################################################
|
10
|
+
|
11
|
+
class PluginContaminants < Plugin
|
12
|
+
|
13
|
+
|
14
|
+
MAX_TARGETS_SEQS=4 #MAXIMUM NUMBER OF DIFFERENT ALIGNED SEQUENCES TO KEEP FROM BLAST DATABASE
|
15
|
+
|
16
|
+
|
17
|
+
# Tells whether hit c lies close to either end of the sequence.
#
# c::             hit responding to q_beg and q_end
# seq::           sequence responding to seq_fasta
# min_cont_size:: minimum contaminant size; half of it is the allowed distance
#
# Returns true when the hit starts less than that distance from the beginning,
# or when its end plus that distance reaches the last position of seq_fasta.
def near_to_extrem(c,seq,min_cont_size)
  margin = (min_cont_size / 2).to_i

  return true if c.q_beg - margin < 0

  last_position = seq.seq_fasta.size - 1
  c.q_end + margin >= last_position
end
|
21
|
+
# Begins the plugin's execution: looks for contaminants in every sequence of "seqs"
|
22
|
+
# Entry point for a batch of sequences.
#
# Calls do_blasts once for the whole batch and then hands every sequence,
# together with the blast query found at the same position, to exec_seq.
#
# seqs:: collection of sequences to process
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each.with_index do |sequence, position|
    exec_seq(sequence, blast_results.querys[position])
  end
end
|
29
|
+
|
30
|
+
# Blasts the whole batch against the contaminants database, asking for XML output.
#
# The command is built from the 'contaminants_db', 'blast_evalue_contaminants'
# and 'blast_percent_contaminants' parameters and is logged before running.
#
# seqs:: sequences responding to seq_name and seq_fasta
#
# Returns the value given back by BatchBlast#do_blast.
def do_blasts(seqs)
  # TODO - Culling limit = 2, because blast fails with this command when it is
  # given cl=1 and dust=no and a low complexity sequence as input.
  # NOTE(review): the command below still uses -culling_limit 1.
  db_option = "-db #{@params.get_param('contaminants_db')}"
  search_options = " -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')}" \
                   " -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1"

  blast = BatchBlast.new(db_option, 'blastn', search_options)
  $LOG.info('BLAST:' + blast.get_blast_cmd(:xml))

  # Input is passed as a flat list: a ">name" entry followed by its sequence.
  fasta_entries = seqs.flat_map { |seq| [">" + seq.seq_name, seq.seq_fasta] }

  blast.do_blast(fasta_entries, :xml)
end
|
60
|
+
|
61
|
+
# TODO - Contaminants databases grouped by folders
|
62
|
+
# TODO - User can select a set of contaminants folders
|
63
|
+
|
64
|
+
|
65
|
+
# Marks the contaminated regions of one sequence using its blast hits.
#
# seq::         sequence being processed
# blast_query:: blast result for seq
#
# The hits are passed through merge_hits until a round no longer changes their
# number. A remaining hit produces an ActionIsContaminated when:
# * its definition does not contain the 'genus' parameter (case-insensitive),
#   or that parameter is empty,
# * it spans at least 'min_contam_seq_presence' nucleotides, and
# * its range is not inside an ActionVectors action of the sequence.
# When at least one action was created and 'contaminants_reject' is 'true',
# the sequence is rejected. The actions are finally added to the sequence.
#
# Returns the value of seq.add_actions.
def exec_seq(seq, blast_query)
  # The query_id / seq_name consistency check is disabled in this plugin (its
  # raise was commented out), so a mismatch is not treated as an error here.

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"

  # First round to save contaminants without overlap, collecting their ids.
  contaminants = []
  contaminants_ids = []
  merge_hits(blast_query.hits, contaminants, contaminants_ids)

  # Further rounds, until a round gives back as many hits as it received.
  loop do
    previous_round = contaminants
    contaminants = []
    merge_hits(previous_round, contaminants)
    break if previous_round.count == contaminants.count
  end

  # Make sure both keys exist without discarding what is already stored:
  # exec_seq runs once per sequence of the batch, and assigning a new hash
  # here on every call emptied these entries each time.
  # NOTE(review): assumes add_stats accumulates into @stats - confirm in Plugin.
  @stats['contaminants_size'] ||= {}
  @stats['rejected_seqs'] ||= {}

  min_cont_size = @params.get_param('min_contam_seq_presence').to_i
  genus = @params.get_param('genus')
  actions = []

  contaminants.each do |c|
    contaminants_size = c.q_end - c.q_beg + 1
    same_genus = !genus.empty? && c.definition.upcase.index(genus.upcase)

    if same_genus
      $LOG.info('Contaminant ignored due to genus match: ' + c.definition)
    elsif contaminants_size < min_cont_size
      # Previously reported as a genus match, which was misleading.
      $LOG.info("Contaminant ignored due to size (#{contaminants_size} < #{min_cont_size}): " + c.definition)
    elsif !seq.range_inside_action_type?(c.q_beg, c.q_end, ActionVectors)
      action = seq.new_action(c.q_beg, c.q_end, 'ActionIsContaminated')
      action.message = c.definition
      action.found_definition = contaminants_ids # every contaminant definition, each separately
      actions.push action

      add_stats('contaminants_size', contaminants_size)
      contaminants_ids.each { |contaminant_id| add_stats('contaminants_ids', contaminant_id) }
    end
  end

  reject = @params.get_param('contaminants_reject')

  if !actions.empty? && reject == 'true'
    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'contaminated'
    add_stats('rejected', 'contaminated')
  end

  seq.add_actions(actions)
end
|
173
|
+
|
174
|
+
# Returns an array with the errors found while checking the plugin's parameters
|
175
|
+
# Declares the parameters used by this plugin and validates them.
#
# Every parameter is passed to params.check_param together with its expected
# type, its default value and a descriptive comment.
#
# params:: object responding to check_param(errors, name, type, default, comment)
#
# Returns the errors array that was handed to every check_param call.
def self.check_params(params)
  errors = []

  # name, type, default value, comment
  definitions = [
    ['blast_evalue_contaminants', 'Float', 1e-10,
     'Blast E-value used as cut-off when searching for contaminations'],
    ['blast_percent_contaminants', 'Integer', 85,
     'Minimum required identity (%) for a reliable contamination'],
    ['min_contam_seq_presence', 'Integer', 40,
     'Minimum hit size (nt) for considering a true contamination'],
    ['genus', 'String', '',
     'Genus of input data: contaminations belonging to this genus will be ignored'],
    ['contaminants_reject', 'String', 'true',
     'Is a contamination considered a source of sequence rejection? (setting to false will only trim contaminated sequences instead of rejecting the complete read)']
  ]

  definitions.each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  # The database default depends on $FORMATTED_DB_PATH, so it is built last,
  # after the parameters above have already been checked.
  default_db = File.join($FORMATTED_DB_PATH, 'contaminants.fasta')
  params.check_param(errors, 'contaminants_db', 'DB', default_db, 'Path for contaminants database')

  errors
end
|
207
|
+
|
208
|
+
|
209
|
+
end
|