seqtrimnext 2.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
require "plugin"
|
|
2
|
+
|
|
3
|
+
########################################################
|
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
|
5
|
+
#
|
|
6
|
+
# Defines the main methods that are necessary to execute PluginAmplicons
|
|
7
|
+
# Inherit: Plugin
|
|
8
|
+
########################################################
|
|
9
|
+
|
|
10
|
+
class PluginAmplicons < Plugin
|
|
11
|
+
|
|
12
|
+
# adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
|
|
13
|
+
# MIN_PRIMER_SIZE = 5
|
|
14
|
+
# MIN_FAR_ADAPTER_SIZE = 13
|
|
15
|
+
# MIN_LEFT_ADAPTER_SIZE = 9
|
|
16
|
+
# Begins the plugin's execution: looks for amplicon primers in every sequence of the batch "seqs"
|
|
17
|
+
# Processes a batch of sequences: a single call to do_blasts covers the whole
# batch, then exec_seq is invoked for each sequence together with the BLAST
# query stored at the same position in the results.
#
# seqs - collection of sequences to process.
def execute(seqs)
  batch_results = do_blasts(seqs)

  seqs.each_with_index { |sequence, position| exec_seq(sequence, batch_results.querys[position]) }
end
|
|
24
|
+
|
|
25
|
+
# Runs one BLAST search for the whole batch against the primers database.
#
# A blastn BatchBlast is configured from the 'primers_db' and
# 'blast_percent_primers' parameters (task blastn-short), its command is
# logged, and the batch is submitted as a flat list of FASTA lines
# (">name" followed by the sequence text of each entry).
#
# seqs - collection of sequences; seq_name and seq_fasta are read from each.
#
# Returns whatever BatchBlast#do_blast returns for the batch.
#
# NOTE(review): the 'blast_evalue_primers' parameter declared in
# check_params is not passed to BLAST here -- confirm whether that is intended.
def do_blasts(seqs)
  db_option = "-db #{@params.get_param('primers_db')}"
  search_options = " -task blastn-short -perc_identity #{@params.get_param('blast_percent_primers')}"
  primer_blast = BatchBlast.new(db_option, 'blastn', search_options)
  $LOG.info('BLAST:'+primer_blast.get_blast_cmd)

  # Two entries per sequence: the FASTA header and the sequence itself.
  fasta_lines = seqs.inject([]) do |lines, sequence|
    lines.push(">"+sequence.seq_name, sequence.seq_fasta)
  end

  primer_blast.do_blast(fasta_lines)
end
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Looks for an amplicon primer pair in +seq+ using its BLAST results and
# either rejects the sequence or annotates it with a left and a right
# primer action.
#
# seq         - sequence being processed; its name, fasta text, rejection
#               fields, actions and file tags are used here.
# blast_query - BLAST result for this sequence; query_id and hits are read.
#
# Steps:
#   1. hits smaller than the 'min_primer_size' parameter are discarded;
#   2. with fewer than two remaining hits the sequence is rejected
#      ('Primer pair not found');
#   3. a sequence whose fasta contains an 'N' is rejected
#      ('At least one N found');
#   4. otherwise the two largest hits are ordered by query start: the first
#      becomes an 'ActionLeftPrimer', the second an 'ActionRightPrimer', and
#      both subject ids are added as file tags.
#
# Returns the value of the last add_stats call, as before.
# Raises RuntimeError when blast_query.query_id differs from seq.seq_name.
def exec_seq(seq,blast_query)
  if blast_query.query_id != seq.seq_name
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
  end

  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"

  # Hits are deliberately not merged, since the id of each hit is important.
  min_primer_size = @params.get_param('min_primer_size').to_i
  primers = blast_query.hits.select { |primer| primer.size >= min_primer_size }
  # Largest first; same comparison as before so ties resolve the same way.
  primers = primers.sort { |p1, p2| p1.size <=> p2.size }.reverse

  return reject_amplicon_seq(seq, 'Primer pair not found', 'primers_not_found') if primers.count < 2
  return reject_amplicon_seq(seq, 'At least one N found', 'one_n_found') if seq.seq_fasta.index('N')

  # Take the two largest primers and order them by their start in the query.
  left_primer, right_primer = primers[0..1].sort { |p1, p2| p1.q_beg <=> p2.q_beg }

  left = amplicon_primer_action(seq, left_primer, 'ActionLeftPrimer')
  left.left_action = true

  right = amplicon_primer_action(seq, right_primer, 'ActionRightPrimer')
  right.right_action = true

  seq.add_file_tag(2, left_primer.subject_id, :file)
  seq.add_file_tag(2, right_primer.subject_id, :file)

  seq.add_actions([left, right])
  add_stats('sequences_with_primers','count')
end

# Marks +seq+ as rejected with +message+ and counts +reason+ under the
# 'rejected' stats entry. Returns the value of add_stats.
def reject_amplicon_seq(seq, message, reason)
  seq.seq_rejected = true
  seq.seq_rejected_by_message = message
  add_stats('rejected', reason)
end

# Creates an action of +action_type+ covering the query range of +primer+,
# labels it with the primer's subject id and orientation, and records the
# primer size and id in the stats. Returns the new action.
def amplicon_primer_action(seq, primer, action_type)
  action = seq.new_action(primer.q_beg, primer.q_end, action_type)
  action.message = primer.subject_id
  action.tag_id = primer.subject_id
  action.reversed = primer.reversed

  add_stats('primer_size', primer.size)
  add_stats('primer_id', primer.subject_id)

  action
end

private :reject_amplicon_seq, :amplicon_primer_action
|
|
145
|
+
|
|
146
|
+
#Returns an array with the errors due to parameters are missing
|
|
147
|
+
# Declares the parameters this plugin needs by calling params.check_param
# once per parameter, in a fixed order, with a shared errors array.
#
# params - object providing check_param(errors, name, type, default, comment).
#
# Returns the errors array handed to every check_param call.
def self.check_params(params)
  errors = []

  # name, type, default value, comment shown for the parameter
  required_params = [
    # a default of 1e-6 was used for the E-value at some point
    ['blast_evalue_primers', 'Float', 1, 'Blast E-value used as cut-off when searching for primers'],
    ['blast_percent_primers', 'Integer', 95, 'Minimum required identity (%) for a reliable primer'],
    ['min_primer_size', 'Integer', 15, 'Minimun primer size'],
    # no default database: File.join($FORMATTED_DB_PATH,'adapters_ab.fasta') is no longer used
    ['primers_db', 'DB', nil, 'Path for primers database']
  ]

  required_params.each do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  errors
end
|
|
170
|
+
|
|
171
|
+
# def self.get_graph_title(plugin_name,stats_name)
|
|
172
|
+
# case stats_name
|
|
173
|
+
# when 'adapter_type'
|
|
174
|
+
# 'AB adapters by type'
|
|
175
|
+
# when 'adapter_size'
|
|
176
|
+
# 'AB adapters by size'
|
|
177
|
+
# end
|
|
178
|
+
# end
|
|
179
|
+
#
|
|
180
|
+
# def self.get_graph_filename(plugin_name,stats_name)
|
|
181
|
+
# return stats_name
|
|
182
|
+
#
|
|
183
|
+
# # case stats_name
|
|
184
|
+
# # when 'adapter_type'
|
|
185
|
+
# # 'AB adapters by type'
|
|
186
|
+
# # when 'adapter_size'
|
|
187
|
+
# # 'AB adapters by size'
|
|
188
|
+
# # end
|
|
189
|
+
# end
|
|
190
|
+
#
|
|
191
|
+
# def self.valid_graphs
|
|
192
|
+
# return ['adapter_type']
|
|
193
|
+
# end
|
|
194
|
+
|
|
195
|
+
# def self.plot_setup(stats_value,stats_name,x,y,init_stats,plot)
|
|
196
|
+
#
|
|
197
|
+
# # puts "============== #{stats_name}"
|
|
198
|
+
#
|
|
199
|
+
# # puts stats_name
|
|
200
|
+
# case stats_name
|
|
201
|
+
#
|
|
202
|
+
# when 'primer_size'
|
|
203
|
+
# plot.x_label= "Length"
|
|
204
|
+
# plot.y_label= "Count"
|
|
205
|
+
# # plot.x_range="[0:#{init_stats['biggest_sequence_size'].to_i}]"
|
|
206
|
+
# plot.x_range="[0:200]"
|
|
207
|
+
# puts x.class
|
|
208
|
+
# plot.add_x(x)
|
|
209
|
+
# plot.add_y(y)
|
|
210
|
+
#
|
|
211
|
+
# plot.do_graph
|
|
212
|
+
#
|
|
213
|
+
# return true
|
|
214
|
+
# else
|
|
215
|
+
# return false
|
|
216
|
+
# end
|
|
217
|
+
#
|
|
218
|
+
# end
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
require "plugin"
|
|
2
|
+
|
|
3
|
+
require "make_blast_db"
|
|
4
|
+
########################################################
|
|
5
|
+
# Author: Almudena Bocinos Rioboo
|
|
6
|
+
#
|
|
7
|
+
# Defines the main methods that are necessary to execute PluginContaminants
|
|
8
|
+
# Inherit: Plugin
|
|
9
|
+
########################################################
|
|
10
|
+
|
|
11
|
+
class PluginContaminants < Plugin
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
MAX_TARGETS_SEQS=4 #MAXIMUM NUMBER OF DIFFERENT ALIGNED SEQUENCES TO KEEP FROM BLAST DATABASE
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Tells whether hit +c+ lies close to either end of +seq+.
#
# The tolerated distance is half of +min_cont_size+ (truncated to an
# integer). The hit is "near" when q_beg minus that distance is negative, or
# when q_end plus that distance reaches the last index of seq.seq_fasta.
#
# Returns true or false.
def near_to_extrem(c,seq,min_cont_size)
  margin = (min_cont_size / 2).to_i

  return true if c.q_beg - margin < 0

  c.q_end + margin >= seq.seq_fasta.size - 1
end
|
|
21
|
+
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
|
22
|
+
# Processes a batch of sequences: do_blasts is called once for the whole
# batch, then exec_seq handles each sequence with the BLAST query found at
# the same index of the results.
#
# seqs - collection of sequences to process.
def execute(seqs)
  blast_results = do_blasts(seqs)

  seqs.each_with_index { |current_seq, idx| exec_seq(current_seq, blast_results.querys[idx]) }
end
|
|
29
|
+
|
|
30
|
+
# Runs one BLAST search for the whole batch against the contaminants database.
#
# A blastn BatchBlast is configured from the 'contaminants_db',
# 'blast_evalue_contaminants' and 'blast_percent_contaminants' parameters,
# its XML command is logged, and the batch is submitted as a flat list of
# FASTA lines (">name" followed by the sequence text), asking for :xml.
#
# seqs - collection of sequences; seq_name and seq_fasta are read from each.
#
# Returns whatever BatchBlast#do_blast returns for the batch.
#
# TODO - Culling limit = 2, because blast fails with this command when it is
#        given cl=1 and dust=no together with a low-complexity input sequence.
# NOTE(review): the command below still passes -culling_limit 1, and an
# earlier variant used "-task blastn-short" -- confirm which is intended.
def do_blasts(seqs)
  db_option = "-db #{@params.get_param('contaminants_db')}"
  search_options = " -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1"
  contaminants_blast = BatchBlast.new(db_option, 'blastn', search_options)

  $LOG.info('BLAST:'+contaminants_blast.get_blast_cmd(:xml))

  # Two entries per sequence: the FASTA header and the sequence itself.
  fasta_lines = seqs.inject([]) do |lines, sequence|
    lines.push(">"+sequence.seq_name, sequence.seq_fasta)
  end

  contaminants_blast.do_blast(fasta_lines, :xml)
end
|
|
60
|
+
|
|
61
|
+
# TODO - Contaminants databases grouped by folders
|
|
62
|
+
# TODO - User can select a set of contaminants folders
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Turns the BLAST hits of +seq+ against the contaminants database into
# 'ActionIsContaminated' actions and, when configured, rejects the sequence.
#
# seq         - sequence being processed.
# blast_query - BLAST result for this sequence; its hits are read.
#
# A merged hit produces an action only when all of these hold:
#   * the 'genus' parameter is empty or does not occur (case-insensitively)
#     in the hit definition;
#   * the hit spans at least 'min_contam_seq_presence' positions of the query;
#   * seq.range_inside_action_type? is false for the hit range and ActionVectors.
#
# When at least one action was created and the 'contaminants_reject'
# parameter equals the string 'true', the sequence is marked as rejected
# ('contaminated').
#
# Returns the value of seq.add_actions.
#
# NOTE: unlike PluginAmplicons#exec_seq, blast_query.query_id is not checked
# against seq.seq_name; the raise for that mismatch had been commented out.
def exec_seq(seq,blast_query)
  $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"

  type = "ActionIsContaminated"

  contaminants = []
  contaminants_ids = []

  # First round: collect the hits and the ids/definitions of the contaminants
  # (merge_hits is defined outside this file; presumably it joins overlapping hits).
  merge_hits(blast_query.hits,contaminants,contaminants_ids)

  # Further rounds: merge again until a round leaves the hit count unchanged.
  loop do
    previous_round = contaminants
    contaminants = []
    merge_hits(previous_round,contaminants)
    break if previous_round.count == contaminants.count
  end

  actions = []

  # Make sure both stats entries exist, but keep whatever earlier sequences
  # of the same batch have already accumulated (plain assignment wiped it).
  @stats['contaminants_size'] ||= {}
  @stats['rejected_seqs'] ||= {}

  min_cont_size = @params.get_param('min_contam_seq_presence').to_i
  genus = @params.get_param('genus')

  contaminants.each do |c|
    contaminants_size = c.q_end - c.q_beg + 1

    # Hits belonging to the genus of the input data are not contaminations.
    if !genus.empty? && c.definition.upcase.index(genus.upcase)
      $LOG.info('Contaminant ignored due to genus match: '+c.definition)
      next
    end

    if contaminants_size < min_cont_size
      $LOG.info('Contaminant ignored due to size below min_contam_seq_presence: '+c.definition)
      next
    end

    # Skip hits for which the sequence reports the range inside ActionVectors.
    next if seq.range_inside_action_type?(c.q_beg,c.q_end,ActionVectors)

    a = seq.new_action(c.q_beg,c.q_end,type) # adds the correspondent action to the sequence
    a.message = c.definition
    a.found_definition = contaminants_ids # save the contaminants definitions, each separately
    actions.push a

    add_stats('contaminants_size',contaminants_size)
    contaminants_ids.each do |contaminant_id|
      add_stats('contaminants_ids',contaminant_id)
    end
  end

  reject = @params.get_param('contaminants_reject')

  if !actions.empty? && reject == 'true'
    # reject the whole sequence
    seq.seq_rejected = true
    seq.seq_rejected_by_message = 'contaminated'

    add_stats('rejected','contaminated')
  end

  seq.add_actions(actions)
end
|
|
173
|
+
|
|
174
|
+
#Returns an array with the errors due to parameters are missing
|
|
175
|
+
# Declares the parameters this plugin needs by calling params.check_param
# once per parameter, in a fixed order, with a shared errors array.
#
# params - object providing check_param(errors, name, type, default, comment).
#
# Returns the errors array handed to every check_param call.
def self.check_params(params)
  errors = []
  declare = lambda do |name, type, default_value, comment|
    params.check_param(errors, name, type, default_value, comment)
  end

  declare.call('blast_evalue_contaminants', 'Float', 1e-10,
               'Blast E-value used as cut-off when searching for contaminations')

  declare.call('blast_percent_contaminants', 'Integer', 85,
               'Minimum required identity (%) for a reliable contamination')

  declare.call('min_contam_seq_presence', 'Integer', 40,
               'Minimum hit size (nt) for considering a true contamination')

  declare.call('genus', 'String', '',
               'Genus of input data: contaminations belonging to this genus will be ignored')

  declare.call('contaminants_reject', 'String', 'true',
               'Is a contamination considered a source of sequence rejection? (setting to false will only trim contaminated sequences instead of rejecting the complete read)')

  # The default database lives under the global formatted-databases path.
  declare.call('contaminants_db', 'DB', File.join($FORMATTED_DB_PATH,'contaminants.fasta'),
               'Path for contaminants database')

  errors
end
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
end
|