seqtrimnext 2.0.51 → 2.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -15,23 +15,48 @@ class Plugin
|
|
15
15
|
attr_accessor :stats
|
16
16
|
|
17
17
|
#Loads the plugin's execution with the sequence "seq"
|
18
|
-
def initialize(seq,params)
|
19
|
-
# $LOG.debug self.class.to_s + " processing sequence: " + seq.seq_name
|
20
|
-
# if (!(self.class.to_s=='PluginLowQuality') )
|
18
|
+
def initialize(seq, params)
|
21
19
|
@params = params
|
22
20
|
@stats ={}
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
21
|
+
|
22
|
+
if can_execute?
|
23
|
+
execute(seq)
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
def can_execute?
|
29
|
+
return true
|
28
30
|
end
|
29
31
|
|
30
32
|
#Begins the plugin's execution with the sequence "seq"
|
31
33
|
def execute(seqs)
|
34
|
+
blasts=do_blasts(seqs)
|
35
|
+
|
36
|
+
if !blasts.empty?
|
37
|
+
|
38
|
+
if blasts.is_a?(Array)
|
39
|
+
queries=blasts
|
40
|
+
else
|
41
|
+
queries = blasts.querys
|
42
|
+
end
|
43
|
+
|
44
|
+
seqs.each_with_index do |s,i|
|
45
|
+
exec_seq(s,queries[i])
|
46
|
+
end
|
47
|
+
|
48
|
+
else # there is no blast
|
32
49
|
|
50
|
+
seqs.each do |s|
|
51
|
+
exec_seq(s,nil)
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
33
55
|
end
|
34
56
|
|
57
|
+
def do_blasts(seqs)
|
58
|
+
return []
|
59
|
+
end
|
35
60
|
|
36
61
|
#Initializes the structure stats to the given key and value, only when it is necessary, and increases its counter
|
37
62
|
def add_stats(key,value)
|
@@ -69,14 +94,19 @@ class Plugin
|
|
69
94
|
return ((r1_start<=r2_end+1) and (r1_end>=r2_start-1) )
|
70
95
|
end
|
71
96
|
|
72
|
-
def merge_hits(hits,merged_hits,merged_ids=nil)
|
97
|
+
def merge_hits(hits,merged_hits,merged_ids=nil, merge_different_ids=true)
|
73
98
|
# puts " merging ============"
|
74
99
|
# hits.each do |hit|
|
75
100
|
hits.sort{|h1,h2| (h1.q_end-h1.q_beg+1)<=>(h2.q_end-h2.q_beg+1)}.reverse_each do |hit|
|
76
101
|
|
77
102
|
merged_ids.push hit.definition if !merged_ids.nil? && (! merged_ids.include?(hit.definition))
|
78
103
|
# if new hit's position is already contained in hits, then ignore the new hit
|
79
|
-
|
104
|
+
if merge_different_ids
|
105
|
+
c=merged_hits.find{|c| overlapX?(hit.q_beg, hit.q_end,c.q_beg,c.q_end)}
|
106
|
+
else
|
107
|
+
# overlap with existent hit and same subject id
|
108
|
+
c=merged_hits.find{|c| (overlapX?(hit.q_beg, hit.q_end,c.q_beg,c.q_end) && (hit.subject_id==c.subject_id))}
|
109
|
+
end
|
80
110
|
# puts " c #{c.inspect}"
|
81
111
|
|
82
112
|
if (c.nil?)
|
@@ -89,10 +119,10 @@ class Plugin
|
|
89
119
|
|
90
120
|
# one is inside each other, just ignore
|
91
121
|
if ((hit.q_beg>=c.q_beg && hit.q_end <=c.q_end) || (c.q_beg>=hit.q_beg && c.q_end <= hit.q_end))
|
92
|
-
|
122
|
+
# puts "* #{hit.subject_id} inside #{c.subject_id}"
|
93
123
|
else
|
94
124
|
# merge with old contaminant
|
95
|
-
|
125
|
+
# puts "#{hit.subject_id} NOT inside #{c.subject_id}"
|
96
126
|
min=[c.q_beg,hit.q_beg].min
|
97
127
|
max=[c.q_end,hit.q_end].max
|
98
128
|
|
@@ -13,16 +13,9 @@ class PluginAbAdapters < Plugin
|
|
13
13
|
MIN_ADAPTER_SIZE = 5
|
14
14
|
MIN_FAR_ADAPTER_SIZE = 13
|
15
15
|
MIN_LEFT_ADAPTER_SIZE = 9
|
16
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
17
|
-
def execute(seqs)
|
18
|
-
blasts= do_blasts(seqs)
|
19
|
-
|
20
|
-
seqs.each_with_index do |s,i|
|
21
|
-
exec_seq(s,blasts.querys[i])
|
22
|
-
end
|
23
|
-
end
|
24
16
|
|
25
17
|
def do_blasts(seqs)
|
18
|
+
|
26
19
|
# find MIDS with less results than max_target_seqs value
|
27
20
|
blast=BatchBlast.new("-db #{@params.get_param('adapters_ab_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
|
28
21
|
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
@@ -12,15 +12,6 @@ class PluginAdapters < Plugin
|
|
12
12
|
# adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
|
13
13
|
MIN_ADAPTER_SIZE = 5
|
14
14
|
MIN_FAR_ADAPTER_SIZE = 13
|
15
|
-
# MIN_LEFT_ADAPTER_SIZE = 9
|
16
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
17
|
-
def execute(seqs)
|
18
|
-
blasts= do_blasts(seqs)
|
19
|
-
|
20
|
-
seqs.each_with_index do |s,i|
|
21
|
-
exec_seq(s,blasts.querys[i])
|
22
|
-
end
|
23
|
-
end
|
24
15
|
|
25
16
|
def do_blasts(seqs)
|
26
17
|
# find MIDS with less results than max_target_seqs value
|
@@ -9,18 +9,6 @@ require "plugin"
|
|
9
9
|
|
10
10
|
class PluginAmplicons < Plugin
|
11
11
|
|
12
|
-
# adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
|
13
|
-
# MIN_PRIMER_SIZE = 5
|
14
|
-
# MIN_FAR_ADAPTER_SIZE = 13
|
15
|
-
# MIN_LEFT_ADAPTER_SIZE = 9
|
16
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
17
|
-
def execute(seqs)
|
18
|
-
blasts= do_blasts(seqs)
|
19
|
-
|
20
|
-
seqs.each_with_index do |s,i|
|
21
|
-
exec_seq(s,blasts.querys[i])
|
22
|
-
end
|
23
|
-
end
|
24
12
|
|
25
13
|
def do_blasts(seqs)
|
26
14
|
# find MIDS with less results than max_target_seqs value
|
@@ -18,14 +18,6 @@ class PluginContaminants < Plugin
|
|
18
18
|
max_to_extreme=(min_cont_size/2).to_i
|
19
19
|
return ((c.q_beg-max_to_extreme<0) || (( c.q_end+max_to_extreme)>=seq.seq_fasta.size-1) ) #return if vector is very near to the extremes of insert)
|
20
20
|
end
|
21
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
22
|
-
def execute(seqs)
|
23
|
-
blasts= do_blasts(seqs)
|
24
|
-
|
25
|
-
seqs.each_with_index do |s,i|
|
26
|
-
exec_seq(s,blasts.querys[i])
|
27
|
-
end
|
28
|
-
end
|
29
21
|
|
30
22
|
def do_blasts(seqs)
|
31
23
|
# find MIDS with less results than max_target_seqs value
|
@@ -63,6 +55,11 @@ class PluginContaminants < Plugin
|
|
63
55
|
|
64
56
|
|
65
57
|
def exec_seq(seq,blast_query)
|
58
|
+
|
59
|
+
if seq.get_actions(ActionUserContaminant)
|
60
|
+
return
|
61
|
+
end
|
62
|
+
|
66
63
|
if blast_query.query_id != seq.seq_name
|
67
64
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
68
65
|
end
|
@@ -276,16 +276,7 @@ class PluginExtractInserts < Plugin
|
|
276
276
|
return sub_inserts
|
277
277
|
end
|
278
278
|
|
279
|
-
|
280
|
-
# Begins the plugin1's execution to warn that there is linker into the sequence
|
281
|
-
def execute(seqs)
|
282
|
-
seqs.each do |s|
|
283
|
-
exec_seq(s)
|
284
|
-
end
|
285
|
-
end
|
286
|
-
|
287
|
-
|
288
|
-
def exec_seq(seq)
|
279
|
+
def exec_seq(seq,blast_query)
|
289
280
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
|
290
281
|
|
291
282
|
# puts "INSERTO ANTES LINKER INSERT:"+seq.seq_fasta
|
@@ -232,9 +232,6 @@ class PluginFindPolyAt < Plugin
|
|
232
232
|
end
|
233
233
|
|
234
234
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
235
|
# if (poly['begin']<=MAX_POLY_T_FROM_LEFT*2)
|
239
236
|
# seq.seq_rejected=true
|
240
237
|
# seq.seq_rejected_by_message='polyT found'
|
@@ -311,14 +308,7 @@ class PluginFindPolyAt < Plugin
|
|
311
308
|
|
312
309
|
|
313
310
|
|
314
|
-
|
315
|
-
seqs.each do |s|
|
316
|
-
exec_seq(s)
|
317
|
-
end
|
318
|
-
end
|
319
|
-
|
320
|
-
|
321
|
-
def exec_seq(seq)
|
311
|
+
def exec_seq(seq,blast_query)
|
322
312
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
|
323
313
|
|
324
314
|
find_polyT(seq)
|
@@ -13,14 +13,8 @@ class PluginIgnoreRepeated < Plugin
|
|
13
13
|
SIZE_SEARCH_IN_IGNORE=15
|
14
14
|
|
15
15
|
#Begins the plugin1's execution to warn that there are repeated sequences, and disables all but one"
|
16
|
-
def
|
17
|
-
seqs.each do |s|
|
18
|
-
exec_seq(s)
|
19
|
-
end
|
20
|
-
end
|
16
|
+
def exec_seq(seq,blast_query)
|
21
17
|
|
22
|
-
|
23
|
-
def exec_seq(seq)
|
24
18
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
|
25
19
|
|
26
20
|
fasta_input=@params.get_param('truncated_input_file')
|
@@ -140,15 +140,8 @@ class PluginIndeterminations < Plugin
|
|
140
140
|
end
|
141
141
|
|
142
142
|
|
143
|
-
|
144
|
-
def execute(seqs)
|
145
|
-
seqs.each do |s|
|
146
|
-
exec_seq(s)
|
147
|
-
end
|
148
|
-
end
|
149
|
-
|
143
|
+
def exec_seq(seq,blast_query)
|
150
144
|
|
151
|
-
def exec_seq(seq)
|
152
145
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
|
153
146
|
|
154
147
|
actions=[]
|
@@ -11,16 +11,8 @@ class PluginKey < Plugin
|
|
11
11
|
|
12
12
|
|
13
13
|
#Begins the pluginKey's execution to warn where is a key in the sequence "seq"
|
14
|
-
|
15
|
-
@group_by_key=(@params.get_param('use_independent_folder_for_each_key')=='true')
|
16
|
-
|
17
|
-
seqs.each do |s|
|
18
|
-
exec_seq(s)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
14
|
+
def exec_seq(seq,blast_query)
|
22
15
|
|
23
|
-
def exec_seq(seq)
|
24
16
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
|
25
17
|
# blast_table_results.inspect
|
26
18
|
|
@@ -70,15 +70,6 @@ class PluginLinker < Plugin
|
|
70
70
|
return res
|
71
71
|
end
|
72
72
|
|
73
|
-
# Begins the plugin1's execution to warn that there is linker into the sequence
|
74
|
-
def execute(seqs)
|
75
|
-
blasts= do_blasts(seqs)
|
76
|
-
|
77
|
-
seqs.each_with_index do |s,i|
|
78
|
-
exec_seq(s,blasts.querys[i])
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
73
|
def do_blasts(seqs)
|
83
74
|
# find MIDS with less results than max_target_seqs value
|
84
75
|
blast = BatchBlast.new("-db #{@params.get_param('linkers_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
|
@@ -12,19 +12,10 @@ require "plugin"
|
|
12
12
|
MIN_DUST_SIZE = 30
|
13
13
|
|
14
14
|
class PluginLowComplexity < Plugin
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
def execute(seqs)
|
19
|
-
dust= do_dust(seqs)
|
20
|
-
|
21
|
-
seqs.each_with_index do |s,i|
|
22
|
-
exec_seq(s,dust[i])
|
23
|
-
end
|
24
|
-
end
|
25
15
|
|
26
|
-
|
27
|
-
|
16
|
+
# do the dust masker instead of blast
|
17
|
+
def do_blasts(seqs)
|
18
|
+
|
28
19
|
dust_masker=DustMasker.new()
|
29
20
|
|
30
21
|
fastas=[]
|
@@ -44,7 +35,9 @@ class PluginLowComplexity < Plugin
|
|
44
35
|
end
|
45
36
|
|
46
37
|
|
47
|
-
def exec_seq(seq,
|
38
|
+
def exec_seq(seq,blast_query)
|
39
|
+
dust_query=blast_query
|
40
|
+
|
48
41
|
if dust_query.query_id != seq.seq_name
|
49
42
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
50
43
|
end
|
@@ -65,14 +58,6 @@ class PluginLowComplexity < Plugin
|
|
65
58
|
if (dust_size)>=MIN_DUST_SIZE
|
66
59
|
|
67
60
|
# check if low complexity is inside a lowqual region
|
68
|
-
# in_low_qual=false
|
69
|
-
# low_quals.each do |lq|
|
70
|
-
# if lq.contains_action?(start,stop,0)
|
71
|
-
# in_low_qual=true
|
72
|
-
# break
|
73
|
-
# end
|
74
|
-
# end
|
75
|
-
|
76
61
|
if !seq.range_inside_action_type?(start,stop,ActionLowQuality)
|
77
62
|
|
78
63
|
total_dust+=dust_size
|
@@ -13,15 +13,9 @@ class PluginLowHighSize < Plugin
|
|
13
13
|
|
14
14
|
|
15
15
|
# Begins the plugin_low_high_size's execution with the sequence "seq"
|
16
|
-
|
17
|
-
def
|
18
|
-
|
19
|
-
exec_seq(s)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
def exec_seq(seq)
|
16
|
+
|
17
|
+
def exec_seq(seq,blast_query)
|
18
|
+
|
25
19
|
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
|
26
20
|
|
27
21
|
min_size = @params.get_param('min_sequence_size_raw').to_i #min_size is: mean - 2dev
|
@@ -46,10 +40,6 @@ class PluginLowHighSize < Plugin
|
|
46
40
|
|
47
41
|
|
48
42
|
end
|
49
|
-
|
50
|
-
######################################################################
|
51
|
-
#---------------------------------------------------------------------
|
52
|
-
|
53
43
|
|
54
44
|
|
55
45
|
######################################################################
|