seqtrimnext 2.0.51 → 2.0.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. data/History.txt +7 -0
  2. data/Manifest.txt +3 -3
  3. data/README.rdoc +18 -3
  4. data/Rakefile +2 -1
  5. data/bin/parse_params.rb +5 -1
  6. data/bin/seqtrimnext +53 -21
  7. data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
  8. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
  9. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
  10. data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
  11. data/lib/seqtrimnext/classes/params.rb +109 -123
  12. data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
  13. data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
  14. data/lib/seqtrimnext/classes/sequence.rb +2 -2
  15. data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
  16. data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
  17. data/lib/seqtrimnext/plugins/plugin.rb +42 -12
  18. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
  19. data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
  20. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
  21. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
  22. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
  23. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
  24. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
  25. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
  26. data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
  27. data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
  28. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
  29. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
  30. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
  31. data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
  32. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
  33. data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
  34. data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
  35. data/lib/seqtrimnext/templates/amplicons.txt +1 -8
  36. data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
  37. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
  38. data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
  39. data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
  40. data/lib/seqtrimnext/templates/only_quality.txt +24 -0
  41. data/lib/seqtrimnext/templates/sanger.txt +25 -0
  42. data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
  43. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
  44. data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
  45. data/lib/seqtrimnext.rb +1 -1
  46. metadata +20 -7
  47. data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
  48. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -15,23 +15,48 @@ class Plugin
15
15
  attr_accessor :stats
16
16
 
17
17
  #Loads the plugin's execution whit the sequence "seq"
18
- def initialize(seq,params)
19
- # $LOG.debug self.class.to_s + " processing sequence: " + seq.seq_name
20
- # if (!(self.class.to_s=='PluginLowQuality') )
18
+ def initialize(seq, params)
21
19
  @params = params
22
20
  @stats ={}
23
- execute(seq)
24
- # puts self.class.to_s + ' PPPPPPPPPP'
25
- # else
26
- # $LOG.error " Quality File haven't been provided. It's impossible to execute " +self.class.to_s + seq.seq_qual.nil?.to_s
27
- # end
21
+
22
+ if can_execute?
23
+ execute(seq)
24
+ end
25
+
26
+ end
27
+
28
+ def can_execute?
29
+ return true
28
30
  end
29
31
 
30
32
  #Begins the plugin's execution whit the sequence "seq"
31
33
  def execute(seqs)
34
+ blasts=do_blasts(seqs)
35
+
36
+ if !blasts.empty?
37
+
38
+ if blasts.is_a?(Array)
39
+ queries=blasts
40
+ else
41
+ queries = blasts.querys
42
+ end
43
+
44
+ seqs.each_with_index do |s,i|
45
+ exec_seq(s,queries[i])
46
+ end
47
+
48
+ else # there is no blast
32
49
 
50
+ seqs.each do |s|
51
+ exec_seq(s,nil)
52
+ end
53
+
54
+ end
33
55
  end
34
56
 
57
+ def do_blasts(seqs)
58
+ return []
59
+ end
35
60
 
36
61
  #Initializes the structure stats to the given key and value , only when it is neccesary, and increases its counter
37
62
  def add_stats(key,value)
@@ -69,14 +94,19 @@ class Plugin
69
94
  return ((r1_start<=r2_end+1) and (r1_end>=r2_start-1) )
70
95
  end
71
96
 
72
- def merge_hits(hits,merged_hits,merged_ids=nil)
97
+ def merge_hits(hits,merged_hits,merged_ids=nil, merge_different_ids=true)
73
98
  # puts " merging ============"
74
99
  # hits.each do |hit|
75
100
  hits.sort{|h1,h2| (h1.q_end-h1.q_beg+1)<=>(h2.q_end-h2.q_beg+1)}.reverse_each do |hit|
76
101
 
77
102
  merged_ids.push hit.definition if !merged_ids.nil? && (! merged_ids.include?(hit.definition))
78
103
  # if new hit's position is already contained in hits, then ignore the new hit
79
- c=merged_hits.find{|c| overlapX?(hit.q_beg, hit.q_end,c.q_beg,c.q_end)}
104
+ if merge_different_ids
105
+ c=merged_hits.find{|c| overlapX?(hit.q_beg, hit.q_end,c.q_beg,c.q_end)}
106
+ else
107
+ # overlap with existent hit and same subject id
108
+ c=merged_hits.find{|c| (overlapX?(hit.q_beg, hit.q_end,c.q_beg,c.q_end) && (hit.subject_id==c.subject_id))}
109
+ end
80
110
  # puts " c #{c.inspect}"
81
111
 
82
112
  if (c.nil?)
@@ -89,10 +119,10 @@ class Plugin
89
119
 
90
120
  # one is inside each other, just ignore
91
121
  if ((hit.q_beg>=c.q_beg && hit.q_end <=c.q_end) || (c.q_beg>=hit.q_beg && c.q_end <= hit.q_end))
92
- # puts "* #{hit.subject_id} inside #{c.subject_id}"
122
+ # puts "* #{hit.subject_id} inside #{c.subject_id}"
93
123
  else
94
124
  # merge with old contaminant
95
- # puts "#{hit.subject_id} NOT inside #{c.subject_id}"
125
+ # puts "#{hit.subject_id} NOT inside #{c.subject_id}"
96
126
  min=[c.q_beg,hit.q_beg].min
97
127
  max=[c.q_end,hit.q_end].max
98
128
 
@@ -13,16 +13,9 @@ class PluginAbAdapters < Plugin
13
13
  MIN_ADAPTER_SIZE = 5
14
14
  MIN_FAR_ADAPTER_SIZE = 13
15
15
  MIN_LEFT_ADAPTER_SIZE = 9
16
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
17
- def execute(seqs)
18
- blasts= do_blasts(seqs)
19
-
20
- seqs.each_with_index do |s,i|
21
- exec_seq(s,blasts.querys[i])
22
- end
23
- end
24
16
 
25
17
  def do_blasts(seqs)
18
+
26
19
  # find MIDS with less results than max_target_seqs value
27
20
  blast=BatchBlast.new("-db #{@params.get_param('adapters_ab_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
28
21
  $LOG.debug('BLAST:'+blast.get_blast_cmd)
@@ -12,15 +12,6 @@ class PluginAdapters < Plugin
12
12
  # adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
13
13
  MIN_ADAPTER_SIZE = 5
14
14
  MIN_FAR_ADAPTER_SIZE = 13
15
- # MIN_LEFT_ADAPTER_SIZE = 9
16
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
17
- def execute(seqs)
18
- blasts= do_blasts(seqs)
19
-
20
- seqs.each_with_index do |s,i|
21
- exec_seq(s,blasts.querys[i])
22
- end
23
- end
24
15
 
25
16
  def do_blasts(seqs)
26
17
  # find MIDS with less results than max_target_seqs value
@@ -9,18 +9,6 @@ require "plugin"
9
9
 
10
10
  class PluginAmplicons < Plugin
11
11
 
12
- # adapters found at end of sequence are even 2 nt wide, cut in 5 because of statistics
13
- # MIN_PRIMER_SIZE = 5
14
- # MIN_FAR_ADAPTER_SIZE = 13
15
- # MIN_LEFT_ADAPTER_SIZE = 9
16
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
17
- def execute(seqs)
18
- blasts= do_blasts(seqs)
19
-
20
- seqs.each_with_index do |s,i|
21
- exec_seq(s,blasts.querys[i])
22
- end
23
- end
24
12
 
25
13
  def do_blasts(seqs)
26
14
  # find MIDS with less results than max_target_seqs value
@@ -18,14 +18,6 @@ class PluginContaminants < Plugin
18
18
  max_to_extreme=(min_cont_size/2).to_i
19
19
  return ((c.q_beg-max_to_extreme<0) || (( c.q_end+max_to_extreme)>=seq.seq_fasta.size-1) ) #return if vector is very near to the extremes of insert)
20
20
  end
21
- #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
22
- def execute(seqs)
23
- blasts= do_blasts(seqs)
24
-
25
- seqs.each_with_index do |s,i|
26
- exec_seq(s,blasts.querys[i])
27
- end
28
- end
29
21
 
30
22
  def do_blasts(seqs)
31
23
  # find MIDS with less results than max_target_seqs value
@@ -63,6 +55,11 @@ class PluginContaminants < Plugin
63
55
 
64
56
 
65
57
  def exec_seq(seq,blast_query)
58
+
59
+ if seq.get_actions(ActionUserContaminant)
60
+ return
61
+ end
62
+
66
63
  if blast_query.query_id != seq.seq_name
67
64
  # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
68
65
  end
@@ -276,16 +276,7 @@ class PluginExtractInserts < Plugin
276
276
  return sub_inserts
277
277
  end
278
278
 
279
-
280
- # Begins the plugin1's execution to warn that there is linker into the sequence
281
- def execute(seqs)
282
- seqs.each do |s|
283
- exec_seq(s)
284
- end
285
- end
286
-
287
-
288
- def exec_seq(seq)
279
+ def exec_seq(seq,blast_query)
289
280
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
290
281
 
291
282
  # puts "INSERTO ANTES LINKER INSERT:"+seq.seq_fasta
@@ -232,9 +232,6 @@ class PluginFindPolyAt < Plugin
232
232
  end
233
233
 
234
234
 
235
-
236
-
237
-
238
235
  # if (poly['begin']<=MAX_POLY_T_FROM_LEFT*2)
239
236
  # seq.seq_rejected=true
240
237
  # seq.seq_rejected_by_message='polyT found'
@@ -311,14 +308,7 @@ class PluginFindPolyAt < Plugin
311
308
 
312
309
 
313
310
 
314
- def execute(seqs)
315
- seqs.each do |s|
316
- exec_seq(s)
317
- end
318
- end
319
-
320
-
321
- def exec_seq(seq)
311
+ def exec_seq(seq,blast_query)
322
312
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
323
313
 
324
314
  find_polyT(seq)
@@ -13,14 +13,8 @@ class PluginIgnoreRepeated < Plugin
13
13
  SIZE_SEARCH_IN_IGNORE=15
14
14
 
15
15
  #Begins the plugin1's execution to warn that there are repeated sequences, and disables all but one"
16
- def execute(seqs)
17
- seqs.each do |s|
18
- exec_seq(s)
19
- end
20
- end
16
+ def exec_seq(seq,blast_query)
21
17
 
22
-
23
- def exec_seq(seq)
24
18
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
25
19
 
26
20
  fasta_input=@params.get_param('truncated_input_file')
@@ -140,15 +140,8 @@ class PluginIndeterminations < Plugin
140
140
  end
141
141
 
142
142
 
143
- #Begins the pluginKey's execution to warn where is a key in the sequence "seq"
144
- def execute(seqs)
145
- seqs.each do |s|
146
- exec_seq(s)
147
- end
148
- end
149
-
143
+ def exec_seq(seq,blast_query)
150
144
 
151
- def exec_seq(seq)
152
145
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
153
146
 
154
147
  actions=[]
@@ -11,16 +11,8 @@ class PluginKey < Plugin
11
11
 
12
12
 
13
13
  #Begins the pluginKey's execution to warn where is a key in the sequence "seq"
14
- def execute(seqs)
15
- @group_by_key=(@params.get_param('use_independent_folder_for_each_key')=='true')
16
-
17
- seqs.each do |s|
18
- exec_seq(s)
19
- end
20
- end
21
-
14
+ def exec_seq(seq,blast_query)
22
15
 
23
- def exec_seq(seq)
24
16
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
25
17
  # blast_table_results.inspect
26
18
 
@@ -70,15 +70,6 @@ class PluginLinker < Plugin
70
70
  return res
71
71
  end
72
72
 
73
- # Begins the plugin1's execution to warn that there is linker into the sequence
74
- def execute(seqs)
75
- blasts= do_blasts(seqs)
76
-
77
- seqs.each_with_index do |s,i|
78
- exec_seq(s,blasts.querys[i])
79
- end
80
- end
81
-
82
73
  def do_blasts(seqs)
83
74
  # find MIDS with less results than max_target_seqs value
84
75
  blast = BatchBlast.new("-db #{@params.get_param('linkers_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
@@ -12,19 +12,10 @@ require "plugin"
12
12
  MIN_DUST_SIZE = 30
13
13
 
14
14
  class PluginLowComplexity < Plugin
15
-
16
-
17
-
18
- def execute(seqs)
19
- dust= do_dust(seqs)
20
-
21
- seqs.each_with_index do |s,i|
22
- exec_seq(s,dust[i])
23
- end
24
- end
25
15
 
26
- def do_dust(seqs)
27
- # find MIDS with less results than max_target_seqs value
16
+ # do the dust masker instead of blast
17
+ def do_blasts(seqs)
18
+
28
19
  dust_masker=DustMasker.new()
29
20
 
30
21
  fastas=[]
@@ -44,7 +35,9 @@ class PluginLowComplexity < Plugin
44
35
  end
45
36
 
46
37
 
47
- def exec_seq(seq,dust_query)
38
+ def exec_seq(seq,blast_query)
39
+ dust_query=blast_query
40
+
48
41
  if dust_query.query_id != seq.seq_name
49
42
  raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
50
43
  end
@@ -65,14 +58,6 @@ class PluginLowComplexity < Plugin
65
58
  if (dust_size)>=MIN_DUST_SIZE
66
59
 
67
60
  # check if low complexity is inside a lowqual region
68
- # in_low_qual=false
69
- # low_quals.each do |lq|
70
- # if lq.contains_action?(start,stop,0)
71
- # in_low_qual=true
72
- # break
73
- # end
74
- # end
75
-
76
61
  if !seq.range_inside_action_type?(start,stop,ActionLowQuality)
77
62
 
78
63
  total_dust+=dust_size
@@ -13,15 +13,9 @@ class PluginLowHighSize < Plugin
13
13
 
14
14
 
15
15
  # Begins the plugin_low_high_size's execution with the sequence "seq"
16
-
17
- def execute(seqs)
18
- seqs.each do |s|
19
- exec_seq(s)
20
- end
21
- end
22
-
23
-
24
- def exec_seq(seq)
16
+
17
+ def exec_seq(seq,blast_query)
18
+
25
19
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
26
20
 
27
21
  min_size = @params.get_param('min_sequence_size_raw').to_i #min_size is: mean - 2dev
@@ -46,10 +40,6 @@ class PluginLowHighSize < Plugin
46
40
 
47
41
 
48
42
  end
49
-
50
- ######################################################################
51
- #---------------------------------------------------------------------
52
-
53
43
 
54
44
 
55
45
  ######################################################################