seqtrimnext 2.0.51 → 2.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
| @@ -12,245 +12,94 @@ require "plugin" | |
| 12 12 | 
             
            class PluginLowQuality < Plugin   
         | 
| 13 13 |  | 
| 14 14 |  | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 15 | 
            +
             | 
| 16 | 
            +
              def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                 rstart=nil
         | 
| 19 | 
            +
                 rend=nil
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                 i=from_pos
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                 good_q=0
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                 # skip good values
         | 
| 26 | 
            +
                 while (i< quals.length) && (quals[i]>=min_value)
         | 
| 27 | 
            +
                   i +=1 
         | 
| 28 | 
            +
                 end 
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                 # now we have found a bad quality, or end of sequence
         | 
| 31 | 
            +
                 if i < quals.length
         | 
| 32 | 
            +
                   rstart=i
         | 
| 33 | 
            +
                   len=0
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    # puts "   - [#{rstart},#{len}]"
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                   # keep growing the low-quality region until more than max_good_quals consecutive good-quality bases are found
         | 
| 38 | 
            +
                   begin
         | 
| 39 | 
            +
                     q=quals[i]
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                     if q<min_value
         | 
| 42 | 
            +
                       len += 1
         | 
| 43 | 
            +
                       # puts "BAD #{q}<#{min_value}"
         | 
| 44 | 
            +
                       len += good_q
         | 
| 45 | 
            +
                       good_q=0
         | 
| 46 | 
            +
                     else
         | 
| 47 | 
            +
                       good_q+=1
         | 
| 48 | 
            +
                     end
         | 
| 49 | 
            +
                      # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"     
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                     i+=1
         | 
| 52 | 
            +
                   end while (i < quals.length) && (good_q <= max_good_quals)
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                   rend = rstart + len -1
         | 
| 55 | 
            +
                   # puts "#{q} - q[#{rstart},#{rend}], #{good_q}"     
         | 
| 56 | 
            +
                 end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                 return [rstart,rend]
         | 
| 59 | 
            +
              end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
              # A region is valid if it starts at position 0, ends at the last position (quals.length-1), or is big enough (at least min_region_size bases)
         | 
| 62 | 
            +
              def valid_low_qual_region?(quals,rstart,rend,min_region_size)
         | 
| 63 | 
            +
                # puts [rstart,rend,0,quals.length,(rend-rstart+1)].join(';')
         | 
| 64 | 
            +
                # res =((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
         | 
| 65 | 
            +
                # if res
         | 
| 66 | 
            +
                #    puts "VALID"
         | 
| 67 | 
            +
                # end
         | 
| 68 | 
            +
                return ((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size))
         | 
| 69 | 
            +
              end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
             | 
| 72 | 
            +
              def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2)
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                # the initial region is the whole array
         | 
| 75 | 
            +
                left=0
         | 
| 76 | 
            +
                right=quals.length-1
         | 
| 77 | 
            +
                # puts quals.map{|e| ("%2d" % e.to_s)}.join(' ')
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                # puts "[#{left},#{right}]"
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                i = 0
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                from_pos=0
         | 
| 84 | 
            +
                regions =[]
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                # get all new regions
         | 
| 87 | 
            +
                begin
         | 
| 88 | 
            +
                  rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals)
         | 
| 89 | 
            +
                  if !rstart.nil?
         | 
| 90 | 
            +
                    from_pos= rend+1
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                    if valid_low_qual_region?(quals,rstart,rend,min_region_size)
         | 
| 93 | 
            +
                      regions << [rstart,rend]
         | 
| 25 94 | 
             
                    end
         | 
| 26 | 
            -
                    # puts " contenido de sum" + sum.join.to_s  + " i index_window_end  window #{i} #{index_window_end} #{@window}" 
         | 
| 27 | 
            -
                  
         | 
| 28 | 
            -
                    i=ini
         | 
| 29 | 
            -
                    while (i<ini+@window)
         | 
| 30 | 
            -
                  
         | 
| 31 | 
            -
                      sum[ini] += qual[i] 
         | 
| 32 | 
            -
                      i+=1
         | 
| 33 | 
            -
                    end                                           
         | 
| 34 | 
            -
                  
         | 
| 35 | 
            -
                  
         | 
| 36 | 
            -
                    i=ini+1 
         | 
| 37 | 
            -
                  
         | 
| 38 | 
            -
                    while (i<=index_window_end)            
         | 
| 39 | 
            -
                  
         | 
| 40 | 
            -
                      sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1]
         | 
| 41 | 
            -
                      i+=1
         | 
| 42 | 
            -
                  
         | 
| 43 | 
            -
                    end   
         | 
| 44 | 
            -
                  
         | 
| 45 | 
            -
                    # puts '2____' + sum.join(',') + 'pos sum' + ini.to_s    
         | 
| 46 | 
            -
                  
         | 
| 47 | 
            -
                    return sum 
         | 
| 48 | 
            -
                  
         | 
| 49 | 
            -
                  end   
         | 
| 50 | 
            -
                  
         | 
| 51 | 
            -
                  def find_bounds_high_quality(sum,ini,index_window_end) 
         | 
| 52 | 
            -
                  
         | 
| 53 | 
            -
                    new_start = -1
         | 
| 54 | 
            -
                    new_end = -1
         | 
| 55 | 
            -
                    
         | 
| 56 | 
            -
                  # puts " ini #{ini} iwe #{index_window_end}"
         | 
| 57 | 
            -
                  # puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} " 
         | 
| 58 | 
            -
                   if (ini>index_window_end) 
         | 
| 59 | 
            -
                       temp_start= ini
         | 
| 60 | 
            -
                       # new_start, new_end = temp_start, index_window_end 
         | 
| 61 | 
            -
                       new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana
         | 
| 62 | 
            -
                         # new_start, new_end = index_window_end, index_window_end 
         | 
| 63 | 
            -
                   end   
         | 
| 64 | 
            -
                  # puts " temp_start #{temp_start}" if (ini>index_window_end)
         | 
| 65 | 
            -
                  temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1    
         | 
| 66 | 
            -
                  
         | 
| 67 | 
            -
                    i=ini+1
         | 
| 68 | 
            -
                    while (i<=index_window_end)
         | 
| 69 | 
            -
                      if (sum[i]>=@cut_off)  
         | 
| 70 | 
            -
                        if (temp_start<0)
         | 
| 71 | 
            -
                           temp_start=i  #just in! 
         | 
| 72 | 
            -
                           # puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}"   
         | 
| 73 | 
            -
                        end
         | 
| 74 | 
            -
                  
         | 
| 75 | 
            -
                      else 
         | 
| 76 | 
            -
                          # puts "sum #{sum[i]} < cut off "
         | 
| 77 | 
            -
                          if(temp_start>=0)              #just out!   
         | 
| 78 | 
            -
                            # puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}"
         | 
| 79 | 
            -
                            if (((i-1-temp_start)>=(new_end-new_start)))   
         | 
| 80 | 
            -
                              new_start,new_end=temp_start,i-1 
         | 
| 81 | 
            -
                              # puts "just out ---- new start,new_end = #{temp_start}, #{i-1}  index_window_end = #{index_window_end}"   
         | 
| 82 | 
            -
                            end
         | 
| 83 | 
            -
                            temp_start= -1 
         | 
| 84 | 
            -
                          end
         | 
| 85 | 
            -
                      end
         | 
| 86 | 
            -
                      i+=1  
         | 
| 87 | 
            -
                  
         | 
| 88 | 
            -
                  
         | 
| 89 | 
            -
                    end 
         | 
| 90 | 
            -
                    # puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"  
         | 
| 91 | 
            -
                  
         | 
| 92 | 
            -
                    if (temp_start != -1)   # finished while ok           
         | 
| 93 | 
            -
                      # puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}"
         | 
| 94 | 
            -
                        if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence
         | 
| 95 | 
            -
                            new_start, new_end = temp_start, index_window_end     #-1
         | 
| 96 | 
            -
                        end
         | 
| 97 | 
            -
                    end  
         | 
| 98 | 
            -
                  
         | 
| 99 | 
            -
                    # puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}"   
         | 
| 100 | 
            -
                    
         | 
| 101 | 
            -
                    # puts  " newstart  #{new_start} newend #{new_end}" 
         | 
| 102 | 
            -
                   
         | 
| 103 | 
            -
                    return new_start,new_end 
         | 
| 104 | 
            -
                   
         | 
| 105 | 
            -
                  
         | 
| 106 | 
            -
                  end  
         | 
| 107 | 
            -
                  
         | 
| 108 | 
            -
                  def cut_fine_bounds_short(qual,new_start,new_end)
         | 
| 109 | 
            -
                  
         | 
| 110 | 
            -
                      i=0                    
         | 
| 111 | 
            -
                      # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
         | 
| 112 | 
            -
                      while (i<@window)
         | 
| 113 | 
            -
                        if (qual[new_start+i]>=@low)
         | 
| 114 | 
            -
                          break
         | 
| 115 | 
            -
                        end    
         | 
| 116 | 
            -
                        i+=1
         | 
| 117 | 
            -
                      end  
         | 
| 118 | 
            -
                      new_start +=i 
         | 
| 119 | 
            -
                      # puts "#{new_start} ***********"
         | 
| 120 | 
            -
                  
         | 
| 121 | 
            -
                      i=@window -1
         | 
| 122 | 
            -
                      while (i>=0)  
         | 
| 123 | 
            -
                        if (qual[new_end+i]>=@low)    
         | 
| 124 | 
            -
                          break            
         | 
| 125 | 
            -
                        end
         | 
| 126 | 
            -
                        i-=1            
         | 
| 127 | 
            -
                      end     
         | 
| 128 | 
            -
                      new_end += i
         | 
| 129 | 
            -
                      # puts "6a new_start #{new_start} new-end #{new_end}"     
         | 
| 130 | 
            -
                      
         | 
| 131 | 
            -
                       # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short"    
         | 
| 132 | 
            -
                      return new_start, new_end  
         | 
| 133 | 
            -
                  
         | 
| 134 | 
            -
                  end  
         | 
| 135 | 
            -
                  
         | 
| 136 | 
            -
                  
         | 
| 137 | 
            -
                  # cuts fine the high quality bounds
         | 
| 138 | 
            -
                  def cut_fine_bounds(qual,new_start,new_end)   
         | 
| 139 | 
            -
                    # puts "  ççççççççççççççç #{new_start+@window} >= #{new_end} " 
         | 
| 140 | 
            -
                    # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
         | 
| 141 | 
            -
                    # cut it fine
         | 
| 142 | 
            -
                  
         | 
| 143 | 
            -
                     one_ok = 0         
         | 
| 144 | 
            -
                  
         | 
| 145 | 
            -
                      i=@window-1
         | 
| 146 | 
            -
                      # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}"
         | 
| 147 | 
            -
                      while (i>=0) 
         | 
| 148 | 
            -
                          if (qual[new_start+i] < @low) 
         | 
| 149 | 
            -
                              break if one_ok
         | 
| 150 | 
            -
                          else 
         | 
| 151 | 
            -
                              one_ok = 1
         | 
| 152 | 
            -
                          end    
         | 
| 153 | 
            -
                          i-=1
         | 
| 154 | 
            -
                      end
         | 
| 155 | 
            -
                      new_start += i+1
         | 
| 156 | 
            -
                      oneOk = 0  
         | 
| 157 | 
            -
                      i=0
         | 
| 158 | 
            -
                      while (i<@window) 
         | 
| 159 | 
            -
                          if (qual[new_end+i] < @low) 
         | 
| 160 | 
            -
                              break if oneOk
         | 
| 161 | 
            -
                          else 
         | 
| 162 | 
            -
                              oneOk = 1
         | 
| 163 | 
            -
                          end  
         | 
| 164 | 
            -
                          i+=1
         | 
| 165 | 
            -
                      end
         | 
| 166 | 
            -
                      new_end += i-1 
         | 
| 167 | 
            -
                      # puts "6b  new_start #{new_start} new-end #{new_end}"  
         | 
| 168 | 
            -
                  
         | 
| 169 | 
            -
                    # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2"
         | 
| 170 | 
            -
                    return new_start, new_end
         | 
| 171 | 
            -
                  
         | 
| 172 | 
            -
                  end
         | 
| 173 | 
            -
                  
         | 
| 174 | 
            -
                  def find_high_quality(qual,ini=0)  
         | 
| 175 | 
            -
                  
         | 
| 176 | 
            -
                    # puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',')  + 'size' + qual.size.to_s
         | 
| 177 | 
            -
                    
         | 
| 178 | 
            -
                    update=false
         | 
| 179 | 
            -
                    # if @window>qual.length-ini     #search in the last window although has a low size
         | 
| 180 | 
            -
                    #     @window=qual.length-ini   
         | 
| 181 | 
            -
                    #      # puts ' UPDATE WINDOW  Y CUT OFF ' + @window.to_s
         | 
| 182 | 
            -
                    #      @cut_off=@window*@low   
         | 
| 183 | 
            -
                    #      update=true
         | 
| 184 | 
            -
                    #   end          
         | 
| 185 | 
            -
                               
         | 
| 186 | 
            -
                    if (ini==0 or update)
         | 
| 187 | 
            -
                      #index_window_start = ini
         | 
| 188 | 
            -
                      @index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1;
         | 
| 189 | 
            -
                      #TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada 
         | 
| 190 | 
            -
                  
         | 
| 191 | 
            -
                  
         | 
| 192 | 
            -
                      @sum = create_sum_window(qual,ini,@index_window_end) 
         | 
| 193 | 
            -
                      # puts "SUMA #{@sum.join(' ')}"   
         | 
| 194 | 
            -
                    end              
         | 
| 195 | 
            -
                          
         | 
| 196 | 
            -
                    new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end) 
         | 
| 197 | 
            -
                    # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1"
         | 
| 198 | 
            -
                  
         | 
| 199 | 
            -
                    if (new_start>=0)
         | 
| 200 | 
            -
                      if (new_start+@window >= new_end)
         | 
| 201 | 
            -
                         # puts "cfs"     
         | 
| 202 | 
            -
                        new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end)
         | 
| 203 | 
            -
                        # puts "cfs"
         | 
| 204 | 
            -
                  
         | 
| 205 | 
            -
                      else  
         | 
| 206 | 
            -
                        # puts "cf"
         | 
| 207 | 
            -
                        new_start, new_end = cut_fine_bounds(qual,new_start,new_end) 
         | 
| 208 | 
            -
                        # puts "cf"
         | 
| 209 | 
            -
                      end 
         | 
| 210 | 
            -
                    end 
         | 
| 211 | 
            -
                    
         | 
| 212 | 
            -
                     # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2" 
         | 
| 213 | 
            -
                  
         | 
| 214 | 
            -
                    return new_start,new_end #+1
         | 
| 215 | 
            -
                  
         | 
| 216 | 
            -
                  
         | 
| 217 95 | 
             
                  end
         | 
| 96 | 
            +
                end while !rstart.nil?
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                return regions  
         | 
| 99 | 
            +
             | 
| 100 | 
            +
              end
         | 
| 218 101 |  | 
| 219 | 
            -
             | 
| 220 | 
            -
                  def add_action_before_high_qual(p_begin,p_end,actions,seq,start)
         | 
| 221 | 
            -
                  
         | 
| 222 | 
            -
                    action_size = p_begin-1
         | 
| 223 | 
            -
                    if action_size>=(@window/2)  
         | 
| 224 | 
            -
                  
         | 
| 225 | 
            -
                  
         | 
| 226 | 
            -
                      # puts "action_SIZE1 #{action_size} > #{@window/2}"
         | 
| 227 | 
            -
                  
         | 
| 228 | 
            -
                      if ( (p_begin>0) && (action_size>0) )  #if there is action before the high qual part 
         | 
| 229 | 
            -
                        # it's created an action before of the high quality part
         | 
| 230 | 
            -
                        a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
         | 
| 231 | 
            -
                        # puts " new low qual start: #{start}  = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}"
         | 
| 232 | 
            -
                        actions.push a   
         | 
| 233 | 
            -
                      end 
         | 
| 234 | 
            -
                    end             
         | 
| 235 | 
            -
                  end  
         | 
| 236 | 
            -
                  
         | 
| 237 | 
            -
                  def add_action_after_high_qual(p_begin,p_end,actions,seq)
         | 
| 238 | 
            -
                  
         | 
| 239 | 
            -
                    action_size = seq.insert_end-p_end
         | 
| 240 | 
            -
                    if action_size>=(@window/2)
         | 
| 241 | 
            -
                  
         | 
| 242 | 
            -
                  
         | 
| 243 | 
            -
                       # puts "action_SIZE2 #{action_size} > #{@window/2}"
         | 
| 244 | 
            -
                  
         | 
| 245 | 
            -
                       if ((p_end<seq.seq_fasta.size-1) && (action_size>0) )  #if there is action before the high qual part 
         | 
| 246 | 
            -
                         # it's created an action before of the high quality part
         | 
| 247 | 
            -
                         a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
         | 
| 248 | 
            -
                  
         | 
| 249 | 
            -
                         actions.push a   
         | 
| 250 | 
            -
                       end 
         | 
| 251 | 
            -
                     end 
         | 
| 252 | 
            -
                  end
         | 
| 253 | 
            -
                   
         | 
| 102 | 
            +
             | 
| 254 103 |  | 
| 255 104 |  | 
| 256 105 |  | 
| @@ -266,100 +115,42 @@ class PluginLowQuality < Plugin | |
| 266 115 | 
             
              # Finally, mark with an action the part after the High Quality Subsequence as a low quality part
         | 
| 267 116 | 
             
              #----------------------------------------------------------------- 
         | 
| 268 117 |  | 
| 269 | 
            -
             | 
| 270 | 
            -
               seqs.each do |s|
         | 
| 271 | 
            -
                   exec_seq(s)
         | 
| 272 | 
            -
                end
         | 
| 273 | 
            -
              end
         | 
| 274 | 
            -
             | 
| 275 | 
            -
             | 
| 276 | 
            -
              def exec_seq(seq)
         | 
| 118 | 
            +
              def exec_seq(seq,blast_query)
         | 
| 277 119 |  | 
| 278 120 | 
             
                 if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? ) 
         | 
| 279 | 
            -
                   $LOG. | 
| 121 | 
            +
                   $LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s     
         | 
| 280 122 | 
             
                 elsif (seq.seq_qual.size>0)
         | 
| 281 | 
            -
                   $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"    
         | 
| 282 | 
            -
             | 
| 283 | 
            -
                   @low=@params.get_param('min_quality').to_i
         | 
| 284 | 
            -
             | 
| 285 | 
            -
                   if @params.get_param('window_width').to_i>seq.seq_fasta.length   
         | 
| 286 | 
            -
                     @window=seq.seq_fasta.length   
         | 
| 287 | 
            -
                   
         | 
| 288 | 
            -
                   else 
         | 
| 289 | 
            -
                     @window=@params.get_param('window_width').to_i  
         | 
| 290 | 
            -
                   
         | 
| 291 | 
            -
                   end 
         | 
| 292 | 
            -
                   @cut_off=@window*@low   
         | 
| 293 | 
            -
                                                       
         | 
| 294 | 
            -
                   type='ActionLowQuality' 
         | 
| 295 | 
            -
                   low_qual=0
         | 
| 296 | 
            -
                   actions=[] 
         | 
| 297 | 
            -
                   
         | 
| 298 | 
            -
                   p_begin,p_end =0,-1 # positions from high quality bounds    
         | 
| 299 | 
            -
                                            
         | 
| 300 | 
            -
                   # @stats[:low_qual]={}    
         | 
| 301 | 
            -
                   # @stats['low_qual']={} 
         | 
| 302 | 
            -
                   
         | 
| 303 | 
            -
                   
         | 
| 304 | 
            -
                   while ((p_begin>=0)  && (p_end + 1 < seq.seq_qual.size) ) 
         | 
| 305 | 
            -
                     
         | 
| 306 | 
            -
                     
         | 
| 307 | 
            -
                     p_begin_old,p_end_old= p_begin, p_end
         | 
| 308 | 
            -
                     p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1)  
         | 
| 309 | 
            -
                     # entra=(p_begin>0) or (p_end_old<0)
         | 
| 310 | 
            -
                     #         
         | 
| 311 | 
            -
                     # puts "high ini fin #{p_begin} #{p_end} ini-old fin-old  #{p_begin_old} #{p_end_old} __ ___ ___ ___1"     
         | 
| 312 | 
            -
                     
         | 
| 313 | 
            -
                     if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and  the low quality part has enough size 
         | 
| 314 | 
            -
                        # it's created an action before of the high quality part 
         | 
| 315 | 
            -
                        add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1) 
         | 
| 316 | 
            -
                        # puts "low1 ini fin  #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}"     
         | 
| 317 | 
            -
                        low_qual = p_begin-1-p_end_old-1 + 1 
         | 
| 318 | 
            -
                        
         | 
| 319 | 
            -
                        add_stats('low_qual',low_qual)
         | 
| 320 | 
            -
                        # @stats[:low_qual]={low_qual => 1} 
         | 
| 321 | 
            -
                         
         | 
| 322 | 
            -
                     end
         | 
| 323 | 
            -
                     
         | 
| 324 | 
            -
                     # puts "-----ññññ----- high quality  #{p_begin}   #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}"   
         | 
| 325 | 
            -
             | 
| 326 | 
            -
                   end
         | 
| 327 | 
            -
                    
         | 
| 328 | 
            -
                   # puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}"   
         | 
| 329 | 
            -
                   if ((p_begin>=0) && (p_end+1<seq.seq_qual.size))  #if we have found the high quality part 
         | 
| 330 123 |  | 
| 331 | 
            -
                      #  | 
| 332 | 
            -
                       | 
| 333 | 
            -
                       | 
| 334 | 
            -
                       | 
| 335 | 
            -
                       | 
| 336 | 
            -
                       | 
| 337 | 
            -
                       | 
| 338 | 
            -
                       | 
| 339 | 
            -
                       | 
| 340 | 
            -
                       | 
| 341 | 
            -
                   end                                     
         | 
| 342 | 
            -
             | 
| 343 | 
            -
                   # puts "-----ññññ----- high quality  #{p_begin}   #{p_end}"  
         | 
| 344 | 
            -
               
         | 
| 345 | 
            -
                   
         | 
| 346 | 
            -
                   if p_end<0 and p_end_old #add action low qual to all the part      
         | 
| 347 | 
            -
                     a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid
         | 
| 348 | 
            -
                     # puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}" 
         | 
| 349 | 
            -
                     low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1 
         | 
| 350 | 
            -
                     
         | 
| 351 | 
            -
                      # if @stats[:low_qual][low_qual].nil?
         | 
| 352 | 
            -
                      #             @stats[:low_qual][low_qual] = 0
         | 
| 353 | 
            -
                      #          end
         | 
| 354 | 
            -
                      #          @stats[:low_qual][low_qual] += 1   
         | 
| 355 | 
            -
                     add_stats('low_qual',low_qual) 
         | 
| 356 | 
            -
                     # @stats[:low_qual]={'low_qual' => 1} 
         | 
| 124 | 
            +
                      $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
         | 
| 125 | 
            +
                      
         | 
| 126 | 
            +
                      min_quality=@params.get_param('min_quality').to_i
         | 
| 127 | 
            +
                      min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i
         | 
| 128 | 
            +
                      max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i
         | 
| 129 | 
            +
                      
         | 
| 130 | 
            +
                      type='ActionLowQuality'
         | 
| 131 | 
            +
                      actions=[]
         | 
| 132 | 
            +
                      
         | 
| 133 | 
            +
                      regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases)
         | 
| 357 134 |  | 
| 358 | 
            -
             | 
| 359 | 
            -
             | 
| 360 | 
            -
             | 
| 361 | 
            -
             | 
| 362 | 
            -
             | 
| 135 | 
            +
                      regions.each do |r|
         | 
| 136 | 
            +
                        low_qual_size=r.last-r.first+1
         | 
| 137 | 
            +
                        
         | 
| 138 | 
            +
                        # puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}"
         | 
| 139 | 
            +
                       
         | 
| 140 | 
            +
                       
         | 
| 141 | 
            +
                       add_stats('low_qual',low_qual_size)
         | 
| 142 | 
            +
                       
         | 
| 143 | 
            +
                       
         | 
| 144 | 
            +
                       # create action
         | 
| 145 | 
            +
                       a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence
         | 
| 146 | 
            +
                       actions.push a
         | 
| 147 | 
            +
                       
         | 
| 148 | 
            +
                       
         | 
| 149 | 
            +
                       
         | 
| 150 | 
            +
                      end
         | 
| 151 | 
            +
             | 
| 152 | 
            +
                      # add quals
         | 
| 153 | 
            +
                      seq.add_actions(actions)
         | 
| 363 154 | 
             
                 end       
         | 
| 364 155 |  | 
| 365 156 | 
             
               end 
         | 
| @@ -379,16 +170,21 @@ class PluginLowQuality < Plugin | |
| 379 170 | 
             
            		default_value = 20
         | 
| 380 171 | 
             
            		params.check_param(errors,'min_quality','Integer',default_value,comment)
         | 
| 381 172 |  | 
| 382 | 
            -
             | 
| 173 | 
            +
            	  comment='Quality window for scanning low quality segments'
         | 
| 383 174 | 
             
            		default_value = 15
         | 
| 384 175 | 
             
            		params.check_param(errors,'window_width','Integer',default_value,comment)
         | 
| 385 176 |  | 
| 386 | 
            -
             | 
| 177 | 
            +
                
         | 
| 178 | 
            +
            	  comment='Minimum length of a bad quality segment inside the sequence'
         | 
| 179 | 
            +
            		default_value = 8
         | 
| 180 | 
            +
            		params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment)
         | 
| 181 | 
            +
               
         | 
| 182 | 
            +
                
         | 
| 183 | 
            +
            	  comment='Maximum consecutive good-quality bases between two bad quality regions'
         | 
| 184 | 
            +
            		default_value = 2
         | 
| 185 | 
            +
            		params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment)
         | 
| 387 186 |  | 
| 388 187 | 
             
                return errors
         | 
| 389 188 | 
             
              end
         | 
| 390 189 |  | 
| 391 | 
            -
              
         | 
| 392 | 
            -
              private :find_high_quality
         | 
| 393 | 
            -
              
         | 
| 394 190 | 
             
            end
         | 
| @@ -14,17 +14,6 @@ class PluginMids < Plugin | |
| 14 14 | 
             
              MAX_MID_ERRORS = 2
         | 
| 15 15 | 
             
              #MIN_MID_SIZE = 7  # very important, don't touch 
         | 
| 16 16 | 
             
              # DB_MID_SIZE = 10  # DONE read formatted db and save the mid sizes    
         | 
| 17 | 
            -
                        
         | 
| 18 | 
            -
              
         | 
| 19 | 
            -
              
         | 
| 20 | 
            -
              #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"    
         | 
| 21 | 
            -
              def execute(seqs)
         | 
| 22 | 
            -
                blasts= do_blasts(seqs)
         | 
| 23 | 
            -
                
         | 
| 24 | 
            -
                seqs.each_with_index do |s,i|
         | 
| 25 | 
            -
                  exec_seq(s,blasts.querys[i])
         | 
| 26 | 
            -
                end
         | 
| 27 | 
            -
              end
         | 
| 28 17 |  | 
| 29 18 | 
             
              def do_blasts(seqs)
         | 
| 30 19 | 
             
                 # find MIDS  with less results than max_target_seqs value 
         | 
| @@ -86,16 +86,7 @@ class PluginShortInsert < Plugin | |
| 86 86 | 
             
                return sub_inserts
         | 
| 87 87 | 
             
              end
         | 
| 88 88 |  | 
| 89 | 
            -
              | 
| 90 | 
            -
             def execute(seqs)
         | 
| 91 | 
            -
               seqs.each do |s|
         | 
| 92 | 
            -
                   exec_seq(s)
         | 
| 93 | 
            -
                end
         | 
| 94 | 
            -
              end
         | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
              def exec_seq(seq)
         | 
| 98 | 
            -
                
         | 
| 89 | 
            +
              def exec_seq(seq,blast_query)
         | 
| 99 90 | 
             
                $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size" 
         | 
| 100 91 | 
             
                # puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}" 
         | 
| 101 92 |  | 
| @@ -33,21 +33,17 @@ class PluginUserContaminants < Plugin | |
| 33 33 | 
             
                return res
         | 
| 34 34 | 
             
              end
         | 
| 35 35 |  | 
| 36 | 
            -
               | 
| 37 | 
            -
             | 
| 38 | 
            -
                blasts= do_blasts(seqs)
         | 
| 39 | 
            -
             | 
| 40 | 
            -
                seqs.each_with_index do |s,i|
         | 
| 41 | 
            -
                  exec_seq(s,blasts.querys[i])
         | 
| 42 | 
            -
                end
         | 
| 36 | 
            +
              # Tells whether this plugin has anything to do: it only makes sense to
              # run when a user contaminant database has been configured.
              #
              # Reads the 'user_contaminant_db' parameter from @params. A missing
              # (nil) value is treated the same as an empty path instead of raising
              # NoMethodError.
              #
              # Returns true when the parameter is present and not empty, false otherwise.
              def can_execute?
                db_path = @params.get_param('user_contaminant_db')
                !(db_path.nil? || db_path.empty?)
              end
         | 
| 44 39 |  | 
| 40 | 
            +
             | 
| 45 41 | 
             
              def do_blasts(seqs)
         | 
| 46 42 |  | 
| 47 43 | 
             
                # TODO - Culling limit = 2 because blast fails with this command when it is given cl=1 and dust=no
         | 
| 48 44 | 
             
                # and a low-complexity sequence as input
         | 
| 49 45 |  | 
| 50 | 
            -
                blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param(' | 
| 46 | 
            +
                blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_user_contaminant')} -perc_identity #{@params.get_param('blast_percent_user_contaminant')} -culling_limit 1")  #get classify -max_target_seqs #{MAX_TARGETS_SEQS}
         | 
| 51 47 |  | 
| 52 48 | 
             
                $LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
         | 
| 53 49 |  | 
| @@ -72,42 +68,55 @@ class PluginUserContaminants < Plugin | |
| 72 68 |  | 
| 73 69 | 
             
                $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence"
         | 
| 74 70 |  | 
| 75 | 
            -
                type = " | 
| 71 | 
            +
                type = "ActionUserContaminant"
         | 
| 76 72 |  | 
| 77 73 | 
             
                classify={}
         | 
| 74 | 
            +
                contaminants=[]
         | 
| 78 75 |  | 
| 79 | 
            -
                 | 
| 76 | 
            +
                
         | 
| 77 | 
            +
                merge_hits(blast_query.hits,contaminants,nil,false)
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                begin
         | 
| 80 | 
            +
                  contaminants2=contaminants
         | 
| 81 | 
            +
                  contaminants = []                            #second round to save contaminants without overlap
         | 
| 82 | 
            +
                  merge_hits(contaminants2,contaminants,nil,false)
         | 
| 83 | 
            +
                end until (contaminants2.count == contaminants.count)
         | 
| 84 | 
            +
                
         | 
| 85 | 
            +
                # Order the merged hits by the length of their query span, shortest
                # first, so that `contaminants.last` is the largest hit. The sort has
                # to be in place: plain `sort` returns a new array and would leave
                # `contaminants` in its original order.
                contaminants.sort! {|c1,c2| (c1.q_end - c1.q_beg + 1)<=>(c2.q_end - c2.q_beg + 1)}
         | 
| 80 86 |  | 
| 81 | 
            -
                classify=sum_hits_by_id( | 
| 87 | 
            +
                # classify=sum_hits_by_id(contaminants.hits)
         | 
| 82 88 |  | 
| 83 89 | 
             
                actions=[]
         | 
| 84 | 
            -
                classify_size=0
         | 
| 90 | 
            +
                # classify_size=0
         | 
| 85 91 |  | 
| 86 | 
            -
                min_cont_size=@params.get_param(' | 
| 92 | 
            +
                min_cont_size=@params.get_param('min_user_contaminant_size').to_i
         | 
| 87 93 |  | 
| 88 | 
            -
                biggest_classify =  | 
| 94 | 
            +
                # biggest_classify = contaminants.sort {|c1,c2| c1[1]<=>c2[1]}
         | 
| 89 95 |  | 
| 90 | 
            -
                if ! | 
| 96 | 
            +
                if !contaminants.empty?
         | 
| 91 97 |  | 
| 92 | 
            -
                  definition,classify_size = biggest_classify.last
         | 
| 98 | 
            +
                  # definition,classify_size = biggest_classify.last
         | 
| 93 99 |  | 
| 100 | 
            +
                  biggest_contaminant=contaminants.last
         | 
| 101 | 
            +
                  hit_size=(biggest_contaminant.q_end - biggest_contaminant.q_beg + 1)
         | 
| 94 102 |  | 
| 95 | 
            -
                  a = seq.new_action( | 
| 103 | 
            +
                  a = seq.new_action(biggest_contaminant.q_beg,biggest_contaminant.q_end,type) # adds the correspondent action to the sequence
         | 
| 96 104 |  | 
| 97 | 
            -
                  a.message = definition
         | 
| 105 | 
            +
                  a.message = biggest_contaminant.definition
         | 
| 98 106 |  | 
| 99 | 
            -
                   | 
| 107 | 
            +
                  seq.add_comment("Contaminated: #{biggest_contaminant.definition}")
         | 
| 108 | 
            +
                  
         | 
| 109 | 
            +
                  a.tag_id = biggest_contaminant.definition.gsub(' ','_')
         | 
| 100 110 |  | 
| 101 111 | 
             
                  # a.found_definition = c.definition    # save the classify definitions, each separately
         | 
| 102 112 |  | 
| 103 113 | 
             
                  #save to this file
         | 
| 104 | 
            -
                  seq.add_file_tag( | 
| 105 | 
            -
                  
         | 
| 114 | 
            +
                  seq.add_file_tag(0, 'with_user_contaminant', :both, 10)
         | 
| 106 115 |  | 
| 107 116 | 
             
                  actions.push a
         | 
| 108 | 
            -
             | 
| 109 | 
            -
                  add_stats(' | 
| 110 | 
            -
                  add_stats(' | 
| 117 | 
            +
                  
         | 
| 118 | 
            +
                  add_stats('user_contaminant_size',hit_size)
         | 
| 119 | 
            +
                  add_stats('user_contaminant_ids',biggest_contaminant.definition)
         | 
| 111 120 |  | 
| 112 121 | 
             
                  seq.add_actions(actions)
         | 
| 113 122 | 
             
                end
         | 
| @@ -121,21 +130,20 @@ class PluginUserContaminants < Plugin | |
| 121 130 |  | 
| 122 131 | 
             
                comment='Blast E-value used as cut-off when searching for contaminations'
         | 
| 123 132 | 
             
                default_value = 1e-10
         | 
| 124 | 
            -
                params.check_param(errors,' | 
| 133 | 
            +
                params.check_param(errors,'blast_evalue_user_contaminant','Float',default_value,comment)
         | 
| 125 134 |  | 
| 126 | 
            -
                comment='Minimum required identity (%) for a reliable  | 
| 135 | 
            +
                comment='Minimum required identity (%) for a reliable user contaminant match'
         | 
| 127 136 | 
             
                default_value = 85
         | 
| 128 | 
            -
                params.check_param(errors,' | 
| 137 | 
            +
                params.check_param(errors,'blast_percent_user_contaminant','Integer',default_value,comment)
         | 
| 129 138 |  | 
| 130 | 
            -
                comment='Minimum hit size (nt) for considering  | 
| 139 | 
            +
                comment='Minimum hit size (nt) for considering for user contaminant'
         | 
| 131 140 | 
             
                default_value = 30 # era 40
         | 
| 132 | 
            -
                params.check_param(errors,' | 
| 141 | 
            +
                params.check_param(errors,'min_user_contaminant_size','Integer',default_value,comment)
         | 
| 133 142 |  | 
| 134 | 
            -
                comment='Path for  | 
| 135 | 
            -
                default_value = File.join($FORMATTED_DB_PATH,' | 
| 143 | 
            +
                comment='Path for user contaminant database'
         | 
| 144 | 
            +
                default_value = "" #File.join($FORMATTED_DB_PATH,'user_contaminant.fasta')
         | 
| 136 145 | 
             
                params.check_param(errors,'user_contaminant_db','DB',default_value,comment)
         | 
| 137 146 |  | 
| 138 | 
            -
             | 
| 139 147 | 
             
                return errors
         | 
| 140 148 | 
             
              end
         | 
| 141 149 |  | 
| @@ -25,15 +25,6 @@ class PluginVectors < Plugin | |
| 25 25 | 
             
                return  ((linkers.count>=1) && (vector_beg+seq.insert_start>=linkers[0].start_pos) && (vector_end+seq.insert_start<=linkers[0].end_pos))
         | 
| 26 26 | 
             
              end
         | 
| 27 27 |  | 
| 28 | 
            -
             #Begins the plugin1's execution to warn that there are vectors in the sequence "seq"
         | 
| 29 | 
            -
             def execute(seqs)
         | 
| 30 | 
            -
               blasts= do_blasts(seqs)
         | 
| 31 | 
            -
             | 
| 32 | 
            -
               seqs.each_with_index do |s,i|
         | 
| 33 | 
            -
                 exec_seq(s,blasts.querys[i])
         | 
| 34 | 
            -
               end
         | 
| 35 | 
            -
             end
         | 
| 36 | 
            -
             | 
| 37 28 | 
             
             def do_blasts(seqs)
         | 
| 38 29 | 
             
                # find MIDS  with less results than max_target_seqs value 
         | 
| 39 30 | 
             
                blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1")  #get vectors
         |