RubyGems - seqtrimnext - Versions diffs - 2.0.51 → 2.0.52 - Mend

seqtrimnext 2.0.51 → 2.0.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

data/History.txt +7 -0
data/Manifest.txt +3 -3
data/README.rdoc +18 -3
data/Rakefile +2 -1
data/bin/parse_params.rb +5 -1
data/bin/seqtrimnext +53 -21
data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
data/lib/seqtrimnext/classes/params.rb +109 -123
data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
data/lib/seqtrimnext/classes/sequence.rb +2 -2
data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
data/lib/seqtrimnext/plugins/plugin.rb +42 -12
data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
data/lib/seqtrimnext/templates/amplicons.txt +1 -8
data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
data/lib/seqtrimnext/templates/only_quality.txt +24 -0
data/lib/seqtrimnext/templates/sanger.txt +25 -0
data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
data/lib/seqtrimnext.rb +1 -1
metadata +20 -7
data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245

data/History.txt CHANGED Viewed

@@ -1,3 +1,10 @@
+=== 2.0.52 2012-06-26
+* Added new plugin for user contaminants.
+* Sequences contaminated with user contaminants are stored in separate files.
+* Processing of both illumina fastq paired-end files in the same execution.
+* Template reorganization.
 === 2.0.51 2012-06-20
 Added cont_viruses database

data/Manifest.txt CHANGED Viewed

@@ -24,7 +24,7 @@ History.txt
 lib/seqtrimnext/actions/action_ab_adapter.rb
 lib/seqtrimnext/actions/action_ab_far_adapter.rb
 lib/seqtrimnext/actions/action_ab_left_adapter.rb
-lib/seqtrimnext/actions/action_classify.rb
+lib/seqtrimnext/actions/action_user_contaminant.rb
 lib/seqtrimnext/actions/action_empty_insert.rb
 lib/seqtrimnext/actions/action_ignore_repeated.rb
 lib/seqtrimnext/actions/action_indetermination.rb
@@ -75,7 +75,6 @@ lib/seqtrimnext/classes/sequence_with_action.rb
 lib/seqtrimnext/plugins/plugin.rb
 lib/seqtrimnext/plugins/plugin_ab_adapters.rb
 lib/seqtrimnext/plugins/plugin_adapters.rb
-lib/seqtrimnext/plugins/plugin_adapters_old.rb
 lib/seqtrimnext/plugins/plugin_amplicons.rb
 lib/seqtrimnext/plugins/plugin_contaminants.rb
 lib/seqtrimnext/plugins/plugin_user_contaminants.rb
@@ -89,10 +88,11 @@ lib/seqtrimnext/plugins/plugin_low_complexity.rb
 lib/seqtrimnext/plugins/plugin_low_high_size.rb
 lib/seqtrimnext/plugins/plugin_low_quality.rb
 lib/seqtrimnext/plugins/plugin_mids.rb
-lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb
 lib/seqtrimnext/plugins/plugin_short_insert.rb
 lib/seqtrimnext/plugins/plugin_vectors.rb
 lib/seqtrimnext/templates/amplicons.txt
+lib/seqtrimnext/templates/sanger.txt
+lib/seqtrimnext/templates/only_quality.txt
 lib/seqtrimnext/templates/genomics_454.txt
 lib/seqtrimnext/templates/genomics_454_with_paired.txt
 lib/seqtrimnext/templates/genomics_short_reads.txt

data/README.rdoc CHANGED Viewed

@@ -48,7 +48,7 @@ To install core databases (it should be done at installation time):
   $> seqtrimnext -i core
-Databases will be installed nearby SeqtrimNEXT by default, but you can override this location by setting the environment variable +BASTDB+. Eg.:
+Databases will be installed nearby SeqtrimNEXT by default, but you can override this location by setting the environment variable +BLASTDB+. Eg.:
 If you with your database installed at /var:
@@ -56,6 +56,10 @@ If you with your database installed at /var:
 Be sure that this environment variable is always loaded before SeqtrimNEXT execution (Eg.: add it to /etc/profile.local).
+There are aditional databases. To list them:
+  $> seqtrimnext -i LIST
 To perform an analisys using a predefined template with a FASTQ file format using 4 cpus:
   $> seqtrimnext -t genomics_454.txt -Q input_file_in_FASTQ -w 4
@@ -64,6 +68,13 @@ To perform an analisys using a predefined template with a FASTQ file format:
   $> seqtrimnext -t genomics_454.txt -f input_file_in_FASTA -q input_file_in_QUAL
+To clean illumina fastq files, with paired-ends and qualities encoded in illumina 1.5 format, using 4 cpus and disabling verbose output:
+  $> seqtrimnext -t genomics_short_reads.txt -F illumina15 -Q p1.fastq,p2.fastq -w 4 -K
+To clean illumina fastq files, with paired-ends and qualities encoded in standard phred format, using 4 cpus and disabling verbose output:
+  $> seqtrimnext -t genomics_short_reads.txt  -Q p1.fastq,p2.fastq -w 4 -K
 To get additional help and list available templates and databases:
@@ -186,13 +197,17 @@ SeqtrimNEXT needs some core databases to work. To install them:
   seqtrimnext -i core
-You can change default database location by setting the environment variable +BASTDB+. Refer to SYNOPSIS for an example.
+You can change default database location by setting the environment variable +BLASTDB+. Refer to SYNOPSIS for an example.
+There are aditional databases that can be listed with:
+  seqtrimnext -i LIST
 === Database modifications
 Included databases will be usefull for a lot of people, but if you prefer, you can modify them, or add more elements to be search against your sequences.
-You only need to drop new fasta files to each respective directory:
+You only need to drop new fasta files to each respective directory, or even create new directories with new fasta files inside. Each directory with fasta files will be used as a database:
 DB/vectors to add more vectors
 DB/contaminants to add more contaminants

data/Rakefile CHANGED Viewed

@@ -16,7 +16,7 @@ $hoe = Hoe.spec 'seqtrimnext' do
   self.rubyforge_name       = self.name # TODO this is default value
   # self.extra_deps         = ['narray','gnuplot','term-ansicolor','xml-simple','scbi_blast','scbi_drb','scbi_fasta','scbi_fastq','scbi_plot','scbi_math']
-  self.extra_deps         = []
+  self.extra_deps = []
   self.extra_deps << ['narray','>=0']
   self.extra_deps << ['gnuplot','>=0']
   self.extra_deps << ['term-ansicolor','>=1.0.5']
@@ -27,6 +27,7 @@ $hoe = Hoe.spec 'seqtrimnext' do
   self.extra_deps << ['scbi_fastq','>=0.0.16']
   self.extra_deps << ['scbi_plot','>=0.0.6']
   self.extra_deps << ['scbi_math','>=0.0.1']
+  self.extra_deps << ['scbi_headers','>=0.0.2']
 end

data/bin/parse_params.rb CHANGED Viewed

@@ -26,6 +26,7 @@ params={}
 params['vector_db_field']='vectors_db'
 params['primers_db_field']='primers_db'
 params['contaminants_db_field']='contaminants_db'
+params['user_contaminants_db_field']='user_contaminants_db'
 params['species_field']='genus'
 params['min_insert_size_field']='min_insert_size_trimmed'
 params['min_paired_insert_size_field']='min_insert_size_paired'
@@ -53,6 +54,7 @@ end
 sq_params=File.open(params_file,'r')
 data=get_json_data(input_file)
 # puts data.keys
 # puts data['vector_db_field']
@@ -69,10 +71,12 @@ data=get_json_data(input_file)
 sq_params=File.open(params_file,'a+')
+sq_params.puts ""
 data.each do |k,v|
   sq_name=params[k]
-  # puts k,sq_name
+  # puts k,sq_name
   if sq_name && v && !v.empty?
     sq_params.puts "#{sq_name}=#{v}"

data/bin/seqtrimnext CHANGED Viewed

@@ -1,4 +1,6 @@
 #!/usr/bin/env ruby
+# encoding: utf-8
 #     SeqTrimNext: Next generation sequencing preprocessor
 #     Copyright (C) <2011>
 #     Authors: Almudena Bocinos Rioboo, Diego Dario Guerrero Fernandez,
@@ -57,9 +59,35 @@
 # $: << File.expand_path(ROOT_PATH)
 $: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
-$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
+# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
 require 'seqtrimnext'
+require 'scbi_headers'
+def put_header
+  header = ScbiHeader.new('SeqTrimNEXT',Seqtrimnext::SEQTRIM_VERSION)
+  header.description="SeqtrimNEXT is a customizable and distributed pre-processing software for NGS (Next Generation Sequencing) biological data. It makes use of scbi_mapreduce gem to be able to run in parallel and distributed environments. It is specially suited for Roche 454 (normal and paired-end) & Ilumina datasets, although it could be easyly adapted to any other situation."
+  header.copyright='2011'
+  header.authors<< "Darío Guerrero"
+  header.authors<< "Almudena Bocinos"
+  header.authors<< "Rocío Bautista"
+  header.authors<< "Noé Fernández"
+  header.authors<< "Juan Falgueras"
+  header.authors<< "M. Gonzalo Claros"
+  # header.articles<< "Article one: with one description line"
+  # header.articles<< "Article two: with one description line"
+  # To output the header
+  puts header
+end
+put_header
 ############ PATHS #######################
 $SEQTRIM_PATH = ROOT_PATH
@@ -192,7 +220,7 @@ optparse = OptionParser.new do |opts|
       end
     end
   end
@@ -211,12 +239,12 @@ optparse = OptionParser.new do |opts|
   opts.on( '-C', '--use_checkpoint', 'Restore at checkpoint if scbi_mapreduce_checkpoint file is available' ) do
     options[:use_checkpoint] = true
   end
   # options[:skip_initial_stats] = false
   # opts.on( '-k', '--skip_initial_stats', 'Skip initial stats' ) do
   #   options[:skip_initial_stats] = true
   # end
   options[:install_db] = nil
   opts.on( '-i', '--install_databases TYPE', 'Install base databases and reformat them if necessary') do |db_type|
@@ -229,10 +257,12 @@ optparse = OptionParser.new do |opts|
   end
   options[:fastq] = nil
-  opts.on( '-Q', '--fastq FILE', 'Fastq input file. Use - for <STDIN>' ) do |file|
+  opts.on( '-Q', '--fastq FILE1,FILE2',Array, 'Fastq input file. Use - for <STDIN>' ) do |file|
     options[:fastq] = file
+    puts "FILES:",file,file.class
   end
   options[:format] = nil
   opts.on( '-F', '--fastq_quality_format FORMAT', 'Fastq input quality format use sanger or illumina18 for phred+33 based scores. Use illumina15 for phred+64 based scores (default is sanger) file. Use - for <STDIN>' ) do |value|
     options[:format] = value
@@ -241,7 +271,7 @@ optparse = OptionParser.new do |opts|
       exit
     end
   end
   options[:fasta] = nil
   opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
@@ -255,7 +285,7 @@ optparse = OptionParser.new do |opts|
   options[:list_db] = nil
   options[:list_db_name] = 'ALL'
   opts.on( '-L', '--list_db [DB_NAME]', 'List entries IDs in DB_NAME. Use "-L all" to view all available databases' ) do |value|
     options[:list_db] = true
     options[:list_db_name] = value if value
@@ -281,12 +311,12 @@ optparse = OptionParser.new do |opts|
   opts.on( '-j', '--json', 'Save results in json file' ) do
     options[:json] = true
   end
   options[:skip_output] = false
   opts.on( '-K', '--no-verbose', 'Change to no verbose mode. Every sequence will not be written to output log' ) do
     options[:skip_output] = true
   end
   options[:skip_report] = false
   opts.on( '-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).' ) do
     options[:skip_report] = true
@@ -335,7 +365,7 @@ $LOG.info("Using options: "+ options.to_json)
 if options[:install_db] then
   #install databases
   InstallDatabase.new(options[:install_db],$DB_PATH)
   # reformat databases
   MakeBlastDb.new($DB_PATH)
   exit
@@ -376,13 +406,17 @@ end
 $LOG.info "Using init file: #{$SEQTRIMNEXT_INIT}"
 $LOG.info "Using params file: #{options[:template]}"
-# fastq file
-if (!options[:fastq].nil? && options[:fastq]!='-' && !File.exists?(options[:fastq]))
-  $LOG.error "Input file: #{options[:fasta]} doesn't exists"
-  exit
-end
+# check file existence
+if options[:fastq]
+  options[:fastq].each do |fastq_file|
+    # fastq file
+    if (!fastq_file.nil? && fastq_file!='-' && !File.exists?(File.expand_path(fastq_file)))
+      $LOG.error "Input file: #{fastq_file} doesn't exists"
+      exit
+    end
+  end
+end
 # fasta file
 if (!options[:fasta].nil? && !File.exists?(options[:fasta]))
@@ -398,8 +432,6 @@ end
 s = Seqtrim.new(options)
 #generate report
 if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
@@ -408,10 +440,10 @@ if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
   `#{cmd}`
 else
   skip_text='.'
   if options[:skip_report]
     skip_text=' and remove the -R option from the command line.'
   end
   $LOG.info "If you want a detailed report in PDF format, be sure you have installed the optional seqtrimnext_report gem (gem install seqtrimnext_report)#{skip_text}"
 end

data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} RENAMED Viewed

@@ -7,10 +7,10 @@ require "seqtrim_action"
 # Inherit: Plugin
 ########################################################
-class ActionClassify < SeqtrimAction
+class ActionUserContaminant < SeqtrimAction
    def initialize(start_pos,end_pos)
-     super(start_pos,end_pos)
+     super(start_pos,end_pos)
      @cut =false
    end

data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb CHANGED Viewed

@@ -13,7 +13,7 @@ STATS_PATH=File.join(OUTPUT_PATH,'stats.json')
 class SeqtrimWorkManager < ScbiMapreduce::WorkManager
-  def self.init_work_manager(sequence_reader, params, chunk_size = 100, use_json=false, skip_output=false)
+  def self.init_work_manager(sequence_readers, params, chunk_size = 100, use_json=false, skip_output=false)
     @@full_stats={}
     @@params= params
     @@exit = false
@@ -22,7 +22,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
     @@ongoing_stats[:sequence_count] = 0
     @@ongoing_stats[:smallest_sequence_size] = 900000000000000
     @@ongoing_stats[:biggest_sequence_size] = 0
     @@skip_output=skip_output
     @@chunk_size = chunk_size
@@ -36,17 +36,20 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
     end
     #open input file
-    @@fqr=sequence_reader
+    @@sequence_readers=sequence_readers
     # @@use_qual = @@fqr.with_qual?
     # @@use_json = use_json
-    @@params.set_param('use_qual',@@fqr.with_qual?)
+    @@params.set_param('use_qual',@@sequence_readers.first.with_qual?)
     @@params.set_param('use_json',use_json)
+    @@params.set_param('tuple_size',@@sequence_readers.count)
     @@use_json=use_json
-    @@fqr.rewind
+    @@sequence_readers.each do |sequence_reader|
+      sequence_reader.rewind
+    end
     # open output files
@@ -77,6 +80,8 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
     @@low_sffinfo_files={}
+    @@tuple_id=0
   end
   def self.end_work_manager
@@ -94,13 +99,12 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
         f.puts JSON.pretty_generate(@@ongoing_stats)
       end
     end
     # load stats
     r=File.read(STATS_PATH)
     stats=JSON::parse(r)
     # make graphs
     gs=GraphStats.new(stats)
@@ -198,7 +202,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
     # puts "Loaded Stats"
     # puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
-    # TODO - remove sequences from rejected file that were added by cloned
+    # TODO - remove sequences from rejected file that were added by cloned
     super
     # return checkpoint
@@ -218,20 +222,16 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
     warn "Deprecated: trash_checkpointed_work was deprecated, it is automatic now"
   end
-  def next_work
-    if @@exit
-      return nil
-    end
+  def get_next_seq_from_file(file)
+    # find a valid and no repeated sequence in file
     begin
-      n,f,q,c = @@fqr.next_seq
+      n,f,q,c = file.next_seq
       if !n.nil? && @@params.repeated_seq?(n)
         @@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
         @@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
         get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
       end
@@ -240,17 +240,61 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
         @@ongoing_stats[:sequence_count] += 1
         @@ongoing_stats[:smallest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].min
         @@ongoing_stats[:biggest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].max
         @@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
       end
     end while (!n.nil? && @@params.repeated_seq?(n))
-    if !n.nil?
-      return SequenceWithAction.new(n,f.upcase,q,c)
-    else
+    return n,f,q,c
+  end
+  def next_work
+    if @@exit
       return nil
     end
+    tuple=[]
+    order_in_tuple=0
+    @@tuple_id += 1
+    tuple_size=@@sequence_readers.count
+    @@sequence_readers.each do |sequence_reader|
+      n,f,q,c = get_next_seq_from_file(sequence_reader)
+      if !n.nil?
+        seq=SequenceWithAction.new(n,f.upcase,q,c)
+        seq.tuple_id=@@tuple_id
+        seq.order_in_tuple=order_in_tuple
+        seq.tuple_size=tuple_size
+        tuple << seq
+        order_in_tuple+=1
+      end
+    end
+    if tuple_size>1
+      # check duplicated names
+      names = tuple.map{|s| s.seq_name}
+      if names.uniq.count!=tuple_size
+        # puts "NAMES EQUAL IN TUPLE"
+        tuple.each_with_index do |seq,i|
+          # puts seq.class # seq_name
+          seq.seq_name = "#{seq.seq_name}/#{i+1}"
+        end
+      end
+    end
+    # tuple is complete
+    if tuple.count==tuple_size
+      return tuple
+    else
+      return nil
+    end
   end