RubyGems - npsearch - Versions diffs - 2.0.1 → 2.1.0 - Mend

npsearch 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/.gitignore +2 -1
data/README.md +3 -2
data/Rakefile +14 -5
data/bin/npsearch +45 -33
data/lib/npsearch/arg_validator.rb +70 -241
data/lib/npsearch/output.rb +6 -5
data/lib/npsearch/pool.rb +1 -1
data/lib/npsearch/scoresequence.rb +62 -60
data/lib/npsearch/sequence.rb +12 -9
data/lib/npsearch/signalp.rb +29 -10
data/lib/npsearch/version.rb +1 -1
data/lib/npsearch.rb +27 -52
data/npsearch.gemspec +2 -1
data/templates/contents.slim +3 -3
data/test/files/mixed_content.fa +167 -0
data/test/test_argument_validator.rb +50 -0
data/test/test_helper.rb +1 -0
data/test/test_sequence.rb +81 -0
data/test/test_sequence_scoring.rb +142 -0
metadata +27 -17
data/test/files/1_protein.fa +0 -204
data/test/files/2_orf.fa +0 -1330
data/test/files/3_signalp_out.txt +0 -667
data/test/files/4_secretome.fa +0 -6
data/test/files/5_output.fa +0 -6
data/test/files/5_output.html +0 -37
data/test/test_np_search.rb +0 -122

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7ede73d531ae96b5790db5dfe81704e44981470d
-  data.tar.gz: bbcbcff58473ba403044aded6e2882e150821a7b
+  metadata.gz: af22531e55865ab286dd6599917196765d72af12
+  data.tar.gz: 5a3bf459332ff8bc70c3c6e431cae9e09fe0494c
 SHA512:
-  metadata.gz: 9c2a40a4ee188e2a3c159e66c92e4949b41775684ae4a4d87943e562ca361b2619534cc69d4c1ac6d263467edbb4b08aba982d9d48026d48882720b8e30b9a12
-  data.tar.gz: 3d258345776914e041dbb5dfd4447b88bd7feb6176a654f147598b41cb4fa011c4051d8934a8fe9df4e7ab324b229cf252d9c6c8cacfc7c9da1a02e34cff8c1d
+  metadata.gz: 899fed317d7ceb7a62d52fb2b3e0e24e835f630c058a9dacf05e267019a900c5c2357588e9f1bdd674731eb951a44f16d5898747efb872e6ae1ceaf5efb8acf4
+  data.tar.gz: 0aab6be7e635dd63b2e8e2d4d5eda128f7f39b41977f5b08fb090408dd612ac98d41d3fe60a086ea5a16d2cff56bccbce79b8168c3f2af2e3fff222b1657b908

data/.gitignore CHANGED Viewed

@@ -15,4 +15,5 @@ spec/reports
 test/tmp
 test/version_tmp
 tmp
-coverage
+coverage
+.temp

data/README.md CHANGED Viewed

@@ -32,8 +32,9 @@ NpSearch orders the results based on the following characteristics:
 ### Installation Requirements
 * Ruby (>= 2.0.0)
 * SignalP 4.1 (Available from [here](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?signalp))
-* USearch (Available from [here]())
-* A web browser.
+* CD-HIT (Available from [here](http://weizhongli-lab.org/cd-hit/) - Suggested Installation via [Homebrew](http://brew.sh) or [Linuxbrew](http://linuxbrew.sh) - `brew install homebrew/science/cd-hit`)
+* EMBOSS (Available from [here](http://emboss.sourceforge.net) - Suggested Installation via [Homebrew](http://brew.sh) or [Linuxbrew](http://linuxbrew.sh) - `brew install homebrew/science/emboss`)
 ## Installation
 Simply run the following command in the terminal.

data/Rakefile CHANGED Viewed

@@ -1,14 +1,23 @@
-require 'bundler/gem_tasks'
 require 'rake/testtask'
 task default: [:build]
-desc 'Installs the ruby gem'
-task :build do
-  exec("gem build np_search.gemspec && gem install ./NpSearch-#{NpSearch::VERSION}.gem")
+desc 'Builds and installs'
+task install: [:build] do
+  require_relative 'lib/npsearch/version'
+  sh "gem install ./npsearch-#{NpSearch::VERSION}.gem"
+end
+desc 'Runs tests, generates documentation, builds gem (default)'
+task build: [:test] do
+  sh 'gem build npsearch.gemspec'
 end
+desc 'Runs tests'
 task :test do
   Rake::TestTask.new do |t|
-    t.pattern = 'test/test_np_search.rb'
+    t.libs.push 'lib'
+    t.test_files = FileList['test/test_*.rb']
+    t.verbose = true
   end
 end

data/bin/npsearch CHANGED Viewed

@@ -1,56 +1,53 @@
 #!/usr/bin/env ruby
 require 'optparse'
-require 'npsearch'
-require 'npsearch/arg_validator'
-require 'npsearch/version'
+require 'English'
+require 'tempfile'
 opt = {}
 optparse = OptionParser.new do |opts|
   opts.banner = <<Banner
+* Description: A tool to identify novel neuropeptides.
-* Usage: npsearch [Options] -i [Input File]
-* Mandatory Options:
+* Usage: npsearch [Options] [Input File]
+* Options
 Banner
-  opt[:input_file] = nil
-  opts.on('-i', '--input [file]',
-          'Path to the input fasta file') do |f|
-    opt[:input_file] = f
-  end
-  opts.separator ''
-  opts.separator '* Optional Options:'
-  opt[:signalp_path] = File.join(ENV['HOME'], 'signalp/signalp')
-  opts.on('-s', '--signalp_path', String,
+  opt[:signalp_path] = 'signalp'
+  opts.on('-s', '--signalp_path path_to_signalp',
           'The full path to the signalp script. This can be downloaded from',
           ' CBS. See https://www.github.com/wurmlab/NpSearch for more',
           ' information') do |p|
     opt[:signalp_path] = p
   end
-  opt[:usearch_path] = File.join(ENV['HOME'], 'bin/uclust')
-  opts.on('-u', '--usearch_path', String,
-          'The full path to the usearch binary. This script can be downloaded',
-          ' from .... See https://www.github.com/wurmlab/NpSearch for more',
-          ' information') do |p|
-    opt[:usearch_path] = p
+  opt[:temp_dir] = File.join(Dir.pwd, '.temp',
+                             Dir::Tmpname.make_tmpname('', nil))
+  opts.on('-d', '--temp_dir path_to_temp_dir',
+          'The full path to the temp dir. NpSearch will create the folder and',
+          ' then delete the folder once it has finished using them.',
+          ' Default: Hidden folder in the current working dirctory') do |p|
+    opt[:temp_dir] = p
   end
   opt[:num_threads] = 1
-  opts.on('-n', '--num_threads', Integer,
+  opts.on('-n', '--num_threads num_of_threads', Integer,
           'The number of threads to use when analysing the input file') do |n|
     opt[:num_threads] = n
   end
-  opt[:orf_min_length] = 10
-  opts.on('-m', '--orf_min_length N', Integer,
+  opt[:min_orf_length] = 30
+  opts.on('-m', '--min_orf_length N', Integer,
           'The minimum length of a potential neuropeptide precursor.',
           ' Default: 30') do |n|
-    opt[:orf_min_length] = n
+    opt[:min_orf_length] = n
+  end
+  opt[:max_seq_length] = 600
+  opts.on('-m', '--max_seq_length N', Integer,
+          'The maximum length of a potential neuropeptide precursor.',
+          ' Default: 600') do |n|
+    opt[:max_seq_length] = n
   end
   opts.on('-h', '--help', 'Display this screen') do
@@ -59,16 +56,31 @@ Banner
   end
   opts.on('-v', '--version', 'Shows version') do
+    require 'npsearch/version'
     puts NpSearch::VERSION
     exit
   end
 end
-optparse.parse!
+begin
+  optparse.parse!
+  if ARGV.length > 1
+    $stderr.puts "Error: It seems that you have #{ARGV.length} input fasta" \
+                 ' files. Please ensure that you have a single input fasta' \
+                 " file\n"
+    exit 1
+  elsif ARGV.empty?
+    $stderr.puts optparse
+    exit 1
+  end
+rescue OptionParser::ParseError
+  $stderr.print 'Error: ' + $ERROR_INFO.to_s + "\n"
+  exit 1
+end
-# Temporary hard coding my defaults...
-opt[:num_threads] = 8
-opt[:signalp_path] = '/Volumes/Data/data/programs/signalp-4.1/signalp'
-opt[:usearch_path] = '/Volumes/Data/data/programs/bin/usearch'
+opt[:input_file] = ARGV[0]
+require 'npsearch'
+require 'npsearch/arg_validator'
 NpSearch.init(opt)
 NpSearch.run

data/lib/npsearch/arg_validator.rb CHANGED Viewed

@@ -1,264 +1,93 @@
+require 'bio'
+# Top level module / namespace.
 module NpSearch
-  class ArgValidators
-    # Changes the logger level to output extra info when the verbose option is
-    #   true.
-    def initialize(verbose_opt)
-      LOG.level = Logger::INFO if verbose_opt == true
-    end
+  # A class that validates the command line opts
+  class ArgumentsValidators
+    class << self
+      def run(opt)
+        assert_file_present('input fasta file', opt[:input_file])
+        assert_input_file_not_empty(opt[:input_file])
+        assert_input_file_probably_fasta(opt[:input_file])
+        opt[:type] = assert_input_sequence(opt[:input_file])
+        opt[:num_threads] = check_num_threads(opt[:num_threads])
+        assert_binaries('SignalP 4.1 Script', opt[:signalp_path])
+        opt
+      end
-    # Runs all the arguments method...
-    def arg(motif, input, output_dir, orf_min_length, extract_orf,
-            signalp_file, help_banner)
-      comp_arg(input, motif, output_dir, extract_orf, help_banner)
-      input_type = guess_input_type(input)
-      extract_orf_conflict(input_type, extract_orf)
-      input_sp_file_conflict(input_type, signalp_file)
-      orf_min_length(orf_min_length)
-      input_type
-    end
+      private
-    # Ensures that the compulsory input arguments are supplied...
-    def comp_arg(input, motif, output_dir, extract_orf, help_banner)
-      comp_arg_error(motif, 'Query Motif ("-m" option)') if extract_orf == false
-      comp_arg_error(input, 'Input file ("-i option")')
-      comp_arg_error(output_dir, 'Output Folder ("-o" option)')
-      return unless input.nil? || (motif.nil? && extract_orf == false)
-      puts help_banner
-      exit
-    end
+      def assert_file_present(desc, file, exit_code = 1)
+        return if file && File.exist?(File.expand_path(file))
+        $stderr.puts "*** Error: Couldn't find the #{desc}: #{file}."
+        exit exit_code
+      end
-    # Ensures that a message is provided for all missing compulsory args.
-    #   Run from comp_arg method
-    def comp_arg_error(arg, message)
-      puts 'Usage Error: No ' + message + ' is supplied' if arg.nil?
-    end
+      def assert_input_file_not_empty(file)
+        return unless File.zero?(File.expand_path(file))
+        $stderr.puts "*** Error: The input_file (#{file})" \
+                     ' seems to be empty.'
+        exit 1
+      end
-    # Guesses the type of data within the input file on the first 100 lines of
-    #   the file (ignores all identifiers (lines that start with a '>').
-    #   It has a 80% threshold.
-    def guess_input_type(input_file)
-      input_file_format(input_file)
-      sequences = []
-      File.open(input_file, 'r') do |file_stream|
-        file_stream.readlines[0..100].each do |line|
-          sequences << line.to_s unless line.match(/^>/)
+      def assert_input_file_probably_fasta(file)
+        File.open(file, 'r') do |f|
+          fasta = (f.readline[0] == '>') ? true : false
+          return fasta if fasta
         end
+        $stderr.puts "*** Error: The input_file (#{file})" \
+                     ' does not seems to be a fasta file.'
+        exit 1
       end
-      type = Bio::Sequence.new(sequences).guess(0.8)
-      if type == Bio::Sequence::NA
-        input_type = 'genetic'
-      elsif type == Bio::Sequence::AA
-        input_type = 'protein'
-      end
-      input_type
-    end
-    # Ensures that the input file a) exists b) is not empty and c) is a fasta
-    #   file. Run from the guess_input_type method.
-    def input_file_format(input_file)
-      unless File.exist?(input_file)
-        fail ArgumentError("Critical Error: The input file '#{input_file}'" \
-                           ' does not exist.')
-      end
-      if File.zero?(input_file)
-        fail ArgumentError("Critical Error: The input file '#{input_file}'" \
-                            ' is empty.')
+      def assert_input_sequence(file)
+        type = type_of_sequences(file)
+        return type unless type.nil?
+        $stderr.puts '*** Error: The input files seems to contain a mixture of'
+        $stderr.puts '    both protein and nucleotide data.'
+        $stderr.puts '    Please correct this and try again.'
+        exit 1
       end
-      unless File.probably_fasta?(input_file)
-        fail ArgumentError("Critical Error: The input file '#{input_file}'" \
-                            ' does not seem to be in fasta format. Only' \
-                            ' input files in fasta format are supported.')
-      end
-    end
-    # Ensures that the extract_orf option is only used with genetic data.
-    def extract_orf_conflict(input_type, extract_orf)
-      return unless input_type == 'protein' && extract_orf == true
-      fail ArgumentError('Usage Error: Conflicting arguments detected:' \
-                          ' Protein data detected within the input file,' \
-                          ' when using the  Extract_ORF option (option' \
-                          ' "-e"). This option is only available when' \
-                          ' input file contains genetic data.')
-    end
-    # Ensures that the protein data (or open reading frames) are supplied as
-    #   the input file when the signal p output file is passed.
-    def input_sp_file_conflict(input_type, signalp_file)
-      return unless input_type == 'genetic' && !signalp_file.nil?
-      fail ArgumentError('Usage Error: Conflicting arguments detected' \
-                          ': Genetic data detected within the input file' \
-                          ' when using the Signal P Input Option (Option' \
-                          ' "-s"). The Signal P input Option requires the' \
-                          ' input of two files: the Signal P Script Result' \
-                          ' files (at the "-s" option) and the protein' \
-                          ' data file used to run the Signal P Script.')
-    end
-    # Ensures that the ORF minimum length is a number. Any digits after the
-    #   decimal place are ignored.
-    def orf_min_length(orf_min_length)
-      return unless orf_min_length.to_i < 1
-      fail ArgumentError('Usage Error: The Open Reading Frames minimum' \
-                          ' length can only be a full integer.')
-    end
-  end
-  class Validators
-    # Checks for the presence of the output directory; if not found, it asks
-    #   the user whether they want to create the output directory.
-    def output_dir(output_dir)
-      unless File.directory? output_dir # If output_dir doesn't exist
-        fail IOError, "\n\nThe output directory deoes not exist\n\n"
-      end
-    rescue IOError
-      puts # a blank line
-      puts 'The output directory does not exist.'
-      puts # a blank line
-      puts "The directory '#{output_dir}' will be created in this location."
-      puts 'Do you to continue? [y/n]'
-      print '> '
-      inp = $stdin.gets.chomp
-      until inp.downcase == 'n' || inp.downcase == 'y' || inp == ''
-        puts # a blank line
-        puts "The input: '#{inp}' is not recognised - 'y' or 'n' are the" \
-             ' only recognisable inputs.'
-        puts 'Please try again.'
-        puts "The directory '#{output_dir}' will be created in this" \
-             ' location.'
-        puts 'Do you to continue? [y/n]'
-        print '> '
-        inp = $stdin.gets.chomp
+      def type_of_sequences(file)
+        fasta_content = IO.binread(file)
+        # the first sequence does not need to have a fasta definition line
+        sequences = fasta_content.split(/^>.*$/).delete_if(&:empty?)
+        # get all sequence types
+        sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
+                                  .uniq.compact
+        return nil if sequence_types.empty?
+        sequence_types.first if sequence_types.length == 1
       end
-      if inp.downcase == 'y' || inp == ''
-        FileUtils.mkdir_p "#{output_dir}"
-        puts 'Created output directory...'
-      elsif inp.downcase == 'n'
-        raise ArgumentError('Critical Error: An output directory is' \
-                            ' required; please create an output directory' \
-                            ' and then try again.')
-      end
-    end
-    # Ensures that the Signal P Script is present. If not found in the home
-    #   directory, it asks the user for its location.
-    def signalp_dir
-      signalp_dir = "#{Dir.home}/SignalPeptide"
-      if File.exist? "#{signalp_dir}/signalp"
-        signalp_directory = signalp_dir
-      else
-        begin
-          fail IOError('The Signal P Script directory cannot be found at' \
-                        " the following location: '#{signalp_dir}/'.")
-        rescue IOError
-          puts # a blank line
-          puts 'Error: The Signal P Script directory cannot be found at the' \
-               " following location: '#{signalp_dir}/'."
-          puts # a blank line
-          puts 'Please enter the full path or a relative path to the Signal' \
-               ' P Script directory (i.e. to the folder containing the' \
-               ' Signal P script). Refer to the online tutorial for more help'
-          print '> '
-          inp = $stdin.gets.chomp
-          until (File.exist? "#{signalp_dir}/signalp") ||
-                (File.exist? "#{inp}/signalp")
-            puts # a blank line
-            puts 'The Signal P directory cannot be found at the following' \
-                 " location: '#{inp}'"
-            puts 'Please enter the full path or a relative path to the Signal' \
-                 ' Peptide directory again.'
-            print '> '
-            inp = $stdin.gets.chomp
-          end
-          signalp_directory = inp
-          puts # a blank line
-          puts "The Signal P directory has been found at '#{signalp_directory}'"
-          FileUtils.ln_s "#{signalp_directory}", "#{Dir.home}/SignalPeptide",
-                         force: true
-          puts # a blank line
-        end
+      def guess_sequence_type(seq)
+        # removing non-letter and ambiguous characters
+        cleaned_sequence = seq.gsub(/[^A-Z]|[NX]/i, '')
+        return nil if cleaned_sequence.length < 10 # conservative
+        type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
+        type == Bio::Sequence::NA ? :genetic : :protein
       end
-      signalp_directory
-    end
-    # Ensures that the supported version of the Signal P Script has been linked
-    #   to NpSearch. Run from the 'sp_results' method.
-    def sp_version(input_file)
-      File.open(input_file, 'r') do |file_stream|
-        first_line = file_stream.readline
-        if first_line.match(/# SignalP-4.1/)
-          return true
-        else
-          return false
+      def check_num_threads(num_threads)
+        num_threads = Integer(num_threads)
+        unless num_threads > 0
+          $stderr.puts 'Number of threads can not be lower than 0'
+          $stderr.puts 'Changing number of threads to 1'
+          num_threads = 1
         end
+        return num_threads unless num_threads > 256
+        $stderr.puts "Number of threads set at #{num_threads} is" \
+                     ' unusually high.'
       end
-    end
-    # Ensures that the critical columns in the tabular results produced by the
-    #   Signal P script are conserved. Run from the 'sp_results' method.
-    def sp_column(_input_file)
-      File.open('signalp_out.txt', 'r') do |file_stream|
-        secondline = file_stream.readlines[1]
-        row = secondline.gsub(/\s+/m, ' ').chomp.split(' ')
-        if row[1] != 'name' && row[4] != 'Ymax' && row[5] != 'pos' &&
-           row[9] != 'D'
-          return true
-        else
-          return false
-        end
+      def assert_binaries(desc, bin)
+        return if command?(bin.to_s)
+        $stderr.puts "NpSearch is unable to use the #{desc} at #{bin}"
       end
-    end
-    # Ensure that the right version of the Signal P script is used (via
-    #   'sp_version' Method). If the wrong signal p script has been linked to
-    #   NpSearch, check whether the critical columns in the tabular results
-    #   produced by the Signal P Script are conserved (via 'sp_column'
-    #   Method).
-    def sp_results(signalp_output_file)
-      return if sp_version(signalp_output_file)
-      # i.e. if Signal P is the wrong version
-      if sp_column(signalp_output_file) # If wrong version but correct columns
-        puts # a blank line
-        puts 'Warning: The wrong version of signalp has been linked.' \
-             ' However, the signal peptide output file still seems to' \
-             ' be in the right format.'
-      else
-        puts # a blank line
-        puts 'Warning: The wrong version of the signal p has been linked' \
-             ' and the signal peptide output is in an unrecognised format.'
-        puts 'Continuing may give you meaningless results.'
-      end
-      puts # a blank line
-      puts 'Do you still want to continue? [y/n]'
-      print '> '
-      inp = $stdin.gets.chomp
-      until inp.downcase == 'n' || inp.downcase == 'y'
-        puts # a blank line
-        puts "The input: '#{inp}' is not recognised - 'y' or 'n' are the" \
-             ' only recognisable inputs.'
-        puts 'Please try again.'
+      # Return `true` if the given command exists and is executable.
+      def command?(command)
+        system("which #{command} > /dev/null 2>&1")
       end
-      if inp.downcase == 'y'
-        puts 'Continuing.'
-      elsif inp.downcase == 'n'
-        fail IOError('Critical Error: NpSearch only supports SignalP 4.1' \
-                      ' (downloadable form CBS) Please ensure the version' \
-                      ' of the signal p script is downloaded.')
-      end
-    end
-    # Guesses the type of the data in the supplied motif. It ignores all
-    #   non-word characters (e.g. '|' that is used for regex). It has a 90%
-    #   threshold.
-    def motif_type(motif)
-      motif_seq = Bio::Sequence.new(motif.gsub(/\W/, ''))
-      type = motif_seq.guess(0.9)
-      return unless type.to_s != 'Bio::Sequence::AA'
-      fail IOError('Critical Error: There seems to be an error in' \
-                    ' processing the motif. Please ensure that the motif' \
-                    ' contains amino acid residues that you wish to search' \
-                    ' for.')
     end
   end
 end

data/lib/npsearch/output.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'slim'
+# Top level module / namespace.
 module NpSearch
   # Class that generates the output
   class Output
@@ -8,17 +9,17 @@ module NpSearch
         templates_path = File.expand_path(File.join(__FILE__, '../../../',
                                                     'templates/contents.slim'))
         contents_temp = File.read(templates_path)
-        html_content = Slim::Template.new { contents_temp }.render(NpSearch)
-        File.open("#{input_file}.out.html", 'w') { |f| f.puts html_content }
+        h_content = Slim::Template.new { contents_temp }.render(NpSearch)
+        File.open("#{input_file}.npsearch.html", 'w') { |f| f.puts h_content }
       end
       def to_fasta(input_file, sorted_sequences, input_type)
-        File.open("#{input_file}.out.fa", 'w') do |f|
+        File.open("#{input_file}.npsearch.fa", 'w') do |f|
           sorted_sequences.each do |s|
             if input_type == :protein
-              f.puts ">#{s.id}\n#{s.signalp}#{s.seq}"
+              f.puts ">#{s.defline}\n#{s.signalp}#{s.seq}"
             elsif input_type == :nucleotide
-              f.puts ">#{s.id}-(frame:#{s.translated_frame})"
+              f.puts ">#{s.defline}-(frame:#{s.translated_frame})"
               f.puts "#{s.signalp}#{s.seq}"
             end
           end

data/lib/npsearch/pool.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # coding: utf-8
 # From http://burgestrand.se/code/ruby-thread-pool/
 #
-# Copyright © 2012, Kim Burgestrand kim@burgestrand.se
+# Copyright 2012, Kim Burgestrand kim@burgestrand.se
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal