RubyGems - protk - Versions diffs - 1.1.0.pre - Mend

protk 1.1.0.pre

Files changed (63) hide show

data/README.md +85 -0
data/bin/annotate_ids.rb +59 -0
data/bin/big_search.rb +41 -0
data/bin/correct_omssa_retention_times.rb +27 -0
data/bin/feature_finder.rb +76 -0
data/bin/file_convert.rb +157 -0
data/bin/generate_omssa_loc.rb +42 -0
data/bin/interprophet.rb +91 -0
data/bin/make_decoy.rb +64 -0
data/bin/manage_db.rb +123 -0
data/bin/mascot_search.rb +187 -0
data/bin/mascot_to_pepxml.rb +44 -0
data/bin/msgfplus_search.rb +191 -0
data/bin/omssa_search.rb +205 -0
data/bin/peptide_prophet.rb +245 -0
data/bin/pepxml_to_table.rb +78 -0
data/bin/protein_prophet.rb +140 -0
data/bin/protk_setup.rb +31 -0
data/bin/repair_run_summary.rb +113 -0
data/bin/tandem_search.rb +292 -0
data/bin/template_search.rb +144 -0
data/bin/unimod_to_loc.rb +118 -0
data/bin/xls_to_table.rb +46 -0
data/ext/protk/extconf.rb +3 -0
data/ext/protk/protk.c +235 -0
data/lib/protk/big_search_rakefile.rake +16 -0
data/lib/protk/big_search_tool.rb +23 -0
data/lib/protk/bio_sptr_extensions.rb +210 -0
data/lib/protk/biotools_excel_converter.rb +60 -0
data/lib/protk/command_runner.rb +84 -0
data/lib/protk/constants.rb +296 -0
data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
data/lib/protk/data/apt-get_packages.yaml +47 -0
data/lib/protk/data/brew_packages.yaml +10 -0
data/lib/protk/data/default_config.yml +20 -0
data/lib/protk/data/predefined_db.crap.yaml +19 -0
data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
data/lib/protk/data/tandem_params.xml +56 -0
data/lib/protk/data/taxonomy_template.xml +9 -0
data/lib/protk/data/unimod.xml +16780 -0
data/lib/protk/eupathdb_gene_information_table.rb +158 -0
data/lib/protk/galaxy_stager.rb +24 -0
data/lib/protk/galaxy_util.rb +9 -0
data/lib/protk/manage_db_rakefile.rake +484 -0
data/lib/protk/manage_db_tool.rb +181 -0
data/lib/protk/mascot_util.rb +63 -0
data/lib/protk/omssa_util.rb +57 -0
data/lib/protk/plasmodb.rb +50 -0
data/lib/protk/prophet_tool.rb +85 -0
data/lib/protk/protein_annotator.rb +646 -0
data/lib/protk/protxml.rb +137 -0
data/lib/protk/randomize.rb +7 -0
data/lib/protk/search_tool.rb +182 -0
data/lib/protk/setup_rakefile.rake +245 -0
data/lib/protk/setup_tool.rb +19 -0
data/lib/protk/spreadsheet_extensions.rb +78 -0
data/lib/protk/swissprot_database.rb +38 -0
data/lib/protk/tool.rb +182 -0
data/lib/protk/xtandem_defaults.rb +11 -0
data/lib/protk.rb +18 -0
metadata +256 -0

data/README.md ADDED Viewed

@@ -0,0 +1,85 @@
+# protk ( Proteomics toolkit )
+***
+## What is it?
+Protk is a wrapper for various proteomics tools. Initially it focusses on MS/MS database search and validation.
+## Why do we need a wrapper around these tools
+The aim of protk is to present a consistent interface to numerous proteomics tools that is as uniform as possible. Protk also provides built-in support for managing protein databases.
+***
+## Basic Installation
+1. Install rvm
+curl -L https://get.rvm.io | bash -s stable
+On OSX
+- rvm install 1.9.3 --with-gcc=clang
+- rvm use 1.9.3
+- gem install protk
+- protk_setup.rb all
+On Linux
+- rvm install 1.9.3
+- rvm use 1.9.3
+- gem install protk
+- sudo protk_setup.rb system_dependencies
+- protk_setup.rb all
+## Sequence databases
+After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following command;
+    manage_db.rb add sphuman
+You should now be able to run database searches, specifying this database by using the -d sphuman flag.  Every month or so swissprot will release a new database version. You can keep your database up to date using;
+    manage_db.rb update sphuman
+This will update the database only if any of its source files (or ftp release notes) have changed. The manage_db.rb tool also allows completely custom databases to be configured. Setup requires adding quite a few command-line options, but once set up, databases can easily be updated without further configuration. The example below shows the command-line arguments required to manually configure the sphuman database.
+    manage_db.rb add --ftp-source 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt' --include-filters '/OS=Homo\ssapiens/' --id-regex 'sp\|.*\|(.*?)\s' --add-decoys --make-blast-index --archive-old sphuman
+## Galaxy integration
+Although all the protk tools can be run directly from the command-line a nicer way to run them (and visualise outputs) is to use the galaxy web application.
+1. Check out and install the latest stable galaxy [see the official galaxy wiki for more detailed setup instructions](http://wiki.g2.bx.psu.edu/Admin/Get%20Galaxy "galaxy wiki")
+        hg clone https://bitbucket.org/galaxy/galaxy-dist
+		cd galaxy-dist
+		sh run.sh
+2. Make the protk tools available to galaxy.
+    - Create a directory for galaxy tool dependencies. It's best if this directory is outside the galaxy-dist directory. I usually create a directory called `tool_depends` alongside `galaxy-dist`.
+    - Open the file `universe_wsgi.ini` in the `galaxy-dist` directory and set the configuration option `tool_dependency_dir` to point to the directory you just created
+    - Create a symbolic link from the protk directory to the appropriate subdirectory of `<tool_dependency_dir>`. In the instructions below substitute 1.0.0 for the version number of [the protk galaxy tools](https://bitbucket.org/iracooke/protk-toolshed "protk galaxy tools") you are using.
+            cd <tool_dependency_dir>
+            mkdir protk
+			cd protk
+            mkdir 1.0.0
+            ln -s 1.0.0 default
+            ln -s <path_where_protk_was_installed> 1.0.0/bin
+3. Configure the shell in which galaxy tools will run.
+    - Create a symlink to the `env.sh` file so it will be sourced by galaxy as it runs each tool. This file should have been autogenerated by `setup.sh`
+            ln -s <path_where_protk_was_installed>/env.sh 1.0.0/env.sh
+4. Install the protk galaxy wrapper tools from the galaxy toolshed. You will need to restart galaxy after doing so for the new datatype sniffers to be activated.
+5. After installing the protk wrapper tools from the toolshed it will be necessary to tell those tools about the databases you have installed, using the manage_db.rb tool. First edit config.yml to make sure the `galaxy_root` setting points to the root directory of your galaxy installation (this will allow `manage_db.rb` to update the `pepxml_databases.loc` file inside `galaxy_root/tool-data`). Then run the following command and restart the galaxy server:
+		manage_db.rb list -G

data/bin/annotate_ids.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+#
+# This file is part of MSLIMS
+# Created by Ira Cooke 21/7/2011
+#
+# Takes an input file with a list of identified proteins and creates a table with swissprot/uniprot database details in various columns for each protein in the input file.
+#
+#
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/prophet_tool'
+require 'protk/protein_annotator'
+# Setup specific command-line options for this tool. Other options are inherited from Tool
+#
+id_tool=ProphetTool.new({:explicit_output=>true,:over_write=>true})
+id_tool.option_parser.banner = "Run ID annotation on a prot.xml input file.\n\nUsage: annotate_ids.rb [options] file1.prot.xml"
+id_tool.options.output_prefix="annotated_"
+id_tool.options.input_format=nil
+id_tool.option_parser.on( '-I', '--input-format format', 'Format of input file' ) do |format|
+  id_tool.options.input_format = format
+end
+id_tool.option_parser.parse!
+# Obtain a global environment object
+genv=Constants.new
+input_file=ARGV[0]
+database_file=id_tool.extract_db(input_file)
+output_file=nil
+if ( id_tool.explicit_output==nil)
+  output_file="#{id_tool.output_prefix}#{input_file}#{id_tool.output_suffix}.xls"
+else
+  output_file=id_tool.explicit_output
+end
+converter=ProteinAnnotator.new
+begin
+  outpath=Pathname.new(output_file)
+  if ( id_tool.over_write || !outpath.exist? )
+    converter.convert(input_file,output_file,id_tool.input_format)
+  else
+    p "Output file #{output_file} already exists"
+  end
+rescue Exception
+  p "Couldn't convert #{input_file}"
+  raise
+end

data/bin/big_search.rb ADDED Viewed

@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 14/12/2010
+#
+# Runs an MS/MS search using multiple search engines on multiple files in parallel
+# Merges results using interprophet to produce a single output file
+#
+# This tool assumes that datasets are from an ESI-QUAD-TOF instrument
+#
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/search_tool'
+require 'protk/big_search_tool'
+require 'rest_client'
+require 'rake'
+# Environment with global constants
+#
+genv=Constants.new
+# Setup specific command-line options for this tool. Other options are inherited from SearchTool
+#
+search_tool=SearchTool.new({:msms_search=>true,:background=>false,:database=>true,:over_write=>true,:glyco=>true,:explicit_output=>true})
+search_tool.jobid_prefix="b"
+search_tool.option_parser.banner = "Run a multi-search engine search on a set of input files.\n\nUsage: big_search.rb [options] file1.mzML file2.mzML ..."
+search_tool.options.output_suffix="_multisearch"
+search_tool.options.ncpu=1
+search_tool.option_parser.on( '-N', '--ncpu n', 'Split tasks into n separate processes if possible' ) do |n|
+  search_tool.options.ncpu=n
+end
+search_tool.option_parser.parse!
+bgsrch = BigSearchTool.new
+p bgsrch.run ["hi", "howdy"]

data/bin/correct_omssa_retention_times.rb ADDED Viewed

@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 14/12/2010
+#
+# Corrects retention times in omssa output
+#
+$VERBOSE=nil
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+require 'protk/omssa_util'
+# Environment with global constants
+#
+genv=Constants.new
+tool=Tool.new
+tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
+tool.option_parser.parse!
+OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)

data/bin/feature_finder.rb ADDED Viewed

@@ -0,0 +1,76 @@
+#
+# This file is part of protk
+# Created by Ira Cooke 21/3/2012
+#
+# A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
+#
+#
+#!/bin/sh
+if [ -z "$PROTK_RUBY_PATH" ] ; then
+  PROTK_RUBY_PATH=`which ruby`
+fi
+eval 'exec "$PROTK_RUBY_PATH" $PROTK_RUBY_FLAGS -rubygems -x -S $0 ${1+"$@"}'
+echo "The 'exec \"$PROTK_RUBY_PATH\" -x -S ...' failed!" >&2
+exit 1
+#! ruby
+#
+$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/lib/")
+require 'constants'
+require 'command_runner'
+require 'tool'
+# Declare the command-line options specific to this tool. Remaining options come from Tool
+#
+tool=Tool.new({:explicit_output=>true, :background=>true,:over_write=>true})
+tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
+# Inputs are treated as non-profile data unless --profile is given
+tool.options.profile = false
+tool.option_parser.on( '--profile',"Input files are profile data" ) { tool.options.profile = true }
+tool.option_parser.parse!
+# Obtain a global environment object
+genv=Constants.new
+def run_ff(genv,tool,cmd,output_path,jobid)
+  if ( !tool.over_write && Pathname.new(output_path).exist? )
+    genv.log("Skipping analysis on existing file #{output_path}",:warn)
+  else
+    jobscript_path="#{output_path}.pbs.sh"
+    job_params={:jobid=>jobid, :vmem=>"12Gb", :queue => "sixteen"}
+    code=tool.run(cmd,genv,job_params,jobscript_path)
+    throw "Command failed with exit code #{code}" unless code==0
+  end
+end
+throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
+throw "The profile option is not yet implemented" if ( tool.profile )
+ini_file="#{File.dirname(__FILE__)}/params/FeatureFinderCentroided.ini"
+ARGV.each do |filen|
+  input_file=filen.chomp
+  throw "Input must be an mzML file" unless input_file=~/\.mzML$/
+  input_basename=input_file.gsub(/\.mzML$/,'')
+  output_filename=tool.explicit_output
+  output_file="#{input_basename}.featureXML" if output_filename==nil
+  if ( tool.over_write || !Pathname.new(output_file).exist? )
+    output_dir=Pathname.new(output_file).dirname.realpath.to_s
+    output_base_filename=Pathname.new(output_file).basename.to_s
+    cmd=""
+    cmd<<"#{genv.openms_root}/FeatureFinderCentroided -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
+    run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
+  else
+    genv.log("Skipping search on existing file #{output_file}",:warn)
+  end
+end

data/bin/file_convert.rb ADDED Viewed

@@ -0,0 +1,157 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 14/12/2010
+#
+# Wrapper for msconvert
+#
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+require 'tempfile'
+require 'libxml'
+include LibXML
+# Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
+#
+def has_charge_information(input_filename)
+  #<precursorList count="1">
+  #        <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
+  #          <isolationWindow>
+  #            <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
+  #            <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
+  #            <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
+  #          </isolationWindow>
+  #          <selectedIonList count="1">
+  #            <selectedIon>
+  #              <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
+  #              <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
+  #              <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
+  #            </selectedIon>
+  #          </selectedIonList>
+  reader=XML::Reader.file(input_filename)
+  while(reader.read)
+    if ( reader.local_name=="precursor")
+      subdoc=reader.read_inner_xml
+      if ( subdoc =~ /MS:1000041/ )
+        return true
+      end
+    end
+  end
+  return false
+end
+# Setup specific command-line options for this tool. Other options are inherited from Tool
+#
+convert_tool=Tool.new({:explicit_output=>true,:over_write=>true,:maldi=>true})
+convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
+# Special case (usually tool specific options use capitals). Use lowercase l here to mimick maldi option in the search_tool class
+#
+convert_tool.options.maldi=false
+convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
+  convert_tool.options.maldi=true
+end
+convert_tool.options.output_format="mgf"
+convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
+  convert_tool.options.output_format=fmt
+end
+#convert_tool.options.missing_charge_state="false"
+#convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
+#  convert_tool.options.output_format=fmt
+#end
+#end
+convert_tool.option_parser.parse!
+# Environment with global constants
+#
+genv=Constants.new
+filename=ARGV[0]
+input_ext=Pathname.new(filename).extname
+input_relative_filename=Pathname.new(filename).basename.to_s
+base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
+output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
+if ( convert_tool.explicit_output )
+  output_filepath=Pathname.new(convert_tool.explicit_output)
+  base_output_dir=output_filepath.dirname.to_s
+  if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
+    # Convert base_output_dir to realpath
+    #
+    base_output_dir=Pathname.new(base_output_dir).realpath.to_s
+  end
+  output_filename=output_filepath.basename.to_s
+end
+# Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
+#
+output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
+Dir.mkdir(output_dir)
+throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
+genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
+runner=CommandRunner.new(genv)
+basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
+if ( convert_tool.maldi )
+  #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
+  runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
+else
+  if ( has_charge_information(filename) )
+    runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
+  else
+    # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
+    #
+    runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
+  end
+end
+# Find out what the output name was
+#
+tmp_output_filename=""
+Dir.foreach(output_dir) { |entry_name|
+  if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
+  else
+    tmp_output_filename=entry_name
+  end
+}
+# Cleanup after converting
+cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename}  #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
+code =runner.run_local(cmd)
+throw "Command failed with exit code #{code}" unless code==0
+throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )

data/bin/generate_omssa_loc.rb ADDED Viewed

@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+#
+# This file is part of MSLIMS
+# Created by Ira Cooke 12/4/2010
+#
+# Generates files required by the omssa galaxy wrapper
+#
+require 'protk/constants'
+# Environment with global constants
+#
+genv=Constants.new
+# Set search engine specific parameters on the SearchTool object
+#
+omssa_root="#{genv.omssa_root}/omssacl"
+# Get ommssa to print out a list of its acceptable modifications
+acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
+mod_vals=mod.split(":")
+[mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
+end
+# Drop the header
+#
+acceptable_mods.shift
+loc_output=File.new("omssa_mods.loc",'w')
+loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
+loc_output << "#\n"
+loc_output << "#\n"
+acceptable_mods.each { |am|
+  key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
+  loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
+}
+loc_output.close

data/bin/interprophet.rb ADDED Viewed

@@ -0,0 +1,91 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 18/1/2011
+#
+# Runs the InterProphet tool on a set of pep.xml files generated by peptide_prophet
+#
+#
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/prophet_tool'
+# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
+#
+prophet_tool=ProphetTool.new({:explicit_output=>true})
+prophet_tool.option_parser.banner = "Run InterProphet on a set of pep.xml input files.\n\nUsage: interprophet.rb [options] file1.pep.xml file2.pep.xml ..."
+prophet_tool.options.output_suffix="_iproph"
+prophet_tool.options.no_nss=""
+prophet_tool.option_parser.on( '--no-nss', 'Don\'t use NSS (Number of Sibling Searches) in Model' ) do
+  prophet_tool.options.no_nss="NONSS"
+end
+prophet_tool.options.no_nrs=""
+prophet_tool.option_parser.on('--no-nrs', 'Don\'t use NRS (Number of Replicate Spectra) in Model' ) do
+  prophet_tool.options.no_nrs="NONRS"
+end
+prophet_tool.options.no_nse=""
+prophet_tool.option_parser.on('--no-nse', 'Don\'t use NSE (Number of Sibling Experiments) in Model' ) do
+  prophet_tool.options.no_nse="NONSE"
+end
+prophet_tool.options.no_nsi=""
+prophet_tool.option_parser.on("--no-nsi",'Don\'t use NSE (Number of Sibling Ions) in Model' ) do
+  prophet_tool.options.no_nsi="NONSI"
+end
+prophet_tool.options.no_nsm=""
+prophet_tool.option_parser.on("--no-nsm",'Don\'t use NSE (Number of Sibling Modifications) in Model' ) do
+  prophet_tool.options.no_nsm="NONSM"
+end
+prophet_tool.options.min_prob=""
+prophet_tool.option_parser.on("--minprob mp","Minimum probability cutoff ") do |mp|
+  prophet_tool.options.min_prob=mp
+end
+prophet_tool.option_parser.parse!
+# Obtain a global environment object
+genv=Constants.new
+if ( prophet_tool.explicit_output != nil )
+    output_file=prophet_tool.explicit_output
+else
+  output_file="#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.pep.xml"
+end
+if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
+  cmd="#{genv.interprophetparser} #{prophet_tool.options.no_nss} #{prophet_tool.options.no_nrs} #{prophet_tool.options.no_nse} #{prophet_tool.options.no_nsi} #{prophet_tool.options.no_nsm}"
+  cmd << " MINPROB=#{min_prob}" if ( prophet_tool.options.min_prob !="" )
+  inputs = ARGV.collect {|file_name|
+    file_name.chomp
+  }
+  cmd << " #{inputs.join(" ")} #{output_file}"
+  genv.log("Running #{cmd}",:info)
+  # Run the analysis
+  #
+  jobscript_path="#{output_file}.pbs.sh"
+  job_params={:jobid=>"iprophet", :vmem=>"900mb", :queue => "lowmem"}
+  code = prophet_tool.run(cmd,genv,job_params,jobscript_path)
+  throw "Command failed with exit code #{code}" unless code==0
+else
+  genv.log("Interprophet output file #{output_file} already exists. Run with -r option to replace",:warn)
+end

data/bin/make_decoy.rb ADDED Viewed

@@ -0,0 +1,64 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 9/3/2012
+#
+# Create a decoy database based on a set of real protein sequences
+#
+#
+require 'libxml'
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+require 'bio'
+include LibXML
+# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
+#
+tool=Tool.new({:explicit_output=>true})
+tool.option_parser.banner = "Create a decoy database from real protein sequences.\n\nUsage: make_decoy.rb [options] realdb.fasta"
+tool.options.db_length=0
+tool.option_parser.on('-L len','--db-length len','Number of sequences to generate') do |len|
+  tool.options.db_length=len.to_i
+end
+tool.options.prefix_string="decoy_"
+tool.option_parser.on('-P str','--prefix-string str','String to prepend to sequence ids') do |str|
+  tool.options.prefix_string=str
+end
+tool.options.append=false
+tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
+  tool.options.append=true
+end
+tool.option_parser.parse!
+input_file=ARGV[0]
+db_length=tool.db_length
+if ( db_length==0) #If no db length was specified use the number of entries in the input file
+  db_length=Bio::FastaFormat.open(input_file).count
+  p "Found #{db_length} entries in input file"
+end
+output_file="decoy_#{input_file}"
+output_file = tool.explicit_output if tool.explicit_output!=nil
+genv=Constants.new()
+Randomize.make_decoys #{input_file} #{db_length} #{output_file} #{tool.prefix_string}"
+cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
+p cmd
+# Run the conversion
+#
+job_params= {:jobid => tool.jobid_from_filename(input_file) }
+job_params[:queue]="lowmem"
+job_params[:vmem]="900mb"
+tool.run(cmd,genv,job_params)