RubyGems - protk - Versions diffs - 1.2.0 → 1.2.1 - Mend

protk 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data/README.md +69 -36
data/bin/gffmerge.rb +16 -5
data/bin/mascot_search.rb +174 -97
data/bin/omssa_search.rb +31 -8
data/bin/protk_setup.rb +0 -3
data/bin/sixframe.rb +0 -6
data/bin/tandem_search.rb +97 -6
data/bin/template_search.rb +144 -0
data/bin/uniprot_annotation.rb +130 -0
data/lib/convert_util.rb +27 -0
data/lib/pepxml.rb +22 -0
data/lib/protk/big_search_rakefile.rake +16 -0
data/lib/protk/big_search_tool.rb +23 -0
data/lib/protk/bio_sptr_extensions.rb +41 -2
data/lib/protk/constants.rb +13 -0
data/lib/protk/data/apt-get_packages.yaml +4 -0
data/lib/protk/data/default_config.yml +1 -0
data/lib/protk/data/make_uniprot_table.rb +29 -0
data/lib/protk/data/predefined_db.trembl_annotation.yaml +20 -0
data/lib/protk/data/uniprot_accessions.loc +96 -0
data/lib/protk/data/uniprot_accessions_table.txt +97 -0
data/lib/protk/data/uniprot_input_accessions.loc +95 -0
data/lib/protk/manage_db_rakefile.rake +25 -11
data/lib/protk/omssa_util.rb +1 -1
data/lib/protk/setup_rakefile.rake +39 -2
metadata +13 -1

data/bin/omssa_search.rb CHANGED

@@ -40,6 +40,20 @@ search_tool.option_parser.on( '--galaxy-index-dir dir', 'Specify galaxy index di
   search_tool.options.galaxy_index_dir=dir
 end
+search_tool.options.omx_output=nil
+search_tool.option_parser.on( '--omx-output path', 'Specify path for additional OMX output (optional).' ) do |path|
+  search_tool.options.omx_output=path
+end
+if ( ENV['PROTK_OMSSA_NTHREADS'] )
+  search_tool.options.nthreads=ENV['PROTK_OMSSA_NTHREADS']
+else
+  search_tool.options.nthreads=0
+end
+search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use. Default is to use the value in environment variable PROTK_OMSSA_NTHREADS or else to autodetect' ) do |num|
+  search_tool.options.nthreads=num
+end
 search_tool.option_parser.parse!
 # Environment with global constants
@@ -96,7 +110,7 @@ ARGV.each do |filename|
     # The basic command
     #
-    cmd = "#{make_blastdb_cmd} #{genv.omssacl} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
+    cmd = "#{make_blastdb_cmd} #{genv.omssacl} -nt #{search_tool.nthreads} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
     #Missed cleavages
     #
@@ -118,6 +132,10 @@ ARGV.each do |filename|
       end
     end
+    if ( search_tool.omx_output )
+      cmd << " -ox #{search_tool.omx_output} "
+    end
     # Precursor tolerance
     #
@@ -150,8 +168,9 @@ ARGV.each do |filename|
     # Variable Modifications
     #
-    if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
+    if ( search_tool.var_mods !="" && !(search_tool.var_mods =~/None/)) # Checking for none is to cope with galaxy input
       var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join(",")
       if ( var_mods !="" )
         cmd << " -mv #{var_mods}"
       end
@@ -163,7 +182,7 @@ ARGV.each do |filename|
       end
     end
-    if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
+    if ( search_tool.fix_mods !="" && !(search_tool.fix_mods=~/None/))
       fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
       if ( fix_mods !="")
         cmd << " -mf #{fix_mods}"
@@ -182,7 +201,7 @@ ARGV.each do |filename|
       end
     end
-    if ( search_tool.searched_ions !="" && !search_tool.searched_ions=~/None/)
+    if ( search_tool.searched_ions !="" && !(search_tool.searched_ions=~/None/))
       searched_ions=search_tool.searched_ions.split(",").collect{ |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
       if ( searched_ions!="")
         cmd << " -i #{searched_ions}"
@@ -204,19 +223,23 @@ ARGV.each do |filename|
     # Intensity cut-off
     cmd << " -ci #{search_tool.intensity_cut_off}"
+    # Send output to logfile. OMSSA Logging does not play well with Ruby Open4
+    cmd << " -logfile omssa.log"
     # Up to here we've formulated the omssa command. The rest is cleanup
     p "Running:#{cmd}"
     # Add retention time corrections
     #
     if (search_tool.options.add_retention_times)
-      cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
+      # TODO: Really correct rts
+#      cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
     end
     # Correct the pepXML file
     #
-    cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
-    genv.log("Running repair script command #{cmd}",:info)
+#    cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
+#    genv.log("Running repair script command #{cmd}",:info)
     # Run the search
     #
@@ -234,4 +257,4 @@ ARGV.each do |filename|
   #
   make_blastdb_cmd=""
-end
+end

data/bin/protk_setup.rb CHANGED

@@ -41,8 +41,5 @@ env=Constants.new
 toolname=ARGV.shift
-p ARGV
-p toolname
 p "Installing #{toolname}"
 tool.install toolname

data/bin/sixframe.rb CHANGED

@@ -40,12 +40,6 @@ file.each do |entry|
         position_start = position
         position_end = position_start + orf.length*3 -1
-        if ( frame > 3)
-            position_start = length - (position - 1)
-            position_end = position_start - orf.length * 3 + 1
-        end
         # Create accession compliant with NCBI naming standard
         # See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
         ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')

data/bin/tandem_search.rb CHANGED

@@ -48,7 +48,18 @@ search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem O
 end
 search_tool.options.thresholds_type = 'isb_kscore'
-search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold)' ) do |thresholds_type|
+search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
+  # This option sets up various X! Tandem thresholds.
+  #  - system_default: Don't change any defaults just use
+  #      the defaults for this TPP install as is.
+  #  - tandem_default: These thresholds are found on the
+  #      tandem api page. http://www.thegpm.org/tandem/api/index.html
+  #  - isb_native: These are the defaults found in
+  #      isb_default_input_native.xml distributed with TPP 4.6.
+  #  - isb_kscore: These are the defaults found in
+  #      isb_default_input_kscore.xml distributed with TPP 4.6.
+  #  - scaffold: These are the defaults recommended by Proteome Software
+  #      for use with Scaffold.
   search_tool.options.thresholds_type = thresholds_type
 end
@@ -57,11 +68,12 @@ search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscor
   search_tool.options.algorithm = algorithm
 end
-search_tool.options.cleavage_semi = false
-search_tool.option_parser.on( '--cleavage-semi' ) do
-  search_tool.options.cleavage_semi = true
+search_tool.options.cleavage_semi = true
+search_tool.option_parser.on( '--no-cleavage-semi' ) do
+  search_tool.options.cleavage_semi = false
 end
 search_tool.options.n_terminal_mod_mass=nil
 search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
     search_tool.options.n_terminal_mod_mass = mass
@@ -122,9 +134,16 @@ def decode_modification_string(mstring)
   mstring
 end
+def set_option(std_params, tandem_key, value)
+  notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
+  throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
+  notes[0].content=value
+end
 def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
+  set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
+  set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
   # Set the input and output paths
   #
   input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
@@ -184,6 +203,78 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
     isotopic_error[0].content="no"
   end
+  if search_tool.tandem_output
+    # If one is interested in the tandem output (e.g. for consumption by Scaffold)
+    # want to store additional information.
+    set_option(std_params, "output, spectra", "yes")
+  end
+  thresholds_type = search_tool.thresholds_type
+  if thresholds_type != "system_default"
+    maximum_valid_expectation_value = "0.1"
+    if thresholds_type == "scaffold"
+      maximum_valid_expectation_value = "1000"
+    end
+    minimum_ion_count = "4"
+    case thresholds_type
+    when "isb_kscore", "isb_native"
+      minimum_ion_count = "1"
+    when "scaffold"
+      minimum_ion_count = "0"
+    end
+    minimum_peaks = "15"
+    case thresholds_type
+    when "isb_native"
+      minimum_peaks = "6"
+    when "isb_kscore"
+      minimum_peaks = "10"
+    when "scaffold"
+      minimum_peaks = "0"
+    end
+    minimum_fragement_mz = "150"
+    case thresholds_type
+    when "isb_native"
+      minimum_fragement_mz = "50"
+    when "isb_kscore"
+      minimum_fragement_mz = "125"
+    when "scaffold"
+      minimum_fragement_mz = "0"
+    end
+    minimum_parent_mh = "500" # tandem and isb_native defaults
+    case thresholds_type
+    when "isb_kscore"
+      minimum_parent_mh = "600"
+    when "scaffold"
+      minimum_parent_mh = "0"
+    end
+    use_noise_suppression = "yes"
+    if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
+      use_noise_suppression = "no"
+    end
+    dynamic_range = "100.0"
+    case thresholds_type
+    when "isb_kscore"
+      dynamic_range = "10000.0"
+    when "scaffold"
+      dynamic_range = "1000.0"
+    end
+    set_option(std_params, "spectrum, dynamic range", dynamic_range)
+    set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
+    set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
+    set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
+    set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
+    set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
+    set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
+  end
   # Fixed and Variable Modifications
   #

data/bin/template_search.rb ADDED

@@ -0,0 +1,144 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 14/12/2010
+#
+# Runs an MS/MS search using the MSGFPlus search engine
+#
+require 'protk/search_tool'
+# Setup specific command-line options for this tool. Other options are inherited from SearchTool
+#
+search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
+search_tool.option_parser.banner = "Run an msms search on a set of msms spectrum input files.\n\nUsage: template_search.rb [options] file1.mzML file2.mzML ..."
+search_tool.options.output_suffix="_template"
+search_tool.options.custom_option="default"
+search_tool.option_parser.on('--custom-opt value','Custom option relevant to this tool only (Default default)') do |val|
+  search_tool.options.custom_option=val
+end
+search_tool.option_parser.parse!
+# Set search engine specific parameters on the SearchTool object
+#
+msgf_bin="#{genv.msgf_bin}/MSGFPlus.jar"
+case
+when Pathname.new(search_tool.database).exist? # It's an explicitly named db
+  current_db=Pathname.new(search_tool.database).realpath.to_s
+else
+  current_db=search_tool.current_database :fasta
+end
+fragment_tol = search_tool.fragment_tol
+precursor_tol = search_tool.precursor_tol
+throw "When --output is set only one file at a time can be run" if  ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
+# Run the search engine on each input file
+#
+ARGV.each do |filename|
+  if ( search_tool.explicit_output!=nil)
+    output_path=search_tool.explicit_output
+  else
+    output_path="#{search_tool.output_base_path(filename.chomp)}.mzid"
+  end
+  # (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
+  # Get the input file extension
+  ext = Pathname.new(filename).extname
+  input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
+  # Only proceed if the output file is not present or we have opted to over-write it
+  #
+  if ( search_tool.over_write || !Pathname.new(output_path).exist? )
+    # The basic command
+    #
+    cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{output_path} "
+    #Missed cleavages
+    #
+    throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages > 2)
+    cmd << " -ntt #{search_tool.missed_cleavages}"
+    # Precursor tolerance
+    #
+    cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"
+    # Instrument type
+    #
+    cmd << " -inst 2"
+#    cmd << " -m 4"
+    cmd << " -addFeatures 1"
+    # Enzyme
+    #
+  #    if ( search_tool.enzyme!="Trypsin")
+  #      cmd << " -e #{search_tool.enzyme}"
+  #    end
+  mods_path="#{search_tool.input_base_path(filename.chomp)}.msgfplus_mods.txt"
+  mods_file=File.open(mods_path,'w+')
+    # Variable Modifications
+    #
+    if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
+      var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join(",")
+      if ( var_mods !="" )
+        cmd << " -mv #{var_mods}"
+      end
+    else
+      # Add options related to peptide modifications
+      #
+      if ( search_tool.glyco )
+        cmd << " -mv 119 "
+      end
+    end
+  # Fixed modifications
+  #
+    if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
+      fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
+      if ( fix_mods !="")
+        cmd << " -mf #{fix_mods}"
+      end
+    else
+      if ( search_tool.has_modifications )
+        cmd << " -mf "
+        if ( search_tool.carbamidomethyl )
+          cmd<<"3 "
+        end
+        if ( search_tool.methionine_oxidation )
+          cmd<<"1 "
+        end
+      end
+    end
+    # Up to here we've formulated the omssa command. The rest is cleanup
+    p "Running:#{cmd}"
+    # Run the search
+    #
+    job_params= {:jobid => search_tool.jobid_from_filename(filename) }
+    job_params[:queue]="lowmem"
+    job_params[:vmem]="900mb"
+    search_tool.run(cmd,genv,job_params)
+  else
+    genv.log("Skipping search on existing file #{output_path}",:warn)
+  end
+end

data/bin/uniprot_annotation.rb ADDED

@@ -0,0 +1,130 @@
+#!/usr/bin/env ruby
+#
+# This file is part of Protk
+# Created by Ira Cooke 24/3/2013
+#
+# Retrieve annotation information for proteins from the Uniprot Swissprot database
+#
+#
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+require 'protk/swissprot_database'
+require 'protk/bio_sptr_extensions'
+# Setup specific command-line options for this tool. Other options are inherited from Tool
+#
+tool=Tool.new({:explicit_output=>true})
+tool.option_parser.banner = "Retrieve information from the Uniprot database given a list of ID's.\n\n\
+Usage: uniprot_annotation.rb [options] input.tsv"
+tool.options.id_column=1
+tool.option_parser.on(  '--id-column num', 'Specify a column for ids (default is column 1)' ) do |col|
+  tool.options.id_column=col.to_i
+end
+tool.options.fields=nil
+tool.option_parser.on(  '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
+  tool.options.fields=flds
+end
+tool.option_parser.parse!
+# Obtain a global environment object
+genv=Constants.new
+input_file=ARGV[0]
+swissprotdb=SwissprotDatabase.new(genv)
+output_file=nil
+if ( tool.explicit_output==nil)
+  output_file=$stdout
+else
+  output_file=File.open(tool.explicit_output,'w+')
+end
+ac_column = tool.id_column-1
+db_fields = {
+  'recname'=>"Primary Name",
+  'cd'=>"CD Antigen Name",
+  'altnames'=>"Alternate Names",
+  'location' => "Subcellular Location",
+  'function' => "Known Function",
+  'similarity' => "Similarity",
+  'tissues' => "Tissue Specificity",
+  'disease' => "Disease Association",
+  'domain' => "Domain",
+  'subunit' => "Sub Unit",
+  'nextbio' => "NextBio",
+  'ipi' => "IPI",
+  'intact' => "Interactions",
+  'pride' => 'Pride',
+  'ensembl'=> 'Ensembl',
+  'num_transmem'=>"Transmembrane Regions",
+  'signalp'=>'Signal Peptide',
+  'ref_dump'=>'References',
+  'tax_dump'=>'Taxonomy Cross Ref',
+  'species_dump'=>'Species',
+  'feature_dump'=>'Feature Table',
+  'seq_dump' => 'AA Sequence'
+  }
+hyperlink_fields = {
+  'uniprot_link'=>"Uniprot Link",
+  'nextbio_link'=>'NextBio Link',
+  'intact_link'=>"Interactions Link",
+  'pride_link'=>"Pride Link",
+  'ensembl_link'=>"Ensembl Link"
+}
+if tool.fields !=nil
+  fields = tool.fields.split(",").collect { |f| f.lstrip.rstrip }.reject {|e| e.empty? }
+  db_fields = db_fields.select { |k| fields.include? k }
+  hyperlink_fields = hyperlink_fields.select { |k| fields.include? k}
+end
+output_file.write db_fields.values.join("\t")
+if ( hyperlink_fields.count > 0 )
+  output_file.write("\t")
+  output_file.write hyperlink_fields.values.join("\t")
+end
+output_file.write("\n")
+line_num=0
+File.foreach(input_file) { |line|
+  input_cols=line.split("\t")
+  throw "Not enough columns in line #{line_num}" unless input_cols.count > ac_column
+  accession=input_cols[ac_column].chomp
+  sptr_entry=swissprotdb.get_entry_for_name(accession)
+  if ( sptr_entry==nil)
+    genv.log("No entry for #{accession} in uniprot database",:warn)
+  else
+    db_values = db_fields.collect { |key,value|
+      sptr_entry.send(key)
+    }
+    hyperlink_values = hyperlink_fields.collect { |key,value|
+      sptr_entry.send(key)
+    }
+    output_file.write db_values.join("\t")
+    if ( hyperlink_fields.count > 0 )
+      output_file.write("\t")
+      output_file.write hyperlink_values.join("\t")
+    end
+    output_file.write "\n"
+  end
+  line_num+=1
+}