RubyGems - protk - Versions diffs - 1.2.6.pre5 → 1.3.0.pre1 - Mend

protk 1.2.6.pre5 → 1.3.0.pre1

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (57) hide show

checksums.yaml +4 -4
data/README.md +84 -45
data/bin/add_retention_times.rb +9 -5
data/bin/augustus_to_proteindb.rb +7 -11
data/bin/interprophet.rb +28 -46
data/bin/make_decoy.rb +16 -48
data/bin/mascot_search.rb +57 -71
data/bin/mascot_to_pepxml.rb +13 -26
data/bin/msgfplus_search.rb +70 -107
data/bin/omssa_search.rb +52 -109
data/bin/peptide_prophet.rb +44 -119
data/bin/pepxml_to_table.rb +24 -27
data/bin/protein_prophet.rb +22 -82
data/bin/protxml_to_gff.rb +22 -519
data/bin/protxml_to_table.rb +2 -16
data/bin/sixframe.rb +10 -32
data/bin/tandem_search.rb +30 -403
data/bin/tandem_to_pepxml.rb +43 -0
data/bin/unimod_to_loc.rb +1 -1
data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
data/ext/decoymaker/extconf.rb +3 -0
data/lib/protk/constants.rb +16 -2
data/lib/protk/data/default_config.yml +2 -1
data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
data/lib/protk/data/tandem_params.xml +17 -54
data/lib/protk/fastadb.rb +2 -2
data/lib/protk/prophet_tool.rb +1 -1
data/lib/protk/protxml_to_gff_tool.rb +474 -0
data/lib/protk/search_tool.rb +58 -103
data/lib/protk/setup_rakefile.rake +9 -5
data/lib/protk/tandem_search_tool.rb +256 -0
data/lib/protk/tool.rb +85 -104
data/lib/protk.rb +1 -6
metadata +24 -103
data/bin/annotate_ids.rb +0 -59
data/bin/asapratio.rb +0 -27
data/bin/blastxml_to_table.rb +0 -119
data/bin/correct_omssa_retention_times.rb +0 -27
data/bin/feature_finder.rb +0 -95
data/bin/file_convert.rb +0 -164
data/bin/generate_omssa_loc.rb +0 -42
data/bin/gffmerge.rb +0 -208
data/bin/libra.rb +0 -70
data/bin/toppas_pipeline.rb +0 -84
data/bin/uniprot_annotation.rb +0 -141
data/bin/xls_to_table.rb +0 -52
data/bin/xpress.rb +0 -27
data/ext/protk/decoymaker/extconf.rb +0 -3
data/ext/protk/simplealign/extconf.rb +0 -3
data/lib/protk/biotools_excel_converter.rb +0 -60
data/lib/protk/eupathdb_gene_information_table.rb +0 -158
data/lib/protk/gapped_aligner.rb +0 -264
data/lib/protk/protein_annotator.rb +0 -646
data/lib/protk/spreadsheet_extensions.rb +0 -79
data/lib/protk/xtandem_defaults.rb +0 -11

data/bin/peptide_prophet.rb CHANGED Viewed

@@ -17,102 +17,34 @@ input_stager = nil
 # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
 #
-prophet_tool=ProphetTool.new([:glyco,:explicit_output,:over_write,:maldi,:prefix_suffix])
+prophet_tool=ProphetTool.new([
+  :glyco,
+  :explicit_output,
+  :over_write,
+  :maldi,
+  :prefix,
+  :database])
 prophet_tool.option_parser.banner = "Run PeptideProphet on a set of pep.xml input files.\n\nUsage: peptide_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
-prophet_tool.options.output_suffix="_pproph"
-prophet_tool.options.useicat = false
-prophet_tool.option_parser.on( '--useicat',"Use icat information" ) do
-  prophet_tool.options.useicat = true
-end
-prophet_tool.options.nouseicat = false
-prophet_tool.option_parser.on( '--no-useicat',"Do not use icat information" ) do
-  prophet_tool.options.nouseicat = true
-end
-prophet_tool.options.phospho = false
-prophet_tool.option_parser.on( '--phospho',"Use phospho information" ) do
-  prophet_tool.options.phospho = true
-end
-prophet_tool.options.usepi = false
-prophet_tool.option_parser.on( '--usepi',"Use pI information" ) do
-  prophet_tool.options.usepi = true
-end
-prophet_tool.options.usert = false
-prophet_tool.option_parser.on( '--usert',"Use hydrophobicity / RT information" ) do
-  prophet_tool.options.usert = true
-end
-prophet_tool.options.accurate_mass = false
-prophet_tool.option_parser.on( '--accurate-mass',"Use accurate mass binning" ) do
-  prophet_tool.options.accurate_mass = true
-end
-prophet_tool.options.no_ntt = false
-prophet_tool.option_parser.on( '--no-ntt',"Don't use NTT model" ) do
-  prophet_tool.options.no_ntt = true
-end
-prophet_tool.options.no_nmc = false
-prophet_tool.option_parser.on( '--no-nmc',"Don't use NMC model" ) do
-  prophet_tool.options.no_nmc = true
-end
-prophet_tool.options.usegamma = false
-prophet_tool.option_parser.on( '--usegamma',"Use Gamma distribution to model the negatives" ) do
-  prophet_tool.options.usegamma = true
-end
-prophet_tool.options.use_only_expect = false
-prophet_tool.option_parser.on( '--use-only-expect',"Only use Expect Score as the discriminant" ) do
-  prophet_tool.options.use_only_expect = true
-end
-prophet_tool.options.force_fit = false
-prophet_tool.option_parser.on( '--force-fit',"Force fitting of mixture model and bypass checks" ) do
-  prophet_tool.options.force_fit = true
-end
-prophet_tool.options.allow_alt_instruments=false
-prophet_tool.option_parser.on( '--allow-alt-instruments',"Warning instead of exit with error if instrument types between runs is different" ) do
-  prophet_tool.options.allow_alt_instruments = true
-end
-prophet_tool.options.one_ata_time = false
-prophet_tool.option_parser.on( '-F', '--one-ata-time', 'Create a separate pproph output file for each analysis' ) do
-  prophet_tool.options.one_ata_time = true
-end
-prophet_tool.options.decoy_prefix="decoy"
-prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
-  prophet_tool.options.decoy_prefix = prefix
-end
-prophet_tool.options.no_decoys = false
-prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution') do
-  prophet_tool.options.no_decoys = true
-end
-prophet_tool.options.experiment_label=nil
-prophet_tool.option_parser.on('--experiment-label label','used to commonly label all spectra belonging to one experiment (required by iProphet)') do |label|
-  prophet_tool.options.experiment_label = label
-end
-prophet_tool.options.override_database=nil
-prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
-  prophet_tool.options.override_database = database
-end
-exit unless prophet_tool.check_options
-if ( ARGV[0].nil? )
-    puts "You must supply an input file"
-    puts prophet_tool.option_parser
-    exit
-end
+@output_suffix="_pproph"
+prophet_tool.options.database=nil
+prophet_tool.add_boolean_option(:useicat,false,['--useicat',"Use icat information"])
+prophet_tool.add_boolean_option(:phospho,false,['--phospho',"Use phospho information"])
+prophet_tool.add_boolean_option(:usepi,false,['--usepi',"Use pI information"])
+prophet_tool.add_boolean_option(:usert,false,['--usert',"Use hydrophobicity / RT information"])
+prophet_tool.add_boolean_option(:accurate_mass,false,['--accurate-mass',"Use accurate mass binning"])
+prophet_tool.add_boolean_option(:no_ntt,false,['--no-ntt',"Don't use NTT model"])
+prophet_tool.add_boolean_option(:no_nmc,false,['--no-nmc',"Don't use NMC model"])
+prophet_tool.add_boolean_option(:usegamma,false,['--usegamma',"Use Gamma distribution to model the negatives"])
+prophet_tool.add_boolean_option(:use_only_expect,false,['--use-only-expect',"Only use Expect Score as the discriminant"])
+prophet_tool.add_boolean_option(:force_fit,false,['--force-fit',"Force fitting of mixture model and bypass checks"])
+prophet_tool.add_boolean_option(:allow_alt_instruments,false,['--allow-alt-instruments',"Warning instead of exit with error if instrument types between runs is different"])
+prophet_tool.add_boolean_option(:one_ata_time,false,['-F', '--one-ata-time', 'Create a separate pproph output file for each analysis'])
+prophet_tool.add_value_option(:decoy_prefix,"decoy",['--decoy-prefix prefix', 'Prefix for decoy sequences'])
+prophet_tool.add_boolean_option(:no_decoys,false,['--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution'])
+prophet_tool.add_value_option(:experiment_label,nil,['--experiment-label label','used to commonly label all spectra belonging to one experiment (required by iProphet)'])
+exit unless prophet_tool.check_options(true)
 throw "When --output and -F options are set only one file at a time can be run" if  ( ARGV.length> 1 ) && ( prophet_tool.explicit_output!=nil ) && (prophet_tool.one_ata_time!=nil)
@@ -133,10 +65,11 @@ inputs.each {|file_name|
   name=file_name.chomp
   engine=prophet_tool.extract_engine(name)
-  if prophet_tool.override_database
-    db_path = prophet_tool.override_database
+  if prophet_tool.database
+    db_path = prophet_tool.database_info.path
   else
     db_path=prophet_tool.extract_db(name)
+    throw "Unable to find database #{db_path} used for searching. Specify database path using -d option" unless File.exist?(db_path)
   end
@@ -157,10 +90,10 @@ inputs=file_info.collect do |info|
   end
   throw "All files to be analyzed must have been searched with the same database and search engine" unless (info[1][:engine]==engine) && (info[1][:database])
-  retname=  "#{prophet_tool.input_base_path(info[0],".pep.xml")}.pep.xml"
-  if ( info[0]=~/\.dat$/)
-    retname=info[0]
-  end
+  retname=  info[0]
+  # if ( info[0]=~/\.dat$/)
+  #   retname=info[0]
+  # end
   retname
@@ -219,9 +152,7 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
   if prophet_tool.useicat
     cmd << " -Oi "
-  end
-  if prophet_tool.nouseicat
+  else
     cmd << " -Of"
   end
@@ -255,9 +186,7 @@ def run_peptide_prophet(genv,prophet_tool,cmd,output_path,engine)
   if ( !prophet_tool.over_write && Pathname.new(output_path).exist? )
     genv.log("Skipping analysis on existing file #{output_path}",:warn)
   else
-    jobscript_path="#{output_path}.pbs.sh"
-    job_params={:jobid=>engine, :vmem=>"900mb", :queue => "lowmem"}
-    code=prophet_tool.run(cmd,genv,job_params,jobscript_path)
+    code=prophet_tool.run(cmd,genv)
     throw "Command failed with exit code #{code}" unless code==0
   end
 end
@@ -265,27 +194,23 @@ end
 cmd=""
 if ( prophet_tool.one_ata_time )
-  inputs.each { |input|
-    output_file_name="#{prophet_tool.output_prefix}#{input}_#{engine}_interact#{prophet_tool.output_suffix}.pep.xml"
+  inputs.each do |input|
+    output_file_name=Tool.default_output_path(input,".pep.xml",prophet_tool.output_prefix,@output_suffix)
     cmd=generate_command(genv,prophet_tool,input,output_file_name,database,engine)
+    run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
+  end
-    run_peptide_prophet(genv,prophet_tool,cmd,output_file_base_name,engine)
-  }
 else
   if (prophet_tool.explicit_output==nil)
-    output_file_name="#{prophet_tool.output_prefix}#{engine}_interact#{prophet_tool.output_suffix}.pep.xml"
+    output_file_name=Tool.default_output_path(inputs,".pep.xml",prophet_tool.output_prefix,@output_suffix)
   else
     output_file_name=prophet_tool.explicit_output
   end
   cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
-  puts cmd
-  %x['ls']
   run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
 end

data/bin/pepxml_to_table.rb CHANGED Viewed

@@ -19,26 +19,17 @@ include LibXML
 tool=Tool.new([:explicit_output])
 tool.option_parser.banner = "Convert a pepXML file to a tab delimited table.\n\nUsage: pepxml_to_table.rb [options] file1.pep.xml"
-exit unless tool.check_options
-if ( ARGV[0].nil? )
-    puts "You must supply an input file"
-    puts tool.option_parser
-    exit
-end
-# Obtain a global environment object
-#genv=Constants.new
+exit unless tool.check_options(true)
 input_file=ARGV[0]
-output_file="#{input_file}.txt"
-output_file = tool.explicit_output if tool.explicit_output!=nil
-output_fh=File.new("#{output_file}",'w')
+if tool.explicit_output
+  output_fh=File.new("#{tool.explicit_output}",'w')
+else
+  output_fh=$stdout
+end
-output_fh.write "protein\tpeptide\tassumed_charge\tcalc_neutral_pep_mass\tneutral_mass\tretention_time\tstart_scan\tend_scan\tsearch_engine\tpeptideprophet_prob\tinterprophet_prob\n"
+output_fh.write "protein\tpeptide\tassumed_charge\tcalc_neutral_pep_mass\tneutral_mass\tretention_time\tstart_scan\tend_scan\tsearch_engine\traw_score\tpeptideprophet_prob\tinterprophet_prob\n"
 XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
@@ -68,18 +59,24 @@ spectrum_queries.each do |query|
   start_scan=query.attributes['start_scan']
   end_scan=query.attributes['end_scan']
-  search_engine=""
-  search_score_names=top_search_hit.find("./#{pepxml_ns_prefix}search_score/@name",pepxml_ns).collect {|s| s.to_s}
+  run_summary_node=query.parent
+  # puts run_summary_node
+  search_summary_node=run_summary_node.find("./#{pepxml_ns_prefix}search_summary",pepxml_ns)[0]
+   # puts search_summary_node.attributes.each { |e| puts e }
+  search_engine=search_summary_node.attributes['search_engine']
+  # search_engine=""
-  search_engine=query.parent.attributes['search_engine']
-  # if ( search_score_names.length==2 && search_score_names.grep(/^name.*=.*pvalue/))
-  #   search_engine="omssa"
-  # elsif ( search_score_names.grep(/^name.*=.*ionscore/))
-  #   search_engine="mascot"
-  # elsif ( search_score_names.grep(/^name.*=.*hyperscore/) )
-  #   search_engine="x!tandem"
-  # end
+  raw_score=""
+  case search_engine
+  when /[Tt]andem/
+    search_score_nodes=top_search_hit.find("./#{pepxml_ns_prefix}search_score[@name=\"expect\"]",[pepxml_ns])
+    raw_score=search_score_nodes[0].attributes['value']
+  when /MS\-GF/
+    search_score_nodes=top_search_hit.find("./#{pepxml_ns_prefix}search_score[@name=\"EValue\"]",[pepxml_ns])
+    raw_score=search_score_nodes[0].attributes['value']
+  end
   pp_result=top_search_hit.find("./#{pepxml_ns_prefix}analysis_result/#{pepxml_ns_prefix}peptideprophet_result/@probability",pepxml_ns)
@@ -90,7 +87,7 @@ spectrum_queries.each do |query|
   peptide_prophet_prob=pp_result[0].value if ( pp_result.length>0 )
   interprophet_prob=ip_result[0].value if ( ip_result.length>0)
-  output_fh.write "#{protein}\t#{peptide}\t#{assumed_charge}\t#{calc_neutral_pep_mass}\t#{neutral_mass}\t#{retention_time}\t#{start_scan}\t#{end_scan}\t#{search_engine}\t#{peptide_prophet_prob}\t#{interprophet_prob}\n"
+  output_fh.write "#{protein}\t#{peptide}\t#{assumed_charge}\t#{calc_neutral_pep_mass}\t#{neutral_mass}\t#{retention_time}\t#{start_scan}\t#{end_scan}\t#{search_engine}\t#{raw_score}\t#{peptide_prophet_prob}\t#{interprophet_prob}\n"
 end

data/bin/protein_prophet.rb CHANGED Viewed

@@ -17,98 +17,41 @@ for_galaxy = GalaxyUtil.for_galaxy?
 # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
 #
-prophet_tool=ProphetTool.new([:glyco,:explicit_output,:over_write,:prefix_suffix])
+prophet_tool=ProphetTool.new([:glyco,:explicit_output,:over_write,:prefix])
 prophet_tool.option_parser.banner = "Run ProteinProphet on a set of pep.xml input files.\n\nUsage: protein_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
-prophet_tool.options.output_suffix="_protproph"
-prophet_tool.options.iproph = false
-prophet_tool.option_parser.on( '--iprophet-input',"Inputs are from iProphet" ) do
-  prophet_tool.options.iproph = true
-end
-prophet_tool.options.nooccam = false
-prophet_tool.option_parser.on( '--no-occam',"Do not attempt to derive the simplest protein list explaining observed peptides" ) do
-  prophet_tool.options.nooccam = true
-end
-prophet_tool.options.groupwts = false
-prophet_tool.option_parser.on( '--group-wts',"Check peptide's total weight (rather than actual weight) in the Protein Group against the threshold" ) do
-  prophet_tool.options.groupwts = true
-end
-prophet_tool.options.normprotlen = false
-prophet_tool.option_parser.on( '--norm-protlen',"Normalize NSP using Protein Length" ) do
-  prophet_tool.options.normprotlen = true
-end
+@output_suffix="_protproph"
-prophet_tool.options.logprobs = false
-prophet_tool.option_parser.on( '--log-prob',"Use the log of probability in the confidence calculations" ) do
-  prophet_tool.options.logprobs = true
-end
-prophet_tool.options.confem = false
-prophet_tool.option_parser.on( '--confem',"Use the EM to compute probability given the confidence" ) do
-  prophet_tool.options.confem = true
-end
-prophet_tool.options.allpeps = false
-prophet_tool.option_parser.on( '--allpeps',"Consider all possible peptides in the database in the confidence model" ) do
-  prophet_tool.options.allpeps = true
-end
-prophet_tool.options.unmapped = false
-prophet_tool.option_parser.on( '--unmapped',"Report results for unmapped proteins" ) do
-  prophet_tool.options.unmapped = true
-end
-prophet_tool.options.instances = false
-prophet_tool.option_parser.on( '--instances',"Use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment" ) do
-  prophet_tool.options.instances = true
-end
-prophet_tool.options.delude = false
-prophet_tool.option_parser.on( '--delude',"Do NOT use peptide degeneracy information when assessing proteins" ) do
-  prophet_tool.options.delude = true
-end
-prophet_tool.options.minprob = 0.05
-prophet_tool.option_parser.on( '--minprob mp',"Minimum peptide prophet probability for peptides to be considered" ) do |mp|
-  prophet_tool.options.minprob = mp
-end
-prophet_tool.options.minindep = 0
-prophet_tool.option_parser.on( '--minindep mp',"Minimum percentage of independent peptides required for a protein" ) do |mp|
-  prophet_tool.options.minindep = mp
-end
-exit unless prophet_tool.check_options
-if ( ARGV[0].nil? )
-    puts "You must supply an input file"
-    puts prophet_tool.option_parser
-    exit
-end
+prophet_tool.add_boolean_option(:iproph,false,['--iprophet-input',"Inputs are from iProphet"])
+prophet_tool.add_boolean_option(:nooccam,false,['--no-occam',"Do not attempt to derive the simplest protein list explaining observed peptides"])
+prophet_tool.add_boolean_option(:groupwts,false,['--group-wts',"Check peptide's total weight (rather than actual weight) in the Protein Group against the threshold"])
+prophet_tool.add_boolean_option(:normprotlen,false,['--norm-protlen',"Normalize NSP using Protein Length"])
+prophet_tool.add_boolean_option(:logprobs,false,['--log-prob',"Use the log of probability in the confidence calculations"])
+prophet_tool.add_boolean_option(:confem,false,['--confem',"Use the EM to compute probability given the confidence"])
+prophet_tool.add_boolean_option(:allpeps,false,['--allpeps',"Consider all possible peptides in the database in the confidence model"])
+prophet_tool.add_boolean_option(:unmapped,false,['--unmapped',"Report results for unmapped proteins"])
+prophet_tool.add_boolean_option(:instances,false,['--instances',"Use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment"])
+prophet_tool.add_boolean_option(:delude,false,['--delude',"Do NOT use peptide degeneracy information when assessing proteins"])
+prophet_tool.add_value_option(:minprob,0.05,['--minprob mp',"Minimum peptide prophet probability for peptides to be considered"])
+prophet_tool.add_value_option(:minindep,0,['--minindep mp',"Minimum percentage of independent peptides required for a protein"])
+exit unless prophet_tool.check_options(true)
 # Obtain a global environment object
 genv=Constants.new
-if ( prophet_tool.explicit_output==nil )
-	output_file="#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.prot.xml"
- else
-	output_file=prophet_tool.explicit_output
-end
+inputs = ARGV.collect {|file_name| file_name.chomp }
-p output_file
+if ( prophet_tool.explicit_output )
+  output_file=prophet_tool.explicit_output
+else
+  output_file=Tool.default_output_path(inputs,".prot.xml",prophet_tool.output_prefix,@output_suffix)
+end
 if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
   cmd="ProteinProphet NOPLOT "
-  inputs = ARGV.collect {|file_name|
-    file_name.chomp
-  }
   if for_galaxy
     inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
   end
@@ -122,10 +65,7 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
   # Run the analysis
   #
-  jobscript_path="#{output_file}.pbs.sh"
-  job_params={:jobid=>"protproph", :vmem=>"900mb", :queue => "lowmem"}
-  genv.log("Running #{cmd}",:info)
-  code = prophet_tool.run(cmd,genv,job_params,jobscript_path)
+  code = prophet_tool.run(cmd,genv)
   throw "Command failed with exit code #{code}" unless code==0
 else
   genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)