RubyGems - protk - Versions diffs - 1.2.1 → 1.2.2 - Mend

protk 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

data/bin/annotate_ids.rb +2 -2
data/bin/feature_finder.rb +8 -2
data/bin/file_convert.rb +8 -2
data/bin/gffmerge.rb +15 -20
data/bin/interprophet.rb +7 -3
data/bin/make_decoy.rb +9 -2
data/bin/mascot2xml.rb +87 -0
data/bin/mascot_search.rb +126 -187
data/bin/mascot_to_pepxml.rb +32 -3
data/bin/msgfplus_search.rb +58 -12
data/bin/omssa_search.rb +13 -2
data/bin/peptide_prophet.rb +8 -2
data/bin/pepxml_to_table.rb +8 -2
data/bin/protein_prophet.rb +8 -2
data/bin/protxml_to_table.rb +82 -0
data/bin/repair_run_summary.rb +7 -1
data/bin/sixframe.rb +48 -2
data/bin/tandem_search.rb +11 -2
data/bin/toppas_pipeline.rb +8 -2
data/bin/uniprot_annotation.rb +8 -2
data/bin/uniprot_mapper.rb +8 -2
data/bin/xls_to_table.rb +8 -2
data/lib/protk/constants.rb +2 -0
data/lib/protk/data/pepxml_mascot_template.xml +29 -0
data/lib/protk/mascot_util.rb +5 -0
data/lib/protk/prophet_tool.rb +1 -3
data/lib/protk/search_tool.rb +75 -86
data/lib/protk/setup_rakefile.rake +12 -5
data/lib/protk/tool.rb +26 -12
metadata +23 -9
data/bin/big_search.rb +0 -41
data/bin/template_search.rb +0 -144
data/lib/convert_util.rb +0 -27
data/lib/pepxml.rb +0 -22
data/lib/protk/big_search_rakefile.rake +0 -16
data/lib/protk/big_search_tool.rb +0 -23

data/bin/mascot_to_pepxml.rb CHANGED

@@ -15,10 +15,36 @@ require 'protk/mascot_util'
 #
 genv=Constants.new
-tool=SearchTool.new({:database=>true,:explicit_output=>true,:over_write=>true})
+tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme])
 tool.option_parser.banner = "Convert mascot dat files to pep.xml files.\n\nUsage: mascot_to_pepxml.rb [options] file1.dat file2.dat ... "
+tool.options.enzyme="trypsin"
+tool.options.shortid=false
+tool.option_parser.on( '--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ) do
+    tool.options.shortid=true
+end
 tool.option_parser.parse!
+exit unless tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts tool.option_parser
+    exit
+end
+current_db=""
+case
+when Pathname.new(tool.database).exist? # It's an explicitly named db
+  current_db=Pathname.new(tool.database).realpath.to_s
+else
+  current_db=tool.current_database :fasta
+end
 ARGV.each do |file_name|
   name=file_name.chomp
@@ -28,12 +54,15 @@ ARGV.each do |file_name|
   if ( tool.explicit_output==nil )
     new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
     cmd="cp #{name} #{new_basename}.dat"
-    cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{tool.current_database :fasta}"
+    cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{current_db} -E#{tool.enzyme}"
+    cmd << " -shortid" if tool.shortid
   else  #Mascot2XML doesn't support explicitly named output files so we move the file to an appropriate output filename after finishing
     new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
     cmd="cp #{name} #{new_basename}.dat"
-    cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{tool.current_database :fasta}"
+    cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{current_db} -E#{tool.enzyme}"
+    cmd << " -shortid" if tool.shortid
     cmd << "; mv #{new_basename}.pep.xml #{tool.explicit_output}; rm #{new_basename}.dat"
     repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
     cmd << "; #{repair_script} #{tool.explicit_output}"

data/bin/msgfplus_search.rb CHANGED

@@ -17,17 +17,32 @@ input_stager = nil
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
-search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>false,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
+search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
+  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
 search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
 search_tool.options.output_suffix="_msgfplus"
+search_tool.options.enzyme=1
+search_tool.options.instrument=0
+search_tool.options.no_pepxml=false
+search_tool.option_parser.on(  '--no-pepxml', 'Dont convert results to pepxml. Keep native mzidentml format' ) do
+  search_tool.options.no_pepxml=true
+end
+search_tool.options.isotope_error_range="0,1"
+search_tool.option_parser.on(  '--isotope-error-range range', 'Takes into account of the error introduced by chooosing a non-monoisotopic peak for fragmentation.(Default 0,1)' ) do |range|
+  search_tool.options.isotope_error_range=range
+end
 search_tool.options.fragment_method=0
 search_tool.option_parser.on(  '--fragment-method method', 'Fragment method 0: As written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor' ) do |method|
   search_tool.options.fragment_method=method
 end
 search_tool.options.protocol=0
-search_tool.option_parser.on(  '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation' ) do |p|
+search_tool.option_parser.on(  '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho' ) do |p|
   search_tool.options.protocol=p
 end
@@ -61,12 +76,23 @@ search_tool.option_parser.on(  '--add-features', 'output additional features' )
   search_tool.options.add_features=true
 end
+search_tool.options.num_threads=nil
+search_tool.option_parser.on('--threads NumThreads','Number of processing threads to use') do |nt|
+  search_tool.options.num_threads=nt
+end
 search_tool.options.java_mem="3500M"
 search_tool.option_parser.on('--java-mem mem','Java memory limit when running the search (Default 3.5Gb)') do |mem|
   search_tool.options.java_mem=mem
 end
-search_tool.option_parser.parse!
+exit unless search_tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts search_tool.option_parser
+    exit
+end
 # Environment with global constants
 #
@@ -149,17 +175,33 @@ ARGV.each do |filename|
     # Instrument type
     cmd << " -inst #{search_tool.instrument}"
-#    cmd << " -m 4"
+    cmd << " -m #{search_tool.fragment_method}"
     cmd << " -addFeatures 1"
+    cmd << " -protocol #{search_tool.protocol}"
+    cmd << " -minLength #{search_tool.min_pep_length}"
+    cmd << " -maxLength #{search_tool.max_pep_length}"
+    cmd << " -minCharge #{search_tool.min_pep_charge}"
+    cmd << " -maxCharge #{search_tool.max_pep_charge}"
+    cmd << " -ti #{search_tool.isotope_error_range}"
+    cmd << " -n #{search_tool.num_reported_matches}"
     # Enzyme
     #
-  #    if ( search_tool.enzyme!="Trypsin")
-  #      cmd << " -e #{search_tool.enzyme}"
-  #    end
+    cmd << " -e #{search_tool.enzyme}"
+    # Num Threads
+    #
+    cmd << " -thread #{search_tool.num_threads}" if search_tool.num_threads
-  mods_file_content = ""
+    mods_file_content = ""
     # Variable Modifications
     #
@@ -188,10 +230,14 @@ ARGV.each do |filename|
     end
     # As a final part of the command we convert to pepxml
-    cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
-    #Then copy the pepxml to the final output path
-    cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
+    if search_tool.no_pepxml
+      cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
+      #Then copy the pepxml to the final output path
+      cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
+    elsif search_tool.explicit_output
+      cmd << "; cp #{mzid_output_path} #{output_path}"
+    end
     # Up to here we've formulated the command. The rest is cleanup
     p "Running:#{cmd}"

data/bin/omssa_search.rb CHANGED

@@ -16,7 +16,12 @@ for_galaxy = GalaxyUtil.for_galaxy?
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
-search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
+search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
+  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages,
+  :precursor_search_type,:respect_precursor_charges,:num_peaks_for_multi_isotope_search,:searched_ions
+  ])
 search_tool.option_parser.banner = "Run an OMSSA msms search on a set of mgf input files.\n\nUsage: omssa_search.rb [options] file1.mgf file2.mgf ..."
 search_tool.options.output_suffix="_omssa"
@@ -54,7 +59,13 @@ search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use
   search_tool.options.nthreads=num
 end
-search_tool.option_parser.parse!
+exit unless search_tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts search_tool.option_parser
+    exit
+end
 # Environment with global constants
 #

data/bin/peptide_prophet.rb CHANGED

@@ -13,7 +13,7 @@ require 'protk/prophet_tool'
 # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
 #
-prophet_tool=ProphetTool.new({:glyco=>true,:explicit_output=>true,:maldi=>true})
+prophet_tool=ProphetTool.new([:glyco,:explicit_output,:maldi])
 prophet_tool.option_parser.banner = "Run PeptideProphet on a set of pep.xml input files.\n\nUsage: peptide_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
 prophet_tool.options.output_suffix="_pproph"
@@ -92,7 +92,13 @@ prophet_tool.option_parser.on( '--override-database database', 'Manually specify
   prophet_tool.options.override_database = database
 end
-prophet_tool.option_parser.parse!
+exit unless prophet_tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts prophet_tool.option_parser
+    exit
+end
 throw "When --output and -F options are set only one file at a time can be run" if  ( ARGV.length> 1 ) && ( prophet_tool.explicit_output!=nil ) && (prophet_tool.one_ata_time!=nil)

data/bin/pepxml_to_table.rb CHANGED

@@ -16,10 +16,16 @@ include LibXML
 # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
 #
-tool=Tool.new({:explicit_output=>true})
+tool=Tool.new([:explicit_output])
 tool.option_parser.banner = "Convert a pepXML file to a tab delimited table.\n\nUsage: pepxml_to_table.rb [options] file1.pep.xml"
-tool.option_parser.parse!
+exit unless tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts tool.option_parser
+    exit
+end
 # Obtain a global environment object
 #genv=Constants.new

data/bin/protein_prophet.rb CHANGED

@@ -26,7 +26,7 @@ end
 # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
 #
-prophet_tool=ProphetTool.new({:glyco=>true,:explicit_output=>true})
+prophet_tool=ProphetTool.new([:glyco,:explicit_output])
 prophet_tool.option_parser.banner = "Run ProteinProphet on a set of pep.xml input files.\n\nUsage: protein_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
 prophet_tool.options.output_suffix="_protproph"
@@ -90,7 +90,13 @@ prophet_tool.option_parser.on( '--minindep mp',"Minimum percentage of independen
   prophet_tool.options.minindep = mp
 end
-prophet_tool.option_parser.parse!
+exit unless prophet_tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts prophet_tool.option_parser
+    exit
+end
 # Obtain a global environment object

data/bin/protxml_to_table.rb ADDED

@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 18/1/2011
+#
+# Convert a pepXML file to a tab delimited table
+#
+#
+require 'libxml'
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+include LibXML
+# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
+#
+tool=Tool.new([:explicit_output])
+tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
+exit unless tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts tool.option_parser
+    exit
+end
+input_file=ARGV[0]
+output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
+output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
+XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
+protxml_parser=XML::Parser.file("#{input_file}")
+protxml_ns_prefix="xmlns:"
+protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
+protxml_doc=protxml_parser.parse
+if not protxml_doc.root.namespaces.default
+  protxml_ns_prefix=""
+  protxml_ns=nil
+end
+column_headers=[
+	"group_number","group_probability","protein_name",
+	"protein_probability","coverage","peptides",
+	"num_peptides","confidence"
+]
+output_fh.write "#{column_headers.join("\t")}\n"
+protein_groups=protxml_doc.find("//#{protxml_ns_prefix}protein_group", protxml_ns)
+protein_groups.each do |protein_group|
+	proteins=protein_group.find("./#{protxml_ns_prefix}protein", protxml_ns)
+	proteins.each do |protein|
+		column_values=[]
+		column_values << protein_group.attributes['group_number']
+		column_values << protein_group.attributes['probability']
+		column_values << protein.attributes['protein_name']
+		column_values << protein.attributes['probability']
+		column_values << protein.attributes['percent_coverage']
+		column_values << protein.attributes['unique_stripped_peptides']
+		column_values << protein.attributes['total_number_peptides']
+		column_values << protein.attributes['confidence']
+		output_fh.write(column_values.join("\t"))
+		output_fh.write("\n")
+	end
+end

data/bin/repair_run_summary.rb CHANGED

@@ -40,7 +40,13 @@ tool.option_parser.on('--omssa-itol fitol','Add a fragment ion tolerance paramet
   tool.options.omssa_ion_tolerance=fitol
 end
-tool.option_parser.parse!
+exit unless tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts tool.option_parser
+    exit
+end
 pepxml_file=ARGV[0]

data/bin/sixframe.rb CHANGED

@@ -10,10 +10,34 @@ require 'protk/constants'
 require 'protk/tool'
 require 'bio'
-tool=Tool.new(:explicit_output=>true)
+def check_coords(naseq,aaseq,frame,pstart,pend)
+  orf_from_coords=""
+  if ( frame<=3)
+    orf_from_coords=naseq[pstart-1..pend-1].translate(1)
+  else
+    orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
+    # current coords give
+    # naseq.reverse_complement[pstart-1..pend-1].translate(1)
+    # naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
+#    orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
+  end
+  if ( orf_from_coords!=aaseq)
+    require 'debugger'; debugger
+  end
+#  p "#{aaseq} #{frame}"
+end
+tool=Tool.new([:explicit_output])
 tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
-tool.option_parser.parse!
+exit unless tool.check_options
+if ( ARGV[0].nil? )
+    puts "You must supply an input file"
+    puts tool.option_parser
+    exit
+end
 inname=ARGV.shift
@@ -26,7 +50,11 @@ end
 file = Bio::FastaFormat.open(inname)
 file.each do |entry|
+  puts entry.entry_id
   length = entry.naseq.length
   (1...7).each do |frame|
     translated_seq= entry.naseq.translate(frame)
     orfs=translated_seq.split("*")
@@ -37,15 +65,30 @@ file.each do |entry|
     orfs.each do |orf|
       oi+=1
       if ( orf.length > 20 )
         position_start = position
         position_end = position_start + orf.length*3 -1
+        if ( frame>3) #On reverse strand. Coordinates need translating to forward strand
+          forward_position_start=length-position_end+1
+          forward_position_end = length-position_start+1
+          position_start=forward_position_start
+          position_end=forward_position_end
+        end
         # Create accession compliant with NCBI naming standard
         # See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
         ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
         ncbi_accession = "lcl|#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
+#        check_coords(entry.naseq,orf,frame,position_start,position_end)
         # Output in fasta format
+        # start and end positions are always relative to the forward strand
         outfile.write(">#{ncbi_accession} #{position_start}|#{position_end}\n#{orf}\n")
       end
@@ -54,3 +97,6 @@ file.each do |entry|
   end
 end