RubyGems - ms-sequest - Versions diffs - 0.0.17 → 0.0.18 - Mend

ms-sequest 0.0.17 → 0.0.18

Files changed (24) hide show

data/.autotest +26 -10
data/Gemfile +4 -1
data/Gemfile.lock +17 -2
data/VERSION +1 -1
data/bin/srf_to_pepxml.rb +7 -0
data/bin/srf_to_search.rb +1 -1
data/lib/ms/sequest/bioworks.rb +2 -2
data/lib/ms/sequest/params.rb +0 -20
data/lib/ms/sequest/pepxml.rb +7 -245
data/lib/ms/sequest/pepxml/modifications.rb +247 -0
data/lib/ms/sequest/pepxml/params.rb +32 -0
data/lib/ms/sequest/sqt.rb +17 -17
data/lib/ms/sequest/srf.rb +64 -54
data/lib/ms/sequest/srf/pepxml.rb +316 -0
data/lib/ms/sequest/srf/pepxml/sequest.rb +21 -0
data/lib/ms/sequest/srf/sqt.rb +1 -1
data/spec/ms/sequest/bioworks_spec.rb +11 -11
data/spec/ms/sequest/pepxml/modifications_spec.rb +50 -0
data/spec/ms/sequest/pepxml_spec.rb +0 -65
data/spec/ms/sequest/srf/pepxml_spec.rb +84 -0
data/spec/ms/sequest/srf_spec.rb +3 -3
data/spec/ms/sequest/srf_spec_helper.rb +2 -2
data/spec/spec_helper.rb +17 -18
metadata +73 -19

data/lib/ms/sequest/srf/pepxml.rb ADDED Viewed

@@ -0,0 +1,316 @@
+require 'ms/ident/pepxml'
+require 'ms/ident/pepxml/spectrum_query'
+require 'ms/ident/pepxml/search_result'
+require 'ms/ident/pepxml/search_hit'
+require 'ms/msrun'
+require 'ms/sequest/srf'
+require 'ms/sequest/pepxml'
+class Ms::Sequest::Srf
+  module Pepxml
+    #  A hash with the following *symbol* keys may be set:
+    #
+    # Run Info
+    # *:ms_model*:: nil
+    # *:ms_ionization*:: 'ESI'
+    # *:ms_detector*:: 'UNKNOWN'
+    # *:ms_mass_analyzer*:: nil - <i>typically extracted from the srf file and matched with <b>ModelToMsAnalyzer</b></i>
+    # *:ms_manufacturer*:: 'Thermo'
+    #
+    # Raw data
+    # *:mz_dir*:: nil - <i>path to the mz[X]ML directory, defaults to the directory the srf file is contained in.  mz[X]ML data must be available to embed retention times</i>
+    # *:raw_data*:: \['.mzML', '.mzXML'\] - <i>preferred extension for raw data</i>
+    #
+    # Database
+    # *:db_seq_type*:: 'AA' - <i>AA or NA</i>
+    # *:db_dir*:: nil - <i>the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!</i>
+    # *:db_residue_size*:: nil - <i>An integer for the number of residues in the database.  if true, calculates the size of the fasta database.</i>
+    # *:db_name:: nil
+    # *:db_orig_database_url*:: nil
+    # *:db_release_date*:: nil
+    # *:db_release_identifier*:: nil
+    #
+    # Search Hits
+    # *:num_hits*:: 1 - <i>the top number of hits to include</i>
+    # *:retention_times*:: false - <i>include retention times in the file (requires mz_dir to be set)</i>
+    # *:deltacn_orig*:: false - <i>when true, the original SEQUEST deltacn values are used.  If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit.  This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.</i>
+    #
+    # *:pepxml_version*:: Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION, - <i>Integer to set the pepxml version.  The converter and xml output attempts to produce xml specific to the version.</i>
+    # *:verbose*:: true - <i>set to false to quiet warnings</i>
+    DEFAULT_OPTIONS = {
+      :ms_model => nil,
+      :ms_ionization => 'ESI',
+      :ms_detector => 'UNKNOWN',
+      :ms_mass_analyzer => nil,
+      :ms_manufacturer => 'Thermo',
+      :mz_dir => nil,
+      #:raw_data => [".mzXML", '.mzML'],
+      :raw_data => ['.mzML', '.mzXML'],
+      :db_seq_type => 'AA',
+      :db_dir => nil,
+      :db_residue_size => nil,
+      :db_name => nil,
+      :db_orig_database_url => nil,
+      :db_release_date => nil,
+      :db_release_identifier => nil,
+      :num_hits => 1,
+      :retention_times => false,
+      :deltacn_orig => false,
+      :pepxml_version => Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
+      :verbose => true,
+    }
+    # An array of regexp to string pairs.  The regexps are matched against the
+    # model (srf.header.model) and the corresponding string will be used as
+    # the mass analyzer.
+    #
+    # /Orbitrap/:: 'Orbitrap'
+    # /LCQ Deca XP/:: 'Ion Trap'
+    # /LTQ/:: 'Ion Trap'
+    # /\w+/:: 'UNKNOWN'
+    ModelToMsAnalyzer = [
+      [/Orbitrap/, 'Orbitrap'],
+      [/LCQ Deca XP/, 'Ion Trap'],
+      [/LTQ/, 'Ion Trap'],
+      [/\w+/, 'UNKNOWN'],
+    ]
+    # returns an Ms::Ident::Pepxml object.  See that object for creating an
+    # xml string or writing to file.
+    def to_pepxml(opts={})
+      opt = DEFAULT_OPTIONS.merge(opts)
+      srf = self
+      # with newer pepxml version these are not required anymore
+      hidden_opts = {
+        # format of file storing the runner up peptides (if not present in
+        # pepXML) this was made optional after version 19
+        :out_data_type => "out", ## may be srf??
+        # runner up search hit data type extension (e.g. .tgz)
+        :out_data => ".srf",
+      }
+      opt.merge!(hidden_opts)
+      params = srf.params
+      header = srf.header
+      opt[:ms_model] ||= srf.header.model
+      unless opt[:ms_mass_analyzer]
+        ModelToMsAnalyzer.each do |regexp, val|
+          if opt[:ms_model].match(regexp)
+            opt[:ms_mass_analyzer] = val
+            break
+          end
+        end
+      end
+      # get the database name
+      db_filename = header.db_filename.sub(/\.hdr$/, '')
+      if opt[:db_dir]
+        db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
+      end
+      if File.exist?(db_filename)
+        db_filename = File.expand_path(db_filename)
+      else
+        msg = ["!!! WARNING !!!"]
+        msg << "!!! Can't find database: #{db_filename}"
+        msg << "!!! pepxml *requires* that the db path be valid"
+        msg << "!!! make sure 1) the fasta file is available on this system"
+        msg << "!!!           2) you've specified a valid directory with --db-dir (or :db_dir)"
+        puts msg.join("\n") if opt[:verbose]
+      end
+      modifications_obj = Ms::Sequest::Pepxml::Modifications.new(params, srf.header.modifications)
+      mass_index = params.mass_index(:precursor)
+      h_plus = mass_index['h+']
+      opt[:mz_dir] ||= srf.resident_dir
+      found_ext = opt[:raw_data].find do |raw_data|
+        Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
+      end
+      opt[:raw_data] = [found_ext] if found_ext
+      scan_to_ret_time =
+        if opt[:retention_times]
+          mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
+          if mz_file
+            Ms::Msrun.scans_to_times(mz_file)
+          else
+            warn "turning retention_times off since no valid mz[X]ML file was found!!!"
+            opt[:retention_times] = false
+            nil
+          end
+        end
+      summary_xml_filename = srf.base_name_noext + '.xml'
+      pepxml = Ms::Ident::Pepxml.new do |msms_pipeline_analysis|
+        msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
+          # prep the sample enzyme and search_summary
+          msms_run_summary.merge!(
+            :base_name => File.join(opt[:mz_dir], srf.base_name_noext),
+            :ms_manufacturer => opt[:ms_manufacturer],
+            :ms_model => opt[:ms_model],
+            :ms_ionization => opt[:ms_ionization],
+            :ms_mass_analyzer => opt[:ms_mass_analyzer],
+            :ms_detector => opt[:ms_detector],
+            :raw_data => opt[:raw_data].first,
+            :raw_data_type => opt[:raw_data].first,
+          ) do |sample_enzyme, search_summary, spectrum_queries|
+            sample_enzyme.merge!(params.sample_enzyme_hash)
+            search_summary.merge!(
+              :base_name=> srf.resident_dir + '/' + srf.base_name_noext,
+              :search_engine => 'SEQUEST',
+              :precursor_mass_type => params.precursor_mass_type,
+              :fragment_mass_type => params.fragment_mass_type,
+              :out_data_type => opt[:out_data_type],
+              :out_data => opt[:out_data],
+            ) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
+              search_database.merge!(:local_path => db_filename, :seq_type => opt[:db_seq_type], :database_name => opt[:db_name], :orig_database_url => opt[:db_orig_database_url], :database_release_date => opt[:db_release_date], :database_release_identifier => opt[:db_release_identifier])
+              case opt[:db_residue_size]
+              when Integer
+                search_database.size_of_residues = opt[:db_residue_size]
+              when true
+                search_database.set_size_of_residues!
+              end
+              enzymatic_search_constraint.merge!(
+                :enzyme => params.enzyme,
+                :max_num_internal_cleavages => params.max_num_internal_cleavages,
+                :min_number_termini => params.min_number_termini,
+              )
+              modifications_ar.replace(modifications_obj.modifications)
+              parameters_hash.merge!(params.opts)
+            end
+            spec_queries = srf.dta_files.zip(srf.out_files, index).map do |dta_file,out_file,i_ar|
+              precursor_neutral_mass = dta_file.mh - h_plus
+              search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
+                (prev_aa, pure_aaseq, next_aa) = Ms::Ident::Peptide.prepare_sequence(pep.sequence)
+                calc_neutral_pep_mass = pep.mh - h_plus
+                sh = Ms::Ident::Pepxml::SearchHit.new(
+                  :hit_rank => i+1,
+                  :peptide => pure_aaseq,
+                  :peptide_prev_aa => prev_aa,
+                  :peptide_next_aa => next_aa,
+                  :protein => pep.proteins.first.reference.split(' ')[0],
+                  :num_tot_proteins => pep.proteins.size,
+                  :num_matched_ions => pep.ions_matched,
+                  :tot_num_ions => pep.ions_total,
+                  :calc_neutral_pep_mass => calc_neutral_pep_mass,
+                  :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
+                  :num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
+                  :num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
+                  :modification_info => modifications_obj.modification_info(Ms::Ident::Peptide.split_sequence(pep.sequence)[1])
+                ) do |search_scores|
+                  if opt[:deltacn_orig]
+                    deltacn = pep.deltacn_orig
+                    deltacnstar = nil
+                  else
+                    deltacn = pep.deltacn
+                    deltacn = 1.0 if deltacn == 1.1
+                    deltcnstar = out_file.hits[i+1].nil? ? '1' : '0'
+                  end
+                  search_scores.merge!( :xcorr => pep.xcorr, :deltcn => deltacn,
+                                       :spscore => pep.sp, :sprank => pep.rsp)
+                  search_scores[:deltacnstar] = deltacnstar if deltacnstar
+                end
+              end
+              sr = Ms::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)
+              ret_time =
+                if opt[:retention_times]
+                  (first_scan, last_scan) = i_ar[0,2]
+                  if first_scan==last_scan
+                    scan_to_ret_time[i_ar[0]]
+                  else
+                    times = ((i_ar[0])..(i_ar[1])).step(1).map {|i| scan_to_ret_time[i] }.compact
+                    times.inject(&:+) / times.size.to_f
+                  end
+                end
+              Ms::Ident::Pepxml::SpectrumQuery.new(
+                :spectrum  => [srf.base_name_noext, *i_ar].join('.'), :start_scan => i_ar[0], :end_scan => i_ar[1],
+                :precursor_neutral_mass => dta_file.mh - h_plus, :assumed_charge => i_ar[2],
+                :retention_time_sec => ret_time,
+                :search_results => [sr],
+              )
+            end
+            spectrum_queries.replace(spec_queries)
+          end
+        end
+      end
+      pepxml
+    end # to_pepxml
+  end # Srf::Pepxml
+  include Pepxml
+end # Srf
+require 'trollop'
+module Ms::Sequest::Srf::Pepxml
+  def self.commandline(argv, progname=$0)
+    opts = Trollop::Parser.new do
+      banner %Q{
+        usage: #{progname} [OPTIONS] <file>.srf ...
+        output: <file>.xml ...
+      }.lines.map(&:lstrip).join
+      text ""
+      text "major options:"
+      opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
+      opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
+      opt :retention_times, "include retention times (requires mz-dir)"
+      opt :deltacn_orig, "use original deltacn values created by SEQUEST.  By default, the top hit gets the next hit's original deltacn."
+      opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
+      opt :num_hits, "include N top hits", :default => 1
+      opt :outdirs, "list of output directories", :type => :strings
+      opt :quiet, "do not print warnings, etc."
+      text ""
+      text "minor options:"
+      opt :ms_model, 'mass spectrometer model', :type => :string
+      opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
+      opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
+      opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
+      opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
+      opt :raw_data, 'preferred extension for raw data', :default => '.mzXML'
+      opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
+      opt :db_residue_size, 'calculate the size of the fasta file'
+      opt :db_name, 'the database name', :type => :string
+      opt :db_orig_database_url, 'original database url', :type => :string
+      opt :db_release_date, 'database release date', :type => :string
+      opt :db_release_identifier, 'the database release identifier', :type => :string
+    end
+    opt = opts.parse argv
+    opts.educate && exit if argv.empty?
+    Trollop.die :outdirs, "outdirs must be same size as number of input files" if opt.outdirs && opt.outdirs.size != argv.size
+    opt[:filter] = !opt.delete(:no_filter)
+    opt[:outdirs] ||= []
+    opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]
+    opt[:verbose] = !opt[:quiet]
+    argv.zip(opt.delete(:outdirs)) do |srf_file,outdir|
+      outdir ||= File.dirname(srf_file)
+      srf = Ms::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt.delete(:filter))
+      pepxml = srf.to_pepxml(opt)
+      outfile = pepxml.to_xml(outdir)
+      puts "wrote file: #{outfile}" if opt[:verbose]
+    end
+  end
+end

data/lib/ms/sequest/srf/pepxml/sequest.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::Pepxml
+  class SearchHit
+    Sequest = Struct.new(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank) do
+      # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
+      def self.split_ions(ions)
+        ions.split("/").map {|ion| ion.to_i }
+      end
+      def to_xml(builder)
+        members.zip(self.to_a) do |sym, val|
+          builder.search_score(:name => sym, :value => val)
+        end
+      end
+    end
+  end
+end

data/lib/ms/sequest/srf/sqt.rb CHANGED Viewed

@@ -159,7 +159,7 @@ module Ms
                 end
                 # note that the rank is determined by the order..
                 out.puts ['M', index+1, hit.rsp, hit_mh, hit_deltacn_orig_updated, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t")
-                hit.prots.each do |prot|
+                hit.proteins.each do |prot|
                   out.puts ['L', prot.first_entry].join("\t")
                 end
               end

data/spec/ms/sequest/bioworks_spec.rb CHANGED Viewed

@@ -13,10 +13,10 @@ describe Bioworks, 'set from an xml file' do
   it 'can set one with labeled proteins' do
     file = Tfiles + "/bioworks_with_INV_small.xml"
     obj = Bioworks.new(file)
-    obj.prots.size.should == 19
+    obj.proteins.size.should == 19
     file = Tfiles + '/bioworks_small.xml'
     obj = Bioworks.new(file)
-    obj.prots.size.should == 106
+    obj.proteins.size.should == 106
   end
   it 'can parse an xml file NOT derived from multi-concensus' do
@@ -28,10 +28,10 @@ describe Bioworks, 'set from an xml file' do
     obj.global_filename.should == gfn
     obj.origfilename.should == origfilename
     obj.origfilepath.should == origfilepath
-    obj.prots.size.should == 7
-    obj.prots.first.peps.first.base_name.should ==  gfn
-    obj.prots.first.peps.first.file.should ==  "152"
-    obj.prots.first.peps.first.charge.should == 2
+    obj.proteins.size.should == 7
+    obj.proteins.first.peptides.first.base_name.should ==  gfn
+    obj.proteins.first.peptides.first.file.should ==  "152"
+    obj.proteins.first.peptides.first.charge.should == 2
     # @TODO: add more tests here
   end
@@ -57,7 +57,7 @@ describe Bioworks, 'set from an xml file' do
   def _assert_equal_pieces(exp, act, prot)
     # equal as floats (by delta)
     exp.each_index do |i|
-      if i == 5  # both prots and peps
+      if i == 5  # both proteins and peptides
         act[i].to_f.should be_close(exp[i].to_f, 0.1)
       elsif i == 3 && !prot
         act[i].to_f.should be_close(exp[i].to_f, 0.01)
@@ -99,7 +99,7 @@ describe Bioworks, 'set from an xml file' do
     end
     exp_peps = exp_peps.zip(exp_prots)
     exp_peps.collect! do |both|
-      both[0].prots = [both[1]]
+      both[0].proteins = [both[1]]
       both[0]
     end
@@ -107,8 +107,8 @@ describe Bioworks, 'set from an xml file' do
       pep = Bioworks::Pep.new
       pep.charge = arr[0]
       pep.sequence = arr[1]
-      pep.prots = [Bioworks::Prot.new]
-      pep.prots.first.reference = "#{cnt}"
+      pep.proteins = [Bioworks::Prot.new]
+      pep.proteins.first.reference = "#{cnt}"
       cnt += 1
       pep
     end
@@ -130,7 +130,7 @@ end
 describe Bioworks::Pep do
   it 'can be initialized from a hash' do
-    hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
+    hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :proteins => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_proteins => 17, :_first_prot => 18}
     pep = Bioworks::Pep.new(hash)
     hash.each do |k,v|
       pep.send(k).should == v

data/spec/ms/sequest/pepxml/modifications_spec.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'spec_helper'
+require 'ms/sequest/params'
+require 'ms/sequest/pepxml/modifications'
+describe 'Ms::Sequest::Pepxml::Modifications' do
+  before do
+    tf_params = TESTFILES + "/bioworks32.params"
+    @params = Ms::Sequest::Params.new(tf_params)
+    # The params object here is completely unnecessary for this test, except
+    # that it sets up the mass table
+    @obj = Ms::Sequest::Pepxml::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
+  end
+  it 'creates a mod_symbols_hash' do
+    answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
+    @obj.mod_symbols_hash.should == answ
+    ## need more here
+  end
+  it 'creates a ModificationInfo object given a special peptide sequence' do
+    mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
+    @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
+    @params.term_diff_search_options = "14.20000 12.33000"
+    mod = Ms::Sequest::Pepxml::Modifications.new(@params, mod_string)
+    ## no mods
+    peptide_nomod = "PEPTIDE"
+    ok mod.modification_info(peptide_nomod).nil?
+    peptide_mod = "]M*EC^S@IDM#M*EMSCM["
+    modinfo = mod.modification_info(peptide_mod)
+    xml_string = modinfo.to_xml
+    xml_string.matches /<mod_aminoacid_mass /
+    xml_string.matches /mod_nterm_mass=/
+    xml_string.matches /mod_cterm_mass=/
+    xml_string.matches /modified_peptide=/
+    modinfo.mod_aminoacid_masses.size.is 5
+    mod_aa_masses = modinfo.mod_aminoacid_masses
+    # positions are verified, masses are just frozen
+    [1,3,4,7,8].zip([147.09606, 115.1429, 167.0772999, 160.19606, 147.09606], mod_aa_masses) do |pos, mass, obj|
+      obj.position.is pos
+      obj.mass.should.be.close mass, 0.0001
+    end
+    # These values are just frozen and not independently verified yet
+    modinfo.mod_nterm_mass.should.be.close 146.4033, 0.0001
+    modinfo.mod_cterm_mass.should.be.close 160.5334, 0.0001
+  end
+end