RubyGems - ms-ident - Versions diffs - 0.0.2 - Mend

ms-ident 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

data/.document +5 -0
data/Gemfile +31 -0
data/Gemfile.lock +32 -0
data/LICENSE +61 -0
data/README.rdoc +97 -0
data/Rakefile +54 -0
data/VERSION +1 -0
data/lib/merge.rb +7 -0
data/lib/ms/ident/pepxml/modifications/sequest.rb +237 -0
data/lib/ms/ident/pepxml/modifications.rb +94 -0
data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
data/lib/ms/ident/pepxml/msms_run_summary.rb +81 -0
data/lib/ms/ident/pepxml/parameters.rb +14 -0
data/lib/ms/ident/pepxml/pep_summary.rb +104 -0
data/lib/ms/ident/pepxml/prot_summary.rb +484 -0
data/lib/ms/ident/pepxml/sample_enzyme.rb +166 -0
data/lib/ms/ident/pepxml/search_database.rb +42 -0
data/lib/ms/ident/pepxml/search_hit/modification_info.rb +82 -0
data/lib/ms/ident/pepxml/search_hit.rb +141 -0
data/lib/ms/ident/pepxml/search_result.rb +28 -0
data/lib/ms/ident/pepxml/search_summary.rb +88 -0
data/lib/ms/ident/pepxml/spectrum_query.rb +83 -0
data/lib/ms/ident/pepxml.rb +61 -0
data/lib/ms/ident.rb +11 -0
data/schema/pepXML_v115.xsd +1458 -0
data/schema/pepXML_v19.xsd +1337 -0
data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
data/spec/ms/ident/pepxml_spec.rb +436 -0
data/spec/spec_helper.rb +40 -0
metadata +194 -0

data/lib/ms/ident/pepxml/prot_summary.rb ADDED Viewed

@@ -0,0 +1,484 @@
+require 'rexml/document'
+require 'hash_by'
+require 'instance_var_set_from_hash'
+require 'axml'
+require 'spec_id'
+require 'arrayclass'
+require 'spec_id/parser/proph'
+module SpecID ; end
+module SpecID::Prot ; end
+module SpecID::Pep ; end
+module Proph
+  class ProtSummary
+    include SpecID
+    # if you get this match it's a protein prophet file and the version is the
+    # first match!
+    Filetype_and_version_re_old = /ProteinProphet_v([\.\d]+)\.dtd/  # gives 1.9 or what else?
+    Filetype_and_version_re_new = /protXML_v([\.\d]+)\.xsd/        # gives 4 right now
+    # inherits prots and peps
+    # the protein groups
+    attr_accessor :prot_groups
+    attr_accessor :version
+    def hi_prob_best ; true end
+    def get_version(file)
+      answer = nil
+      File.open(file) do |fh|
+        5.times do
+          line = fh.gets
+          answer =
+            if line =~ Filetype_and_version_re_new
+              $1.dup
+            elsif line =~ Filetype_and_version_re_old
+              $1.dup
+            end
+          break if answer
+        end
+      end
+      raise(ArgumentError, "couldn't detect version in #{file}") unless answer
+      answer
+    end
+    def initialize(file=nil)
+      @prots = nil
+      if file
+        @version = get_version(file)
+        #@prot_groups = ProtSummary::Parser.new.parse_file(file)
+        SpecID::Parser::ProtProph.new(:spec_id).parse(file, :spec_id => self)
+      end
+    end
+    # returns a set of unique proteins
+    def unique_prots(prot_groups)
+      all_prots = []
+      prot_groups.each do |pg|
+        pg.prots.each do |prt|
+          all_prots << prt
+        end
+      end
+      all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first }
+    end
+  end
+  class ProtSummary::Parser
+    attr_accessor :prot_groups
+    def initialize(file=nil, with_peps=false, tp='axml')
+      if file
+        @prot_groups = parse_file(file, with_peps, tp)
+      end
+    end
+    # returns an array of protein_groups
+    def parse_file(file, with_peps=false, tp='axml')
+      File.open(file) do |fh|
+        @prot_groups = _parse_for_prot_groups(fh, with_peps, tp)
+      end
+      @prot_groups
+    end
+    # returns an array of ProtGroup objects
+    def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
+      prtgrps = []
+      case tp
+      when 'axml'
+        root = AXML.parse(stream)
+        root.protein_group.each do |protein_group|
+          pg = ProtGroup.new(protein_group.attrs) do
+            protein_group.map do |protein|
+              Prot.new(protein.attrs)
+            end
+          end
+          prtgrps << pg
+        end
+      end
+      prtgrps
+    end
+  end   # ProtSummary::Parser
+  class ProtGroup
+    attr_accessor :group_number, :probability, :prots
+    def initialize(args=nil)
+      @prots = []
+      if args
+        instance_var_set_from_hash(args)
+      end
+      if block_given?
+        @prots = yield
+      end
+    end
+  end
+end  # Proph
+Proph::Prot = Arrayclass.new(%w(protein_name probability n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description peps))
+# note that 'description' is found in the element 'annotation', attribute 'protein_description'
+# NOTE!: unique_stripped peptides is an array rather than + joined string
+class Proph::Prot
+  include SpecID::Prot
+  # returns protein_name
+  def name ; self[0] end
+  def reference ; self[0] end
+  def first_entry ; self[0] end  # the name is also the first_entry
+end
+#def to_s
+#  '<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
+#end
+# this is a pep from a -prot.xml file
+Proph::Prot::Pep = Arrayclass.new(%w(peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots))
+class Proph::Prot::Pep
+  include SpecID::Pep
+  alias_method :mod_info, :modification_info
+  alias_method :mod_info=, :modification_info=
+  def aaseq ; self[0] end
+  def probability ; self[3] end
+end # class Pep
+=begin
+  #attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
+  #attr_writer :arithmetic_avg_scan_by_parent_time
+  #def initialize(args=nil)
+  #  if args
+  #    @sequence = args[:sequence]
+  #    @probability = args[:probability]  ## nsp prob
+  #    @filenames = args[:filenames]
+  #    @charge = args[:charge]
+  #    @nsp_cutoff = args[:nsp_cutoff]
+  #    if args.key?(:scans)
+  #      @scans = args[:scans]
+  #    else
+  #      @scans = []  ## this is set later if needed
+  #    end
+  #  else
+  #    @scans = []
+  #  end
+  #end
+  # filter peptides based on the number of scans
+  # if a peptide has more than max_dups scans, the peptide is tossed
+  # note that multiple scans that were used as a single dtafile scan
+  # will be counted as a single scan for these purposes!
+  # (easy, since they are stored as a single item in the array of scans)
+  def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
+    if max_dups
+      new_peps = []
+      peps.each do |pep|
+        unless pep.scans.size > max_dups
+          new_peps << pep
+        end
+      end
+      new_peps
+    else
+      peps.dup
+    end
+  end
+  ## from the list of scans, creates a scan object whose time is the
+  ## arithmetic mean of the parent scans (based on prec_inten) and whose
+  ## prec_mz is the avg of all prec_mz's.  num is nil, charge is the first
+  def arithmetic_avg_scan_by_parent_time
+    unless @arithmetic_avg_scan_by_parent_time
+      flat_scans = @scans.flatten
+      # new_prec_mz
+      prec_mz_sum = 0.0
+      prec_inten_sum = 0.0
+      times = []
+      intens = []
+      tot_inten = 0.0
+      flat_scans.each do |c|
+        prec_inten = c.prec_inten
+        prec_inten_sum += prec_inten
+        prec_mz_sum += c.prec_mz
+        tot_inten += prec_inten
+        times << c.parent.time
+        intens << prec_inten
+      end
+      new_prec_mz = prec_mz_sum / flat_scans.size
+      new_prec_inten = prec_inten_sum / flat_scans.size
+      fraction_inten = []
+      intens.each do |inten|
+        fraction_inten.push( inten/tot_inten )
+      end
+      new_time = 0.0
+      (0...times.size).each do |i|
+        new_time += times[i] * fraction_inten[i]
+      end
+      @arithmetic_avg_scan_by_parent_time = MS::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
+    end
+    @arithmetic_avg_scan_by_parent_time
+  end
+  def to_s
+    '<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
+  end
+  def has_dta?(dta_filename)
+    if @filenames
+      @filenames.each do |fn|
+        if dta_filename == fn
+          return true
+        end
+      end
+    end
+    return false
+  end
+  # Given a list of peptides, returns only those unique based on
+  # sequence/charge
+  def self.uniq_by_seqcharge(peptides)
+    # @TODO: this could be done with one fewer traversals, but it is beautiful
+    peptides.hash_by(:sequence, :charge).collect do |k,v|
+      v.first
+    end
+  end
+=end
+=begin
+# Class for parsing the peptide prophet output files in various ways
+class Proph::Pep::Parser < Parser
+  # parse_type = "rexml" | "regex"
+  # regex's are about 50 times faster but are not guaranteed to work
+  # seq charge hash is keyed on an array -> [sequence,charge]
+  # @TODO: implement parsing on this with xmlparser
+  def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
+    seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
+    case parse_type
+    when "rexml"
+      #puts "READING: " + pep_xml_file + " ..."
+      doc = REXML::Document.new File.new(pep_xml_file)
+      ## Create a hash of peptides based on sequence_charge (takes an array)
+      doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
+        pep_charge = result.attributes['assumed_charge']
+        filename = result.attributes['spectrum']
+        result.elements.to_a('search_hit').each do |hit|
+          pep_seq = hit.attributes['peptide']
+          seq_charge = [pep_seq, pep_charge]
+          seq_charge_hash[seq_charge] << filename
+        end
+      end
+      seq_charge_hash
+    when "regex"
+      #puts "READING: " + pep_xml_file + " ..."
+      ## Create a hash of peptides based on sequence_charge (takes an array)
+      ## file from peptideAtlas:
+      search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
+      search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
+      search_hit_regex = /<search_hit .*peptide="(\w+)" /o
+      peptide_h = {}
+      filename = nil
+      pep_charge = nil
+      File.open(pep_xml_file).each do |line|
+        if line =~ search_result_regex1
+          filename = $1.dup
+          pep_charge = $2.dup
+        elsif line =~ search_result_regex2
+          filename = $1.dup
+          pep_charge = $2.dup
+        end
+        if line =~ search_hit_regex
+          pep_seq = $1.dup
+          seq_charge = [pep_seq, pep_charge]
+          seq_charge_hash[seq_charge] << filename
+        end
+      end
+    end
+    seq_charge_hash
+  end
+  # drops all search_hits that have peptideprophet probability < min_val
+  # and drops any search_results that end up with 0 search_hits
+  def filter_by_min_pep_prob(file, outfile, min_val)
+    root = root_el(file)
+    d_search_hit = nil
+    d_search_result = nil
+    root.children.each do |child1|
+      if child1.name == 'msms_run_summary'
+        d_search_result = []
+        child1.children.each do |child2|
+          if child2.name == 'search_result'
+            #puts "size before: " + child2.size.to_s
+            d_search_hit = []
+            child2.children.each do |child3|
+              if child3.name == 'search_hit'
+                child3.children.each do |child4|
+                if child4.name == 'peptideprophet_result'
+                  if child4.attrs["probability"].to_f < min_val
+                    #puts "dropping probability: #{child4.attrs["probability"]}"
+                    d_search_hit << child3
+                  else
+                    #puts "keeping probability: #{child4.attrs["probability"]}"
+                  end
+                end
+                end
+              end
+            end
+            d_search_hit.each do |to_drop|
+              to_drop.drop
+            end
+            #puts "size after: " + child2.size.to_s
+            if child2.size == 0
+              d_search_result << child2
+            end
+          end
+        end
+        d_search_result.each do |to_drop|
+          to_drop.drop
+        end
+      end
+    end
+    File.open(outfile, "w") do |fh|
+      fh.print root.to_s
+    end
+  end
+end   # Pep::Parser
+# Class for parsing the '*-prot.xml' files in different ways
+class Proph::Prot::Parser < Parser
+  attr_accessor :prots
+  attr_writer :peps
+  def initialize
+    @prots = []
+  end
+  # returns all the peptides from prots
+  def peps
+    unless @peps
+      @peps = []
+      @prots.each do |prot|
+        @peps.push(*(prot.peps))
+      end
+    end
+    @peps
+  end
+  # sets and returns an array of Prot objects
+  # parse_type = "rexml" | "regex"
+  def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
+    ## ensure these are all floats
+    (prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
+      cutoff.to_f
+    end
+    case parse_type
+    when "rexml"
+      doc = REXML::Document.new File.new(protxmlfile)
+      doc.elements.each("protein_summary/protein_group/protein") do |elem|
+        if elem.attributes['probability'].to_f >= prot_prob_cutoff
+          prob = elem.attributes['probability'].to_f
+          name= elem.attributes['protein_name']
+          curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
+          peptides = []
+          elem.elements.to_a('peptide').each do |pep|
+            if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
+              nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
+              sequence = pep.attributes['peptide_sequence']
+              charge = pep.attributes['charge']
+              pnm = pep.attributes['precursor_neutral_mass']
+              peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
+            end
+            ## Only take proteins with peptides!
+            if peptides.size > 0
+              curr_prot.peps = peptides
+              @prots << curr_prot
+            end
+          end
+        end
+      end
+    when "regex"
+      prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
+      prot_prob_regex = /probability="([\d\.]+)"/o
+      pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
+      pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
+      curr_prot = nil
+      peptides = []
+      File.open(protxmlfile).each do |line|
+        if line =~ prot_regex
+          prob = nil
+          name = $1.dup
+          rest = $2
+          if rest =~ prot_prob_regex
+            prob = $1.dup
+          end
+          if curr_prot
+            if curr_prot.probability.to_f >= prot_prob_cutoff
+              if peptides.size > 0
+                curr_prot.peps = peptides
+                @prots.push(curr_prot)
+              end
+            end
+          end
+          curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
+          peptides = []
+        end
+        if line =~ pep_regex
+          sequence = $1.dup
+          rest = $2
+          if rest =~ pep_else_regex
+            charge = $1
+            init_prob = $2
+            nsp_prob = $3
+            if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
+              peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
+            end
+          end
+        end
+        # get the last one:
+        if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
+          curr_prot.peps = peptides
+          @prots.push(curr_prot)
+        end
+      end
+    end
+    @prots
+  end
+end  # Prot::Parser
+################ --END
+=end

data/lib/ms/ident/pepxml/sample_enzyme.rb ADDED Viewed

@@ -0,0 +1,166 @@
+require 'merge'
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::Pepxml ; end
+class Ms::Ident::Pepxml::SampleEnzyme
+  include Merge
+  # an identifier
+  attr_accessor :name
+  # amino acids after which to cleave
+  attr_accessor :cut
+  # cleave at 'cut' amino acids UNLESS it is followed by 'no_cut'
+  attr_accessor :no_cut
+  # 'C' or 'N'
+  attr_accessor :sense
+  # Can pass in a name of an enzyme that is recognized (meaning there is a
+  # set_<name> method), or
+  #   trypsin
+  # For other enzymes, you must set :cut, :no_cut, :name, and :sense will
+  def initialize(arg={})
+    if arg.is_a?(String)
+      @name = arg
+      send("set_#{@name}".to_sym)
+    else
+      merge!(arg)
+    end
+  end
+  def set_trypsin
+    @sense = 'C'
+    @cut = 'KR'
+    @no_cut = 'P'
+  end
+  # if an xml builder object is given, it adds to the object and returns the
+  # builder object, otherwise it returns an xml fragment string
+  def to_xml(builder=nil)
+    xmlb = builder || Nokogiri::XML::Builder.new
+    xmlb.sample_enzyme(:name => name) do |xmlb|
+      xmlb.specificity(:cut => cut, :no_cut => no_cut, :sense => sense)
+    end
+    builder || xmlb.doc.root.to_xml
+  end
+  # returns self
+  def from_pepxml_node(node)
+    self.name = node['name']
+    ch = node.child
+    self.cut = ch['cut']
+    self.no_cut= ch['no_cut']
+    self.sense = ch['sense']
+    self
+  end
+  def self.from_pepxml_node(node)
+    self.new.from_pepxml_node(node)
+  end
+end
+###################################################
+###################################################
+###################################################
+###################################################
+# This is digestion methodology:
+=begin
+require 'strscan'
+  # takes an amino acid sequence (e.g., -.PEPTIDK.L)
+  # returns the number of missed cleavages
+  def num_missed_cleavages(aaseq)
+    raise NotImplementedError, 'need to implement for N terminal sense'  if sense == 'N'
+    @num_missed_cleavages_regex =
+      if @num_missed_cleavages_regex ; @num_missed_cleavages_regex
+      else
+        regex_string = "[#{@cut}]"
+        if @no_cut and @no_cut != ''
+          regex_string << "[^#{@no_cut}]"
+        end
+        /#{regex_string}/
+      end
+    arr = aaseq.scan(@num_missed_cleavages_regex)
+    num = arr.size
+    if aaseq[-1,1] =~ @num_missed_cleavages_regex
+      num -= 1
+    end
+    num
+  end
+  # requires full sequence (with heads and tails)
+  def num_tol_term(sequence)
+    raise NotImplementedError, 'need to implement for N terminal sense'  if sense == 'N'
+    no_cut = @no_cut || ''
+    num_tol = 0
+    first, middle, last = SpecID::Pep.split_sequence(sequence)
+    last_of_middle = middle[-1,1]
+    first_of_middle = middle[0,1]
+    if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
+      num_tol += 1
+    end
+    if @cut.include?(last_of_middle) && !no_cut.include?(last) || last == '-'
+      num_tol += 1
+    end
+    num_tol
+  end
+  # returns all peptides of missed cleavages <= 'missed_cleavages'
+  # so 2 missed cleavages will return all no missed cleavage peptides
+  # all 1 missed cleavages and all 2 missed cleavages.
+  # options:
+  def digest(string, missed_cleavages=0, options={})
+    raise NotImplementedError if @sense == 'N'
+    s = StringScanner.new(string)
+    no_cut_regex = Regexp.new("[#{@no_cut}]")
+    regex = Regexp.new("[#{@cut}]")
+    peps = []
+    last_pos = 0
+    current_pep = ''
+    loop do
+      if s.eos?
+        break
+      end
+      m = s.scan_until(regex)
+      if m  ## found a cut point
+        last_pos = s.pos
+        # is the next amino acid a no_cut?
+        if string[s.pos,1] =~ no_cut_regex
+          current_pep << m
+        else
+          # cut it
+          current_pep << m
+          peps << current_pep
+          current_pep = ''
+        end
+      else  ## didn't find a cut point
+        current_pep << string[last_pos..-1]
+        peps << current_pep
+        break
+      end
+    end
+    ## LOOP through and grab each set of missed cleavages from num down to 0
+    all_sets_of_peps = []
+    (0..missed_cleavages).to_a.reverse.each do |num_mc|
+      all_sets_of_peps.push( *(get_missed_cleavages(peps, num_mc)) )
+    end
+    all_sets_of_peps
+  end
+  # takes an array of peptides and returns an array containing 'num' missed
+  # cleavages
+  # DOES NOT contain peptides that contain < num of missed cleavages
+  # (i.e., will not return missed cleavages of 1 or 2 if num == 3)
+  def get_missed_cleavages(ar_of_peptide_seqs, num)
+    (0...(ar_of_peptide_seqs.size - num)).to_a.map do |i|
+      ar_of_peptide_seqs[i,num+1].join
+    end
+  end
+  def self.tryptic(string, missed_cleavages=0)
+    self.new("trypsin").digest(string, missed_cleavages)
+  end
+end
+=end

data/lib/ms/ident/pepxml/search_database.rb ADDED Viewed

@@ -0,0 +1,42 @@
+require 'merge'
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::Pepxml
+  class SearchDatabase
+    include Merge
+    # required! the local, full path to the protein sequence database
+    attr_accessor :local_path
+    # required! 'AA' or 'NA'
+    attr_accessor :seq_type
+    # optional
+    attr_accessor :database_name
+    # optional
+    attr_accessor :orig_database_url
+    # optional
+    attr_accessor :database_release_date
+    # optional
+    attr_accessor :database_release_identifier
+    # optional
+    attr_accessor :size_of_residues
+    # takes a hash to fill in values
+    def initialize(hash={}, get_size_of_residues=false)
+      merge!(hash)
+      if get_size_of_residues && File.exist?(@local_path)
+        @size_of_residues = 0
+        Ms::Fasta.foreach(@local_path) do |entry|
+          @size_of_residues += entry.sequence.size
+        end
+      end
+    end
+    def to_xml(builder)
+      attrs = [:local_path, :seq_type, :database_name, :orig_database_url, :database_release_date, :database_release_identifier, :size_of_residues].map {|k| v=send(k) ; [k, v] if v }.compact
+      builder.search_database(Hash[attrs])
+      builder
+    end
+  end
+end