RubyGems - divvy_proteomics - Versions diffs - 0.2.0 → 0.3.0 - Mend

divvy_proteomics 0.2.0 → 0.3.0

Files changed (18) hide show

checksums.yaml +4 -4
data/Gemfile +6 -6
data/README.md +14 -4
data/VERSION +1 -1
data/bin/divvy_spectra +28 -15
data/divvy_proteomics.gemspec +30 -22
data/lib/divvy_proteomics.rb +16 -0
data/lib/divvyable_protein.rb +40 -0
data/lib/dta_select_output.rb +12 -50
data/lib/pep_xml.rb +130 -0
data/spec/data/contaminant.pep.xml +13 -0
data/spec/data/minimal.pep.xml +14 -0
data/spec/data/minimal2.pep.xml +12 -0
data/spec/data/minimal3.pep.xml +21 -0
data/spec/divvy_proteomics_spec.rb +0 -2
data/spec/pep_xml_spec.rb +99 -0
data/spec/spec_helper.rb +3 -1
metadata +36 -29

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 05ddf71aecd113201c370185104006af9705a525
-  data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
+  metadata.gz: 57c4b7c64ec34ec42b4b28f6a31537a8901d5a7c
+  data.tar.gz: c216e563aa04c13e7b935852862994579e45d61f
 SHA512:
-  metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
-  data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
+  metadata.gz: 7a1e7dd2be6565d9503ce1e5fc43cb2362920fa8146f66d7c179c0a812de5e1ea2be9f42d70f63bb2bf3811e44f5800f01e4bbbb8731a4365d129c01c138edae
+  data.tar.gz: 846cf30c524054c62205de65aba81d37bdd1f2857cec1e623730d167efa6eacc71491a999806e73547b50d0c36a4b87993245d20217d47090c8e3da6eecf31b3

data/Gemfile CHANGED Viewed

@@ -1,13 +1,13 @@
 source "http://rubygems.org"
-gem 'bio-logger', ">=0"
+gem 'bio-logger', "~> 1.0"
 # Add dependencies to develop your gem here.
 # Include everything needed to run rake, tests, features, etc.
 group :development do
-  gem 'systemu', ">=0"
-  gem "rspec", ">= 2.8.0"
-  gem "rdoc", ">= 3.12"
-  gem "bundler", ">= 1.0.0"
-  gem "jeweler", ">= 1.8.4"
+  gem 'systemu', "~> 2.6"
+  gem "rspec", "~> 2.14"
+  gem "rdoc", "~> 3.12"
+  gem "bundler", "~> 1.5"
+  gem "jeweler", "~> 2.0"
 end

data/README.md CHANGED Viewed

@@ -13,20 +13,30 @@ $ gem install divvy_spectra
 $ divvy_spectra <DTASelectFile>
 ```
 Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
-estimated number of spectral counts after sorting out the non-uniqueness.
+estimated number of spectral counts after sorting out the non-uniqueness. Using the ```--pep-xml``` flag, PepXML files
+can also be used as input:
+```
+$ divvy_spectra --pep-xml <PepXML_file>
+```
 Full usage information:
 ```
-$ divvy_spectra -h
-    Usage: divvy_spectra [options] <DTASelect_file>
+    Usage: divvy_spectra [options] <input_file>
-    Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.
+    Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.
         --merge-proteins FILE_OF_IDENTIFIERS
                                      Provide a space/tab separated file where the identifiers on each row should be treated as one protein
         --whitelist FILE_OF_PROTEINS_TO_REPORT
                                      Only report proteins that are in this whitelist, after divvying with everything
+        --contaminant-regexes REGEXES
+                                     Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: ]
+Optional arguments:
+        --pep-xml                    Input file is pep XML, rather than a DTA select output file [default: false]
 Verbosity:

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.3.0

data/bin/divvy_spectra CHANGED Viewed

@@ -5,23 +5,24 @@ require 'bio-logger'
 require 'pp'
 require 'set'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
+SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'divvy_proteomics'
 rootpath = File.dirname(File.dirname(__FILE__))
 $: << File.join(rootpath,'lib')
-require 'dta_select_output'
+require 'divvy_proteomics'
 # Parse command line options into the options hash
 options = {
   :logger => 'stderr',
   :log_level => 'info',
   :contaminant_regexes => [/^CNTM:/],
+  :input_is_pep_xml => false,
 }
 o = OptionParser.new do |opts|
   opts.banner = "
-    Usage: #{SCRIPT_NAME} [options] <DTASelect_file>
+    Usage: #{SCRIPT_NAME} [options] <input_file>
-    Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
+    Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
   opts.on("--merge-proteins FILE_OF_IDENTIFIERS", "Provide a space/tab separated file where the identifiers on each row should be treated as one protein") do |file|
     options[:merge_proteins_file] = file
@@ -32,6 +33,10 @@ o = OptionParser.new do |opts|
   opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
     options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
   end
+  opts.separator "\nOptional arguments:\n\n"
+  opts.on("--pep-xml", "Input file is pep XML, rather than a DTA select output file [default: #{options[:input_is_pep_xml]}]") do |arg|
+    options[:input_is_pep_xml] = true
+  end
   # logger options
@@ -74,7 +79,12 @@ if options[:whitelist_file]
 end
 # Parse the csv file
-parsed = Bio::DTASelect::OutputFile.parse(ARGF)
+parsed = nil
+if options[:input_is_pep_xml]
+  parsed = Bio::PepXML.parse(ARGF)
+else
+  parsed = Bio::DTASelect::OutputFile.parse(ARGF)
+end
 # Hashes of identifiers to objects
 proteins = parsed.protein_name_to_object
@@ -90,12 +100,13 @@ mergers.each do |secondary_id, primary_id|
     # Invalidate some things about the primary ID because they are no longer valid
     current_protein = proteins[primary_id]
-    current_protein.sequence_count = nil
-    current_protein.sequence_coverage = nil
-    current_protein.length = nil
-    current_protein.molwt = nil
-    current_protein.pi = nil
-    current_protein.validation_status = nil
+    # These variables are not used and are not present in pepXML files, so don't mess with them.
+    #    current_protein.sequence_count = nil
+    #    current_protein.sequence_coverage = nil
+    #    current_protein.length = nil
+    #    current_protein.molwt = nil
+    #    current_protein.pi = nil
+    #    current_protein.validation_status = nil
     # Keep the primary proteins' description, I reckon
     # When there is spectra that are in the secondary but not the primary, add them to the primary's repertoire.
@@ -172,11 +183,13 @@ number_non_shared_peptides = all_peptides.select{|pep| pep.parent_proteins.lengt
 total_peptides = number_shared_peptides+number_non_shared_peptides
 log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_peptides*100}%) shared peptides and #{number_non_shared_peptides} (#{number_non_shared_peptides.to_f/total_peptides*100}%) non-shared peptides"
-# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
-non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
-  peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
+unless options[:input_is_pep_xml]
+  # Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
+  non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
+    peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
+  end
+  log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
 end
-log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
 # OK, finished parsing the file. Now output the score for each protein
 puts [

data/divvy_proteomics.gemspec CHANGED Viewed

@@ -2,14 +2,16 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
+# stub: divvy_proteomics 0.3.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "divvy_proteomics"
-  s.version = "0.2.0"
+  s.version = "0.3.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.require_paths = ["lib"]
   s.authors = ["Ben J Woodcroft"]
-  s.date = "2013-11-06"
+  s.date = "2014-01-07"
   s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
   s.email = "donttrustben@gmail.com"
   s.executables = ["divvy_spectra"]
@@ -28,8 +30,14 @@ Gem::Specification.new do |s|
     "bin/divvy_spectra",
     "divvy_proteomics.gemspec",
     "lib/divvy_proteomics.rb",
+    "lib/divvyable_protein.rb",
     "lib/dta_select_output.rb",
+    "lib/pep_xml.rb",
+    "spec/data/contaminant.pep.xml",
     "spec/data/merge_definition.csv",
+    "spec/data/minimal.pep.xml",
+    "spec/data/minimal2.pep.xml",
+    "spec/data/minimal3.pep.xml",
     "spec/data/multiply_mapped_spectra.csv",
     "spec/data/new_format.csv",
     "spec/data/new_format_some_all_shared_spectra.csv",
@@ -39,39 +47,39 @@ Gem::Specification.new do |s|
     "spec/data/three_proteins_meant_for_merge.csv",
     "spec/data/three_proteins_with_contaminant.csv",
     "spec/divvy_proteomics_spec.rb",
+    "spec/pep_xml_spec.rb",
     "spec/spec_helper.rb"
   ]
   s.homepage = "http://github.com/wwood/divvy_proteomics"
   s.licenses = ["MIT"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = "2.0.3"
+  s.rubygems_version = "2.2.0"
   s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
   if s.respond_to? :specification_version then
     s.specification_version = 4
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
-      s.add_development_dependency(%q<systemu>, [">= 0"])
-      s.add_development_dependency(%q<rspec>, [">= 2.8.0"])
-      s.add_development_dependency(%q<rdoc>, [">= 3.12"])
-      s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
-      s.add_development_dependency(%q<jeweler>, [">= 1.8.4"])
+      s.add_runtime_dependency(%q<bio-logger>, ["~> 1.0"])
+      s.add_development_dependency(%q<systemu>, ["~> 2.6"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.14"])
+      s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.5"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
     else
-      s.add_dependency(%q<bio-logger>, [">= 0"])
-      s.add_dependency(%q<systemu>, [">= 0"])
-      s.add_dependency(%q<rspec>, [">= 2.8.0"])
-      s.add_dependency(%q<rdoc>, [">= 3.12"])
-      s.add_dependency(%q<bundler>, [">= 1.0.0"])
-      s.add_dependency(%q<jeweler>, [">= 1.8.4"])
+      s.add_dependency(%q<bio-logger>, ["~> 1.0"])
+      s.add_dependency(%q<systemu>, ["~> 2.6"])
+      s.add_dependency(%q<rspec>, ["~> 2.14"])
+      s.add_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_dependency(%q<bundler>, ["~> 1.5"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0"])
     end
   else
-    s.add_dependency(%q<bio-logger>, [">= 0"])
-    s.add_dependency(%q<systemu>, [">= 0"])
-    s.add_dependency(%q<rspec>, [">= 2.8.0"])
-    s.add_dependency(%q<rdoc>, [">= 3.12"])
-    s.add_dependency(%q<bundler>, [">= 1.0.0"])
-    s.add_dependency(%q<jeweler>, [">= 1.8.4"])
+    s.add_dependency(%q<bio-logger>, ["~> 1.0"])
+    s.add_dependency(%q<systemu>, ["~> 2.6"])
+    s.add_dependency(%q<rspec>, ["~> 2.14"])
+    s.add_dependency(%q<rdoc>, ["~> 3.12"])
+    s.add_dependency(%q<bundler>, ["~> 1.5"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0"])
   end
 end

data/lib/divvy_proteomics.rb CHANGED Viewed

@@ -0,0 +1,16 @@
+require 'bio-logger'
+Bio::Log::LoggerPlus.new('divvy_proteomics')
+module Bio
+  module DivvyProteomics
+    module Logging
+      def log
+        Bio::Log::LoggerPlus['divvy_proteomics']
+      end
+    end
+  end
+end
+require 'divvyable_protein'
+require 'dta_select_output'
+require 'pep_xml'

data/lib/divvyable_protein.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Bio::DivvyProteomics::DivvyableProtein
+  def unique_spectra
+    return 0 if @peptides.nil? or @peptides.empty?
+    num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+    num ||= 0
+    return num
+  end
+  def non_unique_spectra
+    return 0 if @peptides.nil? or @peptides.empty?
+    num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+    num ||= 0
+    return num
+  end
+  # Are there any peptides that are assigned exclusively to this protein?
+  def uniquely_identified_by_any_peptides?
+    unique_spectra > 0
+  end
+  def estimated_spectral_count
+    # How many unique spectra are there for each protein that shares a peptide with the current peptide
+    return 0 if @peptides.nil? or @peptides.empty?
+    peptide_shares = []
+    # If all peptides are non-unique (each is shared with some number of other proteins), report 0 for this protein
+    if !uniquely_identified_by_any_peptides?
+      # Don't attempt to divvy these up, because there are too many assumptions involved
+      return 0
+    else
+      peptides.each do |peptide|
+        log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
+        log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
+        total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
+        peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
+      end
+      return peptide_shares.reduce(:+)
+    end
+  end
+end

data/lib/dta_select_output.rb CHANGED Viewed

@@ -1,12 +1,9 @@
 module Bio::DTASelect
-  module Logging
-    def log
-      Bio::Log::LoggerPlus['divvy_spectra']
-    end
-  end
   class OutputFile
     def self.log
@@ -14,7 +11,8 @@ module Bio::DTASelect
     end
     class SelectedProtein
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
+      include Bio::DivvyProteomics::DivvyableProtein
       attr_accessor :identifier
@@ -26,43 +24,7 @@ module Bio::DTASelect
         @peptides = []
       end
-      def unique_spectra
-        return 0 if @peptides.nil? or @peptides.empty?
-        num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-        num ||= 0
-        return num
-      end
-      def non_unique_spectra
-        return 0 if @peptides.nil? or @peptides.empty?
-        num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-        num ||= 0
-        return num
-      end
-      # Are there any peptides that are assigned exclusively to this protein?
-      def uniquely_identified_by_any_peptides?
-        unique_spectra > 0
-      end
-      def estimated_spectral_count
-        # How many unique spectra are there for each protein that shares a peptide with the current peptide
-        return 0 if @peptides.nil? or @peptides.empty?
-        peptide_shares = []
-        # If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
-        if !uniquely_identified_by_any_peptides?
-          # Don't attempt to divvy these up, because there are too many assumptions involved
-          return 0
-        else
-          peptides.each do |peptide|
-            log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
-            log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
-            total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
-            peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
-          end
-          return peptide_shares.reduce(:+)
-        end
-      end
       def log
         Bio::Log::LoggerPlus[LOG_NAME]
@@ -70,7 +32,7 @@ module Bio::DTASelect
     end
     class Peptide
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
       attr_accessor :identifier
@@ -98,7 +60,7 @@ module Bio::DTASelect
     end
     class Result
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
       # hash of protein identifier to Protein object
       attr_accessor :protein_name_to_object
@@ -123,7 +85,7 @@ module Bio::DTASelect
       # Parse each line of the DTAselect file
       io.each_line do |line|
         splits = line.chomp.split("\t")
-        log.debug "Parsing line `#{line.chomp}'"
+        log.debug "Parsing line `#{line.chomp}'" if log.debug?
         if reading_header
           log.debug "reading header"
@@ -146,7 +108,7 @@ module Bio::DTASelect
           if !last_line_was_protein_name
             # Sometimes several proteins are given all in the one header line
             # start a new protein
-            log.debug "New protein now being parsed"
+            log.debug "New protein now being parsed" if log.debug?
             current_proteins = []
           end
@@ -174,13 +136,13 @@ module Bio::DTASelect
         elsif splits[1] == 'Proteins'
-          # Done processing, except for the bits down the bottom which aren't parsed (yet)
+          # Done processing, except for the bits down the bottom which aren't parsed (yet, at least)
           break
         else
-          log.debug "New spectra now being parsed"
+          log.debug "New spectra now being parsed" if log.debug?
           last_line_was_protein_name = false
           # Record a spectra
@@ -204,11 +166,11 @@ module Bio::DTASelect
             pep.parent_proteins.push current_protein
             current_protein.peptides.push pep
           end
-          log.debug "Parsed this peptide #{pep.inspect}"
+          log.debug "Parsed this peptide #{pep.inspect}" if log.debug?
         end
       end
-      log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
+      log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}" if log.debug?
       return result
     end
   end

data/lib/pep_xml.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'rexml/document'
+class Bio::PepXML
+  include Bio::DivvyProteomics::Logging
+  attr_accessor :protein_name_to_object, :peptide_name_to_object
+  class Protein
+    include Bio::DivvyProteomics::Logging
+    include Bio::DivvyProteomics::DivvyableProtein
+    # Array of peptide objects that have been assigned to this protein
+    attr_accessor :peptides
+    attr_accessor :identifier, :descriptive_name
+  end
+  # Named 'Peptide' but really means a spectrum. Just too hard to change
+  class Peptide
+    attr_accessor :parent_proteins
+    # Name of the spectra
+    attr_accessor :identifier
+    def initialize
+      @parent_proteins = []
+    end
+    #TODO: right now this just always returns 1. It should really be working out redundancy
+    #properly by comparison of peptide sequences, but this info isn't parsed yet
+    def redundancy
+      1
+    end
+  end
+  def self.log
+    Bio::PepXML.new.log
+  end
+  def self.parse(io)
+    protein_name_to_object = {}
+    peptide_name_to_object = {}
+    #pep.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit'){|e|
+    #  c+=1; p e.attributes['protein_descr'].strip;
+    #  e.elements.each{|e|
+    #    p e.name, e.attributes['protein_descr'].strip};break}
+    xml = REXML::Document.new(io)
+    parse_name_and_description = lambda do |e|
+      name = e.attributes['protein'].strip
+      description = e.attributes['protein_descr'].strip
+      if name.nil? or name == ''
+        name = e.attributes['protein_descr'].strip
+      else
+        description = name+' '+description
+      end
+      name.gsub!(/\t.*/,'')
+      description.gsub!(/[\t\n]/,' ')
+      [name, description]
+    end
+    #TODO: some better sanity checking here would be ideal.
+    num_hits_parsed = 0
+    xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
+      hit_number = hit.attributes['hit_rank']
+      raise "Parsing error on #{hit}" if hit_number.nil?
+      next if hit_number != "1"
+      # Parse the primary hit
+      name1, description1 = parse_name_and_description.call(hit)
+      raise "No protein name found in this xml fragment: #{hit.to_s}" if name1.nil?
+      spectrum_name = hit.parent.parent.attributes['spectrum'].strip
+      raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?
+      # It is possible to have multiple peptides both hit the spectra with hit_rank="1"
+      # This happens when e.g. leucine and isoleucine are possible.
+      spectrum = peptide_name_to_object[spectrum_name]
+      if spectrum.nil?
+        spectrum = Peptide.new
+        spectrum.identifier = spectrum_name
+        peptide_name_to_object[spectrum_name] = spectrum
+      end
+      protein1 = protein_name_to_object[name1]
+      if protein1.nil?
+        protein1 = Protein.new
+        protein1.identifier = name1
+        protein1.descriptive_name = description1
+        protein1.peptides = []
+        protein_name_to_object[name1] = protein1
+      end
+      protein1.peptides.push spectrum
+      spectrum.parent_proteins ||= []
+      spectrum.parent_proteins.push protein1
+      # Parse the alternate hits. Only look at children with protein_descr attributes - these are
+      # the alternate proteins
+      hit.each_element_with_attribute('protein_descr') do |e|
+        name, description = parse_name_and_description.call(e)
+        alternate = protein_name_to_object[name]
+        if alternate.nil?
+          alternate = Protein.new
+          alternate.identifier = name
+          alternate.descriptive_name = description
+          alternate.peptides = []
+          protein_name_to_object[name] = alternate
+        end
+        alternate.peptides.push spectrum
+        spectrum.parent_proteins.push alternate
+      end
+      # Don't count the same protein multiple times - might happen when a spectrum has several hit_rank="1" hits, or an alternate protein shares the primary hit's name
+      spectrum.parent_proteins.uniq!
+      num_hits_parsed += 1
+    end
+    log.info "Parsed #{num_hits_parsed} search hits"
+    pepxml = Bio::PepXML.new
+    pepxml.protein_name_to_object = protein_name_to_object
+    pepxml.peptide_name_to_object = peptide_name_to_object
+    return pepxml
+  end
+end

data/spec/data/contaminant.pep.xml ADDED Viewed

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_03.1121.1121.2" start_scan="1121" end_scan="1121" retention_time_sec="5.4199816666666667" activation_method="CID" precursor_intensity="388495.5625" precursor_neutral_mass="1329.7252673153125" assumed_charge="2" index="221">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="NLDLDSIIAEVK" protein="CNTM:cont_sp" num_tot_proteins="2" num_matched_ions="0" calc_neutral_pep_mass="1329.7252673153125" massdiff="0" protein_descr="P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462" protein_mw="62.423064734660052" calc_pI="8.06005859375">
+          <alternative_protein protein="CNTM:cont_sp" protein_descr="P48668 K2CE_HUMAN Keratin, type II cytoskeletal 6E (Cytokeratin 6E) (CK 6E) (K6e keratin) - Homo sapiens (Human). # pI:8.14 MW:60092" protein_mw="60.05537958466001" />
+          <search_score name="XCorr" value="4.5027022361755371" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal.pep.xml ADDED Viewed

@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal2.pep.xml ADDED Viewed

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal3.pep.xml ADDED Viewed

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2_3" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/divvy_proteomics_spec.rb CHANGED Viewed

@@ -11,8 +11,6 @@ $:.unshift File.join(File.dirname(__FILE__),'..')
 script_under_test = File.basename(__FILE__).gsub(/^test_/,'')
 path_to_script = File.join(File.dirname(__FILE__),'..','bin','divvy_spectra')
-TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')
 describe script_under_test do
   let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
   it 'should do 1 protein hit' do

data/spec/pep_xml_spec.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'systemu'
+require 'pp'
+require 'open3'
+require 'tempfile'
+require 'spec_helper'
+describe 'pepxml parsing' do
+  let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
+  it 'should parse decently' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
+#        <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+#        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+#          <search_score name="XCorr" value="4.7916374206542969" />
+    pepxml.kind_of?(Bio::PepXML).should == true
+    pepxml.protein_name_to_object.keys.sort.should == [
+      '>38SUR_2379_1524213_2',
+      '>38SUR_6350_1528184_1',
+      '>38SUR_80622_1602456_1',
+    ].sort
+    pepxml.peptide_name_to_object.keys.sort.should == [
+      'Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2'
+    ]
+    pepxml.protein_name_to_object.values.each do |prot|
+      prot.kind_of?(Bio::PepXML::Protein).should == true
+    end
+    pepxml.peptide_name_to_object.values.each do |prot|
+      prot.kind_of?(Bio::PepXML::Peptide).should == true
+    end
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.identifier.should == '>38SUR_2379_1524213_2'
+    prot1.descriptive_name.should == '>38SUR_2379_1524213_2'
+  end
+  it 'should respond to divvy proteomics module things' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
+#        <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+#        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+#          <search_score name="XCorr" value="4.7916374206542969" />
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 0
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 0.0
+    prot1 = pepxml.protein_name_to_object['>38SUR_6350_1528184_1']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 0
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 0.0
+  end
+  it 'should respond to divvy proteomics module things with 1 unique hit' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal2.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 1
+    prot1.non_unique_spectra.should == 0
+    prot1.estimated_spectral_count.should == 1.0
+  end
+  it 'should respond to divvy proteomics module things with 2 hits, where 1 is unique' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal3.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 2
+    prot1.unique_spectra.should == 1
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 2.0
+  end
+  it 'should parse when the protein and protein_desc attributes are both defined' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'contaminant.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['CNTM:cont_sp']
+    prot1.nil?.should == false
+    prot1.identifier.should == 'CNTM:cont_sp'
+    prot1.descriptive_name.should == 'CNTM:cont_sp P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462'
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -8,5 +8,7 @@ require 'divvy_proteomics'
 Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
 RSpec.configure do |config|
 end
+TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')

metadata CHANGED Viewed

@@ -1,99 +1,99 @@
 --- !ruby/object:Gem::Specification
 name: divvy_proteomics
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Ben J Woodcroft
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-06 00:00:00.000000000 Z
+date: 2014-01-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio-logger
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: systemu
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.6'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        version: '2.14'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        version: '2.14'
 - !ruby/object:Gem::Dependency
   name: rdoc
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.12'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.12'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '1.5'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '1.5'
 - !ruby/object:Gem::Dependency
   name: jeweler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.8.4
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.8.4
+        version: '2.0'
 description: divvy up spectra from DTASelect files in a somewhat parsimonious way
 email: donttrustben@gmail.com
 executables:
@@ -103,8 +103,8 @@ extra_rdoc_files:
 - LICENSE.txt
 - README.md
 files:
-- .document
-- .rspec
+- ".document"
+- ".rspec"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -113,8 +113,14 @@ files:
 - bin/divvy_spectra
 - divvy_proteomics.gemspec
 - lib/divvy_proteomics.rb
+- lib/divvyable_protein.rb
 - lib/dta_select_output.rb
+- lib/pep_xml.rb
+- spec/data/contaminant.pep.xml
 - spec/data/merge_definition.csv
+- spec/data/minimal.pep.xml
+- spec/data/minimal2.pep.xml
+- spec/data/minimal3.pep.xml
 - spec/data/multiply_mapped_spectra.csv
 - spec/data/new_format.csv
 - spec/data/new_format_some_all_shared_spectra.csv
@@ -124,6 +130,7 @@ files:
 - spec/data/three_proteins_meant_for_merge.csv
 - spec/data/three_proteins_with_contaminant.csv
 - spec/divvy_proteomics_spec.rb
+- spec/pep_xml_spec.rb
 - spec/spec_helper.rb
 homepage: http://github.com/wwood/divvy_proteomics
 licenses:
@@ -135,17 +142,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.3
+rubygems_version: 2.2.0
 signing_key:
 specification_version: 4
 summary: divvy up spectra from DTASelect files in a parsimonious way