RubyGems - divvy_proteomics - Versions diffs - 0.2.0 → 0.3.0 - Mend

divvy_proteomics 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/Gemfile +6 -6
data/README.md +14 -4
data/VERSION +1 -1
data/bin/divvy_spectra +28 -15
data/divvy_proteomics.gemspec +30 -22
data/lib/divvy_proteomics.rb +16 -0
data/lib/divvyable_protein.rb +40 -0
data/lib/dta_select_output.rb +12 -50
data/lib/pep_xml.rb +130 -0
data/spec/data/contaminant.pep.xml +13 -0
data/spec/data/minimal.pep.xml +14 -0
data/spec/data/minimal2.pep.xml +12 -0
data/spec/data/minimal3.pep.xml +21 -0
data/spec/divvy_proteomics_spec.rb +0 -2
data/spec/pep_xml_spec.rb +99 -0
data/spec/spec_helper.rb +3 -1
metadata +36 -29

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 05ddf71aecd113201c370185104006af9705a525
-  data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
+  metadata.gz: 57c4b7c64ec34ec42b4b28f6a31537a8901d5a7c
+  data.tar.gz: c216e563aa04c13e7b935852862994579e45d61f
 SHA512:
-  metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
-  data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
+  metadata.gz: 7a1e7dd2be6565d9503ce1e5fc43cb2362920fa8146f66d7c179c0a812de5e1ea2be9f42d70f63bb2bf3811e44f5800f01e4bbbb8731a4365d129c01c138edae
+  data.tar.gz: 846cf30c524054c62205de65aba81d37bdd1f2857cec1e623730d167efa6eacc71491a999806e73547b50d0c36a4b87993245d20217d47090c8e3da6eecf31b3

data/Gemfile CHANGED Viewed

@@ -1,13 +1,13 @@
 source "http://rubygems.org"
-gem 'bio-logger', ">=0"
+gem 'bio-logger', "~> 1.0"
 # Add dependencies to develop your gem here.
 # Include everything needed to run rake, tests, features, etc.
 group :development do
-  gem 'systemu', ">=0"
-  gem "rspec", ">= 2.8.0"
-  gem "rdoc", ">= 3.12"
-  gem "bundler", ">= 1.0.0"
-  gem "jeweler", ">= 1.8.4"
+  gem 'systemu', "~> 2.6"
+  gem "rspec", "~> 2.14"
+  gem "rdoc", "~> 3.12"
+  gem "bundler", "~> 1.5"
+  gem "jeweler", "~> 2.0"
 end

data/README.md CHANGED Viewed

@@ -13,20 +13,30 @@ $ gem install divvy_spectra
 $ divvy_spectra <DTASelectFile>
 ```
 Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
-estimated number of spectral counts after sorting out the non-uniqueness.
+estimated number of spectral counts after sorting out the non-uniqueness. Using the ```--pep-xml``` flag, PepXML files
+can also be used as input:
+```
+$ divvy_spectra --pep-xml <PepXML_file>
+```
 Full usage information:
 ```
-$ divvy_spectra -h
-    Usage: divvy_spectra [options] <DTASelect_file>
+    Usage: divvy_spectra [options] <input_file>
-    Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.
+    Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.
         --merge-proteins FILE_OF_IDENTIFIERS
                                      Provide a space/tab separated file where the identifiers on each row should be treated as one protein
         --whitelist FILE_OF_PROTEINS_TO_REPORT
                                      Only report proteins that are in this whitelist, after divvying with everything
+        --contaminant-regexes REGEXES
+                                     Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: ]
+Optional arguments:
+        --pep-xml                    Input file is pep XML, rather than a DTA select output file [default: false]
 Verbosity:

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.3.0

data/bin/divvy_spectra CHANGED Viewed

@@ -5,23 +5,24 @@ require 'bio-logger'
 require 'pp'
 require 'set'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
+SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'divvy_proteomics'
 rootpath = File.dirname(File.dirname(__FILE__))
 $: << File.join(rootpath,'lib')
-require 'dta_select_output'
+require 'divvy_proteomics'
 # Parse command line options into the options hash
 options = {
   :logger => 'stderr',
   :log_level => 'info',
   :contaminant_regexes => [/^CNTM:/],
+  :input_is_pep_xml => false,
 }
 o = OptionParser.new do |opts|
   opts.banner = "
-    Usage: #{SCRIPT_NAME} [options] <DTASelect_file>
+    Usage: #{SCRIPT_NAME} [options] <input_file>
-    Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
+    Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
   opts.on("--merge-proteins FILE_OF_IDENTIFIERS", "Provide a space/tab separated file where the identifiers on each row should be treated as one protein") do |file|
     options[:merge_proteins_file] = file
@@ -32,6 +33,10 @@ o = OptionParser.new do |opts|
   # --contaminant-regexes: replace the default contaminant patterns with a user-supplied,
   # comma-separated list. The advertised default is read from options[:contaminant_regexes]
   # (the key actually set in the options hash) and shown in the same comma-separated form
   # that the flag accepts.
   opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_regexes].collect{|r| r.source}.join(',')}]") do |str|
     options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
   end
+  opts.separator "\nOptional arguments:\n\n"
+  opts.on("--pep-xml", "Input file is pep XML, rather than a DTA select output file [default: #{options[:input_is_pep_xml]}]") do |arg|
+    options[:input_is_pep_xml] = true
+  end
   # logger options
@@ -74,7 +79,12 @@ if options[:whitelist_file]
 end
 # Parse the csv file
-parsed = Bio::DTASelect::OutputFile.parse(ARGF)
+parsed = nil
+if options[:input_is_pep_xml]
+  parsed = Bio::PepXML.parse(ARGF)
+else
+  parsed = Bio::DTASelect::OutputFile.parse(ARGF)
+end
 # Hashes of identifiers to objects
 proteins = parsed.protein_name_to_object
@@ -90,12 +100,13 @@ mergers.each do |secondary_id, primary_id|
     # Invalidate some things about the primary ID because they are no longer valid
     current_protein = proteins[primary_id]
-    current_protein.sequence_count = nil
-    current_protein.sequence_coverage = nil
-    current_protein.length = nil
-    current_protein.molwt = nil
-    current_protein.pi = nil
-    current_protein.validation_status = nil
+    # These variables are not used and are not present in pepXML files, so don't mess with them.
+    #    current_protein.sequence_count = nil
+    #    current_protein.sequence_coverage = nil
+    #    current_protein.length = nil
+    #    current_protein.molwt = nil
+    #    current_protein.pi = nil
+    #    current_protein.validation_status = nil
     # Keep the primary proteins' description, I reckon
     # When there is spectra that are in the secondary but not the primary, add them to the primary's repertoire.
@@ -172,11 +183,13 @@ number_non_shared_peptides = all_peptides.select{|pep| pep.parent_proteins.lengt
 total_peptides = number_shared_peptides+number_non_shared_peptides
 log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_peptides*100}%) shared peptides and #{number_non_shared_peptides} (#{number_non_shared_peptides.to_f/total_peptides*100}%) non-shared peptides"
-# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
-non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
-  peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
+unless options[:input_is_pep_xml]
+  # Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
+  non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
+    peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
+  end
+  log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
 end
-log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
 # OK, finished parsing the file. Now output the score for each protein
 puts [

data/divvy_proteomics.gemspec CHANGED Viewed

@@ -2,14 +2,16 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
+# stub: divvy_proteomics 0.3.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "divvy_proteomics"
-  s.version = "0.2.0"
+  s.version = "0.3.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.require_paths = ["lib"]
   s.authors = ["Ben J Woodcroft"]
-  s.date = "2013-11-06"
+  s.date = "2014-01-07"
   s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
   s.email = "donttrustben@gmail.com"
   s.executables = ["divvy_spectra"]
@@ -28,8 +30,14 @@ Gem::Specification.new do |s|
     "bin/divvy_spectra",
     "divvy_proteomics.gemspec",
     "lib/divvy_proteomics.rb",
+    "lib/divvyable_protein.rb",
     "lib/dta_select_output.rb",
+    "lib/pep_xml.rb",
+    "spec/data/contaminant.pep.xml",
     "spec/data/merge_definition.csv",
+    "spec/data/minimal.pep.xml",
+    "spec/data/minimal2.pep.xml",
+    "spec/data/minimal3.pep.xml",
     "spec/data/multiply_mapped_spectra.csv",
     "spec/data/new_format.csv",
     "spec/data/new_format_some_all_shared_spectra.csv",
@@ -39,39 +47,39 @@ Gem::Specification.new do |s|
     "spec/data/three_proteins_meant_for_merge.csv",
     "spec/data/three_proteins_with_contaminant.csv",
     "spec/divvy_proteomics_spec.rb",
+    "spec/pep_xml_spec.rb",
     "spec/spec_helper.rb"
   ]
   s.homepage = "http://github.com/wwood/divvy_proteomics"
   s.licenses = ["MIT"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = "2.0.3"
+  s.rubygems_version = "2.2.0"
   s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
   if s.respond_to? :specification_version then
     s.specification_version = 4
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
-      s.add_development_dependency(%q<systemu>, [">= 0"])
-      s.add_development_dependency(%q<rspec>, [">= 2.8.0"])
-      s.add_development_dependency(%q<rdoc>, [">= 3.12"])
-      s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
-      s.add_development_dependency(%q<jeweler>, [">= 1.8.4"])
+      s.add_runtime_dependency(%q<bio-logger>, ["~> 1.0"])
+      s.add_development_dependency(%q<systemu>, ["~> 2.6"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.14"])
+      s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.5"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
     else
-      s.add_dependency(%q<bio-logger>, [">= 0"])
-      s.add_dependency(%q<systemu>, [">= 0"])
-      s.add_dependency(%q<rspec>, [">= 2.8.0"])
-      s.add_dependency(%q<rdoc>, [">= 3.12"])
-      s.add_dependency(%q<bundler>, [">= 1.0.0"])
-      s.add_dependency(%q<jeweler>, [">= 1.8.4"])
+      s.add_dependency(%q<bio-logger>, ["~> 1.0"])
+      s.add_dependency(%q<systemu>, ["~> 2.6"])
+      s.add_dependency(%q<rspec>, ["~> 2.14"])
+      s.add_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_dependency(%q<bundler>, ["~> 1.5"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0"])
     end
   else
-    s.add_dependency(%q<bio-logger>, [">= 0"])
-    s.add_dependency(%q<systemu>, [">= 0"])
-    s.add_dependency(%q<rspec>, [">= 2.8.0"])
-    s.add_dependency(%q<rdoc>, [">= 3.12"])
-    s.add_dependency(%q<bundler>, [">= 1.0.0"])
-    s.add_dependency(%q<jeweler>, [">= 1.8.4"])
+    s.add_dependency(%q<bio-logger>, ["~> 1.0"])
+    s.add_dependency(%q<systemu>, ["~> 2.6"])
+    s.add_dependency(%q<rspec>, ["~> 2.14"])
+    s.add_dependency(%q<rdoc>, ["~> 3.12"])
+    s.add_dependency(%q<bundler>, ["~> 1.5"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0"])
   end
 end

data/lib/divvy_proteomics.rb CHANGED Viewed

@@ -0,0 +1,16 @@
+# Entry point for the divvy_proteomics library: sets up the shared logger,
+# defines the Logging mixin, then loads the parsers.
+require 'bio-logger'
+# Create the logger named 'divvy_proteomics'; Logging#log below looks it up by
+# that same name (presumably LoggerPlus.new registers it for lookup - confirm
+# against bio-logger).
+Bio::Log::LoggerPlus.new('divvy_proteomics')
+module Bio
+  module DivvyProteomics
+    # Mixin giving the including class a #log method that returns the shared
+    # 'divvy_proteomics' logger.
+    module Logging
+      def log
+        Bio::Log::LoggerPlus['divvy_proteomics']
+      end
+    end
+  end
+end
+# These requires must stay below the module definition above: the files refer
+# to Bio::DivvyProteomics (e.g. Bio::DivvyProteomics::DivvyableProtein and
+# Bio::DivvyProteomics::Logging) at load time.
+require 'divvyable_protein'
+require 'dta_select_output'
+require 'pep_xml'

data/lib/divvyable_protein.rb ADDED Viewed

@@ -0,0 +1,40 @@
+# Mixin implementing the spectral-count arithmetic shared by the protein classes.
+#
+# The including class is expected to provide:
+# * @peptides / #peptides - Array of objects responding to #parent_proteins,
+#   #redundancy and #identifier
+# * #log - a logger responding to #debug and #debug?
+module Bio::DivvyProteomics::DivvyableProtein
+  # Sum of the redundancy of every peptide assigned to this protein and no other.
+  # Returns 0 when there are no peptides.
+  def unique_spectra
+    return 0 if @peptides.nil? or @peptides.empty?
+    num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+    num ||= 0
+    return num
+  end
+  # Sum of the redundancy of every peptide that is not exclusive to this protein.
+  # Returns 0 when there are no peptides.
+  def non_unique_spectra
+    return 0 if @peptides.nil? or @peptides.empty?
+    num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+    num ||= 0
+    return num
+  end
+  # Are there any peptides that are assigned exclusively to this protein?
+  def uniquely_identified_by_any_peptides?
+    unique_spectra > 0
+  end
+  # Estimate this protein's spectral count. Each peptide's spectra are shared out
+  # among its parent proteins in proportion to the number of unique spectra each
+  # parent has, and this protein's shares are summed.
+  #
+  # Returns 0 when there are no peptides, or when no peptide is unique to this
+  # protein.
+  def estimated_spectral_count
+    return 0 if @peptides.nil? or @peptides.empty?
+    # If every peptide is shared with other proteins, don't attempt to divvy the
+    # spectra up, because there are too many assumptions involved. Report 0.
+    return 0 if !uniquely_identified_by_any_peptides?
+    # Loop-invariant: this protein's own unique spectra
+    own_unique_spectra = unique_spectra
+    peptide_shares = []
+    peptides.each do |peptide|
+      # How many unique spectra are there for each protein that shares this peptide
+      linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}
+      if log.debug?
+        log.debug "Tallying peptide #{peptide.identifier}, which has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
+        log.debug "These proteins have #{linked_unique_spectra.inspect} unique spectra each"
+      end
+      total_linked_unique_spectra = linked_unique_spectra.reduce(:+)
+      peptide_shares.push own_unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
+    end
+    return peptide_shares.reduce(:+)
+  end
+end

data/lib/dta_select_output.rb CHANGED Viewed

@@ -1,12 +1,9 @@
 module Bio::DTASelect
-  module Logging
-    def log
-      Bio::Log::LoggerPlus['divvy_spectra']
-    end
-  end
   class OutputFile
     def self.log
@@ -14,7 +11,8 @@ module Bio::DTASelect
     end
     class SelectedProtein
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
+      include Bio::DivvyProteomics::DivvyableProtein
       attr_accessor :identifier
@@ -26,43 +24,7 @@ module Bio::DTASelect
         @peptides = []
       end
-      def unique_spectra
-        return 0 if @peptides.nil? or @peptides.empty?
-        num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-        num ||= 0
-        return num
-      end
-      def non_unique_spectra
-        return 0 if @peptides.nil? or @peptides.empty?
-        num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-        num ||= 0
-        return num
-      end
-      # Are there any peptides that are assigned exclusively to this protein?
-      def uniquely_identified_by_any_peptides?
-        unique_spectra > 0
-      end
-      def estimated_spectral_count
-        # How many unique spectra are there for each protein that shares a peptide with the current peptide
-        return 0 if @peptides.nil? or @peptides.empty?
-        peptide_shares = []
-        # If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
-        if !uniquely_identified_by_any_peptides?
-          # Don't attempt to divvy these up, because there are too many assumptions involved
-          return 0
-        else
-          peptides.each do |peptide|
-            log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
-            log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
-            total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
-            peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
-          end
-          return peptide_shares.reduce(:+)
-        end
-      end
       def log
         Bio::Log::LoggerPlus[LOG_NAME]
@@ -70,7 +32,7 @@ module Bio::DTASelect
     end
     class Peptide
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
       attr_accessor :identifier
@@ -98,7 +60,7 @@ module Bio::DTASelect
     end
     class Result
-      include Bio::DTASelect::Logging
+      include Bio::DivvyProteomics::Logging
       # hash of protein identifier to Protein object
       attr_accessor :protein_name_to_object
@@ -123,7 +85,7 @@ module Bio::DTASelect
       # Parse each line of the DTAselect file
       io.each_line do |line|
         splits = line.chomp.split("\t")
-        log.debug "Parsing line `#{line.chomp}'"
+        log.debug "Parsing line `#{line.chomp}'" if log.debug?
         if reading_header
           log.debug "reading header"
@@ -146,7 +108,7 @@ module Bio::DTASelect
           if !last_line_was_protein_name
             # Sometimes several proteins are given all in the one header line
             # start a new protein
-            log.debug "New protein now being parsed"
+            log.debug "New protein now being parsed" if log.debug?
             current_proteins = []
           end
@@ -174,13 +136,13 @@ module Bio::DTASelect
         elsif splits[1] == 'Proteins'
-          # Done processing, except for the bits down the bottom which aren't parsed (yet)
+          # Done processing, except for the bits down the bottom which aren't parsed (yet, at least)
           break
         else
-          log.debug "New spectra now being parsed"
+          log.debug "New spectra now being parsed" if log.debug?
           last_line_was_protein_name = false
           # Record a spectra
@@ -204,11 +166,11 @@ module Bio::DTASelect
             pep.parent_proteins.push current_protein
             current_protein.peptides.push pep
           end
-          log.debug "Parsed this peptide #{pep.inspect}"
+          log.debug "Parsed this peptide #{pep.inspect}" if log.debug?
         end
       end
-      log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
+      log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}" if log.debug?
       return result
     end
   end

data/lib/pep_xml.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'rexml/document'
+# Parser for pepXML search results. A parsed Bio::PepXML holds two hashes:
+# protein identifier => Protein, and spectrum name => Peptide.
+class Bio::PepXML
+  include Bio::DivvyProteomics::Logging
+  # Make #log available on the class itself (Bio::PepXML.log), as used by .parse
+  extend Bio::DivvyProteomics::Logging
+  attr_accessor :protein_name_to_object, :peptide_name_to_object
+  class Protein
+    include Bio::DivvyProteomics::Logging
+    include Bio::DivvyProteomics::DivvyableProtein
+    # Array of peptide objects that have been assigned to this protein
+    attr_accessor :peptides
+    attr_accessor :identifier, :descriptive_name
+  end
+  # Named 'Peptide' but really means a spectrum. Just too hard to change
+  class Peptide
+    attr_accessor :parent_proteins
+    # Name of the spectrum
+    attr_accessor :identifier
+    def initialize
+      @parent_proteins = []
+    end
+    #TODO: right now this always returns 1. It should really work out redundancy
+    #properly by comparing peptide sequences, but that information is not parsed yet
+    def redundancy
+      1
+    end
+  end
+  # Parse pepXML from +io+ (anything REXML::Document.new accepts) and return a
+  # Bio::PepXML. Only search hits with hit_rank="1" are considered; the protein of
+  # the hit and each of its alternative proteins are linked to the spectrum.
+  #
+  # Raises RuntimeError when a search hit has no hit_rank, no protein name, or
+  # its spectrum_query has no spectrum attribute.
+  def self.parse(io)
+    protein_name_to_object = {}
+    peptide_name_to_object = {}
+    xml = REXML::Document.new(io)
+    # Work out [name, description] for an element carrying protein / protein_descr
+    # attributes. A missing attribute is treated the same as an empty one. When
+    # the protein attribute is empty the name is taken from the description.
+    parse_name_and_description = lambda do |e|
+      name = e.attributes['protein'].to_s.strip
+      description = e.attributes['protein_descr'].to_s.strip
+      if name.empty?
+        name = description
+      else
+        description = name+' '+description
+      end
+      # The name is everything before the first tab; the description is kept on one line
+      [name.gsub(/\t.*/,''), description.gsub(/[\t\n]/,' ')]
+    end
+    # Return the Protein already registered under name, creating and registering it if needed
+    find_or_create_protein = lambda do |name, description|
+      protein = protein_name_to_object[name]
+      if protein.nil?
+        protein = Protein.new
+        protein.identifier = name
+        protein.descriptive_name = description
+        protein.peptides = []
+        protein_name_to_object[name] = protein
+      end
+      protein
+    end
+    #TODO: some better sanity checking here would be ideal.
+    num_hits_parsed = 0
+    xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
+      hit_number = hit.attributes['hit_rank']
+      raise "Parsing error on #{hit}" if hit_number.nil?
+      next if hit_number != "1"
+      # Parse the primary hit
+      name1, description1 = parse_name_and_description.call(hit)
+      raise "No protein name found in this xml fragment: #{hit.to_s}" if name1.empty?
+      # Check for the attribute before stripping it, so that a missing spectrum
+      # name gives the parsing error below rather than a NoMethodError
+      spectrum_name = hit.parent.parent.attributes['spectrum']
+      raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?
+      spectrum_name = spectrum_name.strip
+      # It is possible to have multiple peptides both hit the spectrum with hit_rank="1".
+      # This happens when e.g. leucine and isoleucine are both possible.
+      spectrum = peptide_name_to_object[spectrum_name]
+      if spectrum.nil?
+        spectrum = Peptide.new
+        spectrum.identifier = spectrum_name
+        peptide_name_to_object[spectrum_name] = spectrum
+      end
+      protein1 = find_or_create_protein.call(name1, description1)
+      protein1.peptides.push spectrum
+      spectrum.parent_proteins.push protein1
+      # Parse the alternate hits. Only look at children with protein_descr attributes -
+      # these are the alternate proteins
+      hit.each_element_with_attribute('protein_descr') do |e|
+        name, description = parse_name_and_description.call(e)
+        alternate = find_or_create_protein.call(name, description)
+        # NOTE(review): if a protein is listed more than once for a spectrum, the
+        # spectrum is pushed onto its peptides list each time (only parent_proteins
+        # is de-duplicated below) - confirm whether that is intended.
+        alternate.peptides.push spectrum
+        spectrum.parent_proteins.push alternate
+      end
+      # Don't count the same protein multiple times - might happen when a spectrum
+      # lists the same protein more than once
+      spectrum.parent_proteins.uniq!
+      num_hits_parsed += 1
+    end
+    log.info "Parsed #{num_hits_parsed} search hits"
+    pepxml = Bio::PepXML.new
+    pepxml.protein_name_to_object = protein_name_to_object
+    pepxml.peptide_name_to_object = peptide_name_to_object
+    return pepxml
+  end
+end

data/spec/data/contaminant.pep.xml ADDED Viewed

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_03.1121.1121.2" start_scan="1121" end_scan="1121" retention_time_sec="5.4199816666666667" activation_method="CID" precursor_intensity="388495.5625" precursor_neutral_mass="1329.7252673153125" assumed_charge="2" index="221">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="NLDLDSIIAEVK" protein="CNTM:cont_sp" num_tot_proteins="2" num_matched_ions="0" calc_neutral_pep_mass="1329.7252673153125" massdiff="0" protein_descr="P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462" protein_mw="62.423064734660052" calc_pI="8.06005859375">
+          <alternative_protein protein="CNTM:cont_sp" protein_descr="P48668 K2CE_HUMAN Keratin, type II cytoskeletal 6E (Cytokeratin 6E) (CK 6E) (K6e keratin) - Homo sapiens (Human). # pI:8.14 MW:60092" protein_mw="60.05537958466001" />
+          <search_score name="XCorr" value="4.5027022361755371" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal.pep.xml ADDED Viewed

@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal2.pep.xml ADDED Viewed

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/data/minimal3.pep.xml ADDED Viewed

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
+  <msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+    <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2_3" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+      <search_result search_id="1">
+        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+          <search_score name="XCorr" value="4.7916374206542969" />
+        </search_hit>
+      </search_result>
+    </spectrum_query>
+  </msms_run_summary>
+</msms_pipeline_analysis>

data/spec/divvy_proteomics_spec.rb CHANGED Viewed

@@ -11,8 +11,6 @@ $:.unshift File.join(File.dirname(__FILE__),'..')
 script_under_test = File.basename(__FILE__).gsub(/^test_/,'')
 path_to_script = File.join(File.dirname(__FILE__),'..','bin','divvy_spectra')
-TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')
 describe script_under_test do
   let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
   it 'should do 1 protein hit' do

data/spec/pep_xml_spec.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'systemu'
+require 'pp'
+require 'open3'
+require 'tempfile'
+require 'spec_helper'
+describe 'pepxml parsing' do
+  let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
+  it 'should parse decently' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
+#        <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+#        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+#          <search_score name="XCorr" value="4.7916374206542969" />
+    pepxml.kind_of?(Bio::PepXML).should == true
+    pepxml.protein_name_to_object.keys.sort.should == [
+      '>38SUR_2379_1524213_2',
+      '>38SUR_6350_1528184_1',
+      '>38SUR_80622_1602456_1',
+    ].sort
+    pepxml.peptide_name_to_object.keys.sort.should == [
+      'Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2'
+    ]
+    pepxml.protein_name_to_object.values.each do |prot|
+      prot.kind_of?(Bio::PepXML::Protein).should == true
+    end
+    pepxml.peptide_name_to_object.values.each do |prot|
+      prot.kind_of?(Bio::PepXML::Peptide).should == true
+    end
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.identifier.should == '>38SUR_2379_1524213_2'
+    prot1.descriptive_name.should == '>38SUR_2379_1524213_2'
+  end
+  it 'should respond to divvy proteomics module things' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
+#        <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
+#        <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr="&gt;38SUR_2379_1524213_2&#x9;" protein_mw="43.185399974660044" calc_pI="5.63037109375">
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_6350_1528184_1&#x9;" protein_mw="24.663561404659987" />
+#          <alternative_protein protein="" protein_descr="&gt;38SUR_80622_1602456_1&#x9;" protein_mw="30.364007294659981" />
+#          <search_score name="XCorr" value="4.7916374206542969" />
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 0
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 0.0
+    prot1 = pepxml.protein_name_to_object['>38SUR_6350_1528184_1']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 0
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 0.0
+  end
+  it 'should respond to divvy proteomics module things with 1 unique hit' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal2.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 1
+    prot1.unique_spectra.should == 1
+    prot1.non_unique_spectra.should == 0
+    prot1.estimated_spectral_count.should == 1.0
+  end
+  it 'should respond to divvy proteomics module things with 2 hits, where 1 is unique' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal3.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
+    prot1.peptides.length.should == 2
+    prot1.unique_spectra.should == 1
+    prot1.non_unique_spectra.should == 1
+    prot1.estimated_spectral_count.should == 2.0
+  end
+  it 'should parse when the protein and protein_desc attributes are both defined' do
+    pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'contaminant.pep.xml')))
+    pepxml.kind_of?(Bio::PepXML).should == true
+    prot1 = pepxml.protein_name_to_object['CNTM:cont_sp']
+    prot1.nil?.should == false
+    prot1.identifier.should == 'CNTM:cont_sp'
+    prot1.descriptive_name.should == 'CNTM:cont_sp P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462'
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -8,5 +8,7 @@ require 'divvy_proteomics'
 Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
 RSpec.configure do |config|
 end
+TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')

metadata CHANGED Viewed

@@ -1,99 +1,99 @@
 --- !ruby/object:Gem::Specification
 name: divvy_proteomics
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Ben J Woodcroft
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-06 00:00:00.000000000 Z
+date: 2014-01-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio-logger
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: systemu
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.6'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        version: '2.14'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        version: '2.14'
 - !ruby/object:Gem::Dependency
   name: rdoc
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.12'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.12'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '1.5'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '1.5'
 - !ruby/object:Gem::Dependency
   name: jeweler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.8.4
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.8.4
+        version: '2.0'
 description: divvy up spectra from DTASelect files in a somewhat parsimonious way
 email: donttrustben@gmail.com
 executables:
@@ -103,8 +103,8 @@ extra_rdoc_files:
 - LICENSE.txt
 - README.md
 files:
-- .document
-- .rspec
+- ".document"
+- ".rspec"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -113,8 +113,14 @@ files:
 - bin/divvy_spectra
 - divvy_proteomics.gemspec
 - lib/divvy_proteomics.rb
+- lib/divvyable_protein.rb
 - lib/dta_select_output.rb
+- lib/pep_xml.rb
+- spec/data/contaminant.pep.xml
 - spec/data/merge_definition.csv
+- spec/data/minimal.pep.xml
+- spec/data/minimal2.pep.xml
+- spec/data/minimal3.pep.xml
 - spec/data/multiply_mapped_spectra.csv
 - spec/data/new_format.csv
 - spec/data/new_format_some_all_shared_spectra.csv
@@ -124,6 +130,7 @@ files:
 - spec/data/three_proteins_meant_for_merge.csv
 - spec/data/three_proteins_with_contaminant.csv
 - spec/divvy_proteomics_spec.rb
+- spec/pep_xml_spec.rb
 - spec/spec_helper.rb
 homepage: http://github.com/wwood/divvy_proteomics
 licenses:
@@ -135,17 +142,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.3
+rubygems_version: 2.2.0
 signing_key:
 specification_version: 4
 summary: divvy up spectra from DTASelect files in a parsimonious way