RubyGems - ms-ident - Versions diffs - 0.0.18 → 0.0.19 - Mend

ms-ident 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/Rakefile +1 -1
data/VERSION +1 -1
data/lib/ms/ident/peptide/db.rb +19 -1
data/lib/ms/ident/peptide_hit/qvalue.rb +55 -0
data/lib/ms/ident/peptide_hit.rb +8 -0
data/lib/ms/ident/protein.rb +0 -58
data/lib/ms/ident/protein_group.rb +72 -0
data/lib/ms/ident/protein_hit.rb +17 -0
data/spec/ms/ident/peptide/db_spec.rb +6 -0
data/spec/ms/ident/{protein_spec.rb → protein_group_spec.rb} +8 -9
metadata +10 -6

data/Rakefile CHANGED Viewed

@@ -8,7 +8,7 @@ Jeweler::Tasks.new do |gem|
   gem.homepage = "http://github.com/jtprince/ms-ident"
   gem.license = "MIT"
   gem.summary = %Q{mspire library for working with mzIdentML and pepxml}
-  gem.description = %Q{mspire library for working with mzIdentML and pepxml}
+  gem.description = %Q{mspire library for working with mzIdentML, pepxml, and related.}
   gem.email = "jtprince@gmail.com"
   gem.authors = ["John T. Prince"]
   gem.rubyforge_project = 'mspire'

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.0.18
+0.0.19

data/lib/ms/ident/peptide/db.rb CHANGED Viewed

@@ -5,7 +5,11 @@ module Ms ; end
 module Ms::Ident ; end
 module Ms::Ident::Peptide ; end
-module Ms::Ident::Peptide::Db
+# the object itself is a modified Hash.
+# It is initialized with the database file and a protein array can be
+# retrieved with the #[] method given an amino acid sequence.  All other
+# methods are untested at this time and should be avoided!
+class Ms::Ident::Peptide::Db < Hash
   MAX_NUM_AA_EXPANSION = 3
   # the twenty standard amino acids
@@ -165,12 +169,24 @@ module Ms::Ident::Peptide::Db
     to_expand
   end
+  def initialize(db_file)
+    self.replace(YAML.load_file(db_file))
+  end
+  alias_method :old_bracket, '[]'.to_sym
+  # returns the protein id's as an array
+  def [](key)
+    old_bracket(key).chomp.split(PROTEIN_DELIMITER)
+  end
   # an object for on disk retrieval of db entries
   # proteins are returned as an array.
   # behaves much like a hash once it is opened.
   class IO
     include Enumerable
     def self.open(filename, &block)
+      raise ArgumentError unless block
       File.open(filename) do |io|
         block.call(self.new(io))
       end
@@ -192,9 +208,11 @@ module Ms::Ident::Peptide::Db
         @index[key] = [start, end_pos-start]
       end
     end
     # returns an array of proteins for the given key (peptide aaseq)
     def [](key)
       (start, length) = @index[key]
+      return nil unless start
       @io.seek(start)
       string = @io.read(length)
       string.chomp!

data/lib/ms/ident/peptide_hit/qvalue.rb ADDED Viewed

@@ -0,0 +1,55 @@
+require 'ms/ident/peptide_hit'
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::PeptideHit
+  module Qvalue
+    attr_accessor :qvalue
+    FILE_EXTENSION = '.phq.tsv'
+    FILE_DELIMITER = "\t"
+    HEADER = %w(aaseq charge qvalue)
+    class << self
+      # writes to the file, adding an extension
+      def to_phq(base, hits, qvalues=nil)
+        to_file(base + FILE_EXTENSION, hits)
+      end
+      # writes the peptide hits to a phq.tsv file. qvalues is a parallel array
+      # to hits that can provide qvalues if not inherent to the hits
+      # returns the filename.
+      def to_file(filename, hits, qvalues=[])
+        File.open(filename,'w') do |out|
+          out.puts HEADER.join(FILE_DELIMITER)
+          hits.zip(qvalues) do |hit, qvalue|
+            out.puts [hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
+          end
+        end
+        filename
+      end
+      # returns an array of PeptideHit objects from a phq.tsv
+      def from_file(filename)
+        peptide_hits = []
+        File.open(filename) do |io|
+          header = io.readline.chomp.split(FILE_DELIMITER)
+          raise "bad headers" unless header == HEADER
+          io.each do |line|
+            line.chomp!
+            (aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
+            ph = Ms::Ident::PeptideHit.new
+            ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
+            peptide_hits << ph
+          end
+        end
+        peptide_hits
+      end
+      alias_method :from_phq, :from_file
+    end
+  end # Qvalue
+  include Qvalue
+end # Peptide Hit

data/lib/ms/ident/peptide_hit.rb ADDED Viewed

@@ -0,0 +1,8 @@
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::PeptideHit
+  attr_accessor :aaseq
+  attr_accessor :charge
+  attr_accessor :proteins
+end

data/lib/ms/ident/protein.rb CHANGED Viewed

@@ -1,70 +1,12 @@
 module Ms ; end
 module Ms::Ident ; end
-require 'set'
 module Ms::Ident::Protein
-  class << self
-  end
   # gives the information up until the first space or carriage return.
   # Assumes the protein can respond_to? :reference
   def first_entry
     reference.split(/[\s\r]/)[0]
   end
-  PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
-    peptide_hits = protein_group_and_peptide_hits.last
-    num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
-    num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
-    [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
-  end
-  module_function
-  # greedy algorithm to map a set of peptide_hits to protein groups.  each
-  # peptide hit should respond to :aaseq, :charge, :proteins if a block is
-  # given, yields a single argument: a doublet of protein_group and peptide
-  # set.  It expects a metric or array to sort by for creating greedy protein
-  # groups (the greediest proteins should sort to the back of the array).  if
-  # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
-  # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS).  Sets of
-  # peptide_hits and the objects returned by peptide_hit#proteins are used as
-  # hash keys.  As long as each peptide hit has a unique signature (like an
-  # id) then any object will work.  If they are Struct objects, you might
-  # consider redefining the #hash method to be object_id for performance and
-  # accuracy.
-  def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
-    sort_by ||= PRIORITIZE_PROTEINS
-    # note to self: I wrote this in 2011, so I think I know what I'm doing now
-    protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
-    peptide_hits.each do |peptide_hit|
-      peptide_hit.proteins.each do |protein|
-        protein_to_peptides[protein] << peptide_hit
-      end
-    end
-    peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
-    protein_to_peptides.each do |protein, peptide_set|
-      peptides_to_protein_group[peptide_set] << protein
-    end
-    protein_group_to_peptides = peptides_to_protein_group.invert
-    greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
-    accounted_for = Set.new
-    surviving_protein_groups = []
-    # we are discarding the subsumed sets, but we could get them with
-    # partition
-    greedy_first.select do |group, peptide_set|
-      has_an_unaccounted_peptide = false
-      peptide_set.each do |peptide_hit|
-        unless accounted_for.include?(peptide_hit)
-          has_an_unaccounted_peptide = true
-          accounted_for.add(peptide_hit)
-        end
-      end
-      has_an_unaccounted_peptide
-    end
-  end
 end

data/lib/ms/ident/protein_group.rb ADDED Viewed

@@ -0,0 +1,72 @@
+require 'set'
+module Ms
+  module Ident
+    # represents a group of proteins, typically indistinguishable in the
+    # experiment.
+    class ProteinGroup < Array
+      attr_accessor :peptide_hits
+      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
+        peptide_hits = protein_group_and_peptide_hits.last
+        num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
+        num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
+        [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
+      end
+      # greedy algorithm to map a set of peptide_hits to protein groups.  each
+      # peptide hit should respond to :aaseq, :charge, :proteins if a block is
+      # given, yields a single argument: a doublet of protein_group and peptide
+      # set.  It expects a metric or array to sort by for creating greedy protein
+      # groups (the greediest proteins should sort to the back of the array).  if
+      # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
+      # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS).  Sets of
+      # peptide_hits and the objects returned by peptide_hit#proteins are used as
+      # hash keys.  As long as each peptide hit has a unique signature (like an
+      # id) then any object will work.  If they are Struct objects, you might
+      # consider redefining the #hash method to be object_id for performance and
+      # accuracy.
+      #
+      # returns an array of ProteinGroup objects, each set with :peptide_hits
+      def self.peptide_hits_to_protein_groups(peptide_hits, &sort_by)
+        sort_by ||= PRIORITIZE_PROTEINS
+        # note to self: I wrote this in 2011, so I think I know what I'm doing now
+        protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
+        peptide_hits.each do |peptide_hit|
+          peptide_hit.proteins.each do |protein|
+            protein_to_peptides[protein] << peptide_hit
+          end
+        end
+        peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
+        protein_to_peptides.each do |protein, peptide_set|
+          peptides_to_protein_group[peptide_set] << protein
+        end
+        peptides_to_protein_group.each do |pephits,ar_of_prots|
+          pg = Ms::Ident::ProteinGroup.new(ar_of_prots)
+          pg.peptide_hits = pephits
+          peptides_to_protein_group[pephits] = pg
+        end
+        protein_group_to_peptides = peptides_to_protein_group.invert
+        greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
+        accounted_for = Set.new
+        # we are discarding the subsumed sets, but we could get them with
+        # partition
+        greedy_first.select! do |group, peptide_set|
+          has_an_unaccounted_peptide = false
+          peptide_set.each do |peptide_hit|
+            unless accounted_for.include?(peptide_hit)
+              has_an_unaccounted_peptide = true
+              accounted_for.add(peptide_hit)
+            end
+          end
+          group.peptide_hits = peptide_set if has_an_unaccounted_peptide
+          has_an_unaccounted_peptide
+        end
+        greedy_first.map(&:first)
+      end
+    end
+  end
+end

data/lib/ms/ident/protein_hit.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Ms ; end
+module Ms::Ident ; end
+class Ms::Ident::ProteinHit
+  attr_accessor :id
+  attr_accessor :seq
+  alias_method :sequence, :seq
+  alias_method :sequence=, :seq=
+  attr_accessor :peptide_hits
+  def initialize(id=nil)
+    @peptide_hits = []
+    @id = id
+  end
+end

data/spec/ms/ident/peptide/db_spec.rb CHANGED Viewed

@@ -82,6 +82,12 @@ describe 'reading a peptide centric database' do
   outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
   @outfile = outfiles.first
+  it 'creates a hash that can retrieve peptides as an array' do
+    hash = Ms::Ident::Peptide::Db.new(@outfile)
+    hash["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN	sp|P31946-2|1433B_HUMAN)
+    hash["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
+  end
   it 'reads the file on disk with random access or is enumerable' do
     Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
       io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN	sp|P31946-2|1433B_HUMAN)

data/spec/ms/ident/{protein_spec.rb → protein_group_spec.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 require 'spec_helper'
-require 'ms/ident/protein'
+require 'ms/ident/protein_group'
 PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
   def inspect # easier to read output
@@ -36,22 +36,21 @@ describe 'creating minimal protein groups from peptide hits' do
   it 'is a greedy algorithm' do
     @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
     # big_guy has all the peptides, so it takes them all
-    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
-    reply.first.size.is 2 # the group and the peptide set
-    reply.first.first.size.is 1 # the group
-    reply.first.first.first.id.is 'big_guy'
+    protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
+    protein_groups.first.size.is 1# the group
+    protein_groups.first.first.id.is 'big_guy'
   end
   it 'removes proteins accounted for only as little pieces of larger proteins' do
     @prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
-    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
+    protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
     # no subsumed_by_medium
-    reply.map(&:first).any? {|protein_list| protein_list.any? {|v| v.id == 'subsumed_by_medium' }}.is false
+    protein_groups.any? {|prot_group| prot_group.any? {|v| v.id == 'subsumed_by_medium' }}.is false
   end
   it 'allows alternate sorting algorithms for greediness' do
     @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
-    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
+    prot_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
       # deliberate using a counterintuitive sorting method to give little guys
       # a chance
       -prot_and_peptide_hits.last.size
@@ -61,7 +60,7 @@ describe 'creating minimal protein groups from peptide hits' do
     # to add to the mix.  This demonstrates how proteins can be weighted in
     # different ways based on their peptide hits.
     seen = []
-    reply.each {|pair| pair.first.each {|prot| seen << prot.id } }
+    prot_groups.each {|pg| pg.each {|prot| seen << prot.id } }
     # big guy is completely accounted for in the now prioritized little guy
     # and medium guys, etc.
     seen.sort.is @prot_hits_hash.keys[1..-1].sort

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 18
-  version: 0.0.18
+  - 19
+  version: 0.0.19
 platform: ruby
 authors:
 - John T. Prince
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-03-28 00:00:00 -06:00
+date: 2011-03-30 00:00:00 -06:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -97,7 +97,7 @@ dependencies:
         version: "0"
   type: :development
   version_requirements: *id006
-description: mspire library for working with mzIdentML and pepxml
+description: mspire library for working with mzIdentML, pepxml, and related.
 email: jtprince@gmail.com
 executables: []
@@ -116,6 +116,8 @@ files:
 - lib/ms/ident.rb
 - lib/ms/ident/peptide.rb
 - lib/ms/ident/peptide/db.rb
+- lib/ms/ident/peptide_hit.rb
+- lib/ms/ident/peptide_hit/qvalue.rb
 - lib/ms/ident/pepxml.rb
 - lib/ms/ident/pepxml/modifications.rb
 - lib/ms/ident/pepxml/msms_pipeline_analysis.rb
@@ -131,6 +133,8 @@ files:
 - lib/ms/ident/pepxml/search_summary.rb
 - lib/ms/ident/pepxml/spectrum_query.rb
 - lib/ms/ident/protein.rb
+- lib/ms/ident/protein_group.rb
+- lib/ms/ident/protein_hit.rb
 - lib/ms/ident/search.rb
 - schema/pepXML_v115.xsd
 - schema/pepXML_v19.xsd
@@ -138,7 +142,7 @@ files:
 - spec/ms/ident/pepxml/sample_enzyme_spec.rb
 - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
 - spec/ms/ident/pepxml_spec.rb
-- spec/ms/ident/protein_spec.rb
+- spec/ms/ident/protein_group_spec.rb
 - spec/spec_helper.rb
 - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta
 - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml
@@ -179,5 +183,5 @@ test_files:
 - spec/ms/ident/pepxml/sample_enzyme_spec.rb
 - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
 - spec/ms/ident/pepxml_spec.rb
-- spec/ms/ident/protein_spec.rb
+- spec/ms/ident/protein_group_spec.rb
 - spec/spec_helper.rb