RubyGems - ms-error_rate - Versions diffs - 0.0.9 → 0.0.10 - Mend

ms-error_rate 0.0.9 → 0.0.10

Files changed (27) hide show

data/.autotest +14 -0
data/.gitmodules +9 -0
data/History +16 -0
data/LICENSE +2 -0
data/Rakefile +52 -0
data/VERSION +1 -1
data/lib/ms/error_rate/decoy.rb +27 -0
data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
data/lib/ms/error_rate/qvalue.rb +93 -0
data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
data/lib/ms/error_rate/sbv.rb +111 -0
data/lib/ms/error_rate.rb +9 -0
data/lib/ms/ident.rb +125 -0
data/lib/support/sort_by_attributes.rb +51 -0
data/lib/transmembrane/phobius.rb +136 -0
data/lib/transmembrane/toppred.rb +368 -0
data/lib/transmembrane.rb +157 -0
data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
data/script/expert_addition.rb +26 -0
data/script/expert_list.rb +53 -0
data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
data/script/minimal_protein_set.rb +366 -0
data/script/unique_seq_stats.rb +72 -0
metadata +66 -14

data/.autotest ADDED Viewed

@@ -0,0 +1,14 @@
+# -*- ruby -*-
+# Clear all existing autotest mappings when autotest initializes.
+Autotest.add_hook :initialize do |at|
+ at.clear_mappings
+end
+# Map every lib/<path>.rb file to both spec/<path>_spec.rb and
+# test/<path>_test.rb (m[1] is the <path> captured by the regexp).
+Autotest.add_hook :initialize do |at|
+at.add_mapping(%r%^lib/(.*)\.rb$%) { |_, m|
+    #["spec/#{m[1]}_spec.rb"]
+    #["test/#{m[1]}_test.rb"]
+    ## for both specs and tests:
+    ["spec/#{m[1]}_spec.rb","test/#{m[1]}_test.rb"]
+}
+end

data/.gitmodules ADDED Viewed

@@ -0,0 +1,9 @@
+[submodule "submodule/ms-testdata"]
+	path = submodule/ms-testdata
+	url = git://github.com/bahuvrihi/ms-testdata.git
+[submodule "submodule/ms-in_silico"]
+	path = submodule/ms-in_silico
+	url = git://github.com/bahuvrihi/ms-in_silico.git
+[submodule "submodule/tap-mechanize"]
+	path = submodule/tap-mechanize
+	url = git://github.com/bahuvrihi/tap-mechanize.git

data/History ADDED Viewed

@@ -0,0 +1,16 @@
+== 0.0.6
+* changed peptide centric db output to full YAML (i.e., the protein IDs are in an inline array)
+== 0.0.3
+* switching to ms-template-ish structure
+== 0.0.2 / 2009-10-14
+* basic validation with peptide and protein centric sample bias validation.
+* peptide centric database created that includes methionine cleavage.
+== 0.0.1 / 2009-08-25
+* initial work - borrowing basic structure from ms-sequest and using original mspire lib/validators work.

data/LICENSE CHANGED Viewed

@@ -1,6 +1,8 @@
 Copyright shared among contributing institutions:
 Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
 Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
+Copyright (c) 2011 Brigham Young University (additions)
+Authored by John T. Prince
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

data/Rakefile ADDED Viewed

@@ -0,0 +1,52 @@
+# Rakefile for the ms-error_rate gem: declares the gem specification through
+# jeweler, a :spec test task (also the default task) and an rdoc task.
+require 'rubygems'
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "ms-error_rate"
+  gem.homepage = "http://github.com/jtprince/ms-error_rate"
+  gem.license = "MIT"
+  gem.summary = %Q{An mspire library for calculating error rates in MS/MS identifications (FDRs).}
+  gem.description = %Q{aids for creating and calculating error rates using target-decoy searches and sample validation.}
+  gem.email = "jtprince@gmail.com"
+  gem.authors = ["John Prince"]
+  # Include your dependencies below. Runtime dependencies are required when using your gem,
+  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
+  #  gem.add_development_dependency 'rspec', '> 1.2.3'
+  gem.rubyforge_project = 'mspire'
+  gem.add_runtime_dependency("ms-core", ">= 0.0.2")
+  gem.add_runtime_dependency("ms-ident", ">= 0.0.20")
+  gem.add_development_dependency "spec-more", ">= 0"
+  gem.add_development_dependency "jeweler", "~> 1.5.2"
+  gem.add_development_dependency "rcov", ">= 0"
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+# `rake spec` runs every file matching spec/**/*_spec.rb with lib and spec
+# added to the load path.
+Rake::TestTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.verbose = true
+end
+# (rcov coverage task is currently disabled)
+#require 'rcov/rcovtask'
+#Rcov::RcovTask.new do |spec|
+#  spec.libs << 'spec'
+#  spec.pattern = 'spec/**/*_spec.rb'
+#  spec.verbose = true
+#end
+task :default => :spec
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  # the rdoc title carries the contents of the VERSION file when it exists
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "ms-error_rate #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.9
1	+ 0.0.10

data/lib/ms/error_rate/decoy.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module Ms
+  module ErrorRate
+    module Decoy
+      # Estimates precision: the fraction of target hits that are true
+      # positives, where the number of false target hits is estimated as
+      # (num_decoy * frit).
+      #
+      # num_target:: number of target hits
+      # num_decoy:: number of decoy hits
+      # frit:: fraction multiplied onto num_decoy before it is subtracted
+      #        from num_target (default 1.0)
+      #
+      # Returns (num_target - num_decoy * frit) / num_target as a Float.
+      # When num_target is zero, returns 0.0 if there are any decoys and 1.0
+      # if there are none.
+      def self.precision(num_target, num_decoy, frit=1.0)
+        # will calculate as floats in case fractional amounts passed in for
+        # whatever reason
+        num_target_f = num_target.to_f
+        num_true_pos = num_target_f - (num_decoy.to_f * frit)
+        precision =
+          if num_target_f == 0.0
+            # no targets: avoid dividing by zero
+            if num_decoy.to_f > 0.0
+              0.0
+            else
+              1.0
+            end
+          else
+            num_true_pos/num_target_f
+          end
+        precision
+      end
+    end
+  end
+end

data/lib/ms/error_rate/qvalue/mascot/percolator.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'ms/mascot/dat'
+require 'ms/error_rate/qvalue'
+require 'ms/error_rate/qvalue/mascot'
+module Ms
+  module ErrorRate
+    module Qvalue
+      module Mascot
+        module Percolator
+          module_function
+          # returns an array of Structs where the keys are the first line
+          # everything is cast properly
+          # three additional keys are available query_num, rank, sequence
+          # sequence is the amino acid sequence without the surrounding X's
+          # and dots.
+          # (with '-' substituted for '_')
+          def tab_txt(file)
+            hits = []
+            File.open(file) do |io|
+              # PSMId	score	q-value	posterior_error_prob	peptide	proteinIds
+              atts = io.gets.chomp.split("\t").map {|v| v.gsub('-', '_').to_sym }
+              atts.push(:query_num, :rank, :sequence)
+              struct_class = Struct.new("Hit", *atts)
+              io.each do |line|
+                (query_rank, score, qvalue, perrp, peptide, *prots ) = line.chomp.split("\t")
+                (query, rank) = query_rank.split(';').map {|v| v.split(':').last.to_i }
+                hits << struct_class.new(query_rank, score.to_f, qvalue.to_f, perrp.to_f, peptide, prots, query, rank, peptide.split('.')[1])
+              end
+            end
+            hits
+          end
+        end
+      end
+    end
+  end
+end
+module Ms::ErrorRate::Qvalue::Mascot::Percolator
+  module_function
+  # returns an array of Structs of PeptideHit(:filename, :query_title, :charge, :sequence, :mowse, :qvalue)
+  # datp_files and tab_txt_files are parallel arrays: each dat file is paired
+  # (by position) with the percolator tab text file holding its q-values.
+  # only hits whose [query_num, hit_num] has a q-value in the paired tab text
+  # file are kept, and only the best hit (highest mowse) per query title is
+  # returned.
+  # opts =
+  #   :min_peptide_length => Integer
+  def qvalues(datp_files, tab_txt_files, opts={})
+    min_pep_len = opts[:min_peptide_length]
+    # we only want the top hit per query title (which should ensure that we
+    # get the top hit per scan)
+    hits_by_query_title = Hash.new {|h,k| h[k] = [] }
+    datp_files.zip(tab_txt_files) do |datp_file, tab_txt_file|
+      # build a hash of q-values keyed by [query_num, rank]
+      structs = Ms::ErrorRate::Qvalue::Mascot::Percolator.tab_txt( tab_txt_file )
+      qvalue_by_query_rank = {}
+      structs.each do |struct|
+        qvalue_by_query_rank[[struct.query_num, struct.rank]] = struct.q_value
+      end
+      base_no_ext = File.basename(datp_file, '.*')
+      Ms::Mascot::Dat.open(datp_file) do |dat|
+        dat.each_peptide_hit(:by => :groups, :yield_nil => false, :with_query => true) do |hits,query|
+          hits.each do |hit|
+            # hits with no q-value in the tab text file are skipped
+            if qval = qvalue_by_query_rank[[hit.query_num, hit.hit_num]]
+              hit_as_struct = Ms::ErrorRate::Qvalue::Mascot::MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score, qval)
+              hits_by_query_title[hit_as_struct.query_title] << hit_as_struct
+            end
+          end
+        end
+      end
+    end
+    final_hits = []
+    hits_by_query_title.each do |title, hits|
+      # the best hit is the one with the highest mowse score
+      best_hit =
+        if hits.size == 1
+          hits.first
+        else
+          hits.sort_by(&:mowse).last
+        end
+      # FILTER HERE:
+      # ONLY TAKE the BEST HIT IF it passes any filters
+      if min_pep_len
+        next unless best_hit.sequence.size >= min_pep_len
+      end
+      final_hits << best_hit
+    end
+    final_hits
+  end
+end

data/lib/ms/error_rate/qvalue/mascot.rb ADDED Viewed

@@ -0,0 +1,68 @@
+require 'ms/error_rate/qvalue'
+require 'ms/mascot/dat'
+module Ms
+  module ErrorRate
+    module Qvalue
+      module Mascot
+      end
+    end
+  end
+end
+# q-values for Mascot searches, computed from separate target and decoy dat
+# files.
+module Ms::ErrorRate::Qvalue::Mascot
+  # member names (in order) of a MascotPeptideHit
+  MEMBERS = [:filename, :query_title, :charge, :sequence, :mowse, :qvalue]
+  MascotPeptideHit = Struct.new(*MEMBERS) do
+    # emits an array rather than a Struct object
+    def to_yaml(*args)
+      to_a.to_yaml(*args)
+    end
+  end
+  module_function
+  # returns an array of MascotPeptideHit structs (:filename, :query_title,
+  # :charge, :sequence, :mowse, :qvalue) with the qvalue filled in.
+  # target_files and decoy_files are each opened with Ms::Mascot::Dat.open;
+  # for each set, only the best hit (highest mowse) per query title is kept.
+  # opts =
+  #   :min_peptide_length => Integer
+  # opts is also passed on to Ms::ErrorRate::Qvalue.target_decoy_qvalues
+  def qvalues(target_files, decoy_files, opts={})
+    min_pep_len = opts[:min_peptide_length]
+    # we only want the top hit per query title (which should ensure that we
+    # get the top hit per scan)
+    (target_hits, decoy_hits) = [target_files, decoy_files].map do |files|
+      hits_by_query_title = Hash.new {|h,k| h[k] = [] }
+      files.each do |file|
+        base_no_ext = File.basename(file, '.*')
+        Ms::Mascot::Dat.open(file) do |dat|
+          dat.each_peptide_hit(:by => :top, :yield_nil => false, :with_query => true) do |hit,query|
+            # qvalue (the sixth member) is left nil here and assigned below
+            hit_as_struct = MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score)
+            hits_by_query_title[hit_as_struct.query_title] << hit_as_struct
+          end
+        end
+      end
+      final_hits = []
+      hits_by_query_title.each do |title, hits|
+        # the best hit is the one with the highest mowse score
+        best_hit =
+          if hits.size == 1
+            hits.first
+          else
+            hits.sort_by(&:mowse).last
+          end
+        # FILTER HERE:
+        # ONLY TAKE the BEST HIT IF it passes any filters
+        if min_pep_len
+          next unless best_hit.sequence.size >= min_pep_len
+        end
+        final_hits << best_hit
+      end
+      final_hits
+    end
+    # hits are ranked by mowse score
+    pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(target_hits, decoy_hits, opts, &:mowse)
+    # store each q-value on its hit and return just the hits
+    pairs.map do |hit, qval|
+      hit.qvalue = qval
+      hit
+    end
+  end
+end

data/lib/ms/error_rate/qvalue/pepxml.rb ADDED Viewed

@@ -0,0 +1,52 @@
+require 'ms/error_rate/qvalue'
+module Ms ; end
+module Ms::ErrorRate ; end
+module Ms::ErrorRate::Qvalue ; end
+# q-values computed from a pair of target and decoy pepxml files.
+# NOTE(review): Nokogiri is used below but is not required in this file --
+# confirm that it is loaded elsewhere before this code runs.
+module Ms::ErrorRate::Qvalue::Pepxml
+  module_function
+  # returns an array of hit and qvalue pairs (the return value of
+  # Ms::ErrorRate::Qvalue.target_decoy_qvalues).
+  # retrieves the aaseq, charge, and all search_score keys and values for use
+  # in the search_hit.  caller must provide a sort_by block, where the best
+  # hits are last.  charge is an integer, and all other search scores are cast
+  # as floats.
+  # opt = :z_together (the only option passed on to target_decoy_qvalues)
+  def target_decoy_qvalues(target_pepxml, decoy_pepxml, opt={}, &sort_by)
+    # this is a list of high quality peptide hits associated with each group
+    fields = [:aaseq, :charge]
+    ss_names = []
+    # search_score names are collected only from the first search_hit parsed
+    have_ss_names = false
+    (target_hits, decoy_hits) = [target_pepxml, decoy_pepxml].map do |file|
+      # begin with aaseq, charge
+      File.open(file) do |io|
+        doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS)
+        # we can work with namespaces, or just remove them ...
+        doc.remove_namespaces!
+        root = doc.root
+        search_hits = root.xpath('//search_hit')
+        search_hits.map do |search_hit|
+          aaseq = search_hit['peptide']
+          # assumed_charge is read from the search_hit's grandparent node
+          charge = search_hit.parent.parent['assumed_charge'].to_i
+          search_score_nodes = search_hit.children.select {|node| node.name == 'search_score' }
+          ss_values = []
+          search_score_nodes.each do |node|
+            ss_names << node['name'].to_sym unless have_ss_names
+            ss_values << node['value'].to_f
+          end
+          have_ss_names = true
+          [aaseq, charge, *ss_values]
+        end
+      end
+    end
+    # struct members: aaseq, charge, then one member per search_score name
+    fields.push(*ss_names)
+    peptide_hit_class = Struct.new(*fields)
+    (t_hits, d_hits) = [target_hits, decoy_hits].map {|hits| hits.map {|hit_values| peptide_hit_class.new(*hit_values) } }
+    # hit and qvalue pairs
+    Ms::ErrorRate::Qvalue.target_decoy_qvalues(t_hits, d_hits, :z_together => opt[:z_together], &sort_by)
+  end
+end

data/lib/ms/error_rate/qvalue.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'set'
+require 'ms/error_rate/decoy'
+# Backfills Array#group_by: the method is defined only when arrays do not
+# already respond to :group_by.
+class Array
+  # returns a hash mapping each value returned by the block to the array of
+  # elements (in their original order) for which the block returned it
+  def group_by(&block)
+    hash = Hash.new {|h,k| h[k] = [] }
+    each do |v|
+      hash[block.call(v)] << v
+    end
+    hash
+  end unless [].respond_to?(:group_by)
+end
+module Ms
+  module ErrorRate
+    # For generating and working with q-value calculations.  The q-value is the global false discovery rate when accepting that particular ID.  We do not necessarily distinguish here between *how* the FDR is generated (i.e., Storey's pFDR "the occurrence of false positives" vs. Benjamini-Hochberg's FDR "the rate of false positives" [except to prefer Storey when possible] ).  The main point is that we sort and threshold based on a global FDR.
+    module Qvalue
+      module_function
+      # returns an array of [target_hit, qvalue] pairs, ordered from best to
+      # worst hit (within each charge group when :z_together is set).
+      # opts = :z_together true/false (default false).  when false, all
+      # target and decoy hits are ranked as one list; when true, hits are
+      # grouped by hit.charge, each group is ranked separately and the
+      # resulting pairs are concatenated.
+      # the sort block should sort from worst to best
+      # by default, sorting is: {|hit| hit.score} if not provided
+      # options also passed through to mixed_target_decoy
+      def target_decoy_qvalues(target_hits, decoy_hits, opts={}, &sorting)
+        # a Symbol works here because it is applied with & (to_proc) below
+        sorting ||= :score
+        opts = {:z_together => false}.merge(opts)
+        # any hit not in this set is treated as a decoy by mixed_target_decoy
+        target_set = Set.new(target_hits)
+        # Proc.new doesn't do arity checking
+        hit_with_qvalue_pairs = Proc.new do |hits|
+          sorted_best_to_worst = (hits.sort_by(&sorting)).reverse
+          # NOTE: this assigns to the method's target_hits argument (the
+          # proc is a closure); target_set and all_together are already
+          # built by the time the proc is called
+          (target_hits, qvalues) = Ms::ErrorRate::Qvalue.mixed_target_decoy(sorted_best_to_worst, target_set, opts)
+          target_hits.zip(qvalues)
+        end
+        all_together = target_hits + decoy_hits
+        if !opts[:z_together]
+          hit_with_qvalue_pairs.call(all_together)
+        else
+          all_hits = []
+          by_charge = all_together.group_by(&:charge)
+          by_charge.each do |charge,hits|
+            all_hits.push(*(hit_with_qvalue_pairs.call(hits)))
+          end
+          all_hits
+        end
+      end
+      # returns [target_hits, qvalues] (parallel arrays sorted from best hit to
+      # worst hit).  expects an array-like object of hits sorted from best to worst
+      # hit with decoys interspersed and a target_setlike object that responds to
+      # :include? for the hit object.  assumes the hit is a decoy if not found
+      # in the target set!  the qvalue of a target hit is 1.0 minus the
+      # precision (Ms::ErrorRate::Decoy.precision) computed from the number of
+      # targets and decoys seen down to and including that hit.
+      # opts = :monotonic true/false (default true).  if monotonic is false,
+      # then the guarantee that qvalues be monotonically increasing is not
+      # respected.
+      def mixed_target_decoy(best_to_worst, target_setlike, opts={})
+        opts = {:monotonic => true}.merge(opts)
+        num_target = 0 ; num_decoy = 0
+        # NOTE(review): this local is never read; opts[:monotonic] is tested
+        # directly below
+        monotonic = opts[:monotonic]
+        target_hits = []
+        qvalues = []
+        best_to_worst.each do |hit|
+          if target_setlike.include?(hit)
+            num_target += 1
+            precision = Ms::ErrorRate::Decoy.precision(num_target, num_decoy)
+            target_hits << hit
+            qvalues << (1.0 - precision)
+          else
+            num_decoy += 1
+          end
+        end
+        if opts[:monotonic]
+          # walking from the worst hit to the best, replace each qvalue with
+          # the smallest qvalue seen so far, so that qvalues never decrease
+          # when read from best hit to worst hit
+          min_qvalue = qvalues.last
+          qvalues = qvalues.reverse.map do |val| # from worst to best score
+            if min_qvalue < val
+              min_qvalue
+            else
+              min_qvalue = val
+              val
+            end
+          end.reverse
+        end
+        [target_hits, qvalues]
+      end
+    end
+  end
+end

data/lib/ms/error_rate/sbv/peptide_based.rb ADDED Viewed

@@ -0,0 +1,30 @@
+require 'ms/error_rate/sbv'
+module Ms
+  module ErrorRate
+    class Sbv
+      # Constraints on aaseq attribute of peptides (the bare amino acid sequence)
+      # works by calculating amino acid frequencies in the fasta file used.
+      class PeptideBased
+        # Calls Ms::ErrorRate::Sbv.generate_hashes on pep_to_prot_file with
+        # type code "aa_min<min_num>", scoring each peptide 1 if it contains
+        # at least min_num occurrences of aa and 0 otherwise.  Returns
+        # whatever Ms::ErrorRate::Sbv.generate_hashes returns.
+        # NOTE(review): aa is presumably a single character -- the
+        # min_num > 1 branch compares each character of the peptide to aa,
+        # while the min_num == 1 branch does a substring test.
+        def self.generate_hashes(pep_to_prot_file, aa="C", min_num=1 )
+          Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, :type_code => "aa_min#{min_num}") do |pep|
+            if min_num == 1
+              # a single occurrence only needs a containment check
+              if pep.include?(aa) ; 1
+              else ; 0
+              end
+            else
+              # count the occurrences of aa in the peptide
+              count = 0
+              pep.each_char {|c| count += 1 if c == aa }
+              if count >= min_num ; 1
+              else ; 0
+              end
+            end
+          end
+        end
+      end # class
+    end # Sbv
+  end # ER
+end # Ms

data/lib/ms/error_rate/sbv/protein_based.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require 'ms/fasta'
+require 'ms/error_rate/sbv'
+require 'transmembrane'
+module Ms
+  module ErrorRate
+    class Sbv
+      module ProteinBased
+        DEFAULT_NO_PROTS_VAL = 0.0
+        # note the pep to prot hash has proteins in a string separated by a
+        # hyphen.  returns the names of the files written
+        def self.generate_hashes(pep_to_prot_file, protid_to_val, options={})
+          options[:protein_hash] = protid_to_val
+          options[:type_code] = 'tm' unless options[:type_code]
+          files = Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, options) do |prot_return_vals|
+            total_with_bias = 0
+            total_known = 0
+            prot_return_vals.each do |val|
+              if !val.nil?
+                total_with_bias += val
+                total_known += 1
+              end
+            end
+            if total_known == 0
+              DEFAULT_NO_PROTS_VAL
+            else
+              total_with_bias.to_f / total_known
+            end
+          end #block
+          files
+        end # end method
+      end # module
+    end # class
+  end # ErrorRate
+end # Ms

data/lib/ms/error_rate/sbv.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module Ms
+  module ErrorRate
+    # Sample Bias Validator
+    class Sbv
+      LENGTH_EXT = 'freq_by_length'
+      AASEQ_EXT = 'by_aaseq'
+      # if a protein hash is given, will yield the return an array of
+      # values generated with the value from keying each protein of the
+      # peptide.  Otherwise, will yield each peptide in turn
+      def self.generate_hashes(pep_to_prot_file, opts={})
+        op = { :aaseq_ext => AASEQ_EXT,
+          :length_ext => LENGTH_EXT,
+          :file_ext => '.yml',
+          :type_code => '',
+          :protein_hash => nil,
+          :stderr_counter => true,
+        }.merge(opts)
+        base = pep_to_prot_file.chomp(File.extname(pep_to_prot_file))
+        freqs = Hash.new {|h,k| h[k] = 0.0 }
+        counts = Hash.new {|h,k| h[k] = 0 }
+        (fileout1, fileout2) = [:aaseq_ext, :length_ext].map do |type_ext|
+          base + '.' + op[:type_code] + '.' + op[type_ext] + op[:file_ext]
+        end
+        protein_hash = op[:protein_hash]
+        pep_count = 0
+        if op[:stderr_counter]
+          $stderr.print "[working, 100,000 peptides = '.'] "
+          $stderr.flush
+        end
+        File.open(fileout1 , 'w') do |out|
+          IO.foreach(pep_to_prot_file) do |line|
+            (pep, prot_string) = line.chomp!.split(': ')
+            total_transmembrane = 0
+            total_known = 0
+            answ =
+              if protein_hash
+                yield( protein_hash.values_at(*(prot_string.split('-'))) )
+              else
+                yield(pep)
+              end
+            out.puts "#{pep}: #{answ}"
+            freqs[pep.size] += answ
+            counts[pep.size] += 1
+            pep_count += 1
+            if pep_count % 100000 == 0 && op[:stderr_counter]
+              $stderr.print '.'
+              $stderr.flush
+            end
+          end
+        end
+        $stderr.puts "DONE!" if op[:stderr_counter]
+        avg_freq_ar = {}
+        freqs.each do |k,v|
+          avg_freq_ar[k] = v / counts[k]
+        end
+        File.open(fileout2, 'w') {|out| out.print avg_freq_ar.to_yaml }
+        [fileout1, fileout2]
+      end
+      # a hash by aaseq giving a value between 0 and 1 telling how much of
+      # an indicator the hit is
+      attr_accessor :indicator_by_aaseq
+      attr_accessor :frequency_indicator_opposite
+      attr_accessor :size_to_freq
+      # boolean
+      attr_accessor :indicators_signify_true_hit
+      def initialize(indicator_by_aaseq_hash, size_to_freq, frequency_indicator_opposite, indicators_signify_true_hit=true)
+        @indicators_signify_true_hit = indicators_signify_true_hit
+        @frequency_indicator_opposite = frequency_indicator_opposite
+        @indicator_by_aaseq = indicator_by_aaseq_hash
+        @tot_num_indicators = 0.0
+        @tot_num = 0
+      end
+      # returns the cumulative precision (fraction of true positives among
+      # total hits) frequency_of_indicators is the probability that a generic
+      # amino acid sequence will be an indicator (this may variable by
+      # sequence length).
+      def update_precision(aaseq)
+        @tot_num_indicators << indicator_by_aaseq[aaseq]
+        @tot_num += 1
+        @frequency_of_indicators_sum += @size_to_freq[aaseq.size]
+        # FP Indicator
+        value = @tot_num_indicators * (1.0 - @frequency_indicator_opposite) * @frequency_of_indicators_sum / (@tot_num**2)
+        precision =
+          if @indicators_signify_true_hit
+            value  # a true indicator type (gives precision)
+          else  # false indicator type
+            1 - value  # 1 - fdr == precision
+          end
+      end
+      # returns @aaseq_to_fraction.
+      # NOTE(review): nothing visible in this class assigns
+      # @aaseq_to_fraction, so this appears to always return nil -- confirm.
+      def calculate_background_frequency
+        @aaseq_to_fraction
+      end
+    end
+  end
+end

data/lib/ms/error_rate.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'ms/error_rate/sbv'
+require 'ms/error_rate/decoy'
+# Namespace for the error rate libraries.  Requiring this file loads
+# ms/error_rate/sbv and ms/error_rate/decoy.
+module Ms
+  module ErrorRate
+  end
+end