RubyGems - mspire - Versions diffs - 0.5.0 → 0.6.1 - Mend

mspire 0.5.0 → 0.6.1

Files changed (107) hide show

data/README.rdoc +24 -0
data/Rakefile +51 -0
data/VERSION +1 -0
data/lib/cv/description.rb +18 -0
data/lib/cv/param.rb +33 -0
data/lib/cv.rb +3 -0
data/lib/io/bookmark.rb +13 -0
data/lib/merge.rb +7 -0
data/lib/ms/cvlist.rb +76 -0
data/lib/ms/digester.rb +245 -0
data/lib/ms/fasta.rb +86 -0
data/lib/ms/ident/peptide/db.rb +243 -0
data/lib/ms/ident/peptide.rb +72 -0
data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
data/lib/ms/ident/peptide_hit.rb +26 -0
data/lib/ms/ident/pepxml/modifications.rb +83 -0
data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
data/lib/ms/ident/pepxml/parameters.rb +14 -0
data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
data/lib/ms/ident/pepxml/search_database.rb +49 -0
data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
data/lib/ms/ident/pepxml/search_hit.rb +144 -0
data/lib/ms/ident/pepxml/search_result.rb +35 -0
data/lib/ms/ident/pepxml/search_summary.rb +92 -0
data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
data/lib/ms/ident/pepxml.rb +112 -0
data/lib/ms/ident/protein.rb +33 -0
data/lib/ms/ident/protein_group.rb +80 -0
data/lib/ms/ident/search.rb +114 -0
data/lib/ms/ident.rb +37 -0
data/lib/ms/isotope/aa.rb +59 -0
data/lib/ms/mascot.rb +6 -0
data/lib/ms/mass/aa.rb +79 -0
data/lib/ms/mass.rb +55 -0
data/lib/ms/mzml/index_list.rb +98 -0
data/lib/ms/mzml/plms1.rb +34 -0
data/lib/ms/mzml.rb +197 -0
data/lib/ms/obo.rb +38 -0
data/lib/ms/plms1.rb +156 -0
data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
data/lib/ms/quant/qspec.rb +112 -0
data/lib/ms/spectrum.rb +154 -8
data/lib/ms.rb +3 -10
data/lib/msplat.rb +2 -0
data/lib/obo/ims.rb +5 -0
data/lib/obo/ms.rb +7 -0
data/lib/obo/ontology.rb +41 -0
data/lib/obo/unit.rb +5 -0
data/lib/openany.rb +23 -0
data/lib/write_file_or_string.rb +18 -0
data/obo/ims.obo +562 -0
data/obo/ms.obo +11677 -0
data/obo/unit.obo +2563 -0
data/spec/ms/cvlist_spec.rb +60 -0
data/spec/ms/digester_spec.rb +351 -0
data/spec/ms/fasta_spec.rb +100 -0
data/spec/ms/ident/peptide/db_spec.rb +108 -0
data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
data/spec/ms/ident/pepxml_spec.rb +442 -0
data/spec/ms/ident/protein_group_spec.rb +68 -0
data/spec/ms/mass_spec.rb +8 -0
data/spec/ms/mzml/index_list_spec.rb +122 -0
data/spec/ms/mzml/plms1_spec.rb +62 -0
data/spec/ms/mzml_spec.rb +50 -0
data/spec/ms/plms1_spec.rb +38 -0
data/spec/ms/quant/qspec_spec.rb +25 -0
data/spec/msplat_spec.rb +24 -0
data/spec/obo_spec.rb +25 -0
data/spec/spec_helper.rb +25 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
data/spec/testfiles/plms1/output.key +0 -0
metadata +157 -40
data/README +0 -77
data/changelog.txt +0 -196
data/lib/ms/calc.rb +0 -32
data/lib/ms/data/interleaved.rb +0 -60
data/lib/ms/data/lazy_io.rb +0 -73
data/lib/ms/data/lazy_string.rb +0 -15
data/lib/ms/data/simple.rb +0 -59
data/lib/ms/data/transposed.rb +0 -41
data/lib/ms/data.rb +0 -57
data/lib/ms/format/format_error.rb +0 -12
data/lib/ms/support/binary_search.rb +0 -126

data/lib/ms/ident/pepxml.rb ADDED Viewed

@@ -0,0 +1,112 @@
+require 'nokogiri'
+require 'ms/ident'
+require 'ms/ident/pepxml/msms_pipeline_analysis'
+require 'ostruct'
+module MS ; module Ident ; end ; end
+class Numeric
+  # returns a string with a + or - on the front
+  def to_plus_minus_string
+    if self >= 0
+      '+' << self.to_s
+    else
+      self.to_s
+    end
+  end
+end
+class MS::Ident::Pepxml
+  XML_STYLESHEET_LOCATION = '/tools/bin/TPP/tpp/schema/pepXML_std.xsl'
+  DEFAULT_PEPXML_VERSION = MsmsPipelineAnalysis::PEPXML_VERSION
+  XML_ENCODING = 'UTF-8'
+  attr_accessor :msms_pipeline_analysis
+  # returns an array of MS::Ident::Pepxml::SearchHit::Simple structs
+  def self.simple_search_hits(file)
+    hit_values = File.open(file) do |io|
+      doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
+      # we can work with namespaces, or just remove them ...
+      doc.remove_namespaces!
+      root = doc.root
+      search_hits = root.xpath('//search_hit')
+      search_hits.each_with_index.map do |search_hit,i|
+        aaseq = search_hit['peptide']
+        charge = search_hit.parent.parent['assumed_charge'].to_i
+        search_score_nodes = search_hit.children.select {|node| node.name == 'search_score' }
+        search_scores = {}
+        search_score_nodes.each do |node|
+          search_scores[node['name'].to_sym] = node['value'].to_f
+        end
+        MS::Ident::Pepxml::SearchHit::Simple.new("hit_#{i}", MS::Ident::Search.new(file.chomp(File.extname(file))), aaseq, charge, search_scores)
+      end
+    end
+  end
+  # Delegates to msms_pipeline_analysis.pepxml_version.
+  def pepxml_version
+    msms_pipeline_analysis.pepxml_version
+  end
+  # returns an array of spectrum queries (delegates to
+  # msms_pipeline_analysis.msms_run_summary.spectrum_queries)
+  def spectrum_queries
+    msms_pipeline_analysis.msms_run_summary.spectrum_queries
+  end
+  # yields a new Msms_Pipeline_Analysis object if given a block
+  def initialize(&block)
+    block.call(@msms_pipeline_analysis=MsmsPipelineAnalysis.new) if block
+  end
+  # takes an xml document object and sets it with the xml stylesheet
+  def add_stylesheet(doc, location)
+    xml_stylesheet = Nokogiri::XML::ProcessingInstruction.new(doc, "xml-stylesheet", %Q{type="text/xsl" href="#{location}"})
+    doc.root.add_previous_sibling  xml_stylesheet
+    doc
+  end
+  # if no options are given, an xml string is returned.  If either :outdir or
+  # :outfile is given, the xml is written to file and the output filename is returned.
+  # A single string argument will be interpreted as :outfile if it ends in
+  # '.xml' and the :outdir otherwise.  In this case, update_summary_xml is still true
+  #
+  # options:
+  #
+  #     arg                    default
+  #     :outdir             => nil   write to disk using this outdir with summary_xml basename
+  #     :outfile            => nil   write to this filename (overrides outdir)
+  #     :update_summary_xml => true  update summary_xml attribute to point to the output file true/false
+  #
+  # set outdir to
+  # File.dirname(pepxml_obj.msms_pipeline_analysis.msms_run_summary.base_name)
+  # to write to the same directory as the input search file.
+  def to_xml(opts={})
+    opts ||= {}
+    if opts.is_a?(String)
+      opts = ( opts.match(/\.xml$/) ?  {:outfile => opts} : {:outdir => opts } )
+    end
+    opt = {:update_summary_xml => true, :outdir => nil, :outfile => nil}.merge(opts)
+    if opt[:outfile]
+      outfile = opt[:outfile]
+    elsif opt[:outdir]
+      outfile = File.join(opt[:outdir], msms_pipeline_analysis.summary_xml.split(/[\/\\]/).last)
+    end
+    self.msms_pipeline_analysis.summary_xml = File.expand_path(outfile) if (opt[:update_summary_xml] && outfile)
+    builder = Nokogiri::XML::Builder.new(:encoding => XML_ENCODING)
+    msms_pipeline_analysis.to_xml(builder)
+    add_stylesheet(builder.doc, MS::Ident::Pepxml::XML_STYLESHEET_LOCATION)
+    string = builder.doc.to_xml
+    if outfile
+      File.open(outfile,'w') {|out| out.print(string) }
+      outfile
+    else
+      string
+    end
+  end
+end

data/lib/ms/ident/protein.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require 'andand'
+module MS ; end
+module MS::Ident
+  module ProteinLike
+    # an id for the protein
+    attr_accessor :id
+    # the protein sequence
+    attr_accessor :sequence
+    alias_method :seq, :sequence
+    alias_method :seq=, :sequence=
+      # a description of the protein
+      attr_accessor :description
+    # if the GN=([^\s]+) regexp is found in the description, returns the first
+    # match, or nil if not found
+    def gene_id
+      description.andand.match(/ GN=(\w+) ?/)[1]
+    end
+  end
+  # a generic protein class that is ProteinLike
+  class Protein
+    include ProteinLike
+    def initialize(id=nil, sequence=nil)
+      (@id, @sequence) = id, sequence
+    end
+  end
+end

data/lib/ms/ident/protein_group.rb ADDED Viewed

@@ -0,0 +1,80 @@
+require 'set'
+module MS
+  module Ident
+    # represents a group of proteins, typically indistinguishable in the
+    # experiment.
+    class ProteinGroup < Array
+      attr_accessor :peptide_hits
+      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
+        peptide_hits = protein_group_and_peptide_hits.last
+        num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
+        num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
+        [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
+      end
+      # greedy algorithm to map a set of peptide_hits to protein groups.  each
+      # peptide hit should respond to :aaseq, :charge, :proteins.  If a block is
+      # given, it is yielded a single argument: a doublet of protein_group and
+      # peptide set.  It expects a metric or array to sort by for creating greedy
+      # protein groups (the greediest proteins should sort to the back of the
+      # array).  If no block is given, the groups are sorted by [# uniq aaseqs,
+      # # uniq aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS).  Sets of
+      # peptide_hits and the objects returned by peptide_hit#proteins are used as
+      # hash keys.  As long as each peptide hit has a unique signature (like an
+      # id) then any object will work.  If they are Struct objects, you might
+      # consider redefining the #hash method to be object_id for performance and
+      # accuracy.
+      #
+      # returns an array of ProteinGroup objects, each set with :peptide_hits
+      #
+      # If update_peptide_hits is true, :protein_groups= is called on each
+      # peptide_hit of every kept group, with that group as the argument.  A
+      # symbol can also be passed in, and that method will be called instead.
+      # NOTE(review): the method is sent once per (group, hit) pair with a single
+      # group, so a plain attribute writer would end up holding only the last
+      # group rather than an array of groups -- confirm what receivers expect.
+      def self.peptide_hits_to_protein_groups(peptide_hits, update_peptide_hits=false, &sort_by)
+        update_peptide_hits = 'protein_groups='.to_sym if (update_peptide_hits==true)
+        sort_by ||= PRIORITIZE_PROTEINS
+        # note to self: I wrote this in 2011, so I think I know what I'm doing now
+        # step 1: protein => Set of the peptide hits that list it
+        protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
+        peptide_hits.each do |peptide_hit|
+          peptide_hit.proteins.each do |protein|
+            protein_to_peptides[protein] << peptide_hit
+          end
+        end
+        # step 2: proteins with an identical peptide set are collected together
+        peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
+        protein_to_peptides.each do |protein, peptide_set|
+          peptides_to_protein_group[peptide_set] << protein
+        end
+        # step 3: replace each array of proteins with a ProteinGroup that knows
+        # its peptide hits (only values of existing keys are reassigned)
+        peptides_to_protein_group.each do |pephits,ar_of_prots|
+          pg = MS::Ident::ProteinGroup.new(ar_of_prots)
+          pg.peptide_hits = pephits
+          peptides_to_protein_group[pephits] = pg
+        end
+        protein_group_to_peptides = peptides_to_protein_group.invert
+        # sort ascending by the metric, then reverse so the greediest come first
+        greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
+        accounted_for = Set.new
+        # we are discarding the subsumed sets, but we could get them with
+        # partition
+        # a group is kept only if it contributes at least one peptide hit that
+        # no earlier (greedier) group has already claimed
+        greedy_first.select! do |group, peptide_set|
+          has_an_unaccounted_peptide = false
+          peptide_set.each do |peptide_hit|
+            unless accounted_for.include?(peptide_hit)
+              has_an_unaccounted_peptide = true
+              accounted_for.add(peptide_hit)
+            end
+          end
+          group.peptide_hits = peptide_set if has_an_unaccounted_peptide
+          has_an_unaccounted_peptide
+        end
+        if update_peptide_hits
+          greedy_first.each {|pg, pephits| pephits.each {|hit| hit.send(update_peptide_hits, pg) } }
+        end
+        # drop the peptide sets; callers reach them through group.peptide_hits
+        greedy_first.map(&:first)
+      end
+    end
+  end
+end

data/lib/ms/ident/search.rb ADDED Viewed

@@ -0,0 +1,114 @@
+module MS
+  module Ident
+    # Mixin giving an object an id and a list of peptide_hits (aliased as hits).
+    module SearchLike
+      attr_accessor :id
+      attr_accessor :peptide_hits
+      alias_method :hits, :peptide_hits
+      alias_method :hits=, :peptide_hits=
+      # The comment below documents merge!, which is currently disabled
+      # (commented out).
+      #
+      # returns an array of peptide_hits and protein_hits that are linked to
+      # one another.  NOTE: this will update peptide and protein
+      # hits :proteins and :peptide_hits attributes respectively.  Assumes that each search
+      # responds to :peptide_hits, each peptide responds to :proteins and each protein to
+      # :peptide_hits.  Can be done on a single file to restore protein/peptide
+      # linkages to their original single-file state.
+      # Assumes the protein is initialized with (reference, peptide_ar)
+      #
+      # yields the protein that will become the template for a new protein
+      # and expects a new protein hit
+      #def merge!(ar_of_peptide_hit_arrays)
+      #  all_peptide_hits = []
+      #  reference_hash = {}
+      #  ar_of_peptide_hit_arrays.each do |peptide_hits|
+      #    all_peptide_hits.push(*peptide_hits)
+      #    peptide_hits.each do |peptide|
+      #      peptide.proteins.each do |protein|
+      #        id = protein.id
+      #        if reference_hash.key?(id)
+      #          reference_hash[id].peptide_hits << peptide
+      #          reference_hash[id]
+      #        else
+      #          reference_hash[id] = yield(protein, [peptide])
+      #        end
+      #      end
+      #    end
+      #  end
+      #  [all_peptide_hits, reference_hash.values]
+      #end
+    end
+    class Search
+      include SearchLike
+      def initialize(id=nil, peptide_hits=[])
+        @id = id
+        @peptide_hits = peptide_hits
+      end
+    end
+    module SearchGroup
+      # an array of search objects
+      attr_accessor :searches
+      # the group's file extension (with no leading period)
+      def extension
+        'grp'
+      end
+      def search_class
+        Search
+      end
+      # a simple formatted file with paths to the search files
+      def to_paths(file)
+        IO.readlines(file).grep(/\w/).reject {|v| v =~ /^#/}.map {|v| v.chomp }
+      end
+      def from_file(file)
+        from_filenames(to_paths(file))
+      end
+      def from_filenames(filenames)
+        filenames.each do |file|
+          if !File.exist? file
+            message = "File: #{file} does not exist!\n"
+            message << "perhaps you need to modify the file with file paths"
+            abort message
+          end
+          @searches << search_class.new(file)
+        end
+      end
+      # takes an array of filenames or a single search filename (with
+      # extension defined by 'extendsion') or an array of objects passes any
+      # arguments to the initializer for each search
+      # the optional block yields the object for further processing
+      def initialize(arg=nil, opts={})
+        @peptide_hits = []
+        @reference_hash = {}
+        @searches = []
+        if arg
+          if arg.is_a?(String) && arg =~ /\.#{Regexp.escap(extension)}$/
+            from_file(arg)
+          elsif arg.is_a?(Array) && arg.first.is_a?(String)
+            from_filenames(arg)
+          elsif arg.is_a?(Array)
+            @searches = array
+          else
+            raise ArgumentError, "must be file, array of filenames, or array of objs"
+          end
+          @searches << search_class.new(file, opts)
+        end
+        yield(self) if block_given?
+      end
+    end
+  end
+end

data/lib/ms/ident.rb ADDED Viewed

@@ -0,0 +1,37 @@
+require 'ms/ident/protein_group'
+require 'ms/ident/protein'
+require 'ms/ident/peptide_hit'
+module MS
+  # An MS::Ident::ProteinGroup is an array of proteins that responds to
+  # :peptide_hits.  All protein level identifications should be stored in a
+  # proteingroup object.
+  #
+  # An MS::Ident::Protein is an object representing a protein (:id,
+  # :sequence, :description).  Note, it is not a protein hit (use a
+  # ProteinGroup)
+  #
+  # An MS::Ident::PeptideHit is an object representing a match between an
+  # amino acid sequence and a spectrum.
+  #
+  # Typical usage:
+  #
+  #     require 'ms/ident'
+  #
+  #     hit1 = PeptideHit.new(:id => 1, :aaseq => 'PEPTIDE', :search =>
+  #     MS::Ident::Search.new, etc...)
+  #     peptide_hits = [hit1, hit2, ...]
+  #
+  #     protein_groups = MS::Ident::ProteinGroup.peptide_hits_to_protein_groups(peptide_hits)
+  #     protein_groups.first.peptide_hits  # => the peptide hits in that group
+  module Ident
+    # returns the filetype (if possible)
+    def self.filetype(file)
+      if file =~ /\.srf$/i
+        :srf
+      end
+    end
+  end
+end

data/lib/ms/isotope/aa.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module MS
+  module Isotope
+    module AA
+      ATOM_COUNTS_STR = {
+        'A' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'R' => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
+        'N' => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        'D' => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'C' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
+        'E' => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'Q' => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        'G' => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'H' => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
+        'I' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'L' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'K' => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        'M' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
+        'F' => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'P' => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'S' => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'T' => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'W' => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        'Y' => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'V' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        'U' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
+        'O' => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
+      }
+      ATOM_COUNTS_SYM = {
+        :A => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :R => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
+        :N => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        :D => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :C => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
+        :E => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :Q => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        :G => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :H => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
+        :I => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :L => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :K => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        :M => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
+        :F => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :P => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :S => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :T => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :W => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
+        :Y => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :V => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
+        :U => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
+        :O => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
+      }
+      # string and symbol access of amino acid (atoms are all lower case
+      # symbols).  ATOM_COUNTS_SYM already carries every symbol key, so the two
+      # tables are simply merged; the former per-key copy into ATOM_COUNTS ran
+      # before that constant existed and raised NameError at load time.
+      ATOM_COUNTS = ATOM_COUNTS_SYM.merge ATOM_COUNTS_STR
+    end
+  end
+end

data/lib/ms/mascot.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module MS
+  module Mascot
+    H_PLUS = 1.007276
+  end
+end

data/lib/ms/mass/aa.rb ADDED Viewed

@@ -0,0 +1,79 @@
+require 'ms/mass'
+module MS
+  module Mass
+    module AA
+      # amino_acids keys as strings, average masses
+      AVG_STRING = {
+        "*"=>118.88603,
+        "A"=>71.0779,
+        "B"=>172.1405,
+        "C"=>103.1429,
+        "D"=>115.0874,
+        "E"=>129.11398,
+        "F"=>147.17386,
+        "G"=>57.05132,
+        "H"=>137.13928,
+        "I"=>113.15764,
+        "K"=>128.17228,
+        "L"=>113.15764,
+        "M"=>131.19606,
+        "N"=>114.10264,
+        "O"=>211.28076,
+        "P"=>97.11518,
+        "Q"=>128.12922,
+        "R"=>156.18568,
+        "S"=>87.0773,
+        "T"=>101.10388,
+        "U"=>150.0379,
+        "V"=>99.13106,
+        "W"=>186.2099,
+        "X"=>118.88603,
+        "Y"=>163.17326,
+        "Z"=>128.6231
+      }
+      # amino_acids keys as strings, monoisotopic masses
+      MONO_STRING = {
+        "*"=>118.805716,
+        "A"=>71.0371137878,
+        "B"=>172.048405,
+        "C"=>103.0091844778,
+        "D"=>115.026943032,
+        "E"=>129.0425930962,
+        "F"=>147.0684139162,
+        "G"=>57.0214637236,
+        "H"=>137.0589118624,
+        "I"=>113.0840639804,
+        "K"=>128.0949630177,
+        "L"=>113.0840639804,
+        "M"=>131.0404846062,
+        "N"=>114.0429274472,
+        "O"=>211.1446528645,
+        "P"=>97.052763852,
+        "Q"=>128.0585775114,
+        "R"=>156.1011110281,
+        "S"=>87.0320284099,
+        "T"=>101.0476784741,
+        "U"=>150.9536355878,
+        "V"=>99.0684139162,
+        "W"=>186.0793129535,
+        "X"=>118.805716,
+        "Y"=>163.0633285383,
+        "Z"=>128.550585
+      }
+      # amino_acids keys as symbols, monoisotopic masses
+      MONO_SYM = Hash[MONO_STRING.map {|aa,mass| [aa.to_sym, mass] } ]
+      # amino_acids keys as symbols, average masses
+      AVG_SYM = Hash[AVG_STRING.map {|aa,mass| [aa.to_sym, mass] } ]
+      # Monoisotopic amino acid masses keyed as symbols and also strings (all
+      # upper case).  Also includes MS::Mass::MONO for things like protons ('h+')
+      MONO = MONO_SYM.merge(MONO_STRING).merge(MS::Mass::MONO)
+      # Average amino acid masses keyed as symbols and also strings (all
+      # upper case).  Also includes MS::Mass::AVG for things like protons ('h+')
+      AVG = AVG_SYM.merge(AVG_STRING).merge(MS::Mass::AVG)
+    end
+  end
+end

data/lib/ms/mass.rb ADDED Viewed

@@ -0,0 +1,55 @@
+module MS
+  module Mass
+    # takes a chemical formula in this format: C2BrH12O
+    def self.formula_to_exact_mass(formula)
+      # TODO: add other input methods
+      pairs = formula.scan(/([A-Z][a-z]?)(\d*)/).map do |match|
+        if match.last == ''
+          match[-1] = 1
+        end
+        [match[0], match[1].to_i]
+      end
+      pairs.map do |pair|
+        MONO[pair.first.downcase] * pair.last
+      end.reduce(:+)
+    end
+    H_PLUS = 1.00727646677
+    #  + http://www.unimod.org/masses.html
+    MONO_STR = {
+      'c' => 12.0,  # +
+      'br' => 78.9183361,  # +
+      'd' => 2.014101779,  # +
+      'f' => 18.99840322,  # +
+      'n' => 14.003074,  # +
+      'o' => 15.99491463,  # +
+      'na' => 22.9897677,  # +
+      'p' => 30.973762,  # +
+      's' => 31.9720707,  # +
+      'li' => 7.016003,  # +
+      'cl' => 34.96885272,  # +
+      'k' => 38.9637074,  # +
+      'si' => 27.9769265325, # http://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl?ele=Si&ascii=html&isotype=some
+      'i' => 126.904473,  # +
+      'h+' => 1.00727646677,
+      'h' => 1.007825035,  # +
+      'h2o' => 18.0105647,
+      'oh' => 17.002739665,
+    }
+    AVG_STR = {
+      'h+' => 1.007276, # using Mascot_H_plus mass (is this right for AVG??)
+      'h' => 1.00794,
+      'h2o' => 18.01528,
+      'oh' => 17.00734,
+    }
+    # sets MONO_SYM, MONO, AVG_SYM, and AVG (derived from MONO_STR and AVG_STR)
+    %w(MONO AVG).each do |type|
+      const_set "#{type}_SYM", Hash[ const_get("#{type}_STR").map {|k,v| [k.to_sym, v] } ]
+      const_set type, const_get("#{type}_STR").merge( const_get("#{type}_SYM") )
+    end
+  end
+end

data/lib/ms/mzml/index_list.rb ADDED Viewed

@@ -0,0 +1,98 @@
+module MS
+  class Mzml
+    # A simple array of indices but #[] has been overloaded to find an index
+    # by name
+    #
+    #     index_list[0]  # the first index
+    #     index_list.map(&:names) # -> [:spectrum, :chromatogram]
+    #     index_list[:spectrum]  # the spectrum index
+    #     index_list[:chromatogram]  # the chromatogram index
+    class IndexList < Array
+      alias_method :old_bracket_slice, :'[]'
+      # @param [Object] an Integer (index number) or a Symbol (:spectrum or
+      #   :chromatogram)
+      # @return [MS::Mzml::Index] an index object
+      def [](int_or_symbol)
+        if int_or_symbol.is_a?(Integer)
+          old_bracket_slice(int_or_symbol)
+        else
+          self.find {|index| index.name == int_or_symbol }
+        end
+      end
+    end
+    # the array holds start bytes
+    class Index < Array
+      class << self
+        # returns an Integer or nil if not found
+        # does a single jump backwards from the tail of the file looking for
+        # an xml element based on tag.  If it is not found, returns nil
+        def index_offset(io, tag='indexListOffset', bytes_backwards=200)
+          tag_re = %r{<#{tag}>([\-\d]+)</#{tag}>}
+            io.pos = (io.size - 1) - bytes_backwards
+          md = io.readlines("\n").map {|line| line.match(tag_re) }.compact.shift
+          md[1].to_i if md
+        end
+      end
+      # an index indexed by scan number
+      attr_accessor :by_scans
+      # the name of the index (as a symbol)
+      attr_accessor :name
+      # a parallel array of ids (idRef's)
+      attr_accessor :ids
+      def start_byte_and_id(int)
+        [self[int], ids[int]]
+      end
+      # returns hash of id to start_byte
+      def create_id_index
+        Hash[self.ids.zip(self)]
+      end
+      # @return [Integer] the start byte of the spectrum
+      # @param [Object] an Integer (the index number) or String (an id string)
+      def start_byte(arg)
+        case arg
+        when Integer
+          self[arg]
+        when String
+          @id_index ||= create_id_index
+          @id_index[arg]
+        end
+      end
+      # generates a scan to index hash that points from scan number to the
+      # spectrum index number.  returns the index, nil if the scan ids
+      # are not present and spectra are, or false if they are not unique.
+      def create_scan_to_index
+        scan_re = /scan=(\d+)/
+          scan_to_index = {}
+        ids.each_with_index do |id, index|
+          md = id.match(scan_re)
+          scan_num = md[1].to_i if md
+          if scan_num
+            if scan_to_index.key?(scan_num)
+              return false
+            else
+              scan_to_index[scan_num] = index
+            end
+          end
+        end
+        if scan_to_index.size > 0
+          by_scans = scan_to_index
+        elsif ids.size > 0
+          nil  # there are scans, but we did not find scan numbers
+        else
+          scan_to_index
+        end
+      end
+    end
+  end
+end