RubyGems - molecules - Versions diffs - 0.1.0 - Mend

molecules 0.1.0

Files changed (21) hide show

data/MIT-LICENSE +21 -0
data/README +73 -0
data/Rakefile +78 -0
data/lib/molecules.rb +4 -0
data/lib/molecules/calc.rb +127 -0
data/lib/molecules/empirical_formula.rb +325 -0
data/lib/molecules/libraries/polypeptide.rb +91 -0
data/lib/molecules/libraries/residue.rb +165 -0
data/lib/molecules/utils.rb +49 -0
data/tap.yml +0 -0
data/test/molecules/calc_test.rb +37 -0
data/test/molecules/empirical_formula_class_test.rb +196 -0
data/test/molecules/empirical_formula_test.rb +204 -0
data/test/molecules/libraries/polypeptide_test.rb +128 -0
data/test/molecules/libraries/residue_test.rb +289 -0
data/test/molecules/utils_test.rb +147 -0
data/test/molecules_test.rb +24 -0
data/test/molecules_test_helper.rb +31 -0
data/test/molecules_test_suite.rb +3 -0
data/test/tap_test_helper.rb +3 -0
metadata +82 -0

data/lib/molecules/libraries/polypeptide.rb ADDED

@@ -0,0 +1,91 @@
+require 'molecules/libraries/residue'
+module Molecules
+  module Libraries
+    # Represents a polypeptide as a sequence of residues.  For convenience,
+    # polypeptides may contain whitespace in their sequences (thus allowing
+    # direct use with parsed FASTA-formatted peptide sequences).
+    #
+    # Currently polypeptide only handles sequences with common residues.
+    class Polypeptide < EmpiricalFormula
+      class << self
+        # Returns a copy of sequence with all whitespace removed and the letters upcased.
+        def normalize(sequence)
+          sequence.gsub(/\s/, "").upcase
+        end
+      end
+      # The sequence of self exactly as given to new (including whitespace)
+      attr_reader :sequence
+      # A hash of (Residue, Integer) pairs giving the count of each residue found in self; residues that do not occur have no entry.
+      attr_reader :residue_composition
+      # The number of residues in self (may differ from sequence.length
+      # if sequence contains whitespace).
+      attr_reader :length
+      # An array of String#count patterns that may occur in a sequence: the
+      # first pattern covers the whitespace characters (space, tab, CR, LF)
+      # and is followed by one single-letter pattern per Residue.common entry,
+      # in the same order in which initialize iterates Residue.common.
+      SEQUENCE_TOKENS = ["\s\t\r\n"] + Residue.common.collect {|r| r.letter}
+      def initialize(sequence) # raises UnknownResidueError when sequence holds anything but whitespace and common residue letters
+        @sequence = sequence
+        @length = 0
+        @residue_composition = {}
+        @formula = Array.new(5, 0)
+        # one count per SEQUENCE_TOKENS entry: whitespace first, then each common residue
+        tokens = Utils.count(sequence, SEQUENCE_TOKENS)
+        whitespace = tokens.shift
+        if whitespace == sequence.length
+          # empty or whitespace-only sequence: as per the Base specification,
+          # factors should have no trailing zeros, so the formula is emptied
+          @formula.clear
+          return
+        end
+        # accumulate the residue counts and formula factors
+        Residue.common.each do |residue|
+          # benchmarks indicated that counting for each residue
+          # is quicker than trying anything like:
+          #
+          #   sequence.each_byte {|b| bytes[b] += 1}
+          #
+          # This is particularly an issue for long sequences.  The
+          # count operation could be optimized for isobaric residues
+          n = tokens.shift
+          next if n == 0
+          @length += n
+          @residue_composition[residue] = n
+          Utils.add(@formula, residue.formula, n)
+        end
+        if @length + whitespace != sequence.length
+          # characters that are neither whitespace nor a common residue letter were left uncounted
+          raise UnknownResidueError, "unknown characters in sequence: #{sequence}"
+        end
+      end
+      # Yields the Residue for each byte of sequence, in order; bytes with no entry in Residue.residue_index (e.g. whitespace) are skipped.
+      def each_residue
+        residues = Residue.residue_index
+        sequence.each_byte do |byte|
+          residue = residues[byte]
+          yield(residue) if residue
+        end
+      end
+      class UnknownResidueError < StandardError # :nodoc:
+      end
+    end
+  end
+end

data/lib/molecules/libraries/residue.rb ADDED

@@ -0,0 +1,165 @@
+require 'constants/library'
+require 'molecules/empirical_formula'
+module Molecules
+  module Libraries
+    # A library of amino acid residues.
+    #
+    #    r = Residue::A
+    #    r.name               # => "Alanine"
+    #    r.abbr               # => "Ala"
+    #    r.letter             # => "A"
+    #    r.side_chain.to_s    # => "CH(3)"
+    #
+    class Residue < EmpiricalFormula
+      class << self
+        # The 20 common amino acids.
+        def common
+          collection(:common)
+        end
+        # An array of the residues indexed by the byte
+        # corresponding to the residue letter.
+        def residue_index
+          collection(:residue_index)
+        end
+        # An array of the residue masses indexed by the byte
+        # corresponding to the residue letter.
+        def residue_mass_index
+          collection(:residue_mass_index)
+        end
+      end
+      # The full name of self
+      attr_reader :name
+      # The (typically) 3-letter abbreviation of self
+      attr_reader :abbr
+      # The letter code for self
+      attr_reader :letter
+      # The byte corresponding to letter
+      attr_reader :byte
+      # An EmpiricalFormula representing the side chain of self
+      attr_reader :side_chain
+      # A symbol classification of self
+      attr_reader :type
+      # The unrounded monoisotopic side chain mass of self
+      attr_reader :side_chain_mass
+      # The uncharged, unrounded, monoisotopic residue mass of self
+      # (the backbone plus side chain mass, with no N- or C-terminus)
+      attr_reader :residue_mass
+      # The unrounded mass of the immonium ion of self
+      # (residue_mass + DELTA_IMMONIUM.mass)
+      attr_reader :immonium_ion_mass
+      def initialize(letter, abbr, name, side_chain_formula, classification=nil)
+        @side_chain = EmpiricalFormula.parse_simple(side_chain_formula)
+        super( Utils.add(side_chain.formula.dup, BACKBONE.formula), false)
+        @letter = letter
+        @abbr = abbr
+        @name = name
+        @classification = classification
+        @side_chain_mass = side_chain.mass
+        @residue_mass = mass
+        @immonium_ion_mass = @residue_mass + DELTA_IMMONIUM.mass
+        @byte = nil
+        @letter.each_byte do |byte|
+          @byte = byte
+          break
+        end unless @letter == nil
+      end
+      # True if the residue of type :common
+      def common?
+        @classification == :common
+      end
+      # True if the residue is type :common or :standard.
+      def standard?
+        @classification == :common || @classification == :standard
+      end
+      # True if the residue is a composite representing a set of isobaric residues
+      def composite?
+        @type == :composite
+      end
+      # An EmpiricalFormula for the residue backbone
+      BACKBONE = EmpiricalFormula.parse_simple('C(2)H(2)NO')
+      # Add to a Residue to achieve an immonium ion
+      DELTA_IMMONIUM = EmpiricalFormula.parse('-CO+H')
+      A = Residue.new('A', "Ala", "Alanine", "CH(3)", :common)
+      C = Residue.new('C', "Cys", "Cysteine", "CH(3)S", :common)
+      D = Residue.new('D', "Asp", "Aspartic Acid", "C(2)H(3)O(2)", :common)
+      E = Residue.new('E', "Glu", "Glutamic Acid", "C(3)H(5)O(2)", :common)
+      F = Residue.new('F', "Phe", "Phenylalanine", "C(7)H(7)", :common)
+      G = Residue.new('G', "Gly", "Glycine", "H", :common)
+      H = Residue.new('H', "His", "Histidine", "C(4)H(5)N(2)", :common)
+      I = Residue.new('I', "Ile", "Isoleucine", "C(4)H(9)", :common)
+      K = Residue.new('K', "Lys", "Lysine", "C(4)H(10)N", :common)
+      L = Residue.new('L', "Leu", "Leucine", "C(4)H(9)", :common)
+      M = Residue.new('M', "Met", "Methionine", "C(3)H(7)S", :common)
+      N = Residue.new('N', "Asn", "Asparagine", "C(2)H(4)NO", :common)
+      O = Residue.new('O', "Pyl", "Pyrrolysine", "C(9)H(17)NO", :standard)
+      P = Residue.new('P', "Pro", "Proline", "C(3)H(5)", :common)
+      Q = Residue.new('Q', "Gln", "Glutamine", "C(3)H(6)NO", :common)
+      R = Residue.new('R', "Arg", "Arginine", "C(4)H(10)N(3)", :common)
+      S = Residue.new('S', "Ser", "Serine", "CH(3)O", :common)
+      T = Residue.new('T', "Thr", "Threonine", "C(2)H(5)O", :common)
+      U = Residue.new('U', "Sec", "Selenocysteine", "CH(3)Se", :standard)
+      V = Residue.new('V', "Val", "Valine", "C(3)H(7)", :common)
+      W = Residue.new('W', "Trp", "Tryptophan", "C(9)H(8)N", :common)
+      Y = Residue.new('Y', "Tyr", "Tyrosine", "C(7)H(7)O", :common)
+      ORN = Residue.new(nil,   "Orn",  "Ornithine", "C(3)H(8)N", :uncommon)
+      ABA = Residue.new(nil,   'Aba',  'Aminobutyric Acid', 'C(2)H(5)', :uncommon)
+      AECYS = Residue.new(nil, 'AECys','Aminoethylcysteine', 'C(3)H(8)NS', :uncommon)
+      AIB = Residue.new(nil,   'Aib',  'alpha-Aminoisobutyric Acid', 'C(2)H(5)', :uncommon)
+      CMCYS = Residue.new(nil, 'CMCys','Carboxymethylcysteine', 'C(3)H(5)O(2)S', :uncommon)
+      DHA = Residue.new(nil,   'Dha',  'Dehydroalanine', 'CH', :uncommon)
+      DHB = Residue.new(nil,   'Dhb',  'Dehydroamino-alpha-butyric Acid', 'C(2)H(3)', :uncommon)
+      HYL = Residue.new(nil,   'Hyl',  'Hydroxylysine', 'C(4)H(10)NO', :uncommon)
+      HYP = Residue.new(nil,   'Hyp',  'Hydroxyproline', 'C(3)H(5)O', :uncommon)
+      IVA = Residue.new(nil,   'Iva',  'Isovaline', 'C(3)H(7)', :uncommon)
+      NLEU = Residue.new(nil,  'nLeu', 'Norleucine', 'C(4)H(9)', :uncommon)
+      PIP = Residue.new(nil,   'Pip',  '2-Piperidinecarboxylic Acid', 'C(4)H(7)', :uncommon)
+      PGLU = Residue.new(nil,  'pGlu', 'Pyroglutamic Acid', 'C(3)H(3)O', :uncommon)
+      SAR = Residue.new(nil,   'Sar',  'Sarcosine', 'CH(3)', :uncommon)
+      include Constants::Library
+      library.index_by_attribute :letter
+      library.index_by_attribute :abbr
+      library.index_by_attribute :name
+      library.collect(:common) do |residue|
+        residue.common? ? residue : nil
+      end
+      library.collect(:residue_index) do |residue|
+        next unless residue.common?
+        [residue, residue.byte]
+      end
+      library.collect(:residue_mass_index) do |residue|
+        next unless residue.common?
+        [residue.residue_mass, residue.byte]
+      end
+    end
+  end
+end

data/lib/molecules/utils.rb ADDED

@@ -0,0 +1,49 @@
+module Molecules
+  # A number of utility routines used by EmpiricalFormula and elsewhere.
+  # These methods are used a great deal and are all prime candidates for
+  # optimization (for example using RubyInline).
+  module Utils
+    module_function
+    # Rounds n to the specified precision (ie number of decimal places, coerced with to_i), returning a Float.
+    def round(n, precision)
+      factor = 10**precision.to_i
+      (n * factor).round.to_f / factor
+    end
+    # Adds n times each element of b to the element of a at the
+    # corresponding index, modifying a in place.  The input arrays
+    # do not have to be the same length (a is zero-padded to the
+    # length of b).  Returns a with trailing zeros removed.
+    def add(a, b, n=1)
+      a << 0 while a.length < b.length
+      # oddly, a manual index is faster than each_with_index
+      i = 0
+      b.each do |factor|
+        a[i] += n * factor
+        i += 1
+      end
+      a.pop while a[-1] == 0
+      a
+    end
+    # Multiplies the elements of array a by factor in place, returning a.
+    # Clears a if factor == 0.
+    def multiply(a, factor)
+      factor == 0 ? a.clear : a.collect! {|i| i * factor}
+    end
+    # Collects String#count of each pattern in str, so a multi-character pattern counts every character it lists.  For example:
+    #
+    #   count("abcabca", ["a", "b", "c"])  # => [3, 2, 2]
+    #   count("abcabca", ["a", "bc"])      # => [3, 4]
+    #
+    def count(str, patterns)
+      patterns.collect {|pattern| str.count(pattern)}
+    end
+  end
+end

data/tap.yml ADDED

File without changes

data/test/molecules/calc_test.rb ADDED

@@ -0,0 +1,37 @@
+require File.join(File.dirname(__FILE__), '../tap_test_helper.rb')
+require 'molecules/calc'
+class Molecules::CalcTest < Test::Unit::TestCase # exercises the Molecules::Calc task through the tap test harness
+  acts_as_tap_test
+  attr_reader :t # the Calc task under test, assigned in setup
+  def setup
+    super
+    @t = Molecules::Calc.new
+  end
+  def test_mass_calculation # single formula with the task's default precision and units
+    t.enq("H2O")
+    app.run
+    assert_equal [[Unit.new(18.0105646863, "Da")]], app.results(t)
+  end
+  def test_mass_calculation_with_precision
+    t.precision = 2
+    t.enq("H2O", "NH3 + H2O") # two formulae enqueued together; both masses are expected in one result array
+    app.run
+    assert_equal [[Unit.new(18.01, "Da"), Unit.new(35.04, "Da")]], app.results(t)
+  end
+  def test_mass_calculation_with_precision_and_unit_conversion
+    t.units = "yg" # presumably yoctograms -- confirm against the Unit library
+    t.precision = 3
+    t.enq("H2O")
+    app.run
+    assert_equal [[Unit.new(29.907, "yg")]], app.results(t)
+  end
+end

data/test/molecules/empirical_formula_class_test.rb ADDED

@@ -0,0 +1,196 @@
+require File.join(File.dirname(__FILE__), '../molecules_test_helper.rb')
+require 'molecules/empirical_formula'
+class EmpiricalFormulaClassTest < Test::Unit::TestCase
+  include Molecules
+  #
+  # parse_simple tests: element symbols with optional counts in parentheses, no arithmetic
+  #
+  def test_parse_simple_documentation
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H(2)O").to_s
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H (2) O").to_s
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("HO(-1)O(2)H").to_s
+  end
+  def test_parse_simple
+    assert_equal([2,1], EmpiricalFormula.parse_simple("HO(-1)O(2)H").formula)
+    assert_equal([2,1], EmpiricalFormula.parse_simple("H O (-1  )O( 2) H ").formula)
+  end
+  def test_parse_simple_fails_for_malformed_formulae
+    [
+      # numbers outside parentheses
+      "H2",
+      # empty parentheses
+      "H()",
+      # mismatched parentheses
+      "H(",
+      ")H",
+      # anything complex (arithmetic between formulae)
+      "H + O"
+    ].each do |formula|
+      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse_simple(formula) }
+    end
+  end
+  #
+  # parse tests: free-form formulae with trailing counts, nested groups and +/- arithmetic
+  #
+  def test_parse_documentation
+    assert_equal "H(2)O", EmpiricalFormula.parse("H2O").to_s
+    assert_equal "C(52)H(106)", EmpiricalFormula.parse("CH3(CH2)50CH3").to_s
+    assert_equal "C(2)H(4)N(2)", EmpiricalFormula.parse("C2H3NO - H2O + NH3").to_s
+    block = lambda do |formula|
+      case formula
+      when /\[(.*)\]/
+        factors = $1.split(/,/).collect {|i| i.strip.to_i }
+        EmpiricalFormula.new(factors)
+      else nil
+      end
+    end
+    assert_equal  "H(4)O(2)", EmpiricalFormula.parse("H2O + [2, 1]", &block).to_s
+    assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse("H2O + :not_expected", &block) }
+  end
+  def test_parse
+    {
+      nil => "",
+      "" => "",
+      "H" => "H",
+      "HO" => "HO",
+      "HFe" => "FeH",
+      "FeH" => "FeH",
+      "OH2" => "H(2)O",
+      "H2O" => "H(2)O",
+      "C6H12O4" => "C(6)H(12)O(4)",
+      "Fe2OMg3" => "Fe(2)Mg(3)O",
+      "(H)2" => "H(2)",
+      "(OH)2" => "H(2)O(2)",
+      "(HFe)" => "FeH",
+      "(FeH)" => "FeH",
+      "(OH2)2" => "H(4)O(2)",
+      "(H2O)2" => "H(4)O(2)",
+      "(C6H12O4)2" => "C(12)H(24)O(8)",
+      "(Fe2OMg3)2" => "Fe(4)Mg(6)O(2)",
+      "C6H12O4(C6H12O4)2C6H12O4" => "C(24)H(48)O(16)",
+      "Fe2OMg3(Fe2OMg3(Fe2OMg3))Fe2OMg3" => "Fe(8)Mg(12)O(4)",
+      "Fe2OMg3(Fe2OMg3)(Fe2OMg3)Fe2OMg3" => "Fe(8)Mg(12)O(4)",
+      "Fe2OMg3(Fe2OMg3(Fe2OMg3)3((C)6H12O4)2)2C" => "C(25)Fe(18)H(48)Mg(27)O(25)",
+      "  (H2O) 10 0   " => "H(200)O(100)",
+      "CH3(CH2)7CH" => "C(9)H(18)",
+      "H3NCHCO2" => "C(2)H(4)NO(2)",
+      "(CH3)2CuLi" => "C(2)CuH(6)Li",
+      # multipart (formulae joined with + and -)
+      "-H" => "H(-1)",
+      "H2O-H" => "HO",
+      "H2O - (OH)2+ H2O2-H2O" => ""
+    }.each_pair do |formula, composition_str|
+      m = EmpiricalFormula.parse(formula)
+      assert_equal composition_str, m.to_s, formula
+    end
+  end
+  def test_parse_fails_for_malformed_formulae
+    [
+      # mismatched parentheses
+      "H)2",
+      "(H2",
+      "(O2(H2)",
+      "(O)2H2)",
+      # hanging factors (a count with no element in front of it)
+      "2C",
+      #"(2)",
+      "(2)2",
+      "(2C)",
+      "(2C)2",
+      "C(2C)",
+      # empty parentheses
+      "()",
+      "()2"
+    ].each do |formula|
+      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse(formula) }
+    end
+  end
+  #
+  # class mass test (named break_test_* rather than test_*, presumably so the runner skips it)
+  #
+  def break_test_class_mass_method
+    water_mass = EmpiricalFormula::Element::H.mass * 2 + EmpiricalFormula::Element::O.mass
+    assert_equal 18.010565, water_mass
+    assert_equal 18.010565, EmpiricalFormula.mass("H2O")
+    assert_equal 18.010565, EmpiricalFormula.mass("H + OH")
+    assert_equal 18, EmpiricalFormula.mass("H2O", 0)
+  end
+  #
+  # library molecules (also named break_test_*, presumably disabled for the same reason)
+  #
+  def break_test_access_library_molecules
+    water = EmpiricalFormula::H2O
+    assert_equal water, EmpiricalFormula.lookup('h2o')
+    assert_equal water, EmpiricalFormula.h2o
+    assert_equal 18.010565, EmpiricalFormula.h2o.mass
+  end
+  # Checks monoisotopic masses against the VG Analytical Organic Mass Spectrometry reference (reference date unknown, prior to 2005).
+  # The data sheet values were copied manually to doc/VG Analytical DataSheet.txt; each row below is: formula, monoisotopic mass, average mass.
+  def test_molecule_mass_values_vs_vg_analytical
+    str = %Q{
+NH2 16.01872 16.0226
+OH 17.00274 17.0073
+OCH3 31.01839 31.0342
+CH3CO 43.01839 43.0452}
+    molecules = str.split(/\n/)
+    molecules.each do |mol_str|
+      next if mol_str.empty?
+      name, monoisotopic, average = mol_str.split(/\s/)
+      monoisotopic = monoisotopic.to_f
+      average = average.to_f
+      molecule = EmpiricalFormula.parse(name)
+      assert_in_delta monoisotopic, molecule.mass, delta_mass, mol_str
+      # TODO -- check average mass (average is parsed above but not yet asserted)
+    end
+  end
+  #
+  # benchmarks ("H20" is H with a count of 20, mirroring "H(20)" in the parse_simple benchmark)
+  #
+  def test_parse_speed
+    benchmark_test(20) do |x|
+      n = 10
+      ["H20","H2(H2(H2))H2"].each do |formula|
+        x.report("#{n}k #{formula}") do
+          (n*1000).times { EmpiricalFormula.parse(formula) }
+        end
+      end
+    end
+  end
+  def test_parse_simple_speed
+    benchmark_test(20) do |x|
+      n = 10
+      ["H(20)","H(2)H(2)H(2)H(2)"].each do |formula|
+        x.report("#{n}k #{formula}") do
+          (n*1000).times { EmpiricalFormula.parse_simple(formula) }
+        end
+      end
+    end
+  end
+end