RubyGems - molecules - Versions diffs - 0.1.0 - Mend

molecules 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/MIT-LICENSE +21 -0
data/README +73 -0
data/Rakefile +78 -0
data/lib/molecules.rb +4 -0
data/lib/molecules/calc.rb +127 -0
data/lib/molecules/empirical_formula.rb +325 -0
data/lib/molecules/libraries/polypeptide.rb +91 -0
data/lib/molecules/libraries/residue.rb +165 -0
data/lib/molecules/utils.rb +49 -0
data/tap.yml +0 -0
data/test/molecules/calc_test.rb +37 -0
data/test/molecules/empirical_formula_class_test.rb +196 -0
data/test/molecules/empirical_formula_test.rb +204 -0
data/test/molecules/libraries/polypeptide_test.rb +128 -0
data/test/molecules/libraries/residue_test.rb +289 -0
data/test/molecules/utils_test.rb +147 -0
data/test/molecules_test.rb +24 -0
data/test/molecules_test_helper.rb +31 -0
data/test/molecules_test_suite.rb +3 -0
data/test/tap_test_helper.rb +3 -0
metadata +82 -0

data/lib/molecules/libraries/polypeptide.rb ADDED

@@ -0,0 +1,91 @@
+require 'molecules/libraries/residue'
+module Molecules
+  module Libraries
+    # Represents a polypeptide as a sequence of residues.  For convenience,
+    # polypeptides may contain whitespace in their sequences (thus allowing
+    # direct use with parsed FASTA-formatted peptide sequences).
+    #
+    # Currently polypeptide only handles sequences with common residues.
+    class Polypeptide < EmpiricalFormula
+      class << self
+        # Normalizes the input sequence by removing whitespace and capitalizing.
+        def normalize(sequence)
+          sequence.gsub(/\s/, "").upcase
+        end
+      end
+      # The sequence of self (including whitespace)
+      attr_reader :sequence
+      # A hash of (Residue, Integer) pairs defining the number of a given residue in self.
+      attr_reader :residue_composition
+      # The number of residues in self (may differ from sequence.length
+      # if sequence contains whitespace).
+      attr_reader :length
+      # An array of tokens that may occur in a sequence, grouped
+      # as patterns (ie one token for all whitespace characters, and
+      # one token for each residue).  Used to count the number of
+      # each type of residue in a sequence.
+      SEQUENCE_TOKENS = ["\s\t\r\n"] + Residue.common.collect {|r| r.letter}
+      def initialize(sequence)
+        @sequence = sequence
+        @length = 0
+        @residue_composition = {}
+        @formula = Array.new(5, 0)
+        # count up the number of whitespaces and residues in self
+        tokens = Utils.count(sequence, SEQUENCE_TOKENS)
+        whitespace = tokens.shift
+        if whitespace == sequence.length
+          # as per the Base specification, factors
+          # should have no trailing zeros
+          @formula.clear
+          return
+        end
+        # add the residue masses and factors
+        Residue.common.each do |residue|
+          # benchmarks indicated that counting for each residue
+          # is quicker than trying anything like:
+          #
+          #   sequence.each_byte {|b| bytes[b] += 1}
+          #
+          # This is particularly an issue for long sequences.  The
+          # count operation could be optimized for isobaric residues
+          n = tokens.shift
+          next if n == 0
+          @length += n
+          @residue_composition[residue] = n
+          Utils.add(@formula, residue.formula, n)
+        end
+        if @length + whitespace != sequence.length
+          # raise an error if there are unaccounted characters
+          raise UnknownResidueError, "unknown characters in sequence: #{sequence}"
+        end
+      end
+      # Sequentially passes each residue in sequence to the block.
+      def each_residue
+        residues = Residue.residue_index
+        sequence.each_byte do |byte|
+          residue = residues[byte]
+          yield(residue) if residue
+        end
+      end
+      class UnknownResidueError < StandardError # :nodoc:
+      end
+    end
+  end
+end

data/lib/molecules/libraries/residue.rb ADDED

@@ -0,0 +1,165 @@
+require 'constants/library'
+require 'molecules/empirical_formula'
+module Molecules
+  module Libraries
+    # A library of amino acid residues.
+    #
+    #    r = Residue::A
+    #    r.name               # => "Alanine"
+    #    r.abbr               # => "Ala"
+    #    r.letter             # => "A"
+    #    r.side_chain.to_s    # => "CH(3)"
+    #
+    class Residue < EmpiricalFormula
+      class << self
+        # The 20 common amino acids.
+        def common
+          collection(:common)
+        end
+        # An array of the residues indexed by the byte
+        # corresponding to the residue letter.
+        def residue_index
+          collection(:residue_index)
+        end
+        # An array of the residue masses indexed by the byte
+        # corresponding to the residue letter.
+        def residue_mass_index
+          collection(:residue_mass_index)
+        end
+      end
+      # The full name of self
+      attr_reader :name
+      # The (typically) 3-letter abbreviation of self
+      attr_reader :abbr
+      # The letter code for self
+      attr_reader :letter
+      # The byte corresponding to letter
+      attr_reader :byte
+      # An EmpiricalFormula representing the side chain of self
+      attr_reader :side_chain
+      # A symbol classification of self
+      attr_reader :type
+      # The unrounded monoisotopic side chain mass of self
+      attr_reader :side_chain_mass
+      # The uncharged, unrounded, monoisotopic residue mass of self
+      # (the backbone plus side chain mass, with no N- or C-terminus)
+      attr_reader :residue_mass
+      # The unrounded mass of the immonium ion of self
+      # (residue_mass + DELTA_IMMONIUM.mass)
+      attr_reader :immonium_ion_mass
+      def initialize(letter, abbr, name, side_chain_formula, classification=nil)
+        @side_chain = EmpiricalFormula.parse_simple(side_chain_formula)
+        super( Utils.add(side_chain.formula.dup, BACKBONE.formula), false)
+        @letter = letter
+        @abbr = abbr
+        @name = name
+        @classification = classification
+        @side_chain_mass = side_chain.mass
+        @residue_mass = mass
+        @immonium_ion_mass = @residue_mass + DELTA_IMMONIUM.mass
+        @byte = nil
+        @letter.each_byte do |byte|
+          @byte = byte
+          break
+        end unless @letter == nil
+      end
+      # True if the residue of type :common
+      def common?
+        @classification == :common
+      end
+      # True if the residue is type :common or :standard.
+      def standard?
+        @classification == :common || @classification == :standard
+      end
+      # True if the residue is a composite representing a set of isobaric residues
+      def composite?
+        @type == :composite
+      end
+      # An EmpiricalFormula for the residue backbone
+      BACKBONE = EmpiricalFormula.parse_simple('C(2)H(2)NO')
+      # Add to a Residue to achieve an immonium ion
+      DELTA_IMMONIUM = EmpiricalFormula.parse('-CO+H')
+      A = Residue.new('A', "Ala", "Alanine", "CH(3)", :common)
+      C = Residue.new('C', "Cys", "Cysteine", "CH(3)S", :common)
+      D = Residue.new('D', "Asp", "Aspartic Acid", "C(2)H(3)O(2)", :common)
+      E = Residue.new('E', "Glu", "Glutamic Acid", "C(3)H(5)O(2)", :common)
+      F = Residue.new('F', "Phe", "Phenylalanine", "C(7)H(7)", :common)
+      G = Residue.new('G', "Gly", "Glycine", "H", :common)
+      H = Residue.new('H', "His", "Histidine", "C(4)H(5)N(2)", :common)
+      I = Residue.new('I', "Ile", "Isoleucine", "C(4)H(9)", :common)
+      K = Residue.new('K', "Lys", "Lysine", "C(4)H(10)N", :common)
+      L = Residue.new('L', "Leu", "Leucine", "C(4)H(9)", :common)
+      M = Residue.new('M', "Met", "Methionine", "C(3)H(7)S", :common)
+      N = Residue.new('N', "Asn", "Asparagine", "C(2)H(4)NO", :common)
+      O = Residue.new('O', "Pyl", "Pyrrolysine", "C(9)H(17)NO", :standard)
+      P = Residue.new('P', "Pro", "Proline", "C(3)H(5)", :common)
+      Q = Residue.new('Q', "Gln", "Glutamine", "C(3)H(6)NO", :common)
+      R = Residue.new('R', "Arg", "Arginine", "C(4)H(10)N(3)", :common)
+      S = Residue.new('S', "Ser", "Serine", "CH(3)O", :common)
+      T = Residue.new('T', "Thr", "Threonine", "C(2)H(5)O", :common)
+      U = Residue.new('U', "Sec", "Selenocysteine", "CH(3)Se", :standard)
+      V = Residue.new('V', "Val", "Valine", "C(3)H(7)", :common)
+      W = Residue.new('W', "Trp", "Tryptophan", "C(9)H(8)N", :common)
+      Y = Residue.new('Y', "Tyr", "Tyrosine", "C(7)H(7)O", :common)
+      ORN = Residue.new(nil,   "Orn",  "Ornithine", "C(3)H(8)N", :uncommon)
+      ABA = Residue.new(nil,   'Aba',  'Aminobutyric Acid', 'C(2)H(5)', :uncommon)
+      AECYS = Residue.new(nil, 'AECys','Aminoethylcysteine', 'C(3)H(8)NS', :uncommon)
+      AIB = Residue.new(nil,   'Aib',  'alpha-Aminoisobutyric Acid', 'C(2)H(5)', :uncommon)
+      CMCYS = Residue.new(nil, 'CMCys','Carboxymethylcysteine', 'C(3)H(5)O(2)S', :uncommon)
+      DHA = Residue.new(nil,   'Dha',  'Dehydroalanine', 'CH', :uncommon)
+      DHB = Residue.new(nil,   'Dhb',  'Dehydroamino-alpha-butyric Acid', 'C(2)H(3)', :uncommon)
+      HYL = Residue.new(nil,   'Hyl',  'Hydroxylysine', 'C(4)H(10)NO', :uncommon)
+      HYP = Residue.new(nil,   'Hyp',  'Hydroxyproline', 'C(3)H(5)O', :uncommon)
+      IVA = Residue.new(nil,   'Iva',  'Isovaline', 'C(3)H(7)', :uncommon)
+      NLEU = Residue.new(nil,  'nLeu', 'Norleucine', 'C(4)H(9)', :uncommon)
+      PIP = Residue.new(nil,   'Pip',  '2-Piperidinecarboxylic Acid', 'C(4)H(7)', :uncommon)
+      PGLU = Residue.new(nil,  'pGlu', 'Pyroglutamic Acid', 'C(3)H(3)O', :uncommon)
+      SAR = Residue.new(nil,   'Sar',  'Sarcosine', 'CH(3)', :uncommon)
+      include Constants::Library
+      library.index_by_attribute :letter
+      library.index_by_attribute :abbr
+      library.index_by_attribute :name
+      library.collect(:common) do |residue|
+        residue.common? ? residue : nil
+      end
+      library.collect(:residue_index) do |residue|
+        next unless residue.common?
+        [residue, residue.byte]
+      end
+      library.collect(:residue_mass_index) do |residue|
+        next unless residue.common?
+        [residue.residue_mass, residue.byte]
+      end
+    end
+  end
+end

data/lib/molecules/utils.rb ADDED

@@ -0,0 +1,49 @@
+module Molecules
+  # A number of utility routines used by EmpiricalFormula and elsewhere.
+  # These methods are used a great deal and are all prime candidates for
+  # optimization (for example using RubyInline).
+  module Utils
+    module_function
+    # Rounds n to the specified precision (ie number of decimal places)
+    def round(n, precision)
+      factor = 10**precision.to_i
+      (n * factor).round.to_f / factor
+    end
+    # Adds the elements of b to a at corresponding
+    # indicies, multiplying by n.  The input arrays
+    # do not have to be the same length.  Returns a
+    # with trailing zeros removed.
+    def add(a, b, n=1)
+      a << 0 while a.length < b.length
+      # oddly, this is faster than each_with_index
+      i = 0
+      b.each do |factor|
+        a[i] += n * factor
+        i += 1
+      end
+      a.pop while a[-1] == 0
+      a
+    end
+    # Multiples the elements of array a by factor, returning a.
+    # Clears a if factor == 0.
+    def multiply(a, factor)
+      factor == 0 ? a.clear : a.collect! {|i| i * factor}
+    end
+    # Collects the number of each of the patterns in str.  For example:
+    #
+    #   count("abcabca", ["a", "b", "c"])  # => [3, 2, 2]
+    #   count("abcabca", ["a", "bc"])      # => [3, 4]
+    #
+    def count(str, patterns)
+      patterns.collect {|pattern| str.count(pattern)}
+    end
+  end
+end

data/tap.yml ADDED

File without changes

data/test/molecules/calc_test.rb ADDED

@@ -0,0 +1,37 @@
+require File.join(File.dirname(__FILE__), '../tap_test_helper.rb')
+require 'molecules/calc'
+class Molecules::CalcTest < Test::Unit::TestCase
+  acts_as_tap_test
+  attr_reader :t
+  def setup
+    super
+    @t = Molecules::Calc.new
+  end
+  def test_mass_calculation
+    t.enq("H2O")
+    app.run
+    assert_equal [[Unit.new(18.0105646863, "Da")]], app.results(t)
+  end
+  def test_mass_calculation_with_precision
+    t.precision = 2
+    t.enq("H2O", "NH3 + H2O")
+    app.run
+    assert_equal [[Unit.new(18.01, "Da"), Unit.new(35.04, "Da")]], app.results(t)
+  end
+  def test_mass_calculation_with_precision_and_unit_conversion
+    t.units = "yg"
+    t.precision = 3
+    t.enq("H2O")
+    app.run
+    assert_equal [[Unit.new(29.907, "yg")]], app.results(t)
+  end
+end

data/test/molecules/empirical_formula_class_test.rb ADDED

@@ -0,0 +1,196 @@
+require File.join(File.dirname(__FILE__), '../molecules_test_helper.rb')
+require 'molecules/empirical_formula'
+# Tests for the class-level API of EmpiricalFormula: parse_simple and
+# parse, a monoisotopic mass cross-check, and two parsing benchmarks.
+# benchmark_test and delta_mass are not defined in this class; presumably
+# they come from molecules_test_helper -- confirm.
+class EmpiricalFormulaClassTest < Test::Unit::TestCase
+  include Molecules
+  #
+  # parse_simple test
+  #
+  # The parse_simple examples: factors go in parentheses, whitespace is
+  # ignored, and repeated elements are combined.
+  def test_parse_simple_documentation
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H(2)O").to_s
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H (2) O").to_s
+    assert_equal "H(2)O", EmpiricalFormula.parse_simple("HO(-1)O(2)H").to_s
+  end
+  # Checks the resulting factor array directly, including whitespace
+  # inside and around the parentheses.
+  def test_parse_simple
+    assert_equal([2,1], EmpiricalFormula.parse_simple("HO(-1)O(2)H").formula)
+    assert_equal([2,1], EmpiricalFormula.parse_simple("H O (-1  )O( 2) H ").formula)
+  end
+  # Each malformed formula below must raise ParseError.
+  def test_parse_simple_fails_for_malformed_formulae
+    [
+      # numbers outside parenthesis
+      "H2",
+      # empty parenthesis
+      "H()",
+      # mismatched parenthesis
+      "H(",
+      ")H",
+      # anything complex
+      "H + O"
+    ].each do |formula|
+      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse_simple(formula) }
+    end
+  end
+  #
+  # test class parse
+  #
+  # The parse examples: bare factors, parenthesized groups, +/- between
+  # parts, and a block given to parse.  Here the block builds a formula
+  # from a "[a, b]" factor list and returns nil for anything else, in
+  # which case a ParseError is expected.
+  def test_parse_documentation
+    assert_equal "H(2)O", EmpiricalFormula.parse("H2O").to_s
+    assert_equal "C(52)H(106)", EmpiricalFormula.parse("CH3(CH2)50CH3").to_s
+    assert_equal "C(2)H(4)N(2)", EmpiricalFormula.parse("C2H3NO - H2O + NH3").to_s
+    block = lambda do |formula|
+      case formula
+      when /\[(.*)\]/
+        factors = $1.split(/,/).collect {|i| i.strip.to_i }
+        EmpiricalFormula.new(factors)
+      else nil
+      end
+    end
+    assert_equal  "H(4)O(2)", EmpiricalFormula.parse("H2O + [2, 1]", &block).to_s
+    assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse("H2O + :not_expected", &block) }
+  end
+  # Table-driven check: each key is parsed and its to_s compared with
+  # the value.  The failing formula is passed as the assertion message.
+  def test_parse
+    {
+      nil => "",
+      "" => "",
+      "H" => "H",
+      "HO" => "HO",
+      "HFe" => "FeH",
+      "FeH" => "FeH",
+      "OH2" => "H(2)O",
+      "H2O" => "H(2)O",
+      "C6H12O4" => "C(6)H(12)O(4)",
+      "Fe2OMg3" => "Fe(2)Mg(3)O",
+      "(H)2" => "H(2)",
+      "(OH)2" => "H(2)O(2)",
+      "(HFe)" => "FeH",
+      "(FeH)" => "FeH",
+      "(OH2)2" => "H(4)O(2)",
+      "(H2O)2" => "H(4)O(2)",
+      "(C6H12O4)2" => "C(12)H(24)O(8)",
+      "(Fe2OMg3)2" => "Fe(4)Mg(6)O(2)",
+      "C6H12O4(C6H12O4)2C6H12O4" => "C(24)H(48)O(16)",
+      "Fe2OMg3(Fe2OMg3(Fe2OMg3))Fe2OMg3" => "Fe(8)Mg(12)O(4)",
+      "Fe2OMg3(Fe2OMg3)(Fe2OMg3)Fe2OMg3" => "Fe(8)Mg(12)O(4)",
+      "Fe2OMg3(Fe2OMg3(Fe2OMg3)3((C)6H12O4)2)2C" => "C(25)Fe(18)H(48)Mg(27)O(25)",
+      "  (H2O) 10 0   " => "H(200)O(100)",
+      "CH3(CH2)7CH" => "C(9)H(18)",
+      "H3NCHCO2" => "C(2)H(4)NO(2)",
+      "(CH3)2CuLi" => "C(2)CuH(6)Li",
+      # multipart
+      "-H" => "H(-1)",
+      "H2O-H" => "HO",
+      "H2O - (OH)2+ H2O2-H2O" => ""
+    }.each_pair do |formula, composition_str|
+      m = EmpiricalFormula.parse(formula)
+      assert_equal composition_str, m.to_s, formula
+    end
+  end
+  # Each malformed formula below must raise ParseError.
+  def test_parse_fails_for_malformed_formulae
+    [
+      # mismatched parenthesis
+      "H)2",
+      "(H2",
+      "(O2(H2)",
+      "(O)2H2)",
+      # hanging factors
+      "2C",
+      #"(2)",
+      "(2)2",
+      "(2C)",
+      "(2C)2",
+      "C(2C)",
+      # empty parenthesis
+      "()",
+      "()2"
+    ].each do |formula|
+      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse(formula) }
+    end
+  end
+  #
+  # class mass test
+  #
+  # NOTE(review): the break_ prefix means this method name does not start
+  # with test_, so Test::Unit will not run it; presumably disabled on
+  # purpose -- confirm before renaming.
+  def break_test_class_mass_method
+    water_mass = EmpiricalFormula::Element::H.mass * 2 + EmpiricalFormula::Element::O.mass
+    assert_equal 18.010565, water_mass
+    assert_equal 18.010565, EmpiricalFormula.mass("H2O")
+    assert_equal 18.010565, EmpiricalFormula.mass("H + OH")
+    assert_equal 18, EmpiricalFormula.mass("H2O", 0)
+  end
+  #
+  # library molecules
+  #
+  # NOTE(review): also named break_test_, so Test::Unit will not run it
+  # -- confirm this is intentional.
+  def break_test_access_library_molecules
+    water = EmpiricalFormula::H2O
+    assert_equal water, EmpiricalFormula.lookup('h2o')
+    assert_equal water, EmpiricalFormula.h2o
+    assert_equal 18.010565, EmpiricalFormula.h2o.mass
+  end
+  # vs the VG Analytical Organic Mass Spectrometry reference, reference date unknown (prior to 2005)
+  # the data from the data sheet was copied manually to doc/VG Analytical DataSheet.txt
+  #
+  # Each data line is "formula monoisotopic average".  Only the
+  # monoisotopic column is asserted (within delta_mass); the average
+  # column is parsed but not yet checked.
+  def test_molecule_mass_values_vs_vg_analytical
+    str = %Q{
+NH2 16.01872 16.0226
+OH 17.00274 17.0073
+OCH3 31.01839 31.0342
+CH3CO 43.01839 43.0452}
+    molecules = str.split(/\n/)
+    molecules.each do |mol_str|
+      # the string starts with a newline, so the first entry is empty
+      next if mol_str.empty?
+      name, monoisotopic, average = mol_str.split(/\s/)
+      monoisotopic = monoisotopic.to_f
+      average = average.to_f
+      molecule = EmpiricalFormula.parse(name)
+      assert_in_delta monoisotopic, molecule.mass, delta_mass, mol_str
+      # TODO -- check average mass
+    end
+  end
+  #
+  # benchmark
+  #
+  # Times 10,000 calls of parse for each formula.
+  # NOTE(review): "H20" is H followed by the number twenty, not water;
+  # if water was intended it should read "H2O" -- confirm.
+  def test_parse_speed
+    benchmark_test(20) do |x|
+      n = 10
+      ["H20","H2(H2(H2))H2"].each do |formula|
+        x.report("#{n}k #{formula}") do
+          (n*1000).times { EmpiricalFormula.parse(formula) }
+        end
+      end
+    end
+  end
+  # Times 10,000 calls of parse_simple for each formula.
+  def test_parse_simple_speed
+    benchmark_test(20) do |x|
+      n = 10
+      ["H(20)","H(2)H(2)H(2)H(2)"].each do |formula|
+        x.report("#{n}k #{formula}") do
+          (n*1000).times { EmpiricalFormula.parse_simple(formula) }
+        end
+      end
+    end
+  end
+end