molecules 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ require 'molecules/libraries/residue'
2
+
3
module Molecules
  module Libraries

    # Represents a polypeptide as a sequence of residues. For convenience,
    # polypeptides may contain whitespace in their sequences (thus allowing
    # direct use with parsed FASTA formatted peptide sequences).
    #
    # Currently polypeptide only handles sequences with common residues.
    class Polypeptide < EmpiricalFormula

      class << self
        # Normalizes the input sequence by removing whitespace and capitalizing.
        # Returns a new string; the input is not modified.
        def normalize(sequence)
          sequence.gsub(/\s/, "").upcase
        end
      end

      # The sequence of self (including whitespace)
      attr_reader :sequence

      # A hash of (Residue, Integer) pairs defining the number of a given residue in self.
      # Residues that do not occur in the sequence have no entry.
      attr_reader :residue_composition

      # The number of residues in self (may differ from sequence.length
      # if sequence contains whitespace).
      attr_reader :length

      # An array of tokens that may occur in a sequence, grouped
      # as patterns (ie one token for all whitespace characters, and
      # one token for each residue). Used to count the number of
      # each type of residue in a sequence.
      #
      # The whitespace token comes first and the residue tokens follow in
      # the order of Residue.common; initialize relies on that ordering.
      SEQUENCE_TOKENS = ["\s\t\r\n"] + Residue.common.collect {|r| r.letter}

      # Builds a polypeptide from sequence, counting each common residue and
      # accumulating the residue formulas into the formula of self.
      #
      # Raises UnknownResidueError if sequence contains any character that is
      # neither whitespace (as listed in SEQUENCE_TOKENS) nor the letter of a
      # common residue.
      def initialize(sequence)
        @sequence = sequence

        @length = 0
        @residue_composition = {}
        @formula = Array.new(5, 0)

        # count up the number of whitespaces and residues in self;
        # the first count is whitespace, the rest align with Residue.common
        tokens = Utils.count(sequence, SEQUENCE_TOKENS)
        whitespace = tokens.shift

        if whitespace == sequence.length
          # nothing but whitespace (or an empty sequence): as per the
          # Base specification, factors should have no trailing zeros
          @formula.clear
          return
        end

        # add the residue masses and factors
        Residue.common.each do |residue|
          # benchmarks indicated that counting for each residue
          # is quicker than trying anything like:
          #
          #   sequence.each_byte {|b| bytes[b] += 1}
          #
          # This is particularly an issue for long sequences. The
          # count operation could be optimized for isobaric residues
          n = tokens.shift
          next if n == 0

          @length += n
          @residue_composition[residue] = n
          Utils.add(@formula, residue.formula, n)
        end

        if @length + whitespace != sequence.length
          # raise an error if there are unaccounted characters
          raise UnknownResidueError, "unknown characters in sequence: #{sequence}"
        end
      end

      # Sequentially passes each residue in sequence to the block. Bytes
      # with no entry in Residue.residue_index (such as whitespace) are
      # skipped.
      #
      # Returns an enumerator over the residues when no block is given,
      # rather than failing on the first yield.
      def each_residue
        return enum_for(:each_residue) unless block_given?

        residues = Residue.residue_index
        sequence.each_byte do |byte|
          residue = residues[byte]
          yield(residue) if residue
        end
      end

      class UnknownResidueError < StandardError # :nodoc:
      end

    end
  end
end
@@ -0,0 +1,165 @@
1
+ require 'constants/library'
2
+ require 'molecules/empirical_formula'
3
+
4
module Molecules
  module Libraries

    # A library of amino acid residues.
    #
    #   r = Residue::A
    #   r.name                  # => "Alanine"
    #   r.abbr                  # => "Ala"
    #   r.letter                # => "A"
    #   r.side_chain.to_s       # => "CH(3)"
    #
    class Residue < EmpiricalFormula

      class << self
        # The residues classified as :common.
        def common
          collection(:common)
        end

        # An array of the residues indexed by the byte
        # corresponding to the residue letter.
        def residue_index
          collection(:residue_index)
        end

        # An array of the residue masses indexed by the byte
        # corresponding to the residue letter.
        def residue_mass_index
          collection(:residue_mass_index)
        end
      end

      # The full name of self
      attr_reader :name

      # The (typically) 3-letter abbreviation of self
      attr_reader :abbr

      # The letter code for self (nil for residues without one)
      attr_reader :letter

      # The byte corresponding to letter (nil when letter is nil or empty)
      attr_reader :byte

      # An EmpiricalFormula representing the side chain of self
      attr_reader :side_chain

      # A symbol classification of self (the classification
      # given to initialize, for example :common or :standard)
      attr_reader :type

      # The unrounded monoisotopic side chain mass of self
      attr_reader :side_chain_mass

      # The uncharged, unrounded, monoisotopic residue mass of self
      # (the backbone plus side chain mass, with no N- or C-terminus)
      attr_reader :residue_mass

      # The unrounded mass of the immonium ion of self
      # (residue_mass + DELTA_IMMONIUM.mass)
      attr_reader :immonium_ion_mass

      # Creates a residue whose formula is the side chain formula plus
      # BACKBONE.
      #
      # letter::             one-letter code, or nil
      # abbr::               abbreviation
      # name::               full name
      # side_chain_formula:: string parsed by EmpiricalFormula.parse_simple
      # classification::     symbol reported by type and tested by
      #                      common?, standard? and composite?
      def initialize(letter, abbr, name, side_chain_formula, classification=nil)
        @side_chain = EmpiricalFormula.parse_simple(side_chain_formula)
        # dup so that adding the backbone does not alter the side chain formula
        super(Utils.add(side_chain.formula.dup, BACKBONE.formula), false)

        @letter = letter
        @abbr = abbr
        @name = name

        # The type reader and composite? both read @type; it was previously
        # never assigned, so type was always nil and composite? always false.
        @classification = classification
        @type = classification

        @side_chain_mass = side_chain.mass
        @residue_mass = mass
        @immonium_ion_mass = @residue_mass + DELTA_IMMONIUM.mass

        # first byte of letter; unpack yields nil for an empty string
        @byte = letter.nil? ? nil : letter.unpack('C').first
      end

      # True if the residue is of type :common
      def common?
        @classification == :common
      end

      # True if the residue is type :common or :standard.
      def standard?
        @classification == :common || @classification == :standard
      end

      # True if the residue is a composite representing a set of isobaric residues
      def composite?
        @type == :composite
      end

      # An EmpiricalFormula for the residue backbone
      BACKBONE = EmpiricalFormula.parse_simple('C(2)H(2)NO')

      # Add to a Residue to achieve an immonium ion
      DELTA_IMMONIUM = EmpiricalFormula.parse('-CO+H')

      A = Residue.new('A', "Ala", "Alanine", "CH(3)", :common)
      C = Residue.new('C', "Cys", "Cysteine", "CH(3)S", :common)
      D = Residue.new('D', "Asp", "Aspartic Acid", "C(2)H(3)O(2)", :common)
      E = Residue.new('E', "Glu", "Glutamic Acid", "C(3)H(5)O(2)", :common)
      F = Residue.new('F', "Phe", "Phenylalanine", "C(7)H(7)", :common)
      G = Residue.new('G', "Gly", "Glycine", "H", :common)
      H = Residue.new('H', "His", "Histidine", "C(4)H(5)N(2)", :common)
      I = Residue.new('I', "Ile", "Isoleucine", "C(4)H(9)", :common)
      K = Residue.new('K', "Lys", "Lysine", "C(4)H(10)N", :common)
      L = Residue.new('L', "Leu", "Leucine", "C(4)H(9)", :common)
      M = Residue.new('M', "Met", "Methionine", "C(3)H(7)S", :common)
      N = Residue.new('N', "Asn", "Asparagine", "C(2)H(4)NO", :common)
      O = Residue.new('O', "Pyl", "Pyrrolysine", "C(9)H(17)NO", :standard)
      P = Residue.new('P', "Pro", "Proline", "C(3)H(5)", :common)
      Q = Residue.new('Q', "Gln", "Glutamine", "C(3)H(6)NO", :common)
      R = Residue.new('R', "Arg", "Arginine", "C(4)H(10)N(3)", :common)
      S = Residue.new('S', "Ser", "Serine", "CH(3)O", :common)
      T = Residue.new('T', "Thr", "Threonine", "C(2)H(5)O", :common)
      U = Residue.new('U', "Sec", "Selenocysteine", "CH(3)Se", :standard)
      V = Residue.new('V', "Val", "Valine", "C(3)H(7)", :common)
      W = Residue.new('W', "Trp", "Tryptophan", "C(9)H(8)N", :common)
      Y = Residue.new('Y', "Tyr", "Tyrosine", "C(7)H(7)O", :common)

      ORN = Residue.new(nil, "Orn", "Ornithine", "C(3)H(8)N", :uncommon)
      ABA = Residue.new(nil, 'Aba', 'Aminobutyric Acid', 'C(2)H(5)', :uncommon)
      AECYS = Residue.new(nil, 'AECys','Aminoethylcysteine', 'C(3)H(8)NS', :uncommon)
      AIB = Residue.new(nil, 'Aib', 'alpha-Aminoisobutyric Acid', 'C(2)H(5)', :uncommon)
      CMCYS = Residue.new(nil, 'CMCys','Carboxymethylcysteine', 'C(3)H(5)O(2)S', :uncommon)
      DHA = Residue.new(nil, 'Dha', 'Dehydroalanine', 'CH', :uncommon)
      DHB = Residue.new(nil, 'Dhb', 'Dehydroamino-alpha-butyric Acid', 'C(2)H(3)', :uncommon)
      HYL = Residue.new(nil, 'Hyl', 'Hydroxylysine', 'C(4)H(10)NO', :uncommon)
      HYP = Residue.new(nil, 'Hyp', 'Hydroxyproline', 'C(3)H(5)O', :uncommon)
      IVA = Residue.new(nil, 'Iva', 'Isovaline', 'C(3)H(7)', :uncommon)
      NLEU = Residue.new(nil, 'nLeu', 'Norleucine', 'C(4)H(9)', :uncommon)
      PIP = Residue.new(nil, 'Pip', '2-Piperidinecarboxylic Acid', 'C(4)H(7)', :uncommon)
      PGLU = Residue.new(nil, 'pGlu', 'Pyroglutamic Acid', 'C(3)H(3)O', :uncommon)
      SAR = Residue.new(nil, 'Sar', 'Sarcosine', 'CH(3)', :uncommon)

      # Included after the constants above are defined; the library calls
      # below build the indexes and collections read by the class methods.
      include Constants::Library

      library.index_by_attribute :letter
      library.index_by_attribute :abbr
      library.index_by_attribute :name

      library.collect(:common) do |residue|
        residue.common? ? residue : nil
      end

      library.collect(:residue_index) do |residue|
        next unless residue.common?
        [residue, residue.byte]
      end

      library.collect(:residue_mass_index) do |residue|
        next unless residue.common?
        [residue.residue_mass, residue.byte]
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module Molecules

  # Low-level helpers shared by EmpiricalFormula and other classes.
  # They are called very frequently, which makes them natural targets
  # for optimization (for example using RubyInline).
  module Utils
    module_function

    # Rounds n to precision decimal places and returns the result
    # as a Float. The precision is converted with to_i.
    def round(n, precision)
      scale = 10**precision.to_i
      scaled = (n * scale).round
      scaled.to_f / scale
    end

    # Adds n times each element of b to the element of a at the same
    # index, modifying a in place. The arrays may differ in length; a is
    # padded with zeros as needed. Returns a with trailing zeros removed.
    def add(a, b, n=1)
      # pad a so every index of b has a slot (no-op when a is long enough)
      (b.length - a.length).times { a << 0 }

      b.each_index do |index|
        a[index] += n * b[index]
      end

      a.pop while a.last == 0
      a
    end

    # Multiplies the elements of array a by factor in place, returning a.
    # Clears a if factor == 0.
    def multiply(a, factor)
      return a.clear if factor == 0

      a.collect! { |value| value * factor }
    end

    # Counts the occurrences of each pattern in str, where a pattern is a
    # set of characters as understood by String#count. For example:
    #
    #   count("abcabca", ["a", "b", "c"])  # => [3, 2, 2]
    #   count("abcabca", ["a", "bc"])      # => [3, 4]
    #
    def count(str, patterns)
      patterns.map { |char_set| str.count(char_set) }
    end
  end

end
data/tap.yml ADDED
File without changes
@@ -0,0 +1,37 @@
1
+ require File.join(File.dirname(__FILE__), '../tap_test_helper.rb')
2
+ require 'molecules/calc'
3
+
4
# Tests Molecules::Calc by enqueuing formulas on the task, running the
# app supplied by the tap test harness, and checking the results.
class Molecules::CalcTest < Test::Unit::TestCase
  acts_as_tap_test

  # The Calc task under test, created anew in setup.
  attr_reader :t

  def setup
    super
    @t = Molecules::Calc.new
  end

  # With no options set, the mass is reported in Da.
  def test_mass_calculation
    t.enq("H2O")
    app.run

    expected = [[Unit.new(18.0105646863, "Da")]]
    assert_equal expected, app.results(t)
  end

  # A precision of 2 gives masses with two decimal places.
  def test_mass_calculation_with_precision
    t.precision = 2
    t.enq("H2O", "NH3 + H2O")
    app.run

    expected = [[Unit.new(18.01, "Da"), Unit.new(35.04, "Da")]]
    assert_equal expected, app.results(t)
  end

  # Setting units reports the mass in that unit, at the given precision.
  def test_mass_calculation_with_precision_and_unit_conversion
    t.units = "yg"
    t.precision = 3
    t.enq("H2O")
    app.run

    expected = [[Unit.new(29.907, "yg")]]
    assert_equal expected, app.results(t)
  end
end
@@ -0,0 +1,196 @@
1
+ require File.join(File.dirname(__FILE__), '../molecules_test_helper.rb')
2
+ require 'molecules/empirical_formula'
3
+
4
# Tests for the class-level parsing and mass methods of EmpiricalFormula.
class EmpiricalFormulaClassTest < Test::Unit::TestCase
  include Molecules

  #
  # parse_simple test
  #

  # Each documented spelling of water parses to the same formula string.
  def test_parse_simple_documentation
    ["H(2)O", "H (2) O", "HO(-1)O(2)H"].each do |input|
      assert_equal "H(2)O", EmpiricalFormula.parse_simple(input).to_s
    end
  end

  # Repeated elements are summed, and whitespace is ignored.
  def test_parse_simple
    ["HO(-1)O(2)H", "H O (-1 )O( 2) H "].each do |input|
      assert_equal([2,1], EmpiricalFormula.parse_simple(input).formula)
    end
  end

  def test_parse_simple_fails_for_malformed_formulae
    malformed = [
      # numbers outside parenthesis
      "H2",
      # empty parenthesis
      "H()",
      # mismatched parenthesis
      "H(",
      ")H",
      # anything complex
      "H + O"
    ]

    malformed.each do |formula|
      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse_simple(formula) }
    end
  end

  #
  # test class parse
  #

  def test_parse_documentation
    documented = [
      ["H(2)O", "H2O"],
      ["C(52)H(106)", "CH3(CH2)50CH3"],
      ["C(2)H(4)N(2)", "C2H3NO - H2O + NH3"]
    ]
    documented.each do |expected, input|
      assert_equal expected, EmpiricalFormula.parse(input).to_s
    end

    # Block handed to parse: turns a bracketed list of factors such as
    # "[2, 1]" into an EmpiricalFormula and returns nil for anything else.
    bracket_parser = lambda do |formula|
      case formula
      when /\[(.*)\]/
        EmpiricalFormula.new($1.split(/,/).collect {|i| i.strip.to_i })
      else
        nil
      end
    end

    assert_equal "H(4)O(2)", EmpiricalFormula.parse("H2O + [2, 1]", &bracket_parser).to_s
    assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse("H2O + :not_expected", &bracket_parser) }
  end

  # Maps each input formula to the expected to_s of the parsed result.
  def test_parse
    expectations = {
      nil => "",
      "" => "",
      "H" => "H",
      "HO" => "HO",
      "HFe" => "FeH",
      "FeH" => "FeH",
      "OH2" => "H(2)O",
      "H2O" => "H(2)O",
      "C6H12O4" => "C(6)H(12)O(4)",
      "Fe2OMg3" => "Fe(2)Mg(3)O",
      "(H)2" => "H(2)",
      "(OH)2" => "H(2)O(2)",
      "(HFe)" => "FeH",
      "(FeH)" => "FeH",
      "(OH2)2" => "H(4)O(2)",
      "(H2O)2" => "H(4)O(2)",
      "(C6H12O4)2" => "C(12)H(24)O(8)",
      "(Fe2OMg3)2" => "Fe(4)Mg(6)O(2)",
      "C6H12O4(C6H12O4)2C6H12O4" => "C(24)H(48)O(16)",
      "Fe2OMg3(Fe2OMg3(Fe2OMg3))Fe2OMg3" => "Fe(8)Mg(12)O(4)",
      "Fe2OMg3(Fe2OMg3)(Fe2OMg3)Fe2OMg3" => "Fe(8)Mg(12)O(4)",
      "Fe2OMg3(Fe2OMg3(Fe2OMg3)3((C)6H12O4)2)2C" => "C(25)Fe(18)H(48)Mg(27)O(25)",
      " (H2O) 10 0 " => "H(200)O(100)",
      "CH3(CH2)7CH" => "C(9)H(18)",
      "H3NCHCO2" => "C(2)H(4)NO(2)",
      "(CH3)2CuLi" => "C(2)CuH(6)Li",

      # multipart
      "-H" => "H(-1)",
      "H2O-H" => "HO",
      "H2O - (OH)2+ H2O2-H2O" => ""
    }

    expectations.each_pair do |formula, expected|
      parsed = EmpiricalFormula.parse(formula)
      assert_equal expected, parsed.to_s, formula
    end
  end

  def test_parse_fails_for_malformed_formulae
    malformed = [
      # mismatched parenthesis
      "H)2",
      "(H2",
      "(O2(H2)",
      "(O)2H2)",
      # hanging factors
      "2C",
      #"(2)",
      "(2)2",
      "(2C)",
      "(2C)2",
      "C(2C)",
      # empty parenthesis
      "()",
      "()2"
    ]

    malformed.each do |formula|
      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse(formula) }
    end
  end

  #
  # class mass test
  #

  # Not collected by Test::Unit: the name does not start with "test".
  def break_test_class_mass_method
    hydrogen = EmpiricalFormula::Element::H
    oxygen = EmpiricalFormula::Element::O
    water_mass = hydrogen.mass * 2 + oxygen.mass
    assert_equal 18.010565, water_mass

    ["H2O", "H + OH"].each do |formula|
      assert_equal 18.010565, EmpiricalFormula.mass(formula)
    end
    assert_equal 18, EmpiricalFormula.mass("H2O", 0)
  end

  #
  # library molecules
  #

  # Not collected by Test::Unit: the name does not start with "test".
  def break_test_access_library_molecules
    water = EmpiricalFormula::H2O

    assert_equal water, EmpiricalFormula.lookup('h2o')
    assert_equal water, EmpiricalFormula.h2o
    assert_equal 18.010565, EmpiricalFormula.h2o.mass
  end

  # vs the VG Analytical Organic Mass Spectrometry reference, reference date unknown (prior to 2005)
  # the data from the data sheet was copied manually to doc/VG Analytical DataSheet.txt
  #
  # Each data line holds: formula, monoisotopic mass, average mass, separated
  # by single whitespace characters. The lines must stay unindented because
  # they are split on every whitespace character.
  def test_molecule_mass_values_vs_vg_analytical
    str = %Q{
NH2 16.01872 16.0226
OH 17.00274 17.0073
OCH3 31.01839 31.0342
CH3CO 43.01839 43.0452}

    str.split(/\n/).each do |mol_str|
      next if mol_str.empty?

      name, mono_text, average_text = mol_str.split(/\s/)
      monoisotopic = mono_text.to_f
      # parsed but not yet asserted on (see TODO below)
      average = average_text.to_f

      molecule = EmpiricalFormula.parse(name)
      assert_in_delta monoisotopic, molecule.mass, delta_mass, mol_str
      # TODO -- check average mass
    end
  end

  #
  # benchmark
  #

  def test_parse_speed
    benchmark_test(20) do |x|
      n = 10

      %w[H20 H2(H2(H2))H2].each do |formula|
        x.report("#{n}k #{formula}") do
          (n*1000).times { EmpiricalFormula.parse(formula) }
        end
      end
    end
  end

  def test_parse_simple_speed
    benchmark_test(20) do |x|
      n = 10

      %w[H(20) H(2)H(2)H(2)H(2)].each do |formula|
        x.report("#{n}k #{formula}") do
          (n*1000).times { EmpiricalFormula.parse_simple(formula) }
        end
      end
    end
  end
end