molecules 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,91 @@
1
+ require 'molecules/libraries/residue'
2
+
3
module Molecules
  module Libraries

    # Represents a polypeptide as a sequence of residues. For convenience,
    # polypeptides may contain whitespace in their sequences (thus allowing
    # direct use with parsed FASTA formatted peptide sequences).
    #
    # Currently polypeptide only handles sequences with common residues.
    class Polypeptide < EmpiricalFormula

      class << self
        # Normalizes the input sequence by removing whitespace and capitalizing.
        def normalize(sequence)
          sequence.gsub(/\s/, "").upcase
        end
      end

      # The sequence of self (including whitespace)
      attr_reader :sequence

      # A hash of (Residue, Integer) pairs defining the number of a given residue in self.
      attr_reader :residue_composition

      # The number of residues in self (may differ from sequence.length
      # if sequence contains whitespace).
      attr_reader :length

      # An array of tokens that may occur in a sequence, grouped
      # as patterns (ie one token for all whitespace characters, and
      # one token for each residue). Used to count the number of
      # each type of residue in a sequence.
      #
      # The whitespace token lists every character matched by /\s/
      # (space, tab, CR, LF, form feed, vertical tab) so that any
      # whitespace stripped by Polypeptide.normalize is also accepted
      # by Polypeptide.new rather than reported as an unknown residue.
      # The array is frozen; Utils.count returns a fresh array, so the
      # shifts performed in initialize never touch this constant.
      SEQUENCE_TOKENS = (["\s\t\r\n\f\v"] + Residue.common.collect {|r| r.letter}).freeze

      # Builds the polypeptide from sequence, tallying the residue
      # composition, the residue count, and the summed formula factors.
      #
      # Raises UnknownResidueError if sequence contains any character
      # that is neither whitespace nor the letter of a common residue.
      def initialize(sequence)
        @sequence = sequence

        @length = 0
        @residue_composition = {}
        @formula = Array.new(5, 0)

        # count up the number of whitespaces and residues in self;
        # the first count is whitespace, the rest follow the order
        # of Residue.common (the order used to build SEQUENCE_TOKENS)
        tokens = Utils.count(sequence, SEQUENCE_TOKENS)
        whitespace = tokens.shift

        if whitespace == sequence.length
          # as per the Base specification, factors
          # should have no trailing zeros
          @formula.clear
          return
        end

        # add the residue masses and factors
        Residue.common.each do |residue|
          # benchmarks indicated that counting for each residue
          # is quicker than trying anything like:
          #
          #   sequence.each_byte {|b| bytes[b] += 1}
          #
          # This is particularly an issue for long sequences. The
          # count operation could be optimized for isobaric residues
          n = tokens.shift
          next if n == 0

          @length += n
          @residue_composition[residue] = n
          Utils.add(@formula, residue.formula, n)
        end

        if @length + whitespace != sequence.length
          # raise an error if there are unaccounted characters
          raise UnknownResidueError, "unknown characters in sequence: #{sequence}"
        end
      end

      # Sequentially passes each residue in sequence to the block.
      # Bytes with no entry in Residue.residue_index (for example
      # whitespace) are skipped.
      def each_residue
        residues = Residue.residue_index
        sequence.each_byte do |byte|
          residue = residues[byte]
          yield(residue) if residue
        end
      end

      class UnknownResidueError < StandardError # :nodoc:
      end

    end
  end
end
@@ -0,0 +1,165 @@
1
+ require 'constants/library'
2
+ require 'molecules/empirical_formula'
3
+
4
module Molecules
  module Libraries

    # A library of amino acid residues.
    #
    #   r = Residue::A
    #   r.name                # => "Alanine"
    #   r.abbr                # => "Ala"
    #   r.letter              # => "A"
    #   r.side_chain.to_s     # => "CH(3)"
    #
    class Residue < EmpiricalFormula

      class << self
        # The 20 common amino acids.
        def common
          collection(:common)
        end

        # An array of the residues indexed by the byte
        # corresponding to the residue letter.
        def residue_index
          collection(:residue_index)
        end

        # An array of the residue masses indexed by the byte
        # corresponding to the residue letter.
        def residue_mass_index
          collection(:residue_mass_index)
        end
      end

      # The full name of self
      attr_reader :name

      # The (typically) 3-letter abbreviation of self
      attr_reader :abbr

      # The letter code for self
      attr_reader :letter

      # The byte corresponding to letter
      attr_reader :byte

      # An EmpiricalFormula representing the side chain of self
      attr_reader :side_chain

      # A symbol classification of self
      attr_reader :type

      # The unrounded monoisotopic side chain mass of self
      attr_reader :side_chain_mass

      # The uncharged, unrounded, monoisotopic residue mass of self
      # (the backbone plus side chain mass, with no N- or C-terminus)
      attr_reader :residue_mass

      # The unrounded mass of the immonium ion of self
      # (residue_mass + DELTA_IMMONIUM.mass)
      attr_reader :immonium_ion_mass

      # Creates a residue whose formula is the side chain plus BACKBONE.
      #
      # letter::             the one-letter code, or nil if there is none
      # abbr::               the (typically) 3-letter abbreviation
      # name::               the full name
      # side_chain_formula:: a formula string parsed by EmpiricalFormula.parse_simple
      # classification::     a symbol such as :common, :standard or :uncommon
      def initialize(letter, abbr, name, side_chain_formula, classification=nil)
        @side_chain = EmpiricalFormula.parse_simple(side_chain_formula)
        # the side chain factors are dup'ed because Utils.add modifies
        # its first argument in place
        super( Utils.add(side_chain.formula.dup, BACKBONE.formula), false)

        @letter = letter
        @abbr = abbr
        @name = name
        @classification = classification
        # the type reader and composite? both read @type, so it must
        # be assigned here; previously it was never set and type was
        # always nil
        @type = classification

        @side_chain_mass = side_chain.mass
        @residue_mass = mass
        @immonium_ion_mass = @residue_mass + DELTA_IMMONIUM.mass

        # the first byte of letter, or nil when there is no letter
        @byte = letter.nil? ? nil : letter.unpack("C").first
      end

      # True if the residue of type :common
      def common?
        @classification == :common
      end

      # True if the residue is type :common or :standard.
      def standard?
        @classification == :common || @classification == :standard
      end

      # True if the residue is a composite representing a set of isobaric residues
      def composite?
        @type == :composite
      end

      # An EmpiricalFormula for the residue backbone
      BACKBONE = EmpiricalFormula.parse_simple('C(2)H(2)NO')

      # Add to a Residue to achieve an immonium ion
      DELTA_IMMONIUM = EmpiricalFormula.parse('-CO+H')

      A = Residue.new('A', "Ala", "Alanine", "CH(3)", :common)
      C = Residue.new('C', "Cys", "Cysteine", "CH(3)S", :common)
      D = Residue.new('D', "Asp", "Aspartic Acid", "C(2)H(3)O(2)", :common)
      E = Residue.new('E', "Glu", "Glutamic Acid", "C(3)H(5)O(2)", :common)
      F = Residue.new('F', "Phe", "Phenylalanine", "C(7)H(7)", :common)
      G = Residue.new('G', "Gly", "Glycine", "H", :common)
      H = Residue.new('H', "His", "Histidine", "C(4)H(5)N(2)", :common)
      I = Residue.new('I', "Ile", "Isoleucine", "C(4)H(9)", :common)
      K = Residue.new('K', "Lys", "Lysine", "C(4)H(10)N", :common)
      L = Residue.new('L', "Leu", "Leucine", "C(4)H(9)", :common)
      M = Residue.new('M', "Met", "Methionine", "C(3)H(7)S", :common)
      N = Residue.new('N', "Asn", "Asparagine", "C(2)H(4)NO", :common)
      # Pyrrolysine is C(12)H(21)N(3)O(3), so the residue is
      # C(12)H(19)N(3)O(2) and the side chain (residue minus BACKBONE)
      # is C(10)H(17)N(2)O
      O = Residue.new('O', "Pyl", "Pyrrolysine", "C(10)H(17)N(2)O", :standard)
      P = Residue.new('P', "Pro", "Proline", "C(3)H(5)", :common)
      Q = Residue.new('Q', "Gln", "Glutamine", "C(3)H(6)NO", :common)
      R = Residue.new('R', "Arg", "Arginine", "C(4)H(10)N(3)", :common)
      S = Residue.new('S', "Ser", "Serine", "CH(3)O", :common)
      T = Residue.new('T', "Thr", "Threonine", "C(2)H(5)O", :common)
      U = Residue.new('U', "Sec", "Selenocysteine", "CH(3)Se", :standard)
      V = Residue.new('V', "Val", "Valine", "C(3)H(7)", :common)
      W = Residue.new('W', "Trp", "Tryptophan", "C(9)H(8)N", :common)
      Y = Residue.new('Y', "Tyr", "Tyrosine", "C(7)H(7)O", :common)

      ORN = Residue.new(nil, "Orn", "Ornithine", "C(3)H(8)N", :uncommon)
      ABA = Residue.new(nil, 'Aba', 'Aminobutyric Acid', 'C(2)H(5)', :uncommon)
      AECYS = Residue.new(nil, 'AECys','Aminoethylcysteine', 'C(3)H(8)NS', :uncommon)
      AIB = Residue.new(nil, 'Aib', 'alpha-Aminoisobutyric Acid', 'C(2)H(5)', :uncommon)
      CMCYS = Residue.new(nil, 'CMCys','Carboxymethylcysteine', 'C(3)H(5)O(2)S', :uncommon)
      DHA = Residue.new(nil, 'Dha', 'Dehydroalanine', 'CH', :uncommon)
      DHB = Residue.new(nil, 'Dhb', 'Dehydroamino-alpha-butyric Acid', 'C(2)H(3)', :uncommon)
      HYL = Residue.new(nil, 'Hyl', 'Hydroxylysine', 'C(4)H(10)NO', :uncommon)
      HYP = Residue.new(nil, 'Hyp', 'Hydroxyproline', 'C(3)H(5)O', :uncommon)
      IVA = Residue.new(nil, 'Iva', 'Isovaline', 'C(3)H(7)', :uncommon)
      NLEU = Residue.new(nil, 'nLeu', 'Norleucine', 'C(4)H(9)', :uncommon)
      PIP = Residue.new(nil, 'Pip', '2-Piperidinecarboxylic Acid', 'C(4)H(7)', :uncommon)
      PGLU = Residue.new(nil, 'pGlu', 'Pyroglutamic Acid', 'C(3)H(3)O', :uncommon)
      SAR = Residue.new(nil, 'Sar', 'Sarcosine', 'CH(3)', :uncommon)

      # NOTE(review): included after the constants above are defined;
      # presumably Constants::Library picks up the existing constants
      # at include time -- confirm before reordering.
      include Constants::Library

      library.index_by_attribute :letter
      library.index_by_attribute :abbr
      library.index_by_attribute :name

      # backs Residue.common
      library.collect(:common) do |residue|
        residue.common? ? residue : nil
      end

      # backs Residue.residue_index (common residues only)
      library.collect(:residue_index) do |residue|
        next unless residue.common?
        [residue, residue.byte]
      end

      # backs Residue.residue_mass_index (common residues only)
      library.collect(:residue_mass_index) do |residue|
        next unless residue.common?
        [residue.residue_mass, residue.byte]
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module Molecules

  # Small numeric and array helpers shared by EmpiricalFormula and other
  # parts of the library. They are called very frequently, which makes
  # them natural targets for optimization (for example using RubyInline).
  module Utils
    module_function

    # Returns n rounded to precision decimal places, as a Float.
    def round(n, precision)
      scale = 10**precision.to_i
      scaled = (n * scale).round
      scaled.to_f / scale
    end

    # Adds n times each element of b onto the element of a at the
    # same index. a is modified in place: it is first padded with
    # zeros to the length of b if shorter, and afterwards any
    # trailing zeros are dropped. Returns a.
    def add(a, b, n=1)
      shortfall = b.length - a.length
      a.concat(Array.new(shortfall, 0)) if shortfall > 0

      # a plain index walk; the original author measured
      # each_with_index as slower here
      b.length.times do |index|
        a[index] += n * b[index]
      end

      a.pop while a.last == 0
      a
    end

    # Scales every element of a by factor in place and returns a.
    # A factor of zero empties a instead.
    def multiply(a, factor)
      return a.clear if factor == 0
      a.map! {|value| value * factor}
    end

    # Returns, for each pattern, how many characters of str belong
    # to that pattern's character set (as per String#count):
    #
    #   count("abcabca", ["a", "b", "c"])  # => [3, 2, 2]
    #   count("abcabca", ["a", "bc"])      # => [3, 4]
    #
    def count(str, patterns)
      patterns.map {|charset| str.count(charset)}
    end
  end

end
data/tap.yml ADDED
File without changes
@@ -0,0 +1,37 @@
1
+ require File.join(File.dirname(__FILE__), '../tap_test_helper.rb')
2
+ require 'molecules/calc'
3
+
4
# Runs the Molecules::Calc task through the tap test harness and checks
# the masses it reports.
class Molecules::CalcTest < Test::Unit::TestCase
  acts_as_tap_test

  # The Molecules::Calc instance under test; rebuilt by setup.
  attr_reader :t

  def setup
    super
    @t = Molecules::Calc.new
  end

  # A single formula with the task's default settings.
  def test_mass_calculation
    expected = [[Unit.new(18.0105646863, "Da")]]

    t.enq("H2O")
    app.run

    assert_equal expected, app.results(t)
  end

  # Two formulae passed in one enq call, with precision set to 2.
  def test_mass_calculation_with_precision
    expected = [[Unit.new(18.01, "Da"), Unit.new(35.04, "Da")]]

    t.precision = 2
    t.enq("H2O", "NH3 + H2O")
    app.run

    assert_equal expected, app.results(t)
  end

  # Precision 3 with the result reported in "yg" rather than "Da".
  def test_mass_calculation_with_precision_and_unit_conversion
    expected = [[Unit.new(29.907, "yg")]]

    t.units = "yg"
    t.precision = 3
    t.enq("H2O")
    app.run

    assert_equal expected, app.results(t)
  end
end
@@ -0,0 +1,196 @@
1
+ require File.join(File.dirname(__FILE__), '../molecules_test_helper.rb')
2
+ require 'molecules/empirical_formula'
3
+
4
# Tests for the class-level methods of EmpiricalFormula (parse_simple,
# parse, mass lookups) plus a couple of parse benchmarks.
class EmpiricalFormulaClassTest < Test::Unit::TestCase
  include Molecules

  #
  # parse_simple test
  #

  def test_parse_simple_documentation
    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H(2)O").to_s
    assert_equal "H(2)O", EmpiricalFormula.parse_simple("H (2) O").to_s
    assert_equal "H(2)O", EmpiricalFormula.parse_simple("HO(-1)O(2)H").to_s
  end

  def test_parse_simple
    assert_equal([2,1], EmpiricalFormula.parse_simple("HO(-1)O(2)H").formula)
    assert_equal([2,1], EmpiricalFormula.parse_simple("H O (-1 )O( 2) H ").formula)
  end

  def test_parse_simple_fails_for_malformed_formulae
    [
      # numbers outside parenthesis
      "H2",
      # empty parenthesis
      "H()",
      # mismatched parenthesis
      "H(",
      ")H",
      # anything complex
      "H + O"
    ].each do |formula|
      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse_simple(formula) }
    end
  end

  #
  # test class parse
  #

  def test_parse_documentation
    assert_equal "H(2)O", EmpiricalFormula.parse("H2O").to_s
    assert_equal "C(52)H(106)", EmpiricalFormula.parse("CH3(CH2)50CH3").to_s
    assert_equal "C(2)H(4)N(2)", EmpiricalFormula.parse("C2H3NO - H2O + NH3").to_s

    # handles bracketed factor lists such as "[2, 1]"; returns nil
    # for anything else
    block = lambda do |formula|
      case formula
      when /\[(.*)\]/
        factors = $1.split(/,/).collect {|i| i.strip.to_i }
        EmpiricalFormula.new(factors)
      else nil
      end
    end

    assert_equal "H(4)O(2)", EmpiricalFormula.parse("H2O + [2, 1]", &block).to_s
    assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse("H2O + :not_expected", &block) }
  end

  def test_parse
    {
      nil => "",
      "" => "",
      "H" => "H",
      "HO" => "HO",
      "HFe" => "FeH",
      "FeH" => "FeH",
      "OH2" => "H(2)O",
      "H2O" => "H(2)O",
      "C6H12O4" => "C(6)H(12)O(4)",
      "Fe2OMg3" => "Fe(2)Mg(3)O",
      "(H)2" => "H(2)",
      "(OH)2" => "H(2)O(2)",
      "(HFe)" => "FeH",
      "(FeH)" => "FeH",
      "(OH2)2" => "H(4)O(2)",
      "(H2O)2" => "H(4)O(2)",
      "(C6H12O4)2" => "C(12)H(24)O(8)",
      "(Fe2OMg3)2" => "Fe(4)Mg(6)O(2)",
      "C6H12O4(C6H12O4)2C6H12O4" => "C(24)H(48)O(16)",
      "Fe2OMg3(Fe2OMg3(Fe2OMg3))Fe2OMg3" => "Fe(8)Mg(12)O(4)",
      "Fe2OMg3(Fe2OMg3)(Fe2OMg3)Fe2OMg3" => "Fe(8)Mg(12)O(4)",
      "Fe2OMg3(Fe2OMg3(Fe2OMg3)3((C)6H12O4)2)2C" => "C(25)Fe(18)H(48)Mg(27)O(25)",
      " (H2O) 10 0 " => "H(200)O(100)",
      "CH3(CH2)7CH" => "C(9)H(18)",
      "H3NCHCO2" => "C(2)H(4)NO(2)",
      "(CH3)2CuLi" => "C(2)CuH(6)Li",

      # multipart
      "-H" => "H(-1)",
      "H2O-H" => "HO",
      "H2O - (OH)2+ H2O2-H2O" => ""
    }.each_pair do |formula, composition_str|
      m = EmpiricalFormula.parse(formula)
      assert_equal composition_str, m.to_s, formula
    end
  end

  def test_parse_fails_for_malformed_formulae
    [
      # mismatched parenthesis
      "H)2",
      "(H2",
      "(O2(H2)",
      "(O)2H2)",
      # hanging factors
      "2C",
      #"(2)",
      "(2)2",
      "(2C)",
      "(2C)2",
      "C(2C)",
      # empty parenthesis
      "()",
      "()2"
    ].each do |formula|
      assert_raise(EmpiricalFormula::ParseError) { EmpiricalFormula.parse(formula) }
    end
  end

  #
  # class mass test
  #

  # NOTE(review): the break_ prefix keeps Test::Unit from running this
  # method (only test* methods are collected) -- presumably disabled
  # on purpose; confirm before re-enabling.
  def break_test_class_mass_method
    water_mass = EmpiricalFormula::Element::H.mass * 2 + EmpiricalFormula::Element::O.mass
    assert_equal 18.010565, water_mass

    assert_equal 18.010565, EmpiricalFormula.mass("H2O")
    assert_equal 18.010565, EmpiricalFormula.mass("H + OH")
    assert_equal 18, EmpiricalFormula.mass("H2O", 0)
  end

  #
  # library molecules
  #

  # NOTE(review): also not run by Test::Unit because of the break_ prefix.
  def break_test_access_library_molecules
    water = EmpiricalFormula::H2O

    assert_equal water, EmpiricalFormula.lookup('h2o')
    assert_equal water, EmpiricalFormula.h2o
    assert_equal 18.010565, EmpiricalFormula.h2o.mass
  end

  # vs the VG Analytical Organic Mass Spectrometry reference, reference date unknown (prior to 2005)
  # the data from the data sheet was copied manually to doc/VG Analytical DataSheet.txt
  def test_molecule_mass_values_vs_vg_analytical
    # columns: formula, monoisotopic mass, average mass
    str = %Q{
      NH2 16.01872 16.0226
      OH 17.00274 17.0073
      OCH3 31.01839 31.0342
      CH3CO 43.01839 43.0452}

    str.split(/\n/).each do |line|
      # strip first so indented or whitespace-only lines are handled
      mol_str = line.strip
      next if mol_str.empty?

      # split on runs of whitespace so tabs or repeated spaces
      # between columns do not produce empty fields
      name, monoisotopic, average = mol_str.split
      monoisotopic = monoisotopic.to_f
      average = average.to_f

      molecule = EmpiricalFormula.parse(name)
      assert_in_delta monoisotopic, molecule.mass, delta_mass, mol_str
      # TODO -- check average mass
    end
  end

  #
  # benchmark
  #

  def test_parse_speed
    benchmark_test(20) do |x|
      n = 10

      ["H20","H2(H2(H2))H2"].each do |formula|
        x.report("#{n}k #{formula}") do
          (n*1000).times { EmpiricalFormula.parse(formula) }
        end
      end
    end
  end

  def test_parse_simple_speed
    benchmark_test(20) do |x|
      n = 10

      ["H(20)","H(2)H(2)H(2)H(2)"].each do |formula|
        x.report("#{n}k #{formula}") do
          (n*1000).times { EmpiricalFormula.parse_simple(formula) }
        end
      end
    end
  end
end