ms-in_silico 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/MIT-LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2008, Regents of the University of Colorado.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this
4
+ software and associated documentation files (the "Software"), to deal in the Software
5
+ without restriction, including without limitation the rights to use, copy, modify, merge,
6
+ publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
7
+ to whom the Software is furnished to do so, subject to the following conditions:
8
+
9
+ The above copyright notice and this permission notice shall be included in all copies or
10
+ substantial portions of the Software.
11
+
12
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
16
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
17
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,55 @@
1
+ = {Ms-InSilico}[http://mspire.rubyforge.org/projects/ms-in_silico]
2
+
3
+ An {Mspire}[http://mspire.rubyforge.org] library supporting in-silico calculations for mass spec data.
4
+
5
+ == Description
6
+
7
+ Ms-InSilico provides the following modules:
8
+
9
+ * Ms::InSilico::Digester (protein digestion)
10
+ * Ms::InSilico::Spectrum (peptide fragmentation)
11
+
12
+ Corresponding Tap[http://tap.rubyforge.org] tasks are also provided.
13
+
14
+ * Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/16692-mspire/tickets]
15
+ * Github[http://github.com/bahuvrihi/ms-in_silico/tree/master]
16
+ * {Google Group}[http://groups.google.com/group/mspire-forum]
17
+
18
+ == Usage
19
+
20
+ require 'ms/in_silico/digester'
21
+ require 'ms/in_silico/spectrum'
22
+ include Ms::InSilico
23
+
24
+ trypsin = Digester['Trypsin']
25
+ peptides = trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
26
+ # => [
27
+ # 'MIVIGR',
28
+ # 'SIVHPYITNEYEPFAAEK',
29
+ # 'QQILSIMAG']
30
+
31
+ spectrum = Spectrum.new(peptides[0])
32
+ spectrum.parent_ion_mass
33
+ # => 688.417442373391
34
+
35
+ spectrum.series('b')
36
+ # => [
37
+ # 132.047761058391,
38
+ # 245.131825038791,
39
+ # 344.200238954991,
40
+ # 457.284302935391,
41
+ # 514.305766658991,
42
+ # 670.406877687091]
43
+
44
+ == Installation
45
+
46
+ Ms-InSilico is available as a gem on RubyForge[http://rubyforge.org/projects/mspire]. Use:
47
+
48
+ % gem install ms-in_silico
49
+
50
+ == Info
51
+
52
+ Copyright (c) 2006-2008, Regents of the University of Colorado.
53
+ Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
54
+ Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
55
+ Licence:: {MIT-Style}[link:files/MIT-LICENSE.html]
@@ -0,0 +1,35 @@
1
+ require 'ms/in_silico/digester'
2
+
3
+ module Ms
4
+ module InSilico
5
+ # Ms::InSilico::Digest::manifest digest a protein sequence into peptides
6
+ # Digest a protein sequence into an array of peptides.
7
+ #
8
+ # % rap digest MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG --+ dump --no-audit
9
+ # I[14:37:55] digest MIVIGRSIVHP... to 3 peptides
10
+ # # date: 2008-09-15 14:37:55
11
+ # ---
12
+ # ms/in_silico/digest (23483900):
13
+ # - - MIVIGR
14
+ # - SIVHPYITNEYEPFAAEK
15
+ # - QQILSIMAG
16
+ #
17
+ class Digest < Tap::Task
18
+
19
+ config :digester, 'Trypsin' # the name of the digester
20
+ config :max_misses, 0, &c.integer # the max # of missed cleavage sites
21
+ config :site_digest, false, &c.boolean # digest to sites (rather than sequences)
22
+
23
# Digests sequence using the digester named by the digester config.
#
# The digester is looked up with Digester[]; an ArgumentError is raised
# if the lookup returns nothing.  The result is produced by site_digest
# when the site_digest config is set and by digest otherwise, passing
# max_misses in both cases.  A short preview of the sequence and the
# number of results are logged before the results are returned.
def process(sequence)
  enzyme = Digester[digester]
  raise ArgumentError, "unknown digester: #{digester}" unless enzyme

  peptides = if site_digest
    enzyme.site_digest(sequence, max_misses)
  else
    enzyme.digest(sequence, max_misses)
  end

  # the preview holds the first 11 residues, so the ellipsis is only
  # added when the sequence is actually longer than the preview
  preview = sequence[0..10]
  preview += '...' if sequence.length > 11
  log 'digest', "#{preview} to #{peptides.length} peptides"
  peptides
end
32
+
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,263 @@
1
+ require 'constants/library'
2
+ require 'strscan'
3
+
4
+ module Ms
5
+ module InSilico
6
+
7
+ # Digester splits a protein sequence into peptides at sites specified
8
+ # during initialization; in short Digester models a cleavage enzyme.
9
+ # Digesters support missed cleavage sites, and can return either the
10
+ # peptide strings or the cleavage sites.
11
+ #
12
+ # Digester includes {Constants::Library}[http://bioactive.rubyforge.org/constants/classes/Constants/Library.html],
13
+ # allowing access to many common digesters using Digester[]:
14
+ #
15
+ # trypsin = Digester['Trypsin']
16
+ # trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
17
+ # # => [
18
+ # # 'MIVIGR',
19
+ # # 'SIVHPYITNEYEPFAAEK',
20
+ # # 'QQILSIMAG']
21
+ #
22
+ # trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
23
+ # # => [
24
+ # # 'MIVIGR',
25
+ # # 'MIVIGRSIVHPYITNEYEPFAAEK',
26
+ # # 'SIVHPYITNEYEPFAAEK',
27
+ # # 'SIVHPYITNEYEPFAAEKQQILSIMAG',
28
+ # # 'QQILSIMAG'
29
+ # # ]
30
+ #
31
+ # trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
32
+ # # => [
33
+ # # [0,6],
34
+ # # [0,24],
35
+ # # [6,24],
36
+ # # [6,33],
37
+ # # [24,33]
38
+ # # ]
39
+ #
40
+ # ==== Enzymes
41
+ #
42
+ # Enzymes in the library were adapted from the default Mascot[http://www.matrixscience.com/]
43
+ # enzyme list. Currently supported enzymes include:
44
+ #
45
+ # * Arg-C
46
+ # * Asp-N
47
+ # * Asp-N_ambic
48
+ # * Chymotrypsin
49
+ # * CNBr
50
+ # * Lys-C
51
+ # * Lys-C/P
52
+ # * PepsinA
53
+ # * Tryp-CNBr
54
+ # * TrypChymo
55
+ # * Trypsin/P
56
+ # * V8-DE
57
+ # * V8-E
58
+ # * Trypsin
59
+ # * V8-E+Trypsin
60
+ # * V8-DE+Trypsin
61
+ #
62
+ # Several enzymes require two or more digesters, or functionality that
63
+ # is not provided by Digester, and so remain unsupported:
64
+ #
65
+ # * CNBr+Trypsin
66
+ # * Formic_acid
67
+ # * LysC+AspN
68
+ # * semiTrypsin
69
+ #
70
+ class Digester
71
+
72
+ # The name of the digester
73
+ attr_reader :name
74
+
75
+ # A string of residues at which cleavage occurs
76
+ attr_reader :cleave_str
77
+
78
+ # A c-terminal restriction residue which prevents
79
+ # cleavage at a potential cleavage site (optional).
80
+ attr_reader :cterm_exception
81
+
82
+ # True if cleavage occurs at the c-terminus of a
83
+ # cleavage residue, false if cleavage occurs at
84
+ # the n-terminus.
85
+ attr_reader :cterm_cleavage
86
+
87
+ # a multiline whitespace regexp
88
+ WHITESPACE = /\s*/m
89
+
90
# Initializes a new Digester.
#
# name::            the name of the digester
# cleave_str::      a string of residues at which cleavage occurs
# cterm_exception:: an optional single residue; nil or an empty string
#                   means there is no exception residue
# cterm_cleavage::  true if cleavage occurs at the c-terminus of a
#                   cleavage residue, false for the n-terminus
#
# Raises an ArgumentError if cterm_exception is longer than one residue.
def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
  @name = name
  @cleave_str = cleave_str

  # Regexp.union escapes each residue, so characters with a special
  # meaning in a regexp are matched literally, and an empty cleave_str
  # yields a regexp that never matches (rather than an empty regexp
  # that matches at every position).
  @cleave_regexp = Regexp.union(*cleave_str.split(''))

  @cterm_exception = case
  when cterm_exception.nil? || cterm_exception.empty? then nil
  when cterm_exception.length == 1 then cterm_exception[0]
  else
    raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
  end

  @cterm_cleavage = cterm_cleavage
  @scanner = StringScanner.new('')
end
107
+
108
# Returns the digestion sites in sequence, as determined by the
# cleave_regexp boundaries.  The digestion sites correspond to the
# positions where a peptide begins and ends, such that
# [sites[n], sites[n+1] - sites[n]] is the [index, length] of peptide n.
#
#   d = Digester.new('Trypsin', 'KR', 'P')
#   seq = "AARGGR"
#   sites = d.cleavage_sites(seq)          # => [0, 3, 6]
#
#   seq[sites[0], sites[0+1] - sites[0]]   # => "AAR"
#   seq[sites[1], sites[1+1] - sites[1]]   # => "GGR"
#
# Trailing whitespace is included in the fragment.
#
#   seq = "AAR  \n  GGR"
#   sites = d.cleavage_sites(seq)          # => [0, 8, 11]
#
#   seq[sites[0], sites[0+1] - sites[0]]   # => "AAR  \n  "
#   seq[sites[1], sites[1+1] - sites[1]]   # => "GGR"
#
# The digested section of sequence may be specified using offset
# and length.  The first site is always offset and the last site is
# always offset + length.
def cleavage_sites(seq, offset=0, length=seq.length-offset)
  # n-terminal cleavage happens one position before the scanner position
  adjustment = cterm_cleavage ? 0 : 1
  limit = offset + length

  positions = [offset]
  scan(seq, offset, limit) do |pos|
    site = pos - adjustment

    # an n-terminal cleavage residue at the very start would repeat the
    # opening site and produce an empty fragment, so skip it
    positions << site if site > positions.last
  end

  # Close the final fragment.  The check uses the last recorded site
  # rather than the scanner position, which may sit at or beyond limit
  # even though the last recorded site is still short of it.
  positions << limit if positions.last < limit || positions.length == 1

  positions
end
146
+
147
# Returns the digestion sites of sequence as [start_index, end_index]
# pairs, allowing for up to max_misses missed cleavages.  The sites come
# from cleavage_sites; as with that method, offset and length select
# the section of sequence that is digested.
#
# When a block is given each start_index, end_index pair is yielded to
# it and the block results are collected and returned instead of the
# pairs themselves.
def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset) # :yields: start_index, end_index
  sites = cleavage_sites(seq, offset, length)

  # overlay works on indices into sites; translate them into positions
  overlay(sites.length, max_misses, 1) do |first, last|
    span = [sites[first], sites[last]]
    block_given? ? yield(*span) : span
  end
end
164
+
165
# Digests sequence into an array of peptide strings, allowing for up to
# max_misses missed cleavage sites.  The peptide boundaries are the
# pairs returned by site_digest, so offset and length select the
# digested section of sequence exactly as they do there.
def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
  spans = site_digest(seq, max_misses, offset, length)
  spans.map {|first, last| seq[first, last - first] }
end
174
+
175
+ protected
176
+
177
+ # The cleavage regexp used to identify cleavage sites
178
+ attr_reader :cleave_regexp # :nodoc:
179
+
180
+ # The scanner used to digest strings.
181
+ attr_reader :scanner # :nodoc:
182
+
183
# Scans seq from offset for matches of cleave_regexp.  After each match
# any following whitespace is consumed, and the resulting scanner
# position is yielded to the block unless the residue at that position
# is the cterm_exception.  Scanning stops at the first position past
# limit, or when there are no more matches.
#
# Returns the final position of the scanner.
def scan(seq, offset, limit) # :nodoc:
  sc = scanner
  sc.string = seq
  sc.pos = offset

  while sc.search_full(cleave_regexp, true, false)
    # trailing whitespace belongs to the fragment that just ended
    sc.search_full(WHITESPACE, true, false)
    here = sc.pos

    # no cleavage when the next residue is the exception residue
    blocked = !cterm_exception.nil? && seq[here] == cterm_exception
    next if blocked

    break if here > limit
    yield here
  end

  sc.pos
end
205
+
206
# Performs an overlap-collect algorithm over n boundaries.  For each
# start index from 0 to n-1, yields (start_index, end_index) for the
# spans that skip 0 up to max_misses boundaries, where the first
# end_index is start_index + offset.  Spans whose end_index falls
# outside the n boundaries are never yielded.
#
# Returns an array of the block results.
def overlay(n, max_misses, offset) # :nodoc:
  results = []
  0.upto(n - 1) do |start_index|
    0.upto(max_misses) do |n_miss|
      end_index = start_index + offset + n_miss

      # >= rather than ==, so that an offset greater than one cannot
      # step over n and yield an out-of-range end_index
      break if end_index >= n

      results << yield(start_index, end_index)
    end
  end
  results
end
220
+
221
+ #
222
+ # Enzymes adapted from the default Mascot enzyme list.
223
+ #
224
+
225
+ class << self
226
+ protected
227
+
228
# Utility method to build a Digester from a mascot enzyme configuration
# string.  The string is split on tabs (spaces around a tab are
# dropped) into name, sense, cleave_str, and cterm_exception, followed
# by two further fields that are not used.  A sense of 'C-Term' means
# c-terminal cleavage and 'N-Term' means n-terminal cleavage; any other
# sense raises an ArgumentError.
def mascot_parse(str) # :nodoc:
  fields = str.split(/ *\t */)
  name, sense, cleave_str, cterm_exception = fields

  cterm_cleavage =
    if sense == 'C-Term' then true
    elsif sense == 'N-Term' then false
    else raise ArgumentError, "unknown sense: #{sense}"
    end

  new(name, cleave_str, cterm_exception, cterm_cleavage)
end
240
+ end
241
+
242
# Each definition lists the fields name, sense, cleavage residues,
# c-terminal exception, independent, and semi-specific.  The fields are
# separated with explicit tab escapes because mascot_parse splits on
# tabs; an enzyme without an exception residue has an empty fourth
# field.
ARG_C = mascot_parse("Arg-C\tC-Term\tR\tP\tno\tno")
ASP_N = mascot_parse("Asp-N\tN-Term\tBD\t\tno\tno")
ASP_N_AMBIC = mascot_parse("Asp-N_ambic\tN-Term\tDE\t\tno\tno")
CHYMOTRYPSIN = mascot_parse("Chymotrypsin\tC-Term\tFLWY\tP\tno\tno")
CNBR = mascot_parse("CNBr\tC-Term\tM\t\tno\tno")
LYS_C = mascot_parse("Lys-C\tC-Term\tK\tP\tno\tno")
LYS_C_P = mascot_parse("Lys-C/P\tC-Term\tK\t\tno\tno")
PEPSIN_A = mascot_parse("PepsinA\tC-Term\tFL\t\tno\tno")
TRYP_CNBR = mascot_parse("Tryp-CNBr\tC-Term\tKMR\tP\tno\tno")
TRYP_CHYMO = mascot_parse("TrypChymo\tC-Term\tFKLRWY\tP\tno\tno")
TRYPSIN_P = mascot_parse("Trypsin/P\tC-Term\tKR\t\tno\tno")
V8_DE = mascot_parse("V8-DE\tC-Term\tBDEZ\tP\tno\tno")
V8_E = mascot_parse("V8-E\tC-Term\tEZ\tP\tno\tno")
TRYPSIN = mascot_parse("Trypsin\tC-Term\tKR\tP\tno\tno")
V8_E_TRYPSIN = mascot_parse("V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno")
V8_DE_TRYPSIN = mascot_parse("V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno")
258
+
259
+ include Constants::Library
260
+ library.index_by_attribute :name
261
+ end
262
+ end
263
+ end
@@ -0,0 +1,74 @@
1
+ require 'ms/in_silico/spectrum'
2
+
3
+ module Ms
4
+ module InSilico
5
+
6
+ # Ms::InSilico::Fragment::manifest calculates a theoretical ms/ms spectrum
7
+ #
8
+ # Calculates the parent ion mass and theoretical ms/ms spectrum for a
9
+ # peptide sequence. Configurations allow the specification of one or
10
+ # more fragmentation series to include, as well as charge, and intensity.
11
+ #
12
+ # % rap fragment TVQQEL --+ dump --no-audit
13
+ # # date: 2008-09-15 14:37:55
14
+ # ---
15
+ # ms/in_silico/fragment (:...:):
16
+ # - - 717.377745628191
17
+ # - - 102.054954926291
18
+ # - 132.101905118891
19
+ # - 201.123368842491
20
+ # - 261.144498215091
21
+ # - 329.181946353891
22
+ # - 389.203075726491
23
+ # - 457.240523865291
24
+ # - 517.261653237891
25
+ # - 586.283116961491
26
+ # - 616.330067154091
27
+ # - 699.367180941891
28
+ # - 717.377745628191
29
+ #
30
+ # In the output, the parent ion mass is given first, followed by an
31
+ # array of the sorted fragmentation data.
32
+ class Fragment < Tap::Task
33
+
34
# A block to validate a config input: an EmpiricalFormula is
# returned as-is, and any other value is handed to
# EmpiricalFormula.parse.
MOLECULE = lambda do |value|
  if value.kind_of?(Molecules::EmpiricalFormula)
    value
  else
    Molecules::EmpiricalFormula.parse(value)
  end
end
42
+
43
+ config :series, ['y', 'b'], &c.array # a list of the series to include
44
+ config :charge, 1, &c.integer # the charge for the parent ion
45
+ config :intensity, nil, &c.num_or_nil # a uniform intensity value
46
+ config :nterm, 'H', &MOLECULE # the n-terminal modification
47
+ config :cterm, 'OH', &MOLECULE # the c-terminal modification
48
+ config :sort, true, &c.switch # sorts the data by mass
49
+ config :unmask, true, &c.switch # remove masked (negative) masses
50
+
51
# Calculates the fragmentation data for peptide.
#
# The masses of every configured series are gathered from the spectrum
# for peptide.  Negative masses are dropped when unmask is set, the
# masses are sorted when sort is set, and each mass is paired with
# intensity when an intensity is configured.
#
# Returns [parent ion mass for the configured charge, masses].
def process(peptide)
  log :fragment, peptide
  spec = spectrum(peptide)

  masses = series.inject([]) {|all, name| all.concat(spec.series(name)) }
  masses.reject! {|m| m < 0 } if unmask
  masses.sort! if sort
  masses = masses.map {|m| [m, intensity] } if intensity

  [spec.parent_ion_mass(charge), masses]
end
63
+
64
+ protected
65
+
66
+ # Returns a new Spectrum used in the calculation.
67
+ # Primarily a hook for custom spectra in subclasses.
68
+ def spectrum(peptide)
69
+ Spectrum.new(peptide, nterm, cterm)
70
+ end
71
+
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,450 @@
1
+ require 'molecules/libraries/residue'
2
+ require 'constants/libraries/particle'
3
+ require 'ms/in_silico'
4
+
5
+ module Ms
6
+ module InSilico
7
+
8
+ # Spectrum calculates the theoretical ion series produced by a fragmentation
9
+ # process such as collision induced dissociation (CID). The formulae used to
10
+ # calculate the ion series were obtained from the {Matrix Science
11
+ # website}[http://www.matrixscience.com/]. Spectrum uses the
12
+ # {Constants}[http://bioactive.rubyforge.org/constants/] gem as the default
13
+ # source of element and particle masses.
14
+ #
15
+ # spec = Ms::InSilico::Spectrum.new('TVQQEL')
16
+ # spec.series('b')
17
+ # # => [
18
+ # # 102.054954926291,
19
+ # # 201.123368842491,
20
+ # # 329.181946353891,
21
+ # # 457.240523865291,
22
+ # # 586.283116961491,
23
+ # # 699.367180941891]
24
+ #
25
+ # spec.series('y')
26
+ # # => [
27
+ # # 717.377745628191,
28
+ # # 616.330067154091,
29
+ # # 517.261653237891,
30
+ # # 389.203075726491,
31
+ # # 261.144498215091,
32
+ # # 132.101905118891]
33
+ #
34
+ # ==== Formulae to Calculate Fragment Ion m/z values
35
+ #
36
+ # <em>Copied directly from the Matrix Science {fragmentation help
37
+ # section}[http://www.matrixscience.com/help/fragmentation_help.html]</em>
38
+ #
39
+ # [N] is the molecular mass of the neutral N-terminal group, [C] is the
40
+ # molecular mass of the neutral C-terminal group, [M] is molecular mass
41
+ # of the neutral amino acid residues. To obtain m/z values, add or
42
+ # subtract protons as required to obtain the required charge and divide
43
+ # by the number of charges. For example, to get a+, add 1 proton to the
44
+ # Mr value for a. To get a--, subtract 2 protons from the Mr value for
45
+ # a and divide by 2.
46
+ #
47
+ # Ion Type Neutral Mr
48
+ # a [N]+[M]-CHO
49
+ # a* a-NH3
50
+ # a° a-H2O
51
+ # b [N]+[M]-H
52
+ # b* b-NH3
53
+ # b° b-H2O
54
+ # c [N]+[M]+NH2
55
+ # d a - partial side chain
56
+ # v y - complete side chain
57
+ # w z - partial side chain
58
+ # x [C]+[M]+CO-H
59
+ # y [C]+[M]+H
60
+ # y* y-NH3
61
+ # y° y-H2O
62
+ # z [C]+[M]-NH2
63
+ #
64
+ # ==== Use of alternate masses
65
+ # By default a Spectrum will calculate the ion series' using the
66
+ # monoisotopic masses for each element. To calculate masses
67
+ # differently, provide a block to new; each Element will be
68
+ # passed to the block as needed, and the block should return
69
+ # the element mass used in the calculation.
70
+ #
71
+ # Alternatively, a subclass can override the mass method; all
72
+ # objects that need to be turned into a mass (nterm, cterm,
73
+ # a variety of molecules specified as strings, the elements,
74
+ # ELECTRON, etc) are passed to mass to yield the value used
75
+ # in any given calculation.
76
+ #
77
+ #--
78
+ # ALL of the collections could be sped up using inline
79
+ #++
80
+ class Spectrum
81
+ include Molecules
82
+ include Molecules::Libraries
83
+ include Constants::Libraries
84
+
85
+ class << self
86
+
87
# Gives base its own copy of residues_to_locate, so that residues
# added to base are not added to this class as well.
def inherited(base)
  inherited_residues = @residues_to_locate.dup
  base.instance_variable_set(:@residues_to_locate, inherited_residues)
end
90
+
91
+ # A string of residues located by scan.
92
+ attr_accessor :residues_to_locate
93
+
94
# Appends residues to residues_to_locate; these are the residues
# that will be located by scan.  Generally used when some special
# fragmentation behavior occurs at specific residues.  By default
# no residues are located.
#
#   class Subclass < Spectrum
#     locate_residues "PS"
#   end
#
#   Subclass.new('RPPGFSPFR').residue_locations
#   # => {'P' => [1, 2, 6], 'S' => [5]}
#
# Calls to locate_residues are cumulative.  Returns the updated
# string of residues.
def locate_residues(residues)
  @residues_to_locate = @residues_to_locate + residues
end
110
+
111
# Scans the sequence to produce a ladder of masses and a hash of
# (residue, locations) pairs which indicate the indices at which
# the residue occurs in sequence.  The ladder holds the running
# sum of the residue masses, one entry per residue.
#
# Returns [ladder, {residue => locations}].
#
# ==== Inputs
# sequence::            a string
# masses_by_byte::      an array of masses where the index of
#                       the mass is the byte of the
#                       corresponding residue.
# residues_to_locate::  a string of the residues to locate.
#
# Note: scan is an optimized utility function, but should
# be replaced by an inline function to do the same.
#
def scan(sequence, masses_by_byte, residues_to_locate)
  # locations is indexed by byte; only the bytes of residues_to_locate
  # hold an array, so the lookup for any other residue returns nil
  locations = []
  residues_to_locate.each_byte {|byte| locations[byte] = [] }

  mass = 0
  ladder = []
  sequence.each_byte do |byte|
    mass += masses_by_byte[byte]
    found = locations[byte]

    # ladder.length is the index of the current residue
    found << ladder.length if found
    ladder << mass
  end

  # Key the results by single-character string.  The bytes are taken
  # with each_byte and converted with chr because indexing a String
  # with an integer returns a byte on ruby 1.8 but a String on 1.9+,
  # which cannot be used as an index into locations.
  hash = {}
  residues_to_locate.each_byte {|byte| hash[byte.chr] = locations[byte] }

  [ladder, hash]
end
151
+ end
152
+
153
+ HYDROGEN = EmpiricalFormula.parse("H")
154
+ HYDROXIDE = EmpiricalFormula.parse("OH")
155
+ ELECTRON = Particle['Electron']
156
+
157
+ self.residues_to_locate = ""
158
+
159
+ # The peptide sequence.
160
+ attr_reader :sequence
161
+
162
+ # The n-terminal modification (default H)
163
+ attr_reader :nterm
164
+
165
+ # The c-terminal modification (default OH)
166
+ attr_reader :cterm
167
+
168
+ # An optional block used to calculate masses of molecules.
169
+ attr_reader :block
170
+
171
+ # A ladder of mass values corresponding to the
172
+ # M values used in the fragmentation formulae.
173
+ attr_reader :ladder
174
+
175
+ # A hash of (residue, [locations]) pairs where
176
+ # the locations are the indices in sequence
177
+ # at which residue occurs.
178
+ attr_reader :residue_locations
179
+
180
# Initializes a new Spectrum for sequence using the specified n- and
# c-terminal modifications.  The block, if given, is stored and passed
# along whenever a mass is calculated; see mass.
#
# The mass ladder and the residue locations are calculated up front by
# scanning sequence with the class-level scan method.
def initialize(sequence, nterm=HYDROGEN, cterm=HYDROXIDE, &block) # :yields: element
  @sequence = sequence
  @nterm = nterm
  @cterm = cterm
  @block = block

  # entries of the residue index that are nil get a mass of zero
  residue_masses = Residue.residue_index.collect do |residue|
    residue.nil? ? 0 : mass(residue)
  end

  spectrum_class = self.class
  @ladder, @residue_locations = spectrum_class.scan(
    sequence,
    residue_masses,
    spectrum_class.residues_to_locate)

  @series_hash = {}
  @series_mask = {}
end
203
+
204
# Returns the mass of the parent ion for the sequence at the given
# charge: the n-terminus, the full ladder mass, and the c-terminus,
# plus one proton per charge, all divided by charge.
def parent_ion_mass(charge=1)
  neutral = mass(nterm) + ladder.last + mass(cterm)
  (neutral + charge * proton_mass)/charge
end
208
+
209
+ # Returns the mass of a proton (ie Hydrogen minus an Electron)
210
+ def proton_mass
211
+ mass(HYDROGEN) - mass(ELECTRON)
212
+ end
213
+
214
# Retrieves the specified series, assuming a charge of 1.  A different
# charge can be specified for the series by using '+' and '-'.  For
# example:
#
#   f = Spectrum.new 'RPPGFSPFR'
#   f.series('y') == f.y_series                    # => true
#   f.series('b++') == f.b_series(2)               # => true
#   f.series('nladder-') == f.nladder_series(-1)   # => true
#
# Text following the charge, separated from it by whitespace, is passed
# to the series method as the modification with its whitespace removed.
#
# Raises an ArgumentError if the specified charge is zero.  Anything
# that is not a recognized series is passed to handle_unknown_series.
def series(s)
  s = s.to_s.strip
  match = /^(immonium|nladder|cladder|[abcxyYz])(\+*)(-*)(\s[\+\-\s\w\d]+)?$/.match(s)
  return handle_unknown_series(s) unless match

  type, plus, minus = match[1], match[2], match[3]
  mod = match[4].to_s.gsub(/\s/, "")

  # no signs at all means a charge of one; otherwise the signs are
  # summed, which can only reach zero when both kinds are present
  unsigned = plus.empty? && minus.empty?
  charge = unsigned ? 1 : plus.length - minus.length
  raise ArgumentError.new("zero charge specified in series: #{s}") if charge == 0

  send("#{type}_series", charge, mod)
end
247
+
248
# Calculates the immonium series.  Each entry is the difference between
# consecutive ladder masses (ie the mass of one residue) plus the mass
# of mod, minus the mass of CO, plus one proton per charge, divided by
# charge.
def immonium_series(charge=1, mod=nil)
  get_series(:immonium, charge, mod) do
    delta = mass(mod) - mass('CO')

    # a leading zero turns the ladder into (previous, current) pairs
    ([0] + ladder).each_cons(2).map do |previous, current|
      (current - previous + delta + charge * proton_mass)/charge
    end
  end
end
261
+
262
+ # [N]+[M]-CHO
263
+ def a_series(charge=1, mod=nil)
264
+ get_series(:a, charge, mod) do
265
+ delta = mass(mod) + mass(nterm) - mass('CHO') + charge * proton_mass
266
+ nterm_series(delta, charge)
267
+ end
268
+ end
269
+
270
+ # [N]+[M]-H
271
+ def b_series(charge=1, mod=nil)
272
+ get_series(:b, charge, mod) do
273
+ delta = mass(mod) + mass(nterm) - mass('H') + charge * proton_mass
274
+ nterm_series(delta, charge)
275
+ end
276
+ end
277
+
278
+ # [N]+[M]+NH2
279
+ def c_series(charge=1, mod=nil)
280
+ get_series(:c, charge, mod) do
281
+ delta = mass(mod) + mass(nterm) + mass('NH2') + charge * proton_mass
282
+ nterm_series(delta, charge)
283
+ end
284
+ end
285
+
286
+ # [M]+H2O
287
+ #--
288
+ # Ask Peter about these as well... Currently I'm adding water to
289
+ # cap the ends, as if a hydrolysis reaction produced the ladder,
290
+ # then I'm adding H for charge... is this what is intended?
291
+ # Why not cladder[0] or cladder[-1]?
292
+ #++
293
+ def cladder_series(charge=1, mod=nil)
294
+ get_series(:cladder, charge, mod) do
295
+ delta = mass(mod) + mass('H2O') + charge * proton_mass
296
+ nterm_series(delta, charge)
297
+ end
298
+ end
299
+
300
+ # [C]+[M]+CO-H
301
+ def x_series(charge=1, mod=nil)
302
+ get_series(:x, charge, mod) do
303
+ delta = mass(mod) + ladder.last + mass(cterm) + mass('CO - H') + charge * proton_mass
304
+ cterm_series(delta, charge)
305
+ end
306
+ end
307
+
308
+ # [C]+[M]+H
309
+ def y_series(charge=1, mod=nil)
310
+ get_series(:y, charge, mod) do
311
+ delta = mass(mod) + ladder.last + mass(cterm) + mass('H') + charge * proton_mass
312
+ cterm_series(delta, charge)
313
+ end
314
+ end
315
+
316
+ # [C]+[M]-H
317
+ def Y_series(charge=1, mod=nil)
318
+ get_series(:Y, charge, mod) do
319
+ delta = mass(mod) + ladder.last + mass(cterm) - mass('H') + charge * proton_mass
320
+ cterm_series(delta, charge)
321
+ end
322
+ end
323
+
324
+ # [C]+[M]-NH2
325
+ def z_series(charge=1, mod=nil)
326
+ get_series(:z, charge, mod) do
327
+ delta = mass(mod) + ladder.last + mass(cterm) - mass('NH2') + charge * proton_mass
328
+ cterm_series(delta, charge)
329
+ end
330
+ end
331
+
332
+ # [M]+H2O
333
+ #--
334
+ # Ask Peter about these as well... Currently I'm adding water to
335
+ # cap the ends, as if a hydrolysis reaction produced the ladder,
336
+ # then I'm adding H for charge... is this what is intended?
337
+ # Why not nladder[-1]?
338
+ #++
339
+ def nladder_series(charge=1, mod=nil)
340
+ get_series(:nladder, charge, mod) do
341
+ delta = mass(mod) + ladder.last + mass('H2O') + charge * proton_mass
342
+ cterm_series(delta, charge)
343
+ end
344
+ end
345
+
346
+ protected
347
+
348
+ # A hash holding all calculated series for self. Series are keyed
349
+ # by the type and charge of the series (ex: b1, b2, y1, y2).
350
+ attr_accessor :series_hash
351
+
352
+ # A hash holding the locations of residues that need to be masked (ie
353
+ # multiplied by -1) in a given series. Mask locations should be unique
354
+ # so that a given location will not be masked twice; the method
355
+ # mask_locations can assist in doing so. Series masks are keyed
356
+ # by the series type (ex: b, y).
357
+ attr_accessor :series_mask
358
+
359
# Calculates the mass of molecule.  The calculation depends on the
# type of the input:
#
#   EmpiricalFormula    molecule.mass(&block)
#   Particle            molecule.mass
#   String              EmpiricalFormula.mass(molecule, &block)
#   Numeric             molecule
#   nil                 0
#
# Raises an error for any other input.
def mass(molecule)
  if molecule.nil?
    0
  elsif molecule.kind_of?(EmpiricalFormula) || molecule.kind_of?(Particle)
    # note that Particles will not actually make use of the
    # block, even though it is being passed to it.
    molecule.mass(&block)
  elsif molecule.kind_of?(String)
    EmpiricalFormula.mass(molecule, &block)
  elsif molecule.kind_of?(Numeric)
    molecule
  else
    raise "cannot calculate mass of: #{molecule}"
  end
end
382
+
383
# Generates an n-terminal series (ex: a, b, or c): delta is added to
# each mass in ladder and the sum is divided by charge.  Delta,
# therefore, should ALREADY take account of the protons added
# by charge.
def nterm_series(delta, charge)
  ladder.map do |m|
    (m + delta)/charge
  end
end
390
+
391
# Generates a c-terminal series (ex: x, y, or z).  The first entry is
# delta divided by charge; the following entries subtract each mass in
# ladder, except the last one, from delta and divide by charge.  Delta,
# therefore, should ALREADY take account of the protons added
# by charge.
def cterm_series(delta, charge)
  values = [delta/charge]
  ladder.each {|m| values << (delta - m)/charge }

  # the entry for the complete ladder is not part of the series
  values.pop
  values
end
401
+
402
# Adds the specified locations to the mask for series, ensuring that
# the locations are unique within the mask.  Negative locations count
# back from the end of ladder.  If overwrite is true, then the input
# locations replace any existing mask locations.
#
# Returns the mask locations for series.
def mask_locations(series, locations, overwrite=false)
  locations = locations.collect do |location|
    location < 0 ? ladder.length + location : location
  end

  if overwrite
    series_mask[series] = locations.uniq
  else
    current = (series_mask[series] ||= [])
    current.concat(locations).uniq!

    # uniq! returns nil when there was nothing to remove, so return
    # the mask itself to match the overwrite branch
    current
  end
end
417
+
418
# Retrieves the series stored in series_hash under
# "#{key}#{charge}#{mod}".  If nothing is stored under that key, the
# series is calculated by the supplied block, masked using key and mod
# (see mask), and stored before it is returned.
def get_series(key, charge=nil, mod=nil)
  cache_key = "#{key}#{charge}#{mod}"
  series_hash[cache_key] ||= mask(yield, key, mod)
end
425
+
426
# Masks series by multiplying the masses at the masked locations by -1.
# The locations are those stored in series_mask under key, together
# with those stored under "#{key}#{mod}" when mod is not nil.  Mask
# does NOT check to see if the mass at a location is negative or
# positive.
#
# Returns series, which is modified in place.
def mask(series, key, mod)
  locations = series_mask[key]

  unless mod.nil?
    mod_locations = series_mask["#{key}#{mod}"]
    if mod_locations
      # The series may have a modification mask without having a mask
      # of its own.  Array#+ builds a new array, so the masks stored
      # in series_mask are left untouched.
      locations = ((locations || []) + mod_locations).uniq
    end
  end

  locations.each {|i| series[i] *= -1 } unless locations.nil?
  series
end
442
+
443
+ # Hook to custom-handle an unknown series from the series method.
444
+ # By default, handle_unknown_series raises an ArgumentError.
445
+ def handle_unknown_series(s)
446
+ raise ArgumentError, "unknown series: #{s}"
447
+ end
448
+ end
449
+ end
450
+ end
@@ -0,0 +1,4 @@
1
+ module Ms
2
+ module InSilico
3
+ end
4
+ end
data/tap.yml ADDED
File without changes
@@ -0,0 +1,5 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
2
+
3
+ # runs all subsets (see Tap::Test::SubsetMethods)
4
+ ENV["ALL"] = "true"
5
+ Dir.glob("./**/*_test.rb").each {|test| require test}
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ms-in_silico
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Simon Chiang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-20 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: tap
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.11"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: molecules
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.0
34
+ version:
35
+ description:
36
+ email: simon.a.chiang@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README
43
+ - MIT-LICENSE
44
+ files:
45
+ - lib/ms/in_silico.rb
46
+ - lib/ms/in_silico/digest.rb
47
+ - lib/ms/in_silico/digester.rb
48
+ - lib/ms/in_silico/fragment.rb
49
+ - lib/ms/in_silico/spectrum.rb
50
+ - tap.yml
51
+ - README
52
+ - MIT-LICENSE
53
+ has_rdoc: true
54
+ homepage: http://mspire.rubyforge.org/projects/ms-in_silico/
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ requirements: []
73
+
74
+ rubyforge_project: mspire
75
+ rubygems_version: 1.3.0
76
+ signing_key:
77
+ specification_version: 2
78
+ summary: ms-in_silico task library
79
+ test_files:
80
+ - test/tap_test_suite.rb