ms-in_silico 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +19 -0
- data/README +55 -0
- data/lib/ms/in_silico/digest.rb +35 -0
- data/lib/ms/in_silico/digester.rb +263 -0
- data/lib/ms/in_silico/fragment.rb +74 -0
- data/lib/ms/in_silico/spectrum.rb +450 -0
- data/lib/ms/in_silico.rb +4 -0
- data/tap.yml +0 -0
- data/test/tap_test_suite.rb +5 -0
- metadata +80 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2008, Regents of the University of Colorado.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
4
|
+
software and associated documentation files (the "Software"), to deal in the Software
|
5
|
+
without restriction, including without limitation the rights to use, copy, modify, merge,
|
6
|
+
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
|
7
|
+
to whom the Software is furnished to do so, subject to the following conditions:
|
8
|
+
|
9
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
10
|
+
substantial portions of the Software.
|
11
|
+
|
12
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
13
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
14
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
15
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
16
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
17
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
18
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
19
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= {Ms-InSilico}[http://mspire.rubyforge.org/projects/ms-in_silico]
|
2
|
+
|
3
|
+
An {Mspire}[http://mspire.rubyforge.org] library supporting in-silico calculations for mass spec data.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Ms-InSilico provides the following modules:
|
8
|
+
|
9
|
+
* Ms::InSilico::Digester (protein digestion)
|
10
|
+
* Ms::InSilico::Spectrum (peptide fragmentation)
|
11
|
+
|
12
|
+
Corresponding Tap[http://tap.rubyforge.org] tasks are also provided.
|
13
|
+
|
14
|
+
* Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/16692-mspire/tickets]
|
15
|
+
* Github[http://github.com/bahuvrihi/ms-in_silico/tree/master]
|
16
|
+
* {Google Group}[http://groups.google.com/group/mspire-forum]
|
17
|
+
|
18
|
+
== Usage
|
19
|
+
|
20
|
+
require 'ms/in_silico/digester'
|
21
|
+
require 'ms/in_silico/spectrum'
|
22
|
+
include Ms::InSilico
|
23
|
+
|
24
|
+
trypsin = Digester['Trypsin']
|
25
|
+
peptides = trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
|
26
|
+
# => [
|
27
|
+
# 'MIVIGR',
|
28
|
+
# 'SIVHPYITNEYEPFAAEK',
|
29
|
+
# 'QQILSIMAG']
|
30
|
+
|
31
|
+
spectrum = Spectrum.new(peptides[0])
|
32
|
+
spectrum.parent_ion_mass
|
33
|
+
# => 688.417442373391
|
34
|
+
|
35
|
+
spectrum.series('b')
|
36
|
+
# => [
|
37
|
+
# 132.047761058391,
|
38
|
+
# 245.131825038791,
|
39
|
+
# 344.200238954991,
|
40
|
+
# 457.284302935391,
|
41
|
+
# 514.305766658991,
|
42
|
+
# 670.406877687091]
|
43
|
+
|
44
|
+
== Installation
|
45
|
+
|
46
|
+
Ms-InSilico is available as a gem on RubyForge[http://rubyforge.org/projects/mspire]. Use:
|
47
|
+
|
48
|
+
% gem install ms-in_silico
|
49
|
+
|
50
|
+
== Info
|
51
|
+
|
52
|
+
Copyright (c) 2006-2008, Regents of the University of Colorado.
|
53
|
+
Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
|
54
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
55
|
+
Licence:: {MIT-Style}[link:files/MIT-LICENSE.html]
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'ms/in_silico/digester'
|
2
|
+
|
3
|
+
module Ms
|
4
|
+
module InSilico
|
5
|
+
# Ms::InSilico::Digest::manifest digest a protein sequence into peptides
|
6
|
+
# Digest a protein sequence into an array of peptides.
|
7
|
+
#
|
8
|
+
# % rap digest MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG --+ dump --no-audit
|
9
|
+
# I[14:37:55] digest MIVIGRSIVHP... to 3 peptides
|
10
|
+
# # date: 2008-09-15 14:37:55
|
11
|
+
# ---
|
12
|
+
# ms/in_silico/digest (23483900):
|
13
|
+
# - - MIVIGR
|
14
|
+
# - SIVHPYITNEYEPFAAEK
|
15
|
+
# - QQILSIMAG
|
16
|
+
#
|
17
|
+
class Digest < Tap::Task

  config :digester, 'Trypsin' # the name of the digester
  config :max_misses, 0, &c.integer # the max # of missed cleavage sites
  config :site_digest, false, &c.boolean # digest to sites (rather than sequences)

  # Digests sequence with the configured digester and returns the
  # resulting peptides (or [start, end] site pairs when site_digest
  # is set). Logs a short preview of the sequence and the number of
  # peptides produced.
  #
  # Raises ArgumentError if the configured digester name is not found
  # by Digester[].
  def process(sequence)
    enzyme = Digester[digester]
    raise ArgumentError, "unknown digester: #{digester}" unless enzyme

    peptides =
      if site_digest
        enzyme.site_digest(sequence, max_misses)
      else
        enzyme.digest(sequence, max_misses)
      end

    # The preview holds the first 11 residues; only flag truncation when
    # the sequence is actually longer than the preview (previously an
    # 11-residue sequence was shown in full yet still suffixed '...').
    preview = sequence[0..10]
    ellipsis = sequence.length > preview.length ? '...' : ''
    log 'digest', "#{preview}#{ellipsis} to #{peptides.length} peptides"
    peptides
  end

end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
require 'constants/library'
|
2
|
+
require 'strscan'
|
3
|
+
|
4
|
+
module Ms
|
5
|
+
module InSilico
|
6
|
+
|
7
|
+
# Digester splits a protein sequence into peptides at sites specified
|
8
|
+
# during initialization; in short Digester models a cleavage enzyme.
|
9
|
+
# Digesters support missed cleavage sites, and can return either the
|
10
|
+
# peptide strings or the cleavage sites.
|
11
|
+
#
|
12
|
+
# Digester includes {Constants::Library}[http://bioactive.rubyforge.org/constants/classes/Constants/Library.html],
|
13
|
+
# allowing access to many common digesters using Digester[]:
|
14
|
+
#
|
15
|
+
# trypsin = Digester['Trypsin']
|
16
|
+
# trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
|
17
|
+
# # => [
|
18
|
+
# # 'MIVIGR',
|
19
|
+
# # 'SIVHPYITNEYEPFAAEK',
|
20
|
+
# # 'QQILSIMAG']
|
21
|
+
#
|
22
|
+
# trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
|
23
|
+
# # => [
|
24
|
+
# # 'MIVIGR',
|
25
|
+
# # 'MIVIGRSIVHPYITNEYEPFAAEK',
|
26
|
+
# # 'SIVHPYITNEYEPFAAEK',
|
27
|
+
# # 'SIVHPYITNEYEPFAAEKQQILSIMAG',
|
28
|
+
# # 'QQILSIMAG'
|
29
|
+
# # ]
|
30
|
+
#
|
31
|
+
# trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
|
32
|
+
# # => [
|
33
|
+
# # [0,6],
|
34
|
+
# # [0,24],
|
35
|
+
# # [6,24],
|
36
|
+
# # [6,33],
|
37
|
+
# # [24,33]
|
38
|
+
# # ]
|
39
|
+
#
|
40
|
+
# ==== Enzymes
|
41
|
+
#
|
42
|
+
# Enzymes in the library were adapted from the default Mascot[http://www.matrixscience.com/]
|
43
|
+
# enzyme list. Currently supported enzymes include:
|
44
|
+
#
|
45
|
+
# * Arg-C
|
46
|
+
# * Asp-N
|
47
|
+
# * Asp-N_ambic
|
48
|
+
# * Chymotrypsin
|
49
|
+
# * CNBr
|
50
|
+
# * Lys-C
|
51
|
+
# * Lys-C/P
|
52
|
+
# * PepsinA
|
53
|
+
# * Tryp-CNBr
|
54
|
+
# * TrypChymo
|
55
|
+
# * Trypsin/P
|
56
|
+
# * V8-DE
|
57
|
+
# * V8-E
|
58
|
+
# * Trypsin
|
59
|
+
# * V8-E+Trypsin
|
60
|
+
# * V8-DE+Trypsin
|
61
|
+
#
|
62
|
+
# Several enzymes require two or more digesters, or functionality that
|
63
|
+
# is not provided by Digester, and so remain unsupported:
|
64
|
+
#
|
65
|
+
# * CNBr+Trypsin
|
66
|
+
# * Formic_acid
|
67
|
+
# * LysC+AspN
|
68
|
+
# * semiTrypsin
|
69
|
+
#
|
70
|
+
class Digester
|
71
|
+
|
72
|
+
# The name of the digester
|
73
|
+
attr_reader :name
|
74
|
+
|
75
|
+
# A string of residues at which cleavage occurs
|
76
|
+
attr_reader :cleave_str
|
77
|
+
|
78
|
+
# A c-terminal restriction residue which prevents
|
79
|
+
# cleavage at a potential cleavage site (optional).
|
80
|
+
attr_reader :cterm_exception
|
81
|
+
|
82
|
+
# True if cleavage occurs at the c-terminus of a
|
83
|
+
# cleavage residue, false if cleavage occurs at
|
84
|
+
# the n-terminus.
|
85
|
+
attr_reader :cterm_cleavage
|
86
|
+
|
87
|
+
# a multiline whitespace regexp
|
88
|
+
WHITESPACE = /\s*/m
|
89
|
+
|
90
|
+
# Initializes a new Digester.
#
# name::            the name of the digester
# cleave_str::      a string of residues at which cleavage occurs;
#                   must contain at least one residue
# cterm_exception:: an optional single residue that prevents cleavage
#                   when it follows a cleavage residue (nil or an
#                   empty string means no exception)
# cterm_cleavage::  true to cleave c-terminal to the cleavage residue,
#                   false to cleave n-terminal
#
# Raises ArgumentError if cleave_str is nil or empty, or if
# cterm_exception is longer than one residue.
def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
  # An empty cleave_str would produce a regexp matching the empty
  # string at every position, which never advances the scanner.
  if cleave_str.nil? || cleave_str.empty?
    raise ArgumentError, "cleave_str must specify at least one residue"
  end

  @name = name
  @cleave_str = cleave_str

  # Regexp.union escapes each residue, so a character with special
  # meaning in a regexp is matched literally.
  @cleave_regexp = Regexp.union(*cleave_str.split(//))

  @cterm_exception =
    if cterm_exception.nil? || cterm_exception.empty?
      nil
    elsif cterm_exception.length == 1
      # kept in the form String#[] returns for a single index, which is
      # what scan compares against (seq[pos] == cterm_exception)
      cterm_exception[0]
    else
      raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
    end

  @cterm_cleavage = cterm_cleavage
  @scanner = StringScanner.new('')
end
|
107
|
+
|
108
|
+
# Returns the digestion sites in sequence, as determined by
|
109
|
+
# the cleave_regexp boundaries. The digestion sites correspond
|
110
|
+
# to the positions where a peptide begins and ends, such that
|
111
|
+
# [n, (n+1) - n] corresponds to the [index, length] for peptide n.
|
112
|
+
#
|
113
|
+
# d = Digester.new('Trypsin', 'KR', 'P')
|
114
|
+
# seq = "AARGGR"
|
115
|
+
# sites = d.cleavage_sites(seq) # => [0, 3, 6]
|
116
|
+
#
|
117
|
+
# seq[sites[0], sites[0+1] - sites[0]] # => "AAR"
|
118
|
+
# seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
|
119
|
+
#
|
120
|
+
# Trailing whitespace is included in the fragment.
|
121
|
+
#
|
122
|
+
# seq = "AAR \n GGR"
|
123
|
+
# sites = d.cleavage_sites(seq) # => [0, 8, 11]
|
124
|
+
#
|
125
|
+
# seq[sites[0], sites[0+1] - sites[0]] # => "AAR \n "
|
126
|
+
# seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
|
127
|
+
#
|
128
|
+
# The digested section of sequence may be specified using offset
|
129
|
+
# and length.
|
130
|
+
# Returns the ascending list of fragment boundaries for the region of
# seq starting at offset and spanning length characters. The first
# boundary is always offset and the last is always offset + length,
# so consecutive boundaries delimit the fragments of the region.
def cleavage_sites(seq, offset=0, length=seq.length-offset)
  # scan reports the position just after each cleavage residue (and
  # any trailing whitespace); step back one for n-terminal cleavage
  adjustment = cterm_cleavage ? 0 : 1
  limit = offset + length

  positions = [offset]
  scan(seq, offset, limit) do |pos|
    site = pos - adjustment
    # an n-terminal cleavage residue at the very start of the region
    # maps back onto offset; skip it rather than emit an empty fragment
    positions << site if site > positions.last
  end

  # Close the final fragment. Testing the last recorded boundary
  # (instead of the scanner position) keeps the trailing fragment when
  # scan stopped at a match beyond the limit, or when the last residue
  # is an n-terminal cleavage residue. A region with no boundaries
  # still returns [offset, limit].
  positions << limit if positions.last < limit || positions.length == 1

  positions
end
|
146
|
+
|
147
|
+
# Returns digestion sites of sequence as [start_index, end_index] pairs,
|
148
|
+
# allowing for missed cleavages. Digestion sites are determined using
|
149
|
+
# cleavage_sites; as in that method, the digested section of sequence
|
150
|
+
# may be specified using offset and length.
|
151
|
+
#
|
152
|
+
# Each [start_index, end_index] pair is yielded to the block, if given,
|
153
|
+
# and the collected results are returned.
|
154
|
+
# Maps each span produced by overlay (up to max_misses skipped
# boundaries) onto the boundaries returned by cleavage_sites. Each
# resulting [start, end] pair is passed to the block when one is
# given; the block results (or the pairs themselves) are collected
# and returned.
def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset) # :yields: start_index, end_index
  boundaries = cleavage_sites(seq, offset, length)

  overlay(boundaries.length, max_misses, 1) do |first, last|
    span = [boundaries[first], boundaries[last]]
    block_given? ? yield(*span) : span
  end
end
|
164
|
+
|
165
|
+
# Returns an array of peptides produced by digesting sequence, allowing for
|
166
|
+
# missed cleavage sites. Digestion sites are determined using cleavage_sites;
|
167
|
+
# as in that method, the digested section of sequence may be specified using
|
168
|
+
# offset and length.
|
169
|
+
# Returns the peptide strings for seq by slicing it at each
# [start, end] pair reported by site_digest.
def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
  spans = site_digest(seq, max_misses, offset, length)
  spans.map {|(first, last)| seq[first...last] }
end
|
174
|
+
|
175
|
+
protected
|
176
|
+
|
177
|
+
# The cleavage regexp used to identify cleavage sites
|
178
|
+
attr_reader :cleave_regexp # :nodoc:
|
179
|
+
|
180
|
+
# The scanner used to digest strings.
|
181
|
+
attr_reader :scanner # :nodoc:
|
182
|
+
|
183
|
+
# Scans seq between offset and limit for the cleave_regexp, skipping whitespace
|
184
|
+
# and being mindful of exception characters. The positions of the scanner at
|
185
|
+
# each match are yielded to the block.
|
186
|
+
# Walks seq from offset, yielding the scanner position after each
# cleavage match (and any whitespace that follows it). Matches that
# are immediately followed by the exception residue are skipped, and
# scanning stops at the first candidate position beyond limit.
# Returns the final scanner position.
def scan(seq, offset, limit) # :nodoc:
  scanner.string = seq
  scanner.pos = offset

  while scanner.search_full(cleave_regexp, true, false)
    # consume whitespace trailing the cleavage residue
    scanner.search_full(WHITESPACE, true, false)
    site = scanner.pos

    # a match followed by the exception residue is not a cleavage site
    unless cterm_exception.nil?
      next if seq[site] == cterm_exception
    end

    # stop once the scan has run past the upper limit
    break if site > limit

    yield site
  end

  scanner.pos
end
|
205
|
+
|
206
|
+
# Performs an overlap-collect algorithm providing the start and end
|
207
|
+
# indices of spans skipping up to max_misses boundaries.
|
208
|
+
# Yields [start_index, end_index] for every span over n boundaries
# that begins at start_index and ends offset + n_miss boundaries
# later, for n_miss in 0..max_misses. Spans that would end at or
# beyond n are not yielded. Returns the collected block results.
def overlay(n, max_misses, offset) # :nodoc:
  results = []
  0.upto(n - 1) do |start_index|
    0.upto(max_misses) do |n_miss|
      end_index = start_index + offset + n_miss

      # >= (rather than ==) so that an offset larger than 1 cannot step
      # over n and yield an out-of-range end index
      break if end_index >= n

      results << yield(start_index, end_index)
    end
  end
  results
end
|
220
|
+
|
221
|
+
#
|
222
|
+
# Enzymes adapted from the default Mascot enzyme list.
|
223
|
+
#
|
224
|
+
|
225
|
+
class << self
|
226
|
+
protected
|
227
|
+
|
228
|
+
# Utility method to parse a mascot enzyme configuration
|
229
|
+
# string into a Digester.
|
230
|
+
# Builds a Digester from a tab-delimited mascot enzyme configuration
# string. Fields are, in order: name, sense ('C-Term' or 'N-Term'),
# cleavage residues and c-terminal exception; any further fields are
# ignored. Raises ArgumentError for any other sense.
def mascot_parse(str) # :nodoc:
  fields = str.split(/ *\t */)
  name, sense, cleave_str, cterm_exception = fields

  cterm_cleavage =
    if sense == 'C-Term'
      true
    elsif sense == 'N-Term'
      false
    else
      raise ArgumentError, "unknown sense: #{sense}"
    end

  new(name, cleave_str, cterm_exception, cterm_cleavage)
end
|
240
|
+
end
|
241
|
+
|
242
|
+
# Each definition is a tab-delimited mascot record:
#   name, sense, cleavage residues, c-term exception, independent, semi-specific
# mascot_parse splits on tab characters, so the separators are written
# as explicit "\t" escapes; an empty exception field appears as "\t\t".
# NOTE(review): the field boundaries were reconstructed from the
# whitespace-separated literals -- confirm against the mascot enzyme file.
ARG_C = mascot_parse("Arg-C\tC-Term\tR\tP\tno\tno")
ASP_N = mascot_parse("Asp-N\tN-Term\tBD\t\tno\tno")
ASP_N_AMBIC = mascot_parse("Asp-N_ambic\tN-Term\tDE\t\tno\tno")
CHYMOTRYPSIN = mascot_parse("Chymotrypsin\tC-Term\tFLWY\tP\tno\tno")
CNBR = mascot_parse("CNBr\tC-Term\tM\t\tno\tno")
LYS_C = mascot_parse("Lys-C\tC-Term\tK\tP\tno\tno")
LYS_C_P = mascot_parse("Lys-C/P\tC-Term\tK\t\tno\tno")
PEPSIN_A = mascot_parse("PepsinA\tC-Term\tFL\t\tno\tno")
TRYP_CNBR = mascot_parse("Tryp-CNBr\tC-Term\tKMR\tP\tno\tno")
TRYP_CHYMO = mascot_parse("TrypChymo\tC-Term\tFKLRWY\tP\tno\tno")
TRYPSIN_P = mascot_parse("Trypsin/P\tC-Term\tKR\t\tno\tno")
V8_DE = mascot_parse("V8-DE\tC-Term\tBDEZ\tP\tno\tno")
V8_E = mascot_parse("V8-E\tC-Term\tEZ\tP\tno\tno")
TRYPSIN = mascot_parse("Trypsin\tC-Term\tKR\tP\tno\tno")
V8_E_TRYPSIN = mascot_parse("V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno")
V8_DE_TRYPSIN = mascot_parse("V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno")
|
258
|
+
|
259
|
+
include Constants::Library
|
260
|
+
library.index_by_attribute :name
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'ms/in_silico/spectrum'
|
2
|
+
|
3
|
+
module Ms
|
4
|
+
module InSilico
|
5
|
+
|
6
|
+
# Ms::InSilico::Fragment::manifest calculates a theoretical ms/ms spectrum
|
7
|
+
#
|
8
|
+
# Calculates the parent ion mass and theoretical ms/ms spectrum for a
|
9
|
+
# peptide sequence. Configurations allow the specification of one or
|
10
|
+
# more fragmentation series to include, as well as charge, and intensity.
|
11
|
+
#
|
12
|
+
# % rap fragment TVQQEL --+ dump --no-audit
|
13
|
+
# # date: 2008-09-15 14:37:55
|
14
|
+
# ---
|
15
|
+
# ms/in_silico/fragment (:...:):
|
16
|
+
# - - 717.377745628191
|
17
|
+
# - - 102.054954926291
|
18
|
+
# - 132.101905118891
|
19
|
+
# - 201.123368842491
|
20
|
+
# - 261.144498215091
|
21
|
+
# - 329.181946353891
|
22
|
+
# - 389.203075726491
|
23
|
+
# - 457.240523865291
|
24
|
+
# - 517.261653237891
|
25
|
+
# - 586.283116961491
|
26
|
+
# - 616.330067154091
|
27
|
+
# - 699.367180941891
|
28
|
+
# - 717.377745628191
|
29
|
+
#
|
30
|
+
# In the output, the parent ion mass is given first, followed by an
|
31
|
+
# array of the sorted fragmentation data.
|
32
|
+
class Fragment < Tap::Task

  # Config validation block: passes an EmpiricalFormula through
  # unchanged and parses any other input into one.
  MOLECULE = lambda do |value|
    if Molecules::EmpiricalFormula === value
      value
    else
      Molecules::EmpiricalFormula.parse(value)
    end
  end

  config :series, ['y', 'b'], &c.array # a list of the series to include
  config :charge, 1, &c.integer # the charge for the parent ion
  config :intensity, nil, &c.num_or_nil # a uniform intensity value
  config :nterm, 'H', &MOLECULE # the n-terminal modification
  config :cterm, 'OH', &MOLECULE # the c-terminal modification
  config :sort, true, &c.switch # sorts the data by mass
  config :unmask, true, &c.switch # remove masked (negative) masses

  # Returns [parent_ion_mass, masses] for peptide, where masses holds
  # the values of every configured series. Negative values are dropped
  # when unmask is set, the list is sorted when sort is set, and each
  # mass is paired with intensity when an intensity is configured.
  def process(peptide)
    log :fragment, peptide
    spec = spectrum(peptide)

    masses = series.inject([]) {|all, s| all.concat(spec.series(s)) }
    masses.reject! {|m| m < 0 } if unmask
    masses.sort! if sort
    masses = masses.collect {|m| [m, intensity] } if intensity

    [spec.parent_ion_mass(charge), masses]
  end

  protected

  # Builds the Spectrum used by process from the configured terminal
  # modifications. Primarily a hook for custom spectra in subclasses.
  def spectrum(peptide)
    Spectrum.new(peptide, nterm, cterm)
  end

end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,450 @@
|
|
1
|
+
require 'molecules/libraries/residue'
|
2
|
+
require 'constants/libraries/particle'
|
3
|
+
require 'ms/in_silico'
|
4
|
+
|
5
|
+
module Ms
|
6
|
+
module InSilico
|
7
|
+
|
8
|
+
# Spectrum calculates the theoretical ion series produced by a fragmentation
|
9
|
+
# process such as collision-induced dissociation (CID). The formulae used to
|
10
|
+
# calculate the ion series were obtained from the {Matrix Science
|
11
|
+
# website}[http://www.matrixscience.com/]. Spectrum uses the
|
12
|
+
# {Constants}[http://bioactive.rubyforge.org/constants/] gem as the default
|
13
|
+
# source of element and particle masses.
|
14
|
+
#
|
15
|
+
# spec = Ms::InSilico::Spectrum.new('TVQQEL')
|
16
|
+
# spec.series('b')
|
17
|
+
# # => [
|
18
|
+
# # 102.054954926291,
|
19
|
+
# # 201.123368842491,
|
20
|
+
# # 329.181946353891,
|
21
|
+
# # 457.240523865291,
|
22
|
+
# # 586.283116961491,
|
23
|
+
# # 699.367180941891]
|
24
|
+
#
|
25
|
+
# spec.series('y')
|
26
|
+
# # => [
|
27
|
+
# # 717.377745628191,
|
28
|
+
# # 616.330067154091,
|
29
|
+
# # 517.261653237891,
|
30
|
+
# # 389.203075726491,
|
31
|
+
# # 261.144498215091,
|
32
|
+
# # 132.101905118891]
|
33
|
+
#
|
34
|
+
# ==== Formulae to Calculate Fragment Ion m/z values
|
35
|
+
#
|
36
|
+
# <em>Copied directly from the Matrix Science {fragmentation help
|
37
|
+
# section}[http://www.matrixscience.com/help/fragmentation_help.html]</em>
|
38
|
+
#
|
39
|
+
# [N] is the molecular mass of the neutral N-terminal group, [C] is the
|
40
|
+
# molecular mass of the neutral C-terminal group, [M] is molecular mass
|
41
|
+
# of the neutral amino acid residues. To obtain m/z values, add or
|
42
|
+
# subtract protons as required to obtain the required charge and divide
|
43
|
+
# by the number of charges. For example, to get a+, add 1 proton to the
|
44
|
+
# Mr value for a. To get a--, subtract 2 protons from the Mr value for
|
45
|
+
# a and divide by 2.
|
46
|
+
#
|
47
|
+
# Ion Type Neutral Mr
|
48
|
+
# a [N]+[M]-CHO
|
49
|
+
# a* a-NH3
|
50
|
+
# a° a-H2O
|
51
|
+
# b [N]+[M]-H
|
52
|
+
# b* b-NH3
|
53
|
+
# b° b-H2O
|
54
|
+
# c [N]+[M]+NH2
|
55
|
+
# d a - partial side chain
|
56
|
+
# v y - complete side chain
|
57
|
+
# w z - partial side chain
|
58
|
+
# x [C]+[M]+CO-H
|
59
|
+
# y [C]+[M]+H
|
60
|
+
# y* y-NH3
|
61
|
+
# y° y-H2O
|
62
|
+
# z [C]+[M]-NH2
|
63
|
+
#
|
64
|
+
# ==== Use of alternate masses
|
65
|
+
# By default a Spectrum will calculate the ion series using the
|
66
|
+
# monoisotopic masses for each element. To calculate masses
|
67
|
+
# differently, provide a block to new; each Element will be
|
68
|
+
# passed to the block as needed, and the block should return
|
69
|
+
# the element mass used in the calculation.
|
70
|
+
#
|
71
|
+
# Alternatively, a subclass can override the mass method; all
|
72
|
+
# objects that need to be turned into a mass (nterm, cterm,
|
73
|
+
# a variety of molecules specified as strings, the elements,
|
74
|
+
# ELECTRON, etc) are passed to mass to yield the value used
|
75
|
+
# in any given calculation.
|
76
|
+
#
|
77
|
+
#--
|
78
|
+
# ALL of the collections could be sped up using inline
|
79
|
+
#++
|
80
|
+
class Spectrum
|
81
|
+
include Molecules
|
82
|
+
include Molecules::Libraries
|
83
|
+
include Constants::Libraries
|
84
|
+
|
85
|
+
class << self
|
86
|
+
|
87
|
+
def inherited(base)
|
88
|
+
base.instance_variable_set(:@residues_to_locate, @residues_to_locate.dup)
|
89
|
+
end
|
90
|
+
|
91
|
+
# A string of residues located by scan.
|
92
|
+
attr_accessor :residues_to_locate
|
93
|
+
|
94
|
+
# Adds residues to residues_to_locate (these residues
|
95
|
+
# will be located by scan). Generally used when some
|
96
|
+
# special fragmentation behavior occurs at specific
|
97
|
+
# residues. By default no residues are located.
|
98
|
+
#
|
99
|
+
# class Subclass < Spectrum
|
100
|
+
# locate_residues "PS"
|
101
|
+
# end
|
102
|
+
#
|
103
|
+
# Subclass.new('RPPGFSPFR').residue_locations
|
104
|
+
# # => {'P' => [1, 2, 6], 'S' => [5]}
|
105
|
+
#
|
106
|
+
# Calls to locate_residues are cumulative.
|
107
|
+
def locate_residues(residues)
|
108
|
+
@residues_to_locate += residues
|
109
|
+
end
|
110
|
+
|
111
|
+
# Scans the sequence to produce a ladder of masses and a
|
112
|
+
# hash of (residue, locations) pairs which indicate the
|
113
|
+
# indices at which the residue occurs in sequence. The
|
114
|
+
# ladder corresponds to the M values described above.
|
115
|
+
#
|
116
|
+
# Returns [ladder, {residue => locations}].
|
117
|
+
#
|
118
|
+
# ==== Inputs
|
119
|
+
# sequence:: a string
|
120
|
+
# masses_by_byte:: an array of masses where the index of
|
121
|
+
# the mass is the byte of the
|
122
|
+
# corresponding residue.
|
123
|
+
# residues_to_locate:: a string of the residues to locate.
|
124
|
+
#
|
125
|
+
# Note: scan is an optimized utility function, but should
|
126
|
+
# be replaced by an inline function to do the same.
|
127
|
+
#
|
128
|
+
# Accumulates the running mass of sequence into a ladder and records
# the ladder index of every residue named in residues_to_locate.
#
# sequence::           a string of residues
# masses_by_byte::     an array of masses indexed by residue byte
# residues_to_locate:: a string of the residues whose positions
#                      should be recorded
#
# Returns [ladder, {residue => locations}].
def scan(sequence, masses_by_byte, residues_to_locate)
  # one location list per requested residue, indexed by byte value
  locations = []
  residues_to_locate.each_byte {|byte| locations[byte] = [] }

  mass = 0
  ladder = []
  sequence.each_byte do |byte|
    mass += masses_by_byte[byte]
    location = locations[byte]

    # ladder.length is the index this residue is about to occupy
    location << ladder.length if location
    ladder << mass
  end

  # Key the results by residue letter. Iterating bytes (rather than
  # indexing the string) keeps the array lookup an Integer on every
  # Ruby version; String#[] returns a String on Ruby >= 1.9, which
  # cannot index an Array.
  hash = {}
  residues_to_locate.each_byte {|byte| hash[byte.chr] = locations[byte] }

  [ladder, hash]
end
|
151
|
+
end
|
152
|
+
|
153
|
+
HYDROGEN = EmpiricalFormula.parse("H")
|
154
|
+
HYDROXIDE = EmpiricalFormula.parse("OH")
|
155
|
+
ELECTRON = Particle['Electron']
|
156
|
+
|
157
|
+
self.residues_to_locate = ""
|
158
|
+
|
159
|
+
# The peptide sequence.
|
160
|
+
attr_reader :sequence
|
161
|
+
|
162
|
+
# The n-terminal modification (default H)
|
163
|
+
attr_reader :nterm
|
164
|
+
|
165
|
+
# The c-terminal modification (default OH)
|
166
|
+
attr_reader :cterm
|
167
|
+
|
168
|
+
# An optional block used to calculate masses of molecules.
|
169
|
+
attr_reader :block
|
170
|
+
|
171
|
+
# A ladder of mass values corresponding to the
|
172
|
+
# M values used in the fragmentation formulae.
|
173
|
+
attr_reader :ladder
|
174
|
+
|
175
|
+
# A hash of (residue, [locations]) pairs where
|
176
|
+
# the locations are the indices in sequence
|
177
|
+
# at which residue occurs.
|
178
|
+
attr_reader :residue_locations
|
179
|
+
|
180
|
+
# Initializes a new Spectrum using the specified n- and c-terminal
|
181
|
+
# modifications. Masses will be calculated using the block, if
|
182
|
+
# specified. If no block is specified, then the monoisotopic
|
183
|
+
# masses will be used.
|
184
|
+
def initialize(sequence, nterm=HYDROGEN, cterm=HYDROXIDE, &block) # :yields: element
|
185
|
+
@sequence = sequence
|
186
|
+
@nterm = nterm
|
187
|
+
@cterm = cterm
|
188
|
+
@block = block
|
189
|
+
|
190
|
+
residue_masses = Residue.residue_index.collect do |residue|
|
191
|
+
next(0) if residue == nil
|
192
|
+
mass(residue)
|
193
|
+
end
|
194
|
+
|
195
|
+
@ladder, @residue_locations = self.class.scan(
|
196
|
+
sequence,
|
197
|
+
residue_masses,
|
198
|
+
self.class.residues_to_locate)
|
199
|
+
|
200
|
+
@series_hash = {}
|
201
|
+
@series_mask = {}
|
202
|
+
end
|
203
|
+
|
204
|
+
# Returns the mass of the parent ion for the sequence, given the charge.
|
205
|
+
def parent_ion_mass(charge=1)
|
206
|
+
(mass(nterm) + ladder.last + mass(cterm) + charge * proton_mass)/charge
|
207
|
+
end
|
208
|
+
|
209
|
+
# Returns the mass of a proton (ie Hydrogen minus an Electron)
|
210
|
+
def proton_mass
|
211
|
+
mass(HYDROGEN) - mass(ELECTRON)
|
212
|
+
end
|
213
|
+
|
214
|
+
# Retrieves the specified series, assuming a charge of 1. A different charge
|
215
|
+
# can be specified for the series by using '+' and '-'. For example:
|
216
|
+
#
|
217
|
+
# f = Spectrum.new 'RPPGFSPFR'
|
218
|
+
# f.series('y') == f.y_series # => true
|
219
|
+
# f.series('b++') == f.b_series(2) # => true
|
220
|
+
# f.series('nladder-') == f.nladder_series(-1) # => true
|
221
|
+
#
|
222
|
+
# Series raises an error if the specified charge is zero.
|
223
|
+
# Parses a series specification of the form
# "<type><+...><-...> <modification>" and dispatches to the matching
# "<type>_series" method with the parsed charge and the modification
# (whitespace removed, "" when absent). With no '+' or '-' the charge
# is 1; otherwise it is the number of '+' minus the number of '-'.
# Unrecognized specifications go to handle_unknown_series.
#
# Raises ArgumentError if the '+' and '-' signs cancel to zero.
def series(s)
  s = s.to_s.strip
  parsed = /^(immonium|nladder|cladder|[abcxyYz])(\+*)(-*)(\s[\+\-\s\w\d]+)?$/.match(s)
  return handle_unknown_series(s) if parsed.nil?

  type, plus, minus = parsed[1], parsed[2], parsed[3]
  mod = parsed[4].to_s.gsub(/\s/, "")

  charge = plus.length - minus.length
  if plus.empty? && minus.empty?
    # no explicit sign means a single positive charge
    charge = 1
  elsif charge == 0
    raise ArgumentError.new("zero charge specified in series: #{s}")
  end

  send("#{type}_series", charge, mod)
end
|
247
|
+
|
248
|
+
def immonium_series(charge=1, mod=nil)
|
249
|
+
get_series(:immonium, charge, mod) do
|
250
|
+
delta = mass(mod) - mass('CO')
|
251
|
+
|
252
|
+
previous = 0
|
253
|
+
series = []
|
254
|
+
ladder.each do |current|
|
255
|
+
series << (current - previous + delta + charge * proton_mass)/charge
|
256
|
+
previous = current
|
257
|
+
end
|
258
|
+
series
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
# [N]+[M]-CHO
|
263
|
+
def a_series(charge=1, mod=nil)
|
264
|
+
get_series(:a, charge, mod) do
|
265
|
+
delta = mass(mod) + mass(nterm) - mass('CHO') + charge * proton_mass
|
266
|
+
nterm_series(delta, charge)
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
# [N]+[M]-H
|
271
|
+
def b_series(charge=1, mod=nil)
|
272
|
+
get_series(:b, charge, mod) do
|
273
|
+
delta = mass(mod) + mass(nterm) - mass('H') + charge * proton_mass
|
274
|
+
nterm_series(delta, charge)
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
278
|
+
# [N]+[M]+NH2
|
279
|
+
def c_series(charge=1, mod=nil)
|
280
|
+
get_series(:c, charge, mod) do
|
281
|
+
delta = mass(mod) + mass(nterm) + mass('NH2') + charge * proton_mass
|
282
|
+
nterm_series(delta, charge)
|
283
|
+
end
|
284
|
+
end
|
285
|
+
|
286
|
+
# [M]+H2O
|
287
|
+
#--
|
288
|
+
# Ask Peter about these as well... Currently I'm adding water to
|
289
|
+
# cap the ends, as if a hydrolysis reaction produced the ladder,
|
290
|
+
# then I'm adding H for charge... is this what is intended?
|
291
|
+
# Why not cladder[0] or cladder[-1]?
|
292
|
+
#++
|
293
|
+
def cladder_series(charge=1, mod=nil)
|
294
|
+
get_series(:cladder, charge, mod) do
|
295
|
+
delta = mass(mod) + mass('H2O') + charge * proton_mass
|
296
|
+
nterm_series(delta, charge)
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
# [C]+[M]+CO-H
|
301
|
+
def x_series(charge=1, mod=nil)
|
302
|
+
get_series(:x, charge, mod) do
|
303
|
+
delta = mass(mod) + ladder.last + mass(cterm) + mass('CO - H') + charge * proton_mass
|
304
|
+
cterm_series(delta, charge)
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
# [C]+[M]+H
|
309
|
+
def y_series(charge=1, mod=nil)
|
310
|
+
get_series(:y, charge, mod) do
|
311
|
+
delta = mass(mod) + ladder.last + mass(cterm) + mass('H') + charge * proton_mass
|
312
|
+
cterm_series(delta, charge)
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
# [C]+[M]-H
|
317
|
+
def Y_series(charge=1, mod=nil)
|
318
|
+
get_series(:Y, charge, mod) do
|
319
|
+
delta = mass(mod) + ladder.last + mass(cterm) - mass('H') + charge * proton_mass
|
320
|
+
cterm_series(delta, charge)
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
# [C]+[M]-NH2
|
325
|
+
def z_series(charge=1, mod=nil)
|
326
|
+
get_series(:z, charge, mod) do
|
327
|
+
delta = mass(mod) + ladder.last + mass(cterm) - mass('NH2') + charge * proton_mass
|
328
|
+
cterm_series(delta, charge)
|
329
|
+
end
|
330
|
+
end
|
331
|
+
|
332
|
+
# [M]+H2O
|
333
|
+
#--
|
334
|
+
# Ask Peter about these as well... Currently I'm adding water to
|
335
|
+
# cap the ends, as if a hydrolysis reaction produced the ladder,
|
336
|
+
# then I'm adding H for charge... is this what is intended?
|
337
|
+
# Why not nladder[-1]?
|
338
|
+
#++
|
339
|
+
def nladder_series(charge=1, mod=nil)
|
340
|
+
get_series(:nladder, charge, mod) do
|
341
|
+
delta = mass(mod) + ladder.last + mass('H2O') + charge * proton_mass
|
342
|
+
cterm_series(delta, charge)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
|
346
|
+
protected
|
347
|
+
|
348
|
+
# A hash holding all calculated series for self. Series are keyed
|
349
|
+
# by the type and charge of the series (ex: b1, b2, y1, y2).
|
350
|
+
attr_accessor :series_hash
|
351
|
+
|
352
|
+
# A hash holding the locations of residues that need to be masked (ie
|
353
|
+
# multiplied by -1) in a given series. Mask locations should be unique
|
354
|
+
# so that a given location will not be masked twice; the method
|
355
|
+
# mask_locations can assist in doing so. Series masks are keyed
|
356
|
+
# by the series type (ex: b, y).
|
357
|
+
attr_accessor :series_mask
|
358
|
+
|
359
|
+
# Calculates the mass of the molecule for a variety of input
|
360
|
+
# types:
|
361
|
+
#
|
362
|
+
# EmpiricalFormula molecule.mass(&block)
|
363
|
+
# Particle molecule.mass
|
364
|
+
# String EmpiricalFormula.mass(molecule, &block)
|
365
|
+
# Numeric molecule
|
366
|
+
# nil 0
|
367
|
+
#
|
368
|
+
def mass(molecule)
|
369
|
+
|
370
|
+
# note that Particles will not actually make use of the
|
371
|
+
# block, even though it is being passed to it.
|
372
|
+
|
373
|
+
case molecule
|
374
|
+
when EmpiricalFormula, Particle then molecule.mass(&block)
|
375
|
+
when String then EmpiricalFormula.mass(molecule, &block)
|
376
|
+
when nil then 0
|
377
|
+
when Numeric then molecule
|
378
|
+
else
|
379
|
+
raise "cannot calculate mass of: #{molecule}"
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
# Generates an n-terminal series (ex: a, b, or c) by adding delta
|
384
|
+
# to each element from ladder, and dividing by charge. Delta,
|
385
|
+
# therefore, should ALREADY take account of the protons added
|
386
|
+
# by charge.
|
387
|
+
def nterm_series(delta, charge)
  # Each ladder entry is shifted by delta and then divided by charge;
  # delta is expected to include the charge-dependent protons already.
  ladder.map do |rung|
    shifted = rung + delta
    shifted/charge
  end
end
|
390
|
+
|
391
|
+
# Generates a c-terminal series (ex: x, y, or z) by subtracting each
|
392
|
+
# element from ladder from delta, and dividing by charge. Delta,
|
393
|
+
# therefore, should ALREADY take account of the protons added
|
394
|
+
# by charge.
|
395
|
+
def cterm_series(delta, charge)
  # The first entry is delta itself (nothing subtracted), divided by
  # charge.
  shifted = [delta/charge]

  # Followed by delta minus each ladder entry, divided by charge.
  ladder.each {|rung| shifted << (delta - rung)/charge }

  # Drop the entry derived from the last ladder element, so the result
  # has the same length as the ladder.
  shifted.pop
  shifted
end
|
401
|
+
|
402
|
+
# Adds the specified locations to the series mask, ensuring that the
|
403
|
+
# specified locations will be unique within the mask. If overwrite
|
404
|
+
# is true, then the input locations will overwrite any existing mask
|
405
|
+
# locations.
|
406
|
+
def mask_locations(series, locations, overwrite=false)
  # Negative locations count back from the end of the ladder, so
  # convert them to their non-negative equivalents before storing.
  normalized = locations.collect do |location|
    location < 0 ? ladder.length + location : location
  end

  if overwrite
    # Replace whatever mask was registered for this series.
    series_mask[series] = normalized.uniq
  else
    # Append to the existing mask in place (creating it if needed).
    current = (series_mask[series] ||= [])
    current.concat(normalized).uniq!

    # uniq! returns nil when it removes nothing, so return the mask
    # array explicitly; both branches then return the stored mask.
    current
  end
end
|
417
|
+
|
418
|
+
# Retrieves the series keyed by "#{key}#{charge}#{mod}" in series_hash.
|
419
|
+
# If the series has not been initialized, the series will be
|
420
|
+
# initialized using the supplied block, and masked using the
|
421
|
+
# series_mask indicated by key (not "#{key}#{charge}").
|
422
|
+
def get_series(key, charge=nil, mod=nil)
  # The cache key is the plain concatenation of key, charge and mod;
  # nil parts interpolate as empty strings.
  cache_key = "#{key}#{charge}#{mod}"

  # The block is only yielded to on a cache miss; its result is masked
  # (using key and mod, not the charge) before being stored.
  series_hash[cache_key] ||= mask(yield, key, mod)
end
|
425
|
+
|
426
|
+
# Mask the locations in the series by multiplying them by -1. Mask
|
427
|
+
# does NOT check to see if the location is negative or positive.
|
428
|
+
def mask(series, key, mod)
  # Base mask registered for the series type; may be nil.
  locations = series_mask[key]

  unless mod.nil?
    # Additional mask registered specifically for this modification.
    mod_locations = series_mask["#{key}#{mod}"]
    if mod_locations
      # Combine into a NEW array so the stored masks are left untouched.
      # The base mask may be missing when only a modification-specific
      # mask was registered; treat it as empty instead of calling + on
      # nil.
      locations = ((locations || []) + mod_locations).uniq
    end
  end

  # Flip the sign of every masked entry; the series is modified in
  # place and also returned.
  locations.each {|i| series[i] *= -1 } if locations
  series
end
|
442
|
+
|
443
|
+
# Hook to custom-handle an unknown series from the series method.
|
444
|
+
# By default, handle_unknown_series raises an ArgumentError.
|
445
|
+
def handle_unknown_series(s)
  # Default behavior: reject the series, naming it in the message.
  message = "unknown series: #{s}"
  raise ArgumentError, message
end
|
448
|
+
end
|
449
|
+
end
|
450
|
+
end
|
data/lib/ms/in_silico.rb
ADDED
data/tap.yml
ADDED
File without changes
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ms-in_silico
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simon Chiang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-20 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: tap
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.11"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: molecules
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.0
|
34
|
+
version:
|
35
|
+
description:
|
36
|
+
email: simon.a.chiang@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
- MIT-LICENSE
|
44
|
+
files:
|
45
|
+
- lib/ms/in_silico.rb
|
46
|
+
- lib/ms/in_silico/digest.rb
|
47
|
+
- lib/ms/in_silico/digester.rb
|
48
|
+
- lib/ms/in_silico/fragment.rb
|
49
|
+
- lib/ms/in_silico/spectrum.rb
|
50
|
+
- tap.yml
|
51
|
+
- README
|
52
|
+
- MIT-LICENSE
|
53
|
+
has_rdoc: true
|
54
|
+
homepage: http://mspire.rubyforge.org/projects/ms-in_silico/
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: mspire
|
75
|
+
rubygems_version: 1.3.0
|
76
|
+
signing_key:
|
77
|
+
specification_version: 2
|
78
|
+
summary: ms-in_silico task library
|
79
|
+
test_files:
|
80
|
+
- test/tap_test_suite.rb
|