ms-in_silico 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +19 -0
- data/README +55 -0
- data/lib/ms/in_silico/digest.rb +35 -0
- data/lib/ms/in_silico/digester.rb +263 -0
- data/lib/ms/in_silico/fragment.rb +74 -0
- data/lib/ms/in_silico/spectrum.rb +450 -0
- data/lib/ms/in_silico.rb +4 -0
- data/tap.yml +0 -0
- data/test/tap_test_suite.rb +5 -0
- metadata +80 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2008, Regents of the University of Colorado.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
4
|
+
software and associated documentation files (the "Software"), to deal in the Software
|
5
|
+
without restriction, including without limitation the rights to use, copy, modify, merge,
|
6
|
+
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
|
7
|
+
to whom the Software is furnished to do so, subject to the following conditions:
|
8
|
+
|
9
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
10
|
+
substantial portions of the Software.
|
11
|
+
|
12
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
13
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
14
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
15
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
16
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
17
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
18
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
19
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= {Ms-InSilico}[http://mspire.rubyforge.org/projects/ms-in_silico]
|
2
|
+
|
3
|
+
An {Mspire}[http://mspire.rubyforge.org] library supporting in-silico calculations for mass spec data.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Ms-InSilico provides the following modules:
|
8
|
+
|
9
|
+
* Ms::InSilico::Digester (protein digestion)
|
10
|
+
* Ms::InSilico::Spectrum (peptide fragmentation)
|
11
|
+
|
12
|
+
Corresponding Tap[http://tap.rubyforge.org] tasks are also provided.
|
13
|
+
|
14
|
+
* Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/16692-mspire/tickets]
|
15
|
+
* Github[http://github.com/bahuvrihi/ms-in_silico/tree/master]
|
16
|
+
* {Google Group}[http://groups.google.com/group/mspire-forum]
|
17
|
+
|
18
|
+
== Usage
|
19
|
+
|
20
|
+
require 'ms/in_silico/digester'
|
21
|
+
require 'ms/in_silico/spectrum'
|
22
|
+
include Ms::InSilico
|
23
|
+
|
24
|
+
trypsin = Digester['Trypsin']
|
25
|
+
peptides = trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
|
26
|
+
# => [
|
27
|
+
# 'MIVIGR',
|
28
|
+
# 'SIVHPYITNEYEPFAAEK',
|
29
|
+
# 'QQILSIMAG']
|
30
|
+
|
31
|
+
spectrum = Spectrum.new(peptides[0])
|
32
|
+
spectrum.parent_ion_mass
|
33
|
+
# => 688.417442373391
|
34
|
+
|
35
|
+
spectrum.series('b')
|
36
|
+
# => [
|
37
|
+
# 132.047761058391,
|
38
|
+
# 245.131825038791,
|
39
|
+
# 344.200238954991,
|
40
|
+
# 457.284302935391,
|
41
|
+
# 514.305766658991,
|
42
|
+
# 670.406877687091]
|
43
|
+
|
44
|
+
== Installation
|
45
|
+
|
46
|
+
Ms-InSilico is available as a gem on RubyForge[http://rubyforge.org/projects/mspire]. Use:
|
47
|
+
|
48
|
+
% gem install ms-in_silico
|
49
|
+
|
50
|
+
== Info
|
51
|
+
|
52
|
+
Copyright (c) 2006-2008, Regents of the University of Colorado.
|
53
|
+
Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
|
54
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
55
|
+
Licence:: {MIT-Style}[link:files/MIT-LICENSE.html]
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'ms/in_silico/digester'
|
2
|
+
|
3
|
+
module Ms
|
4
|
+
module InSilico
|
5
|
+
# Ms::InSilico::Digest::manifest digest a protein sequence into peptides
|
6
|
+
# Digest a protein sequence into an array of peptides.
|
7
|
+
#
|
8
|
+
# % rap digest MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG --+ dump --no-audit
|
9
|
+
# I[14:37:55] digest MIVIGRSIVHP... to 3 peptides
|
10
|
+
# # date: 2008-09-15 14:37:55
|
11
|
+
# ---
|
12
|
+
# ms/in_silico/digest (23483900):
|
13
|
+
# - - MIVIGR
|
14
|
+
# - SIVHPYITNEYEPFAAEK
|
15
|
+
# - QQILSIMAG
|
16
|
+
#
|
17
|
+
class Digest < Tap::Task

  config :digester, 'Trypsin' # the name of the digester
  config :max_misses, 0, &c.integer # the max # of missed cleavage sites
  config :site_digest, false, &c.boolean # digest to sites (rather than sequences)

  # Digests sequence with the configured digester and returns the
  # resulting peptides (or [start_index, end_index] site pairs when
  # site_digest is set).
  #
  # Raises an ArgumentError if the configured digester name is not
  # found by Digester[].
  def process(sequence)
    enzyme = Digester[digester]
    raise ArgumentError, "unknown digester: #{digester}" unless enzyme

    peptides = if site_digest
      enzyme.site_digest(sequence, max_misses)
    else
      enzyme.digest(sequence, max_misses)
    end

    # Only mark the preview as truncated when residues were actually
    # dropped; the preview holds up to 11 residues.
    preview = sequence[0..10]
    ellipsis = sequence.length > preview.length ? '...' : ''
    log 'digest', "#{preview}#{ellipsis} to #{peptides.length} peptides"

    peptides
  end

end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
require 'constants/library'
|
2
|
+
require 'strscan'
|
3
|
+
|
4
|
+
module Ms
|
5
|
+
module InSilico
|
6
|
+
|
7
|
+
# Digester splits a protein sequence into peptides at sites specified
|
8
|
+
# during initialization; in short Digester models a cleavage enzyme.
|
9
|
+
# Digesters support missed cleavage sites, and can return either the
|
10
|
+
# peptide strings or the cleavage sites.
|
11
|
+
#
|
12
|
+
# Digester includes {Constants::Library}[http://bioactive.rubyforge.org/constants/classes/Constants/Library.html],
|
13
|
+
# allowing access to many common digesters using Digester[]:
|
14
|
+
#
|
15
|
+
# trypsin = Digester['Trypsin']
|
16
|
+
# trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
|
17
|
+
# # => [
|
18
|
+
# # 'MIVIGR',
|
19
|
+
# # 'SIVHPYITNEYEPFAAEK',
|
20
|
+
# # 'QQILSIMAG']
|
21
|
+
#
|
22
|
+
# trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
|
23
|
+
# # => [
|
24
|
+
# # 'MIVIGR',
|
25
|
+
# # 'MIVIGRSIVHPYITNEYEPFAAEK',
|
26
|
+
# # 'SIVHPYITNEYEPFAAEK',
|
27
|
+
# # 'SIVHPYITNEYEPFAAEKQQILSIMAG',
|
28
|
+
# # 'QQILSIMAG'
|
29
|
+
# # ]
|
30
|
+
#
|
31
|
+
# trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
|
32
|
+
# # => [
|
33
|
+
# # [0,6],
|
34
|
+
# # [0,24],
|
35
|
+
# # [6,24],
|
36
|
+
# # [6,33],
|
37
|
+
# # [24,33]
|
38
|
+
# # ]
|
39
|
+
#
|
40
|
+
# ==== Enzymes
|
41
|
+
#
|
42
|
+
# Enzymes in the library were adapted from the default Mascot[http://www.matrixscience.com/]
|
43
|
+
# enzyme list. Currently supported enzymes include:
|
44
|
+
#
|
45
|
+
# * Arg-C
|
46
|
+
# * Asp-N
|
47
|
+
# * Asp-N_ambic
|
48
|
+
# * Chymotrypsin
|
49
|
+
# * CNBr
|
50
|
+
# * Lys-C
|
51
|
+
# * Lys-C/P
|
52
|
+
# * PepsinA
|
53
|
+
# * Tryp-CNBr
|
54
|
+
# * TrypChymo
|
55
|
+
# * Trypsin/P
|
56
|
+
# * V8-DE
|
57
|
+
# * V8-E
|
58
|
+
# * Trypsin
|
59
|
+
# * V8-E+Trypsin
|
60
|
+
# * V8-DE+Trypsin
|
61
|
+
#
|
62
|
+
# Several enzymes require two or more digesters, or functionality that
|
63
|
+
# is not provided by Digester, and so remain unsupported:
|
64
|
+
#
|
65
|
+
# * CNBr+Trypsin
|
66
|
+
# * Formic_acid
|
67
|
+
# * LysC+AspN
|
68
|
+
# * semiTrypsin
|
69
|
+
#
|
70
|
+
class Digester
|
71
|
+
|
72
|
+
# The name of the digester
|
73
|
+
attr_reader :name
|
74
|
+
|
75
|
+
# A string of residues at which cleavage occurs
|
76
|
+
attr_reader :cleave_str
|
77
|
+
|
78
|
+
# A c-terminal restriction residue which prevents
|
79
|
+
# cleavage at a potential cleavage site (optional).
|
80
|
+
attr_reader :cterm_exception
|
81
|
+
|
82
|
+
# True if cleavage occurs at the c-terminus of a
|
83
|
+
# cleavage residue, false if cleavage occurs at
|
84
|
+
# the n-terminus.
|
85
|
+
attr_reader :cterm_cleavage
|
86
|
+
|
87
|
+
# a multiline whitespace regexp
|
88
|
+
WHITESPACE = /\s*/m
|
89
|
+
|
90
|
+
# Creates a digester that cleaves at any residue in cleave_str.
#
# name::            the name of the digester
# cleave_str::      a string of residues at which cleavage occurs
# cterm_exception:: an optional single residue that blocks cleavage
#                   when it follows a cleavage site (nil or an empty
#                   string means no exception)
# cterm_cleavage::  true to cleave c-terminal to the residue, false
#                   to cleave n-terminal to it
#
# Raises an ArgumentError if cterm_exception is longer than one residue.
def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
  @name = name
  @cleave_str = cleave_str

  # Regexp.union escapes every residue, and yields a pattern that never
  # matches when cleave_str is empty (an empty pattern would match
  # everywhere without advancing the scanner).
  @cleave_regexp = Regexp.union(*cleave_str.split(''))

  @cterm_exception =
    if cterm_exception.nil? || cterm_exception.empty?
      nil
    elsif cterm_exception.length == 1
      # stored in the same form String#[] returns, so that it can be
      # compared directly against seq[pos]
      cterm_exception[0]
    else
      raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
    end

  @cterm_cleavage = cterm_cleavage
  @scanner = StringScanner.new('')
end
|
107
|
+
|
108
|
+
# Returns the digestion sites in sequence, as determined by
|
109
|
+
# the cleave_regexp boundaries. The digestion sites correspond
|
110
|
+
# to the positions where a peptide begins and ends, such that
|
111
|
+
# [n, (n+1) - n] corresponds to the [index, length] for peptide n.
|
112
|
+
#
|
113
|
+
# d = Digester.new('Trypsin', 'KR', 'P')
|
114
|
+
# seq = "AARGGR"
|
115
|
+
# sites = d.cleavage_sites(seq) # => [0, 3, 6]
|
116
|
+
#
|
117
|
+
# seq[sites[0], sites[0+1] - sites[0]] # => "AAR"
|
118
|
+
# seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
|
119
|
+
#
|
120
|
+
# Trailing whitespace is included in the fragment.
|
121
|
+
#
|
122
|
+
# seq = "AAR \n GGR"
|
123
|
+
# sites = d.cleavage_sites(seq) # => [0, 8, 11]
|
124
|
+
#
|
125
|
+
# seq[sites[0], sites[0+1] - sites[0]] # => "AAR \n "
|
126
|
+
# seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
|
127
|
+
#
|
128
|
+
# The digested section of sequence may be specified using offset
|
129
|
+
# and length.
|
130
|
+
# Returns the fragment boundaries of seq between offset and
# offset + length. The first boundary is always offset and the last
# is always offset + length, so consecutive boundaries delimit every
# fragment of the digested section.
def cleavage_sites(seq, offset=0, length=seq.length-offset)
  # scan reports the position after the cleavage residue; n-terminal
  # cleavage happens one residue earlier.
  adjustment = cterm_cleavage ? 0 : 1
  limit = offset + length

  positions = [offset]
  scan(seq, offset, limit) do |pos|
    site = pos - adjustment

    # an n-terminal cleavage at the very first residue coincides with
    # the start boundary and would only produce an empty fragment
    positions << site unless site == positions.last
  end

  # Close the final fragment unless the last cleavage already sits on
  # the limit. Testing the last boundary (rather than the scanner
  # position) keeps the final fragment when scanning stopped beyond the
  # limit, and when an n-terminal enzyme cleaves before the last residue.
  # An empty section still yields [offset, offset].
  if positions.last != limit || positions.length == 1
    positions << limit
  end

  positions
end
|
146
|
+
|
147
|
+
# Returns digestion sites of sequence as [start_index, end_index] pairs,
|
148
|
+
# allowing for missed cleavages. Digestion sites are determined using
|
149
|
+
# cleavage_sites; as in that method, the digested section of sequence
|
150
|
+
# may be specified using offset and length.
|
151
|
+
#
|
152
|
+
# Each [start_index, end_index] pair is yielded to the block, if given,
|
153
|
+
# and the collected results are returned.
|
154
|
+
def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset) # :yields: start_index, end_index
  boundaries = cleavage_sites(seq, offset, length)

  # overlay works in terms of boundary numbers; translate each pair
  # back into sequence positions before handing it on.
  overlay(boundaries.length, max_misses, 1) do |first, last|
    span = [boundaries[first], boundaries[last]]
    block_given? ? yield(*span) : span
  end
end
|
164
|
+
|
165
|
+
# Returns an array of peptides produced by digesting sequence, allowing for
|
166
|
+
# missed cleavage sites. Digestion sites are determined using cleavage_sites;
|
167
|
+
# as in that method, the digested section of sequence may be specified using
|
168
|
+
# offset and length.
|
169
|
+
def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
  sites = site_digest(seq, max_misses, offset, length)

  # slice each [start, end] pair out of the sequence
  sites.map do |first, last|
    seq[first, last - first]
  end
end
|
174
|
+
|
175
|
+
protected
|
176
|
+
|
177
|
+
# The cleavage regexp used to identify cleavage sites
|
178
|
+
attr_reader :cleave_regexp # :nodoc:
|
179
|
+
|
180
|
+
# The scanner used to digest strings.
|
181
|
+
attr_reader :scanner # :nodoc:
|
182
|
+
|
183
|
+
# Scans seq between offset and limit for the cleave_regexp, skipping whitespace
|
184
|
+
# and being mindful of exception characters. The positions of the scanner at
|
185
|
+
# each match are yielded to the block.
|
186
|
+
def scan(seq, offset, limit) # :nodoc:
  scanner.string = seq
  scanner.pos = offset

  # advance to just past each cleavage residue in turn
  while scanner.skip_until(cleave_regexp)
    # whitespace trailing the residue belongs to the current fragment
    scanner.skip(WHITESPACE)
    site = scanner.pos

    # a site followed by the exception residue is not cleaved
    blocked = !cterm_exception.nil? && seq[site] == cterm_exception
    unless blocked
      # stop once the scanner has run past the upper limit
      break if site > limit

      yield site
    end
  end

  scanner.pos
end
|
205
|
+
|
206
|
+
# Performs an overlap-collect algorithm providing the start and end
|
207
|
+
# indices of spans skipping up to max_misses boundaries.
|
208
|
+
# Yields every (start_index, end_index) pair of boundary numbers in
# 0...n where end_index lies offset + k boundaries after start_index
# for k in 0..max_misses, and returns the collected block results.
def overlay(n, max_misses, offset) # :nodoc:
  results = []
  (0...n).each do |start_index|
    (0..max_misses).each do |n_miss|
      end_index = start_index + offset + n_miss

      # stop at the first end index outside of 0...n; using >= (not ==)
      # also stops when offset carries the first candidate past n
      break if end_index >= n

      results << yield(start_index, end_index)
    end
  end
  results
end
|
220
|
+
|
221
|
+
#
|
222
|
+
# Enzymes adapted from the default Mascot enzyme list.
|
223
|
+
#
|
224
|
+
|
225
|
+
class << self
  protected

  # Utility method to build a Digester from one tab-delimited line
  # of a mascot enzyme configuration. Only the first four fields
  # (name, sense, cleavage residues, c-terminal exception) are used.
  # Raises an ArgumentError unless sense is 'C-Term' or 'N-Term'.
  def mascot_parse(str) # :nodoc:
    fields = str.split(/ *\t */)
    name = fields[0]
    sense = fields[1]
    cleave_str = fields[2]
    cterm_exception = fields[3]

    cterm_cleavage =
      if sense == 'C-Term'
        true
      elsif sense == 'N-Term'
        false
      else
        raise ArgumentError, "unknown sense: #{sense}"
      end

    new(name, cleave_str, cterm_exception, cterm_cleavage)
  end
end
|
241
|
+
|
242
|
+
ARG_C = mascot_parse('Arg-C C-Term R P no no')
|
243
|
+
ASP_N = mascot_parse('Asp-N N-Term BD no no')
|
244
|
+
ASP_N_AMBIC = mascot_parse('Asp-N_ambic N-Term DE no no')
|
245
|
+
CHYMOTRYPSIN = mascot_parse('Chymotrypsin C-Term FLWY P no no')
|
246
|
+
CNBR = mascot_parse('CNBr C-Term M no no')
|
247
|
+
LYS_C = mascot_parse('Lys-C C-Term K P no no')
|
248
|
+
LYS_C_P = mascot_parse('Lys-C/P C-Term K no no')
|
249
|
+
PEPSIN_A = mascot_parse('PepsinA C-Term FL no no')
|
250
|
+
TRYP_CNBR = mascot_parse('Tryp-CNBr C-Term KMR P no no')
|
251
|
+
TRYP_CHYMO = mascot_parse('TrypChymo C-Term FKLRWY P no no')
|
252
|
+
TRYPSIN_P = mascot_parse('Trypsin/P C-Term KR no no')
|
253
|
+
V8_DE = mascot_parse('V8-DE C-Term BDEZ P no no')
|
254
|
+
V8_E = mascot_parse('V8-E C-Term EZ P no no')
|
255
|
+
TRYPSIN = mascot_parse('Trypsin C-Term KR P no no')
|
256
|
+
V8_E_TRYPSIN = mascot_parse('V8-E+Trypsin C-Term EKRZ P no no')
|
257
|
+
V8_DE_TRYPSIN = mascot_parse('V8-DE+Trypsin C-Term BDEKRZ P no no')
|
258
|
+
|
259
|
+
include Constants::Library
|
260
|
+
library.index_by_attribute :name
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'ms/in_silico/spectrum'
|
2
|
+
|
3
|
+
module Ms
|
4
|
+
module InSilico
|
5
|
+
|
6
|
+
# Ms::InSilico::Fragment::manifest calculates a theoretical ms/ms spectrum
|
7
|
+
#
|
8
|
+
# Calculates the parent ion mass and theoretical ms/ms spectrum for a
|
9
|
+
# peptide sequence. Configurations allow the specification of one or
|
10
|
+
# more fragmentation series to include, as well as charge, and intensity.
|
11
|
+
#
|
12
|
+
# % rap fragment TVQQEL --+ dump --no-audit
|
13
|
+
# # date: 2008-09-15 14:37:55
|
14
|
+
# ---
|
15
|
+
# ms/in_silico/fragment (:...:):
|
16
|
+
# - - 717.377745628191
|
17
|
+
# - - 102.054954926291
|
18
|
+
# - 132.101905118891
|
19
|
+
# - 201.123368842491
|
20
|
+
# - 261.144498215091
|
21
|
+
# - 329.181946353891
|
22
|
+
# - 389.203075726491
|
23
|
+
# - 457.240523865291
|
24
|
+
# - 517.261653237891
|
25
|
+
# - 586.283116961491
|
26
|
+
# - 616.330067154091
|
27
|
+
# - 699.367180941891
|
28
|
+
# - 717.377745628191
|
29
|
+
#
|
30
|
+
# In the output, the parent ion mass is given first, followed by an
|
31
|
+
# array of the sorted fragmentation data.
|
32
|
+
class Fragment < Tap::Task

  # A config block that coerces an input into a
  # Molecules::EmpiricalFormula: existing formulae pass
  # through unchanged, anything else is parsed.
  MOLECULE = lambda do |value|
    if Molecules::EmpiricalFormula === value
      value
    else
      Molecules::EmpiricalFormula.parse(value)
    end
  end

  config :series, ['y', 'b'], &c.array # a list of the series to include
  config :charge, 1, &c.integer # the charge for the parent ion
  config :intensity, nil, &c.num_or_nil # a uniform intensity value
  config :nterm, 'H', &MOLECULE # the n-terminal modification
  config :cterm, 'OH', &MOLECULE # the c-terminal modification
  config :sort, true, &c.switch # sorts the data by mass
  config :unmask, true, &c.switch # remove masked (negative) masses

  # Returns [parent_ion_mass, masses] for the peptide, where masses
  # gathers every configured series. Negative masses are dropped when
  # unmask is set, the masses are sorted when sort is set, and each
  # mass becomes a [mass, intensity] pair when an intensity is given.
  def process(peptide)
    log :fragment, peptide
    spec = spectrum(peptide)

    masses = series.inject([]) do |collected, name|
      collected.concat(spec.series(name))
    end

    masses.reject! { |m| m < 0 } if unmask
    masses.sort! if sort
    masses = masses.map { |m| [m, intensity] } if intensity

    [spec.parent_ion_mass(charge), masses]
  end

  protected

  # Returns a new Spectrum used in the calculation.
  # Primarily a hook for custom spectra in subclasses.
  def spectrum(peptide)
    Spectrum.new(peptide, nterm, cterm)
  end

end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,450 @@
|
|
1
|
+
require 'molecules/libraries/residue'
|
2
|
+
require 'constants/libraries/particle'
|
3
|
+
require 'ms/in_silico'
|
4
|
+
|
5
|
+
module Ms
|
6
|
+
module InSilico
|
7
|
+
|
8
|
+
# Spectrum calculates the theoretical ion series produced by a fragmentation
|
9
|
+
# process such as collision induced dissociation (CID). The formulae used to
|
10
|
+
# calculate the ion series were obtained from the {Matrix Science
|
11
|
+
# website}[http://www.matrixscience.com/]. Spectrum uses the
|
12
|
+
# {Constants}[http://bioactive.rubyforge.org/constants/] gem as the default
|
13
|
+
# source of element and particle masses.
|
14
|
+
#
|
15
|
+
# spec = Ms::InSilico::Spectrum.new('TVQQEL')
|
16
|
+
# spec.series('b')
|
17
|
+
# # => [
|
18
|
+
# # 102.054954926291,
|
19
|
+
# # 201.123368842491,
|
20
|
+
# # 329.181946353891,
|
21
|
+
# # 457.240523865291,
|
22
|
+
# # 586.283116961491,
|
23
|
+
# # 699.367180941891]
|
24
|
+
#
|
25
|
+
# spec.series('y')
|
26
|
+
# # => [
|
27
|
+
# # 717.377745628191,
|
28
|
+
# # 616.330067154091,
|
29
|
+
# # 517.261653237891,
|
30
|
+
# # 389.203075726491,
|
31
|
+
# # 261.144498215091,
|
32
|
+
# # 132.101905118891]
|
33
|
+
#
|
34
|
+
# ==== Formulae to Calculate Fragment Ion m/z values
|
35
|
+
#
|
36
|
+
# <em>Copied directly from the Matrix Science {fragmentation help
|
37
|
+
# section}[http://www.matrixscience.com/help/fragmentation_help.html]</em>
|
38
|
+
#
|
39
|
+
# [N] is the molecular mass of the neutral N-terminal group, [C] is the
|
40
|
+
# molecular mass of the neutral C-terminal group, [M] is molecular mass
|
41
|
+
# of the neutral amino acid residues. To obtain m/z values, add or
|
42
|
+
# subtract protons as required to obtain the required charge and divide
|
43
|
+
# by the number of charges. For example, to get a+, add 1 proton to the
|
44
|
+
# Mr value for a. To get a--, subtract 2 protons from the Mr value for
|
45
|
+
# a and divide by 2.
|
46
|
+
#
|
47
|
+
# Ion Type Neutral Mr
|
48
|
+
# a [N]+[M]-CHO
|
49
|
+
# a* a-NH3
|
50
|
+
# a° a-H2O
|
51
|
+
# b [N]+[M]-H
|
52
|
+
# b* b-NH3
|
53
|
+
# b° b-H2O
|
54
|
+
# c [N]+[M]+NH2
|
55
|
+
# d a - partial side chain
|
56
|
+
# v y - complete side chain
|
57
|
+
# w z - partial side chain
|
58
|
+
# x [C]+[M]+CO-H
|
59
|
+
# y [C]+[M]+H
|
60
|
+
# y* y-NH3
|
61
|
+
# y° y-H2O
|
62
|
+
# z [C]+[M]-NH2
|
63
|
+
#
|
64
|
+
# ==== Use of alternate masses
|
65
|
+
# By default a Spectrum will calculate the ion series' using the
|
66
|
+
# monoisotopic masses for each element. To calculate masses
|
67
|
+
# differently, provide a block to new; each Element will be
|
68
|
+
# passed to the block as needed, and the block should return
|
69
|
+
# the element mass used in the calculation.
|
70
|
+
#
|
71
|
+
# Alternatively, a subclass can override the mass method; all
|
72
|
+
# objects that need to be turned into a mass (nterm, cterm,
|
73
|
+
# a variety of molecules specified as strings, the elements,
|
74
|
+
# ELECTRON, etc) are passed to mass to yield the value used
|
75
|
+
# in any given calculation.
|
76
|
+
#
|
77
|
+
#--
|
78
|
+
# ALL of the collections could be sped up using inline
|
79
|
+
#++
|
80
|
+
class Spectrum
|
81
|
+
include Molecules
|
82
|
+
include Molecules::Libraries
|
83
|
+
include Constants::Libraries
|
84
|
+
|
85
|
+
class << self
|
86
|
+
|
87
|
+
def inherited(base)
  # each subclass starts from its own copy of the residues to locate,
  # so additions made there never leak back into the parent
  inherited_residues = @residues_to_locate.dup
  base.instance_variable_set(:@residues_to_locate, inherited_residues)
end
|
90
|
+
|
91
|
+
# A string of residues located by scan.
|
92
|
+
attr_accessor :residues_to_locate
|
93
|
+
|
94
|
+
# Adds residues to residues_to_locate (these residues
|
95
|
+
# will be located by scan). Generally used when some
|
96
|
+
# special fragmentation behavior occurs at specific
|
97
|
+
# residues. By default no residues are located.
|
98
|
+
#
|
99
|
+
# class Subclass < Spectrum
|
100
|
+
# locate_residues "PS"
|
101
|
+
# end
|
102
|
+
#
|
103
|
+
# Subclass.new('RPPGFSPFR').residue_locations
|
104
|
+
# # => {'P' => [1, 2, 6], 'S' => [5]}
|
105
|
+
#
|
106
|
+
# Calls to locate_residues are cumulative.
|
107
|
+
def locate_residues(residues)
  # build a new string rather than appending in place, leaving the
  # previously assigned string untouched
  @residues_to_locate = @residues_to_locate + residues
end
|
110
|
+
|
111
|
+
# Scans the sequence to produce a ladder of masses and a
|
112
|
+
# hash of (residue, locations) pairs which indicate the
|
113
|
+
# indices at which the residue occurs in sequence. The
|
114
|
+
# ladder corresponds to the M values described above.
|
115
|
+
#
|
116
|
+
# Returns [ladder, {residue => locations}].
|
117
|
+
#
|
118
|
+
# ==== Inputs
|
119
|
+
# sequence:: a string
|
120
|
+
# masses_by_byte:: an array of masses where the index of
|
121
|
+
# the mass is the byte of the
|
122
|
+
# corresponding residue.
|
123
|
+
# residues_to_locate:: a string of the residues to locate.
|
124
|
+
#
|
125
|
+
# Note: scan is an optimized utility function, but should
|
126
|
+
# be replaced by an inline function to do the same.
|
127
|
+
#
|
128
|
+
# Walks sequence byte by byte, accumulating a running mass ladder and
# recording the index of every residue named in residues_to_locate.
#
# Returns [ladder, {residue => locations}], where residue is a
# one-character string.
def scan(sequence, masses_by_byte, residues_to_locate)
  # locations is indexed by byte; only located residues get an array
  locations = []
  residues_to_locate.each_byte { |byte| locations[byte] = [] }

  mass = 0
  ladder = []
  sequence.each_byte do |byte|
    mass += masses_by_byte[byte]
    found = locations[byte]

    # ladder.length is the index of the residue currently being added
    found << ladder.length if found
    ladder << mass
  end

  # Key the result by residue letter. Iterating bytes (rather than
  # indexing the string) yields integers on every Ruby version;
  # String#[] returns a String on ruby >= 1.9 and cannot index an array.
  hash = {}
  residues_to_locate.each_byte do |byte|
    hash[byte.chr] = locations[byte]
  end

  [ladder, hash]
end
|
151
|
+
end
|
152
|
+
|
153
|
+
HYDROGEN = EmpiricalFormula.parse("H")
|
154
|
+
HYDROXIDE = EmpiricalFormula.parse("OH")
|
155
|
+
ELECTRON = Particle['Electron']
|
156
|
+
|
157
|
+
self.residues_to_locate = ""
|
158
|
+
|
159
|
+
# The peptide sequence.
|
160
|
+
attr_reader :sequence
|
161
|
+
|
162
|
+
# The n-terminal modification (default H)
|
163
|
+
attr_reader :nterm
|
164
|
+
|
165
|
+
# The c-terminal modification (default OH)
|
166
|
+
attr_reader :cterm
|
167
|
+
|
168
|
+
# An optional block used to calculate masses of molecules.
|
169
|
+
attr_reader :block
|
170
|
+
|
171
|
+
# A ladder of mass values corresponding to the
|
172
|
+
# M values used in the fragmentation formulae.
|
173
|
+
attr_reader :ladder
|
174
|
+
|
175
|
+
# A hash of (residue, [locations]) pairs where
|
176
|
+
# the locations are the indices in sequence
|
177
|
+
# at which residue occurs.
|
178
|
+
attr_reader :residue_locations
|
179
|
+
|
180
|
+
# Initializes a new Spectrum using the specified n- and c-terminal
|
181
|
+
# modifications. Masses will be calculated using the block, if
|
182
|
+
# specified. If no block is specified, then the monoisotopic
|
183
|
+
# masses will be used.
|
184
|
+
def initialize(sequence, nterm=HYDROGEN, cterm=HYDROXIDE, &block) # :yields: element
  @sequence, @nterm, @cterm = sequence, nterm, cterm

  # the block must be in place before any call to mass
  @block = block

  # one mass per entry of the residue index; unassigned entries weigh 0
  masses_by_byte = Residue.residue_index.map do |residue|
    residue.nil? ? 0 : mass(residue)
  end

  scanned = self.class.scan(sequence, masses_by_byte, self.class.residues_to_locate)
  @ladder, @residue_locations = scanned

  @series_hash = {}
  @series_mask = {}
end
|
203
|
+
|
204
|
+
# Returns the mass of the parent ion for the sequence, given the charge.
|
205
|
+
def parent_ion_mass(charge=1)
  # neutral peptide: both terminal groups plus all residues
  neutral = mass(nterm) + ladder.last + mass(cterm)

  # add one proton per charge, then convert to m/z
  (neutral + charge * proton_mass) / charge
end
|
208
|
+
|
209
|
+
# Returns the mass of a proton (ie Hydrogen minus an Electron)
|
210
|
+
def proton_mass
  hydrogen = mass(HYDROGEN)
  electron = mass(ELECTRON)
  hydrogen - electron
end
|
213
|
+
|
214
|
+
# Retrieves the specified series, assuming a charge of 1. A different charge
|
215
|
+
# can be specified for the series by using '+' and '-'. For example:
|
216
|
+
#
|
217
|
+
# f = Spectrum.new 'RPPGFSPFR'
|
218
|
+
# f.series('y') == f.y_series # => true
|
219
|
+
# f.series('b++') == f.b_series(2) # => true
|
220
|
+
# f.series('nladder-') == f.nladder_series(-1) # => true
|
221
|
+
#
|
222
|
+
# Series raises an error if the specified charge is zero.
|
223
|
+
def series(s)
  s = s.to_s.strip

  # name, then '+' and '-' charge markers, then an optional modification
  match = /^(immonium|nladder|cladder|[abcxyYz])(\+*)(-*)(\s[\+\-\s\w\d]+)?$/.match(s)
  return handle_unknown_series(s) unless match

  type, plus, minus = match[1], match[2], match[3]
  mod = match[4].to_s.gsub(/\s/, "")

  charge = plus.length - minus.length
  if charge == 0
    # a zero net charge is only acceptable when no markers were given
    # at all, in which case the charge defaults to 1
    unless plus.empty?
      raise ArgumentError, "zero charge specified in series: #{s}"
    end
    charge = 1
  end

  send("#{type}_series", charge, mod)
end
|
247
|
+
|
248
|
+
def immonium_series(charge=1, mod=nil)
  get_series(:immonium, charge, mod) do
    delta = mass(mod) - mass('CO')

    # the mass of a single residue is the difference between its
    # ladder rung and the rung below it (0 for the first residue)
    (0...ladder.length).map do |index|
      current = ladder[index]
      previous = index == 0 ? 0 : ladder[index - 1]
      (current - previous + delta + charge * proton_mass) / charge
    end
  end
end
|
261
|
+
|
262
|
+
# [N]+[M]-CHO
|
263
|
+
def a_series(charge=1, mod=nil)
  get_series(:a, charge, mod) do
    # neutral offset for the a series, before protonation
    neutral = mass(mod) + mass(nterm) - mass('CHO')
    nterm_series(neutral + charge * proton_mass, charge)
  end
end
|
269
|
+
|
270
|
+
# [N]+[M]-H
|
271
|
+
def b_series(charge=1, mod=nil)
  get_series(:b, charge, mod) do
    # neutral offset for the b series, before protonation
    neutral = mass(mod) + mass(nterm) - mass('H')
    nterm_series(neutral + charge * proton_mass, charge)
  end
end
|
277
|
+
|
278
|
+
# [N]+[M]+NH2
|
279
|
+
def c_series(charge=1, mod=nil)
  get_series(:c, charge, mod) do
    # neutral offset for the c series, before protonation
    neutral = mass(mod) + mass(nterm) + mass('NH2')
    nterm_series(neutral + charge * proton_mass, charge)
  end
end
|
285
|
+
|
286
|
+
# [M]+H2O
|
287
|
+
#--
|
288
|
+
# Ask Peter about these as well... Currently I'm adding water to
|
289
|
+
# cap the ends, as if a hydrolysis reaction produced the ladder,
|
290
|
+
# then I'm adding H for charge... is this what is intended?
|
291
|
+
# Why not cladder[0] or cladder[-1]?
|
292
|
+
#++
|
293
|
+
def cladder_series(charge=1, mod=nil)
  get_series(:cladder, charge, mod) do
    # water caps the ends of each ladder fragment
    neutral = mass(mod) + mass('H2O')
    nterm_series(neutral + charge * proton_mass, charge)
  end
end
|
299
|
+
|
300
|
+
# [C]+[M]+CO-H
|
301
|
+
def x_series(charge=1, mod=nil)
  get_series(:x, charge, mod) do
    # neutral mass of the complete x fragment, before protonation
    neutral = mass(mod) + ladder.last + mass(cterm) + mass('CO - H')
    cterm_series(neutral + charge * proton_mass, charge)
  end
end
|
307
|
+
|
308
|
+
# [C]+[M]+H
|
309
|
+
def y_series(charge=1, mod=nil)
  get_series(:y, charge, mod) do
    # neutral mass of the complete y fragment, before protonation
    neutral = mass(mod) + ladder.last + mass(cterm) + mass('H')
    cterm_series(neutral + charge * proton_mass, charge)
  end
end
|
315
|
+
|
316
|
+
# [C]+[M]-H
|
317
|
+
def Y_series(charge=1, mod=nil)
|
318
|
+
get_series(:Y, charge, mod) do
|
319
|
+
delta = mass(mod) + ladder.last + mass(cterm) - mass('H') + charge * proton_mass
|
320
|
+
cterm_series(delta, charge)
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
# [C]+[M]-NH2
|
325
|
+
def z_series(charge=1, mod=nil)
  get_series(:z, charge, mod) do
    # neutral mass of the complete z fragment, before protonation
    neutral = mass(mod) + ladder.last + mass(cterm) - mass('NH2')
    cterm_series(neutral + charge * proton_mass, charge)
  end
end
|
331
|
+
|
332
|
+
# [M]+H2O
|
333
|
+
#--
|
334
|
+
# Ask Peter about these as well... Currently I'm adding water to
|
335
|
+
# cap the ends, as if a hydrolysis reaction produced the ladder,
|
336
|
+
# then I'm adding H for charge... is this what is intended?
|
337
|
+
# Why not nladder[-1]?
|
338
|
+
#++
|
339
|
+
def nladder_series(charge=1, mod=nil)
  get_series(:nladder, charge, mod) do
    # water caps the ends of each ladder fragment
    neutral = mass(mod) + ladder.last + mass('H2O')
    cterm_series(neutral + charge * proton_mass, charge)
  end
end
|
345
|
+
|
346
|
+
protected
|
347
|
+
|
348
|
+
# A hash holding all calculated series for self. Series are keyed
|
349
|
+
# by the type and charge of the series (ex: b1, b2, y1, y2).
|
350
|
+
attr_accessor :series_hash
|
351
|
+
|
352
|
+
# A hash holding the locations of residues that need to be masked (ie
|
353
|
+
# multiplied by -1) in a given series. Mask locations should be unique
|
354
|
+
# so that a given location will not be masked twice; the method
|
355
|
+
# mask_locations can assist in doing so. Series masks are keyed
|
356
|
+
# by the series type (ex: b, y).
|
357
|
+
attr_accessor :series_mask
|
358
|
+
|
359
|
+
# Calculates the mass of the molecule for a variety of input
|
360
|
+
# types:
|
361
|
+
#
|
362
|
+
# EmpiricalFormula molecule.mass(&block)
|
363
|
+
# Particle molecule.mass
|
364
|
+
# String EmpiricalFormula.mass(molecule, &block)
|
365
|
+
# Numeric molecule
|
366
|
+
# nil 0
|
367
|
+
#
|
368
|
+
def mass(molecule)
  # Formulae and particles compute their own mass. Particles ignore
  # the block, but passing it to both keeps the call uniform.
  if EmpiricalFormula === molecule || Particle === molecule
    return molecule.mass(&block)
  end

  # strings are treated as formulae
  return EmpiricalFormula.mass(molecule, &block) if String === molecule

  # nothing weighs nothing, and numbers are already masses
  return 0 if molecule.nil?
  return molecule if Numeric === molecule

  raise "cannot calculate mass of: #{molecule}"
end
|
382
|
+
|
383
|
+
# Generates an n-terminal series (ex: a, b, or c) by adding delta
|
384
|
+
# to each element from ladder, and dividing by charge. Delta,
|
385
|
+
# therefore, should ALREADY take account of the protons added
|
386
|
+
# by charge.
|
387
|
+
# Builds an n-terminal series: each ladder element is offset by
# delta and then divided by charge.
#
# delta::  mass offset added to every ladder element
# charge:: divisor applied to every offset element
def nterm_series(delta, charge)
  ladder.map do |rung|
    shifted = rung + delta
    shifted / charge
  end
end
|
390
|
+
|
391
|
+
# Generates a c-terminal series (ex: x, y, or z) by subtracting each
|
392
|
+
# element from ladder from delta, and dividing by charge. Delta,
|
393
|
+
# therefore, should ALREADY take account of the protons added
|
394
|
+
# by charge.
|
395
|
+
# Builds a c-terminal series: delta/charge first, followed by
# (delta - m)/charge for every ladder element except the last.
#
# delta::  mass from which each ladder element is subtracted
# charge:: divisor applied to every element
def cterm_series(delta, charge)
  fragments = [delta / charge]
  ladder.each { |rung| fragments << (delta - rung) / charge }

  # The entry for the final ladder element is dropped; with an empty
  # ladder this removes the leading entry, leaving an empty series.
  fragments.pop
  fragments
end
|
401
|
+
|
402
|
+
# Adds the specified locations to the series mask, ensuring that the
|
403
|
+
# specified locations will be unique within the mask. If overwrite
|
404
|
+
# is true, then the input locations will overwrite any existing mask
|
405
|
+
# locations.
|
406
|
+
# Registers mask locations for a series type in series_mask, keeping
# the stored locations unique.
#
# series::    key into series_mask (ex: b, y)
# locations:: indices to mask; negative indices are offset by
#             ladder.length
# overwrite:: when true the input replaces any existing locations,
#             otherwise it is merged into them (default false)
#
# Returns the array of unique locations now stored for series.
def mask_locations(series, locations, overwrite=false)
  normalized = locations.collect do |location|
    location < 0 ? ladder.length + location : location
  end

  if overwrite
    series_mask[series] = normalized.uniq
  else
    existing = (series_mask[series] ||= [])
    existing.concat(normalized).uniq!

    # uniq! returns nil when it removes nothing, so return the array
    # itself to keep the return value consistent with the overwrite
    # branch.
    existing
  end
end
|
417
|
+
|
418
|
+
# Retrieves the series keyed by "#{key}#{charge}#{mod}" in series_hash.
|
419
|
+
# If the series has not been initialized, the series will be
|
420
|
+
# initialized using the supplied block, and masked using the
|
421
|
+
# series_mask indicated by key (not "#{key}#{charge}").
|
422
|
+
# Looks up the series cached in series_hash under
# "#{key}#{charge}#{mod}". On a miss the block is called to build
# the series, which is passed through mask (with key and mod) and
# stored before being returned.
def get_series(key, charge=nil, mod=nil)
  cache_key = "#{key}#{charge}#{mod}"

  cached = series_hash[cache_key]
  return cached if cached

  series_hash[cache_key] = mask(yield, key, mod)
end
|
425
|
+
|
426
|
+
# Mask the locations in the series by multiplying them by -1. Mask
|
427
|
+
# does NOT check to see if the location is negative or positive.
|
428
|
+
# Masks series in place by multiplying the elements at the masked
# locations by -1, and returns series.
#
# series:: the array of values to mask (modified in place)
# key::    series type; series_mask[key] supplies the base locations
# mod::    optional modification; when non-nil, the locations under
#          series_mask["#{key}#{mod}"] are masked as well
#
# The sign of a value is not checked before it is flipped.
def mask(series, key, mod)
  # A mod-specific mask may exist without a base mask for key, so
  # start from an empty list rather than nil.
  locations = series_mask[key] || []

  unless mod.nil?
    mod_locations = series_mask["#{key}#{mod}"]

    # Build a new array so the lists stored in series_mask are left
    # untouched; uniq keeps a shared location from being flipped twice.
    locations = (locations + mod_locations).uniq if mod_locations
  end

  locations.each { |i| series[i] *= -1 }
  series
end
|
442
|
+
|
443
|
+
# Hook to custom-handle an unknown series from the series method.
|
444
|
+
# By default, handle_unknown_series raises an ArgumentError.
|
445
|
+
# Handler for a series identifier that is not recognized.
#
# s:: the unrecognized series identifier
#
# Raises ArgumentError naming the identifier.
def handle_unknown_series(s)
  message = "unknown series: #{s}"
  raise ArgumentError, message
end
|
448
|
+
end
|
449
|
+
end
|
450
|
+
end
|
data/lib/ms/in_silico.rb
ADDED
data/tap.yml
ADDED
File without changes
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ms-in_silico
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simon Chiang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-20 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: tap
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.11"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: molecules
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.0
|
34
|
+
version:
|
35
|
+
description:
|
36
|
+
email: simon.a.chiang@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
- MIT-LICENSE
|
44
|
+
files:
|
45
|
+
- lib/ms/in_silico.rb
|
46
|
+
- lib/ms/in_silico/digest.rb
|
47
|
+
- lib/ms/in_silico/digester.rb
|
48
|
+
- lib/ms/in_silico/fragment.rb
|
49
|
+
- lib/ms/in_silico/spectrum.rb
|
50
|
+
- tap.yml
|
51
|
+
- README
|
52
|
+
- MIT-LICENSE
|
53
|
+
has_rdoc: true
|
54
|
+
homepage: http://mspire.rubyforge.org/projects/ms-in_silico/
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: mspire
|
75
|
+
rubygems_version: 1.3.0
|
76
|
+
signing_key:
|
77
|
+
specification_version: 2
|
78
|
+
summary: ms-in_silico task library
|
79
|
+
test_files:
|
80
|
+
- test/tap_test_suite.rb
|