ms-mascot 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History +11 -1
- data/lib/ms/mascot.rb +4 -1
- data/lib/ms/mascot/dat/archive.rb +85 -8
- data/lib/ms/mascot/dat/header.rb +12 -0
- data/lib/ms/mascot/dat/index.rb +13 -19
- data/lib/ms/mascot/dat/masses.rb +14 -0
- data/lib/ms/mascot/dat/parameters.rb +14 -0
- data/lib/ms/mascot/dat/peptides.rb +211 -2
- data/lib/ms/mascot/dat/proteins.rb +73 -1
- data/lib/ms/mascot/dat/query.rb +141 -6
- data/lib/ms/mascot/dat/section.rb +9 -5
- data/lib/ms/mascot/dat/summary.rb +223 -5
- data/lib/ms/mascot/export.rb +55 -72
- data/lib/ms/mascot/format_mgf.rb +7 -8
- data/lib/ms/mascot/fragment.rb +1 -1
- data/lib/ms/mascot/mgf.rb +32 -2
- data/lib/ms/mascot/mgf/archive.rb +8 -1
- data/lib/ms/mascot/submit.rb +52 -69
- data/lib/ms/mascot/validation.rb +17 -0
- metadata +7 -6
data/History
CHANGED
@@ -1,4 +1,14 @@
|
|
1
|
-
== 0.
|
1
|
+
== 0.2.2 / 2009-03-31
|
2
|
+
|
3
|
+
* updates to use latest tap
|
4
|
+
* ms-mascot now uses tap-mechanize instead of tap-http
|
5
|
+
* development of dat support
|
6
|
+
|
7
|
+
== 0.2.1 / 2009-02-26
|
8
|
+
|
9
|
+
* Further development of .dat support
|
10
|
+
|
11
|
+
== 0.2.0 / 2009-02-23
|
2
12
|
|
3
13
|
Updated release utilizing Tap.
|
4
14
|
|
data/lib/ms/mascot.rb
CHANGED
@@ -21,6 +21,9 @@ module Ms
|
|
21
21
|
|
22
22
|
# Provides access to a Mascot dat file.
|
23
23
|
class Archive < ExternalArchive
|
24
|
+
include Dat
|
25
|
+
|
26
|
+
# Parsing & Archive functions
|
24
27
|
module Utils
|
25
28
|
module_function
|
26
29
|
|
@@ -109,6 +112,8 @@ module Ms
|
|
109
112
|
reindex_by_sep(boundary,
|
110
113
|
:entry_follows_sep => true,
|
111
114
|
:exclude_sep => true,
|
115
|
+
# :blksize => 8388608, # default in ExternalArchive
|
116
|
+
:blksize => 33_554_432, # quadrupled the blksize
|
112
117
|
&block)
|
113
118
|
|
114
119
|
# remove the first and last entries, which contain
|
@@ -124,7 +129,7 @@ module Ms
|
|
124
129
|
# which should be present at the start of the string.
|
125
130
|
def str_to_entry(str)
|
126
131
|
if ctc = content_type_class(parse_content_type(str))
|
127
|
-
ctc.parse(str)
|
132
|
+
ctc.parse(str, self)
|
128
133
|
else
|
129
134
|
str
|
130
135
|
end
|
@@ -163,12 +168,19 @@ module Ms
|
|
163
168
|
@section_names[index] ||= parse_section_name(index)
|
164
169
|
end
|
165
170
|
|
166
|
-
|
167
|
-
|
168
|
-
|
171
|
+
# Returns the number of queries registered in self, i.e. the number of
# section names that match /query/.  The count is computed once and then
# memoized in @nqueries.
def nqueries
  @nqueries ||= begin
    query_names = section_names.find_all { |section_name| section_name =~ /query/ }
    query_names.size
  end
end
|
175
|
+
|
176
|
+
# Yields every query to the block, in order, from query(1) through
# query(nqueries).
def each_query
  1.upto(nqueries) { |number| yield query(number) }
end
|
171
182
|
|
183
|
+
# Returns the specified query.
|
172
184
|
def query(num)
|
173
185
|
if si = section_index("query#{num}")
|
174
186
|
self[si]
|
@@ -177,6 +189,70 @@ module Ms
|
|
177
189
|
end
|
178
190
|
end
|
179
191
|
|
192
|
+
# Yields peptide hits from the 'peptides' section, one query at a time
# (queries 1 through nqueries).  By default the top PeptideHit of each
# query is yielded.  opts may be:
#
#   :by => :top
#     :top         top ranked hit (default)
#     :groups      one array per query holding all of its ranked hits
#     :all         each peptide hit (all ranks), one at a time
#
#   :yield_nil => true
#     true         a query without a peptide hit yields nil
#                  (an empty array for :groups) (default)
#     false        nothing is yielded for such a query
#
#   :with_query => false
#     false        yields just the peptide hit/group (default)
#     true         yields the peptide hit/group and the associated query
#
# The arrays used for :groups and :all are copies; the hit arrays held by
# the peptides section are never modified, so the method may be called
# repeatedly.  A query for which peptide_hits returns nil is treated as a
# query without hits.
def each_peptide_hit(opts={})
  defaults = { :by => :top, :yield_nil => true, :with_query => false }
  by, yield_nil, with_query = defaults.merge(opts).values_at(:by, :yield_nil, :with_query)

  peptides = section('peptides')
  1.upto(nqueries) do |n|
    case by
    when :top
      hit = peptides.peptide_hit(n)
      next if hit.nil? && !yield_nil
      with_query ? yield(hit, query(n)) : yield(hit)

    when :groups, :all
      # peptide_hits hands back the section's own array, whose cell 0 is an
      # unused placeholder.  Drop that cell from a copy rather than
      # shifting it off the shared array.
      group = Array(peptides.peptide_hits(n)).drop(1)
      next if group.first.nil? && !yield_nil

      if by == :groups
        with_query ? yield(group, query(n)) : yield(group)
      else
        # when nils are being yielded, a query without hits reports one nil
        hits = group.first.nil? ? [nil] + group : group
        hits.each do |pep_hit|
          with_query ? yield(pep_hit, query(n)) : yield(pep_hit)
        end
      end
    end
  end
end
|
255
|
+
|
180
256
|
private
|
181
257
|
|
182
258
|
# resolves each section
|
@@ -192,7 +268,8 @@ module Ms
|
|
192
268
|
io.pos = index[0] + 1
|
193
269
|
parse_content_type(io.readline)[:section_name]
|
194
270
|
end
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
end
|
271
|
+
|
272
|
+
end # Archive
|
273
|
+
end # Dat
|
274
|
+
end # Mascot
|
275
|
+
end # Ms
|
data/lib/ms/mascot/dat/header.rb
CHANGED
@@ -1,4 +1,16 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Header contains information describing the search environment, especially
|
4
|
+
# features of the search database, but also search statistics, like exec_time.
|
5
|
+
#
|
6
|
+
# Content-Type: application/x-Mascot; name="header"
|
7
|
+
#
|
8
|
+
# sequences=257964
|
9
|
+
# sequences_after_tax=257964
|
10
|
+
# residues=93947433
|
11
|
+
# ...
|
12
|
+
#
|
13
|
+
# Header is a standard Section and simply defines methods for convenient
|
14
|
+
# access. See Section for parsing details.
|
3
15
|
class Ms::Mascot::Dat::Header < Ms::Mascot::Dat::Section
|
4
16
|
end
|
data/lib/ms/mascot/dat/index.rb
CHANGED
@@ -1,23 +1,17 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Index maps section names to the line at which the multipart break (ex
|
4
|
+
# '--gc0p4Jq0M2Yt08jU534c0p') occurs.  Archive creates its own index and
|
5
|
+
# does not make use of this section.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="index"
|
8
|
+
#
|
9
|
+
# parameters=4
|
10
|
+
# masses=78
|
11
|
+
# unimod=117
|
12
|
+
# ...
|
13
|
+
#
|
14
|
+
# Index is a standard Section and simply defines methods for convenient
|
15
|
+
# access. See Section for parsing details.
|
3
16
|
class Ms::Mascot::Dat::Index < Ms::Mascot::Dat::Section
|
4
|
-
|
5
|
-
def nqueries
|
6
|
-
@nqueries ||= data.keys.select {|key| key =~ /query/ }.length
|
7
|
-
end
|
8
|
-
|
9
|
-
|
10
|
-
def query(index)
|
11
|
-
query_key = "query#{index}"
|
12
|
-
data.each_pair do |key, value|
|
13
|
-
return value if key == query_key
|
14
|
-
end
|
15
|
-
nil
|
16
|
-
end
|
17
|
-
|
18
|
-
# returns all query sections
|
19
|
-
def queries
|
20
|
-
data.keys.grep( /^query(\d+)$/o ).sort
|
21
|
-
end
|
22
|
-
|
23
17
|
end
|
data/lib/ms/mascot/dat/masses.rb
CHANGED
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Masses contains the masses of elements, residues, particles (like 'Electron')
|
4
|
+
# and the delta masses for modifications used in an identification, including
|
5
|
+
# the mass of various neutral losses.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="masses"
|
8
|
+
#
|
9
|
+
# A=71.037114
|
10
|
+
# B=114.534940
|
11
|
+
# C=103.009185
|
12
|
+
# D=115.026943
|
13
|
+
# ...
|
14
|
+
#
|
15
|
+
# Masses is a standard Section and simply defines methods for convenient
|
16
|
+
# access. See Section for parsing details.
|
3
17
|
class Ms::Mascot::Dat::Masses < Ms::Mascot::Dat::Section
|
4
18
|
end
|
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Parameters represents search parameters in a Dat file. This section appears
|
4
|
+
# to be a direct dump of the multipart data created by a Mascot search form.
|
5
|
+
#
|
6
|
+
# Content-Type: application/x-Mascot; name="parameters"
|
7
|
+
#
|
8
|
+
# LICENSE=Licensed to: Matrix Science Internal use only - Frill, (4 processors).
|
9
|
+
# MP=
|
10
|
+
# NM=
|
11
|
+
# COM=MS/MS Example
|
12
|
+
# IATOL=
|
13
|
+
# ...
|
14
|
+
#
|
15
|
+
# Parameters is a standard Section and simply defines methods for convenient
|
16
|
+
# access. See Section for parsing details.
|
3
17
|
class Ms::Mascot::Dat::Parameters < Ms::Mascot::Dat::Section
|
4
18
|
end
|
@@ -1,4 +1,213 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Peptides represent peptide identification information in a dat file.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="peptides"
|
8
|
+
#
|
9
|
+
# q1_p1=-1
|
10
|
+
# q2_p1=0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0;"Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
|
11
|
+
# q2_p1_terms=R,-:R,-
|
12
|
+
# q2_p2=0,499.300598,-0.051862,2,LAVTP,10,0000000,3.87,0001002000000000000,0,0;"RLPA_RICCN":0:316:320:1
|
13
|
+
# q2_p2_terms=K,-
|
14
|
+
# q2_p3=0,499.336990,-0.088254,2,LAVVV,10,0000000,3.87,0001002000000000000,0,0;"DYNA_NEUCR":0:1296:1300:1
|
15
|
+
# q2_p3_terms=R,-
|
16
|
+
#
|
17
|
+
# Peptides is a standard Section and simply defines methods for convenient
|
18
|
+
# access. See Section for parsing details.
|
19
|
+
#
|
20
|
+
# === Interpretation
|
21
|
+
#
|
22
|
+
# Deciphering the peptide information requires some cross-referencing with
|
23
|
+
# online results. Note that a single query can match multiple peptides.
|
24
|
+
#
|
25
|
+
# qN_pM=-1 # no matches
|
26
|
+
# qN_pM=peptide;protein_maps # query N peptide hit M
|
27
|
+
# qN_pM_terms=A,B:C,D # n and c-termini residues for each protein match
|
28
|
+
#
|
29
|
+
# See the Peptide and ProteinMap structures for interpretation of the
|
30
|
+
# specific query data.
|
31
|
+
class Peptides < Ms::Mascot::Dat::Section
|
32
|
+
|
33
|
+
# === PeptideHit
|
34
|
+
#
|
35
|
+
# Represents peptide hit data, inferred by inspection of the MS/MS sample
|
36
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
37
|
+
#
|
38
|
+
# # str: 0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0
|
39
|
+
#
|
40
|
+
# index example meaning
|
41
|
+
# 0 0 n Missed Cleavages
|
42
|
+
# 1 499.300598 Monoisotopic mass of neutral peptide Mr(calc)
|
43
|
+
# 2 -0.051862 actual - theoretical delta mass
|
44
|
+
# 3 2
|
45
|
+
# 4 LAVPT matched sequence
|
46
|
+
# 5 10
|
47
|
+
# 6 0000000 modification sites (including n,c residues; number indicates mod)
|
48
|
+
# 7 3.87 peptide score
|
49
|
+
# 8 0001002000000000000
|
50
|
+
# 9 0
|
51
|
+
# 10 0
|
52
|
+
#
|
53
|
+
# The dat file is said to be generated by Mascot version 1.0, but the headers
|
54
|
+
# section records 2.1.119.
|
55
|
+
#
|
56
|
+
# ==== Modification Sequence
|
57
|
+
#
|
58
|
+
# The modification sequence indicates which residues are modified and includes
|
59
|
+
# the n and c-terminal residues. The index at each location indicates the
|
60
|
+
# modification used (0 indicates no modification).
|
61
|
+
#
|
62
|
+
# ==== Unaccounted for data
|
63
|
+
#
|
64
|
+
# Peptide data known to exist in the dat file:
|
65
|
+
#
|
66
|
+
# Homology threshold
|
67
|
+
# Identity threshold
|
68
|
+
# Frame number
|
69
|
+
# Number of fragment ion matches
|
70
|
+
# Experimental charge
|
71
|
+
#
|
72
|
+
PeptideHit = Struct.new(
|
73
|
+
:n_missed_cleavages,
|
74
|
+
:peptide_mass,
|
75
|
+
:delta_mass,
|
76
|
+
:unknown3,
|
77
|
+
:sequence,
|
78
|
+
:unknown5,
|
79
|
+
:modifications,
|
80
|
+
:score,
|
81
|
+
:unknown8,
|
82
|
+
:unknown9,
|
83
|
+
:unknown10,
|
84
|
+
:protein_maps,
|
85
|
+
:hit_num,
|
86
|
+
:query_num
|
87
|
+
)
|
88
|
+
|
89
|
+
# Indices of PeptideHit terms that will be cast to floats.
|
90
|
+
PeptideHitFloatIndicies = [1,2,7]
|
91
|
+
|
92
|
+
# Indices of PeptideHit terms that will be cast to integers.
|
93
|
+
PeptideHitIntIndicies = [0,3,5,9,10]
|
94
|
+
|
95
|
+
# === ProteinMap
|
96
|
+
#
|
97
|
+
# Represents a protein map, indicating which proteins contain the
|
98
|
+
# identified peptide.  There may be many for a given peptide hit.
|
99
|
+
#
|
100
|
+
# # str: "Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
|
101
|
+
# # terms: R,-:R,-
|
102
|
+
#
|
103
|
+
# index example meaning
|
104
|
+
# 0 "Y1319_MYCTU" matching protein id
|
105
|
+
# 1 0
|
106
|
+
# 2 531 peptide start index
|
107
|
+
# 3 535 peptide end index
|
108
|
+
# 4 1
|
109
|
+
# 5 R nterm
|
110
|
+
# 6 - cterm
|
111
|
+
#
|
112
|
+
ProteinMap = Struct.new(
|
113
|
+
:id,
|
114
|
+
:uknown1,
|
115
|
+
:peptide_start,
|
116
|
+
:peptide_end,
|
117
|
+
:unknown4,
|
118
|
+
:nterm,
|
119
|
+
:cterm
|
120
|
+
)
|
121
|
+
|
122
|
+
# Indices of ProteinMap terms that will be cast to integers.
|
123
|
+
ProteinMapIntIndicies = [1,2,3,4]
|
124
|
+
|
125
|
+
module Utils
|
126
|
+
module_function
|
127
|
+
|
128
|
+
# Parses a PeptideHit from the query-hit string.
#
#   str    the value of a qN_pM entry, "peptide_data;protein_maps", e.g.
#          '0,499.300598,...,0,0;"Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1'
#   terms  the value of the matching qN_pM_terms entry, one "nterm,cterm"
#          pair per protein map separated by ':', e.g. 'R,-:R,-'
#
# Returns nil when str is nil or "-1" (no match).  Otherwise returns a
# PeptideHit whose protein_maps member is an array of ProteinMap structs;
# hit_num and query_num are left unset.
#
# A missing terms string, a hit without a protein_maps part, or fewer
# term pairs than protein maps are tolerated: the affected ProteinMaps
# simply have nil nterm/cterm, and protein_maps may be empty.
def parse_peptide_hit(str, terms)
  return nil if str.nil? || str == "-1"

  peptide_str, maps_str = str.split(";", 2)
  map_strs  = maps_str.to_s.split(",")
  term_strs = terms.to_s.split(":")

  # parse peptide data
  peptide_data = peptide_str.split(",")
  PeptideHitFloatIndicies.each { |index| peptide_data[index] = peptide_data[index].to_f }
  PeptideHitIntIndicies.each { |index| peptide_data[index] = peptide_data[index].to_i }

  # parse protein_map data, pairing each map with its termini
  protein_maps = map_strs.zip(term_strs).collect do |map_str, term_str|
    fields = map_str.split(":") + term_str.to_s.split(",")

    # removes quotes from protein id
    fields[0] = fields[0][1...-1]

    ProteinMapIntIndicies.each { |index| fields[index] = fields[index].to_i }
    ProteinMap.new(*fields)
  end

  peptide_data << protein_maps
  PeptideHit.new(*peptide_data)
end
|
161
|
+
end
|
162
|
+
|
163
|
+
include Utils
|
164
|
+
|
165
|
+
# Passes data, section_name and dat on to the superclass constructor and
# starts with an empty array of parsed peptide hits (see queries).
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  super
  @queries = []
end
|
169
|
+
|
170
|
+
# Returns the array of peptide hits per query.  With resolve=false the
# array is returned as currently parsed; otherwise queries are parsed in
# order, starting at query 1, until peptide_hits returns nothing for a
# query.
#
# The array is indexed the same way as in Mascot: the PeptideHit for q1_p1
# is found at queries[1][1], so queries[0] is always an empty cell.
def queries(resolve=true)
  if resolve
    number = 0
    loop do
      number += 1
      break unless peptide_hits(number)
    end
  end

  @queries
end
|
183
|
+
|
184
|
+
# Returns the array of PeptideHits for the specified query, or nil if no
# such query exists.  Hits are looked up by rank (1, 2, ...) through
# peptide_hit until a rank yields nothing; as with queries, cell 0 of the
# returned array is unused.
def peptide_hits(query)
  rank = 0
  loop do
    rank += 1
    break unless peptide_hit(query, rank)
  end

  @queries[query]
end
|
191
|
+
|
192
|
+
# Returns the PeptideHit at the query and hit index, or nil if no such hit
# exists (no "qN_pM" entry in data, or an entry that does not parse to a
# hit).  Parsed hits are stamped with their query_num and hit_num and kept
# in @queries so that later calls return the same object.
def peptide_hit(query, hit=1)
  hit_key = "q#{query}_p#{hit}"
  return nil unless data.has_key?(hit_key)

  cached = (@queries[query] ||= [])
  return cached[hit] if cached[hit]

  fresh = parse_peptide_hit(data[hit_key], data["#{hit_key}_terms"])
  return nil unless fresh

  fresh.query_num = query
  fresh.hit_num = hit
  cached[hit] = fresh
end
|
212
|
+
end
|
213
|
+
end
|