ms-mascot 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History +11 -1
- data/lib/ms/mascot.rb +4 -1
- data/lib/ms/mascot/dat/archive.rb +85 -8
- data/lib/ms/mascot/dat/header.rb +12 -0
- data/lib/ms/mascot/dat/index.rb +13 -19
- data/lib/ms/mascot/dat/masses.rb +14 -0
- data/lib/ms/mascot/dat/parameters.rb +14 -0
- data/lib/ms/mascot/dat/peptides.rb +211 -2
- data/lib/ms/mascot/dat/proteins.rb +73 -1
- data/lib/ms/mascot/dat/query.rb +141 -6
- data/lib/ms/mascot/dat/section.rb +9 -5
- data/lib/ms/mascot/dat/summary.rb +223 -5
- data/lib/ms/mascot/export.rb +55 -72
- data/lib/ms/mascot/format_mgf.rb +7 -8
- data/lib/ms/mascot/fragment.rb +1 -1
- data/lib/ms/mascot/mgf.rb +32 -2
- data/lib/ms/mascot/mgf/archive.rb +8 -1
- data/lib/ms/mascot/submit.rb +52 -69
- data/lib/ms/mascot/validation.rb +17 -0
- metadata +7 -6
data/History
CHANGED
@@ -1,4 +1,14 @@
|
|
1
|
-
== 0.
|
1
|
+
== 0.2.2 / 2009-03-31
|
2
|
+
|
3
|
+
* updates to use latest tap
|
4
|
+
* ms-mascot now uses tap-mechanize instead of tap-http
|
5
|
+
* development of dat support
|
6
|
+
|
7
|
+
== 0.2.1 / 2009-02-26
|
8
|
+
|
9
|
+
* Further development of .dat support
|
10
|
+
|
11
|
+
== 0.2.0 / 2009-02-23
|
2
12
|
|
3
13
|
Updated release utilizing Tap.
|
4
14
|
|
data/lib/ms/mascot.rb
CHANGED
data/lib/ms/mascot/dat/archive.rb
CHANGED
@@ -21,6 +21,9 @@ module Ms
|
|
21
21
|
|
22
22
|
# Provides access to a Mascot dat file.
|
23
23
|
class Archive < ExternalArchive
|
24
|
+
include Dat
|
25
|
+
|
26
|
+
# Parsing & Archive functions
|
24
27
|
module Utils
|
25
28
|
module_function
|
26
29
|
|
@@ -109,6 +112,8 @@ module Ms
|
|
109
112
|
reindex_by_sep(boundary,
|
110
113
|
:entry_follows_sep => true,
|
111
114
|
:exclude_sep => true,
|
115
|
+
# :blksize => 8388608, # default in ExternalArchive
|
116
|
+
:blksize => 33_554_432, # quadrupled the blksize
|
112
117
|
&block)
|
113
118
|
|
114
119
|
# remove the first and last entries, which contain
|
@@ -124,7 +129,7 @@ module Ms
|
|
124
129
|
# which should be present at the start of the string.
|
125
130
|
def str_to_entry(str)
|
126
131
|
if ctc = content_type_class(parse_content_type(str))
|
127
|
-
ctc.parse(str)
|
132
|
+
ctc.parse(str, self)
|
128
133
|
else
|
129
134
|
str
|
130
135
|
end
|
@@ -163,12 +168,19 @@ module Ms
|
|
163
168
|
@section_names[index] ||= parse_section_name(index)
|
164
169
|
end
|
165
170
|
|
166
|
-
|
167
|
-
|
168
|
-
|
171
|
+
# Returns the number of queries registered in self.
|
172
|
+
def nqueries
|
173
|
+
@nqueries ||= section_names.select {|name| name =~ /query/ }.length
|
174
|
+
end
|
175
|
+
|
176
|
+
# Yields each query to the block.
|
177
|
+
def each_query
|
178
|
+
1.upto(nqueries) do |n|
|
179
|
+
yield(query(n))
|
169
180
|
end
|
170
181
|
end
|
171
182
|
|
183
|
+
# Returns the specified query.
|
172
184
|
def query(num)
|
173
185
|
if si = section_index("query#{num}")
|
174
186
|
self[si]
|
@@ -177,6 +189,70 @@ module Ms
|
|
177
189
|
end
|
178
190
|
end
|
179
191
|
|
192
|
+
# by default, yields the top PeptideHit object per query
|
193
|
+
# opts may be:
|
194
|
+
# :by => :top
|
195
|
+
# :top top ranked hit (default)
|
196
|
+
# :groups an array of hits
|
197
|
+
# :all each peptide hit (all ranks)
|
198
|
+
#
|
199
|
+
# :yield_nil => true
|
200
|
+
# true yields nil when a query had no peptide hit (default)
|
201
|
+
# false this hit (or group) is not yielded
|
202
|
+
# :with_query => false
|
203
|
+
# false just returns peptide hits/groups (default)
|
204
|
+
# true yields the peptide_hit/group and associated query
|
205
|
+
def each_peptide_hit(opts={})
|
206
|
+
defaults = { :by => :top, :yield_nil => true, :with_query => false }
|
207
|
+
(by, yield_nil, with_query) = defaults.merge(opts).values_at(:by, :yield_nil, :with_query)
|
208
|
+
|
209
|
+
peptides = section('peptides')
|
210
|
+
1.upto(nqueries) do |n|
|
211
|
+
case by
|
212
|
+
when :top
|
213
|
+
hit = peptides.peptide_hit(n)
|
214
|
+
unless !yield_nil && hit.nil?
|
215
|
+
if with_query
|
216
|
+
yield hit, query(n)
|
217
|
+
else
|
218
|
+
yield hit
|
219
|
+
end
|
220
|
+
end
|
221
|
+
when :groups
|
222
|
+
group = peptides.peptide_hits(n)
|
223
|
+
group.shift # remove the 0 index
|
224
|
+
unless !yield_nil && group.first.nil?
|
225
|
+
if with_query
|
226
|
+
yield group, query(n)
|
227
|
+
else
|
228
|
+
yield group
|
229
|
+
end
|
230
|
+
end
|
231
|
+
when :all
|
232
|
+
|
233
|
+
group = peptides.peptide_hits(n)
|
234
|
+
group.shift # remove the 0 index
|
235
|
+
unless !yield_nil && group.first.nil?
|
236
|
+
# need to return the nil hit if we are yielding nils:
|
237
|
+
if group.first.nil?
|
238
|
+
if with_query
|
239
|
+
yield nil, query(n)
|
240
|
+
else
|
241
|
+
yield nil
|
242
|
+
end
|
243
|
+
end
|
244
|
+
group.each do |pep_hit|
|
245
|
+
if with_query
|
246
|
+
yield pep_hit, query(n)
|
247
|
+
else
|
248
|
+
yield pep_hit
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
180
256
|
private
|
181
257
|
|
182
258
|
# resolves each section
|
@@ -192,7 +268,8 @@ module Ms
|
|
192
268
|
io.pos = index[0] + 1
|
193
269
|
parse_content_type(io.readline)[:section_name]
|
194
270
|
end
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
end
|
271
|
+
|
272
|
+
end # Archive
|
273
|
+
end # Dat
|
274
|
+
end # Mascot
|
275
|
+
end # Ms
|
data/lib/ms/mascot/dat/header.rb
CHANGED
@@ -1,4 +1,16 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Header contains information describing the search environment, especially
|
4
|
+
# features of the search database, but also search statistics, like exec_time.
|
5
|
+
#
|
6
|
+
# Content-Type: application/x-Mascot; name="header"
|
7
|
+
#
|
8
|
+
# sequences=257964
|
9
|
+
# sequences_after_tax=257964
|
10
|
+
# residues=93947433
|
11
|
+
# ...
|
12
|
+
#
|
13
|
+
# Header is a standard Section and simply defines methods for convenient
|
14
|
+
# access. See Section for parsing details.
|
3
15
|
class Ms::Mascot::Dat::Header < Ms::Mascot::Dat::Section
|
4
16
|
end
|
data/lib/ms/mascot/dat/index.rb
CHANGED
@@ -1,23 +1,17 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Index maps section names to the line at which the multipart break (ex
|
4
|
+
# '--gc0p4Jq0M2Yt08jU534c0p') occurs. Archive creates its own index and
|
5
|
+
# does not make use of this section.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="index"
|
8
|
+
#
|
9
|
+
# parameters=4
|
10
|
+
# masses=78
|
11
|
+
# unimod=117
|
12
|
+
# ...
|
13
|
+
#
|
14
|
+
# Index is a standard Section and simply defines methods for convenient
|
15
|
+
# access. See Section for parsing details.
|
3
16
|
class Ms::Mascot::Dat::Index < Ms::Mascot::Dat::Section
|
4
|
-
|
5
|
-
def nqueries
|
6
|
-
@nqueries ||= data.keys.select {|key| key =~ /query/ }.length
|
7
|
-
end
|
8
|
-
|
9
|
-
|
10
|
-
def query(index)
|
11
|
-
query_key = "query#{index}"
|
12
|
-
data.each_pair do |key, value|
|
13
|
-
return value if key == query_key
|
14
|
-
end
|
15
|
-
nil
|
16
|
-
end
|
17
|
-
|
18
|
-
# returns all query sections
|
19
|
-
def queries
|
20
|
-
data.keys.grep( /^query(\d+)$/o ).sort
|
21
|
-
end
|
22
|
-
|
23
17
|
end
|
data/lib/ms/mascot/dat/masses.rb
CHANGED
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Masses contains the masses of elements, residues, particles (like 'Electron')
|
4
|
+
# and the delta masses for modifications used in an identification, including
|
5
|
+
# the mass of various neutral losses.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="masses"
|
8
|
+
#
|
9
|
+
# A=71.037114
|
10
|
+
# B=114.534940
|
11
|
+
# C=103.009185
|
12
|
+
# D=115.026943
|
13
|
+
# ...
|
14
|
+
#
|
15
|
+
# Masses is a standard Section and simply defines methods for convenient
|
16
|
+
# access. See Section for parsing details.
|
3
17
|
class Ms::Mascot::Dat::Masses < Ms::Mascot::Dat::Section
|
4
18
|
end
|
data/lib/ms/mascot/dat/parameters.rb
CHANGED
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
+
# Parameters represents search parameters in a Dat file. This section appears
|
4
|
+
# to be a direct dump of the multipart data created by a Mascot search form.
|
5
|
+
#
|
6
|
+
# Content-Type: application/x-Mascot; name="parameters"
|
7
|
+
#
|
8
|
+
# LICENSE=Licensed to: Matrix Science Internal use only - Frill, (4 processors).
|
9
|
+
# MP=
|
10
|
+
# NM=
|
11
|
+
# COM=MS/MS Example
|
12
|
+
# IATOL=
|
13
|
+
# ...
|
14
|
+
#
|
15
|
+
# Parameters is a standard Section and simply defines methods for convenient
|
16
|
+
# access. See Section for parsing details.
|
3
17
|
class Ms::Mascot::Dat::Parameters < Ms::Mascot::Dat::Section
|
4
18
|
end
|
data/lib/ms/mascot/dat/peptides.rb
CHANGED
@@ -1,4 +1,213 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Peptides represents peptide identification information in a dat file.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="peptides"
|
8
|
+
#
|
9
|
+
# q1_p1=-1
|
10
|
+
# q2_p1=0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0;"Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
|
11
|
+
# q2_p1_terms=R,-:R,-
|
12
|
+
# q2_p2=0,499.300598,-0.051862,2,LAVTP,10,0000000,3.87,0001002000000000000,0,0;"RLPA_RICCN":0:316:320:1
|
13
|
+
# q2_p2_terms=K,-
|
14
|
+
# q2_p3=0,499.336990,-0.088254,2,LAVVV,10,0000000,3.87,0001002000000000000,0,0;"DYNA_NEUCR":0:1296:1300:1
|
15
|
+
# q2_p3_terms=R,-
|
16
|
+
#
|
17
|
+
# Peptides is a standard Section and simply defines methods for convenient
|
18
|
+
# access. See Section for parsing details.
|
19
|
+
#
|
20
|
+
# === Interpretation
|
21
|
+
#
|
22
|
+
# Deciphering the peptide information requires some cross-referencing with
|
23
|
+
# online results. Note that a single query can match multiple peptides.
|
24
|
+
#
|
25
|
+
# qN_pM=-1 # no matches
|
26
|
+
# qN_pM=peptide;protein_maps # query N peptide hit M
|
27
|
+
# qN_pM_terms=A,B:C,D # n and c-termini residues for each protein match
|
28
|
+
#
|
29
|
+
# See the PeptideHit and ProteinMap structures for interpretation of the
|
30
|
+
# specific query data.
|
31
|
+
class Peptides < Ms::Mascot::Dat::Section
|
32
|
+
|
33
|
+
# === PeptideHit
|
34
|
+
#
|
35
|
+
# Represents peptide hit data, inferred by inspection of the MS/MS sample
|
36
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
37
|
+
#
|
38
|
+
# # str: 0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0
|
39
|
+
#
|
40
|
+
# index example meaning
|
41
|
+
# 0 0 n Missed Cleavages
|
42
|
+
# 1 499.300598 Monoisotopic mass of neutral peptide Mr(calc)
|
43
|
+
# 2 -0.051862 actual - theoretical delta mass
|
44
|
+
# 3 2
|
45
|
+
# 4 LAVPT matched sequence
|
46
|
+
# 5 10
|
47
|
+
# 6 0000000 modification sites (including n,c residues; number indicates mod)
|
48
|
+
# 7 3.87 peptide score
|
49
|
+
# 8 0001002000000000000
|
50
|
+
# 9 0
|
51
|
+
# 10 0
|
52
|
+
#
|
53
|
+
# The dat file is said to be generated by Mascot version 1.0, but the headers
|
54
|
+
# section records 2.1.119.
|
55
|
+
#
|
56
|
+
# ==== Modification Sequence
|
57
|
+
#
|
58
|
+
# The modification sequence indicates which residues are modified and includes
|
59
|
+
# the n and c-terminal residues. The index at each location indicates the
|
60
|
+
# modification used (0 indicates no modification).
|
61
|
+
#
|
62
|
+
# ==== Unaccounted for data
|
63
|
+
#
|
64
|
+
# Peptide data known to exist in the dat file:
|
65
|
+
#
|
66
|
+
# Homology threshold
|
67
|
+
# Identity threshold
|
68
|
+
# Frame number
|
69
|
+
# Number of fragment ion matches
|
70
|
+
# Experimental charge
|
71
|
+
#
|
72
|
+
PeptideHit = Struct.new(
|
73
|
+
:n_missed_cleavages,
|
74
|
+
:peptide_mass,
|
75
|
+
:delta_mass,
|
76
|
+
:unknown3,
|
77
|
+
:sequence,
|
78
|
+
:unknown5,
|
79
|
+
:modifications,
|
80
|
+
:score,
|
81
|
+
:unknown8,
|
82
|
+
:unknown9,
|
83
|
+
:unknown10,
|
84
|
+
:protein_maps,
|
85
|
+
:hit_num,
|
86
|
+
:query_num
|
87
|
+
)
|
88
|
+
|
89
|
+
# Indices of PeptideHit terms that will be cast to floats.
|
90
|
+
PeptideHitFloatIndicies = [1,2,7]
|
91
|
+
|
92
|
+
# Indices of PeptideHit terms that will be cast to integers.
|
93
|
+
PeptideHitIntIndicies = [0,3,5,9,10]
|
94
|
+
|
95
|
+
# === ProteinMap
|
96
|
+
#
|
97
|
+
# Represents a protein map, indicating which proteins contain the
|
98
|
+
# identified peptide. There may be many for a given peptide hit
|
99
|
+
#
|
100
|
+
# # str: "Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
|
101
|
+
# # terms: R,-:R,-
|
102
|
+
#
|
103
|
+
# index example meaning
|
104
|
+
# 0 "Y1319_MYCTU" matching protein id
|
105
|
+
# 1 0
|
106
|
+
# 2 531 peptide start index
|
107
|
+
# 3 535 peptide end index
|
108
|
+
# 4 1
|
109
|
+
# 5 R nterm
|
110
|
+
# 6 - cterm
|
111
|
+
#
|
112
|
+
ProteinMap = Struct.new(
|
113
|
+
:id,
|
114
|
+
:uknown1,
|
115
|
+
:peptide_start,
|
116
|
+
:peptide_end,
|
117
|
+
:unknown4,
|
118
|
+
:nterm,
|
119
|
+
:cterm
|
120
|
+
)
|
121
|
+
|
122
|
+
# Indices of ProteinMap terms that will be cast to integers.
|
123
|
+
ProteinMapIntIndicies = [1,2,3,4]
|
124
|
+
|
125
|
+
module Utils
|
126
|
+
module_function
|
127
|
+
|
128
|
+
# Parses a PeptideHit from the query-hit string.
|
129
|
+
def parse_peptide_hit(str, terms)
|
130
|
+
return nil if str == nil || str == "-1"
|
131
|
+
|
132
|
+
peptide_data, protein_maps = str.split(";", 2)
|
133
|
+
protein_maps = protein_maps.split(",")
|
134
|
+
terms = terms.split(":")
|
135
|
+
|
136
|
+
# parse peptide data
|
137
|
+
peptide_data = peptide_data.split(",")
|
138
|
+
|
139
|
+
PeptideHitFloatIndicies.each do |index|
|
140
|
+
peptide_data[index] = peptide_data[index].to_f
|
141
|
+
end
|
142
|
+
|
143
|
+
PeptideHitIntIndicies.each do |index|
|
144
|
+
peptide_data[index] = peptide_data[index].to_i
|
145
|
+
end
|
146
|
+
|
147
|
+
# parse protein_map data
|
148
|
+
protein_maps = protein_maps.zip(terms).collect do |map_data, terms|
|
149
|
+
data = map_data.split(":") + terms.split(',')
|
150
|
+
|
151
|
+
# removes quotes from protein id
|
152
|
+
data[0] = data[0][1...-1]
|
153
|
+
|
154
|
+
ProteinMapIntIndicies.each {|index| data[index] = data[index].to_i }
|
155
|
+
ProteinMap.new(*data)
|
156
|
+
end
|
157
|
+
|
158
|
+
peptide_data << protein_maps
|
159
|
+
PeptideHit.new(*peptide_data)
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
include Utils
|
164
|
+
|
165
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
|
166
|
+
super(data, section_name, dat)
|
167
|
+
@queries = []
|
168
|
+
end
|
169
|
+
|
170
|
+
# An array of peptide hits per query. Specify resolve=false to return
|
171
|
+
# the currently parsed queries.
|
172
|
+
#
|
173
|
+
# Note that the queries array is indexed the same as in Mascot, ie the
|
174
|
+
# PeptideHit for q1_p1 is located at queries[1][1], meaning there is
|
175
|
+
# always an empty cell at queries[0].
|
176
|
+
def queries(resolve=true)
|
177
|
+
return @queries unless resolve
|
178
|
+
|
179
|
+
query = 1
|
180
|
+
query += 1 while peptide_hits(query)
|
181
|
+
@queries
|
182
|
+
end
|
183
|
+
|
184
|
+
# Returns an array of PeptideHits for the specified query, or nil if no
|
185
|
+
# such query exists.
|
186
|
+
def peptide_hits(query)
|
187
|
+
hit = 1
|
188
|
+
hit += 1 while peptide_hit(query, hit)
|
189
|
+
@queries[query]
|
190
|
+
end
|
191
|
+
|
192
|
+
# Returns the PeptideHit at the query and hit index, or nil if no such hit
|
193
|
+
# exists.
|
194
|
+
def peptide_hit(query, hit=1)
|
195
|
+
key = "q#{query}_p#{hit}"
|
196
|
+
return nil unless data.has_key?(key)
|
197
|
+
|
198
|
+
hits = @queries[query] ||= []
|
199
|
+
if existing_hit = hits[hit]
|
200
|
+
return existing_hit
|
201
|
+
end
|
202
|
+
|
203
|
+
if parsed_hit = parse_peptide_hit(data[key], data["#{key}_terms"])
|
204
|
+
parsed_hit.query_num = query
|
205
|
+
parsed_hit.hit_num = hit
|
206
|
+
hits[hit] = parsed_hit
|
207
|
+
return parsed_hit
|
208
|
+
end
|
209
|
+
|
210
|
+
nil
|
211
|
+
end
|
212
|
+
end
|
213
|
+
end
|