ms-mascot 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/History CHANGED
@@ -1,4 +1,14 @@
1
- == 0.12.2 / 2009-02-23
1
+ == 0.2.2 / 2009-03-31
2
+
3
+ * updates to use latest tap
4
+ * ms-mascot now uses tap-mechanize instead of tap-http
5
+ * development of dat support
6
+
7
+ == 0.2.1 / 2009-02-26
8
+
9
+ * Further development of .dat support
10
+
11
+ == 0.2.0 / 2009-02-23
2
12
 
3
13
  Updated release utilizing Tap.
4
14
 
@@ -1,5 +1,8 @@
1
+ require 'ms/mascot/dat'
2
+ require 'ms/mascot/mgf'
3
+
1
4
  module Ms
2
5
  module Mascot
3
6
  FRAGMENT_TEST_MASS_UNCERTAINTY = 10**-2
4
7
  end
5
- end
8
+ end
@@ -21,6 +21,9 @@ module Ms
21
21
 
22
22
  # Provides access to a Mascot dat file.
23
23
  class Archive < ExternalArchive
24
+ include Dat
25
+
26
+ # Parsing & Archive functions
24
27
  module Utils
25
28
  module_function
26
29
 
@@ -109,6 +112,8 @@ module Ms
109
112
  reindex_by_sep(boundary,
110
113
  :entry_follows_sep => true,
111
114
  :exclude_sep => true,
115
+ # :blksize => 8388608, # default in ExternalArchive
116
+ :blksize => 33_554_432, # quadrupled the blksize
112
117
  &block)
113
118
 
114
119
  # remove the first and last entries, which contain
@@ -124,7 +129,7 @@ module Ms
124
129
  # which should be present at the start of the string.
125
130
  def str_to_entry(str)
126
131
  if ctc = content_type_class(parse_content_type(str))
127
- ctc.parse(str)
132
+ ctc.parse(str, self)
128
133
  else
129
134
  str
130
135
  end
@@ -163,12 +168,19 @@ module Ms
163
168
  @section_names[index] ||= parse_section_name(index)
164
169
  end
165
170
 
166
- def each_query(&block)
167
- section('index').queries.each do |key|
168
- block.call( self.section(key) )
171
+ # Returns the number of queries registered in self.
172
+ def nqueries
173
+ @nqueries ||= section_names.select {|name| name =~ /query/ }.length
174
+ end
175
+
176
+ # Yields each query to the block.
177
+ def each_query
178
+ 1.upto(nqueries) do |n|
179
+ yield(query(n))
169
180
  end
170
181
  end
171
182
 
183
+ # Returns the specified query.
172
184
  def query(num)
173
185
  if si = section_index("query#{num}")
174
186
  self[si]
@@ -177,6 +189,70 @@ module Ms
177
189
  end
178
190
  end
179
191
 
192
+ # by default, yields the top PeptideHit object per query
193
+ # opts may be:
194
+ # :by => :top
195
+ # :top top ranked hit (default)
196
+ # :groups an array of hits
197
+ # :all each peptide hit (all ranks)
198
+ #
199
+ # :yield_nil => true
200
+ # true yields nil when a query had no peptide hit (default)
201
+ # false this hit (or group) is not yielded
202
+ # :with_query => false
203
+ # false just returns peptide hits/groups (default)
204
+ # true yields the peptide_hit/group and associated query
205
+ def each_peptide_hit(opts={})
206
+ defaults = { :by => :top, :yield_nil => true, :with_query => false }
207
+ (by, yield_nil, with_query) = defaults.merge(opts).values_at(:by, :yield_nil, :with_query)
208
+
209
+ peptides = section('peptides')
210
+ 1.upto(nqueries) do |n|
211
+ case by
212
+ when :top
213
+ hit = peptides.peptide_hit(n)
214
+ unless !yield_nil && hit.nil?
215
+ if with_query
216
+ yield hit, query(n)
217
+ else
218
+ yield hit
219
+ end
220
+ end
221
+ when :groups
222
+ group = peptides.peptide_hits(n)
223
+ group.shift # remove the 0 index
224
+ unless !yield_nil && group.first.nil?
225
+ if with_query
226
+ yield group, query(n)
227
+ else
228
+ yield group
229
+ end
230
+ end
231
+ when :all
232
+
233
+ group = peptides.peptide_hits(n)
234
+ group.shift # remove the 0 index
235
+ unless !yield_nil && group.first.nil?
236
+ # need to return the nil hit if we are yielding nils:
237
+ if group.first.nil?
238
+ if with_query
239
+ yield nil, query(n)
240
+ else
241
+ yield nil
242
+ end
243
+ end
244
+ group.each do |pep_hit|
245
+ if with_query
246
+ yield pep_hit, query(n)
247
+ else
248
+ yield pep_hit
249
+ end
250
+ end
251
+ end
252
+ end
253
+ end
254
+ end
255
+
180
256
  private
181
257
 
182
258
  # resolves each section
@@ -192,7 +268,8 @@ module Ms
192
268
  io.pos = index[0] + 1
193
269
  parse_content_type(io.readline)[:section_name]
194
270
  end
195
- end
196
- end
197
- end
198
- end
271
+
272
+ end # Archive
273
+ end # Dat
274
+ end # Mascot
275
+ end # Ms
@@ -1,4 +1,16 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Header contains information describing the search environment, especially
4
+ # features of the search database, but also search statistics, like exec_time.
5
+ #
6
+ # Content-Type: application/x-Mascot; name="header"
7
+ #
8
+ # sequences=257964
9
+ # sequences_after_tax=257964
10
+ # residues=93947433
11
+ # ...
12
+ #
13
+ # Header is a standard Section and simply defines methods for convenient
14
+ # access. See Section for parsing details.
3
15
  class Ms::Mascot::Dat::Header < Ms::Mascot::Dat::Section
4
16
  end
@@ -1,23 +1,17 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Index maps section names to the line at which the multipart break (ex
4
+ # '--gc0p4Jq0M2Yt08jU534c0p') occurs. Archive creates its own index and
5
+ # does not make use of this section.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="index"
8
+ #
9
+ # parameters=4
10
+ # masses=78
11
+ # unimod=117
12
+ # ...
13
+ #
14
+ # Index is a standard Section and simply defines methods for convenient
15
+ # access. See Section for parsing details.
3
16
  class Ms::Mascot::Dat::Index < Ms::Mascot::Dat::Section
4
-
5
- def nqueries
6
- @nqueries ||= data.keys.select {|key| key =~ /query/ }.length
7
- end
8
-
9
-
10
- def query(index)
11
- query_key = "query#{index}"
12
- data.each_pair do |key, value|
13
- return value if key == query_key
14
- end
15
- nil
16
- end
17
-
18
- # returns all query sections
19
- def queries
20
- data.keys.grep( /^query(\d+)$/o ).sort
21
- end
22
-
23
17
  end
@@ -1,4 +1,18 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Masses contains the masses of elements, residues, particles (like 'Electron')
4
+ # and the delta masses for modifications used in an identification, including
5
+ # the mass of various neutral losses.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="masses"
8
+ #
9
+ # A=71.037114
10
+ # B=114.534940
11
+ # C=103.009185
12
+ # D=115.026943
13
+ # ...
14
+ #
15
+ # Masses is a standard Section and simply defines methods for convenient
16
+ # access. See Section for parsing details.
3
17
  class Ms::Mascot::Dat::Masses < Ms::Mascot::Dat::Section
4
18
  end
@@ -1,4 +1,18 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Parameters represents search parameters in a Dat file. This section appears
4
+ # to be a direct dump of the multipart data created by a Mascot search form.
5
+ #
6
+ # Content-Type: application/x-Mascot; name="parameters"
7
+ #
8
+ # LICENSE=Licensed to: Matrix Science Internal use only - Frill, (4 processors).
9
+ # MP=
10
+ # NM=
11
+ # COM=MS/MS Example
12
+ # IATOL=
13
+ # ...
14
+ #
15
+ # Parameters is a standard Section and simply defines methods for convenient
16
+ # access. See Section for parsing details.
3
17
  class Ms::Mascot::Dat::Parameters < Ms::Mascot::Dat::Section
4
18
  end
@@ -1,4 +1,213 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- class Ms::Mascot::Dat::Peptides < Ms::Mascot::Dat::Section
4
- end
3
+ module Ms::Mascot::Dat
4
+
5
+ # Peptides represent peptide identification information in a dat file.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="peptides"
8
+ #
9
+ # q1_p1=-1
10
+ # q2_p1=0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0;"Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
11
+ # q2_p1_terms=R,-:R,-
12
+ # q2_p2=0,499.300598,-0.051862,2,LAVTP,10,0000000,3.87,0001002000000000000,0,0;"RLPA_RICCN":0:316:320:1
13
+ # q2_p2_terms=K,-
14
+ # q2_p3=0,499.336990,-0.088254,2,LAVVV,10,0000000,3.87,0001002000000000000,0,0;"DYNA_NEUCR":0:1296:1300:1
15
+ # q2_p3_terms=R,-
16
+ #
17
+ # Peptides is a standard Section and simply defines methods for convenient
18
+ # access. See Section for parsing details.
19
+ #
20
+ # === Interpretation
21
+ #
22
+ # Deciphering the peptide information requires some cross-referencing with
23
+ # online results. Note that a single query can match multiple peptides.
24
+ #
25
+ # qN_pM=-1 # no matches
26
+ # qN_pM=peptide;protein_maps # query N peptide hit M
27
+ # qN_pM_terms=A,B:C,D # n and c-termini residues for each protein match
28
+ #
29
+ # See the Peptide and ProteinMap structures for interpretation of the
30
+ # specific query data.
31
+ class Peptides < Ms::Mascot::Dat::Section
32
+
33
+ # === PeptideHit
34
+ #
35
+ # Represents peptide hit data, inferred by inspection of the MS/MS sample
36
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
37
+ #
38
+ # # str: 0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0
39
+ #
40
+ # index example meaning
41
+ # 0 0 n Missed Cleavages
42
+ # 1 499.300598 Monoisotopic mass of neutral peptide Mr(calc)
43
+ # 2 -0.051862 actual - theoretical delta mass
44
+ # 3 2
45
+ # 4 LAVPT matched sequence
46
+ # 5 10
47
+ # 6 0000000 modification sites (including n,c residues; number indicates mod)
48
+ # 7 3.87 peptide score
49
+ # 8 0001002000000000000
50
+ # 9 0
51
+ # 10 0
52
+ #
53
+ # The dat file is said to be generated by Mascot version 1.0, but the headers
54
+ # section records 2.1.119.
55
+ #
56
+ # ==== Modification Sequence
57
+ #
58
+ # The modification sequence indicates which residues are modified and includes
59
+ # the n and c-terminal residues. The index at each location indicates the
60
+ # modification used (0 indicates no modification).
61
+ #
62
+ # ==== Unaccounted for data
63
+ #
64
+ # Peptide data known to exist in the dat file:
65
+ #
66
+ # Homology threshold
67
+ # Identity threshold
68
+ # Frame number
69
+ # Number of fragment ion matches
70
+ # Experimental charge
71
+ #
72
+ PeptideHit = Struct.new(
73
+ :n_missed_cleavages,
74
+ :peptide_mass,
75
+ :delta_mass,
76
+ :unknown3,
77
+ :sequence,
78
+ :unknown5,
79
+ :modifications,
80
+ :score,
81
+ :unknown8,
82
+ :unknown9,
83
+ :unknown10,
84
+ :protein_maps,
85
+ :hit_num,
86
+ :query_num
87
+ )
88
+
89
+ # Indices of PeptideHit terms that will be cast to floats.
90
+ PeptideHitFloatIndicies = [1,2,7]
91
+
92
+ # Indices of PeptideHit terms that will be cast to integers.
93
+ PeptideHitIntIndicies = [0,3,5,9,10]
94
+
95
+ # === ProteinMap
96
+ #
97
+ # Represents a protein map, indicating which proteins contain the
98
+ # identified peptide. There may be many for a given peptide hit.
99
+ #
100
+ # # str: "Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
101
+ # # terms: R,-:R,-
102
+ #
103
+ # index example meaning
104
+ # 0 "Y1319_MYCTU" matching protein id
105
+ # 1 0
106
+ # 2 531 peptide start index
107
+ # 3 535 peptide end index
108
+ # 4 1
109
+ # 5 R nterm
110
+ # 6 - cterm
111
+ #
112
+ ProteinMap = Struct.new(
113
+ :id,
114
+ :uknown1,
115
+ :peptide_start,
116
+ :peptide_end,
117
+ :unknown4,
118
+ :nterm,
119
+ :cterm
120
+ )
121
+
122
+ # Indices of ProteinMap terms that will be cast to integers.
123
+ ProteinMapIntIndicies = [1,2,3,4]
124
+
125
+ module Utils
126
+ module_function
127
+
128
+ # Parses a PeptideHit from the query-hit string.
129
+ def parse_peptide_hit(str, terms)
130
+ return nil if str == nil || str == "-1"
131
+
132
+ peptide_data, protein_maps = str.split(";", 2)
133
+ protein_maps = protein_maps.split(",")
134
+ terms = terms.split(":")
135
+
136
+ # parse peptide data
137
+ peptide_data = peptide_data.split(",")
138
+
139
+ PeptideHitFloatIndicies.each do |index|
140
+ peptide_data[index] = peptide_data[index].to_f
141
+ end
142
+
143
+ PeptideHitIntIndicies.each do |index|
144
+ peptide_data[index] = peptide_data[index].to_i
145
+ end
146
+
147
+ # parse protein_map data
148
+ protein_maps = protein_maps.zip(terms).collect do |map_data, terms|
149
+ data = map_data.split(":") + terms.split(',')
150
+
151
+ # removes quotes from protein id
152
+ data[0] = data[0][1...-1]
153
+
154
+ ProteinMapIntIndicies.each {|index| data[index] = data[index].to_i }
155
+ ProteinMap.new(*data)
156
+ end
157
+
158
+ peptide_data << protein_maps
159
+ PeptideHit.new(*peptide_data)
160
+ end
161
+ end
162
+
163
+ include Utils
164
+
165
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
166
+ super(data, section_name, dat)
167
+ @queries = []
168
+ end
169
+
170
+ # An array of peptide hits per query. Specify resolve=false to return
171
+ # the currently parsed queries.
172
+ #
173
+ # Note that the queries array is indexed the same as in Mascot, ie the
174
+ # PeptideHit for q1_p1 is located at queries[1][1], meaning there is
175
+ # always an empty cell at queries[0].
176
+ def queries(resolve=true)
177
+ return @queries unless resolve
178
+
179
+ query = 1
180
+ query += 1 while peptide_hits(query)
181
+ @queries
182
+ end
183
+
184
+ # Returns an array of PeptideHits for the specified query, or nil if no
185
+ # such query exists.
186
+ def peptide_hits(query)
187
+ hit = 1
188
+ hit += 1 while peptide_hit(query, hit)
189
+ @queries[query]
190
+ end
191
+
192
+ # Returns the PeptideHit at the query and hit index, or nil if no such hit
193
+ # exists.
194
+ def peptide_hit(query, hit=1)
195
+ key = "q#{query}_p#{hit}"
196
+ return nil unless data.has_key?(key)
197
+
198
+ hits = @queries[query] ||= []
199
+ if existing_hit = hits[hit]
200
+ return existing_hit
201
+ end
202
+
203
+ if parsed_hit = parse_peptide_hit(data[key], data["#{key}_terms"])
204
+ parsed_hit.query_num = query
205
+ parsed_hit.hit_num = hit
206
+ hits[hit] = parsed_hit
207
+ return parsed_hit
208
+ end
209
+
210
+ nil
211
+ end
212
+ end
213
+ end