ms-mascot 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History CHANGED
@@ -1,4 +1,14 @@
1
- == 0.12.2 / 2009-02-23
1
+ == 0.2.2 / 2009-03-31
2
+
3
+ * updates to use latest tap
4
+ * ms-mascot now uses tap-mechanize instead of tap-http
5
+ * development of dat support
6
+
7
+ == 0.2.1 / 2009-02-26
8
+
9
+ * Further development of .dat support
10
+
11
+ == 0.2.0 / 2009-02-23
2
12
 
3
13
  Updated release utilizing Tap.
4
14
 
@@ -1,5 +1,8 @@
1
+ require 'ms/mascot/dat'
2
+ require 'ms/mascot/mgf'
3
+
1
4
  module Ms
2
5
  module Mascot
3
6
  FRAGMENT_TEST_MASS_UNCERTAINTY = 10**-2
4
7
  end
5
- end
8
+ end
@@ -21,6 +21,9 @@ module Ms
21
21
 
22
22
  # Provides access to a Mascot dat file.
23
23
  class Archive < ExternalArchive
24
+ include Dat
25
+
26
+ # Parsing & Archive functions
24
27
  module Utils
25
28
  module_function
26
29
 
@@ -109,6 +112,8 @@ module Ms
109
112
  reindex_by_sep(boundary,
110
113
  :entry_follows_sep => true,
111
114
  :exclude_sep => true,
115
+ # :blksize => 8388608, # default in ExternalArchive
116
+ :blksize => 33_554_432, # quadrupled the blksize
112
117
  &block)
113
118
 
114
119
  # remove the first and last entries, which contain
@@ -124,7 +129,7 @@ module Ms
124
129
  # which should be present at the start of the string.
125
130
  def str_to_entry(str)
126
131
  if ctc = content_type_class(parse_content_type(str))
127
- ctc.parse(str)
132
+ ctc.parse(str, self)
128
133
  else
129
134
  str
130
135
  end
@@ -163,12 +168,19 @@ module Ms
163
168
  @section_names[index] ||= parse_section_name(index)
164
169
  end
165
170
 
166
- def each_query(&block)
167
- section('index').queries.each do |key|
168
- block.call( self.section(key) )
171
+ # Returns the number of queries registered in self.
172
+ def nqueries
173
+ @nqueries ||= section_names.select {|name| name =~ /query/ }.length
174
+ end
175
+
176
+ # Yields each query to the block.
177
+ def each_query
178
+ 1.upto(nqueries) do |n|
179
+ yield(query(n))
169
180
  end
170
181
  end
171
182
 
183
+ # Returns the specified query.
172
184
  def query(num)
173
185
  if si = section_index("query#{num}")
174
186
  self[si]
@@ -177,6 +189,70 @@ module Ms
177
189
  end
178
190
  end
179
191
 
192
+ # by default, yields the top PeptideHit object per query
193
+ # opts may be:
194
+ # :by => :top
195
+ # :top top ranked hit (default)
196
+ # :groups an array of hits
197
+ # :all each peptide hit (all ranks)
198
+ #
199
+ # :yield_nil => true
200
+ # true yields nil when a query had no peptide hit (default)
201
+ # false this hit (or group) is not yielded
202
+ # :with_query => false
203
+ # false yields only the peptide hits/groups (default)
204
+ # true yields the peptide_hit/group and associated query
205
+ def each_peptide_hit(opts={})
206
+ defaults = { :by => :top, :yield_nil => true, :with_query => false }
207
+ (by, yield_nil, with_query) = defaults.merge(opts).values_at(:by, :yield_nil, :with_query)
208
+
209
+ peptides = section('peptides')
210
+ 1.upto(nqueries) do |n|
211
+ case by
212
+ when :top
213
+ hit = peptides.peptide_hit(n)
214
+ unless !yield_nil && hit.nil?
215
+ if with_query
216
+ yield hit, query(n)
217
+ else
218
+ yield hit
219
+ end
220
+ end
221
+ when :groups
222
+ group = peptides.peptide_hits(n)
223
+ group.shift # remove the 0 index
224
+ unless !yield_nil && group.first.nil?
225
+ if with_query
226
+ yield group, query(n)
227
+ else
228
+ yield group
229
+ end
230
+ end
231
+ when :all
232
+
233
+ group = peptides.peptide_hits(n)
234
+ group.shift # remove the 0 index
235
+ unless !yield_nil && group.first.nil?
236
+ # need to return the nil hit if we are yielding nils:
237
+ if group.first.nil?
238
+ if with_query
239
+ yield nil, query(n)
240
+ else
241
+ yield nil
242
+ end
243
+ end
244
+ group.each do |pep_hit|
245
+ if with_query
246
+ yield pep_hit, query(n)
247
+ else
248
+ yield pep_hit
249
+ end
250
+ end
251
+ end
252
+ end
253
+ end
254
+ end
255
+
180
256
  private
181
257
 
182
258
  # resolves each section
@@ -192,7 +268,8 @@ module Ms
192
268
  io.pos = index[0] + 1
193
269
  parse_content_type(io.readline)[:section_name]
194
270
  end
195
- end
196
- end
197
- end
198
- end
271
+
272
+ end # Archive
273
+ end # Dat
274
+ end # Mascot
275
+ end # Ms
@@ -1,4 +1,16 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Header contains information describing the search environment, especially
4
+ # features of the search database, but also search statistics, like exec_time.
5
+ #
6
+ # Content-Type: application/x-Mascot; name="header"
7
+ #
8
+ # sequences=257964
9
+ # sequences_after_tax=257964
10
+ # residues=93947433
11
+ # ...
12
+ #
13
+ # Header is a standard Section and simply defines methods for convenient
14
+ # access. See Section for parsing details.
3
15
  class Ms::Mascot::Dat::Header < Ms::Mascot::Dat::Section
4
16
  end
@@ -1,23 +1,17 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Index maps section names to the line at which the multipart break (ex
4
+ # '--gc0p4Jq0M2Yt08jU534c0p') occurs. Archive creates its own index and
5
+ # does not make use of this section.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="index"
8
+ #
9
+ # parameters=4
10
+ # masses=78
11
+ # unimod=117
12
+ # ...
13
+ #
14
+ # Index is a standard Section and simply defines methods for convenient
15
+ # access. See Section for parsing details.
3
16
  class Ms::Mascot::Dat::Index < Ms::Mascot::Dat::Section
4
-
5
- def nqueries
6
- @nqueries ||= data.keys.select {|key| key =~ /query/ }.length
7
- end
8
-
9
-
10
- def query(index)
11
- query_key = "query#{index}"
12
- data.each_pair do |key, value|
13
- return value if key == query_key
14
- end
15
- nil
16
- end
17
-
18
- # returns all query sections
19
- def queries
20
- data.keys.grep( /^query(\d+)$/o ).sort
21
- end
22
-
23
17
  end
@@ -1,4 +1,18 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Masses contains the masses of elements, residues, particles (like 'Electron')
4
+ # and the delta masses for modifications used in an identification, including
5
+ # the mass of various neutral losses.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="masses"
8
+ #
9
+ # A=71.037114
10
+ # B=114.534940
11
+ # C=103.009185
12
+ # D=115.026943
13
+ # ...
14
+ #
15
+ # Masses is a standard Section and simply defines methods for convenient
16
+ # access. See Section for parsing details.
3
17
  class Ms::Mascot::Dat::Masses < Ms::Mascot::Dat::Section
4
18
  end
@@ -1,4 +1,18 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
+ # Parameters represents search parameters in a Dat file. This section appears
4
+ # to be a direct dump of the multipart data created by a Mascot search form.
5
+ #
6
+ # Content-Type: application/x-Mascot; name="parameters"
7
+ #
8
+ # LICENSE=Licensed to: Matrix Science Internal use only - Frill, (4 processors).
9
+ # MP=
10
+ # NM=
11
+ # COM=MS/MS Example
12
+ # IATOL=
13
+ # ...
14
+ #
15
+ # Parameters is a standard Section and simply defines methods for convenient
16
+ # access. See Section for parsing details.
3
17
  class Ms::Mascot::Dat::Parameters < Ms::Mascot::Dat::Section
4
18
  end
@@ -1,4 +1,213 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- class Ms::Mascot::Dat::Peptides < Ms::Mascot::Dat::Section
4
- end
3
+ module Ms::Mascot::Dat
4
+
5
+ # Peptides represent peptide identification information in a dat file.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="peptides"
8
+ #
9
+ # q1_p1=-1
10
+ # q2_p1=0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0;"Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
11
+ # q2_p1_terms=R,-:R,-
12
+ # q2_p2=0,499.300598,-0.051862,2,LAVTP,10,0000000,3.87,0001002000000000000,0,0;"RLPA_RICCN":0:316:320:1
13
+ # q2_p2_terms=K,-
14
+ # q2_p3=0,499.336990,-0.088254,2,LAVVV,10,0000000,3.87,0001002000000000000,0,0;"DYNA_NEUCR":0:1296:1300:1
15
+ # q2_p3_terms=R,-
16
+ #
17
+ # Peptides is a standard Section and simply defines methods for convenient
18
+ # access. See Section for parsing details.
19
+ #
20
+ # === Interpretation
21
+ #
22
+ # Deciphering the peptide information requires some cross-referencing with
23
+ # online results. Note that a single query can match multiple peptides.
24
+ #
25
+ # qN_pM=-1 # no matches
26
+ # qN_pM=peptide;protein_maps # query N peptide hit M
27
+ # qN_pM_terms=A,B:C,D # n and c-termini residues for each protein match
28
+ #
29
+ # See the Peptide and ProteinMap structures for interpretation of the
30
+ # specific query data.
31
+ class Peptides < Ms::Mascot::Dat::Section
32
+
33
+ # === PeptideHit
34
+ #
35
+ # Represents peptide hit data, inferred by inspection of the MS/MS sample
36
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
37
+ #
38
+ # # str: 0,499.300598,-0.051862,2,LAVPT,10,0000000,3.87,0001002000000000000,0,0
39
+ #
40
+ # index example meaning
41
+ # 0 0 n Missed Cleavages
42
+ # 1 499.300598 Monoisotopic mass of neutral peptide Mr(calc)
43
+ # 2 -0.051862 actual - theoretical delta mass
44
+ # 3 2
45
+ # 4 LAVPT matched sequence
46
+ # 5 10
47
+ # 6 0000000 modification sites (including n,c residues; number indicates mod)
48
+ # 7 3.87 peptide score
49
+ # 8 0001002000000000000
50
+ # 9 0
51
+ # 10 0
52
+ #
53
+ # The dat file is said to be generated by Mascot version 1.0, but the headers
54
+ # section records 2.1.119.
55
+ #
56
+ # ==== Modification Sequence
57
+ #
58
+ # The modification sequence indicates which residues are modified and includes
59
+ # the n and c-terminal residues. The index at each location indicates the
60
+ # modification used (0 indicates no modification).
61
+ #
62
+ # ==== Unaccounted for data
63
+ #
64
+ # Peptide data known to exist in the dat file:
65
+ #
66
+ # Homology threshold
67
+ # Identity threshold
68
+ # Frame number
69
+ # Number of fragment ion matches
70
+ # Experimental charge
71
+ #
72
+ PeptideHit = Struct.new(
73
+ :n_missed_cleavages,
74
+ :peptide_mass,
75
+ :delta_mass,
76
+ :unknown3,
77
+ :sequence,
78
+ :unknown5,
79
+ :modifications,
80
+ :score,
81
+ :unknown8,
82
+ :unknown9,
83
+ :unknown10,
84
+ :protein_maps,
85
+ :hit_num,
86
+ :query_num
87
+ )
88
+
89
+ # Indices of PeptideHit terms that will be cast to floats.
90
+ PeptideHitFloatIndicies = [1,2,7]
91
+
92
+ # Indices of PeptideHit terms that will be cast to integers.
93
+ PeptideHitIntIndicies = [0,3,5,9,10]
94
+
95
+ # === ProteinMap
96
+ #
97
+ # Represents a protein map, indicating which proteins contain the
98
+ # identified peptide. There may be many for a given peptide hit.
99
+ #
100
+ # # str: "Y1319_MYCTU":0:531:535:1,"Y1353_MYCBO":0:531:535:1
101
+ # # terms: R,-:R,-
102
+ #
103
+ # index example meaning
104
+ # 0 "Y1319_MYCTU" matching protein id
105
+ # 1 0
106
+ # 2 531 peptide start index
107
+ # 3 535 peptide end index
108
+ # 4 1
109
+ # 5 R nterm
110
+ # 6 - cterm
111
+ #
112
+ ProteinMap = Struct.new(
113
+ :id,
114
+ :uknown1,
115
+ :peptide_start,
116
+ :peptide_end,
117
+ :unknown4,
118
+ :nterm,
119
+ :cterm
120
+ )
121
+
122
+ # Indices of ProteinMap terms that will be cast to integers.
123
+ ProteinMapIntIndicies = [1,2,3,4]
124
+
125
+ module Utils
126
+ module_function
127
+
128
+ # Parses a PeptideHit from the query-hit string.
129
+ def parse_peptide_hit(str, terms)
130
+ return nil if str == nil || str == "-1"
131
+
132
+ peptide_data, protein_maps = str.split(";", 2)
133
+ protein_maps = protein_maps.split(",")
134
+ terms = terms.split(":")
135
+
136
+ # parse peptide data
137
+ peptide_data = peptide_data.split(",")
138
+
139
+ PeptideHitFloatIndicies.each do |index|
140
+ peptide_data[index] = peptide_data[index].to_f
141
+ end
142
+
143
+ PeptideHitIntIndicies.each do |index|
144
+ peptide_data[index] = peptide_data[index].to_i
145
+ end
146
+
147
+ # parse protein_map data
148
+ protein_maps = protein_maps.zip(terms).collect do |map_data, terms|
149
+ data = map_data.split(":") + terms.split(',')
150
+
151
+ # removes quotes from protein id
152
+ data[0] = data[0][1...-1]
153
+
154
+ ProteinMapIntIndicies.each {|index| data[index] = data[index].to_i }
155
+ ProteinMap.new(*data)
156
+ end
157
+
158
+ peptide_data << protein_maps
159
+ PeptideHit.new(*peptide_data)
160
+ end
161
+ end
162
+
163
+ include Utils
164
+
165
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
166
+ super(data, section_name, dat)
167
+ @queries = []
168
+ end
169
+
170
+ # An array of peptide hits per query. Specify resolve=false to return
171
+ # the currently parsed queries.
172
+ #
173
+ # Note that the queries array is indexed the same as in Mascot, ie the
174
+ # PeptideHit for q1_p1 is located at queries[1][1], meaning there is
175
+ # always an empty cell at queries[0].
176
+ def queries(resolve=true)
177
+ return @queries unless resolve
178
+
179
+ query = 1
180
+ query += 1 while peptide_hits(query)
181
+ @queries
182
+ end
183
+
184
+ # Returns an array of PeptideHits for the specified query, or nil if no
185
+ # such query exists.
186
+ def peptide_hits(query)
187
+ hit = 1
188
+ hit += 1 while peptide_hit(query, hit)
189
+ @queries[query]
190
+ end
191
+
192
+ # Returns the PeptideHit at the query and hit index, or nil if no such hit
193
+ # exists.
194
+ def peptide_hit(query, hit=1)
195
+ key = "q#{query}_p#{hit}"
196
+ return nil unless data.has_key?(key)
197
+
198
+ hits = @queries[query] ||= []
199
+ if existing_hit = hits[hit]
200
+ return existing_hit
201
+ end
202
+
203
+ if parsed_hit = parse_peptide_hit(data[key], data["#{key}_terms"])
204
+ parsed_hit.query_num = query
205
+ parsed_hit.hit_num = hit
206
+ hits[hit] = parsed_hit
207
+ return parsed_hit
208
+ end
209
+
210
+ nil
211
+ end
212
+ end
213
+ end