ms-sequest 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History +8 -0
- data/MIT-LICENSE +20 -0
- data/README +23 -0
- data/lib/ms/sequest.rb +6 -0
- data/lib/ms/sequest/params.rb +343 -0
- data/lib/ms/sequest/sqt.rb +363 -0
- data/lib/ms/sequest/srf.rb +707 -0
- data/lib/ms/sequest/srf/sqt.rb +169 -0
- metadata +88 -0
data/History
ADDED
data/MIT-LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2006 University of Texas at Austin, Regents of the University of
|
|
2
|
+
Colorado, and Howard Hughes Medical Institute.
|
|
3
|
+
|
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
6
|
+
in the Software without restriction, including without limitation the rights
|
|
7
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
9
|
+
furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all
|
|
12
|
+
copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
20
|
+
SOFTWARE.
|
data/README
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
= {Ms-Sequest}[http://mspire.rubyforge.org/projects/ms-sequest]
|
|
2
|
+
|
|
3
|
+
An {Mspire}[http://mspire.rubyforge.org] library supporting SEQUEST, Bioworks, SQT and associated formats.
|
|
4
|
+
|
|
5
|
+
== Description
|
|
6
|
+
|
|
7
|
+
* Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/16692-mspire/tickets]
|
|
8
|
+
* Github[http://github.com/jtprince/ms-sequest/tree/master]
|
|
9
|
+
* {Google Group}[http://groups.google.com/group/mspire-forum]
|
|
10
|
+
|
|
11
|
+
== Installation
|
|
12
|
+
|
|
13
|
+
Ms-Sequest is available as a gem on RubyForge[http://rubyforge.org/projects/mspire]. Use:
|
|
14
|
+
|
|
15
|
+
% gem install ms-sequest
|
|
16
|
+
|
|
17
|
+
== Info
|
|
18
|
+
|
|
19
|
+
Copyright (c) 2006 University of Texas at Austin
|
|
20
|
+
Copyright (c) Regents of the University of Colorado and Howard Hughes Medical Institute.
|
|
21
|
+
Developer:: {John Prince}, {Edward Marcotte Lab}[http://polaris.icmb.utexas.edu/home.html], {Natalie Ahn Lab}[http://www.colorado.edu/chem/people/ahnn.html], {Howard Hughes Medical Institute}[http://www.hhmi.org/], {BYU Dept. of Chemistry and Biochemistry}[http://www.chem.byu.edu/]
|
|
22
|
+
Support::
|
|
23
|
+
Licence:: {MIT-Style}[link:files/MIT-LICENSE.html]
|
data/lib/ms/sequest.rb
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
require 'ms/mass/aa'
|
|
2
|
+
|
|
3
|
+
# In the future, this guy should accept any version of bioworks params file
|
|
4
|
+
# and spit out any param queried.
|
|
5
|
+
|
|
6
|
+
module Ms ; end
module Ms::Sequest ; end

# Reader and simple parameter lookup for SEQUEST params files
# (Bioworks 3.1 - 3.3.1).
#
#     params = Ms::Sequest::Params.new("sequest.params")  # filename by default
#     params = Ms::Sequest::Params.new.parse_io(some_io_object)
#
#     params.some_parameter          # => any parameter defined has a method
#     params.nonexistent_parameter   # => nil
#
# Methods that behave the same regardless of the params file version:
#
#     params.version                 # => '3.1' | '3.2' | '3.3'
#     params.enzyme                  # => enzyme name with no parentheses
#     params.min_number_termini
#     params.database                # => first_database_name
#     params.enzyme_specificity      # => [offset, cleave_at, except_if_after]
#     params.precursor_mass_type     # => "average" | "monoisotopic"
#     params.fragment_mass_type      # => "average" | "monoisotopic"
#
# Backwards/forwards compatibility methods:
#
#     params.max_num_internal_cleavages  # == max_num_internal_cleavage_sites
#     params.fragment_ion_tol            # => fragment_ion_tolerance
#
# All keys and values are stored as Strings.
class Ms::Sequest::Params

  # Enzyme table used for Bioworks 3.1 files, indexed by 'enzyme_number'.
  # Each row is [name, offset, cleave_at, except_if_after].
  Bioworks31_Enzyme_Info_Array = [
    ['No_Enzyme', 0, '-', '-'],                    # 0
    ['Trypsin', 1, 'KR', '-'],                     # 1
    ['Trypsin(KRLNH)', 1, 'KRLNH', '-'],           # 2
    ['Chymotrypsin', 1, 'FWYL', '-'],              # 3
    ['Chymotrypsin(FWY)', 1, 'FWY', 'P'],          # 4
    ['Clostripain', 1, 'R', '-'],                  # 5
    ['Cyanogen_Bromide', 1, 'M', '-'],             # 6
    ['IodosoBenzoate', 1, 'W', '-'],               # 7
    ['Proline_Endopept', 1, 'P', '-'],             # 8
    ['Staph_Protease', 1, 'E', '-'],               # 9
    ['Trypsin_K', 1, 'K', 'P'],                    # 10
    ['Trypsin_R', 1, 'R', 'P'],                    # 11
    ['GluC', 1, 'ED', '-'],                        # 12
    ['LysC', 1, 'K', '-'],                         # 13
    ['AspN', 0, 'D', '-'],                         # 14
    ['Elastase', 1, 'ALIV', 'P'],                  # 15
    ['Elastase/Tryp/Chymo', 1, 'ALIVKRWFY', 'P'],  # 16
  ]

  # separates a parameter name from its value
  @@param_re = / = ?/o
  # separates a value from its trailing comment
  @@param_two_split = ';'
  # marks the start of the params section
  @@sequest_line = /\[SEQUEST\]/o

  # the general options
  attr_accessor :opts
  # the static weights added to amino acids
  attr_accessor :mods

  # all keys and values stored as strings!
  # will accept a sequest.params file or .srf file
  def initialize(file=nil)
    parse_file(file) if file
  end

  # Reads "name = value ; comment" lines from fh and returns them as a hash
  # of name => value (comment and trailing whitespace removed).
  #
  # Reading stops at the first line following the block of 'add_' lines; the
  # handle is rewound to the start of that line.  Lines without any word
  # character, or without a ' = ' separator, are skipped.
  def grab_params(fh)
    params = {}
    add_section_re = /^\s*add_/
    in_add_section = false
    prev_pos = nil
    while (line = fh.gets)
      is_add_line = (line =~ add_section_re) ? true : false
      in_add_section ||= is_add_line
      if in_add_section && !is_add_line
        # past the end of the add_ block: un-read this line and stop
        fh.pos = prev_pos
        break
      end
      prev_pos = fh.pos
      next unless line =~ /\w+/
      # limit of 2 keeps any later ' = ' inside the value
      name, value = line.split(@@param_re, 2)
      next if value.nil?
      params[name] = value.split(@@param_two_split).first.to_s.rstrip
    end
    params
  end

  # Parses the params section found in the io object.  Fills in opts (general
  # options) and mods (the 'add_' entries) and returns self.
  #
  # Raises ArgumentError if no "[SEQUEST]" line followed directly by a
  # "first_database_name" line is found before the end of the io.
  def parse_io(fh)
    # seek to the SEQUEST section
    loop do
      line = fh.gets
      if line.nil?
        raise ArgumentError, "no [SEQUEST] section followed by first_database_name found"
      end
      next unless line =~ @@sequest_line
      # double check that we are in a sequest params file:
      pos = fh.pos
      following = fh.gets
      if following && following =~ /^first_database_name/
        fh.pos = pos
        break
      end
    end
    @opts = grab_params(fh)
    @opts["search_engine"] = "SEQUEST"
    # extract out the mods (iterate over a copy of the keys since we delete)
    @mods = {}
    @opts.keys.each do |name|
      @mods[name] = @opts.delete(name) if name =~ /^add_/
    end

    ## this gets rid of the .hdr postfix on indexed databases
    @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
    self
  end

  ## parses file
  ## and drops the .hdr behind indexed fasta files
  ## returns self
  ## can read sequest.params file or .srf file
  def parse_file(file)
    File.open(file) {|fh| parse_io(fh) }
    self
  end

  # returns [offset, cleave_at, except_if_after]
  # offset is an Integer specifying how far after an amino acid to cut
  # cleave_at is a string of all amino acids that should be cut at
  # except_if_after for not cutting after those
  # normal tryptic behavior would be: [1, 'KR', 'P']
  # NOTE: a '-' in a params file is returned as an '' (empty string)
  # AspN is [0,'D','']
  #
  # Raises ArgumentError if the version cannot be determined.
  def enzyme_specificity
    v = version
    spec =
      if v == '3.1'
        Bioworks31_Enzyme_Info_Array[@opts['enzyme_number'].to_i][1,3]
      elsif v && v >= '3.2'
        fields = @opts['enzyme_info'].split(/\s+/)[2,3]
        fields[0] = fields[0].to_i
        fields
      else
        raise ArgumentError, "don't recognize anything but Bioworks 3.1--3.3"
      end
    spec.map {|field| field == '-' ? '' : field }
  end

  # Returns the version of the sequest.params file
  # Returns String "3.3" if contains "fragment_ion_units"
  # Returns String "3.2" if contains "enzyme_info"
  # Returns String "3.1" if contains "enzyme_number"
  # Returns nil if none of those keys is present
  def version
    if @opts['fragment_ion_units'] then '3.3'
    elsif @opts['enzyme_info'] then '3.2'
    elsif @opts['enzyme_number'] then '3.1'
    end
  end

  ####################################################
  # TO PEPXML
  ####################################################
  # In some ways, this is merely translating to the older Bioworks
  # sequest.params files

  # I'm not sure if this is the right mapping for sequence_search_constraint?
  # Returns 'partial_sequence', or "0" when it is missing or empty.
  def sequence
    pseq = @opts['partial_sequence']
    (pseq.nil? || pseq == "") ? "0" : pseq
  end

  # "average" or "monoisotopic" based on 'mass_type_parent'.
  # NOTE: aborts the process on any other value.
  def precursor_mass_type
    case @opts['mass_type_parent']
    when '0' then "average"
    when '1' then "monoisotopic"
    else abort "error in mass_type_parent in sequest!"
    end
  end

  # "average" or "monoisotopic" based on 'mass_type_fragment'.
  # NOTE: aborts the process on any other value.
  def fragment_mass_type
    case @opts['mass_type_fragment']
    when '0' then "average"
    when '1' then "monoisotopic"
    else abort "error in mass_type_fragment in sequest!"
    end
  end

  # Any parameter read from the file is available as a method; anything else
  # returns nil (also before a file has been parsed).
  def method_missing(name, *args)
    string = name.to_s
    if @opts && @opts.key?(string) then @opts[string]
    elsif @mods && @mods.key?(string) then @mods[string]
    end
  end

  # Keeps respond_to? in agreement with method_missing for parsed parameters.
  def respond_to_missing?(name, include_private=false)
    string = name.to_s
    return true if @opts && @opts.key?(string)
    return true if @mods && @mods.key?(string)
    super
  end

  ## We only need to define values if they are different than sequest.params
  ## The method_missing will look them up in the hash!

  # Returns a system independent basename
  # Splits on "\" or "/"
  def _sys_ind_basename(file)
    file.split(/[\\\/]/).last
  end

  # changes the path of the database (the basename is kept)
  def database_path=(newpath)
    basename = _sys_ind_basename(@opts["first_database_name"])
    @opts["first_database_name"] = File.join(newpath, basename)
  end

  def database
    @opts["first_database_name"]
  end

  # returns the appropriate aminoacid mass lookup table from Ms::Mass::AA
  # based_on may be :precursor or :fragment (anything else returns nil)
  def mass_index(based_on=:precursor)
    reply =
      case based_on
      when :precursor then precursor_mass_type
      when :fragment then fragment_mass_type
      end
    case reply
    when 'average' then Ms::Mass::AA::AVG
    when 'monoisotopic' then Ms::Mass::AA::MONO
    end
  end

  # at least in Bioworks 3.2, the First number after the enzyme
  # is the indication of the enzymatic end stringency (required):
  #   1 = Fully enzymatic
  #   2 = Either end
  #   3 = N terminal only
  #   4 = C terminal only
  # So, to get min_number_termini we map like this:
  #   1 => 2
  #   2 => 1
  # Anything else (or no enzyme_info at all) warns and returns "1".
  def min_number_termini
    if (e_info = @opts["enzyme_info"])
      case e_info.split(" ")[1]
      when "1" then return "2"
      when "2" then return "1"
      end
    end
    warn "No Enzyme termini info, using min_number_termini = '1'"
    "1"
  end

  ## returns a SampleEnzyme object
  #def sample_enzyme
  #  (offset, cleave_at, except_if_after) = enzyme_specificity.map do |v|
  #    if v == '' ; nil ; else v end
  #  end
  #  SampleEnzyme.new do |se|
  #    se.name = self.enzyme
  #    se.cut = cleave_at
  #    se.no_cut = except_if_after
  #    se.sense =
  #      if se.name == "No_Enzyme"
  #        nil
  #      elsif offset == 1
  #        'C'
  #      elsif offset == 0
  #        'N'
  #      end
  #  end
  #end

  # returns the enzyme name (but no parentheses connected with the name).
  # this will likely be capitalized.
  #
  # Raises ArgumentError if the version cannot be determined.
  def enzyme
    v = version
    info =
      if v == '3.1'
        Bioworks31_Enzyme_Info_Array[ @opts['enzyme_number'].to_i ][0]
      elsif v && v >= '3.2'
        @opts["enzyme_info"]
      else
        raise ArgumentError, "don't recognize anything but Bioworks 3.1--3.3"
      end
    # enzyme_info is "Name(..) n n cleave except": the name is the first
    # whitespace-delimited field, minus anything from '(' on
    name = info.to_s.split(' ').first
    name && name.split('(').first
  end

  def max_num_internal_cleavages
    @opts["max_num_internal_cleavage_sites"]
  end

  # my take on peptide_mass_units:
  # (see http://www.ionsource.com/tutorial/isotopes/slide2.htm)
  # amu = atomic mass units = (mass_real - mass_measured).abs (??abs??)
  # mmu = milli mass units (amu / 1000)
  # ppm = parts per million = 10^6 * ∆m_accuracy / mass_measured  [ where ∆m_accuracy = mass_real – mass_measured ]

  # Returns 'peptide_mass_tolerance' unchanged; warns when
  # 'peptide_mass_units' is anything other than "0".
  def peptide_mass_tol
    if @opts["peptide_mass_units"] != "0"
      warn "WARNING: peptide_mass_tol units need to be adjusted!"
    end
    @opts["peptide_mass_tolerance"]
  end

  def fragment_ion_tol
    @opts["fragment_ion_tolerance"]
  end

  def max_num_differential_AA_per_mod
    @opts["max_num_differential_AA_per_mod"] || @opts["max_num_differential_per_peptide"]
  end

  # returns a hash by add_<whatever> of any static mods != 0
  # the values are still as strings
  def static_mods
    @mods.reject {|_name, weight| weight.to_f == 0.0 }
  end

  ## @TODO: We could add some of the parameters not currently being asked for to be more complete
  ## @TODO: We could always add the Bioworks 3.2 specific params as params

  ####################################################
  ####################################################

end
|
|
343
|
+
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
|
|
2
|
+
require 'ms/fasta'
|
|
3
|
+
require 'arrayclass'
|
|
4
|
+
require 'set'
|
|
5
|
+
|
|
6
|
+
require 'ms/id/peptide'
|
|
7
|
+
require 'ms/id/search'
|
|
8
|
+
|
|
9
|
+
module Ms
|
|
10
|
+
module Sequest
|
|
11
|
+
# A group of Sqt searches, handled through Ms::Id::SearchGroup.
class SqtGroup
  include Ms::Id::SearchGroup

  #attr_accessor :sqts, :filenames

  # the class used for each individual search in the group
  def search_class() Ms::Sequest::Sqt end

  # file extension associated with a group of sqt files
  def extension
    'sqg'
  end

  # arg and opts are handed on to the SearchGroup initializer, except that
  # :link_protein_hits is always forced to false for the individual
  # searches.  Unless the caller passed :link_protein_hits => false, the
  # peptides of all searches are merged here instead.
  # The caller's block (if any) is called with self afterwards.
  def initialize(arg, opts={}, &block)
    caller_opts = opts.dup
    per_search_opts = opts.merge(:link_protein_hits => false)
    super(arg, per_search_opts) do
      if caller_opts[:link_protein_hits] != false
        puts "MERGING GROUP!"
        peps_by_search = @searches.map {|search| search.peps }
        (@peps, @prots) = merge!(peps_by_search, &Ms::Sequest::Sqt::NEW_PROT)
      end
    end
    block.call(self) if block_given?
  end

  # # NOTE THAT this is copy/paste from srf.rb, should be refactored...
  ## returns the filename used
  ## if the file exists, the name will be expanded to full path, otherwise just
  ## what is given
  #def to_sqg(sqg_filename='bioworks.sqg')
  #  File.open(sqg_filename, 'w') do |v|
  #    @filenames.each do |sqt_file|
  #      if File.exist? sqt_file
  #        v.puts File.expand_path(sqt_file)
  #      else
  #        v.puts sqt_file
  #      end
  #    end
  #  end
  #  sqg_filename
  #end

end # SqtGroup
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Sqt
|
|
56
|
+
include Ms::Id::Search
|
|
57
|
+
PercolatorHeaderMatch = /^Percolator v/
|
|
58
|
+
Delimiter = "\t"
|
|
59
|
+
attr_accessor :header
|
|
60
|
+
attr_accessor :spectra
|
|
61
|
+
attr_accessor :base_name
|
|
62
|
+
# boolean
|
|
63
|
+
attr_accessor :percolator_results
|
|
64
|
+
|
|
65
|
+
# assumes the file exists and is readable
|
|
66
|
+
# returns [DBSeqLength, DBLocusCount, DBMD5Sum] or nil if no file
|
|
67
|
+
# Opens dbfile with Ms::Fasta and returns
# [total_sequence_length, size, md5_sum] as reported by the fasta object
# (i.e. the DBSeqLength, DBLocusCount and DBMD5Sum header values).
def self.get_db_info(dbfile)
  Ms::Fasta.open(dbfile) do |db|
    seq_length = db.total_sequence_length
    locus_count = db.size
    [seq_length, locus_count, db.md5_sum]
  end
end
|
|
72
|
+
|
|
73
|
+
# the class used to represent protein hits for this kind of search
def protein_class() Ms::Sequest::Sqt::Locus end
|
|
76
|
+
|
|
77
|
+
# opts =
|
|
78
|
+
# :percolator_results => false | true (default false)
|
|
79
|
+
# :link_protein_hits => true | false (default true)
|
|
80
|
+
# Starts with empty peptide and protein lists; when a filename is given it
# is read right away with from_file (opts are passed along unchanged).
def initialize(filename=nil, opts={})
  @peps, @prots = [], []
  from_file(filename, opts) if filename
end
|
|
87
|
+
|
|
88
|
+
# Builds a new Locus carrying the locus and description of the given
# protein together with the given peptide list.
NEW_PROT = lambda do |source_prot, linked_peps|
  fields = [source_prot.locus, source_prot.description, linked_peps]
  Ms::Sequest::Sqt::Locus.new(fields)
end
|
|
91
|
+
|
|
92
|
+
# if the file contains a header key matching /^Percolator v/ then the results
|
|
93
|
+
# will be interpreted as percolator results regardless of the value
|
|
94
|
+
# passed in.
|
|
95
|
+
# Reads the header and spectra from the sqt file.
#
# opts (defaults):
#   :percolator_results => false
#   :link_protein_hits  => true
#
# A header key matching PercolatorHeaderMatch switches percolator
# interpretation on regardless of :percolator_results.  With
# :link_protein_hits the peptides are merged into protein hits.
def from_file(filename, opts={})
  defaults = {:percolator_results=>false, :link_protein_hits => true}
  opts = defaults.merge(opts)
  @percolator_results = opts[:percolator_results]
  # basename without the extension, tolerant of windows style separators
  unix_style_path = filename.gsub('\\','/')
  @base_name = File.basename(unix_style_path).sub(/\.\w+$/, '')
  File.open(filename) do |io|
    @header = Ms::Sequest::Sqt::Header.new.from_handle(io)
    @percolator_results = true if @header.keys.any? {|key| key =~ PercolatorHeaderMatch }
    (@spectra, @peps) = Ms::Sequest::Sqt::Spectrum.spectra_from_handle(io, @base_name, @percolator_results)
  end
  if opts[:link_protein_hits]
    (@peps, @prots) = merge!([@peps], &NEW_PROT)
  end
end
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Inherits from hash, so all header stuff can be accessed by key. Multiline
|
|
113
|
+
# values will be pushed into an array.
|
|
114
|
+
# All header values are stored as (newline-removed) strings!
|
|
115
|
+
# The 'H' lines of an sqt file.
#
# Inherits from hash, so all header stuff can be accessed by key. Multiline
# values will be pushed into an array.
# All header values are stored as (newline-removed) strings!
# The keys listed in HeaderKeys are also available through accessors
# (e.g. header.database).
class Header < Hash
  Leader = 'H'

  # These will be in arrays no matter what: StaticMod, DynamicMod, Comment
  # Any other keys repeated will be shoved into an array; otherwise a string
  Arrayed = %w(DynamicMod StaticMod Comment).to_set

  # accessor name => header key
  HeaderKeys = {
    :sqt_generator => 'SQTGenerator',
    :sqt_generator_version => 'SQTGeneratorVersion',
    :database => 'Database',
    :fragment_masses => 'FragmentMasses',
    :precursor_masses => 'PrecursorMasses',
    :start_time => 'StartTime',
    :db_seq_length => 'DBSeqLength',
    :db_locus_count => 'DBLocusCount',
    :db_md5sum => 'DBMD5Sum',
    :peptide_mass_tolerance => 'Alg-PreMassTol',
    :fragment_ion_tolerance => 'Alg-FragMassTol',
    # nonstandard (mine)
    :peptide_mass_units => 'Alg-PreMassUnits',
    :ion_series => 'Alg-IonSeries',
    :enzyme => 'Alg-Enzyme',
    # nonstandard (mine)
    :ms_model => 'Alg-MSModel',
    :static_mods => 'StaticMod',
    :dynamic_mods => 'DynamicMod',
    :comments => 'Comment'
  }

  # header key => accessor name
  KeysToAtts = HeaderKeys.invert

  HeaderKeys.keys.each do |ky|
    attr_accessor ky
  end

  # Reads consecutive lines starting with Leader from fh and parses them.
  # The handle is left positioned at the first line that is not a header
  # line (or at the end of the io).  Returns self.
  def from_handle(fh)
    header_lines = []
    loop do
      pos = fh.pos
      line = fh.gets
      unless line && (line[0,1] == Leader)
        # un-read the non-header line and we're done
        fh.pos = pos
        break
      end
      header_lines << line
    end
    from_lines(header_lines)
  end

  # Parses header lines ("H<tab>key<tab>value") into the hash, then copies
  # the values of the HeaderKeys into their accessors.  The given strings
  # are not modified.  Returns self.
  def from_lines(array_of_header_lines)
    # the always-array keys exist (as arrays) even if the file lacks them
    Arrayed.each {|ky| self[ky] ||= [] }
    array_of_header_lines.each do |line|
      (ky, *rest) = line.chomp.split(Ms::Sequest::Sqt::Delimiter)[1..-1]
      # just in case they have any tabs in their field
      value = rest.join(Ms::Sequest::Sqt::Delimiter)
      if Arrayed.include?(ky)
        self[ky] << value
      elsif key?(ky)  # already exists
        if self[ky].is_a? Array
          self[ky] << value
        else
          self[ky] = [self[ky], value]
        end
      else  # normal
        self[ky] = value
      end
    end
    KeysToAtts.each do |ky, methd|
      send("#{methd}=".to_sym, self[ky])
    end
    self
  end

end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
# all are cast as expected (total_intensity is a float)
|
|
201
|
+
# mh = observed mh
|
|
202
|
+
Ms::Sequest::Sqt::Spectrum = Arrayclass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
|
|
203
|
+
|
|
204
|
+
# 0=first_scan 1=last_scan 2=charge 3=time_to_process 4=node 5=mh 6=total_intensity 7=lowest_sp 8=num_matched_peptides 9=matches
|
|
205
|
+
|
|
206
|
+
# One 'S' line of an sqt file plus the matches that follow it.
# Fields by index:
# 0=first_scan 1=last_scan 2=charge 3=time_to_process 4=node 5=mh
# 6=total_intensity 7=lowest_sp 8=num_matched_peptides 9=matches
class Ms::Sequest::Sqt::Spectrum
  Leader = 'S'

  # Reads S (spectrum), M (match) and L (locus) lines from fh until the end
  # of the io.  Assumes the first such line starts with an 'S'.
  #
  # Each match receives first_scan/last_scan/charge from its spectrum and
  # base_name; with percolator_results the matches are Match::Percolator
  # objects.  Each match is added exactly once to its spectrum's matches.
  #
  # Returns [spectra, peps] where peps holds every match in file order.
  def self.spectra_from_handle(fh, base_name, percolator_results=false)
    peps = []
    spectra = []
    # the spectrum / match list / locus list lines are currently attached to
    spectrum = matches = loci = nil
    match_klass =
      if percolator_results
        Ms::Sequest::Sqt::Match::Percolator
      else
        Ms::Sequest::Sqt::Match
      end

    while (line = fh.gets)
      case line[0,1]
      when Ms::Sequest::Sqt::Spectrum::Leader
        spectrum = Ms::Sequest::Sqt::Spectrum.new.from_line( line )
        spectra << spectrum
        matches = []
        spectrum.matches = matches
      when Ms::Sequest::Sqt::Match::Leader
        match = match_klass.new.from_line( line )
        # first_scan, last_scan, charge come from the owning spectrum
        match[10,3] = spectrum[0,3]
        match[15] = base_name
        loci = []
        match.loci = loci
        matches << match
        peps << match
      when Ms::Sequest::Sqt::Locus::Leader
        loci << Ms::Sequest::Sqt::Locus.new.from_line( line )
      end
    end
    # set the deltacn:
    set_deltacn(spectra)
    [spectra, peps]
  end

  # Shifts the deltacn values up within each spectrum: every match gets the
  # deltacn_orig of the match after it and the last match gets 1.1.
  # Returns spectra.
  def self.set_deltacn(spectra)
    spectra.each do |spec|
      matches = spec.matches
      next if matches.empty?
      matches.each_cons(2) do |current, following|
        current.deltacn = following.deltacn_orig
      end
      matches.last.deltacn = 1.1
    end
    spectra
  end

  # Fills in fields 0-8 from a tab delimited 'S' line (the line is chomped
  # in place) and resets matches to an empty array.  Returns self.
  def from_line(line)
    line.chomp!
    ar = line.split(Ms::Sequest::Sqt::Delimiter)
    self[0] = ar[1].to_i
    self[1] = ar[2].to_i
    self[2] = ar[3].to_i
    self[3] = ar[4].to_f
    self[4] = ar[5]
    self[5] = ar[6].to_f
    self[6] = ar[7].to_f
    self[7] = ar[8].to_f
    self[8] = ar[9].to_i
    self[9] = []
    self
  end
end
|
|
279
|
+
|
|
280
|
+
# Sqt format uses only indices 0 - 9
|
|
281
|
+
Ms::Sequest::Sqt::Match = Arrayclass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
|
|
282
|
+
|
|
283
|
+
# 0=rxcorr 1=rsp 2=mh 3=deltacn_orig 4=xcorr 5=sp 6=ions_matched 7=ions_total 8=sequence 9=manual_validation_status 10=first_scan 11=last_scan 12=charge 13=deltacn 14=aaseq 15=base_name 16=loci
|
|
284
|
+
|
|
285
|
+
# rxcorr = rank by xcorr
|
|
286
|
+
# rsp = rank by sp
|
|
287
|
+
# NOTE:
|
|
288
|
+
# deltacn_orig
|
|
289
|
+
# deltacn is the adjusted deltacn (like Bioworks - shift all scores up and
|
|
290
|
+
# give the last one 1.1)
|
|
291
|
+
# One 'M' line of an sqt file (indices 0-9 come from the line itself).
class Ms::Sequest::Sqt::Match
  Leader = 'M'

  # same as 'loci'
  def prots() self[16] end

  # Fills in fields 0-9 from a tab delimited 'M' line (the line is chomped
  # in place) and derives aaseq (14) from the sequence.  Returns self.
  def from_line(line)
    line.chomp!
    columns = line.split(Ms::Sequest::Sqt::Delimiter)
    # conversion for fields 0..7 (taken from columns 1..8)
    casts = [:to_i, :to_i, :to_f, :to_f, :to_f, :to_f, :to_i, :to_i]
    casts.each_with_index do |cast, index|
      self[index] = columns[index + 1].send(cast)
    end
    # sequence and manual_validation_status stay as strings
    self[8] = columns[9]
    self[9] = columns[10]
    self[14] = Ms::Id::Peptide.sequence_to_aaseq(self[8])
    self
  end
end
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# A match read from percolator output: index 4 holds the percolator score
# and index 5 the negative q-value.
class Ms::Sequest::Sqt::Match::Percolator < Ms::Sequest::Sqt::Match
  # we will keep access to these old terms since we can then access routines
  # that sort on xcorr...
  #undef_method :xcorr
  #undef_method :xcorr=
  #undef_method :sp
  #undef_method :sp=

  def percolator_score() self[4] end

  def percolator_score=(score)
    self[4] = score
  end

  def negative_q_value() self[5] end

  def negative_q_value=(arg)
    self[5] = arg
  end

  # the stored negative q-value with its sign flipped
  def q_value() -self[5] end

  # for compatibility with scripts that want this guy
  def probability() -self[5] end
end
|
|
346
|
+
|
|
347
|
+
Ms::Sequest::Sqt::Locus = Arrayclass.new(%w[locus description peps])
|
|
348
|
+
|
|
349
|
+
# One 'L' line of an sqt file: 0=locus 1=description 2=peps
class Ms::Sequest::Sqt::Locus
  Leader = 'L'

  # both are other names for the locus (index 0)
  def first_entry
    self[0]
  end

  def reference
    self[0]
  end

  # Fills in locus and description from a tab delimited 'L' line (the line
  # is chomped in place).  Returns self.
  def from_line(line)
    line.chomp!
    columns = line.split(Ms::Sequest::Sqt::Delimiter)
    self[0], self[1] = columns[1], columns[2]
    self
  end

end
|