mspire 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/lib/spec_id/sequest.rb
CHANGED
@@ -6,6 +6,74 @@ require 'set_from_hash'
|
|
6
6
|
require 'spec_id/bioworks'
|
7
7
|
require 'instance_var_set_from_hash'
|
8
8
|
require 'spec/msrun'
|
9
|
+
require 'spec_id/srf'
|
10
|
+
|
11
|
+
class Numeric
  # Returns the number as a string that always carries an explicit sign:
  # a leading '+' is added unless to_s already starts with '-'
  # (8.049 => "+8.049", -0.704 => "-0.704").
  def to_plus_minus_string
    str = to_s
    # to_s already renders the minus sign of a negative number; prepending
    # another '-' would produce "--0.704"
    if str =~ /\A-/
      str
    else
      '+' + str
    end
  end
end
|
21
|
+
|
22
|
+
##########################################
|
23
|
+
# NEED TO ADD MODIFICATIONS and generally verify pepxml creation!!! :
|
24
|
+
# HERE's an excerpt from an example file from tpp 2.9.2 that I'm going to follow:
|
25
|
+
=begin
|
26
|
+
<search_summary base_name="/regis/data3/search/akeller/LCQ/COMET/LIGHT/haloICAT2_41" search_engine="COMET" precursor_mass_type="average" fragment_mass_type="average">
|
27
|
+
<sequence_search_constraint sequence="C"/>
|
28
|
+
<aminoacid_modification aminoacid="C" massdiff="8.049" mass="553.765" variable="Y" binary="N"/>
|
29
|
+
<aminoacid_modification aminoacid="C" massdiff="442.5772" mass="545.7160" variable="N"/>
|
30
|
+
<aminoacid_modification aminoacid="M" massdiff="16.0000" mass="147.1926" variable="Y" binary="N" symbol="1"/>
|
31
|
+
<parameter name="peptide_mass_tol" value="3.0000"/>
|
32
|
+
<parameter name="peptide_mass_tol_units" value="DA"/>
|
33
|
+
<parameter name="num_output_lines" value="10"/>
|
34
|
+
<parameter name="remove_precursor_peak" value="0"/>
|
35
|
+
<parameter name="num_dup_headers" value="1"/>
|
36
|
+
<parameter name="email_address" value=""/>
|
37
|
+
<parameter name="ion_series" value="010000010"/>
|
38
|
+
<parameter name="max_num_var_mod_residues" value="3"/>
|
39
|
+
<parameter name="md5_check_sum" value="2547286a77a35abe2af3f2e9825ab814"/>
|
40
|
+
</search_summary>
|
41
|
+
=end
|
42
|
+
|
43
|
+
# and a guy with modifications:
|
44
|
+
=begin
|
45
|
+
<search_result spectrum="haloICAT2_41.1110.1110.2" start_scan="1110" end_scan="1110" precursor_neutral_mass="2000.6641" assumed_charge="2" index="28">
|
46
|
+
<search_hit hit_rank="1" peptide="GCMPSKEVLSAGAHR" peptide_prev_aa="R" peptide_next_aa="Y" protein="Chr_ORF0132" num_tot_proteins="1" num_matched_ions="19" tot_num_ions="30" calc_neutral_pep_mass="2001.3685" massdiff="-0.704" num_tol_term="2" num_missed_cleavages="1" is_rejected="0">
|
47
|
+
<modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
|
48
|
+
<mod_aminoacid_mass position="2" mass="545.7160"/>
|
49
|
+
<mod_aminoacid_mass position="3" mass="147.1926"/>
|
50
|
+
</modification_info>
|
51
|
+
<search_score name="dotproduct" value="359"/>
|
52
|
+
<search_score name="delta" value="0.296"/>
|
53
|
+
<search_score name="deltastar" value="0"/>
|
54
|
+
<search_score name="zscore" value="5.290"/>
|
55
|
+
<search_score name="expect" value="0.000E+00"/>
|
56
|
+
<peptideprophet_result probability="0.9994" all_ntt_prob="(0.3713,0.4360,0.9994)">
|
57
|
+
<search_score_summary>
|
58
|
+
<parameter name="fval" value="3.4002"/>
|
59
|
+
<parameter name="ntt" value="2"/>
|
60
|
+
<parameter name="nmc" value="1"/>
|
61
|
+
<parameter name="massd" value="-0.704"/>
|
62
|
+
</search_score_summary>
|
63
|
+
</peptideprophet_result>
|
64
|
+
=end
|
65
|
+
|
66
|
+
# sequest.params option:
|
67
|
+
# diff_search_options = 15.994910 M 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y
|
68
|
+
# permanent mods are at the bottom: ...
|
69
|
+
# add_A_Alanine = 0.0000 ; added to A
|
70
|
+
# add_S_Serine = 0.0000 ; added to S
|
71
|
+
# add_P_Proline = 0.0000 ; added to P
|
72
|
+
# add_V_Valine = 0.0000 ; added to V
|
73
|
+
# add_T_Threonine = 0.0000 ; added to T
|
74
|
+
# ...
|
75
|
+
|
76
|
+
|
9
77
|
|
10
78
|
module SpecID::Sequest; end
|
11
79
|
class SpecID::Sequest::PepXML; end
|
@@ -26,8 +94,12 @@ class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
|
|
26
94
|
@xmlns = nil
|
27
95
|
@xmlns_xsi = nil
|
28
96
|
@xsi_schema_location = nil
|
29
|
-
|
30
|
-
|
97
|
+
if hash
|
98
|
+
self.set_from_hash(hash)
|
99
|
+
end
|
100
|
+
if block_given?
|
101
|
+
@msms_run_summary = yield
|
102
|
+
end
|
31
103
|
end
|
32
104
|
|
33
105
|
# if no date string given, then it will set to Time.now
|
@@ -80,7 +152,8 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
|
|
80
152
|
include SpecIDXML
|
81
153
|
|
82
154
|
# the version of TPP you are using (determines xml output)
|
83
|
-
# The name of the pep xml file (without extension)
|
155
|
+
# The name of the pep xml file (without extension) (but this is a long
|
156
|
+
# filename!!!)
|
84
157
|
attr_accessor :base_name
|
85
158
|
# The name of the mass spec manufacturer
|
86
159
|
attr_accessor :ms_manufacturer
|
@@ -104,7 +177,9 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
|
|
104
177
|
# set to the return value of the block
|
105
178
|
# Starts with an empty @spectrum_queries array.
# hash:: optional; when given it is handed to instance_var_set_from_hash
# If a block is given, @spectrum_queries is replaced by the block's return
# value.
def initialize(hash=nil)
  @spectrum_queries = []
  instance_var_set_from_hash(hash) if hash
  @spectrum_queries = yield if block_given?
end
|
110
185
|
|
@@ -137,6 +212,8 @@ end
|
|
137
212
|
|
138
213
|
class SpecID::Sequest::PepXML
|
139
214
|
include SpecIDXML
|
215
|
+
|
216
|
+
## CREATE a default version for the entire class
|
140
217
|
class << self
|
141
218
|
attr_accessor :pepxml_version
|
142
219
|
end
|
@@ -144,7 +221,11 @@ class SpecID::Sequest::PepXML
|
|
144
221
|
self.pepxml_version = DEF_VERSION # default version
|
145
222
|
|
146
223
|
attr_accessor :pepxml_version, :msms_pipeline_analysis
|
224
|
+
## the full path name (no extension)
|
147
225
|
attr_accessor :base_name
|
226
|
+
attr_accessor :h_plus
|
227
|
+
attr_accessor :avg_parent
|
228
|
+
|
148
229
|
#attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
|
149
230
|
|
150
231
|
# returns an array of spectrum queries
|
@@ -153,10 +234,29 @@ class SpecID::Sequest::PepXML
|
|
153
234
|
end
|
154
235
|
|
155
236
|
# msms_pipeline_analysis is set to the result of the yielded block
|
156
|
-
|
237
|
+
# and set_mono_or_avg is called with params if given
|
238
|
+
# pepxml_version::     stored as the class-level pepxml_version
# sequest_params_obj:: optional; when given it is passed to set_mono_or_avg
# If a block is given, its return value becomes @msms_pipeline_analysis and
# @base_name is copied from that object's msms_run_summary.
def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
  self.class.pepxml_version = pepxml_version
  set_mono_or_avg(sequest_params_obj) if sequest_params_obj
  return unless block_given?
  @msms_pipeline_analysis = yield
  @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
end
|
248
|
+
|
249
|
+
# sets @h_plus and @avg_parent from the sequest params object
|
250
|
+
# Reads precursor_mass_type from the params object and records
#   @avg_parent:: false for "monoisotopic", true for anything else
#   @h_plus::     the :h_plus entry of SpecID::AVG or SpecID::MONO accordingly
def set_mono_or_avg(sequest_params_obj)
  monoisotopic = ("monoisotopic" == sequest_params_obj.precursor_mass_type)
  @avg_parent = !monoisotopic
  masses = @avg_parent ? SpecID::AVG : SpecID::MONO
  @h_plus = masses[:h_plus]
end
|
161
261
|
|
162
262
|
def date
|
@@ -203,6 +303,190 @@ class SpecID::Sequest::PepXML
|
|
203
303
|
end
|
204
304
|
end
|
205
305
|
|
306
|
+
|
307
|
+
# Defaults used by new_from_srf; whatever the caller passes is merged over
# these values.
Default_Options = {
  :out_path         => nil,
  :backup_db_path   => '/project/marcotte/marcotte/ms/database',

  # PepXML option
  :pepxml_version   => DEF_VERSION,

  # MSMSRunSummary options.
  # :sample_enzyme is either a string recognized in sample_enzyme.rb or a
  # SampleEnzyme object of your own
  :sample_enzyme    => 'trypsin',
  :ms_manufacturer  => 'ThermoFinnigan',
  :ms_model         => 'LCQ Deca XP',
  :ms_ionization    => 'ESI',
  :ms_mass_analyzer => 'Ion Trap',
  :ms_detector      => 'UNKNOWN',
  :raw_data_type    => 'raw',
  :raw_data         => '.mzXML',   # even if you don't have it?

  # SearchSummary options
  :out_data_type    => 'out',      # may be srf?? don't think pepxml recognizes this yet
  :out_data         => '.tgz'      # may be srf??
}
|
327
|
+
|
328
|
+
# will dynamically set :ms_model and :ms_mass_analyzer from srf info
|
329
|
+
# (ignoring defaults or anything passed in) for LTQ Orbitrap
|
330
|
+
# and LCQ Deca XP
|
331
|
+
# See the SpecID::Sequest::PepXML::Default_Options hash for defaults
|
332
|
+
# unless given, the out_path will be given as the path of the srf_file
|
333
|
+
# Builds a PepXML object from one srf file.
#
# srf_file:: path of the srf file (read with SRF.new)
# opts::     overrides merged over Default_Options.  :ms_model is always
#            replaced by the model found in the srf header, and
#            :ms_mass_analyzer is replaced when that model matches
#            /Orbitrap/ or /LCQ Deca XP/.
#
# For every dta/out pair that has hits, only the hit with the highest xcorr
# is written (as hit_rank 1).  Returns the PepXML object.
def self.new_from_srf(srf_file, opts={})
  opts = Default_Options.merge(opts)

  ## set the outpath (falls back on the directory of the srf file)
  out_path = opts.delete(:out_path) || File.dirname(srf_file)

  ## read the srf file
  srf = SRF.new(srf_file)
  params = srf.params

  ## check to see if we need backup_db
  backup_db_path = opts.delete(:backup_db_path)
  unless File.exist? params.database
    params.database_path = backup_db_path
  end

  #######################################################################
  # PREPARE THE OPTIONS:
  #######################################################################
  ## remove items from the options hash that are not handed on with opts
  ## to MSMSRunSummary.new below
  ppxml_version = opts.delete(:pepxml_version)
  out_data_type = opts.delete(:out_data_type)
  out_data = opts.delete(:out_data)

  ## Extract meta info from srf
  bn_noext = base_name_noext(srf.header.raw_filename)
  opts[:ms_model] = srf.header.model
  case opts[:ms_model]
  when /Orbitrap/
    opts[:ms_mass_analyzer] = 'Orbitrap'
  when /LCQ Deca XP/
    opts[:ms_mass_analyzer] = 'Ion Trap'
  end

  ## Create the base name
  full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
  opts[:base_name] = full_base_name_no_ext

  ## Create the search summary:
  search_summary_options = {
    :search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params),
    :base_name => full_base_name_no_ext,
    :out_data_type => out_data_type,
    :out_data => out_data
  }
  # SearchSummary.new takes (params, modifications_string, args): the options
  # hash has to go in the third slot, otherwise it is taken for the
  # modifications string and the options are never applied.
  # NOTE(review): no modification symbols string is read from the srf here,
  # so the empty default is passed -- confirm whether srf files carry one.
  opts[:search_summary] = SpecID::Sequest::PepXML::SearchSummary.new(params, '', search_summary_options)

  ## Create the SampleEnzyme object if necessary
  unless opts[:sample_enzyme].is_a? SampleEnzyme
    opts[:sample_enzyme] = SampleEnzyme.new(opts[:sample_enzyme])
  end

  ## Create the pepxml obj
  pepxml_obj = SpecID::Sequest::PepXML.new(ppxml_version, params)
  ## name some common variables we'll need
  h_plus = pepxml_obj.h_plus

  #######################################################################
  # CREATE the spectrum_queries_ar
  #######################################################################
  srf_index = srf.index
  out_files = srf.out_files
  spectrum_queries_arr = []
  files_with_hits_index = 0  ## will end up being 1 indexed
  srf.dta_files.each_with_index do |dta_file,i|
    next if out_files[i].num_hits == 0
    files_with_hits_index += 1

    # Sort the hits (ascending xcorr, so the best hit is last)
    arr = out_files[i].hits.sort_by {|v| v.xcorr }

    # Get proper deltacn and deltacnstar
    # Prophet deltacn is not the same as the native Sequest deltacn
    # It is the deltacn of the second best hit!
    top_hit = arr.pop
    second_hit = arr.last
    if second_hit
      top_hit[1] = second_hit[1]
      deltacnstar = '0'
    else
      top_hit[1] = '1.0'
      deltacnstar = '1'
    end

    ## mass calculations:
    precursor_neutral_mass = dta_file.mh - h_plus
    calc_neutral_pep_mass = top_hit[0] - h_plus
    massdiff = precursor_neutral_mass - calc_neutral_pep_mass
    if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
    else ; massdiff = massdiff.to_s end  # to_s already carries the '-'

    (start_scan, end_scan, charge) = srf_index[i]
    sq_hash = {
      :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
      :start_scan => start_scan,
      :end_scan => end_scan,
      :precursor_neutral_mass => precursor_neutral_mass,
      :assumed_charge => charge,
      :pepxml_version => ppxml_version,
      :index => files_with_hits_index,
    }

    # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
    ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
    (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_hit[8])
    # ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }

    sh_hash = {
      :hit_rank => "1",
      :peptide => pepseq,
      :peptide_prev_aa => prevaa,
      :peptide_next_aa => nextaa,
      :protein => top_hit[9].split(" ").first,
      :num_tot_proteins => top_hit[10],
      :num_matched_ions => top_hit[6],
      :tot_num_ions => top_hit[7],
      :calc_neutral_pep_mass => calc_neutral_pep_mass,
      :massdiff => massdiff,
      :num_tol_term => SpecID::Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_hit[8]),
      :num_missed_cleavages => SpecID::Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_hit[8]),
      :is_rejected => '0',
      # These are search score attributes:
      :xcorr => top_hit[3],
      :deltacn => top_hit[1],
      :deltacnstar => deltacnstar,
      :spscore => top_hit[2],
      :sprank => top_hit[5],
    }

    spectrum_queries_arr << SpecID::Sequest::PepXML::SpectrumQuery.new(sq_hash) do
      search_result = SpecID::Sequest::PepXML::SearchResult.new do
        [ SpecID::Sequest::PepXML::SearchHit.new(sh_hash) ] # there can be multiple hits
      end # SearchResult
      [search_result] # can be multiple
    end
  end

  #######################################################################
  # ADD the pipeline analysis
  #######################################################################

  pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'}) do
    SpecID::Sequest::PepXML::MSMSRunSummary.new(opts) { spectrum_queries_arr }
  end
  pepxml_obj.msms_pipeline_analysis = pipeline
  pepxml_obj.base_name = pipeline.msms_run_summary.base_name
  pepxml_obj
end
|
488
|
+
|
489
|
+
# Takes bioworks 3.2/3.3 xml output (with no filters)
|
206
490
|
# Returns a list of PepXML objects
|
207
491
|
# msdata = path to mzXML files (or .timeIndex files) (or @TODO: path to sqt file(s))
|
208
492
|
# params = sequest.params file
|
@@ -246,6 +530,7 @@ class SpecID::Sequest::PepXML
|
|
246
530
|
|
247
531
|
## Create a hash of spectrum_query arrays by filename (this very big block):
|
248
532
|
spectrum_queries_by_base_name = {}
|
533
|
+
pepxml_objs_by_base_name = {}
|
249
534
|
# Hash by the filenames to split into filenames:
|
250
535
|
bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
|
251
536
|
|
@@ -262,7 +547,10 @@ class SpecID::Sequest::PepXML
|
|
262
547
|
abort "invalid BioworksBrowser version: #{x}"
|
263
548
|
end
|
264
549
|
|
265
|
-
|
550
|
+
pepxml_obj = SpecID::Sequest::PepXML.new(pepxml_version, params)
|
551
|
+
pepxml_objs_by_base_name[base_name] = pepxml_obj
|
552
|
+
|
553
|
+
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
|
266
554
|
|
267
555
|
|
268
556
|
# Sort_by_rank and take the top hit (to mimick out2summary):
|
@@ -270,28 +558,18 @@ class SpecID::Sequest::PepXML
|
|
270
558
|
top_pep = arr.pop
|
271
559
|
second_hit = arr.last # needed for deltacnstar
|
272
560
|
|
273
|
-
case params.precursor_mass_type
|
274
|
-
when "monoisotopic" ; avg_parent = false
|
275
|
-
else ; avg_parent = true
|
276
|
-
end
|
277
|
-
|
278
|
-
case avg_parent
|
279
|
-
when true ; h_plus = SpecID::AVG[:h_plus]
|
280
|
-
when false ; h_plus = SpecID::MONO[:h_plus]
|
281
|
-
end
|
282
561
|
|
283
|
-
|
284
562
|
case calc_prec_by
|
285
563
|
when :prec_mz_arr
|
286
|
-
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent)
|
564
|
+
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, pepxml_obj.avg_parent)
|
287
565
|
when :deltamass
|
288
|
-
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent)
|
566
|
+
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
|
289
567
|
end
|
290
568
|
|
291
|
-
calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus)
|
569
|
+
calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
|
292
570
|
massdiff = precursor_neutral_mass - calc_neutral_pep_mass
|
293
571
|
if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
|
294
|
-
else ; massdiff = massdiff.to_s end
|
572
|
+
else ; massdiff = massdiff.to_s end #already has a -
|
295
573
|
# deltacn & star:
|
296
574
|
# (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
|
297
575
|
if second_hit
|
@@ -317,7 +595,7 @@ class SpecID::Sequest::PepXML
|
|
317
595
|
# NOTE: the bioworks mass is really M+H if two or more scans went
|
318
596
|
# into the search_hit; calc_neutral_pep_mass is simply the avg of
|
319
597
|
# precursor masses adjusted to be neutral
|
320
|
-
(prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.
|
598
|
+
(prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_pep.sequence)
|
321
599
|
(num_matched_ions, tot_num_ions) = SpecID::Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
|
322
600
|
search_hit = SpecID::Sequest::PepXML::SearchHit.new({
|
323
601
|
:hit_rank => "1",
|
@@ -348,32 +626,36 @@ class SpecID::Sequest::PepXML
|
|
348
626
|
|
349
627
|
# create an index by spectrum as results end up typically in out2summary
|
350
628
|
# (I really dislike this order, however)
|
351
|
-
|
352
|
-
|
629
|
+
spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
|
630
|
+
spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
|
353
631
|
|
354
|
-
spectrum_queries_by_base_name[base_name] =
|
632
|
+
spectrum_queries_by_base_name[base_name] = spectrum_queries_ar
|
355
633
|
end
|
356
634
|
|
357
|
-
|
635
|
+
modifications_string = bioworks.modifications
|
636
|
+
|
637
|
+
spectrum_queries_by_base_name.collect do |base_name, spectrum_queries_ar|
|
358
638
|
case pepxml_version
|
359
639
|
when 18
|
360
|
-
SpecID::Sequest::PepXML.new(
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
}) do spectrum_queries end
|
375
|
-
end
|
640
|
+
pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
|
641
|
+
full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
|
642
|
+
SpecID::Sequest::PepXML::MSMSRunSummary.new({
|
643
|
+
:base_name => full_base_name_no_ext,
|
644
|
+
:ms_manufacturer => ms_manufacturer,
|
645
|
+
:ms_model => ms_model,
|
646
|
+
:ms_ionization => ms_ionization,
|
647
|
+
:ms_mass_analyzer => ms_mass_analyzer,
|
648
|
+
:ms_detector => ms_detector,
|
649
|
+
:raw_data_type => raw_data_type,
|
650
|
+
:raw_data => raw_data,
|
651
|
+
:sample_enzyme => SampleEnzyme.new(sample_enzyme),
|
652
|
+
:search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
|
653
|
+
}) { spectrum_queries_ar }
|
376
654
|
end
|
655
|
+
pepxml_obj = pepxml_objs_by_base_name[base_name]
|
656
|
+
pepxml_obj.msms_pipeline_analysis = pipeline
|
657
|
+
pepxml_obj.base_name = pipeline.msms_run_summary.base_name
|
658
|
+
pepxml_obj
|
377
659
|
when 0
|
378
660
|
## @TODO: NEED TO REVAMP THIS:
|
379
661
|
# SpecID::Sequest::PepXML.new(pepxml_version).set_from_hash({
|
@@ -429,6 +711,14 @@ class SpecID::Sequest::PepXML
|
|
429
711
|
string
|
430
712
|
end
|
431
713
|
|
714
|
+
# given any kind of filename (from windows or whatever)
|
715
|
+
# returns the base of the filename with no file extension
|
716
|
+
# file:: a path using either '\' or '/' as separator; it is not modified
# Returns the last path component with its extension removed.  The
# extension pattern also swallows stacked extensions: "run.tar.gz" => "run".
def self.base_name_noext(file)
  # normalize the separators on a copy: gsub! would rewrite the caller's
  # string (and raise on a frozen one)
  unix_style = file.gsub("\\", '/')
  File.basename(unix_style).sub(/\.[\w^\.]+$/, '')
end
|
720
|
+
|
721
|
+
|
432
722
|
end # PepXML
|
433
723
|
|
434
724
|
##
|
@@ -461,6 +751,15 @@ class SpecID::Sequest::Params
|
|
461
751
|
one,two = line.split @@param_re
|
462
752
|
two,comment = two.split @@param_two_split
|
463
753
|
hash[one] = two.rstrip
|
754
|
+
# it is necessary to add this break so that params files inside srf
|
755
|
+
# files can be read. This will terminate the reading at the end of
|
756
|
+
# the file even though there are more lines
|
757
|
+
if line =~ /added to U/ || line =~ /digest_mass_range/## Will only work on bioworks 3.2 & 3.3 (bioworks 3.1 last line => Elastase/Tryp...)
|
758
|
+
break
|
759
|
+
end
|
760
|
+
if line =~ /digest_mass_range/ # there is no space in the srf params files
|
761
|
+
break
|
762
|
+
end
|
464
763
|
else
|
465
764
|
break
|
466
765
|
end
|
@@ -468,17 +767,26 @@ class SpecID::Sequest::Params
|
|
468
767
|
hash
|
469
768
|
end
|
470
769
|
|
770
|
+
# returns self
|
771
|
+
# Reads a params file from an open handle:
#   1. the first line (the [SEQUEST] line) is read and discarded
#   2. @opts is filled by grab_params and tagged with search_engine SEQUEST
#   3. @mods is filled by a second grab_params call
#   4. a trailing .hdr is dropped from @opts["first_database_name"]
# Returns self.
def parse_handle(fh)
  fh.gets  # [SEQUEST]
  @opts = grab_params(fh)
  @opts["search_engine"] = "SEQUEST"
  @mods = grab_params(fh)

  ## this gets rid of the .hdr postfix on indexed databases
  db_name = @opts["first_database_name"]
  @opts["first_database_name"] = db_name.sub(/\.hdr$/, '')
  self
end
|
781
|
+
|
471
782
|
## parses file
|
472
783
|
## and drops the .hdr behind indexed fasta files
|
784
|
+
## returns self
|
473
785
|
# Opens file, hands the open handle to parse_handle, and returns self.
def parse(file)
  File.open(file) { |fh| parse_handle(fh) }
  self
end
|
483
791
|
|
484
792
|
# returns( split_after, except_before)
|
@@ -569,6 +877,17 @@ class SpecID::Sequest::Params
|
|
569
877
|
@opts["first_database_name"]
|
570
878
|
end
|
571
879
|
|
880
|
+
# returns the appropriate aminoacid mass lookup table (in spec_id.rb SpecID::MONO or
|
881
|
+
# SpecID::AVG based on precursor_mass_type
|
882
|
+
# Picks the lookup table matching precursor_mass_type:
#   'average'      => SpecID::AVG
#   'monoisotopic' => SpecID::MONO
# Any other value gives nil.
# NOTE(review): set_mono_or_avg treats every non-monoisotopic value as
# average, whereas this method returns nil for it -- confirm which is meant.
def mass_table
  mass_type = precursor_mass_type
  if 'average' == mass_type
    SpecID::AVG
  elsif 'monoisotopic' == mass_type
    SpecID::MONO
  end
end
|
890
|
+
|
572
891
|
# at least in Bioworks 3.2, the First number after the enzyme
|
573
892
|
# is the indication of the enzymatic end stringency (required):
|
574
893
|
# 1 = Fully enzymatic
|
@@ -628,7 +947,7 @@ class SpecID::Sequest::PepXML::SearchResult
|
|
628
947
|
attr_accessor :search_hits
|
629
948
|
|
630
949
|
# if block given, then search_hits set to return value
|
631
|
-
def initialize
|
950
|
+
# @search_hits is the block's return value when a block is given, otherwise
# an empty array.
def initialize
  @search_hits = block_given? ? yield : []
end
|
@@ -646,13 +965,16 @@ class SpecID::Sequest::PepXML::SearchSummary
|
|
646
965
|
attr_accessor :base_name
|
647
966
|
attr_accessor :out_data_type
|
648
967
|
attr_accessor :out_data
|
968
|
+
attr_accessor :modifications
|
649
969
|
# A SearchDatabase object (responds to :local_path and :type)
|
650
970
|
attr_accessor :search_database
|
651
971
|
# if given a sequest params object, then will set the following attributes:
|
652
972
|
# args is a hash of parameters
|
653
|
-
|
973
|
+
# modifications_string -> See Modifications
|
974
|
+
# params::               sequest params object, kept in @params
# modifications_string:: handed to Modifications.new together with params
# args::                 optional hash applied with set_from_hash
def initialize(params, modifications_string='', args=nil)
  @search_id = nil
  @params = params
  @modifications = SpecID::Sequest::PepXML::Modifications.new(params, modifications_string)
  set_from_hash(args) if args
end
|
658
980
|
|
@@ -665,16 +987,304 @@ class SpecID::Sequest::PepXML::SearchSummary
|
|
665
987
|
else ; '1' end
|
666
988
|
end
|
667
989
|
|
990
|
+
|
668
991
|
# Wraps, in this order, the search database, the enzymatic search
# constraint, the modifications and the params' pepxml parameters in a
# search_summary element.
def to_pepxml
  summary_attributes = [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]
  constraint_attributes = [:enzyme, :max_num_internal_cleavages, :min_number_termini]
  element_xml(:search_summary, summary_attributes) do
    database_xml = search_database.to_pepxml
    constraint_xml = short_element_xml(:enzymatic_search_constraint, constraint_attributes)
    modifications_xml = @modifications.to_pepxml
    database_xml + constraint_xml + modifications_xml + @params.pepxml_parameters
  end
end
|
675
999
|
|
676
1000
|
end
|
677
1001
|
|
1002
|
+
class SpecID::Sequest::PepXML::Modifications
|
1003
|
+
include SpecIDXML
|
1004
|
+
|
1005
|
+
# sequest params object
|
1006
|
+
attr_accessor :params
|
1007
|
+
# array holding AAModifications
|
1008
|
+
attr_accessor :aa_mods
|
1009
|
+
# array holding TerminalModifications
|
1010
|
+
attr_accessor :term_mods
|
1011
|
+
# a hash of all differential modifications present by aa_one_letter_symbol
|
1012
|
+
# and special_symbol. This is NOT the mass difference but the total mass {
|
1013
|
+
# 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
|
1014
|
+
# the amino acid sequence, they are given the *differential* mass. The
|
1015
|
+
# termini are given the special symbol as in sequest e.g. '[' => 12.22, #
|
1016
|
+
# cterminus ']' => 14.55 # nterminus
|
1017
|
+
attr_accessor :masses_by_diff_mod_hash
|
1018
|
+
# a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
|
1019
|
+
# values are the special_symbols
|
1020
|
+
attr_accessor :mod_symbols_hash
|
1021
|
+
|
1022
|
+
# The modification symbols string looks like this:
|
1023
|
+
# (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
|
1024
|
+
# ct is cterminal peptide (differential)
|
1025
|
+
# nt is nterminal peptide (differential)
|
1026
|
+
# the C is just cysteine
|
1027
|
+
# will set_modifications and masses_by_diff_mod hash
|
1028
|
+
# params::                      sequest params object, kept in @params
# modification_symbols_string:: e.g. "(M* +15.90000) (ct[ +12.33000)"
# Both arguments are passed on to set_modifications.
def initialize(params, modification_symbols_string='')
  @params = params
  set_modifications(params, modification_symbols_string)
end
|
1032
|
+
|
1033
|
+
# set the masses_by_diff_mod and mod_symbols_hash from the modification symbols string
|
1034
|
+
# Resets and fills @mod_symbols_hash and @masses_by_diff_mod from a string
# such as "(M* +15.90000) (S@ +80.00000) (ct[ +12.33000) (nt] +14.20000)".
#
#   @mod_symbols_hash::   [residue_or_terminus.to_sym, diff.to_f] => symbol
#   @masses_by_diff_mod:: for 'ct'/'nt' entries: symbol => diff;
#                         otherwise: residue+symbol => diff + residue mass
#                         (looked up in @params.mass_table)
#
# Returns nil (leaving both hashes empty) when the string is nil or ''.
# NOTE(review): the class declares attr_accessor :masses_by_diff_mod_hash,
# but the instance variable written here is @masses_by_diff_mod -- confirm
# which name is intended.
def set_hashes(modification_symbols_string)
  @mod_symbols_hash = {}
  @masses_by_diff_mod = {}
  if modification_symbols_string.nil? || modification_symbols_string == ''
    return nil
  end
  table = @params.mass_table
  modification_symbols_string.split(/\)\s+\(/).each do |mod|
    next unless mod =~ /\(?(\w{1,2})(.) (.[\d\.]+)\)?/
    # copy the captures right away; a stray trailing comma after the first
    # assignment used to turn aa_as_sym into an Array
    residue = $1
    symbol = $2
    diff = $3.to_f
    aa_as_sym = residue.to_sym
    @mod_symbols_hash[[aa_as_sym, diff]] = symbol.dup
    if residue == 'ct' || residue == 'nt'
      @masses_by_diff_mod[symbol] = diff
    else
      @masses_by_diff_mod[residue + symbol] = diff + table[aa_as_sym]
    end
  end
end
|
1053
|
+
|
1054
|
+
# given a bare peptide (no end pieces) returns a ModificationInfo object
|
1055
|
+
# e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'
|
1056
|
+
# if there are no modifications, returns nil
|
1057
|
+
# peptide:: bare peptide including sequest modification symbols, e.g.
#           "]PEPT*IDE" (NOT 'K.PEPTIDE.R'); the string is not modified
#
# Returns a ModificationInfo built from a hash with
#   :modified_peptide::         copy of peptide as given
#   :mod_nterm_mass::           first residue + H + differential mass, set
#                               when the first character is a terminal symbol
#   :mod_cterm_mass::           last residue + OH + differential mass, set
#                               when the last character is a terminal symbol
#   :mod_aminoacid_mass_array:: [[position, mass], ...] with 1-based residue
#                               positions (modification symbols not counted)
# or nil when @masses_by_diff_mod is empty or nothing in peptide is modified.
def modification_info(peptide)
  return nil if @masses_by_diff_mod.size == 0

  hsh = @masses_by_diff_mod
  table = @params.mass_table
  h = table[:h] # this? or h_plus ??
  oh = table[:o] + h

  hash = {}
  # work on a copy so the caller's string stays intact
  seq = peptide.dup

  ## only the termini can match a single char
  if hsh.key? seq[0,1]
    # AA + H + differential_mod
    hash[:mod_nterm_mass] = table[seq[1,1].to_sym] + h + hsh[seq[0,1]]
    seq = seq[1..-1]   # drop the terminal symbol, keep the residues
  end
  if hsh.key? seq[-1,1]
    # AA + OH + differential_mod
    hash[:mod_cterm_mass] = table[seq[-2,1].to_sym] + oh + hsh[seq[-1,1]]
    seq = seq[0..-2]
  end

  mod_array = []
  position = 0   # residues seen so far; symbols do not advance it
  (0...seq.size).each do |i|
    position += 1 if seq[i,1] =~ /[A-Za-z]/
    pair = seq[i,2]
    mod_array << [position, hsh[pair]] if hsh.key?(pair)
  end
  hash[:mod_aminoacid_mass_array] = mod_array if mod_array.size > 0

  # nothing matched: this peptide carries no modification
  return nil if hash.size == 0

  hash[:modified_peptide] = peptide.dup
  SpecID::Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
end
|
1093
|
+
|
1094
|
+
# Builds the static and variable modification objects from a sequest params
# object.
#
# 1. sets @aa_mods and @term_mods
# 2. sets @params
# 3. calls set_hashes(modification_symbols_string); @mod_symbols_hash is
#    read afterwards (presumably that call also fills @masses_by_diff_mod)
#
# params:: object responding to mods, mass_table, diff_search_options and
#          term_diff_search_options
# modification_symbols_string:: passed straight through to set_hashes
def set_modifications(params, modification_symbols_string)
  @params = params

  set_hashes(modification_symbols_string)

  ####################################
  ## static mods
  ####################################

  static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
  static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

  params.mods.each do |k,v|
    v_to_f = v.to_f
    next if v_to_f == 0.0
    # keys shaped like add_X_... (single letter) are residue mods; anything
    # else is treated as a terminal mod
    if k =~ /add_(\w)_/
      static_mods << [$1.to_sym, v_to_f]
    else
      static_terminal_mods << [k, v_to_f]
    end
  end
  aa_hash = params.mass_table

  ## Create the static_mods objects
  static_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'N',
      :binary => 'Y',
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  ## Create the static_terminal_mods objects
  static_terminal_mods.map! do |mod|
    terminus = if mod[0] =~ /Cterm/ ; 'c'
               else ; 'n' # only two possible termini
               end
    protein_terminus = case mod[0]
                       when /Nterm_protein/ ; 'n'
                       when /Cterm_protein/ ; 'c'
                       else nil
                       end

    # create the hash
    hash = {
      :terminus => terminus,
      :massdiff => mod[1].to_plus_minus_string,
      :variable => 'N',
      :description => mod[0],
    }
    hash[:protein_terminus] = protein_terminus if protein_terminus
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #################################
  # Variable Mods:
  #################################
  # the option string alternates "<mass diff> <residue>" pairs
  arr = params.diff_search_options.rstrip.split(/\s+/)
  # [aa.to_sym, diff.to_f]
  variable_mods = []
  (0...arr.size).step(2) do |i|
    if arr[i].to_f != 0.0
      variable_mods << [arr[i+1].to_sym, arr[i].to_f]
    end
  end
  variable_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'Y',
      :binary => 'N',
      :symbol => @mod_symbols_hash[mod],
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  #################################
  # TERMINAL Variable Mods:
  #################################
  # These are always peptide, not protein termini (for sequest)
  (nterm_diff, cterm_diff) = params.term_diff_search_options.rstrip.split(/\s+/).map{|v| v.to_f }

  # @mod_symbols_hash is keyed by [symbol, mass] arrays, so the lookup key
  # must be a single two-element array (Hash#[] takes exactly one argument)
  to_add = []
  if nterm_diff != 0.0
    to_add << ['n', nterm_diff.to_plus_minus_string, @mod_symbols_hash[[:nt, nterm_diff]]]
  end
  if cterm_diff != 0.0
    to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[[:ct, cterm_diff]]]
  end

  variable_terminal_mods = to_add.map do |term, mssdiff, symb|
    hash = {
      :terminus => term,
      :massdiff => mssdiff,
      :variable => 'Y',
      :symbol => symb,
    }
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #########################
  # COLLECT THEM
  #########################
  @aa_mods = static_mods + variable_mods
  @term_mods = static_terminal_mods + variable_terminal_mods
end
|
1206
|
+
|
1207
|
+
## Concatenates the pepxml of every amino acid modification followed by
## every terminal modification. A list that was never set (nil) contributes
## nothing, so the result may be an empty string.
def to_pepxml
  xml = ''
  [@aa_mods, @term_mods].each do |mods|
    next unless mods
    xml << mods.map { |mod| mod.to_pepxml }.join
  end
  xml
end
|
1219
|
+
|
1220
|
+
end
|
1221
|
+
|
1222
|
+
# A modified amino acid (static or variable).
# Unless noted otherwise, every attribute may hold any value.
class SpecID::Sequest::PepXML::AAModification
  include SpecIDXML

  # One letter code of the amino acid
  attr_accessor :aminoacid

  # Must be a string!!!!
  # Mass difference relative to the unmodified amino acid; has to start with
  # either + (nonnegative) or - [e.g. +1.05446 or -2.3342]
  # (consider Numeric#to_plus_minus_string at top)
  attr_accessor :massdiff

  # Mass of the modified amino acid
  attr_accessor :mass

  # Y if both the modified and the unmodified amino acid could be present in
  # the dataset, N if only the modified amino acid can be present
  attr_accessor :variable

  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus

  # Special symbol the search engine uses to designate this modification
  attr_accessor :symbol

  # Y if each peptide must have only modified or unmodified amino acid, N if
  # a peptide may contain both modified and unmodified amino acid
  attr_accessor :binary

  # hash:: optional attribute values, handed to instance_var_set_from_hash
  #        (can use unless there are weird methods); nothing is set when
  #        it is omitted
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash)
  end

  # Delegates to short_element_xml_from_instance_vars with the element name
  # "aminoacid_modification".
  def to_pepxml
    short_element_xml_from_instance_vars("aminoacid_modification")
  end

end
|
1257
|
+
|
1258
|
+
# A modified peptide/protein terminus (static or variable).
class SpecID::Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus

  # Mass difference relative to the unmodified terminus
  attr_accessor :massdiff

  # Mass of the modified terminus
  attr_accessor :mass

  # Y if both the modified and the unmodified terminus could be present in
  # the dataset, N if only the modified terminus can be present
  attr_accessor :variable

  # Special symbol the search engine uses to designate this modification
  attr_accessor :symbol

  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus

  attr_accessor :description

  # hash:: optional attribute values, handed to instance_var_set_from_hash
  #        (can use unless there are weird methods); nothing is set when
  #        it is omitted
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash)
  end

  # Delegates to short_element_xml_from_instance_vars with the element name
  # "terminal_modification".
  def to_pepxml
    short_element_xml_from_instance_vars("terminal_modification")
  end
end
|
1286
|
+
|
1287
|
+
|
678
1288
|
class SpecID::Sequest::PepXML::SearchDatabase
|
679
1289
|
include SpecIDXML
|
680
1290
|
attr_accessor :local_path
|
@@ -708,7 +1318,15 @@ end
|
|
708
1318
|
|
709
1319
|
class SpecID::Sequest::PepXML::SpectrumQuery
|
710
1320
|
include SpecIDXML
|
711
|
-
|
1321
|
+
|
1322
|
+
# basename_noext.first_scan.last_scan.charge
|
1323
|
+
attr_accessor :spectrum
|
1324
|
+
attr_accessor :start_scan
|
1325
|
+
attr_accessor :end_scan
|
1326
|
+
attr_accessor :precursor_neutral_mass
|
1327
|
+
attr_accessor :index
|
1328
|
+
attr_accessor :search_results
|
1329
|
+
|
712
1330
|
# this is a string
|
713
1331
|
attr_accessor :assumed_charge
|
714
1332
|
attr_accessor :pepxml_version
|
@@ -803,6 +1421,10 @@ end
|
|
803
1421
|
# this responds to flatten (so that it won't flatten).
|
804
1422
|
class SpecID::Sequest::PepXML::SearchHit < Array
|
805
1423
|
include SpecIDXML
|
1424
|
+
|
1425
|
+
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
|
1426
|
+
|
1427
|
+
# num_tot_proteins = "Number of unique proteins in search database containing peptide"
|
806
1428
|
#attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
|
807
1429
|
#attr_accessor 13:deltacnstar
|
808
1430
|
#attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
|
@@ -811,69 +1433,82 @@ class SpecID::Sequest::PepXML::SearchHit < Array
|
|
811
1433
|
ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
|
812
1434
|
@@methods = ind_keys.keys
|
813
1435
|
def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
|
814
|
-
def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
|
815
|
-
def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
|
816
|
-
def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
|
817
|
-
def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
|
818
|
-
def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
|
819
|
-
def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
|
820
|
-
def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
|
821
|
-
def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
|
822
|
-
def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
|
823
|
-
def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
|
824
|
-
def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
|
825
|
-
def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
|
826
|
-
def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
|
827
|
-
def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
|
828
|
-
def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
|
829
|
-
def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
|
830
|
-
def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
|
831
|
-
|
832
|
-
@@arr_size = ind_keys.size
|
833
|
-
ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
|
834
|
-
ind_keys.merge!(ind_keys_w_eq)
|
835
|
-
ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
|
836
|
-
|
837
|
-
# These are all search_score elements:
|
838
|
-
|
839
|
-
# 1 if there is no second ranked hit, 0 otherwise
|
840
|
-
|
841
|
-
def initialize(hash=nil)
|
842
|
-
super(@@arr_size)
|
843
|
-
self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
|
844
|
-
self
|
845
|
-
#if hash ; set_from_hash(hash) end
|
846
|
-
end
|
1436
|
+
# Positional accessors: each reader returns one fixed slot of the underlying
# array and each writer stores into that same slot.

# slot 1
def peptide
  self[1]
end

def peptide=(oth)
  self[1] = oth
end

# slot 2
def peptide_prev_aa
  self[2]
end

def peptide_prev_aa=(oth)
  self[2] = oth
end

# slot 3
def peptide_next_aa
  self[3]
end

def peptide_next_aa=(oth)
  self[3] = oth
end

# slot 4
def protein
  self[4]
end

def protein=(oth)
  self[4] = oth
end

# slot 5
def num_tot_proteins
  self[5]
end

def num_tot_proteins=(oth)
  self[5] = oth
end

# slot 6
def num_matched_ions
  self[6]
end

def num_matched_ions=(oth)
  self[6] = oth
end

# slot 7
def tot_num_ions
  self[7]
end

def tot_num_ions=(oth)
  self[7] = oth
end

# slot 8
def calc_neutral_pep_mass
  self[8]
end

def calc_neutral_pep_mass=(oth)
  self[8] = oth
end

# slot 9
def massdiff
  self[9]
end

def massdiff=(oth)
  self[9] = oth
end

# slot 10
def num_tol_term
  self[10]
end

def num_tol_term=(oth)
  self[10] = oth
end

# slot 11
def num_missed_cleavages
  self[11]
end

def num_missed_cleavages=(oth)
  self[11] = oth
end

# slot 12
def is_rejected
  self[12]
end

def is_rejected=(oth)
  self[12] = oth
end

# slot 13
def deltacnstar
  self[13]
end

def deltacnstar=(oth)
  self[13] = oth
end

# slot 14
def xcorr
  self[14]
end

def xcorr=(oth)
  self[14] = oth
end

# slot 15
def deltacn
  self[15]
end

def deltacn=(oth)
  self[15] = oth
end

# slot 16
def spscore
  self[16]
end

def spscore=(oth)
  self[16] = oth
end

# slot 17
def sprank
  self[17]
end

def sprank=(oth)
  self[17] = oth
end
|
1453
|
+
|
1454
|
+
# The slot count is taken first, while ind_keys still holds only the reader
# names (the setter names are merged in just below).
@@arr_size = ind_keys.size
# Register a "name=" symbol for every reader, pointing at the same slot.
ind_keys.each_pair do |name, index|
  ind_keys_w_eq["#{name}=".to_sym] = index
end
ind_keys.update(ind_keys_w_eq)
# Make every entry reachable through both its symbol and its string form.
ind_keys.each_pair do |name, index|
  @@ind[name] = index
  @@ind["#{name}"] = index
end
|
1458
|
+
|
1459
|
+
# These are all search_score elements:
|
1460
|
+
|
1461
|
+
# 1 if there is no second ranked hit, 0 otherwise
|
847
1462
|
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
peptide_prev_aa, peptide, peptide_next_aa = *pieces
|
861
|
-
when 2
|
862
|
-
if pieces[0].size > 1 ## N termini
|
863
|
-
peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
|
864
|
-
else ## C termini
|
865
|
-
peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
|
866
|
-
end
|
867
|
-
when 1 ## this must be a parse error!
|
868
|
-
peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
|
869
|
-
when 0
|
870
|
-
peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
|
871
|
-
end
|
872
|
-
return peptide_prev_aa, peptide, peptide_next_aa
|
873
|
-
end
|
1463
|
+
# Creates the hit with @@arr_size slots and fills the first 18 of them from
# hash, in slot order (hit_rank first, sprank last).
#
# hash:: optional; indexed with the attribute names as symbols. When it is
#        omitted (nil) every slot is left nil instead of raising on nil.
def initialize(hash=nil)
  super(@@arr_size)
  return unless hash

  slot_names = [
    :hit_rank, :peptide, :peptide_prev_aa, :peptide_next_aa, :protein,
    :num_tot_proteins, :num_matched_ions, :tot_num_ions,
    :calc_neutral_pep_mass, :massdiff, :num_tol_term, :num_missed_cleavages,
    :is_rejected, :deltacnstar, :xcorr, :deltacn, :spscore, :sprank
  ]
  self[0,18] = slot_names.map {|name| hash[name] }
  #if hash ; set_from_hash(hash) end
end
|
1469
|
+
|
1470
|
+
# Runs remove_non_amino_acids on val and passes the cleaned string to
# split_sequence, returning whatever split_sequence returns.
def self.prepare_sequence(val)
  split_sequence(remove_non_amino_acids(val))
end
|
874
1475
|
|
875
|
-
|
876
|
-
|
1476
|
+
# Splits a sequence on '.' into [prev_aa, peptide, next_aa].
#   R.PEPTIDE.A  # -> R, PEPTIDE, A
#   R.PEPTIDE.-  # -> R, PEPTIDE, -
#   PEPTIDE.A    # -> -, PEPTIDE, A
#   A.PEPTIDE    # -> A, PEPTIDE, -
#   PEPTIDE      # -> nil,nil,nil
# With two pieces, a first piece longer than one character is taken to be
# the peptide itself. Zero or one piece is a parse error (nil,nil,nil);
# more than three pieces yields three empty strings.
def self.split_sequence(val)
  parts = val.split('.')
  count = parts.size

  if count == 3
    before, core, after = *parts
  elsif count == 2
    first, second = parts
    if first.size > 1 ## N termini
      before, core, after = '-', first, second
    else ## C termini
      before, core, after = first, second, '-'
    end
  elsif count < 2 ## this must be a parse error!
    before, core, after = nil, nil, nil
  else
    before = ""; core = ""; after = ""
  end

  [before, core, after]
end
|
1502
|
+
|
1503
|
+
# Returns a copy of sequence with every character matched by
# Non_standard_amino_acid_char_re deleted; sequence itself is not modified.
def self.remove_non_amino_acids(sequence)
  unwanted = Non_standard_amino_acid_char_re
  sequence.gsub(unwanted, '')
end
|
1508
|
+
|
1509
|
+
# One-line summary: every name in @@methods paired with the value obtained
# by sending that name to self, space separated, inside "#<SearchHit ...>".
def inspect
  pairs = @@methods.map {|name| "#{name}:#{send(name)}" }
  "#<SearchHit #{pairs.join(" ")}>"
end
|
878
1513
|
|
879
1514
|
# requires Params object and full sequence (with heads and tails)
|
@@ -924,3 +1559,65 @@ def inspect
|
|
924
1559
|
|
925
1560
|
end
|
926
1561
|
|
1562
|
+
# Positions and masses of modifications
class SpecID::Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  ## Should be something like this:
  # <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
  #   <mod_aminoacid_mass position=" " mass=" "/>
  # </modification_info>

  # Mass of modified N terminus
  attr_accessor :mod_nterm_mass
  # Mass of modified C terminus
  attr_accessor :mod_cterm_mass
  # Peptide sequence (with indicated modifications)  I'm assuming that the
  # native sequest indicators are OK here
  attr_accessor :modified_peptide

  # this should be an array of arrays: [[position, modified_mass], ...]
  # position ranges from 1 to peptide length
  attr_accessor :mod_aminoacid_mass_array

  # hash:: optional attribute values, handed to instance_var_set_from_hash.
  #        Skipped when omitted, matching the guard used by the
  #        AAModification and TerminalModification constructors.
  def initialize(hash=nil)
    instance_var_set_from_hash(hash) if hash
  end

  # Builds a modification_info element: one attribute for each of
  # mod_nterm_mass, mod_cterm_mass and modified_peptide that is set, and one
  # mod_aminoacid_mass child per entry of mod_aminoacid_mass_array.
  # Will escape any xml special chars in modified_peptide
  def to_pepxml
    ## Collect the modifications:
    mod_strings = []
    if @mod_aminoacid_mass_array
      mod_strings = @mod_aminoacid_mass_array.map do |ar|
        "position=\"#{ar[0]}\" mass=\"#{ar[1]}\""
      end
    end
    ## Create the attribute string:
    att_parts = []
    if @mod_nterm_mass
      att_parts << "mod_nterm_mass=\"#{@mod_nterm_mass}\""
    end
    if @mod_cterm_mass
      att_parts << "mod_cterm_mass=\"#{@mod_cterm_mass}\""
    end
    if @modified_peptide
      att_parts << "modified_peptide=\"#{escape_special_chars(@modified_peptide)}\""
    end
    element_xml_and_att_string('modification_info', att_parts.join(" ")) do
      mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
    end
  end

  ## Example of the expected output:
  # <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
  #   <mod_aminoacid_mass position="2" mass="545.7160"/>
  #   <mod_aminoacid_mass position="3" mass="147.1926"/>
  # </modification_info>

end
|
1623
|
+
|