ms-sequest 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1458 @@
1
+
2
+ require 'sample_enzyme'
3
+ require 'ms/parser/mzxml'
4
+ require 'hash_by'
5
+ require 'set_from_hash'
6
+ require 'spec_id/bioworks'
7
+ require 'instance_var_set_from_hash'
8
+ require 'ms/msrun'
9
+ require 'spec_id/srf'
10
+ require 'spec_id/sequest/params'
11
+ require 'fileutils'
12
+
13
class Numeric
  # Renders the number as a string with an explicit sign: zero and positive
  # values are prefixed with '+', negative values keep their own '-'.
  def to_plus_minus_string
    return to_s if self < 0
    "+#{self}"
  end
end
23
+
24
+
25
# Forward declarations so the nested Sequest::PepXML::* classes below can be
# opened with the compact `class A::B::C` form.
module Sequest
end

class Sequest::PepXML
end
27
+
28
# The outermost <msms_pipeline_analysis> element of a pepXML document.
# Holds the document-level attributes and a single msms_run_summary.
class Sequest::PepXML::MSMSPipelineAnalysis
  include SpecIDXML

  # Version 1.2.3
  attr_writer :date
  # xsi_schemaLocation= is the older spelling of the writer; it is kept so
  # existing callers keep working.
  attr_writer :xmlns, :xmlns_xsi, :xsi_schemaLocation
  attr_accessor :summary_xml
  # Version 2.3.4
  attr_writer :xsi_schema_location
  attr_accessor :pepxml_version
  attr_accessor :msms_run_summary

  # hash:: optional attribute name => value pairs, applied with set_from_hash
  # If a block is given, msms_run_summary is set to the block's return value.
  def initialize(hash=nil)
    @xmlns = nil
    @xmlns_xsi = nil
    @xsi_schema_location = nil
    set_from_hash(hash) if hash
    @msms_run_summary = yield if block_given?
  end

  # Returns the date string that was set, otherwise (for pepxml version 18)
  # the current local time formatted as YYYY-MM-DDTHH:MM:SS.  The fields are
  # zero padded so the value is a well-formed xs:dateTime.  Returns nil for
  # any other version when no date was set.
  def date
    return @date if @date
    case Sequest::PepXML.pepxml_version
    when 18 then Time.now.strftime('%Y-%m-%dT%H:%M:%S')
    end
  end

  # The xmlns attribute (falls back to the pepXML namespace).
  def xmlns
    @xmlns || "http://regis-web.systemsbiology.net/pepXML"
  end

  # The xmlns:xsi attribute (falls back to the XMLSchema-instance namespace).
  def xmlns_xsi
    @xmlns_xsi || "http://www.w3.org/2001/XMLSchema-instance"
  end

  # The xsi:schemaLocation attribute (falls back to the v18 schema location).
  def xsi_schema_location
    @xsi_schema_location || "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd"
  end

  # Serializes this element and the nested msms_run_summary.
  # Aborts for any pepxml version other than 18.
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
        @msms_run_summary.to_pepxml
      end
    else
      abort "Don't know how to deal with version: #{Sequest::PepXML.pepxml_version}"
    end
  end

end
92
+
93
# The <msms_run_summary> element: instrument metadata for one run plus the
# sample enzyme, the search summary and the list of spectrum queries.
class Sequest::PepXML::MSMSRunSummary
  include SpecID
  include SpecIDXML

  # The name of the pep xml file (without extension) (but this is a long
  # filename!!!)
  attr_accessor :base_name
  # The name of the mass spec manufacturer
  attr_accessor :ms_manufacturer
  attr_accessor :ms_model
  attr_accessor :ms_mass_analyzer
  attr_accessor :ms_detector
  attr_accessor :raw_data_type
  attr_accessor :raw_data
  attr_accessor :ms_ionization
  # the version of TPP you are using (determines xml output)
  attr_accessor :pepxml_version

  # A SampleEnzyme object (responds to: name, cut, no_cut, sense)
  attr_accessor :sample_enzyme
  # A SearchSummary object
  attr_accessor :search_summary
  # An array of spectrum_queries
  attr_accessor :spectrum_queries

  # hash:: optional name => value pairs, applied with
  #        instance_var_set_from_hash
  # If a block is given, spectrum_queries (should be an array of spectrum
  # queries) is set to the return value of the block.
  def initialize(hash=nil)
    @spectrum_queries = []
    instance_var_set_from_hash(hash) if hash
    @spectrum_queries = yield if block_given?
  end

  # Serializes the run summary with its enzyme, search summary and spectrum
  # queries.  Only version 18 is handled; returns nil for any other version.
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml_and_att_string(:msms_run_summary, "base_name=\"#{base_name}\" msManufacturer=\"#{ms_manufacturer}\" msModel=\"#{ms_model}\" msIonization=\"#{ms_ionization}\" msMassAnalyzer=\"#{ms_mass_analyzer}\" msDetector=\"#{ms_detector}\" raw_data_type=\"#{raw_data_type}\" raw_data=\"#{raw_data}\"") do
        sample_enzyme.to_pepxml +
          search_summary.to_pepxml +
          spectrum_queries.map {|sq| sq.to_pepxml }.join
      end
    end
  end

  # The class used for search hits belonging to this run summary.
  def search_hit_class
    Sequest::PepXML::SearchHit
  end

  # Builds a new run summary from an msms_run_summary xml node.
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Copies the run-level attributes from the node (indexed by attribute
  # name) into this object and returns self.
  # peps correspond to search_results
  def from_pepxml_node(node)
    @base_name = node['base_name']
    @ms_manufacturer = node['msManufacturer']
    @ms_model = node['msModel']
    # msIonization belongs in @ms_ionization; it used to overwrite the
    # manufacturer read two lines above.
    @ms_ionization = node['msIonization']
    @ms_mass_analyzer = node['msMassAnalyzer']
    @ms_detector = node['msDetector']
    @raw_data_type = node['raw_data_type']
    @raw_data = node['raw_data']
    self
  end
end
161
+
162
+
163
+
164
+ class Sequest::PepXML
165
+ include SpecIDXML
166
+
167
+ ## CREATE a default version for the entire class
168
+ class << self
169
+ attr_accessor :pepxml_version
170
+ end
171
+ DEF_VERSION = 18
172
+ self.pepxml_version = DEF_VERSION # default version
173
+
174
+ attr_accessor :pepxml_version, :msms_pipeline_analysis
175
+ ## the full path name (no extension)
176
+ attr_accessor :base_name
177
+ attr_accessor :h_plus
178
+ attr_accessor :avg_parent
179
+
180
+ #attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
181
+
182
# The spectrum queries held by the run summary nested inside the pipeline
# analysis element.
def spectrum_queries
  run_summary = msms_pipeline_analysis.msms_run_summary
  run_summary.spectrum_queries
end
186
+
187
# pepxml_version::      stored on the class (it is a class-wide setting, so
#                       creating an object changes it for every instance)
# sequest_params_obj::  optional sequest params object; when given it is kept
#                       in @params (read by precursor_mass_type and
#                       fragment_mass_type) and set_mono_or_avg is called
#                       with it
# If a block is given, msms_pipeline_analysis is set to the block's return
# value and base_name is taken from its msms_run_summary.
def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
  self.class.pepxml_version = pepxml_version
  if sequest_params_obj
    # previously never stored, which left @params nil for the
    # *_mass_type readers
    @params = sequest_params_obj
    set_mono_or_avg(sequest_params_obj)
  end
  if block_given?
    @msms_pipeline_analysis = yield
    @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
  end
end
199
+
200
# Derives @avg_parent and @h_plus from the params object: a
# precursor_mass_type of "monoisotopic" selects SpecID::MONO[:h_plus], any
# other value selects SpecID::AVG[:h_plus].
def set_mono_or_avg(sequest_params_obj)
  monoisotopic = ("monoisotopic" == sequest_params_obj.precursor_mass_type)
  @avg_parent = !monoisotopic
  @h_plus = (@avg_parent ? SpecID::AVG : SpecID::MONO)[:h_plus]
end
212
+
213
# The current time rendered with Time#to_s.
def date
  current_time = Time.now
  current_time.to_s
end
216
+
217
# The XML declaration line, terminated by a newline.
def xml_version
  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
end
220
+
221
# DOCTYPE line (with trailing newline) for pepxml_version == 0 documents.
def doctype
  "<!DOCTYPE msms_pipeline_analysis SYSTEM \"/usr/bin/msms_analysis3.dtd\">\n"
end
225
+
226
# The xml-stylesheet processing instruction for version 18 documents;
# nil for any other class-level pepxml_version.
def style_sheet
  return nil unless self.class.pepxml_version == 18
  '<?xml-stylesheet type="text/xsl" href="/tools/bin/TPP/tpp/schema/pepXML_std.xsl"?>'
end
232
+
233
# Document preamble for version 18: the XML declaration followed by the
# stylesheet instruction.  nil for any other class-level pepxml_version.
def header
  return nil unless self.class.pepxml_version == 18
  xml_version + style_sheet
end
238
+
239
# Groups the peptides by aaseq and, for every peptide in a group, stores the
# number of distinct proteins seen across the whole group (_num_prots) and
# the first of those proteins (_first_prot).  These attributes live on the
# pep objects themselves for memory reasons.
def self._prot_num_and_first_prot_by_pep(pep_array)
  pep_array.hash_by(:aaseq).each do |_aaseq, peps_sharing_seq|
    distinct_prots = peps_sharing_seq.inject([]) { |acc, pep| acc.push(*(pep.prots)) }.uniq
    prot_count = distinct_prots.size
    leading_prot = distinct_prots.first
    peps_sharing_seq.each do |pep|
      pep._num_prots = prot_count
      pep._first_prot = leading_prot
    end
  end
end
254
+
255
+
256
+ Default_Options = {
257
+ :out_path => '.',
258
+ #:backup_db_path => '.',
259
+ # a PepXML option
260
+ :pepxml_version => DEF_VERSION,
261
+ ## MSMSRunSummary options:
262
+ # string must be recognized in sample_enzyme.rb
263
+ # or create your own SampleEnzyme object
264
+ :ms_manufacturer => 'ThermoFinnigan',
265
+ :ms_model => 'LCQ Deca XP Plus',
266
+ :ms_ionization => 'ESI',
267
+ :ms_mass_analyzer => 'Ion Trap',
268
+ :ms_detector => 'UNKNOWN',
269
+ :ms_data => '.', # path to ms data files (raw or mzxml)
270
+ :raw_data_type => "raw",
271
+ :raw_data => ".mzXML", ## even if you don't have it?
272
+ ## SearchSummary options:
273
+ :out_data_type => "out", ## may be srf?? don't think pepxml recognizes this yet
274
+ :out_data => ".tgz", ## may be srf??
275
+ :copy_mzxml => false, # copy the mzxml file to the out_path (create it if necessary)
276
+ :print => false, # print the objects to file
277
+ }
278
+
279
# Builds a PepXML object from one srf file.
#
# srf::  an SRF object or the name of an srf file
# opts:: merged over Sequest::PepXML::Default_Options (so :out_path is '.'
#        unless given).  Besides the defaults, these keys are read:
#        :backup_db_path, :sample_enzyme, :all_hits, :deltacn_orig
#
# :ms_model is always taken from the srf header (ignoring the defaults and
# anything passed in), and :ms_mass_analyzer is overridden for LTQ Orbitrap
# and LCQ Deca XP models.
#
# Only spectra with at least one hit produce a spectrum query; the top hit
# is reported unless opts[:all_hits] is set.  With :copy_mzxml set, the
# process aborts if no mzXML file can be found for the run.
def self.new_from_srf(srf, opts={})
  opts = Default_Options.merge(opts)

  ## read the srf file
  srf = SRF.new(srf) if srf.is_a? String

  ## set the outpath
  out_path = opts.delete(:out_path)

  params = srf.params

  ## check to see if we need backup_db
  backup_db_path = opts.delete(:backup_db_path)
  if !File.exist?(params.database) && backup_db_path
    params.database_path = backup_db_path
  end

  #######################################################################
  # PREPARE THE OPTIONS:
  #######################################################################
  ## remove items from the options hash that don't belong to the
  ## MSMSRunSummary (the remaining opts are handed to it below)
  ppxml_version = opts.delete(:pepxml_version)
  out_data_type = opts.delete(:out_data_type)
  out_data = opts.delete(:out_data)

  ## Extract meta info from srf
  bn_noext = base_name_noext(srf.header.raw_filename)
  opts[:ms_model] = srf.header.model
  case opts[:ms_model]
  when /Orbitrap/
    opts[:ms_mass_analyzer] = 'Orbitrap'
  when /LCQ Deca XP/
    opts[:ms_mass_analyzer] = 'Ion Trap'
  end

  ## Create the base name
  full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
  opts[:base_name] = full_base_name_no_ext

  ## Create the search summary:
  search_summary_options = {
    :search_database => Sequest::PepXML::SearchDatabase.new(params),
    :base_name => full_base_name_no_ext,
    :out_data_type => out_data_type,
    :out_data => out_data
  }
  modifications_string = srf.header.modifications
  search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)

  # the sample enzyme comes from the options if given, else from params
  sample_enzyme_obj = opts[:sample_enzyme] || params.sample_enzyme
  opts[:sample_enzyme] = sample_enzyme_obj

  ## Create the pepxml obj and top level objects
  pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
  pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'})
  pepxml_obj.msms_pipeline_analysis = pipeline
  pipeline.msms_run_summary = Sequest::PepXML::MSMSRunSummary.new(opts)
  pipeline.msms_run_summary.search_summary = search_summary
  modifications_obj = search_summary.modifications

  ## name some common variables we'll need
  h_plus = pepxml_obj.h_plus

  ## COPY MZXML FILES IF NECESSARY
  if opts[:copy_mzxml]
    mzxml_pathname_noext = File.join(opts[:ms_data], bn_noext)
    to_copy = MS::Converter::MzXML.file_to_mzxml(mzxml_pathname_noext)
    unless to_copy
      # a failure: report on stderr and exit non-zero
      abort "Couldn't find mzXML file with base: #{mzxml_pathname_noext}\n" \
            "Perhaps you need to specify the location of the raw data\n" \
            "or need an mzXML converter (readw or t2x)"
    end
    FileUtils.cp to_copy, out_path
  end

  #######################################################################
  # CREATE the spectrum_queries_arr
  #######################################################################
  srf_index = srf.index
  out_files = srf.out_files
  spectrum_queries_arr = []

  # hit index 20 holds deltacn_orig_updated, 19 the (prophet style) deltacn
  deltacn_orig = opts[:deltacn_orig]
  deltacn_index = deltacn_orig ? 20 : 19

  srf.dta_files.each_with_index do |dta_file,dta_i|
    out_file = out_files[dta_i]
    next if out_file.num_hits == 0

    precursor_neutral_mass = dta_file.mh - h_plus

    (start_scan, end_scan, charge) = srf_index[dta_i]
    sq_hash = {
      :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
      :start_scan => start_scan,
      :end_scan => end_scan,
      :precursor_neutral_mass => precursor_neutral_mass,
      :assumed_charge => charge.to_i,
      :pepxml_version => ppxml_version,
      # 1-based position among the spectra that have hits
      :index => spectrum_queries_arr.size + 1,
    }

    spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)

    hits = out_file.hits

    # all hits, or the top hit only
    num_hits_to_report = opts[:all_hits] ? out_file.num_hits : 1

    search_hits = (0...num_hits_to_report).map do |hit_i|
      hit = hits[hit_i]
      # under the modified deltacn schema (like bioworks)
      # Get proper deltacn and deltacnstar
      # under new srf, deltacn is already corrected for what prophet wants,
      # deltacn_orig_updated is how to access the old one
      # Prophet deltacn is not the same as the native Sequest deltacn
      # It is the deltacn of the second best hit!

      ## mass calculations:
      calc_neutral_pep_mass = hit[0] - h_plus

      sequence = hit.sequence

      # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
      ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
      (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
      # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated

      sh_hash = {
        :hit_rank => hit_i+1,
        :peptide => pepseq,
        :peptide_prev_aa => prevaa,
        :peptide_next_aa => nextaa,
        :protein => hit[10].first.reference.split(" ").first,
        :num_tot_proteins => hit[10].size,
        :num_matched_ions => hit[7],
        :tot_num_ions => hit[8],
        :calc_neutral_pep_mass => calc_neutral_pep_mass,
        :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
        :num_tol_term => sample_enzyme_obj.num_tol_term(sequence),
        :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
        :is_rejected => 0,
        # These are search score attributes:
        :xcorr => hit[3],
        :deltacn => hit[deltacn_index],
        :spscore => hit[2],
        :sprank => hit[6],
        :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
      }
      unless deltacn_orig
        # no next hit? then its deltacnstar == 1
        sh_hash[:deltacnstar] = hits[hit_i+1].nil? ? '1' : '0'
      end
      Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
    end

    search_result = Sequest::PepXML::SearchResult.new
    search_result.search_hits = search_hits
    spectrum_query.search_results = [search_result]
    spectrum_queries_arr << spectrum_query
  end

  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
  pepxml_obj.base_name = pipeline.msms_run_summary.base_name

  pepxml_obj
end
481
+
482
# Entry point for an .srg or bioworks.xml file: returns an array of PepXML
# objects, one per run.
# opts are merged over Default_Options.  opts[:out_path] is created when it
# does not exist and must be a directory.  A bioworks xml file additionally
# needs opts[:params].  With :print => true each object is written to
# "<base_name>.xml".
# NOTES: num_tol_term and num_missing_cleavages are both calculated from the
# sample_enzyme.  Thus, a No_Enzyme search may still pass in a
# :sample_enzyme option to get these calculated.
def self.set_from_bioworks(bioworks_file, opts={})
  opts = Default_Options.merge(opts)

  ## Create the out_path directory if necessary
  FileUtils.mkpath(opts[:out_path]) unless File.exist? opts[:out_path]
  abort "#{opts[:out_path]} must be a directory!" unless File.directory? opts[:out_path]

  spec_id = SpecID.new(bioworks_file)
  pepxml_objs =
    if spec_id.is_a? Bioworks
      abort("must have opts[:params] set!") unless opts[:params]
      set_from_bioworks_xml(bioworks_file, opts[:params], opts)
    elsif spec_id.is_a? SRFGroup
      spec_id.srfs.map { |srf| new_from_srf(srf, opts) }
    else
      abort "invalid object"
    end

  if opts[:print]
    pepxml_objs.each { |pepxml| pepxml.to_pepxml(pepxml.base_name + ".xml") }
  end
  pepxml_objs
end
519
+
520
+
521
# Takes bioworks 3.2/3.3 xml output (with no filters)
# Returns a list of PepXML objects (one per base_name found among the
# peptides), sorted by summary_xml.
#
# bioworks:: a Bioworks object or the name of a bioworks.xml exported
#            multi-consensus view file
# params::   a Sequest::Params object or the name of a sequest.params file
# opts::     merged over Default_Options; :backup_db_path and :sample_enzyme
#            are also read
#
# pepxml_version = 0 for tpp 1.2.3
# pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
# Only version 18 is currently supported; anything else aborts.
def self.set_from_bioworks_xml(bioworks, params, opts={})
  opts = Default_Options.merge(opts)
  pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)

  # guards against an explicit :out_path => nil
  out_path = '.' unless out_path

  supported_versions = [18]

  unless supported_versions.include?(opts[:pepxml_version])
    abort "pepxml_version: #{pepxml_version} not currently supported.  Current support is for versions #{supported_versions.join(', ')}"
  end

  ## Turn params and bioworks_obj into objects if necessary:
  # Params:
  unless params.class == Sequest::Params
    if params.class == String
      params = Sequest::Params.new(params)
    else
      abort "Don't recognize #{params} as object or string!"
    end
  end
  # Bioworks:
  unless bioworks.class == Bioworks
    if bioworks.class == String
      bioworks = SpecID.new(bioworks)
    else
      abort "Don't recognize #{bioworks} as object or string!"
    end
  end

  sample_enzyme_obj = opts[:sample_enzyme] || params.sample_enzyme

  ## check to see if we need backup_db
  backup_db_path = opts.delete(:backup_db_path)
  if !File.exist?(params.database) && backup_db_path
    params.database_path = backup_db_path
  end

  modifications_string = bioworks.modifications
  expanded_out_path = File.expand_path(out_path)

  # Hash by the filenames to split into filenames (this very big block
  # builds one PepXML object per base_name):
  pepxml_objects = bioworks.peps.hash_by(:base_name).map do |base_name, pep_arr|

    search_summary = Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => Sequest::PepXML::SearchDatabase.new(params), :out_data_type => out_data_type, :out_data => out_data})
    modifications_obj = search_summary.modifications

    pepxml_obj = Sequest::PepXML.new(pepxml_version, params)
    full_base_name_no_ext = self.make_base_name(expanded_out_path, base_name)

    # pepxml_version was checked against supported_versions above, so this
    # branch always runs and pipeline is always set
    case pepxml_version
    when 18
      pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'})
      msms_run_summary = Sequest::PepXML::MSMSRunSummary.new({
        :base_name => full_base_name_no_ext,
        :ms_manufacturer => ms_manufacturer,
        :ms_model => ms_model,
        :ms_ionization => ms_ionization,
        :ms_mass_analyzer => ms_mass_analyzer,
        :ms_detector => ms_detector,
        :raw_data_type => raw_data_type,
        :raw_data => raw_data,
        :sample_enzyme => sample_enzyme_obj, # usually, params.sample_enzyme,
        :search_summary => search_summary,
      })
      pipeline.msms_run_summary = msms_run_summary
      pepxml_obj.msms_pipeline_analysis = pipeline
      search_summary.base_name = full_base_name_no_ext
      pepxml_obj.base_name = full_base_name_no_ext
    end

    # Create a hash by pep object containing num_tot_proteins
    # This is only valid if all hits are present (no previous thresholding)
    # Since out2summary only acts on one folder at a time,
    # we should only do it for one folder at a time! (that's why we do this
    # here instead of globally)
    self._prot_num_and_first_prot_by_pep(pep_arr)

    prec_mz_arr = nil
    # NOTE(review): the dots in these patterns are unescaped, so they match
    # any character
    case x = bioworks.version
    when /3.2/
      calc_prec_by = :prec_mz_arr
      # get the precursor_mz array for this filename
      mzxml_file = MS::Converter::MzXML.file_to_mzxml(File.join(ms_data, base_name))
      prec_mz_arr = MS::MSRun.precursor_mz_by_scan_num(mzxml_file)
    when /3.3/
      calc_prec_by = :deltamass
    else
      abort "invalid BioworksBrowser version: #{x}"
    end

    # unlike new_from_srf, a missing mzXML file is silently skipped here
    if opts[:copy_mzxml]
      to_copy = MS::Converter::MzXML.file_to_mzxml(File.join(ms_data, base_name))
      FileUtils.cp to_copy, out_path if to_copy
    end

    # one spectrum query per (first_scan, last_scan, charge) group
    spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).map do |_scan_key,arr|

      # Sort by xcorr and take the top hit (to mimick out2summary):
      arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
      top_pep = arr.pop
      second_hit = arr.last  # needed for deltacnstar

      case calc_prec_by
      when :prec_mz_arr
        precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge, pepxml_obj.avg_parent)
      when :deltamass
        precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
      end

      calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)

      # deltacn & star:
      # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
      if second_hit
        #top_pep.deltacn = second_hit.deltacn
        deltacnstar = '0'
      else
        top_pep.deltacn = '1.0'
        deltacnstar = '1'
      end

      # Create the nested structure of queries{results{hits}}
      spec_query = Sequest::PepXML::SpectrumQuery.new({
        :spectrum => [top_pep.base_name, top_pep.first_scan, top_pep.last_scan, top_pep.charge].join("."),
        :start_scan => top_pep.first_scan,
        :end_scan => top_pep.last_scan,
        :precursor_neutral_mass => precursor_neutral_mass,
        :assumed_charge => top_pep.charge,
        :pepxml_version => pepxml_version,
      })

      search_result = Sequest::PepXML::SearchResult.new
      ## Calculate some interdependent values;
      # NOTE: the bioworks mass is really M+H if two or more scans went
      # into the search_hit; calc_neutral_pep_mass is simply the avg of
      # precursor masses adjusted to be neutral
      (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(top_pep.sequence)
      (num_matched_ions, tot_num_ions) = Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
      search_hit = Sequest::PepXML::SearchHit.new({
        :hit_rank => 1,
        :peptide => pepseq,
        :peptide_prev_aa => prevaa,
        :peptide_next_aa => nextaa,
        :protein => top_pep._first_prot.reference.split(" ").first,
        :num_tot_proteins => top_pep._num_prots,
        :num_matched_ions => num_matched_ions,
        :tot_num_ions => tot_num_ions,
        :calc_neutral_pep_mass => calc_neutral_pep_mass,
        :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
        :num_tol_term => sample_enzyme_obj.num_tol_term(top_pep.sequence),
        :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
        :is_rejected => 0,
        # These are search score attributes:
        :xcorr => top_pep.xcorr,
        :deltacn => top_pep.deltacn,
        :deltacnstar => deltacnstar,
        :spscore => top_pep.sp,
        :sprank => top_pep.rsp,
        :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
        :spectrum_query => spec_query,
      })
      search_result.search_hits = [search_hit] # there can be multiple search hits
      spec_query.search_results = [search_result] # can be multiple search_results
      spec_query
    end

    # create an index by spectrum as results end up typically in out2summary
    # (I really dislike this order, however)
    spectrum_queries_ar = spectrum_queries_ar.sort_by {|query| query.spectrum }
    spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
    pipeline.msms_run_summary.spectrum_queries = spectrum_queries_ar
    pepxml_obj
  end ## collects pepxml_objs
  # summary_xml is the short basename of the pepxml file (e.g., "020.xml")
  pepxml_objects.sort_by {|obj| obj.summary_xml }
end
730
+
731
# base_name with ".xml" appended.
def summary_xml
  extension = ".xml"
  base_name + extension
end
734
+
735
# Delegates to the params object held in @params.
def precursor_mass_type
  sequest_params = @params
  sequest_params.precursor_mass_type
end
738
+
739
# Delegates to the params object held in @params.
def fragment_mass_type
  sequest_params = @params
  sequest_params.fragment_mass_type
end
742
+
743
# Joins path and the basename of filename using the separator the path
# itself appears to use: a backslash when splitting the path on backslashes
# yields more pieces than splitting on '/', otherwise '/'.  No separator is
# added when the path already ends with it.
def self.make_base_name(path, filename)
  backslash_style = path.split('/').size < path.split("\\").size
  sep = backslash_style ? "\\" : '/'
  prefix = (path[-1, 1] == sep) ? path : path + sep
  prefix + File.basename(filename)
end
755
+
756
# Returns the whole document: header followed by the pipeline analysis xml.
# When file is given, the same string is also written to that path.
def to_pepxml(file=nil)
  xml = header
  xml << @msms_pipeline_analysis.to_pepxml
  File.open(file, "w") { |fh| fh.print xml } if file
  xml
end
766
+
767
# given any kind of filename (from windows or whatever)
# returns the base of the filename with no file extension
# Backslashes are treated as path separators.  The argument is left
# untouched (it may be frozen); it used to be rewritten in place.
def self.base_name_noext(file)
  normalized = file.gsub("\\", '/')
  # the character class also matches '.', so everything from the first dot
  # of the basename onward is dropped
  File.basename(normalized).sub(/\.[\w^\.]+$/, '')
end
773
+
774
+
775
+ end # PepXML
776
+
777
+
778
# A <search_result> element: a plain container for search hits.
class Sequest::PepXML::SearchResult
  include SpecIDXML
  # an array of search_hits
  attr_accessor :search_hits

  # search_hits:: the hits to hold (a new empty array by default)
  # If a block is given, search_hits is set to the block's return value
  # instead, mirroring the other PepXML element classes.
  def initialize(search_hits = [])
    @search_hits = block_given? ? yield : search_hits
  end

  # Serializes the element with every search hit nested inside it.
  def to_pepxml
    element_xml_no_atts(:search_result) do
      @search_hits.map {|sh| sh.to_pepxml }.join
    end
  end

end
795
+
796
# The <search_summary> element.  Methods this class does not define are
# forwarded to the sequest params object (see method_missing).
class Sequest::PepXML::SearchSummary
  include SpecIDXML

  attr_accessor :params
  attr_accessor :base_name
  attr_accessor :out_data_type
  attr_accessor :out_data
  # by default, "1"
  attr_accessor :search_id
  attr_accessor :modifications
  # A SearchDatabase object (responds to :local_path and :type)
  attr_accessor :search_database

  # prms::                 a sequest params object; when given it is stored
  #                        and a Modifications object is built from it
  # modifications_string:: -> See Modifications
  # args::                 a hash of attributes, applied with set_from_hash
  def initialize(prms=nil, modifications_string='', args=nil)
    @search_id = "1"
    if prms
      @params = prms
      @modifications = Sequest::PepXML::Modifications.new(prms, modifications_string)
    end
    set_from_hash(args) if args
  end

  # Forwards unknown messages to @params; answers nil when no params object
  # is set.
  def method_missing(symbol, *args)
    return nil unless @params
    @params.send(symbol, *args)
  end

  # Serializes the summary: search database, the enzymatic search constraint
  # (left out when the params enzyme starts with No_Enzyme), modifications
  # and parameters.
  def to_pepxml
    element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
      database_xml = search_database.to_pepxml
      constraint_xml =
        if @params.enzyme =~ /^No_Enzyme/
          ''
        else
          short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini])
        end
      database_xml + constraint_xml + @modifications.to_pepxml + Sequest::PepXML::Parameters.new(@params).to_pepxml
    end
  end

  # Builds a new summary from a search_summary xml node.
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Not implemented yet: always raises NotImplementedError.
  def from_pepxml_node(node)
    raise NotImplementedError, "right now we just have the xml node at your disposal"
  end

end
845
+
846
# Emits the <parameter> elements for a sequest params object.
class Sequest::PepXML::Parameters
  include SpecIDXML

  attr_accessor :params

  # obj:: the sequest params object to serialize
  def initialize(obj=nil)
    @params = obj
  end

  # (used to be called pepxml_parameters)
  # Passes the params object and the names of all its opts (sorted, as
  # strings) to params_xml, which returns xml in the form
  # <parameter name="#{method_name}" value="#{method_value}"/>
  def to_pepxml
    sorted_names = @params.opts.sort.map { |name, _value| name.to_s }
    params_xml(@params, *sorted_names)
  end
end
863
+
864
# Collects the static and variable (differential) modifications described by
# a sequest params object plus a modification symbols string, and renders
# them as pepXML.
class Sequest::PepXML::Modifications
  include SpecIDXML

  # sequest params object
  attr_accessor :params
  # array holding AAModifications
  attr_accessor :aa_mods
  # array holding TerminalModifications
  attr_accessor :term_mods
  # a hash of all differential modifications present by aa_one_letter_symbol
  # and special_symbol. This is NOT the mass difference but the total mass {
  # 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
  # the amino acid sequence, they are given the *differential* mass. The
  # termini are given the special symbol as in sequest e.g.
  #   '[' => 12.22  # cterminus
  #   ']' => 14.55  # nterminus
  attr_accessor :masses_by_diff_mod
  # The accessor was historically declared as masses_by_diff_mod_hash while
  # every method populated/read @masses_by_diff_mod, so the declared accessor
  # always returned nil. Both names now expose the same hash.
  alias_method :masses_by_diff_mod_hash, :masses_by_diff_mod
  alias_method :masses_by_diff_mod_hash=, :masses_by_diff_mod=
  # a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
  # (or [:ct, diff] / [:nt, diff] for the termini);
  # values are the special_symbols
  attr_accessor :mod_symbols_hash

  # The modification symbols string looks like this:
  # (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
  # ct is cterminal peptide (differential)
  # nt is nterminal peptide (differential)
  # the C is just cysteine
  # If params is given, calls set_modifications (which also fills the
  # masses_by_diff_mod and mod_symbols hashes); otherwise nothing is set up.
  def initialize(params=nil, modification_symbols_string='')
    @params = params
    if @params
      set_modifications(params, modification_symbols_string)
    end
  end

  # Sets @masses_by_diff_mod and @mod_symbols_hash from the modification
  # symbols string. Both hashes are reset to empty first; a nil or empty
  # string leaves them empty and returns nil.
  def set_hashes(modification_symbols_string)
    @mod_symbols_hash = {}
    @masses_by_diff_mod = {}
    if modification_symbols_string.nil? || modification_symbols_string == ''
      return nil
    end
    table = @params.mass_table
    modification_symbols_string.split(/\)\s+\(/).each do |mod|
      next unless mod =~ /\(?(\w+)(.) (.[\d\.]+)\)?/
      residues = $1
      symbol_string = $2.dup
      mass_diff = $3.to_f
      if residues == 'ct' || residues == 'nt'
        # termini: keyed by the bare symbol, value is the differential mass
        @masses_by_diff_mod[symbol_string] = mass_diff
        @mod_symbols_hash[[residues.to_sym, mass_diff]] = symbol_string
      else
        # residues: one entry per amino acid letter, value is the TOTAL mass
        residues.split('').each do |aa|
          aa_as_sym = aa.to_sym
          @masses_by_diff_mod[aa+symbol_string] = mass_diff + table[aa_as_sym]
          @mod_symbols_hash[[aa_as_sym, mass_diff]] = symbol_string
        end
      end
    end
  end

  # given a bare peptide (no end pieces) returns a ModificationInfo object
  # e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'
  # if there are no modifications, returns nil
  # The peptide argument is never mutated.
  def modification_info(peptide)
    hsh = @masses_by_diff_mod
    # no hashes set up yet (no params given) or no differential mods at all
    if hsh.nil? || hsh.size == 0
      return nil
    end
    hash = {}
    hash[:modified_peptide] = peptide.dup
    table = @params.mass_table
    h = table[:h] # this? or h_plus ??
    oh = table[:o] + h
    ## only the termini can match a single char
    if hsh.key? peptide[0,1]
      # AA + H + differential_mod
      hash[:mod_nterm_mass] = table[peptide[1,1].to_sym] + h + hsh[peptide[0,1]]
      peptide = peptide[1...(peptide.size)]
    end
    if hsh.key? peptide[(peptide.size-1),1]
      # AA + OH + differential_mod
      hash[:mod_cterm_mass] = table[peptide[(peptide.size-2),1].to_sym] + oh + hsh[peptide[-1,1]]
      # drop only the trailing terminus symbol. (The previous slice!(0..-2)
      # followed by a second slice emptied the peptide, so residue mods were
      # lost, and it modified the caller's string in place.)
      peptide = peptide[0...(peptide.size-1)]
    end
    mod_array = []
    # positions are residue positions (1..peptide length), so modification
    # symbols that precede a residue must not be counted
    residue_position = 0
    (0...peptide.size).each do |i|
      residue_position += 1 if peptide[i,1] =~ /[A-Z]/
      pair = peptide[i,2]
      if hsh.key? pair
        mod_array << Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([ residue_position, hsh[pair] ])
      end
    end
    if mod_array.size > 0
      hash[:mod_aminoacid_masses] = mod_array
    end
    if hash.size > 1  # if there is more than just the modified peptide there
      Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
    else
      nil
    end
  end

  # returns an array of static mod objects and static terminal mod objects:
  # [array_of_AAModification, array_of_TerminalModification]
  def create_static_mods(params)

    ####################################
    ## static mods
    ####################################

    static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
    static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

    params.mods.each do |k,v|
      v_to_f = v.to_f
      next if v_to_f == 0.0
      # keys with a single letter between underscores name a residue;
      # every other non-zero entry is treated as a terminal mod
      if k =~ /add_(\w)_/
        static_mods << [$1.to_sym, v_to_f]
      else
        static_terminal_mods << [k, v_to_f]
      end
    end
    aa_hash = params.mass_table

    ## Create the static_mods objects
    static_mods.map! do |mod|
      hash = {
        :aminoacid => mod[0].to_s,
        :massdiff => mod[1],
        :mass => aa_hash[mod[0]] + mod[1],
        :variable => 'N',
        :binary => 'Y',
      }
      Sequest::PepXML::AAModification.new(hash)
    end

    ## Create the static_terminal_mods objects
    static_terminal_mods.map! do |mod|
      terminus = if mod[0] =~ /Cterm/ ; 'c'
                 else ; 'n' # only two possible termini
                 end
      protein_terminus = case mod[0]
                         when /Nterm_protein/ ; 'n'
                         when /Cterm_protein/ ; 'c'
                         else nil
                         end

      # create the hash
      hash = {
        :terminus => terminus,
        :massdiff => mod[1],
        :variable => 'N',
        :description => mod[0],
      }
      hash[:protein_terminus] = protein_terminus if protein_terminus
      Sequest::PepXML::TerminalModification.new(hash)
    end
    [static_mods, static_terminal_mods]
  end

  # 1. sets aa_mods and term_mods from a sequest params object
  # 2. sets @params
  # 3. sets @masses_by_diff_mod (and @mod_symbols_hash) via set_hashes
  def set_modifications(params, modification_symbols_string)
    @params = params

    set_hashes(modification_symbols_string)
    (static_mods, static_terminal_mods) = create_static_mods(params)

    aa_hash = params.mass_table
    #################################
    # Variable Mods:
    #################################
    # pairs of "<diff> <residues>"; strip (not just rstrip) so leading
    # whitespace cannot shift the pairing by introducing an empty first field
    arr = params.diff_search_options.strip.split(/\s+/)
    # [residues_string, diff.to_f]
    variable_mods = []
    (0...arr.size).step(2) do |i|
      # skip zero diffs and a dangling diff that has no residue list
      if arr[i].to_f != 0.0 && arr[i+1]
        variable_mods << [arr[i+1], arr[i].to_f]
      end
    end
    mod_objects = []
    variable_mods.each do |mod|
      mod[0].split('').each do |aa|
        hash = {
          :aminoacid => aa,
          :massdiff => mod[1],
          :mass => aa_hash[aa.to_sym] + mod[1],
          :variable => 'Y',
          :binary => 'N',
          :symbol => @mod_symbols_hash[[aa.to_sym, mod[1]]],
        }
        mod_objects << Sequest::PepXML::AAModification.new(hash)
      end
    end
    variable_mods = mod_objects
    #################################
    # TERMINAL Variable Mods:
    #################################
    # These are always peptide, not protein termini (for sequest)
    (nterm_diff, cterm_diff) = params.term_diff_search_options.strip.split(/\s+/).map {|v| v.to_f }
    # a missing value means "no differential mod on that terminus"
    nterm_diff ||= 0.0
    cterm_diff ||= 0.0

    to_add = []
    # the symbols hash is keyed by a two element array, e.g. [:nt, 14.2];
    # (hash[:nt, diff] passes two arguments to Hash#[] and raises)
    # massdiff is stored pre-formatted as a signed string here
    if nterm_diff != 0.0
      to_add << ['n', nterm_diff.to_plus_minus_string, @mod_symbols_hash[[:nt, nterm_diff]]]
    end
    if cterm_diff != 0.0
      to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[[:ct, cterm_diff]]]
    end

    variable_terminal_mods = to_add.map do |term, mssdiff, symb|
      hash = {
        :terminus => term,
        :massdiff => mssdiff,
        :variable => 'Y',
        :symbol => symb,
      }
      Sequest::PepXML::TerminalModification.new(hash)
    end

    #########################
    # COLLECT THEM
    #########################
    @aa_mods = static_mods + variable_mods
    @term_mods = static_terminal_mods + variable_terminal_mods
  end

  ## Generates the pepxml for static and differential amino acid mods based on
  ## sequest object. Returns an empty string when nothing has been set.
  def to_pepxml
    st = ''
    if @aa_mods
      st << @aa_mods.map {|v| v.to_pepxml }.join
    end
    if @term_mods
      st << @term_mods.map {|v| v.to_pepxml }.join
    end
    st
  end

end
1109
+
1110
+ # Modified aminoacid, static or variable
1111
+ # unless otherwise stated, all attributes can be anything
1112
# One amino acid modification (static or variable) rendered as a pepXML
# aminoacid_modification element.
class Sequest::PepXML::AAModification
  include SpecIDXML

  # The amino acid (one letter code)
  attr_accessor :aminoacid
  # Mass difference with respect to unmodified aminoacid. Either a Numeric
  # (formatted with Numeric#to_plus_minus_string on output) or a String that
  # already begins with + (nonnegative) or - [e.g. +1.05446 or -2.3342]
  attr_accessor :massdiff
  # Mass of modified aminoacid
  attr_accessor :mass
  # Y if both modified and unmodified aminoacid could be present in the
  # dataset, N if only modified aminoacid can be present
  attr_accessor :variable
  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus
  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol
  # Y if each peptide must have only modified or unmodified aminoacid, N if a
  # peptide may contain both modified and unmodified aminoacid
  attr_accessor :binary

  # hash:: optional attribute values keyed by attribute name
  def initialize(hash=nil)
    instance_var_set_from_hash(hash) if hash # can use unless there are weird methods
  end

  # Returns the aminoacid_modification element as a string.
  # A String massdiff is emitted verbatim (it is expected to carry its own
  # sign); anything else is sent to_plus_minus_string, which this file
  # defines only for Numeric.
  def to_pepxml
    signed_massdiff = massdiff.is_a?(String) ? massdiff : massdiff.to_plus_minus_string
    short_element_xml_and_att_string("aminoacid_modification", "aminoacid=\"#{aminoacid}\" massdiff=\"#{signed_massdiff}\" mass=\"#{mass}\" variable=\"#{variable}\" peptide_terminus=\"#{peptide_terminus}\" symbol=\"#{symbol}\" binary=\"#{binary}\"")
  end

end
1146
+
1147
+ # Modified terminus (N- or C-terminal), static or variable
1148
# One terminal modification rendered as a pepXML terminal_modification
# element.
class Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus
  # Mass difference with respect to unmodified terminus. Either a Numeric or
  # a String that already carries its sign (e.g. "+14.2")
  attr_accessor :massdiff
  # Mass of modified terminus
  attr_accessor :mass
  # Y if both modified and unmodified terminus could be present in the
  # dataset, N if only modified terminus can be present
  attr_accessor :variable
  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol
  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus
  attr_accessor :description

  # hash:: optional attribute values keyed by attribute name
  def initialize(hash=nil)
    instance_var_set_from_hash(hash) if hash # can use unless there are weird methods
  end

  # Returns the terminal_modification element as a string.
  # Sequest::PepXML::Modifications stores variable terminal mods with a
  # massdiff that is already a signed String, while static ones carry a
  # Float; a String is emitted verbatim and anything else goes through
  # to_plus_minus_string (defined in this file only for Numeric).
  def to_pepxml
    #short_element_xml_from_instance_vars("terminal_modification")
    signed_massdiff = massdiff.is_a?(String) ? massdiff : massdiff.to_plus_minus_string
    short_element_xml_and_att_string("terminal_modification", "terminus=\"#{terminus}\" massdiff=\"#{signed_massdiff}\" mass=\"#{mass}\" variable=\"#{variable}\" symbol=\"#{symbol}\" protein_terminus=\"#{protein_terminus}\" description=\"#{description}\"")
  end
end
1176
+
1177
+
1178
# Describes the searched sequence database and renders it as a pepXML
# search_database element.
class Sequest::PepXML::SearchDatabase
  include SpecIDXML
  # path of the database file
  attr_accessor :local_path
  # explicit sequence type; when unset, seq_type derives it from local_path
  attr_writer :seq_type

  # Takes a SequestParams object
  # Sets :local_path from the params object attr :database
  # args:: optional hash applied afterwards through set_from_hash
  def initialize(params=nil, args=nil)
    @seq_type = nil
    @local_path = params.database if params
    set_from_hash(args) if args
  end

  # Returns the explicitly assigned type if there is one, otherwise 'AA' when
  # local_path contains ".fasta". Any other path aborts the program.
  def seq_type
    return @seq_type if @seq_type
    return 'AA' if @local_path =~ /\.fasta/
    abort "Don't recognize type from your database local path: #{@local_path}"
  end

  # Returns the search_database element carrying local_path and type.
  def to_pepxml
    attributes = "local_path=\"#{local_path}\" type=\"#{seq_type}\""
    short_element_xml_and_att_string(:search_database, attributes)
  end

end
1208
+
1209
# Array-backed record; the attribute order below is the index order used by
# from_pepxml_node (0=spectrum 1=start_scan 2=end_scan
# 3=precursor_neutral_mass 4=index 5=assumed_charge 6=search_results
# 7=pepxml_version)
Sequest::PepXML::SpectrumQuery = Arrayclass.new(%w(spectrum start_scan end_scan precursor_neutral_mass index assumed_charge search_results pepxml_version))

class Sequest::PepXML::SpectrumQuery
  include SpecIDXML

  ############################################################
  # FOR PEPXML:
  ############################################################

  # Returns the spectrum_query element wrapping the pepxml of every search
  # result. Only pepxml version 18 is handled; any other version yields nil.
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml("spectrum_query", [:spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :assumed_charge, :index]) do
        search_results.collect { |sr| sr.to_pepxml }.join
      end
    end
  end

  # Creates a new object and fills it from the node (see the instance method)
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Fills spectrum, start_scan, end_scan, precursor_neutral_mass, index and
  # assumed_charge from the node's attributes (converted to int/float where
  # applicable). search_results and pepxml_version are left untouched.
  # returns self
  def from_pepxml_node(node)
    self[0] = node['spectrum']
    self[1] = node['start_scan'].to_i
    self[2] = node['end_scan'].to_i
    self[3] = node['precursor_neutral_mass'].to_f
    self[4] = node['index'].to_i
    self[5] = node['assumed_charge'].to_i
    self
  end

  # Returns the precursor_neutral based on the scans and an array indexed by
  # scan numbers.  first and last scan and charge should be integers.
  # This is the precursor_mz - h_plus!
  # by=:prec_mz_arr|:deltamass
  # if prec_mz_arr then the following arguments must be supplied:
  # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
  # m/z for each product scan, :charge = int
  # if deltamass then the following arguments must be supplied:
  # m_plus_h = float, deltamass = float
  # For both flavors, a final additional argument 'average_weights'
  # can be used.  If true (default), average weights will be used, if false,
  # monoisotopic weights (currently this is simply the mass of the proton)
  # Aborts the program when 'by' is not one of the two recognized symbols.
  def self.calc_precursor_neutral_mass(by, *args)
    case by
    when :prec_mz_arr
      (first_scan, last_scan, prec_mz_arr, charge, average_weights) = args
    when :deltamass
      (m_plus_h, deltamass, average_weights) = args
    end
    # Destructuring assigns nil when the optional trailing argument is left
    # out; apply the documented default (true) in that case instead of
    # silently falling through to monoisotopic weights.
    average_weights = true if average_weights.nil?

    if average_weights
      mass_h_plus = SpecID::AVG[:h_plus]
    else
      mass_h_plus = SpecID::MONO[:h_plus]
    end

    case by
    when :prec_mz_arr
      mz = nil
      if first_scan != last_scan
        # average the precursor m/z over every scan in the range that has one
        sum = 0.0
        tot_num = 0
        (first_scan..last_scan).each do |scan|
          val = prec_mz_arr[scan]
          if val # entries are nil for scans without a precursor m/z (presumably non-MS2 scans)
            sum += val
            tot_num += 1
          end
        end
        # float division: yields NaN if no scan in the range had a value
        mz = sum/tot_num
      else
        mz = prec_mz_arr[first_scan]
      end
      charge * (mz - mass_h_plus)
    when :deltamass
      m_plus_h - mass_h_plus + deltamass
    else
      abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
    end
  end

end
1293
+
1294
+
1295
# Array-backed record for a single search hit; attribute order is index order
Sequest::PepXML::SearchHit = Arrayclass.new( %w( hit_rank peptide peptide_prev_aa peptide_next_aa protein num_tot_proteins num_matched_ions tot_num_ions calc_neutral_pep_mass massdiff num_tol_term num_missed_cleavages is_rejected deltacnstar xcorr deltacn spscore sprank modification_info spectrum_query) )

# 0=hit_rank 1=peptide 2=peptide_prev_aa 3=peptide_next_aa 4=protein 5=num_tot_proteins 6=num_matched_ions 7=tot_num_ions 8=calc_neutral_pep_mass 9=massdiff 10=num_tol_term 11=num_missed_cleavages 12=is_rejected 13=deltacnstar 14=xcorr 15=deltacn 16=spscore 17=sprank 18=modification_info 19=spectrum_query

class Sequest::PepXML::SearchHit
  include SpecID::Pep
  include SpecIDXML

  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  # aaseq is another name for the peptide slot (index 1)
  def aaseq ; self[1] end
  def aaseq=(arg) ; self[1] = arg end

  # These are all search_score elements:

  # 1 if there is no second ranked hit, 0 otherwise

  # $VERBOSE is switched off only while initialize is (re)defined and is
  # restored right afterwards
  saved_verbose = $VERBOSE
  $VERBOSE = nil
  # hash:: optional values keyed by attribute name (as symbols); the first
  # 20 slots are filled from it in attribute order, missing keys become nil
  def initialize(hash=nil)
    super(self.class.size)
    if hash
      ordered_keys = [:hit_rank, :peptide, :peptide_prev_aa, :peptide_next_aa, :protein, :num_tot_proteins, :num_matched_ions, :tot_num_ions, :calc_neutral_pep_mass, :massdiff, :num_tol_term, :num_missed_cleavages, :is_rejected, :deltacnstar, :xcorr, :deltacn, :spscore, :sprank, :modification_info, :spectrum_query]
      self[0,20] = ordered_keys.map { |key| hash[key] }
    end
    self
  end
  $VERBOSE = saved_verbose

  # replace the existing inspect with an attribute:value listing
  undef_method :inspect
  def inspect
    pairs = @@attributes.map { |att| "#{att}:#{self.send(att)}" }
    "#<SearchHit #{pairs.join(" ")}>"
  end

  # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
  def self.split_ions(ions)
    ions.split("/").map {|ion| ion.to_i }
  end

  # One search_score element for the named attribute (no trailing newline)
  def search_score_xml(symbol)
    "#{tabs}<search_score name=\"#{symbol}\" value=\"#{send(symbol)}\"/>"
  end

  # search_score elements for each given attribute, one per line, with a
  # trailing newline
  def search_scores_xml(*symbol_list)
    lines = symbol_list.map { |sy| search_score_xml(sy) }
    lines.join("\n") + "\n"
  end

  # Returns the search_hit element: attributes from the hit itself, body made
  # of the modification_info pepxml (if any) followed by the search scores.
  def to_pepxml
    mod_pepxml = self[18] ? self[18].to_pepxml : ''

    # note the to_plus_minus_string on massdiff
    attributes = [
      "hit_rank=\"#{hit_rank}\"",
      "peptide=\"#{peptide}\"",
      "peptide_prev_aa=\"#{peptide_prev_aa}\"",
      "peptide_next_aa=\"#{peptide_next_aa}\"",
      "protein=\"#{protein}\"",
      "num_tot_proteins=\"#{num_tot_proteins}\"",
      "num_matched_ions=\"#{num_matched_ions}\"",
      "tot_num_ions=\"#{tot_num_ions}\"",
      "calc_neutral_pep_mass=\"#{calc_neutral_pep_mass}\"",
      "massdiff=\"#{massdiff.to_plus_minus_string}\"",
      "num_tol_term=\"#{num_tol_term}\"",
      "num_missed_cleavages=\"#{num_missed_cleavages}\"",
      "is_rejected=\"#{is_rejected}\"",
    ].join(" ")
    element_xml_and_att_string("search_hit", attributes) do
      mod_pepxml + search_scores_xml(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank)
    end
  end

  # Fills slots 0..12 from the node's attributes; score and modification
  # slots are not touched. returns self
  def from_pepxml_node(node)
    # [attribute name, conversion (nil keeps the raw value)] in slot order
    layout = [
      ['hit_rank', :to_i],
      ['peptide', nil],
      ['peptide_prev_aa', nil],
      ['peptide_next_aa', nil],
      ['protein', nil], ## will this be the string?? (yes, for now)
      ['num_tot_proteins', :to_i],
      ['num_matched_ions', :to_i],
      ['tot_num_ions', :to_i],
      ['calc_neutral_pep_mass', :to_f],
      ['massdiff', :to_f],
      ['num_tol_term', :to_i],
      ['num_missed_cleavages', :to_i],
      ['is_rejected', :to_i],
    ]
    layout.each_with_index do |(att, conversion), slot|
      raw = node[att]
      self[slot] = conversion ? raw.send(conversion) : raw
    end
    self
  end

end
1380
+
1381
+
1382
# Array-backed record: 0=modified_peptide 1=mod_aminoacid_masses
# 2=mod_nterm_mass 3=mod_cterm_mass
Sequest::PepXML::SearchHit::ModificationInfo = Arrayclass.new(%w(modified_peptide mod_aminoacid_masses mod_nterm_mass mod_cterm_mass))

# Positions and masses of modifications
class Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  ## Should be something like this:
  # <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
  #   <mod_aminoacid_mass position=" " mass=" "/>
  # </modification_info>

  # masses is shorthand for mod_aminoacid_masses
  alias_method :masses, :mod_aminoacid_masses
  alias_method :masses=, :mod_aminoacid_masses=

  # mod_nterm_mass:: Mass of modified N terminus
  # mod_cterm_mass:: Mass of modified C terminus
  # modified_peptide:: Peptide sequence (with indicated modifications) I'm
  #                    assuming that the native sequest indicators are OK here
  # mod_aminoacid_masses:: objects of type ...ModAminoacidMass; position
  #                        ranges from 1 to peptide length

  # Returns the modification_info element. Attributes that are nil are left
  # out; one mod_aminoacid_mass child is written per entry in masses.
  # Will escape any xml special chars in modified_peptide
  def to_pepxml
    ## Collect the modifications:
    mass_entries = masses
    mod_strings =
      if mass_entries and mass_entries.size > 0
        mass_entries.map { |entry| "position=\"#{entry[0]}\" mass=\"#{entry[1]}\"" }
      else
        []
      end
    ## Create the attribute string:
    att_parts = []
    att_parts << "mod_nterm_mass=\"#{mod_nterm_mass}\"" if mod_nterm_mass
    att_parts << "mod_cterm_mass=\"#{mod_cterm_mass}\"" if mod_cterm_mass
    att_parts << "modified_peptide=\"#{escape_special_chars(modified_peptide)}\"" if modified_peptide
    element_xml_and_att_string('modification_info', att_parts.join(" ")) do
      mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
    end
  end

  # Creates a new object and fills it from the node (see the instance method)
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Fills modified_peptide and the terminal masses from the node's
  # attributes (kept as given, no numeric conversion) and the residue masses
  # from its children.
  # returns self
  def from_pepxml_node(node)
    self[0] = node['modified_peptide']
    self[2] = node['mod_nterm_mass']
    self[3] = node['mod_cterm_mass']
    child_masses = []
    # NOTE(review): this relies on node.children yielding each child to the
    # block; if the node class simply returns a collection and ignores the
    # block, child_masses stays empty -- confirm against the XML node API.
    node.children do |mass_n|
      child_masses << Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mass_n['position'].to_i, mass_n['mass'].to_f])
    end
    self[1] = child_masses
    self
  end

  ##

  # <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
  #   <mod_aminoacid_mass position="2" mass="545.7160"/>
  #   <mod_aminoacid_mass position="3" mass="147.1926"/>
  # </modification_info>
end
1457
+
1458
# Array-backed record for one modified residue: 0=position 1=mass
Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass = Arrayclass.new(['position', 'mass'])