mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -6,6 +6,74 @@ require 'set_from_hash'
6
6
  require 'spec_id/bioworks'
7
7
  require 'instance_var_set_from_hash'
8
8
  require 'spec/msrun'
9
+ require 'spec_id/srf'
10
+
11
class Numeric
  # Renders the number as a String with an explicit leading sign.
  #
  # Non-negative values (zero included) get a '+' prefix. Negative values are
  # returned exactly as #to_s renders them: #to_s already supplies the '-',
  # so prepending another one would produce "--5".
  #
  # Returns a new String, e.g. 5 => "+5", -0.704 => "-0.704".
  def to_plus_minus_string
    # interpolation builds a fresh string instead of appending to a literal
    self >= 0 ? "+#{self}" : to_s
  end
end
21
+
22
+ ##########################################
23
+ # NEED TO ADD MODIFICATIONS and generally verify pepxml creation!!! :
24
+ # HERE's an excerpt from an example file from tpp 2.9.2 that I'm going to follow:
25
+ =begin
26
+ <search_summary base_name="/regis/data3/search/akeller/LCQ/COMET/LIGHT/haloICAT2_41" search_engine="COMET" precursor_mass_type="average" fragment_mass_type="average">
27
+ <sequence_search_constraint sequence="C"/>
28
+ <aminoacid_modification aminoacid="C" massdiff="8.049" mass="553.765" variable="Y" binary="N"/>
29
+ <aminoacid_modification aminoacid="C" massdiff="442.5772" mass="545.7160" variable="N"/>
30
+ <aminoacid_modification aminoacid="M" massdiff="16.0000" mass="147.1926" variable="Y" binary="N" symbol="1"/>
31
+ <parameter name="peptide_mass_tol" value="3.0000"/>
32
+ <parameter name="peptide_mass_tol_units" value="DA"/>
33
+ <parameter name="num_output_lines" value="10"/>
34
+ <parameter name="remove_precursor_peak" value="0"/>
35
+ <parameter name="num_dup_headers" value="1"/>
36
+ <parameter name="email_address" value=""/>
37
+ <parameter name="ion_series" value="010000010"/>
38
+ <parameter name="max_num_var_mod_residues" value="3"/>
39
+ <parameter name="md5_check_sum" value="2547286a77a35abe2af3f2e9825ab814"/>
40
+ </search_summary>
41
+ =end
42
+
43
+ # and a guy with modifications:
44
+ =begin
45
+ <search_result spectrum="haloICAT2_41.1110.1110.2" start_scan="1110" end_scan="1110" precursor_neutral_mass="2000.6641" assumed_charge="2" index="28">
46
+ <search_hit hit_rank="1" peptide="GCMPSKEVLSAGAHR" peptide_prev_aa="R" peptide_next_aa="Y" protein="Chr_ORF0132" num_tot_proteins="1" num_matched_ions="19" tot_num_ions="30" calc_neutral_pep_mass="2001.3685" massdiff="-0.704" num_tol_term="2" num_missed_cleavages="1" is_rejected="0">
47
+ <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
48
+ <mod_aminoacid_mass position="2" mass="545.7160"/>
49
+ <mod_aminoacid_mass position="3" mass="147.1926"/>
50
+ </modification_info>
51
+ <search_score name="dotproduct" value="359"/>
52
+ <search_score name="delta" value="0.296"/>
53
+ <search_score name="deltastar" value="0"/>
54
+ <search_score name="zscore" value="5.290"/>
55
+ <search_score name="expect" value="0.000E+00"/>
56
+ <peptideprophet_result probability="0.9994" all_ntt_prob="(0.3713,0.4360,0.9994)">
57
+ <search_score_summary>
58
+ <parameter name="fval" value="3.4002"/>
59
+ <parameter name="ntt" value="2"/>
60
+ <parameter name="nmc" value="1"/>
61
+ <parameter name="massd" value="-0.704"/>
62
+ </search_score_summary>
63
+ </peptideprophet_result>
64
+ =end
65
+
66
+ # sequest.params option:
67
+ # diff_search_options = 15.994910 M 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y
68
+ # permanent mods are at the bottom: ...
69
+ # add_A_Alanine = 0.0000 ; added to A
70
+ # add_S_Serine = 0.0000 ; added to S
71
+ # add_P_Proline = 0.0000 ; added to P
72
+ # add_V_Valine = 0.0000 ; added to V
73
+ # add_T_Threonine = 0.0000 ; added to T
74
+ # ...
75
+
76
+
9
77
 
10
78
  module SpecID::Sequest; end
11
79
  class SpecID::Sequest::PepXML; end
@@ -26,8 +94,12 @@ class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
26
94
  @xmlns = nil
27
95
  @xmlns_xsi = nil
28
96
  @xsi_schema_location = nil
29
- self.set_from_hash(hash)
30
- @msms_run_summary = yield
97
+ if hash
98
+ self.set_from_hash(hash)
99
+ end
100
+ if block_given?
101
+ @msms_run_summary = yield
102
+ end
31
103
  end
32
104
 
33
105
  # if no date string given, then it will set to Time.now
@@ -80,7 +152,8 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
80
152
  include SpecIDXML
81
153
 
82
154
  # the version of TPP you are using (determines xml output)
83
- # The name of the pep xml file (without extension)
155
+ # The name of the pep xml file (without extension) (but this is a long
156
+ # filename!!!)
84
157
  attr_accessor :base_name
85
158
  # The name of the mass spec manufacturer
86
159
  attr_accessor :ms_manufacturer
@@ -104,7 +177,9 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
104
177
  # set to the return value of the block
105
178
  def initialize(hash=nil)
106
179
  @spectrum_queries = []
107
- instance_var_set_from_hash(hash)
180
+ if hash
181
+ instance_var_set_from_hash(hash)
182
+ end
108
183
  if block_given? ; @spectrum_queries = yield end
109
184
  end
110
185
 
@@ -137,6 +212,8 @@ end
137
212
 
138
213
  class SpecID::Sequest::PepXML
139
214
  include SpecIDXML
215
+
216
+ ## CREATE a default version for the entire class
140
217
  class << self
141
218
  attr_accessor :pepxml_version
142
219
  end
@@ -144,7 +221,11 @@ class SpecID::Sequest::PepXML
144
221
  self.pepxml_version = DEF_VERSION # default version
145
222
 
146
223
  attr_accessor :pepxml_version, :msms_pipeline_analysis
224
+ ## the full path name (no extension)
147
225
  attr_accessor :base_name
226
+ attr_accessor :h_plus
227
+ attr_accessor :avg_parent
228
+
148
229
  #attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
149
230
 
150
231
  # returns an array of spectrum queries
@@ -153,10 +234,29 @@ class SpecID::Sequest::PepXML
153
234
  end
154
235
 
155
236
  # msms_pipeline_analysis is set to the result of the yielded block
156
- def initialize(pepxml_version=DEF_VERSION)
237
+ # and set_mono_or_avg is called with params if given
238
+ def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
157
239
  self.class.pepxml_version = pepxml_version
158
- @msms_pipeline_analysis = yield
159
- @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
240
+ if sequest_params_obj
241
+ set_mono_or_avg(sequest_params_obj)
242
+ end
243
+ if block_given?
244
+ @msms_pipeline_analysis = yield
245
+ @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
246
+ end
247
+ end
248
+
249
+ # sets @h_plus and @avg_parent from the sequest params object
250
# Records which mass type the search used for precursors.
#
# sequest_params_obj must respond to #precursor_mass_type.
# @avg_parent becomes false only when that value is exactly "monoisotopic";
# every other value falls into the else branch and is treated as average.
# @h_plus is then read from SpecID::AVG[:h_plus] or SpecID::MONO[:h_plus]
# to match @avg_parent.
+ def set_mono_or_avg(sequest_params_obj)
251
+ case sequest_params_obj.precursor_mass_type
252
+ when "monoisotopic" ; @avg_parent = false
253
+ else ; @avg_parent = true
254
+ end
255
+
256
+ case @avg_parent
257
+ when true ; @h_plus = SpecID::AVG[:h_plus]
258
+ when false ; @h_plus = SpecID::MONO[:h_plus]
259
+ end
160
260
  end
161
261
 
162
262
  def date
@@ -203,6 +303,190 @@ class SpecID::Sequest::PepXML
203
303
  end
204
304
  end
205
305
 
306
+
307
# Defaults used by new_from_srf: caller-supplied opts are merged over this
# hash. new_from_srf removes :out_path, :backup_db_path, :pepxml_version,
# :out_data_type and :out_data from the merged hash, overwrites :ms_model
# from the srf header, and passes what remains to MSMSRunSummary.new.
# NOTE(review): :backup_db_path is a site-specific absolute path -- confirm
# it exists on your installation or override it.
+ Default_Options = {
308
+ :out_path => nil,
309
+ :backup_db_path => '/project/marcotte/marcotte/ms/database',
310
+ # a PepXML option
311
+ :pepxml_version => DEF_VERSION,
312
+ ## MSMSRunSummary options:
313
+ # string must be recognized in sample_enzyme.rb
314
+ # or create your own SampleEnzyme object
315
+ :sample_enzyme => 'trypsin',
316
+ :ms_manufacturer => 'ThermoFinnigan',
317
+ :ms_model => 'LCQ Deca XP',
318
+ :ms_ionization => 'ESI',
319
+ :ms_mass_analyzer => 'Ion Trap',
320
+ :ms_detector => 'UNKNOWN',
321
+ :raw_data_type => "raw",
322
+ :raw_data => ".mzXML", ## even if you don't have it?
323
+ ## SearchSummary options:
324
+ :out_data_type => "out", ## may be srf?? don't think pepxml recognizes this yet
325
+ :out_data => ".tgz" ## may be srf??
326
+ }
327
+
328
+ # will dynamically set :ms_model and :ms_mass_analyzer from srf info
329
+ # (ignoring defaults or anything passed in) for LTQ Orbitrap
330
+ # and LCQ Deca XP
331
+ # See SRF::Sequest::PepXML::Default_Options hash for defaults
332
+ # unless given, the out_path will be given as the path of the srf_file
333
+ def self.new_from_srf(srf_file, opts={})
334
+ opts = Default_Options.merge(opts)
335
+
336
+ ## set the outpath
337
+ out_path = opts.delete(:out_path)
338
+ unless out_path
339
+ out_path = File.dirname(srf_file)
340
+ end
341
+
342
+ ## read the srf file
343
+ srf = SRF.new(srf_file)
344
+
345
+ params = srf.params
346
+
347
+ ## check to see if we need backup_db
348
+ backup_db_path = opts.delete(:backup_db_path)
349
+ unless File.exist? params.database
350
+ params.database_path = backup_db_path
351
+ end
352
+
353
+ #######################################################################
354
+ # PREPARE THE OPTIONS:
355
+ #######################################################################
356
+ ## remove items from the options hash that don't belong to
357
+ ppxml_version = opts.delete(:pepxml_version)
358
+ out_data_type = opts.delete(:out_data_type)
359
+ out_data = opts.delete(:out_data)
360
+
361
+ ## Extract meta info from srf
362
+ bn_noext = base_name_noext(srf.header.raw_filename)
363
+ opts[:ms_model] = srf.header.model
364
+ case opts[:ms_model]
365
+ when /Orbitrap/
366
+ opts[:ms_mass_analyzer] = 'Orbitrap'
367
+ when /LCQ Deca XP/
368
+ opts[:ms_mass_analyzer] = 'Ion Trap'
369
+ end
370
+
371
+ ## Create the base name
372
+ full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
373
+ opts[:base_name] = full_base_name_no_ext
374
+
375
+ ## Create the search summary:
376
+ search_summary_options = {
377
+ :search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params),
378
+ :base_name => full_base_name_no_ext,
379
+ :out_data_type => out_data_type,
380
+ :out_data => out_data
381
+ }
382
+ opts[:search_summary] = SpecID::Sequest::PepXML::SearchSummary.new( params, search_summary_options)
383
+
384
+ ## Create the SampleEnzyme object if necessary
385
+ unless opts[:sample_enzyme].is_a? SampleEnzyme
386
+ opts[:sample_enzyme] = SampleEnzyme.new(opts[:sample_enzyme])
387
+ end
388
+
389
+ ## Create the pepxml obj
390
+ pepxml_obj = SpecID::Sequest::PepXML.new(ppxml_version, params)
391
+ ## name some common variables we'll need
392
+ h_plus = pepxml_obj.h_plus
393
+ avg_parent = pepxml_obj.avg_parent
394
+
395
+ #######################################################################
396
+ # CREATE the spectrum_queries_ar
397
+ #######################################################################
398
+ srf_index = srf.index
399
+ out_files = srf.out_files
400
+ spectrum_queries_arr = Array.new(srf.dta_files.size)
401
+ files_with_hits_index = 0 ## will end up being 1 indexed
402
+ srf.dta_files.each_with_index do |dta_file,i|
403
+ next if out_files[i].num_hits == 0
404
+ files_with_hits_index += 1
405
+
406
+ # Sort the hits
407
+ hits = out_files[i].hits
408
+ arr = hits.sort_by{|v| v.xcorr }
409
+
410
+ # Get proper deltacn and deltacnstar
411
+ # Prophet deltacn is not the same as the native Sequest deltacn
412
+ # It is the deltacn of the second best hit!
413
+ top_hit = arr.pop
414
+ second_hit = arr.last
415
+ if second_hit
416
+ top_hit[1] = second_hit[1]
417
+ deltacnstar = '0'
418
+ else
419
+ top_hit[1] = '1.0'
420
+ deltacnstar = '1'
421
+ end
422
+
423
+ ## mass calculations:
424
+ precursor_neutral_mass = dta_file.mh - h_plus
425
+ calc_neutral_pep_mass = top_hit[0] - h_plus
426
+ massdiff = precursor_neutral_mass - calc_neutral_pep_mass
427
+ if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
428
+ else ; massdiff = massdiff.to_s end
429
+
430
+ (start_scan, end_scan, charge) = srf_index[i]
431
+ sq_hash = {
432
+ :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
433
+ :start_scan => start_scan,
434
+ :end_scan => end_scan,
435
+ :precursor_neutral_mass => precursor_neutral_mass,
436
+ :assumed_charge => charge,
437
+ :pepxml_version => ppxml_version,
438
+ :index => files_with_hits_index,
439
+ }
440
+
441
+ # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
442
+ ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
443
+ (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_hit[8])
444
+ # ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
445
+
446
+ sh_hash = {
447
+ :hit_rank => "1",
448
+ :peptide => pepseq,
449
+ :peptide_prev_aa => prevaa,
450
+ :peptide_next_aa => nextaa,
451
+ :protein => top_hit[9].split(" ").first,
452
+ :num_tot_proteins => top_hit[10],
453
+ :num_matched_ions => top_hit[6],
454
+ :tot_num_ions => top_hit[7],
455
+ :calc_neutral_pep_mass => calc_neutral_pep_mass,
456
+ :massdiff => massdiff,
457
+ :num_tol_term => SpecID::Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_hit[8]),
458
+ :num_missed_cleavages => SpecID::Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_hit[8]),
459
+ :is_rejected => '0',
460
+ # These are search score attributes:
461
+ :xcorr => top_hit[3],
462
+ :deltacn => top_hit[1],
463
+ :deltacnstar => deltacnstar,
464
+ :spscore => top_hit[2],
465
+ :sprank => top_hit[5],
466
+ }
467
+
468
+ spectrum_queries_arr[files_with_hits_index] = SpecID::Sequest::PepXML::SpectrumQuery.new(sq_hash) do
469
+ search_result = SpecID::Sequest::PepXML::SearchResult.new do
470
+ [ SpecID::Sequest::PepXML::SearchHit.new(sh_hash) ] # there can be multiple hits
471
+ end # SearchResult
472
+ [search_result] # can be multiple
473
+ end
474
+ end
475
+ spectrum_queries_arr.compact!
476
+
477
+ #######################################################################
478
+ # ADD the pipeline analysis
479
+ #######################################################################
480
+
481
+ pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'}) do
482
+ SpecID::Sequest::PepXML::MSMSRunSummary.new(opts) { spectrum_queries_arr }
483
+ end
484
+ pepxml_obj.msms_pipeline_analysis = pipeline
485
+ pepxml_obj.base_name = pipeline.msms_run_summary.base_name
486
+ pepxml_obj
487
+ end
488
+
489
+ # Takes bioworks 3.2/3.3 xml output (with no filters)
206
490
  # Returns a list of PepXML objects
207
491
  # msdata = path to mzXML files (or .timeIndex files) (or @TODO: path to sqt file(s))
208
492
  # params = sequest.params file
@@ -246,6 +530,7 @@ class SpecID::Sequest::PepXML
246
530
 
247
531
  ## Create a hash of spectrum_query arrays by filename (this very big block):
248
532
  spectrum_queries_by_base_name = {}
533
+ pepxml_objs_by_base_name = {}
249
534
  # Hash by the filenames to split into filenames:
250
535
  bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
251
536
 
@@ -262,7 +547,10 @@ class SpecID::Sequest::PepXML
262
547
  abort "invalid BioworksBrowser version: #{x}"
263
548
  end
264
549
 
265
- spectrum_queries = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
550
+ pepxml_obj = SpecID::Sequest::PepXML.new(pepxml_version, params)
551
+ pepxml_objs_by_base_name[base_name] = pepxml_obj
552
+
553
+ spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
266
554
 
267
555
 
268
556
  # Sort_by_rank and take the top hit (to mimic out2summary):
@@ -270,28 +558,18 @@ class SpecID::Sequest::PepXML
270
558
  top_pep = arr.pop
271
559
  second_hit = arr.last # needed for deltacnstar
272
560
 
273
- case params.precursor_mass_type
274
- when "monoisotopic" ; avg_parent = false
275
- else ; avg_parent = true
276
- end
277
-
278
- case avg_parent
279
- when true ; h_plus = SpecID::AVG[:h_plus]
280
- when false ; h_plus = SpecID::MONO[:h_plus]
281
- end
282
561
 
283
-
284
562
  case calc_prec_by
285
563
  when :prec_mz_arr
286
- precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent)
564
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, pepxml_obj.avg_parent)
287
565
  when :deltamass
288
- precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent)
566
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
289
567
  end
290
568
 
291
- calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus)
569
+ calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
292
570
  massdiff = precursor_neutral_mass - calc_neutral_pep_mass
293
571
  if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
294
- else ; massdiff = massdiff.to_s end
572
+ else ; massdiff = massdiff.to_s end #already has a -
295
573
  # deltacn & star:
296
574
  # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
297
575
  if second_hit
@@ -317,7 +595,7 @@ class SpecID::Sequest::PepXML
317
595
  # NOTE: the bioworks mass is really M+H if two or more scans went
318
596
  # into the search_hit; calc_neutral_pep_mass is simply the avg of
319
597
  # precursor masses adjusted to be neutral
320
- (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.split_sequence(top_pep.sequence)
598
+ (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_pep.sequence)
321
599
  (num_matched_ions, tot_num_ions) = SpecID::Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
322
600
  search_hit = SpecID::Sequest::PepXML::SearchHit.new({
323
601
  :hit_rank => "1",
@@ -348,32 +626,36 @@ class SpecID::Sequest::PepXML
348
626
 
349
627
  # create an index by spectrum as results end up typically in out2summary
350
628
  # (I really dislike this order, however)
351
- spectrum_queries = spectrum_queries.sort_by {|pep| pep.spectrum }
352
- spectrum_queries.each_with_index {|res,index| res.index = "#{index + 1}" }
629
+ spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
630
+ spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
353
631
 
354
- spectrum_queries_by_base_name[base_name] = spectrum_queries
632
+ spectrum_queries_by_base_name[base_name] = spectrum_queries_ar
355
633
  end
356
634
 
357
- spectrum_queries_by_base_name.collect do |base_name, spectrum_queries|
635
+ modifications_string = bioworks.modifications
636
+
637
+ spectrum_queries_by_base_name.collect do |base_name, spectrum_queries_ar|
358
638
  case pepxml_version
359
639
  when 18
360
- SpecID::Sequest::PepXML.new(pepxml_version) do
361
- SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
362
- full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
363
- SpecID::Sequest::PepXML::MSMSRunSummary.new({
364
- :base_name => full_base_name_no_ext,
365
- :ms_manufacturer => ms_manufacturer,
366
- :ms_model => ms_model,
367
- :ms_ionization => ms_ionization,
368
- :ms_mass_analyzer => ms_mass_analyzer,
369
- :ms_detector => ms_detector,
370
- :raw_data_type => raw_data_type,
371
- :raw_data => raw_data,
372
- :sample_enzyme => SampleEnzyme.new(sample_enzyme),
373
- :search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
374
- }) do spectrum_queries end
375
- end
640
+ pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
641
+ full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
642
+ SpecID::Sequest::PepXML::MSMSRunSummary.new({
643
+ :base_name => full_base_name_no_ext,
644
+ :ms_manufacturer => ms_manufacturer,
645
+ :ms_model => ms_model,
646
+ :ms_ionization => ms_ionization,
647
+ :ms_mass_analyzer => ms_mass_analyzer,
648
+ :ms_detector => ms_detector,
649
+ :raw_data_type => raw_data_type,
650
+ :raw_data => raw_data,
651
+ :sample_enzyme => SampleEnzyme.new(sample_enzyme),
652
+ :search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
653
+ }) { spectrum_queries_ar }
376
654
  end
655
+ pepxml_obj = pepxml_objs_by_base_name[base_name]
656
+ pepxml_obj.msms_pipeline_analysis = pipeline
657
+ pepxml_obj.base_name = pipeline.msms_run_summary.base_name
658
+ pepxml_obj
377
659
  when 0
378
660
  ## @TODO: NEED TO REVAMP THIS:
379
661
  # SpecID::Sequest::PepXML.new(pepxml_version).set_from_hash({
@@ -429,6 +711,14 @@ class SpecID::Sequest::PepXML
429
711
  string
430
712
  end
431
713
 
714
+ # given any kind of filename (from windows or whatever)
715
+ # returns the base of the filename with no file extension
716
+ def self.base_name_noext(file)
717
+ file.gsub!("\\", '/')
718
+ File.basename(file).sub(/\.[\w^\.]+$/, '')
719
+ end
720
+
721
+
432
722
  end # PepXML
433
723
 
434
724
  ##
@@ -461,6 +751,15 @@ class SpecID::Sequest::Params
461
751
  one,two = line.split @@param_re
462
752
  two,comment = two.split @@param_two_split
463
753
  hash[one] = two.rstrip
754
+ # it is necessary to add this break so that params files inside srf
755
+ # files can be read. This will terminate the reading at the end of
756
+ # the file even though there are more lines
757
+ if line =~ /added to U/ || line =~ /digest_mass_range/## Will only work on bioworks 3.2 & 3.3 (bioworks 3.1 last line => Elastase/Tryp...)
758
+ break
759
+ end
760
+ if line =~ /digest_mass_range/ # there is no space in the srf params files
761
+ break
762
+ end
464
763
  else
465
764
  break
466
765
  end
@@ -468,17 +767,26 @@ class SpecID::Sequest::Params
468
767
  hash
469
768
  end
470
769
 
770
+ # returns self
771
+ def parse_handle(fh)
772
+ sequest_line = fh.gets #[SEQUEST]
773
+ @opts = grab_params(fh)
774
+ @opts["search_engine"] = "SEQUEST"
775
+ @mods = grab_params(fh)
776
+
777
+ ## this gets rid of the .hdr postfix on indexed databases
778
+ @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
779
+ self
780
+ end
781
+
471
782
  ## parses file
472
783
  ## and drops the .hdr behind indexed fasta files
784
+ ## returns self
473
785
  def parse(file)
474
786
  File.open(file) do |fh|
475
- sequest_line = fh.gets #[SEQUEST]
476
- @opts = grab_params(fh)
477
- @opts["search_engine"] = "SEQUEST"
478
- @mods = grab_params(fh)
787
+ parse_handle(fh)
479
788
  end
480
- ## this gets rid of the .hdr postfix on indexed databases
481
- @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
789
+ self
482
790
  end
483
791
 
484
792
  # returns( split_after, except_before)
@@ -569,6 +877,17 @@ class SpecID::Sequest::Params
569
877
  @opts["first_database_name"]
570
878
  end
571
879
 
880
+ # returns the appropriate aminoacid mass lookup table (in spec_id.rb SpecID::MONO or
881
+ # SpecID::AVG based on precursor_mass_type
882
+ def mass_table
883
+ case precursor_mass_type
884
+ when 'average'
885
+ SpecID::AVG
886
+ when 'monoisotopic'
887
+ SpecID::MONO
888
+ end
889
+ end
890
+
572
891
  # at least in Bioworks 3.2, the First number after the enzyme
573
892
  # is the indication of the enzymatic end stringency (required):
574
893
  # 1 = Fully enzymatic
@@ -628,7 +947,7 @@ class SpecID::Sequest::PepXML::SearchResult
628
947
  attr_accessor :search_hits
629
948
 
630
949
  # if block given, then search_hits set to return value
631
- def initialize()
950
+ def initialize
632
951
  if block_given? ; @search_hits = yield
633
952
  else ; @search_hits = [] end
634
953
  end
@@ -646,13 +965,16 @@ class SpecID::Sequest::PepXML::SearchSummary
646
965
  attr_accessor :base_name
647
966
  attr_accessor :out_data_type
648
967
  attr_accessor :out_data
968
+ attr_accessor :modifications
649
969
  # A SearchDatabase object (responds to :local_path and :type)
650
970
  attr_accessor :search_database
651
971
  # if given a sequest params object, then will set the following attributes:
652
972
  # args is a hash of parameters
653
- def initialize(params=nil, args=nil)
973
+ # modifications_string -> See Modifications
974
+ def initialize(params, modifications_string='', args=nil)
654
975
  @search_id = nil
655
976
  @params = params
977
+ @modifications = SpecID::Sequest::PepXML::Modifications.new(params, modifications_string)
656
978
  if args ; set_from_hash(args) end
657
979
  end
658
980
 
@@ -665,16 +987,304 @@ class SpecID::Sequest::PepXML::SearchSummary
665
987
  else ; '1' end
666
988
  end
667
989
 
990
+
668
991
  def to_pepxml
669
992
  element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
670
993
  search_database.to_pepxml +
671
994
  short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
995
+ @modifications.to_pepxml +
672
996
  @params.pepxml_parameters
673
997
  end
674
998
  end
675
999
 
676
1000
  end
677
1001
 
1002
+ class SpecID::Sequest::PepXML::Modifications
1003
+ include SpecIDXML
1004
+
1005
+ # sequest params object
1006
+ attr_accessor :params
1007
+ # array holding AAModifications
1008
+ attr_accessor :aa_mods
1009
+ # array holding TerminalModifications
1010
+ attr_accessor :term_mods
1011
+ # a hash of all differential modifications present by aa_one_letter_symbol
1012
+ # and special_symbol. This is NOT the mass difference but the total mass {
1013
+ # 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
1014
+ # the amino acid sequence, they are given the *differential* mass. The
1015
+ # termini are keyed by their special symbol as in sequest, e.g.
1016
+ # '[' => 12.22 (cterminus), ']' => 14.55 (nterminus)
1017
+ attr_accessor :masses_by_diff_mod_hash
1018
+ # a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
1019
+ # values are the special_symbols
1020
+ attr_accessor :mod_symbols_hash
1021
+
1022
+ # The modification symbols string looks like this:
1023
+ # (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
1024
+ # ct is cterminal peptide (differential)
1025
+ # nt is nterminal peptide (differential)
1026
+ # the C is just cysteine
1027
+ # will set_modifications and masses_by_diff_mod hash
1028
def initialize(params, modification_symbols_string='')
  # Store the sequest params, then let set_modifications do the rest of the
  # setup from those params and the symbols string.
  @params = params
  set_modifications(@params, modification_symbols_string)
end
1032
+
1033
# Builds the two lookup hashes from the sequest modification symbols string,
# e.g. "(M* +15.90000) (S@ +80.00000) (ct[ +12.33000) (nt] +14.20000)":
#
#   @mod_symbols_hash    { [:M, 15.9] => '*', [:ct, 12.33] => '[', ... }
#   @masses_by_diff_mod  { 'M*' => residue mass + 15.9, '[' => 12.33, ... }
#
# Amino acid entries hold the total mass (residue mass from
# @params.mass_table plus the difference); terminal entries (ct/nt) are keyed
# by the symbol alone and hold only the differential mass.
# A nil or empty string leaves both hashes empty and returns nil.
def set_hashes(modification_symbols_string)
  @mod_symbols_hash = {}
  @masses_by_diff_mod = {}
  # The accessor declared on the class is masses_by_diff_mod_hash; point it at
  # the very same hash so that the accessor actually returns the data.
  @masses_by_diff_mod_hash = @masses_by_diff_mod
  if modification_symbols_string.nil? || modification_symbols_string == ''
    return nil
  end
  table = @params.mass_table
  modification_symbols_string.split(/\)\s+\(/).each do |mod|
    if mod =~ /\(?(\w{1,2})(.) (.[\d\.]+)\)?/
      # copy the captures right away: $1..$3 are replaced by any later match
      aa = $1
      symbol = $2
      diff = $3.to_f
      @mod_symbols_hash[[aa.to_sym, diff]] = symbol.dup
      if aa == 'ct' || aa == 'nt'
        @masses_by_diff_mod[symbol] = diff
      else
        @masses_by_diff_mod[aa + symbol] = diff + table[aa.to_sym]
      end
    end
  end
end
1053
+
1054
# Given a bare peptide (no flanking residues), e.g. "]PEPT*IDE" and NOT
# 'K.PEPTIDE.R', returns a ModificationInfo object describing the
# modifications marked in it.  The string passed in is left untouched.
#
# Returns nil when no differential modifications are defined at all or when
# the peptide carries none of them.
#
# The hash handed to ModificationInfo may contain:
#   :modified_peptide          the peptide exactly as given (symbols included)
#   :mod_nterm_mass            first residue + H + N-terminal differential mass
#   :mod_cterm_mass            last residue + OH + C-terminal differential mass
#   :mod_aminoacid_mass_array  [[position, modified_mass], ...] where position
#                              counts residues from 1 (symbols are not counted)
def modification_info(peptide)
  if @masses_by_diff_mod.size == 0
    return nil
  end
  hash = {}
  hash[:modified_peptide] = peptide.dup
  # work on a private copy so the caller's string is never changed
  seq = peptide.dup
  hsh = @masses_by_diff_mod
  table = @params.mass_table
  h = table[:h]  # this? or h_plus ??
  oh = table[:o] + h
  ## only the termini can match a single char
  if hsh.key? seq[0,1]
    # AA + H + differential_mod
    hash[:mod_nterm_mass] = table[seq[1,1].to_sym] + h + hsh[seq[0,1]]
    seq = seq[1..-1]  # drop the leading terminal symbol
  end
  if hsh.key? seq[-1,1]
    # AA + OH + differential_mod
    hash[:mod_cterm_mass] = table[seq[-2,1].to_sym] + oh + hsh[seq[-1,1]]
    seq = seq[0..-2]  # drop the trailing terminal symbol
  end
  mod_array = []
  # number of residues seen so far; symbol characters do not advance it, so
  # every reported position stays within 1..peptide length
  residue_count = 0
  (0...seq.size).each do |i|
    residue_count += 1 if seq[i,1] =~ /[A-Z]/
    if hsh.key? seq[i,2]
      mod_array << [ residue_count, hsh[seq[i,2]] ]
    end
  end
  if mod_array.size > 0
    hash[:mod_aminoacid_mass_array] = mod_array
  end
  # :modified_peptide is always present, so only more than one key means that
  # a modification was actually found
  if hash.size > 1
    SpecID::Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
  else
    nil
  end
end
1093
+
1094
# 1. sets aa_mods and term_mods from a sequest params object
# 2. sets @params
# 3. sets @masses_by_diff_mod and @mod_symbols_hash (through set_hashes)
#
# params must respond to mods, mass_table, diff_search_options and
# term_diff_search_options; modification_symbols_string is handed to
# set_hashes unchanged.
#
# @aa_mods holds the static AAModification objects followed by the variable
# ones; @term_mods holds the static TerminalModification objects followed by
# the variable ones.  Modifications whose mass difference is 0.0 are skipped,
# and a missing or empty (term_)diff_search_options value simply yields no
# variable modifications of that kind.
def set_modifications(params, modification_symbols_string)
  @params = params

  set_hashes(modification_symbols_string)

  ####################################
  ## static mods
  ####################################

  static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
  static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

  params.mods.each do |k,v|
    v_to_f = v.to_f
    next if v_to_f == 0.0
    # add_<one letter>_... is an amino acid; everything else is a terminus
    if k =~ /add_(\w)_/
      static_mods << [$1.to_sym, v_to_f]
    else
      static_terminal_mods << [k, v_to_f]
    end
  end
  aa_hash = params.mass_table

  ## Create the static_mods objects
  static_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'N',
      :binary => 'Y',
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  ## Create the static_terminal_mods objects
  static_terminal_mods.map! do |mod|
    terminus = if mod[0] =~ /Cterm/ ; 'c'
               else ; 'n' # only two possible termini
               end
    protein_terminus = case mod[0]
                       when /Nterm_protein/ ; 'n'
                       when /Cterm_protein/ ; 'c'
                       else nil
                       end

    # create the hash
    hash = {
      :terminus => terminus,
      :massdiff => mod[1].to_plus_minus_string,
      :variable => 'N',
      :description => mod[0],
    }
    hash[:protein_terminus] = protein_terminus if protein_terminus
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #################################
  # Variable Mods:
  #################################
  # the option is a flat list of "<mass difference> <amino acid>" pairs
  arr = params.diff_search_options.to_s.rstrip.split(/\s+/)
  # [aa.to_sym, diff.to_f]
  variable_mods = []
  (0...arr.size).step(2) do |i|
    if arr[i].to_f != 0.0
      variable_mods << [arr[i+1].to_sym, arr[i].to_f]
    end
  end
  variable_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'Y',
      :binary => 'N',
      # mod is [aa.to_sym, diff.to_f], the same shape as the hash keys
      :symbol => @mod_symbols_hash[mod],
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  #################################
  # TERMINAL Variable Mods:
  #################################
  # These are always peptide, not protein termini (for sequest)
  (nterm_diff, cterm_diff) = params.term_diff_search_options.to_s.rstrip.split(/\s+/).map{|v| v.to_f }

  # A value is nil when the option is missing or too short.  The symbols are
  # looked up with a single array key ([:nt, diff] / [:ct, diff]), matching
  # the keys written by set_hashes.
  to_add = []
  if nterm_diff && nterm_diff != 0.0
    to_add << ['n', nterm_diff.to_plus_minus_string, @mod_symbols_hash[[:nt, nterm_diff]]]
  end
  if cterm_diff && cterm_diff != 0.0
    to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[[:ct, cterm_diff]]]
  end

  variable_terminal_mods = to_add.map do |term, mssdiff, symb|
    hash = {
      :terminus => term,
      :massdiff => mssdiff,
      :variable => 'Y',
      :symbol => symb,
    }
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #########################
  # COLLECT THEM
  #########################
  @aa_mods = static_mods + variable_mods
  @term_mods = static_terminal_mods + variable_terminal_mods
end
1206
+
1207
+ ## Generates the pepxml for static and differential amino acid mods based on
1208
+ ## sequest object
1209
def to_pepxml
  # Amino acid modifications come first, terminal modifications second; a
  # collection that is nil contributes nothing.
  xml = ''
  [@aa_mods, @term_mods].each do |mods|
    next unless mods
    xml << mods.map {|mod| mod.to_pepxml }.join
  end
  xml
end
1219
+
1220
+ end
1221
+
1222
# Modified aminoacid, static or variable
# unless otherwise stated, all attributes can be anything
class SpecID::Sequest::PepXML::AAModification
  include SpecIDXML

  # The amino acid (one letter code)
  attr_accessor :aminoacid

  # Must be a string!!!!
  # Mass difference with respect to unmodified aminoacid, must begin with
  # either + (nonnegative) or - [e.g. +1.05446 or -2.3342]
  # consider Numeric#to_plus_minus_string at top
  attr_accessor :massdiff

  # Mass of modified aminoacid
  attr_accessor :mass

  # Y if both modified and unmodified aminoacid could be present in the
  # dataset, N if only modified aminoacid can be present
  attr_accessor :variable

  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus

  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol

  # Y if each peptide must have only modified or unmodified aminoacid, N if a
  # peptide may contain both modified and unmodified aminoacid
  attr_accessor :binary

  # hash (optional) is handed to instance_var_set_from_hash; with no hash
  # nothing is set.
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # Emits this modification as an "aminoacid_modification" element built from
  # the instance variables.
  def to_pepxml
    short_element_xml_from_instance_vars("aminoacid_modification")
  end

end
1257
+
1258
# Modified terminus, static or variable
class SpecID::Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus

  # Mass difference with respect to unmodified terminus
  attr_accessor :massdiff

  # Mass of modified terminus
  attr_accessor :mass

  # Y if both modified and unmodified terminus could be present in the
  # dataset, N if only modified terminus can be present
  attr_accessor :variable

  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol

  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus

  attr_accessor :description

  # hash (optional) is handed to instance_var_set_from_hash; with no hash
  # nothing is set.
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # Emits this modification as a "terminal_modification" element built from
  # the instance variables.
  def to_pepxml
    short_element_xml_from_instance_vars("terminal_modification")
  end
end
1286
+
1287
+
678
1288
  class SpecID::Sequest::PepXML::SearchDatabase
679
1289
  include SpecIDXML
680
1290
  attr_accessor :local_path
@@ -708,7 +1318,15 @@ end
708
1318
 
709
1319
  class SpecID::Sequest::PepXML::SpectrumQuery
710
1320
  include SpecIDXML
711
- attr_accessor :spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :index, :search_results
1321
+
1322
+ # basename_noext.first_scan.last_scan.charge
1323
+ attr_accessor :spectrum
1324
+ attr_accessor :start_scan
1325
+ attr_accessor :end_scan
1326
+ attr_accessor :precursor_neutral_mass
1327
+ attr_accessor :index
1328
+ attr_accessor :search_results
1329
+
712
1330
  # this is a string
713
1331
  attr_accessor :assumed_charge
714
1332
  attr_accessor :pepxml_version
@@ -803,6 +1421,10 @@ end
803
1421
  # this responds to flatten (so that it won't flatten).
804
1422
  class SpecID::Sequest::PepXML::SearchHit < Array
805
1423
  include SpecIDXML
1424
+
1425
+ Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
1426
+
1427
+ # num_tot_proteins = "Number of unique proteins in search database containing peptide"
806
1428
  #attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
807
1429
  #attr_accessor 13:deltacnstar
808
1430
  #attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
@@ -811,69 +1433,82 @@ class SpecID::Sequest::PepXML::SearchHit < Array
811
1433
  ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
812
1434
  @@methods = ind_keys.keys
813
1435
  def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
814
- def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
815
- def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
816
- def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
817
- def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
818
- def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
819
- def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
820
- def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
821
- def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
822
- def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
823
- def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
824
- def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
825
- def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
826
- def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
827
- def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
828
- def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
829
- def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
830
- def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
831
-
832
- @@arr_size = ind_keys.size
833
- ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
834
- ind_keys.merge!(ind_keys_w_eq)
835
- ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
836
-
837
- # These are all search_score elements:
838
-
839
- # 1 if there is no second ranked hit, 0 otherwise
840
-
841
- def initialize(hash=nil)
842
- super(@@arr_size)
843
- self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
844
- self
845
- #if hash ; set_from_hash(hash) end
846
- end
1436
+ def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
1437
+ def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
1438
+ def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
1439
+ def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
1440
+ def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
1441
+ def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
1442
+ def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
1443
+ def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
1444
+ def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
1445
+ def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
1446
+ def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
1447
+ def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
1448
+ def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
1449
+ def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
1450
+ def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
1451
+ def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
1452
+ def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
1453
+
1454
+ @@arr_size = ind_keys.size
1455
+ ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
1456
+ ind_keys.merge!(ind_keys_w_eq)
1457
+ ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
1458
+
1459
+ # These are all search_score elements:
1460
+
1461
+ # 1 if there is no second ranked hit, 0 otherwise
847
1462
 
848
- # Returns prev, peptide, next from sequence. Parse errors return
849
- # nil,nil,nil
850
- # R.PEPTIDE.A # -> R, PEPTIDE, A
851
- # R.PEPTIDE.- # -> R, PEPTIDE, -
852
- # PEPTIDE.A # -> -, PEPTIDE, A
853
- # A.PEPTIDE # -> A, PEPTIDE, -
854
- # PEPTIDE # -> nil,nil,nil
855
- def self.split_sequence(val)
856
- peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
857
- pieces = val.split(".")
858
- case pieces.size
859
- when 3
860
- peptide_prev_aa, peptide, peptide_next_aa = *pieces
861
- when 2
862
- if pieces[0].size > 1 ## N termini
863
- peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
864
- else ## C termini
865
- peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
866
- end
867
- when 1 ## this must be a parse error!
868
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
869
- when 0
870
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
871
- end
872
- return peptide_prev_aa, peptide, peptide_next_aa
873
- end
1463
def initialize(hash=nil)
  super(@@arr_size)
  # hash is optional: with no hash every slot is simply left nil instead of
  # failing on nil[:hit_rank]
  hash ||= {}
  # the order below is the index order of the accessors (0:hit_rank ..
  # 17:sprank)
  self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
  self
  #if hash ; set_from_hash(hash) end
end
1469
+
1470
+ # remove_non_amino_acids && split_sequence
1471
def self.prepare_sequence(val)
  # strip everything that is not an amino acid letter, '.' or '-' first, then
  # split the cleaned sequence into prev / peptide / next
  split_sequence(remove_non_amino_acids(val))
end
874
1475
 
875
- def inspect
876
- "#<SearchHit #{@@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")}>"
1476
# Returns prev, peptide, next from sequence.  Parse errors (anything that
# does not split on '.' into exactly two or three pieces) return
# nil,nil,nil
#   R.PEPTIDE.A  # -> R, PEPTIDE, A
#   R.PEPTIDE.-  # -> R, PEPTIDE, -
#   PEPTIDE.A    # -> -, PEPTIDE, A
#   A.PEPTIDE    # -> A, PEPTIDE, -
#   PEPTIDE      # -> nil,nil,nil
#   A.PEP.TIDE.R # -> nil,nil,nil
def self.split_sequence(val)
  pieces = val.split('.')
  case pieces.size
  when 3
    pieces
  when 2
    # a first piece longer than one residue has to be the peptide itself
    if pieces[0].size > 1  ## N termini
      ['-', pieces[0], pieces[1]]
    else  ## C termini
      [pieces[0], pieces[1], '-']
    end
  else  ## too few or too many pieces: this must be a parse error!
    [nil, nil, nil]
  end
end
1502
+
1503
# removes every character matched by Non_standard_amino_acid_char_re,
# i.e. preserves only A-Z, '.' and '-'
def self.remove_non_amino_acids(sequence)
  cleaned = sequence.gsub(Non_standard_amino_acid_char_re, '')
  cleaned
end
1508
+
1509
def inspect
  # one "name:value" field per accessor listed in @@methods
  fields = @@methods.map {|m| "#{m}:#{self.send(m)}" }
  "#<SearchHit #{fields.join(" ")}>"
end
878
1513
 
879
1514
  # requires Params object and full sequence (with heads and tails)
@@ -924,3 +1559,65 @@ def inspect
924
1559
 
925
1560
  end
926
1561
 
1562
# Positions and masses of modifications
class SpecID::Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  ## Should be something like this:
  # <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
  #   <mod_aminoacid_mass position=" " mass=" "/>
  # </modification_info>
  #
  ## e.g.
  # <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
  #   <mod_aminoacid_mass position="2" mass="545.7160"/>
  #   <mod_aminoacid_mass position="3" mass="147.1926"/>
  # </modification_info>

  # Mass of modified N terminus
  attr_accessor :mod_nterm_mass
  # Mass of modified C terminus
  attr_accessor :mod_cterm_mass
  # Peptide sequence (with indicated modifications)  I'm assuming that the
  # native sequest indicators are OK here
  attr_accessor :modified_peptide

  # this should be an array of arrays: [[position, modified_mass], ...]
  # position ranges from 1 to peptide length
  attr_accessor :mod_aminoacid_mass_array

  # hash (optional) is handed to instance_var_set_from_hash; with no hash
  # nothing is set, as in the other modification classes of this file.
  def initialize(hash=nil)
    instance_var_set_from_hash(hash) if hash
  end

  # Builds the modification_info element: one attribute for each of
  # mod_nterm_mass, mod_cterm_mass and modified_peptide that is set, and one
  # mod_aminoacid_mass child per entry of mod_aminoacid_mass_array.
  # Will escape any xml special chars in modified_peptide
  def to_pepxml
    ## Collect the modifications:
    mod_strings = []
    if @mod_aminoacid_mass_array
      mod_strings = @mod_aminoacid_mass_array.map do |ar|
        "position=\"#{ar[0]}\" mass=\"#{ar[1]}\""
      end
    end
    ## Create the attribute string:
    att_parts = []
    if @mod_nterm_mass
      att_parts << "mod_nterm_mass=\"#{@mod_nterm_mass}\""
    end
    if @mod_cterm_mass
      att_parts << "mod_cterm_mass=\"#{@mod_cterm_mass}\""
    end
    if @modified_peptide
      att_parts << "modified_peptide=\"#{escape_special_chars(@modified_peptide)}\""
    end
    element_xml_and_att_string('modification_info', att_parts.join(" ")) do
      mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
    end
  end

end
1623
+