mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -6,6 +6,74 @@ require 'set_from_hash'
6
6
  require 'spec_id/bioworks'
7
7
  require 'instance_var_set_from_hash'
8
8
  require 'spec/msrun'
9
+ require 'spec_id/srf'
10
+
11
class Numeric
  # Returns the number as a string that always carries an explicit sign:
  # non-negative values get a leading '+', negative values keep the single
  # '-' that to_s already produces (e.g. 5 => "+5", -0.704 => "-0.704").
  def to_plus_minus_string
    str = to_s
    # to_s already emits the minus sign; only a missing sign needs adding
    str[0, 1] == '-' ? str : "+#{str}"
  end
end
21
+
22
+ ##########################################
23
+ # NEED TO ADD MODIFICATIONS and generally verify pepxml creation!!! :
24
+ # HERE's an excerpt from an example file from tpp 2.9.2 that I'm going to follow:
25
+ =begin
26
+ <search_summary base_name="/regis/data3/search/akeller/LCQ/COMET/LIGHT/haloICAT2_41" search_engine="COMET" precursor_mass_type="average" fragment_mass_type="average">
27
+ <sequence_search_constraint sequence="C"/>
28
+ <aminoacid_modification aminoacid="C" massdiff="8.049" mass="553.765" variable="Y" binary="N"/>
29
+ <aminoacid_modification aminoacid="C" massdiff="442.5772" mass="545.7160" variable="N"/>
30
+ <aminoacid_modification aminoacid="M" massdiff="16.0000" mass="147.1926" variable="Y" binary="N" symbol="1"/>
31
+ <parameter name="peptide_mass_tol" value="3.0000"/>
32
+ <parameter name="peptide_mass_tol_units" value="DA"/>
33
+ <parameter name="num_output_lines" value="10"/>
34
+ <parameter name="remove_precursor_peak" value="0"/>
35
+ <parameter name="num_dup_headers" value="1"/>
36
+ <parameter name="email_address" value=""/>
37
+ <parameter name="ion_series" value="010000010"/>
38
+ <parameter name="max_num_var_mod_residues" value="3"/>
39
+ <parameter name="md5_check_sum" value="2547286a77a35abe2af3f2e9825ab814"/>
40
+ </search_summary>
41
+ =end
42
+
43
+ # and a guy with modifications:
44
+ =begin
45
+ <search_result spectrum="haloICAT2_41.1110.1110.2" start_scan="1110" end_scan="1110" precursor_neutral_mass="2000.6641" assumed_charge="2" index="28">
46
+ <search_hit hit_rank="1" peptide="GCMPSKEVLSAGAHR" peptide_prev_aa="R" peptide_next_aa="Y" protein="Chr_ORF0132" num_tot_proteins="1" num_matched_ions="19" tot_num_ions="30" calc_neutral_pep_mass="2001.3685" massdiff="-0.704" num_tol_term="2" num_missed_cleavages="1" is_rejected="0">
47
+ <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
48
+ <mod_aminoacid_mass position="2" mass="545.7160"/>
49
+ <mod_aminoacid_mass position="3" mass="147.1926"/>
50
+ </modification_info>
51
+ <search_score name="dotproduct" value="359"/>
52
+ <search_score name="delta" value="0.296"/>
53
+ <search_score name="deltastar" value="0"/>
54
+ <search_score name="zscore" value="5.290"/>
55
+ <search_score name="expect" value="0.000E+00"/>
56
+ <peptideprophet_result probability="0.9994" all_ntt_prob="(0.3713,0.4360,0.9994)">
57
+ <search_score_summary>
58
+ <parameter name="fval" value="3.4002"/>
59
+ <parameter name="ntt" value="2"/>
60
+ <parameter name="nmc" value="1"/>
61
+ <parameter name="massd" value="-0.704"/>
62
+ </search_score_summary>
63
+ </peptideprophet_result>
64
+ =end
65
+
66
+ # sequest.params option:
67
+ # diff_search_options = 15.994910 M 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y
68
+ # permanent mods are at the bottom: ...
69
+ # add_A_Alanine = 0.0000 ; added to A
70
+ # add_S_Serine = 0.0000 ; added to S
71
+ # add_P_Proline = 0.0000 ; added to P
72
+ # add_V_Valine = 0.0000 ; added to V
73
+ # add_T_Threonine = 0.0000 ; added to T
74
+ # ...
75
+
76
+
9
77
 
10
78
  module SpecID::Sequest; end
11
79
  class SpecID::Sequest::PepXML; end
@@ -26,8 +94,12 @@ class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
26
94
  @xmlns = nil
27
95
  @xmlns_xsi = nil
28
96
  @xsi_schema_location = nil
29
- self.set_from_hash(hash)
30
- @msms_run_summary = yield
97
+ if hash
98
+ self.set_from_hash(hash)
99
+ end
100
+ if block_given?
101
+ @msms_run_summary = yield
102
+ end
31
103
  end
32
104
 
33
105
  # if no date string given, then it will set to Time.now
@@ -80,7 +152,8 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
80
152
  include SpecIDXML
81
153
 
82
154
  # the version of TPP you are using (determines xml output)
83
- # The name of the pep xml file (without extension)
155
+ # The name of the pep xml file (without extension) (but this is a long
156
+ # filename!!!)
84
157
  attr_accessor :base_name
85
158
  # The name of the mass spec manufacturer
86
159
  attr_accessor :ms_manufacturer
@@ -104,7 +177,9 @@ class SpecID::Sequest::PepXML::MSMSRunSummary
104
177
  # set to the return value of the block
105
178
  def initialize(hash=nil)
106
179
  @spectrum_queries = []
107
- instance_var_set_from_hash(hash)
180
+ if hash
181
+ instance_var_set_from_hash(hash)
182
+ end
108
183
  if block_given? ; @spectrum_queries = yield end
109
184
  end
110
185
 
@@ -137,6 +212,8 @@ end
137
212
 
138
213
  class SpecID::Sequest::PepXML
139
214
  include SpecIDXML
215
+
216
+ ## CREATE a default version for the entire class
140
217
  class << self
141
218
  attr_accessor :pepxml_version
142
219
  end
@@ -144,7 +221,11 @@ class SpecID::Sequest::PepXML
144
221
  self.pepxml_version = DEF_VERSION # default version
145
222
 
146
223
  attr_accessor :pepxml_version, :msms_pipeline_analysis
224
+ ## the full path name (no extension)
147
225
  attr_accessor :base_name
226
+ attr_accessor :h_plus
227
+ attr_accessor :avg_parent
228
+
148
229
  #attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
149
230
 
150
231
  # returns an array of spectrum queries
@@ -153,10 +234,29 @@ class SpecID::Sequest::PepXML
153
234
  end
154
235
 
155
236
  # msms_pipeline_analysis is set to the result of the yielded block
156
- def initialize(pepxml_version=DEF_VERSION)
237
+ # and set_mono_or_avg is called with params if given
238
+ def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
157
239
  self.class.pepxml_version = pepxml_version
158
- @msms_pipeline_analysis = yield
159
- @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
240
+ if sequest_params_obj
241
+ set_mono_or_avg(sequest_params_obj)
242
+ end
243
+ if block_given?
244
+ @msms_pipeline_analysis = yield
245
+ @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
246
+ end
247
+ end
248
+
249
# Sets @avg_parent and @h_plus from a sequest params object.
#
# sequest_params_obj must respond to #precursor_mass_type. Any value other
# than "monoisotopic" is treated as average mass. @h_plus is read from the
# :h_plus entry of SpecID::AVG or SpecID::MONO accordingly.
# Returns the value assigned to @h_plus.
def set_mono_or_avg(sequest_params_obj)
  @avg_parent = sequest_params_obj.precursor_mass_type != "monoisotopic"
  @h_plus = (@avg_parent ? SpecID::AVG : SpecID::MONO)[:h_plus]
end
161
261
 
162
262
  def date
@@ -203,6 +303,190 @@ class SpecID::Sequest::PepXML
203
303
  end
204
304
  end
205
305
 
306
+
307
+ Default_Options = {
308
+ :out_path => nil,
309
+ :backup_db_path => '/project/marcotte/marcotte/ms/database',
310
+ # a PepXML option
311
+ :pepxml_version => DEF_VERSION,
312
+ ## MSMSRunSummary options:
313
+ # string must be recognized in sample_enzyme.rb
314
+ # or create your own SampleEnzyme object
315
+ :sample_enzyme => 'trypsin',
316
+ :ms_manufacturer => 'ThermoFinnigan',
317
+ :ms_model => 'LCQ Deca XP',
318
+ :ms_ionization => 'ESI',
319
+ :ms_mass_analyzer => 'Ion Trap',
320
+ :ms_detector => 'UNKNOWN',
321
+ :raw_data_type => "raw",
322
+ :raw_data => ".mzXML", ## even if you don't have it?
323
+ ## SearchSummary options:
324
+ :out_data_type => "out", ## may be srf?? don't think pepxml recognizes this yet
325
+ :out_data => ".tgz" ## may be srf??
326
+ }
327
+
328
+ # will dynamically set :ms_model and :ms_mass_analyzer from srf info
329
+ # (ignoring defaults or anything passed in) for LTQ Orbitrap
330
+ # and LCQ Deca XP
331
+ # See SpecID::Sequest::PepXML::Default_Options hash for defaults
332
+ # unless given, the out_path will be given as the path of the srf_file
333
+ def self.new_from_srf(srf_file, opts={})
334
+ opts = Default_Options.merge(opts)
335
+
336
+ ## set the outpath
337
+ out_path = opts.delete(:out_path)
338
+ unless out_path
339
+ out_path = File.dirname(srf_file)
340
+ end
341
+
342
+ ## read the srf file
343
+ srf = SRF.new(srf_file)
344
+
345
+ params = srf.params
346
+
347
+ ## check to see if we need backup_db
348
+ backup_db_path = opts.delete(:backup_db_path)
349
+ unless File.exist? params.database
350
+ params.database_path = backup_db_path
351
+ end
352
+
353
+ #######################################################################
354
+ # PREPARE THE OPTIONS:
355
+ #######################################################################
356
+ ## remove items from the options hash that don't belong to MSMSRunSummary (opts is passed to MSMSRunSummary.new below)
357
+ ppxml_version = opts.delete(:pepxml_version)
358
+ out_data_type = opts.delete(:out_data_type)
359
+ out_data = opts.delete(:out_data)
360
+
361
+ ## Extract meta info from srf
362
+ bn_noext = base_name_noext(srf.header.raw_filename)
363
+ opts[:ms_model] = srf.header.model
364
+ case opts[:ms_model]
365
+ when /Orbitrap/
366
+ opts[:ms_mass_analyzer] = 'Orbitrap'
367
+ when /LCQ Deca XP/
368
+ opts[:ms_mass_analyzer] = 'Ion Trap'
369
+ end
370
+
371
+ ## Create the base name
372
+ full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
373
+ opts[:base_name] = full_base_name_no_ext
374
+
375
+ ## Create the search summary:
376
+ search_summary_options = {
377
+ :search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params),
378
+ :base_name => full_base_name_no_ext,
379
+ :out_data_type => out_data_type,
380
+ :out_data => out_data
381
+ }
382
+ opts[:search_summary] = SpecID::Sequest::PepXML::SearchSummary.new( params, search_summary_options)
383
+
384
+ ## Create the SampleEnzyme object if necessary
385
+ unless opts[:sample_enzyme].is_a? SampleEnzyme
386
+ opts[:sample_enzyme] = SampleEnzyme.new(opts[:sample_enzyme])
387
+ end
388
+
389
+ ## Create the pepxml obj
390
+ pepxml_obj = SpecID::Sequest::PepXML.new(ppxml_version, params)
391
+ ## name some common variables we'll need
392
+ h_plus = pepxml_obj.h_plus
393
+ avg_parent = pepxml_obj.avg_parent
394
+
395
+ #######################################################################
396
+ # CREATE the spectrum_queries_ar
397
+ #######################################################################
398
+ srf_index = srf.index
399
+ out_files = srf.out_files
400
+ spectrum_queries_arr = Array.new(srf.dta_files.size)
401
+ files_with_hits_index = 0 ## will end up being 1 indexed
402
+ srf.dta_files.each_with_index do |dta_file,i|
403
+ next if out_files[i].num_hits == 0
404
+ files_with_hits_index += 1
405
+
406
+ # Sort the hits
407
+ hits = out_files[i].hits
408
+ arr = hits.sort_by{|v| v.xcorr }
409
+
410
+ # Get proper deltacn and deltacnstar
411
+ # Prophet deltacn is not the same as the native Sequest deltacn
412
+ # It is the deltacn of the second best hit!
413
+ top_hit = arr.pop
414
+ second_hit = arr.last
415
+ if second_hit
416
+ top_hit[1] = second_hit[1]
417
+ deltacnstar = '0'
418
+ else
419
+ top_hit[1] = '1.0'
420
+ deltacnstar = '1'
421
+ end
422
+
423
+ ## mass calculations:
424
+ precursor_neutral_mass = dta_file.mh - h_plus
425
+ calc_neutral_pep_mass = top_hit[0] - h_plus
426
+ massdiff = precursor_neutral_mass - calc_neutral_pep_mass
427
+ if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
428
+ else ; massdiff = massdiff.to_s end
429
+
430
+ (start_scan, end_scan, charge) = srf_index[i]
431
+ sq_hash = {
432
+ :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
433
+ :start_scan => start_scan,
434
+ :end_scan => end_scan,
435
+ :precursor_neutral_mass => precursor_neutral_mass,
436
+ :assumed_charge => charge,
437
+ :pepxml_version => ppxml_version,
438
+ :index => files_with_hits_index,
439
+ }
440
+
441
+ # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
442
+ ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
443
+ (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_hit[8])
444
+ # ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
445
+
446
+ sh_hash = {
447
+ :hit_rank => "1",
448
+ :peptide => pepseq,
449
+ :peptide_prev_aa => prevaa,
450
+ :peptide_next_aa => nextaa,
451
+ :protein => top_hit[9].split(" ").first,
452
+ :num_tot_proteins => top_hit[10],
453
+ :num_matched_ions => top_hit[6],
454
+ :tot_num_ions => top_hit[7],
455
+ :calc_neutral_pep_mass => calc_neutral_pep_mass,
456
+ :massdiff => massdiff,
457
+ :num_tol_term => SpecID::Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_hit[8]),
458
+ :num_missed_cleavages => SpecID::Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_hit[8]),
459
+ :is_rejected => '0',
460
+ # These are search score attributes:
461
+ :xcorr => top_hit[3],
462
+ :deltacn => top_hit[1],
463
+ :deltacnstar => deltacnstar,
464
+ :spscore => top_hit[2],
465
+ :sprank => top_hit[5],
466
+ }
467
+
468
+ spectrum_queries_arr[files_with_hits_index] = SpecID::Sequest::PepXML::SpectrumQuery.new(sq_hash) do
469
+ search_result = SpecID::Sequest::PepXML::SearchResult.new do
470
+ [ SpecID::Sequest::PepXML::SearchHit.new(sh_hash) ] # there can be multiple hits
471
+ end # SearchResult
472
+ [search_result] # can be multiple
473
+ end
474
+ end
475
+ spectrum_queries_arr.compact!
476
+
477
+ #######################################################################
478
+ # ADD the pipeline analysis
479
+ #######################################################################
480
+
481
+ pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'}) do
482
+ SpecID::Sequest::PepXML::MSMSRunSummary.new(opts) { spectrum_queries_arr }
483
+ end
484
+ pepxml_obj.msms_pipeline_analysis = pipeline
485
+ pepxml_obj.base_name = pipeline.msms_run_summary.base_name
486
+ pepxml_obj
487
+ end
488
+
489
+ # Takes bioworks 3.2/3.3 xml output (with no filters)
206
490
  # Returns a list of PepXML objects
207
491
  # msdata = path to mzXML files (or .timeIndex files) (or @TODO: path to sqt file(s))
208
492
  # params = sequest.params file
@@ -246,6 +530,7 @@ class SpecID::Sequest::PepXML
246
530
 
247
531
  ## Create a hash of spectrum_query arrays by filename (this very big block):
248
532
  spectrum_queries_by_base_name = {}
533
+ pepxml_objs_by_base_name = {}
249
534
  # Hash by the filenames to split into filenames:
250
535
  bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
251
536
 
@@ -262,7 +547,10 @@ class SpecID::Sequest::PepXML
262
547
  abort "invalid BioworksBrowser version: #{x}"
263
548
  end
264
549
 
265
- spectrum_queries = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
550
+ pepxml_obj = SpecID::Sequest::PepXML.new(pepxml_version, params)
551
+ pepxml_objs_by_base_name[base_name] = pepxml_obj
552
+
553
+ spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
266
554
 
267
555
 
268
556
  # Sort_by_rank and take the top hit (to mimic out2summary):
@@ -270,28 +558,18 @@ class SpecID::Sequest::PepXML
270
558
  top_pep = arr.pop
271
559
  second_hit = arr.last # needed for deltacnstar
272
560
 
273
- case params.precursor_mass_type
274
- when "monoisotopic" ; avg_parent = false
275
- else ; avg_parent = true
276
- end
277
-
278
- case avg_parent
279
- when true ; h_plus = SpecID::AVG[:h_plus]
280
- when false ; h_plus = SpecID::MONO[:h_plus]
281
- end
282
561
 
283
-
284
562
  case calc_prec_by
285
563
  when :prec_mz_arr
286
- precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent)
564
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, pepxml_obj.avg_parent)
287
565
  when :deltamass
288
- precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent)
566
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
289
567
  end
290
568
 
291
- calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus)
569
+ calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
292
570
  massdiff = precursor_neutral_mass - calc_neutral_pep_mass
293
571
  if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
294
- else ; massdiff = massdiff.to_s end
572
+ else ; massdiff = massdiff.to_s end #already has a -
295
573
  # deltacn & star:
296
574
  # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
297
575
  if second_hit
@@ -317,7 +595,7 @@ class SpecID::Sequest::PepXML
317
595
  # NOTE: the bioworks mass is really M+H if two or more scans went
318
596
  # into the search_hit; calc_neutral_pep_mass is simply the avg of
319
597
  # precursor masses adjusted to be neutral
320
- (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.split_sequence(top_pep.sequence)
598
+ (prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_pep.sequence)
321
599
  (num_matched_ions, tot_num_ions) = SpecID::Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
322
600
  search_hit = SpecID::Sequest::PepXML::SearchHit.new({
323
601
  :hit_rank => "1",
@@ -348,32 +626,36 @@ class SpecID::Sequest::PepXML
348
626
 
349
627
  # create an index by spectrum as results end up typically in out2summary
350
628
  # (I really dislike this order, however)
351
- spectrum_queries = spectrum_queries.sort_by {|pep| pep.spectrum }
352
- spectrum_queries.each_with_index {|res,index| res.index = "#{index + 1}" }
629
+ spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
630
+ spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
353
631
 
354
- spectrum_queries_by_base_name[base_name] = spectrum_queries
632
+ spectrum_queries_by_base_name[base_name] = spectrum_queries_ar
355
633
  end
356
634
 
357
- spectrum_queries_by_base_name.collect do |base_name, spectrum_queries|
635
+ modifications_string = bioworks.modifications
636
+
637
+ spectrum_queries_by_base_name.collect do |base_name, spectrum_queries_ar|
358
638
  case pepxml_version
359
639
  when 18
360
- SpecID::Sequest::PepXML.new(pepxml_version) do
361
- SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
362
- full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
363
- SpecID::Sequest::PepXML::MSMSRunSummary.new({
364
- :base_name => full_base_name_no_ext,
365
- :ms_manufacturer => ms_manufacturer,
366
- :ms_model => ms_model,
367
- :ms_ionization => ms_ionization,
368
- :ms_mass_analyzer => ms_mass_analyzer,
369
- :ms_detector => ms_detector,
370
- :raw_data_type => raw_data_type,
371
- :raw_data => raw_data,
372
- :sample_enzyme => SampleEnzyme.new(sample_enzyme),
373
- :search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
374
- }) do spectrum_queries end
375
- end
640
+ pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
641
+ full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
642
+ SpecID::Sequest::PepXML::MSMSRunSummary.new({
643
+ :base_name => full_base_name_no_ext,
644
+ :ms_manufacturer => ms_manufacturer,
645
+ :ms_model => ms_model,
646
+ :ms_ionization => ms_ionization,
647
+ :ms_mass_analyzer => ms_mass_analyzer,
648
+ :ms_detector => ms_detector,
649
+ :raw_data_type => raw_data_type,
650
+ :raw_data => raw_data,
651
+ :sample_enzyme => SampleEnzyme.new(sample_enzyme),
652
+ :search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
653
+ }) { spectrum_queries_ar }
376
654
  end
655
+ pepxml_obj = pepxml_objs_by_base_name[base_name]
656
+ pepxml_obj.msms_pipeline_analysis = pipeline
657
+ pepxml_obj.base_name = pipeline.msms_run_summary.base_name
658
+ pepxml_obj
377
659
  when 0
378
660
  ## @TODO: NEED TO REVAMP THIS:
379
661
  # SpecID::Sequest::PepXML.new(pepxml_version).set_from_hash({
@@ -429,6 +711,14 @@ class SpecID::Sequest::PepXML
429
711
  string
430
712
  end
431
713
 
714
# Given any kind of filename (Windows or Unix style separators) returns the
# base of the filename with no file extension.
#
# Backslashes are converted to forward slashes on a copy, so the caller's
# string is left untouched (and frozen strings are accepted).
# NOTE(review): the character class in the regex also matches '.', so
# everything from the first '.' of the basename is stripped
# ("a.tar.gz" => "a"); kept as-is to preserve existing behavior.
def self.base_name_noext(file)
  unix_style = file.gsub("\\", '/')
  File.basename(unix_style).sub(/\.[\w^\.]+$/, '')
end
720
+
721
+
432
722
  end # PepXML
433
723
 
434
724
  ##
@@ -461,6 +751,15 @@ class SpecID::Sequest::Params
461
751
  one,two = line.split @@param_re
462
752
  two,comment = two.split @@param_two_split
463
753
  hash[one] = two.rstrip
754
+ # it is necessary to add this break so that params files inside srf
755
+ # files can be read. This will terminate the reading at the end of
756
+ # the file even though there are more lines
757
+ if line =~ /added to U/ || line =~ /digest_mass_range/## Will only work on bioworks 3.2 & 3.3 (bioworks 3.1 last line => Elastase/Tryp...)
758
+ break
759
+ end
760
+ if line =~ /digest_mass_range/ # there is no space in the srf params files
761
+ break
762
+ end
464
763
  else
465
764
  break
466
765
  end
@@ -468,17 +767,26 @@ class SpecID::Sequest::Params
468
767
  hash
469
768
  end
470
769
 
770
# Parses sequest params from an already opened IO-like handle.
#
# The first line (the "[SEQUEST]" header) is read and discarded, then two
# consecutive grab_params sections are read into @opts and @mods.
# @opts["search_engine"] is set to "SEQUEST" and a trailing ".hdr" (indexed
# databases) is removed from @opts["first_database_name"] when that key is
# present.
# Returns self.
def parse_handle(fh)
  fh.gets # skip the [SEQUEST] header line
  @opts = grab_params(fh)
  @opts["search_engine"] = "SEQUEST"
  @mods = grab_params(fh)

  ## this gets rid of the .hdr postfix on indexed databases
  database_name = @opts["first_database_name"]
  @opts["first_database_name"] = database_name.sub(/\.hdr$/, '') if database_name
  self
end
781
+
471
782
  ## parses file
472
783
  ## and drops the .hdr behind indexed fasta files
784
+ ## returns self
473
785
  def parse(file)
474
786
  File.open(file) do |fh|
475
- sequest_line = fh.gets #[SEQUEST]
476
- @opts = grab_params(fh)
477
- @opts["search_engine"] = "SEQUEST"
478
- @mods = grab_params(fh)
787
+ parse_handle(fh)
479
788
  end
480
- ## this gets rid of the .hdr postfix on indexed databases
481
- @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
789
+ self
482
790
  end
483
791
 
484
792
  # returns( split_after, except_before)
@@ -569,6 +877,17 @@ class SpecID::Sequest::Params
569
877
  @opts["first_database_name"]
570
878
  end
571
879
 
880
# Returns the amino acid mass lookup table selected by precursor_mass_type:
# SpecID::MONO for 'monoisotopic', SpecID::AVG otherwise.
#
# Falling back to the average table (instead of returning nil) matches
# PepXML#set_mono_or_avg, which treats every mass type other than
# "monoisotopic" as average.
def mass_table
  case precursor_mass_type
  when 'monoisotopic'
    SpecID::MONO
  else
    SpecID::AVG
  end
end
890
+
572
891
  # at least in Bioworks 3.2, the First number after the enzyme
573
892
  # is the indication of the enzymatic end stringency (required):
574
893
  # 1 = Fully enzymatic
@@ -628,7 +947,7 @@ class SpecID::Sequest::PepXML::SearchResult
628
947
  attr_accessor :search_hits
629
948
 
630
949
  # if block given, then search_hits set to return value
631
- def initialize()
950
+ def initialize
632
951
  if block_given? ; @search_hits = yield
633
952
  else ; @search_hits = [] end
634
953
  end
@@ -646,13 +965,16 @@ class SpecID::Sequest::PepXML::SearchSummary
646
965
  attr_accessor :base_name
647
966
  attr_accessor :out_data_type
648
967
  attr_accessor :out_data
968
+ attr_accessor :modifications
649
969
  # A SearchDatabase object (responds to :local_path and :type)
650
970
  attr_accessor :search_database
651
971
  # if given a sequest params object, then will set the following attributes:
652
972
  # args is a hash of parameters
653
- def initialize(params=nil, args=nil)
973
+ # modifications_string -> See Modifications
974
+ def initialize(params, modifications_string='', args=nil)
654
975
  @search_id = nil
655
976
  @params = params
977
+ @modifications = SpecID::Sequest::PepXML::Modifications.new(params, modifications_string)
656
978
  if args ; set_from_hash(args) end
657
979
  end
658
980
 
@@ -665,16 +987,304 @@ class SpecID::Sequest::PepXML::SearchSummary
665
987
  else ; '1' end
666
988
  end
667
989
 
990
+
668
991
  def to_pepxml
669
992
  element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
670
993
  search_database.to_pepxml +
671
994
  short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
995
+ @modifications.to_pepxml +
672
996
  @params.pepxml_parameters
673
997
  end
674
998
  end
675
999
 
676
1000
  end
677
1001
 
1002
+ class SpecID::Sequest::PepXML::Modifications
1003
+ include SpecIDXML
1004
+
1005
+ # sequest params object
1006
+ attr_accessor :params
1007
+ # array holding AAModifications
1008
+ attr_accessor :aa_mods
1009
+ # array holding TerminalModifications
1010
+ attr_accessor :term_mods
1011
+ # a hash of all differential modifications present by aa_one_letter_symbol
1012
+ # and special_symbol. This is NOT the mass difference but the total mass {
1013
+ # 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
1014
+ # the amino acid sequence, they are give the *differential* mass. The
1015
+ # termini are given the special symbol as in sequest e.g. '[' => 12.22, #
1016
+ # cterminus ']' => 14.55 # nterminus
1017
+ attr_accessor :masses_by_diff_mod_hash
1018
+ # a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
1019
+ # values are the special_symbols
1020
+ attr_accessor :mod_symbols_hash
1021
+
1022
+ # The modification symbols string looks like this:
1023
+ # (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
1024
+ # ct is cterminal peptide (differential)
1025
+ # nt is nterminal peptide (differential)
1026
+ # the C is just cysteine
1027
+ # will set_modifications and masses_by_diff_mod hash
1028
+ def initialize(params, modification_symbols_string='')
1029
+ @params = params
1030
+ set_modifications(params, modification_symbols_string)
1031
+ end
1032
+
1033
# Sets @mod_symbols_hash and @masses_by_diff_mod from the modification
# symbols string, e.g.
#   "(M* +15.90000) (S@ +80.00000) (ct[ +12.33000) (nt] +14.20000)"
#
# @mod_symbols_hash:   [residue_as_symbol, diff_as_float] => special symbol
# @masses_by_diff_mod: for residues, "M*" => residue mass + diff (residue
#                      mass looked up in @params.mass_table); for the
#                      termini ('ct'/'nt'), special symbol => diff only.
# Both hashes are left empty and nil is returned when the string is nil or
# empty.
def set_hashes(modification_symbols_string)
  @mod_symbols_hash = {}
  @masses_by_diff_mod = {}
  if modification_symbols_string == nil || modification_symbols_string == ''
    return nil
  end
  table = @params.mass_table
  modification_symbols_string.split(/\)\s+\(/).each do |mod|
    next unless mod =~ /\(?(\w{1,2})(.) (.[\d\.]+)\)?/
    # copy the captures right away: later regex matches overwrite $1..$3
    residue = $1
    symbol = $2
    diff = $3.to_f
    aa_as_sym = residue.to_sym
    @mod_symbols_hash[[aa_as_sym, diff]] = symbol.dup
    if residue == 'ct' || residue == 'nt'
      # termini: keyed by the special symbol alone, differential mass only
      @masses_by_diff_mod[symbol] = diff
    else
      @masses_by_diff_mod[residue + symbol] = diff + table[aa_as_sym]
    end
  end
end
1053
+
1054
# Given a bare peptide (no flanking residues) returns a ModificationInfo
# object, e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'.
# Returns nil if @masses_by_diff_mod is empty or no modification is found
# in the peptide.
#
# The hash handed to ModificationInfo.new may contain:
#   :modified_peptide          copy of the peptide exactly as given
#   :mod_nterm_mass            set when the first character is a key of
#                              @masses_by_diff_mod
#   :mod_cterm_mass            set when the last character is a key of
#                              @masses_by_diff_mod
#   :mod_aminoacid_mass_array  [[position, mass], ...] where position is the
#                              1-based index of the residue counting letters
#                              only (modification symbols are not counted)
# The peptide argument is never modified.
def modification_info(peptide)
  hsh = @masses_by_diff_mod
  if hsh.size == 0
    return nil
  end
  table = @params.mass_table
  h = table[:h] # this? or h_plus ??
  oh = table[:o] + h
  info = {}
  bare = peptide.dup # work on a copy so the caller's string survives

  ## only the termini can match a single char
  if hsh.key? bare[0,1]
    # AA + H + differential_mod
    info[:mod_nterm_mass] = table[bare[1,1].to_sym] + h + hsh[bare[0,1]]
    bare = bare[1..-1] # drop the leading terminal symbol
  end
  if hsh.key? bare[-1,1]
    # AA + OH + differential_mod
    # NOTE(review): assumes the residue just before the terminal symbol
    # carries no modification symbol of its own -- confirm
    info[:mod_cterm_mass] = table[bare[-2,1].to_sym] + oh + hsh[bare[-1,1]]
    bare = bare[0..-2] # drop the trailing terminal symbol
  end

  mod_array = []
  residue_position = 0
  (0...bare.size).each do |i|
    # count letters only so earlier modification symbols do not shift
    # the reported position of later residues
    residue_position += 1 if bare[i,1] =~ /[A-Za-z]/
    if hsh.key? bare[i,2]
      mod_array << [ residue_position, hsh[bare[i,2]] ]
    end
  end
  if mod_array.size > 0
    info[:mod_aminoacid_mass_array] = mod_array
  end

  return nil if info.empty?
  info[:modified_peptide] = peptide.dup
  SpecID::Sequest::PepXML::SearchHit::ModificationInfo.new(info)
end
1093
+
1094
# Builds the static and variable (differential) modification objects
# described by a sequest params object.
#
# 1. sets @aa_mods and @term_mods from a sequest params object
# 2. sets @params
# 3. sets @masses_by_diff_mod
#    NOTE(review): nothing in this method assigns @masses_by_diff_mod;
#    presumably set_hashes does it -- confirm.
#
# params:: must respond to mods, mass_table, diff_search_options and
#          term_diff_search_options
# modification_symbols_string:: handed unchanged to set_hashes
#
# Returns the combined terminal modification array (@term_mods).
def set_modifications(params, modification_symbols_string)
  @params = params

  set_hashes(modification_symbols_string)

  ####################################
  ## static mods
  ####################################

  static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
  static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

  params.mods.each do |k, v|
    v_to_f = v.to_f
    next if v_to_f == 0.0 # a zero mass shift means "no modification"
    if k =~ /add_(\w)_/
      # single-letter keys (add_C_...) are amino acid mods
      static_mods << [$1.to_sym, v_to_f]
    else
      # everything else (add_Cterm_peptide, add_Nterm_protein, ...) is terminal
      static_terminal_mods << [k, v_to_f]
    end
  end
  aa_hash = params.mass_table

  ## Create the static_mods objects
  static_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'N',
      :binary => 'Y',
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  ## Create the static_terminal_mods objects
  static_terminal_mods.map! do |mod|
    # only two possible termini
    terminus = (mod[0] =~ /Cterm/) ? 'c' : 'n'
    protein_terminus = case mod[0]
                       when /Nterm_protein/ then 'n'
                       when /Cterm_protein/ then 'c'
                       else nil
                       end

    # create the hash
    hash = {
      :terminus => terminus,
      :massdiff => mod[1].to_plus_minus_string,
      :variable => 'N',
      :description => mod[0],
    }
    hash[:protein_terminus] = protein_terminus if protein_terminus
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #################################
  # Variable Mods:
  #################################
  # Tokens come in "<mass_diff> <amino_acid>" pairs.  strip (not just
  # rstrip) so that leading whitespace cannot shift the pairing.
  arr = params.diff_search_options.strip.split(/\s+/)
  # [aa.to_sym, diff.to_f]
  variable_mods = []
  (0...arr.size).step(2) do |i|
    residue = arr[i+1]
    next unless residue # dangling mass with no amino acid: ignore
    diff = arr[i].to_f
    variable_mods << [residue.to_sym, diff] if diff != 0.0
  end
  variable_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'Y',
      :binary => 'N',
      :symbol => @mod_symbols_hash[mod],
    }
    SpecID::Sequest::PepXML::AAModification.new(hash)
  end

  #################################
  # TERMINAL Variable Mods:
  #################################
  # These are always peptide, not protein termini (for sequest)
  (nterm_diff, cterm_diff) = params.term_diff_search_options.strip.split(/\s+/).map {|v| v.to_f }

  # A missing value (nil) is treated the same as 0.0: no modification.
  # The symbol lookup key is the array [terminus_symbol, diff], mirroring
  # the [aa_symbol, diff] keys used for the amino acid mods above.
  to_add = []
  if nterm_diff && nterm_diff != 0.0
    to_add << ['n', nterm_diff.to_plus_minus_string, @mod_symbols_hash[[:nt, nterm_diff]]]
  end
  if cterm_diff && cterm_diff != 0.0
    to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[[:ct, cterm_diff]]]
  end

  variable_terminal_mods = to_add.map do |term, mssdiff, symb|
    hash = {
      :terminus => term,
      :massdiff => mssdiff,
      :variable => 'Y',
      :symbol => symb,
    }
    SpecID::Sequest::PepXML::TerminalModification.new(hash)
  end

  #########################
  # COLLECT THEM
  #########################
  @aa_mods = static_mods + variable_mods
  @term_mods = static_terminal_mods + variable_terminal_mods
end
1206
+
1207
## Builds the pepxml string for the collected modifications: every amino
## acid mod (static and differential) first, then every terminal mod.
## A group that has not been set contributes nothing.
def to_pepxml
  xml = ''
  [@aa_mods, @term_mods].each do |mods|
    next unless mods
    xml << mods.map {|mod| mod.to_pepxml }.join
  end
  xml
end
1219
+
1220
+ end
1221
+
1222
# A modified amino acid (static or variable), written out as an
# "aminoacid_modification" element.
# Unless otherwise stated, all attributes can be anything.
class SpecID::Sequest::PepXML::AAModification
  include SpecIDXML

  # One letter code of the amino acid
  attr_accessor :aminoacid
  # Mass difference with respect to the unmodified aminoacid.
  # Must be a String that begins with either + (nonnegative) or -
  # [e.g. +1.05446 or -2.3342]; consider Numeric#to_plus_minus_string
  attr_accessor :massdiff
  # Mass of the modified aminoacid
  attr_accessor :mass
  # Y if both modified and unmodified aminoacid could be present in the
  # dataset, N if only the modified aminoacid can be present
  attr_accessor :variable
  # whether modification can reside only at protein terminus (specified
  # 'n', 'c', or 'nc')
  attr_accessor :peptide_terminus
  # Special symbol used by the search engine to designate this modification
  attr_accessor :symbol
  # Y if each peptide must have only modified or unmodified aminoacid, N if
  # a peptide may contain both modified and unmodified aminoacid
  attr_accessor :binary

  # hash:: optional attribute values, applied through
  #        instance_var_set_from_hash; nothing is set when omitted
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # Returns the xml produced by short_element_xml_from_instance_vars for an
  # "aminoacid_modification" element.
  def to_pepxml
    short_element_xml_from_instance_vars("aminoacid_modification")
  end
end
1257
+
1258
# A modified peptide/protein terminus (static or variable), written out as
# a "terminal_modification" element.
class SpecID::Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus
  # Mass difference with respect to the unmodified terminus
  attr_accessor :massdiff
  # Mass of the modified terminus
  attr_accessor :mass
  # Y if both modified and unmodified terminus could be present in the
  # dataset, N if only the modified terminus can be present
  attr_accessor :variable
  # Special symbol used by the search engine to designate this modification
  attr_accessor :symbol
  # whether modification can reside only at protein terminus (specified n
  # or c)
  attr_accessor :protein_terminus
  attr_accessor :description

  # hash:: optional attribute values, applied through
  #        instance_var_set_from_hash; nothing is set when omitted
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # Returns the xml produced by short_element_xml_from_instance_vars for a
  # "terminal_modification" element.
  def to_pepxml
    short_element_xml_from_instance_vars("terminal_modification")
  end
end
1286
+
1287
+
678
1288
  class SpecID::Sequest::PepXML::SearchDatabase
679
1289
  include SpecIDXML
680
1290
  attr_accessor :local_path
@@ -708,7 +1318,15 @@ end
708
1318
 
709
1319
  class SpecID::Sequest::PepXML::SpectrumQuery
710
1320
  include SpecIDXML
711
- attr_accessor :spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :index, :search_results
1321
+
1322
+ # basename_noext.first_scan.last_scan.charge
1323
+ attr_accessor :spectrum
1324
+ attr_accessor :start_scan
1325
+ attr_accessor :end_scan
1326
+ attr_accessor :precursor_neutral_mass
1327
+ attr_accessor :index
1328
+ attr_accessor :search_results
1329
+
712
1330
  # this is a string
713
1331
  attr_accessor :assumed_charge
714
1332
  attr_accessor :pepxml_version
@@ -803,6 +1421,10 @@ end
803
1421
  # this responds to flatten (so that it won't flatten).
804
1422
  class SpecID::Sequest::PepXML::SearchHit < Array
805
1423
  include SpecIDXML
1424
+
1425
+ Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
1426
+
1427
+ # num_tot_proteins = "Number of unique proteins in search database containing peptide"
806
1428
  #attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
807
1429
  #attr_accessor 13:deltacnstar
808
1430
  #attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
@@ -811,69 +1433,82 @@ class SpecID::Sequest::PepXML::SearchHit < Array
811
1433
  ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
812
1434
  @@methods = ind_keys.keys
813
1435
  def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
814
- def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
815
- def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
816
- def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
817
- def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
818
- def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
819
- def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
820
- def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
821
- def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
822
- def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
823
- def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
824
- def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
825
- def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
826
- def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
827
- def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
828
- def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
829
- def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
830
- def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
831
-
832
- @@arr_size = ind_keys.size
833
- ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
834
- ind_keys.merge!(ind_keys_w_eq)
835
- ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
836
-
837
- # These are all search_score elements:
838
-
839
- # 1 if there is no second ranked hit, 0 otherwise
840
-
841
- def initialize(hash=nil)
842
- super(@@arr_size)
843
- self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
844
- self
845
- #if hash ; set_from_hash(hash) end
846
- end
1436
# Positional accessors: each reader/writer pair exposes one fixed slot of
# the underlying array under a field name (slot numbers match ind_keys).
def peptide; self[1]; end
def peptide=(oth); self[1] = oth; end

def peptide_prev_aa; self[2]; end
def peptide_prev_aa=(oth); self[2] = oth; end

def peptide_next_aa; self[3]; end
def peptide_next_aa=(oth); self[3] = oth; end

def protein; self[4]; end
def protein=(oth); self[4] = oth; end

def num_tot_proteins; self[5]; end
def num_tot_proteins=(oth); self[5] = oth; end

def num_matched_ions; self[6]; end
def num_matched_ions=(oth); self[6] = oth; end

def tot_num_ions; self[7]; end
def tot_num_ions=(oth); self[7] = oth; end

def calc_neutral_pep_mass; self[8]; end
def calc_neutral_pep_mass=(oth); self[8] = oth; end

def massdiff; self[9]; end
def massdiff=(oth); self[9] = oth; end

def num_tol_term; self[10]; end
def num_tol_term=(oth); self[10] = oth; end

def num_missed_cleavages; self[11]; end
def num_missed_cleavages=(oth); self[11] = oth; end

def is_rejected; self[12]; end
def is_rejected=(oth); self[12] = oth; end

def deltacnstar; self[13]; end
def deltacnstar=(oth); self[13] = oth; end

def xcorr; self[14]; end
def xcorr=(oth); self[14] = oth; end

def deltacn; self[15]; end
def deltacn=(oth); self[15] = oth; end

def spscore; self[16]; end
def spscore=(oth); self[16] = oth; end

def sprank; self[17]; end
def sprank=(oth); self[17] = oth; end
1453
+
1454
# Number of slots a hit occupies (one per named field).
@@arr_size = ind_keys.size
# Register the writer names ("field=") under the same slot as their reader.
ind_keys.each do |name, index|
  ind_keys_w_eq["#{name}=".to_sym] = index
end
ind_keys.merge!(ind_keys_w_eq)
# Make every name resolvable both as a Symbol and as a String.
ind_keys.each do |name, index|
  @@ind[name] = index
  @@ind["#{name}"] = index
end
1458
+
1459
+ # These are all search_score elements:
1460
+
1461
+ # 1 if there is no second ranked hit, 0 otherwise
847
1462
 
848
- # Returns prev, peptide, next from sequence. Parse errors return
849
- # nil,nil,nil
850
- # R.PEPTIDE.A # -> R, PEPTIDE, A
851
- # R.PEPTIDE.- # -> R, PEPTIDE, -
852
- # PEPTIDE.A # -> -, PEPTIDE, A
853
- # A.PEPTIDE # -> A, PEPTIDE, -
854
- # PEPTIDE # -> nil,nil,nil
855
- def self.split_sequence(val)
856
- peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
857
- pieces = val.split(".")
858
- case pieces.size
859
- when 3
860
- peptide_prev_aa, peptide, peptide_next_aa = *pieces
861
- when 2
862
- if pieces[0].size > 1 ## N termini
863
- peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
864
- else ## C termini
865
- peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
866
- end
867
- when 1 ## this must be a parse error!
868
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
869
- when 0
870
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
871
- end
872
- return peptide_prev_aa, peptide, peptide_next_aa
873
- end
1463
# Creates a hit with @@arr_size slots and fills the first 18 from hash.
#
# hash:: field values keyed by Symbol (:hit_rank, :peptide, ... :sprank).
#        Fields that are absent are left nil.  When hash is omitted (or
#        nil) every field is nil; previously that case raised
#        NoMethodError because nil was indexed directly.
def initialize(hash=nil)
  super(@@arr_size)
  hash ||= {}
  # listed in slot order: index in this list == index in the array
  fields = [
    :hit_rank, :peptide, :peptide_prev_aa, :peptide_next_aa, :protein,
    :num_tot_proteins, :num_matched_ions, :tot_num_ions,
    :calc_neutral_pep_mass, :massdiff, :num_tol_term,
    :num_missed_cleavages, :is_rejected, :deltacnstar, :xcorr, :deltacn,
    :spscore, :sprank
  ]
  self[0,18] = fields.map {|field| hash[field] }
  self
end
1469
+
1470
# Cleans val with remove_non_amino_acids and then hands the result to
# split_sequence, returning whatever split_sequence returns.
def self.prepare_sequence(val)
  split_sequence(remove_non_amino_acids(val))
end
874
1475
 
875
- def inspect
876
- "#<SearchHit #{@@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")}>"
1476
# Returns prev, peptide, next from sequence.  Parse errors return
# nil,nil,nil
#   R.PEPTIDE.A  # -> R, PEPTIDE, A
#   R.PEPTIDE.-  # -> R, PEPTIDE, -
#   PEPTIDE.A    # -> -, PEPTIDE, A
#   A.PEPTIDE    # -> A, PEPTIDE, -
#   PEPTIDE      # -> nil,nil,nil
#   A.PEP.TIDE.R # -> nil,nil,nil  (too many pieces is a parse error too;
#                                   this used to yield three empty strings)
#
# val:: String with the flanking residues separated from the peptide by '.'
def self.split_sequence(val)
  pieces = val.split('.')
  case pieces.size
  when 3
    [pieces[0], pieces[1], pieces[2]]
  when 2
    # With one flank missing, a first piece longer than one residue must be
    # the peptide itself (so the missing flank is the previous residue).
    if pieces[0].size > 1 ## N termini
      ['-', pieces[0], pieces[1]]
    else ## C termini
      [pieces[0], pieces[1], '-']
    end
  else ## 0, 1 or more than 3 pieces: this must be a parse error!
    [nil, nil, nil]
  end
end
1502
+
1503
# Strips every character matched by Non_standard_amino_acid_char_re,
# i.e. keeps only A-Z, '.' and '-'.  Returns a new string.
def self.remove_non_amino_acids(sequence)
  cleaned = sequence.gsub(Non_standard_amino_acid_char_re, '')
  cleaned
end
1508
+
1509
# One-line summary: every name in @@methods paired with its current value,
# as "name:value" entries separated by single spaces.
def inspect
  fields = @@methods.map {|name| "#{name}:#{send(name)}" }
  "#<SearchHit #{fields.join(" ")}>"
end
878
1513
 
879
1514
  # requires Params object and full sequence (with heads and tails)
@@ -924,3 +1559,65 @@ def inspect
924
1559
 
925
1560
  end
926
1561
 
1562
# Positions and masses of modifications
class SpecID::Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  ## Should be something like this:
  # <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
  #   <mod_aminoacid_mass position=" " mass=" "/>
  # </modification_info>
  #
  # e.g.:
  # <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
  #   <mod_aminoacid_mass position="2" mass="545.7160"/>
  #   <mod_aminoacid_mass position="3" mass="147.1926"/>
  # </modification_info>

  # Mass of modified N terminus
  attr_accessor :mod_nterm_mass
  # Mass of modified C terminus
  attr_accessor :mod_cterm_mass
  # Peptide sequence (with indicated modifications) I'm assuming that the
  # native sequest indicators are OK here
  attr_accessor :modified_peptide

  # this should be an array of arrays: [[position, modified_mass], ...]
  # position ranges from 1 to peptide length
  attr_accessor :mod_aminoacid_mass_array

  # hash:: optional attribute values, applied through
  #        instance_var_set_from_hash.  Skipped when omitted, the same way
  #        the AAModification/TerminalModification constructors guard it.
  def initialize(hash=nil)
    instance_var_set_from_hash(hash) if hash
  end

  # Returns the modification_info element with one mod_aminoacid_mass child
  # per entry of mod_aminoacid_mass_array.  Attributes that are not set are
  # left out.  Will escape any xml special chars in modified_peptide.
  def to_pepxml
    ## Collect the modifications:
    mod_strings = (@mod_aminoacid_mass_array || []).map do |ar|
      "position=\"#{ar[0]}\" mass=\"#{ar[1]}\""
    end

    ## Create the attribute string:
    att_parts = []
    att_parts << "mod_nterm_mass=\"#{@mod_nterm_mass}\"" if @mod_nterm_mass
    att_parts << "mod_cterm_mass=\"#{@mod_cterm_mass}\"" if @mod_cterm_mass
    if @modified_peptide
      att_parts << "modified_peptide=\"#{escape_special_chars(@modified_peptide)}\""
    end

    element_xml_and_att_string('modification_info', att_parts.join(" ")) do
      mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
    end
  end

end
1623
+