mspire 0.3.1 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/Rakefile +2 -2
  2. data/bin/bioworks_to_pepxml.rb +15 -3
  3. data/bin/ms_to_lmat.rb +2 -1
  4. data/bin/sqt_group.rb +26 -0
  5. data/changelog.txt +36 -0
  6. data/lib/ms/msrun.rb +3 -1
  7. data/lib/ms/parser/mzdata/dom.rb +14 -14
  8. data/lib/ms/scan.rb +3 -3
  9. data/lib/mspire.rb +1 -1
  10. data/lib/sample_enzyme.rb +39 -0
  11. data/lib/spec_id.rb +18 -0
  12. data/lib/spec_id/aa_freqs.rb +6 -9
  13. data/lib/spec_id/digestor.rb +16 -17
  14. data/lib/spec_id/mass.rb +63 -1
  15. data/lib/spec_id/parser/proph.rb +101 -2
  16. data/lib/spec_id/precision/filter.rb +3 -2
  17. data/lib/spec_id/precision/filter/cmdline.rb +3 -1
  18. data/lib/spec_id/precision/filter/output.rb +1 -0
  19. data/lib/spec_id/precision/prob.rb +88 -21
  20. data/lib/spec_id/precision/prob/cmdline.rb +28 -16
  21. data/lib/spec_id/precision/prob/output.rb +8 -2
  22. data/lib/spec_id/proph/pep_summary.rb +25 -12
  23. data/lib/spec_id/sequest.rb +28 -0
  24. data/lib/spec_id/sequest/pepxml.rb +142 -197
  25. data/lib/spec_id/sqt.rb +349 -0
  26. data/lib/spec_id/srf.rb +33 -23
  27. data/lib/validator.rb +40 -57
  28. data/lib/validator/aa.rb +3 -90
  29. data/lib/validator/aa_est.rb +112 -0
  30. data/lib/validator/cmdline.rb +163 -31
  31. data/lib/validator/decoy.rb +15 -7
  32. data/lib/validator/digestion_based.rb +5 -4
  33. data/lib/validator/q_value.rb +32 -0
  34. data/script/peps_per_bin.rb +67 -0
  35. data/script/sqt_to_meta.rb +24 -0
  36. data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
  37. data/specs/bin/fasta_shaker_spec.rb +2 -2
  38. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
  39. data/specs/bin/filter_and_validate_spec.rb +25 -6
  40. data/specs/bin/ms_to_lmat_spec.rb +2 -2
  41. data/specs/bin/prob_validate_spec.rb +5 -3
  42. data/specs/sample_enzyme_spec.rb +86 -1
  43. data/specs/spec_helper.rb +11 -9
  44. data/specs/spec_id/bioworks_spec.rb +2 -1
  45. data/specs/spec_id/precision/filter_spec.rb +5 -5
  46. data/specs/spec_id/precision/prob_spec.rb +0 -67
  47. data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
  48. data/specs/spec_id/protein_summary_spec.rb +4 -4
  49. data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
  50. data/specs/spec_id/sequest_spec.rb +38 -0
  51. data/specs/spec_id/sqt_spec.rb +111 -3
  52. data/specs/spec_id_spec.rb +2 -0
  53. data/specs/transmem/phobius_spec.rb +3 -1
  54. data/specs/transmem/toppred_spec.rb +1 -1
  55. data/specs/validator/aa_est_spec.rb +66 -0
  56. data/specs/validator/aa_spec.rb +1 -68
  57. data/specs/validator/background_spec.rb +2 -0
  58. data/specs/validator/bias_spec.rb +3 -27
  59. data/specs/validator/decoy_spec.rb +2 -2
  60. data/specs/validator/transmem_spec.rb +2 -1
  61. data/test_files/small.sqt +87 -0
  62. metadata +312 -293
@@ -1,4 +1,4 @@
1
-
1
+ require 'yaml'
2
2
  require 'spec_id/precision/output'
3
3
  require 'table'
4
4
  require 'matrix'
@@ -12,12 +12,18 @@ class SpecID::Precision::Prob::Output
12
12
  # returns array of data arrays and parallel labels
13
13
  def to_cols_and_labels(answer_hash)
14
14
  col_labels = %w(count probability peptide)
15
+ col_labels[1] = 'q_values' if answer_hash.key?(:q_values)
15
16
 
16
17
  cols = []
17
18
  cols << answer_hash[:count]
18
- cols << answer_hash[:probabilities]
19
+ if answer_hash.key?(:q_values)
20
+ cols << answer_hash[:q_values]
21
+ else
22
+ cols << answer_hash[:probabilities]
23
+ end
19
24
  cols << answer_hash[:aaseqs]
20
25
 
26
+
21
27
  # if there is a single modified peptide, we'll include the column
22
28
  if answer_hash.key?(:modified_peptides)
23
29
  cols << answer_hash[:modified_peptides]
@@ -1,7 +1,6 @@
1
1
 
2
2
  require 'array_class'
3
- puts "REQUIRING"
4
- puts( require 'spec_id/sequest/pepxml' )
3
+ require 'spec_id/sequest/pepxml'
5
4
  require 'spec_id/parser/proph'
6
5
 
7
6
  module Sequest ; end
@@ -13,10 +12,12 @@ module SpecID ; end
13
12
  module SpecID::Prot ; end
14
13
  module SpecID::Pep ; end
15
14
 
15
+
16
+
16
17
  module Proph
17
18
 
18
- class PepSummary < Sequest::PepXML::MSMSRunSummary
19
- # MSMSRunSummary is a SpecID object!
19
+ class PepSummary
20
+ include SpecID
20
21
 
21
22
  Filetype_and_version_re_new = /version="PeptideProphet v([\d\.]+) /
22
23
 
@@ -25,7 +26,7 @@ module Proph
25
26
  # the protein groups
26
27
  # currently these are just xml nodes returned!
27
28
  attr_accessor :peptideprophet_summary
28
- attr_accessor :spectrum_queries
29
+ attr_accessor :msms_run_summaries
29
30
  attr_accessor :version
30
31
 
31
32
  def hi_prob_best ; true end
@@ -51,24 +52,26 @@ module Proph
51
52
  end
52
53
 
53
54
  def initialize(file=nil)
54
- @prots = nil
55
55
  if file
56
56
  @version = get_version(file)
57
- #@prot_groups = ProtSummary::Parser.new.parse_file(file)
58
- SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
57
+ spec_id = SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
59
58
  end
60
59
  end
61
60
  end
62
61
 
62
+ # this is a SpecID::Pep (by interface: not including stuff yet)
63
63
  class PepSummary::Pep < Sequest::PepXML::SearchHit
64
- %w(probability fval ntt nmc massd).each do |guy|
64
+
65
+ # aaseq is defined in SearchHit
66
+
67
+ %w(probability fval ntt nmc massd prots).each do |guy|
65
68
  self.add_member(guy)
66
69
  end
67
70
 
68
71
  # returns self
69
- def from_pepxml_node(node, spec_query)
70
- super(node, spec_query)
71
- #pp_n = node.find_first('descendant::peptideprophet_result')
72
+ def from_pepxml_node(node)
73
+ super(node)
74
+
72
75
  an_res = node.find_first('child::analysis_result')
73
76
  pp_n = an_res.find_first('child::peptideprophet_result')
74
77
  self.probability = pp_n['probability'].to_f
@@ -87,6 +90,16 @@ module Proph
87
90
  self
88
91
  end
89
92
  end
93
+
94
+ ::Proph::PepSummary::Prot = ArrayClass.new(%w(name protein_descr peps))
95
+
96
+ class PepSummary::Prot
97
+ def first_entry ; self[0] end ## name
98
+ def reference ; self[0] + ' ' + self[1] end
99
+ end
100
+
90
101
  end
91
102
 
92
103
 
104
+
105
+
@@ -1,5 +1,33 @@
1
1
  require 'spec_id/sequest/params'
2
+ require 'hash_by'
3
+ require 'sort_by_attributes.rb'
2
4
 
3
5
  module Sequest
6
+
7
+ # returns one array of peptide hits: indexes hits based on index_by, takes
8
+ # the uniq ones and then sorts the group by sort_by (compatible with
9
+ # sort_by_attributes) then slices from first_index to last_index
10
+ # (inclusive).
11
+ def self.other_hits(peps, first_index=1, last_index=9, index_by=[:base_name, :first_scan, :charge], sort_by=[:xcorr, {:down => :xcorr}])
12
+ all_hits = []
13
+ peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
14
+ if peps_per_scan.size >= (first_index + 1)
15
+ all_hits.push( *(peps_per_scan.uniq.sort_by_attributes(*sort_by)[first_index..last_index]) )
16
+ end
17
+ end
18
+ all_hits.compact
19
+ end
20
+
21
+ def self.other_hits_sorted_by_xcorr(peps, first_index, last_index, index_by=[:base_name, :first_scan, :charge])
22
+ all_hits = []
23
+ peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
24
+ if peps_per_scan.size >= (first_index + 1)
25
+ all_hits.push( *(peps_per_scan.uniq.sort_by {|x| x.xcorr }.reverse[first_index..last_index]) )
26
+ end
27
+ end
28
+ all_hits.compact
29
+
30
+ end
31
+
4
32
  end
5
33
 
@@ -155,25 +155,6 @@ class Sequest::PepXML::MSMSRunSummary
155
155
  @ms_detector = node['msDetector']
156
156
  @raw_data_type = node['raw_data_type']
157
157
  @raw_data = node['raw_data']
158
-
159
- sample_enzyme_n = node.find_first("child::sample_enzyme")
160
- @sample_enzyme = SampleEnzyme.from_pepxml_node(sample_enzyme_n)
161
-
162
- search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
163
- spectrum_queries = search_summary_n.find("following-sibling::spectrum_query")
164
- @spectrum_queries = spectrum_queries.map do |sq_n|
165
- Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n, self)
166
- end
167
-
168
- ## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
169
- #into a better search summary object (to eventually deprecate the params object)
170
- @search_summary = node ## in future call SearchSummary.from_pepxml_node
171
- @peps = []
172
- @spectrum_queries.each do |sq|
173
- sq.search_results.each do |sr|
174
- @peps.push( *(sr.search_hits) )
175
- end
176
- end
177
158
  self
178
159
  end
179
160
  end
@@ -353,7 +334,13 @@ Default_Options = {
353
334
  search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)
354
335
 
355
336
  # use the sample enzyme passed in opts if given; otherwise create it from the params object:
356
- opts[:sample_enzyme] = params.sample_enzyme
337
+ sample_enzyme_obj =
338
+ if opts[:sample_enzyme]
339
+ opts[:sample_enzyme]
340
+ else
341
+ params.sample_enzyme
342
+ end
343
+ opts[:sample_enzyme] = sample_enzyme_obj
357
344
 
358
345
  ## Create the pepxml obj and top level objects
359
346
  pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
@@ -390,36 +377,20 @@ Default_Options = {
390
377
  out_files = srf.out_files
391
378
  spectrum_queries_arr = Array.new(srf.dta_files.size)
392
379
  files_with_hits_index = 0 ## will end up being 1 indexed
393
- srf.dta_files.each_with_index do |dta_file,i|
394
- next if out_files[i].num_hits == 0
395
- files_with_hits_index += 1
396
-
397
- # We don't need to sort the hits by xcorr since it comes pre-sorted in
398
- # srf files!
399
- #arr = hits.sort_by{|v| v.xcorr }
400
-
401
- # Get proper deltacn and deltacnstar
402
- # under new srf, deltacn is already corrected for what prophet wants,
403
- # deltacn_orig is how to access the old one
404
- # Prophet deltacn is not the same as the native Sequest deltacn
405
- # It is the deltacn of the second best hit!
406
-
407
- hits = out_files[i].hits
408
- top_hit = hits[0]
409
- second_hit = hits[1]
410
- deltacnstar =
411
- if second_hit ; '0'
412
- else ; '1'
413
- end
414
-
415
- ## mass calculations:
416
- precursor_neutral_mass = dta_file.mh - h_plus
417
- calc_neutral_pep_mass = top_hit[0] - h_plus
418
380
 
419
- (start_scan, end_scan, charge) = srf_index[i]
381
+ deltacn_orig = opts[:deltacn_orig]
382
+ deltacn_index =
383
+ if deltacn_orig ; 20
384
+ else 19
385
+ end
420
386
 
387
+ srf.dta_files.each_with_index do |dta_file,dta_i|
388
+ next if out_files[dta_i].num_hits == 0
389
+ files_with_hits_index += 1
421
390
 
391
+ precursor_neutral_mass = dta_file.mh - h_plus
422
392
 
393
+ (start_scan, end_scan, charge) = srf_index[dta_i]
423
394
  sq_hash = {
424
395
  :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
425
396
  :start_scan => start_scan,
@@ -432,39 +403,70 @@ Default_Options = {
432
403
 
433
404
  spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)
434
405
 
435
- sequence = top_hit.sequence
436
-
437
- # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
438
- ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
439
- (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
440
- # 0=mh 1=deltacn 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn
441
-
442
- sh_hash = {
443
- :hit_rank => 1,
444
- :peptide => pepseq,
445
- :peptide_prev_aa => prevaa,
446
- :peptide_next_aa => nextaa,
447
- :protein => top_hit[10].first.reference.split(" ").first,
448
- :num_tot_proteins => top_hit[10].size,
449
- :num_matched_ions => top_hit[7],
450
- :tot_num_ions => top_hit[8],
451
- :calc_neutral_pep_mass => calc_neutral_pep_mass,
452
- :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
453
- :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, sequence),
454
- :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, sequence),
455
- :is_rejected => 0,
456
- # These are search score attributes:
457
- :xcorr => top_hit[3],
458
- :deltacn => top_hit[19],
459
- :deltacnstar => deltacnstar,
460
- :spscore => top_hit[2],
461
- :sprank => top_hit[6],
462
- :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
463
- }
464
- search_hit = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
406
+
407
+ hits = out_files[dta_i].hits
408
+
409
+ search_hits =
410
+ if opts[:all_hits]
411
+ Array.new(out_files[dta_i].num_hits) # all hits
412
+ else
413
+ Array.new(1) # top hit only
414
+ end
415
+
416
+ (0...(search_hits.size)).each do |hit_i|
417
+ hit = hits[hit_i]
418
+ # under the modified deltacn schema (like bioworks)
419
+ # Get proper deltacn and deltacnstar
420
+ # under new srf, deltacn is already corrected for what prophet wants,
421
+ # deltacn_orig_updated is how to access the old one
422
+ # Prophet deltacn is not the same as the native Sequest deltacn
423
+ # It is the deltacn of the second best hit!
424
+
425
+ ## mass calculations:
426
+ calc_neutral_pep_mass = hit[0] - h_plus
427
+
428
+
429
+ sequence = hit.sequence
430
+
431
+ # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
432
+ ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
433
+ (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
434
+ # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
435
+
436
+ sh_hash = {
437
+ :hit_rank => hit_i+1,
438
+ :peptide => pepseq,
439
+ :peptide_prev_aa => prevaa,
440
+ :peptide_next_aa => nextaa,
441
+ :protein => hit[10].first.reference.split(" ").first,
442
+ :num_tot_proteins => hit[10].size,
443
+ :num_matched_ions => hit[7],
444
+ :tot_num_ions => hit[8],
445
+ :calc_neutral_pep_mass => calc_neutral_pep_mass,
446
+ :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
447
+ :num_tol_term => sample_enzyme_obj.num_tol_term(sequence),
448
+ :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
449
+ :is_rejected => 0,
450
+ # These are search score attributes:
451
+ :xcorr => hit[3],
452
+ :deltacn => hit[deltacn_index],
453
+ :spscore => hit[2],
454
+ :sprank => hit[6],
455
+ :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
456
+ }
457
+ unless deltacn_orig
458
+ sh_hash[:deltacnstar] =
459
+ if hits[hit_i+1].nil? # no next hit? then its deltacnstar == 1
460
+ '1'
461
+ else
462
+ '0'
463
+ end
464
+ end
465
+ search_hits[hit_i] = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
466
+ end
465
467
 
466
468
  search_result = Sequest::PepXML::SearchResult.new
467
- search_result.search_hits = [search_hit]
469
+ search_result.search_hits = search_hits
468
470
  spectrum_query.search_results = [search_result]
469
471
  spectrum_queries_arr[files_with_hits_index] = spectrum_query
470
472
  end
@@ -473,56 +475,61 @@ Default_Options = {
473
475
  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
474
476
  pepxml_obj.base_name = pipeline.msms_run_summary.base_name
475
477
  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
476
-
478
+
477
479
  pepxml_obj
478
480
  end
479
481
 
480
482
  # takes an .srg or bioworks.xml file
481
483
  # if possible, ensures that an mzXML file is present for each pepxml file
482
484
  # :print => true, will print files
485
+ # NOTES: num_tol_term and num_missed_cleavages are both calculated from the
486
+ # sample_enzyme. Thus, a No_Enzyme search may still pass in a
487
+ # :sample_enzyme option to get these calculated.
483
488
  def self.set_from_bioworks(bioworks_file, opts={})
484
489
  opts = Default_Options.merge(opts)
485
490
  ## Create the out_path directory if necessary
486
491
 
487
- unless File.exist? opts[:out_path]
488
- FileUtils.mkpath(opts[:out_path])
489
- end
490
- unless File.directory? opts[:out_path]
491
- abort "#{opts[:out_path]} must be a directory!"
492
- end
493
-
494
- spec_id = SpecID.new(bioworks_file)
495
- pepxml_objs =
496
- if spec_id.is_a? Bioworks
497
- abort("must have opts[:params] set!") unless opts[:params]
498
- set_from_bioworks_xml(bioworks_file, opts[:params], opts)
499
- elsif spec_id.is_a? SRFGroup
500
- spec_id.srfs.map do |srf|
501
- new_from_srf(srf, opts)
492
+ unless File.exist? opts[:out_path]
493
+ FileUtils.mkpath(opts[:out_path])
502
494
  end
503
- else
504
- abort "invalid object"
505
- end
506
-
507
- if opts[:print]
508
- pepxml_objs.each do |obj|
509
- obj.to_pepxml(obj.base_name + ".xml")
495
+ unless File.directory? opts[:out_path]
496
+ abort "#{opts[:out_path]} must be a directory!"
497
+ end
498
+
499
+ spec_id = SpecID.new(bioworks_file)
500
+ pepxml_objs =
501
+ if spec_id.is_a? Bioworks
502
+ abort("must have opts[:params] set!") unless opts[:params]
503
+ set_from_bioworks_xml(bioworks_file, opts[:params], opts)
504
+ elsif spec_id.is_a? SRFGroup
505
+ spec_id.srfs.map do |srf|
506
+ new_from_srf(srf, opts)
507
+ end
508
+ else
509
+ abort "invalid object"
510
+ end
511
+
512
+ if opts[:print]
513
+ pepxml_objs.each do |obj|
514
+ obj.to_pepxml(obj.base_name + ".xml")
515
+ end
510
516
  end
517
+ pepxml_objs
511
518
  end
512
- pepxml_objs
513
- end
514
519
 
515
520
 
516
- # Takes bioworks 3.2/3.3 xml output (with no filters)
517
- # Returns a list of PepXML objects
518
- # params = sequest.params file
519
- # bioworks = bioworks.xml exported multi-consensus view file
520
- # pepxml_version = 0 for tpp 1.2.3
521
- # pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
521
+ # Takes bioworks 3.2/3.3 xml output (with no filters)
522
+ # Returns a list of PepXML objects
523
+ # params = sequest.params file
524
+ # bioworks = bioworks.xml exported multi-consensus view file
525
+ # pepxml_version = 0 for tpp 1.2.3
526
+ # pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
522
527
  def self.set_from_bioworks_xml(bioworks, params, opts={})
523
528
  opts = Default_Options.merge(opts)
524
529
  pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)
525
530
 
531
+
532
+
526
533
  unless out_path
527
534
  out_path = '.'
528
535
  end
@@ -545,6 +552,13 @@ Default_Options = {
545
552
  else ; abort "Don't recognize #{bioworks} as object or string!"
546
553
  end
547
554
 
555
+ sample_enzyme_obj =
556
+ if opts[:sample_enzyme]
557
+ opts[:sample_enzyme]
558
+ else
559
+ params.sample_enzyme
560
+ end
561
+
548
562
  #puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
549
563
 
550
564
  ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
@@ -589,7 +603,7 @@ Default_Options = {
589
603
  :ms_detector => ms_detector,
590
604
  :raw_data_type => raw_data_type,
591
605
  :raw_data => raw_data,
592
- :sample_enzyme => params.sample_enzyme,
606
+ :sample_enzyme => sample_enzyme_obj, # usually, params.sample_enzyme,
593
607
  :search_summary => search_summary,
594
608
  })
595
609
  pipeline.msms_run_summary = msms_run_summary
@@ -626,10 +640,11 @@ Default_Options = {
626
640
  end
627
641
 
628
642
 
629
- spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
643
+ spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).map do |key,arr|
630
644
 
631
645
 
632
646
  # Sort_by_rank and take the top hit (to mimic out2summary):
647
+
633
648
  arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
634
649
  top_pep = arr.pop
635
650
  second_hit = arr.last # needed for deltacnstar
@@ -643,7 +658,7 @@ Default_Options = {
643
658
  end
644
659
 
645
660
  calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
646
-
661
+
647
662
  # deltacn & star:
648
663
  # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
649
664
  if second_hit
@@ -685,8 +700,8 @@ Default_Options = {
685
700
  :tot_num_ions => tot_num_ions,
686
701
  :calc_neutral_pep_mass => calc_neutral_pep_mass,
687
702
  :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
688
- :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_pep.sequence),
689
- :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_pep.sequence),
703
+ :num_tol_term => sample_enzyme_obj.num_tol_term(top_pep.sequence),
704
+ :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
690
705
  :is_rejected => 0,
691
706
  # These are search score attributes:
692
707
  :xcorr => top_pep.xcorr,
@@ -697,7 +712,7 @@ Default_Options = {
697
712
  :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
698
713
  :spectrum_query => spec_query,
699
714
  })
700
- search_result.search_hits = [search_hit] # there can be multiple search hits
715
+ search_result.search_hits = [search_hit] # there can be multiple search hits
701
716
  spec_query.search_results = [search_result] # can be multiple search_results
702
717
  spec_query
703
718
  end
@@ -766,9 +781,8 @@ class Sequest::PepXML::SearchResult
766
781
  attr_accessor :search_hits
767
782
 
768
783
  # search_hits is set from the optional argument (defaults to an empty array)
769
- def initialize
770
- if block_given? ; @search_hits = yield
771
- else ; @search_hits = [] end
784
+ def initialize(search_hits = [])
785
+ @search_hits = search_hits
772
786
  end
773
787
 
774
788
  def to_pepxml
@@ -777,17 +791,6 @@ class Sequest::PepXML::SearchResult
777
791
  end
778
792
  end
779
793
 
780
- def self.from_pepxml_node(node, spec_query)
781
- self.new.from_pepxml_node(node, spec_query)
782
- end
783
-
784
- def from_pepxml_node(node, spec_query, msmsrun_obj)
785
- sh_klass = msmsrun_obj.search_hit_class
786
- @search_hits = node.children.map do |sh_n|
787
- sh_klass.from_pepxml_node(sh_n, spec_query)
788
- end
789
- self
790
- end
791
794
  end
792
795
 
793
796
  class Sequest::PepXML::SearchSummary
@@ -820,7 +823,11 @@ class Sequest::PepXML::SearchSummary
820
823
  def to_pepxml
821
824
  element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
822
825
  search_database.to_pepxml +
823
- short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
826
+ if @params.enzyme =~ /^No_Enzyme/
827
+ ''
828
+ else
829
+ short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini])
830
+ end +
824
831
  @modifications.to_pepxml +
825
832
  Sequest::PepXML::Parameters.new(@params).to_pepxml
826
833
  end
@@ -1216,21 +1223,17 @@ class Sequest::PepXML::SpectrumQuery
1216
1223
  end
1217
1224
  end
1218
1225
 
1219
- def self.from_pepxml_node(node, msmsrun_obj)
1220
- self.new.from_pepxml_node(node, msmsrun_obj)
1226
+ def self.from_pepxml_node(node)
1227
+ self.new.from_pepxml_node(node)
1221
1228
  end
1222
1229
 
1223
- def from_pepxml_node(node, msmsrun_obj)
1230
+ def from_pepxml_node(node)
1224
1231
  self[0] = node['spectrum']
1225
1232
  self[1] = node['start_scan'].to_i
1226
1233
  self[2] = node['end_scan'].to_i
1227
1234
  self[3] = node['precursor_neutral_mass'].to_f
1228
1235
  self[4] = node['index'].to_i
1229
1236
  self[5] = node['assumed_charge'].to_i
1230
- self[6] = node.children.map do |v|
1231
- sh = Sequest::PepXML::SearchResult.new
1232
- sh.from_pepxml_node(v, self, msmsrun_obj)
1233
- end
1234
1237
  self
1235
1238
  end
1236
1239
 
@@ -1299,6 +1302,8 @@ class Sequest::PepXML::SearchHit
1299
1302
 
1300
1303
  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
1301
1304
 
1305
+ def aaseq ; self[1] end
1306
+ def aaseq=(arg) ; self[1] = arg end
1302
1307
 
1303
1308
  # These are all search_score elements:
1304
1309
 
@@ -1321,44 +1326,6 @@ class Sequest::PepXML::SearchHit
1321
1326
  "#<SearchHit #{var}>"
1322
1327
  end
1323
1328
 
1324
- # requires Params object and full sequence (with heads and tails)
1325
- def self.calc_num_missed_cleavages(params, sequence)
1326
- num_missed = 0
1327
- offset, split_after, except_before = params.enzyme_specificity
1328
- first, middle, last = SpecID::Pep.split_sequence(sequence)
1329
- to_regexp = "[#{split_after}]"
1330
- if except_before.size > 0
1331
- to_regexp << "[^#{except_before}]"
1332
- end
1333
- regexp = /#{to_regexp}/
1334
- arr = middle.scan(regexp)
1335
- num = arr.size
1336
- if middle[-1,1] =~ regexp
1337
- # if the regexp is a single letter (exceptions) and the last letter
1338
- # matches, then it will count when it is not a missed cleavage
1339
- # we can correct for this and get the right answer
1340
- num -= 1
1341
- else
1342
- num
1343
- end
1344
- end
1345
-
1346
- # requires Params object and full sequence (with heads and tails)
1347
- def self.calc_num_tol_term(params, sequence)
1348
- num_tol = 0
1349
- offset, split_after, except_before = params.enzyme_specificity
1350
- first, middle, last = SpecID::Pep.split_sequence(sequence)
1351
- last_of_middle = middle[-1,1]
1352
- first_of_middle = middle[0,1]
1353
- if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
1354
- num_tol += 1
1355
- end
1356
- if split_after.include?(last_of_middle) && !except_before.include?(last) || last == '-'
1357
- num_tol += 1
1358
- end
1359
- num_tol
1360
- end
1361
-
1362
1329
  # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
1363
1330
  def self.split_ions(ions)
1364
1331
  ions.split("/").map {|ion| ion.to_i }
@@ -1392,11 +1359,7 @@ class Sequest::PepXML::SearchHit
1392
1359
  end
1393
1360
  end
1394
1361
 
1395
- def self.from_pepxml_node(node, spec_query)
1396
- self.new.from_pepxml_node(node, spec_query)
1397
- end
1398
-
1399
- def from_pepxml_node(node, spec_query)
1362
+ def from_pepxml_node(node)
1400
1363
  self[0] = node['hit_rank'].to_i
1401
1364
  self[1] = node['peptide']
1402
1365
  self[2] = node['peptide_prev_aa']
@@ -1410,24 +1373,6 @@ class Sequest::PepXML::SearchHit
1410
1373
  self[10] = node['num_tol_term'].to_i
1411
1374
  self[11] = node['num_missed_cleavages'].to_i
1412
1375
  self[12] = node['is_rejected'].to_i
1413
- if modinfo_node = node.find_first("child::modification_info")
1414
- self[18] = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
1415
- end
1416
- node.find("child::search_score").each do |ss_n|
1417
- case ss_n['name']
1418
- when 'deltacnstar'
1419
- self[13] = ss_n['value'].to_i
1420
- when 'xcorr'
1421
- self[14] = ss_n['value'].to_f
1422
- when 'deltacn'
1423
- self[15] = ss_n['value'].to_f
1424
- when 'spscore'
1425
- self[16] = ss_n['value'].to_f
1426
- when 'sprank'
1427
- self[17] = ss_n['value'].to_i
1428
- end
1429
- end
1430
- self[19] = spec_query
1431
1376
  self
1432
1377
  end
1433
1378