mspire 0.3.1 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. data/Rakefile +2 -2
  2. data/bin/bioworks_to_pepxml.rb +15 -3
  3. data/bin/ms_to_lmat.rb +2 -1
  4. data/bin/sqt_group.rb +26 -0
  5. data/changelog.txt +36 -0
  6. data/lib/ms/msrun.rb +3 -1
  7. data/lib/ms/parser/mzdata/dom.rb +14 -14
  8. data/lib/ms/scan.rb +3 -3
  9. data/lib/mspire.rb +1 -1
  10. data/lib/sample_enzyme.rb +39 -0
  11. data/lib/spec_id.rb +18 -0
  12. data/lib/spec_id/aa_freqs.rb +6 -9
  13. data/lib/spec_id/digestor.rb +16 -17
  14. data/lib/spec_id/mass.rb +63 -1
  15. data/lib/spec_id/parser/proph.rb +101 -2
  16. data/lib/spec_id/precision/filter.rb +3 -2
  17. data/lib/spec_id/precision/filter/cmdline.rb +3 -1
  18. data/lib/spec_id/precision/filter/output.rb +1 -0
  19. data/lib/spec_id/precision/prob.rb +88 -21
  20. data/lib/spec_id/precision/prob/cmdline.rb +28 -16
  21. data/lib/spec_id/precision/prob/output.rb +8 -2
  22. data/lib/spec_id/proph/pep_summary.rb +25 -12
  23. data/lib/spec_id/sequest.rb +28 -0
  24. data/lib/spec_id/sequest/pepxml.rb +142 -197
  25. data/lib/spec_id/sqt.rb +349 -0
  26. data/lib/spec_id/srf.rb +33 -23
  27. data/lib/validator.rb +40 -57
  28. data/lib/validator/aa.rb +3 -90
  29. data/lib/validator/aa_est.rb +112 -0
  30. data/lib/validator/cmdline.rb +163 -31
  31. data/lib/validator/decoy.rb +15 -7
  32. data/lib/validator/digestion_based.rb +5 -4
  33. data/lib/validator/q_value.rb +32 -0
  34. data/script/peps_per_bin.rb +67 -0
  35. data/script/sqt_to_meta.rb +24 -0
  36. data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
  37. data/specs/bin/fasta_shaker_spec.rb +2 -2
  38. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
  39. data/specs/bin/filter_and_validate_spec.rb +25 -6
  40. data/specs/bin/ms_to_lmat_spec.rb +2 -2
  41. data/specs/bin/prob_validate_spec.rb +5 -3
  42. data/specs/sample_enzyme_spec.rb +86 -1
  43. data/specs/spec_helper.rb +11 -9
  44. data/specs/spec_id/bioworks_spec.rb +2 -1
  45. data/specs/spec_id/precision/filter_spec.rb +5 -5
  46. data/specs/spec_id/precision/prob_spec.rb +0 -67
  47. data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
  48. data/specs/spec_id/protein_summary_spec.rb +4 -4
  49. data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
  50. data/specs/spec_id/sequest_spec.rb +38 -0
  51. data/specs/spec_id/sqt_spec.rb +111 -3
  52. data/specs/spec_id_spec.rb +2 -0
  53. data/specs/transmem/phobius_spec.rb +3 -1
  54. data/specs/transmem/toppred_spec.rb +1 -1
  55. data/specs/validator/aa_est_spec.rb +66 -0
  56. data/specs/validator/aa_spec.rb +1 -68
  57. data/specs/validator/background_spec.rb +2 -0
  58. data/specs/validator/bias_spec.rb +3 -27
  59. data/specs/validator/decoy_spec.rb +2 -2
  60. data/specs/validator/transmem_spec.rb +2 -1
  61. data/test_files/small.sqt +87 -0
  62. metadata +312 -293
@@ -1,4 +1,4 @@
1
-
1
+ require 'yaml'
2
2
  require 'spec_id/precision/output'
3
3
  require 'table'
4
4
  require 'matrix'
@@ -12,12 +12,18 @@ class SpecID::Precision::Prob::Output
12
12
  # returns array of data arrays and parallel labels
13
13
  def to_cols_and_labels(answer_hash)
14
14
  col_labels = %w(count probability peptide)
15
+ col_labels[1] = 'q_values' if answer_hash.key?(:q_values)
15
16
 
16
17
  cols = []
17
18
  cols << answer_hash[:count]
18
- cols << answer_hash[:probabilities]
19
+ if answer_hash.key?(:q_values)
20
+ cols << answer_hash[:q_values]
21
+ else
22
+ cols << answer_hash[:probabilities]
23
+ end
19
24
  cols << answer_hash[:aaseqs]
20
25
 
26
+
21
27
  # if there is a single modified peptide, we'll include the column
22
28
  if answer_hash.key?(:modified_peptides)
23
29
  cols << answer_hash[:modified_peptides]
@@ -1,7 +1,6 @@
1
1
 
2
2
  require 'array_class'
3
- puts "REQUIRING"
4
- puts( require 'spec_id/sequest/pepxml' )
3
+ require 'spec_id/sequest/pepxml'
5
4
  require 'spec_id/parser/proph'
6
5
 
7
6
  module Sequest ; end
@@ -13,10 +12,12 @@ module SpecID ; end
13
12
  module SpecID::Prot ; end
14
13
  module SpecID::Pep ; end
15
14
 
15
+
16
+
16
17
  module Proph
17
18
 
18
- class PepSummary < Sequest::PepXML::MSMSRunSummary
19
- # MSMSRunSummary is a SpecID object!
19
+ class PepSummary
20
+ include SpecID
20
21
 
21
22
  Filetype_and_version_re_new = /version="PeptideProphet v([\d\.]+) /
22
23
 
@@ -25,7 +26,7 @@ module Proph
25
26
  # the protein groups
26
27
  # currently these are just xml nodes returned!
27
28
  attr_accessor :peptideprophet_summary
28
- attr_accessor :spectrum_queries
29
+ attr_accessor :msms_run_summaries
29
30
  attr_accessor :version
30
31
 
31
32
  def hi_prob_best ; true end
@@ -51,24 +52,26 @@ module Proph
51
52
  end
52
53
 
53
54
  def initialize(file=nil)
54
- @prots = nil
55
55
  if file
56
56
  @version = get_version(file)
57
- #@prot_groups = ProtSummary::Parser.new.parse_file(file)
58
- SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
57
+ spec_id = SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
59
58
  end
60
59
  end
61
60
  end
62
61
 
62
+ # this is a SpecID::Pep (by interface: not including stuff yet)
63
63
  class PepSummary::Pep < Sequest::PepXML::SearchHit
64
- %w(probability fval ntt nmc massd).each do |guy|
64
+
65
+ # aaseq is defined in SearchHit
66
+
67
+ %w(probability fval ntt nmc massd prots).each do |guy|
65
68
  self.add_member(guy)
66
69
  end
67
70
 
68
71
  # returns self
69
- def from_pepxml_node(node, spec_query)
70
- super(node, spec_query)
71
- #pp_n = node.find_first('descendant::peptideprophet_result')
72
+ def from_pepxml_node(node)
73
+ super(node)
74
+
72
75
  an_res = node.find_first('child::analysis_result')
73
76
  pp_n = an_res.find_first('child::peptideprophet_result')
74
77
  self.probability = pp_n['probability'].to_f
@@ -87,6 +90,16 @@ module Proph
87
90
  self
88
91
  end
89
92
  end
93
+
94
+ ::Proph::PepSummary::Prot = ArrayClass.new(%w(name protein_descr peps))
95
+
96
+ class PepSummary::Prot
97
+ def first_entry ; self[0] end ## name
98
+ def reference ; self[0] + ' ' + self[1] end
99
+ end
100
+
90
101
  end
91
102
 
92
103
 
104
+
105
+
@@ -1,5 +1,33 @@
1
1
  require 'spec_id/sequest/params'
2
+ require 'hash_by'
3
+ require 'sort_by_attributes.rb'
2
4
 
3
5
  module Sequest
6
+
7
+ # returns one array of peptide hits: indexes hits based on index_by, takes
8
+ # the uniq ones and then sorts the group by sort_by (compatible with
9
+ # sort_by_attributes) then slices from first_index to last_index
10
+ # (inclusive).
11
+ def self.other_hits(peps, first_index=1, last_index=9, index_by=[:base_name, :first_scan, :charge], sort_by=[:xcorr, {:down => :xcorr}])
12
+ all_hits = []
13
+ peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
14
+ if peps_per_scan.size >= (first_index + 1)
15
+ all_hits.push( *(peps_per_scan.uniq.sort_by_attributes(*sort_by)[first_index..last_index]) )
16
+ end
17
+ end
18
+ all_hits.compact
19
+ end
20
+
21
+ def self.other_hits_sorted_by_xcorr(peps, first_index, last_index, index_by=[:base_name, :first_scan, :charge])
22
+ all_hits = []
23
+ peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
24
+ if peps_per_scan.size >= (first_index + 1)
25
+ all_hits.push( *(peps_per_scan.uniq.sort_by {|x| x.xcorr }.reverse[first_index..last_index]) )
26
+ end
27
+ end
28
+ all_hits.compact
29
+
30
+ end
31
+
4
32
  end
5
33
 
@@ -155,25 +155,6 @@ class Sequest::PepXML::MSMSRunSummary
155
155
  @ms_detector = node['msDetector']
156
156
  @raw_data_type = node['raw_data_type']
157
157
  @raw_data = node['raw_data']
158
-
159
- sample_enzyme_n = node.find_first("child::sample_enzyme")
160
- @sample_enzyme = SampleEnzyme.from_pepxml_node(sample_enzyme_n)
161
-
162
- search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
163
- spectrum_queries = search_summary_n.find("following-sibling::spectrum_query")
164
- @spectrum_queries = spectrum_queries.map do |sq_n|
165
- Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n, self)
166
- end
167
-
168
- ## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
169
- #into a better search summary object (to eventually depracate the params object)
170
- @search_summary = node ## in future call SearchSummary.from_pepxml_node
171
- @peps = []
172
- @spectrum_queries.each do |sq|
173
- sq.search_results.each do |sr|
174
- @peps.push( *(sr.search_hits) )
175
- end
176
- end
177
158
  self
178
159
  end
179
160
  end
@@ -353,7 +334,13 @@ Default_Options = {
353
334
  search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)
354
335
 
355
336
  # create the sample enzyme from the params object:
356
- opts[:sample_enzyme] = params.sample_enzyme
337
+ sample_enzyme_obj =
338
+ if opts[:sample_enzyme]
339
+ opts[:sample_enzyme]
340
+ else
341
+ params.sample_enzyme
342
+ end
343
+ opts[:sample_enzyme] = sample_enzyme_obj
357
344
 
358
345
  ## Create the pepxml obj and top level objects
359
346
  pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
@@ -390,36 +377,20 @@ Default_Options = {
390
377
  out_files = srf.out_files
391
378
  spectrum_queries_arr = Array.new(srf.dta_files.size)
392
379
  files_with_hits_index = 0 ## will end up being 1 indexed
393
- srf.dta_files.each_with_index do |dta_file,i|
394
- next if out_files[i].num_hits == 0
395
- files_with_hits_index += 1
396
-
397
- # We don't need to sort the hits by xcorr since it comes pre-sorted in
398
- # srf files!
399
- #arr = hits.sort_by{|v| v.xcorr }
400
-
401
- # Get proper deltacn and deltacnstar
402
- # under new srf, deltacn is already corrected for what prophet wants,
403
- # deltacn_orig is how to access the old one
404
- # Prophet deltacn is not the same as the native Sequest deltacn
405
- # It is the deltacn of the second best hit!
406
-
407
- hits = out_files[i].hits
408
- top_hit = hits[0]
409
- second_hit = hits[1]
410
- deltacnstar =
411
- if second_hit ; '0'
412
- else ; '1'
413
- end
414
-
415
- ## mass calculations:
416
- precursor_neutral_mass = dta_file.mh - h_plus
417
- calc_neutral_pep_mass = top_hit[0] - h_plus
418
380
 
419
- (start_scan, end_scan, charge) = srf_index[i]
381
+ deltacn_orig = opts[:deltacn_orig]
382
+ deltacn_index =
383
+ if deltacn_orig ; 20
384
+ else 19
385
+ end
420
386
 
387
+ srf.dta_files.each_with_index do |dta_file,dta_i|
388
+ next if out_files[dta_i].num_hits == 0
389
+ files_with_hits_index += 1
421
390
 
391
+ precursor_neutral_mass = dta_file.mh - h_plus
422
392
 
393
+ (start_scan, end_scan, charge) = srf_index[dta_i]
423
394
  sq_hash = {
424
395
  :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
425
396
  :start_scan => start_scan,
@@ -432,39 +403,70 @@ Default_Options = {
432
403
 
433
404
  spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)
434
405
 
435
- sequence = top_hit.sequence
436
-
437
- # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
438
- ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
439
- (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
440
- # 0=mh 1=deltacn 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn
441
-
442
- sh_hash = {
443
- :hit_rank => 1,
444
- :peptide => pepseq,
445
- :peptide_prev_aa => prevaa,
446
- :peptide_next_aa => nextaa,
447
- :protein => top_hit[10].first.reference.split(" ").first,
448
- :num_tot_proteins => top_hit[10].size,
449
- :num_matched_ions => top_hit[7],
450
- :tot_num_ions => top_hit[8],
451
- :calc_neutral_pep_mass => calc_neutral_pep_mass,
452
- :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
453
- :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, sequence),
454
- :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, sequence),
455
- :is_rejected => 0,
456
- # These are search score attributes:
457
- :xcorr => top_hit[3],
458
- :deltacn => top_hit[19],
459
- :deltacnstar => deltacnstar,
460
- :spscore => top_hit[2],
461
- :sprank => top_hit[6],
462
- :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
463
- }
464
- search_hit = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
406
+
407
+ hits = out_files[dta_i].hits
408
+
409
+ search_hits =
410
+ if opts[:all_hits]
411
+ Array.new(out_files[dta_i].num_hits) # all hits
412
+ else
413
+ Array.new(1) # top hit only
414
+ end
415
+
416
+ (0...(search_hits.size)).each do |hit_i|
417
+ hit = hits[hit_i]
418
+ # under the modified deltacn schema (like bioworks)
419
+ # Get proper deltacn and deltacnstar
420
+ # under new srf, deltacn is already corrected for what prophet wants,
421
+ # deltacn_orig_updated is how to access the old one
422
+ # Prophet deltacn is not the same as the native Sequest deltacn
423
+ # It is the deltacn of the second best hit!
424
+
425
+ ## mass calculations:
426
+ calc_neutral_pep_mass = hit[0] - h_plus
427
+
428
+
429
+ sequence = hit.sequence
430
+
431
+ # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
432
+ ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
433
+ (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
434
+ # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
435
+
436
+ sh_hash = {
437
+ :hit_rank => hit_i+1,
438
+ :peptide => pepseq,
439
+ :peptide_prev_aa => prevaa,
440
+ :peptide_next_aa => nextaa,
441
+ :protein => hit[10].first.reference.split(" ").first,
442
+ :num_tot_proteins => hit[10].size,
443
+ :num_matched_ions => hit[7],
444
+ :tot_num_ions => hit[8],
445
+ :calc_neutral_pep_mass => calc_neutral_pep_mass,
446
+ :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
447
+ :num_tol_term => sample_enzyme_obj.num_tol_term(sequence),
448
+ :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
449
+ :is_rejected => 0,
450
+ # These are search score attributes:
451
+ :xcorr => hit[3],
452
+ :deltacn => hit[deltacn_index],
453
+ :spscore => hit[2],
454
+ :sprank => hit[6],
455
+ :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
456
+ }
457
+ unless deltacn_orig
458
+ sh_hash[:deltacnstar] =
459
+ if hits[hit_i+1].nil? # no next hit? then its deltacnstar == 1
460
+ '1'
461
+ else
462
+ '0'
463
+ end
464
+ end
465
+ search_hits[hit_i] = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
466
+ end
465
467
 
466
468
  search_result = Sequest::PepXML::SearchResult.new
467
- search_result.search_hits = [search_hit]
469
+ search_result.search_hits = search_hits
468
470
  spectrum_query.search_results = [search_result]
469
471
  spectrum_queries_arr[files_with_hits_index] = spectrum_query
470
472
  end
@@ -473,56 +475,61 @@ Default_Options = {
473
475
  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
474
476
  pepxml_obj.base_name = pipeline.msms_run_summary.base_name
475
477
  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
476
-
478
+
477
479
  pepxml_obj
478
480
  end
479
481
 
480
482
  # takes an .srg or bioworks.xml file
481
483
  # if possible, ensures that an mzXML file is present for each pepxml file
482
484
  # :print => true, will print files
485
+ # NOTES: num_tol_term and num_missed_cleavages are both calculated from the
486
+ # sample_enzyme. Thus, a No_Enzyme search may still pass in a
487
+ # :sample_enzyme option to get these calculated.
483
488
  def self.set_from_bioworks(bioworks_file, opts={})
484
489
  opts = Default_Options.merge(opts)
485
490
  ## Create the out_path directory if necessary
486
491
 
487
- unless File.exist? opts[:out_path]
488
- FileUtils.mkpath(opts[:out_path])
489
- end
490
- unless File.directory? opts[:out_path]
491
- abort "#{opts[:out_path]} must be a directory!"
492
- end
493
-
494
- spec_id = SpecID.new(bioworks_file)
495
- pepxml_objs =
496
- if spec_id.is_a? Bioworks
497
- abort("must have opts[:params] set!") unless opts[:params]
498
- set_from_bioworks_xml(bioworks_file, opts[:params], opts)
499
- elsif spec_id.is_a? SRFGroup
500
- spec_id.srfs.map do |srf|
501
- new_from_srf(srf, opts)
492
+ unless File.exist? opts[:out_path]
493
+ FileUtils.mkpath(opts[:out_path])
502
494
  end
503
- else
504
- abort "invalid object"
505
- end
506
-
507
- if opts[:print]
508
- pepxml_objs.each do |obj|
509
- obj.to_pepxml(obj.base_name + ".xml")
495
+ unless File.directory? opts[:out_path]
496
+ abort "#{opts[:out_path]} must be a directory!"
497
+ end
498
+
499
+ spec_id = SpecID.new(bioworks_file)
500
+ pepxml_objs =
501
+ if spec_id.is_a? Bioworks
502
+ abort("must have opts[:params] set!") unless opts[:params]
503
+ set_from_bioworks_xml(bioworks_file, opts[:params], opts)
504
+ elsif spec_id.is_a? SRFGroup
505
+ spec_id.srfs.map do |srf|
506
+ new_from_srf(srf, opts)
507
+ end
508
+ else
509
+ abort "invalid object"
510
+ end
511
+
512
+ if opts[:print]
513
+ pepxml_objs.each do |obj|
514
+ obj.to_pepxml(obj.base_name + ".xml")
515
+ end
510
516
  end
517
+ pepxml_objs
511
518
  end
512
- pepxml_objs
513
- end
514
519
 
515
520
 
516
- # Takes bioworks 3.2/3.3 xml output (with no filters)
517
- # Returns a list of PepXML objects
518
- # params = sequest.params file
519
- # bioworks = bioworks.xml exported multi-consensus view file
520
- # pepxml_version = 0 for tpp 1.2.3
521
- # pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
521
+ # Takes bioworks 3.2/3.3 xml output (with no filters)
522
+ # Returns a list of PepXML objects
523
+ # params = sequest.params file
524
+ # bioworks = bioworks.xml exported multi-consensus view file
525
+ # pepxml_version = 0 for tpp 1.2.3
526
+ # pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
522
527
  def self.set_from_bioworks_xml(bioworks, params, opts={})
523
528
  opts = Default_Options.merge(opts)
524
529
  pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)
525
530
 
531
+
532
+
526
533
  unless out_path
527
534
  out_path = '.'
528
535
  end
@@ -545,6 +552,13 @@ Default_Options = {
545
552
  else ; abort "Don't recognize #{bioworks} as object or string!"
546
553
  end
547
554
 
555
+ sample_enzyme_obj =
556
+ if opts[:sample_enzyme]
557
+ opts[:sample_enzyme]
558
+ else
559
+ params.sample_enzyme
560
+ end
561
+
548
562
  #puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
549
563
 
550
564
  ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
@@ -589,7 +603,7 @@ Default_Options = {
589
603
  :ms_detector => ms_detector,
590
604
  :raw_data_type => raw_data_type,
591
605
  :raw_data => raw_data,
592
- :sample_enzyme => params.sample_enzyme,
606
+ :sample_enzyme => sample_enzyme_obj, # usually, params.sample_enzyme,
593
607
  :search_summary => search_summary,
594
608
  })
595
609
  pipeline.msms_run_summary = msms_run_summary
@@ -626,10 +640,11 @@ Default_Options = {
626
640
  end
627
641
 
628
642
 
629
- spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
643
+ spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).map do |key,arr|
630
644
 
631
645
 
632
646
  # Sort_by_rank and take the top hit (to mimic out2summary):
647
+
633
648
  arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
634
649
  top_pep = arr.pop
635
650
  second_hit = arr.last # needed for deltacnstar
@@ -643,7 +658,7 @@ Default_Options = {
643
658
  end
644
659
 
645
660
  calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
646
-
661
+
647
662
  # deltacn & star:
648
663
  # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
649
664
  if second_hit
@@ -685,8 +700,8 @@ Default_Options = {
685
700
  :tot_num_ions => tot_num_ions,
686
701
  :calc_neutral_pep_mass => calc_neutral_pep_mass,
687
702
  :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
688
- :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_pep.sequence),
689
- :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_pep.sequence),
703
+ :num_tol_term => sample_enzyme_obj.num_tol_term(top_pep.sequence),
704
+ :num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
690
705
  :is_rejected => 0,
691
706
  # These are search score attributes:
692
707
  :xcorr => top_pep.xcorr,
@@ -697,7 +712,7 @@ Default_Options = {
697
712
  :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
698
713
  :spectrum_query => spec_query,
699
714
  })
700
- search_result.search_hits = [search_hit] # there can be multiple search hits
715
+ search_result.search_hits = [search_hit] # there can be multiple search hits
701
716
  spec_query.search_results = [search_result] # can be multiple search_results
702
717
  spec_query
703
718
  end
@@ -766,9 +781,8 @@ class Sequest::PepXML::SearchResult
766
781
  attr_accessor :search_hits
767
782
 
768
783
  # if block given, then search_hits set to return value
769
- def initialize
770
- if block_given? ; @search_hits = yield
771
- else ; @search_hits = [] end
784
+ def initialize(search_hits = [])
785
+ @search_hits = search_hits
772
786
  end
773
787
 
774
788
  def to_pepxml
@@ -777,17 +791,6 @@ class Sequest::PepXML::SearchResult
777
791
  end
778
792
  end
779
793
 
780
- def self.from_pepxml_node(node, spec_query)
781
- self.new.from_pepxml_node(node, spec_query)
782
- end
783
-
784
- def from_pepxml_node(node, spec_query, msmsrun_obj)
785
- sh_klass = msmsrun_obj.search_hit_class
786
- @search_hits = node.children.map do |sh_n|
787
- sh_klass.from_pepxml_node(sh_n, spec_query)
788
- end
789
- self
790
- end
791
794
  end
792
795
 
793
796
  class Sequest::PepXML::SearchSummary
@@ -820,7 +823,11 @@ class Sequest::PepXML::SearchSummary
820
823
  def to_pepxml
821
824
  element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
822
825
  search_database.to_pepxml +
823
- short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
826
+ if @params.enzyme =~ /^No_Enzyme/
827
+ ''
828
+ else
829
+ short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini])
830
+ end +
824
831
  @modifications.to_pepxml +
825
832
  Sequest::PepXML::Parameters.new(@params).to_pepxml
826
833
  end
@@ -1216,21 +1223,17 @@ class Sequest::PepXML::SpectrumQuery
1216
1223
  end
1217
1224
  end
1218
1225
 
1219
- def self.from_pepxml_node(node, msmsrun_obj)
1220
- self.new.from_pepxml_node(node, msmsrun_obj)
1226
+ def self.from_pepxml_node(node)
1227
+ self.new.from_pepxml_node(node)
1221
1228
  end
1222
1229
 
1223
- def from_pepxml_node(node, msmsrun_obj)
1230
+ def from_pepxml_node(node)
1224
1231
  self[0] = node['spectrum']
1225
1232
  self[1] = node['start_scan'].to_i
1226
1233
  self[2] = node['end_scan'].to_i
1227
1234
  self[3] = node['precursor_neutral_mass'].to_f
1228
1235
  self[4] = node['index'].to_i
1229
1236
  self[5] = node['assumed_charge'].to_i
1230
- self[6] = node.children.map do |v|
1231
- sh = Sequest::PepXML::SearchResult.new
1232
- sh.from_pepxml_node(v, self, msmsrun_obj)
1233
- end
1234
1237
  self
1235
1238
  end
1236
1239
 
@@ -1299,6 +1302,8 @@ class Sequest::PepXML::SearchHit
1299
1302
 
1300
1303
  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
1301
1304
 
1305
+ def aaseq ; self[1] end
1306
+ def aaseq=(arg) ; self[1] = arg end
1302
1307
 
1303
1308
  # These are all search_score elements:
1304
1309
 
@@ -1321,44 +1326,6 @@ class Sequest::PepXML::SearchHit
1321
1326
  "#<SearchHit #{var}>"
1322
1327
  end
1323
1328
 
1324
- # requires Params object and full sequence (with heads and tails)
1325
- def self.calc_num_missed_cleavages(params, sequence)
1326
- num_missed = 0
1327
- offset, split_after, except_before = params.enzyme_specificity
1328
- first, middle, last = SpecID::Pep.split_sequence(sequence)
1329
- to_regexp = "[#{split_after}]"
1330
- if except_before.size > 0
1331
- to_regexp << "[^#{except_before}]"
1332
- end
1333
- regexp = /#{to_regexp}/
1334
- arr = middle.scan(regexp)
1335
- num = arr.size
1336
- if middle[-1,1] =~ regexp
1337
- # if the regexp is a single letter (exceptions) and the last letter
1338
- # matches, then it will count when it is not a missed cleavage
1339
- # we can correct for this and get the right answer
1340
- num -= 1
1341
- else
1342
- num
1343
- end
1344
- end
1345
-
1346
- # requires Params object and full sequence (with heads and tails)
1347
- def self.calc_num_tol_term(params, sequence)
1348
- num_tol = 0
1349
- offset, split_after, except_before = params.enzyme_specificity
1350
- first, middle, last = SpecID::Pep.split_sequence(sequence)
1351
- last_of_middle = middle[-1,1]
1352
- first_of_middle = middle[0,1]
1353
- if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
1354
- num_tol += 1
1355
- end
1356
- if split_after.include?(last_of_middle) && !except_before.include?(last) || last == '-'
1357
- num_tol += 1
1358
- end
1359
- num_tol
1360
- end
1361
-
1362
1329
  # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
1363
1330
  def self.split_ions(ions)
1364
1331
  ions.split("/").map {|ion| ion.to_i }
@@ -1392,11 +1359,7 @@ class Sequest::PepXML::SearchHit
1392
1359
  end
1393
1360
  end
1394
1361
 
1395
- def self.from_pepxml_node(node, spec_query)
1396
- self.new.from_pepxml_node(node, spec_query)
1397
- end
1398
-
1399
- def from_pepxml_node(node, spec_query)
1362
+ def from_pepxml_node(node)
1400
1363
  self[0] = node['hit_rank'].to_i
1401
1364
  self[1] = node['peptide']
1402
1365
  self[2] = node['peptide_prev_aa']
@@ -1410,24 +1373,6 @@ class Sequest::PepXML::SearchHit
1410
1373
  self[10] = node['num_tol_term'].to_i
1411
1374
  self[11] = node['num_missed_cleavages'].to_i
1412
1375
  self[12] = node['is_rejected'].to_i
1413
- if modinfo_node = node.find_first("child::modification_info")
1414
- self[18] = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
1415
- end
1416
- node.find("child::search_score").each do |ss_n|
1417
- case ss_n['name']
1418
- when 'deltacnstar'
1419
- self[13] = ss_n['value'].to_i
1420
- when 'xcorr'
1421
- self[14] = ss_n['value'].to_f
1422
- when 'deltacn'
1423
- self[15] = ss_n['value'].to_f
1424
- when 'spscore'
1425
- self[16] = ss_n['value'].to_f
1426
- when 'sprank'
1427
- self[17] = ss_n['value'].to_i
1428
- end
1429
- end
1430
- self[19] = spec_query
1431
1376
  self
1432
1377
  end
1433
1378