mspire 0.3.1 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require 'yaml'
|
2
2
|
require 'spec_id/precision/output'
|
3
3
|
require 'table'
|
4
4
|
require 'matrix'
|
@@ -12,12 +12,18 @@ class SpecID::Precision::Prob::Output
|
|
12
12
|
# returns array of data arrays and parallel labels
|
13
13
|
def to_cols_and_labels(answer_hash)
|
14
14
|
col_labels = %w(count probability peptide)
|
15
|
+
col_labels[1] = 'q_values' if answer_hash.key?(:q_values)
|
15
16
|
|
16
17
|
cols = []
|
17
18
|
cols << answer_hash[:count]
|
18
|
-
|
19
|
+
if answer_hash.key?(:q_values)
|
20
|
+
cols << answer_hash[:q_values]
|
21
|
+
else
|
22
|
+
cols << answer_hash[:probabilities]
|
23
|
+
end
|
19
24
|
cols << answer_hash[:aaseqs]
|
20
25
|
|
26
|
+
|
21
27
|
# if there is a single modified peptide, we'll include the column
|
22
28
|
if answer_hash.key?(:modified_peptides)
|
23
29
|
cols << answer_hash[:modified_peptides]
|
@@ -1,7 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'array_class'
|
3
|
-
|
4
|
-
puts( require 'spec_id/sequest/pepxml' )
|
3
|
+
require 'spec_id/sequest/pepxml'
|
5
4
|
require 'spec_id/parser/proph'
|
6
5
|
|
7
6
|
module Sequest ; end
|
@@ -13,10 +12,12 @@ module SpecID ; end
|
|
13
12
|
module SpecID::Prot ; end
|
14
13
|
module SpecID::Pep ; end
|
15
14
|
|
15
|
+
|
16
|
+
|
16
17
|
module Proph
|
17
18
|
|
18
|
-
class PepSummary
|
19
|
-
|
19
|
+
class PepSummary
|
20
|
+
include SpecID
|
20
21
|
|
21
22
|
Filetype_and_version_re_new = /version="PeptideProphet v([\d\.]+) /
|
22
23
|
|
@@ -25,7 +26,7 @@ module Proph
|
|
25
26
|
# the protein groups
|
26
27
|
# currently these are just xml nodes returned!
|
27
28
|
attr_accessor :peptideprophet_summary
|
28
|
-
attr_accessor :
|
29
|
+
attr_accessor :msms_run_summaries
|
29
30
|
attr_accessor :version
|
30
31
|
|
31
32
|
def hi_prob_best ; true end
|
@@ -51,24 +52,26 @@ module Proph
|
|
51
52
|
end
|
52
53
|
|
53
54
|
def initialize(file=nil)
|
54
|
-
@prots = nil
|
55
55
|
if file
|
56
56
|
@version = get_version(file)
|
57
|
-
|
58
|
-
SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
|
57
|
+
spec_id = SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
|
59
58
|
end
|
60
59
|
end
|
61
60
|
end
|
62
61
|
|
62
|
+
# this is a SpecID::Pep (by interface: not including stuff yet)
|
63
63
|
class PepSummary::Pep < Sequest::PepXML::SearchHit
|
64
|
-
|
64
|
+
|
65
|
+
# aaseq is defined in SearchHit
|
66
|
+
|
67
|
+
%w(probability fval ntt nmc massd prots).each do |guy|
|
65
68
|
self.add_member(guy)
|
66
69
|
end
|
67
70
|
|
68
71
|
# returns self
|
69
|
-
def from_pepxml_node(node
|
70
|
-
super(node
|
71
|
-
|
72
|
+
def from_pepxml_node(node)
|
73
|
+
super(node)
|
74
|
+
|
72
75
|
an_res = node.find_first('child::analysis_result')
|
73
76
|
pp_n = an_res.find_first('child::peptideprophet_result')
|
74
77
|
self.probability = pp_n['probability'].to_f
|
@@ -87,6 +90,16 @@ module Proph
|
|
87
90
|
self
|
88
91
|
end
|
89
92
|
end
|
93
|
+
|
94
|
+
::Proph::PepSummary::Prot = ArrayClass.new(%w(name protein_descr peps))
|
95
|
+
|
96
|
+
class PepSummary::Prot
|
97
|
+
def first_entry ; self[0] end ## name
|
98
|
+
def reference ; self[0] + ' ' + self[1] end
|
99
|
+
end
|
100
|
+
|
90
101
|
end
|
91
102
|
|
92
103
|
|
104
|
+
|
105
|
+
|
data/lib/spec_id/sequest.rb
CHANGED
@@ -1,5 +1,33 @@
|
|
1
1
|
require 'spec_id/sequest/params'
|
2
|
+
require 'hash_by'
|
3
|
+
require 'sort_by_attributes.rb'
|
2
4
|
|
3
5
|
module Sequest
|
6
|
+
|
7
|
+
# returns one array of peptide hits: indexes hits based on index_by, takes
|
8
|
+
# the uniq ones and then sorts the group by sort_by (compatible with
|
9
|
+
# sort_by_attributes) then slices from first_index to last_index
|
10
|
+
# (inclusive).
|
11
|
+
def self.other_hits(peps, first_index=1, last_index=9, index_by=[:base_name, :first_scan, :charge], sort_by=[:xcorr, {:down => :xcorr}])
|
12
|
+
all_hits = []
|
13
|
+
peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
|
14
|
+
if peps_per_scan.size >= (first_index + 1)
|
15
|
+
all_hits.push( *(peps_per_scan.uniq.sort_by_attributes(*sort_by)[first_index..last_index]) )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
all_hits.compact
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.other_hits_sorted_by_xcorr(peps, first_index, last_index, index_by=[:base_name, :first_scan, :charge])
|
22
|
+
all_hits = []
|
23
|
+
peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
|
24
|
+
if peps_per_scan.size >= (first_index + 1)
|
25
|
+
all_hits.push( *(peps_per_scan.uniq.sort_by {|x| x.xcorr }.reverse[first_index..last_index]) )
|
26
|
+
end
|
27
|
+
end
|
28
|
+
all_hits.compact
|
29
|
+
|
30
|
+
end
|
31
|
+
|
4
32
|
end
|
5
33
|
|
@@ -155,25 +155,6 @@ class Sequest::PepXML::MSMSRunSummary
|
|
155
155
|
@ms_detector = node['msDetector']
|
156
156
|
@raw_data_type = node['raw_data_type']
|
157
157
|
@raw_data = node['raw_data']
|
158
|
-
|
159
|
-
sample_enzyme_n = node.find_first("child::sample_enzyme")
|
160
|
-
@sample_enzyme = SampleEnzyme.from_pepxml_node(sample_enzyme_n)
|
161
|
-
|
162
|
-
search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
|
163
|
-
spectrum_queries = search_summary_n.find("following-sibling::spectrum_query")
|
164
|
-
@spectrum_queries = spectrum_queries.map do |sq_n|
|
165
|
-
Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n, self)
|
166
|
-
end
|
167
|
-
|
168
|
-
## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
|
169
|
-
#into a better search summary object (to eventually depracate the params object)
|
170
|
-
@search_summary = node ## in future call SearchSummary.from_pepxml_node
|
171
|
-
@peps = []
|
172
|
-
@spectrum_queries.each do |sq|
|
173
|
-
sq.search_results.each do |sr|
|
174
|
-
@peps.push( *(sr.search_hits) )
|
175
|
-
end
|
176
|
-
end
|
177
158
|
self
|
178
159
|
end
|
179
160
|
end
|
@@ -353,7 +334,13 @@ Default_Options = {
|
|
353
334
|
search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)
|
354
335
|
|
355
336
|
# create the sample enzyme from the params object:
|
356
|
-
|
337
|
+
sample_enzyme_obj =
|
338
|
+
if opts[:sample_enzyme]
|
339
|
+
opts[:sample_enzyme]
|
340
|
+
else
|
341
|
+
params.sample_enzyme
|
342
|
+
end
|
343
|
+
opts[:sample_enzyme] = sample_enzyme_obj
|
357
344
|
|
358
345
|
## Create the pepxml obj and top level objects
|
359
346
|
pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
|
@@ -390,36 +377,20 @@ Default_Options = {
|
|
390
377
|
out_files = srf.out_files
|
391
378
|
spectrum_queries_arr = Array.new(srf.dta_files.size)
|
392
379
|
files_with_hits_index = 0 ## will end up being 1 indexed
|
393
|
-
srf.dta_files.each_with_index do |dta_file,i|
|
394
|
-
next if out_files[i].num_hits == 0
|
395
|
-
files_with_hits_index += 1
|
396
|
-
|
397
|
-
# We don't need to sort the hits by xcorr since it comes pre-sorted in
|
398
|
-
# srf files!
|
399
|
-
#arr = hits.sort_by{|v| v.xcorr }
|
400
|
-
|
401
|
-
# Get proper deltacn and deltacnstar
|
402
|
-
# under new srf, deltacn is already corrected for what prophet wants,
|
403
|
-
# deltacn_orig is how to access the old one
|
404
|
-
# Prophet deltacn is not the same as the native Sequest deltacn
|
405
|
-
# It is the deltacn of the second best hit!
|
406
|
-
|
407
|
-
hits = out_files[i].hits
|
408
|
-
top_hit = hits[0]
|
409
|
-
second_hit = hits[1]
|
410
|
-
deltacnstar =
|
411
|
-
if second_hit ; '0'
|
412
|
-
else ; '1'
|
413
|
-
end
|
414
|
-
|
415
|
-
## mass calculations:
|
416
|
-
precursor_neutral_mass = dta_file.mh - h_plus
|
417
|
-
calc_neutral_pep_mass = top_hit[0] - h_plus
|
418
380
|
|
419
|
-
|
381
|
+
deltacn_orig = opts[:deltacn_orig]
|
382
|
+
deltacn_index =
|
383
|
+
if deltacn_orig ; 20
|
384
|
+
else 19
|
385
|
+
end
|
420
386
|
|
387
|
+
srf.dta_files.each_with_index do |dta_file,dta_i|
|
388
|
+
next if out_files[dta_i].num_hits == 0
|
389
|
+
files_with_hits_index += 1
|
421
390
|
|
391
|
+
precursor_neutral_mass = dta_file.mh - h_plus
|
422
392
|
|
393
|
+
(start_scan, end_scan, charge) = srf_index[dta_i]
|
423
394
|
sq_hash = {
|
424
395
|
:spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
|
425
396
|
:start_scan => start_scan,
|
@@ -432,39 +403,70 @@ Default_Options = {
|
|
432
403
|
|
433
404
|
spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)
|
434
405
|
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
406
|
+
|
407
|
+
hits = out_files[dta_i].hits
|
408
|
+
|
409
|
+
search_hits =
|
410
|
+
if opts[:all_hits]
|
411
|
+
Array.new(out_files[dta_i].num_hits) # all hits
|
412
|
+
else
|
413
|
+
Array.new(1) # top hit only
|
414
|
+
end
|
415
|
+
|
416
|
+
(0...(search_hits.size)).each do |hit_i|
|
417
|
+
hit = hits[hit_i]
|
418
|
+
# under the modified deltacn schema (like bioworks)
|
419
|
+
# Get proper deltacn and deltacnstar
|
420
|
+
# under new srf, deltacn is already corrected for what prophet wants,
|
421
|
+
# deltacn_orig_updated is how to access the old one
|
422
|
+
# Prophet deltacn is not the same as the native Sequest deltacn
|
423
|
+
# It is the deltacn of the second best hit!
|
424
|
+
|
425
|
+
## mass calculations:
|
426
|
+
calc_neutral_pep_mass = hit[0] - h_plus
|
427
|
+
|
428
|
+
|
429
|
+
sequence = hit.sequence
|
430
|
+
|
431
|
+
# NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
|
432
|
+
## THIS IS ALL INNER LOOP, so we make every effort at speed here:
|
433
|
+
(prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
|
434
|
+
# 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
|
435
|
+
|
436
|
+
sh_hash = {
|
437
|
+
:hit_rank => hit_i+1,
|
438
|
+
:peptide => pepseq,
|
439
|
+
:peptide_prev_aa => prevaa,
|
440
|
+
:peptide_next_aa => nextaa,
|
441
|
+
:protein => hit[10].first.reference.split(" ").first,
|
442
|
+
:num_tot_proteins => hit[10].size,
|
443
|
+
:num_matched_ions => hit[7],
|
444
|
+
:tot_num_ions => hit[8],
|
445
|
+
:calc_neutral_pep_mass => calc_neutral_pep_mass,
|
446
|
+
:massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
|
447
|
+
:num_tol_term => sample_enzyme_obj.num_tol_term(sequence),
|
448
|
+
:num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
|
449
|
+
:is_rejected => 0,
|
450
|
+
# These are search score attributes:
|
451
|
+
:xcorr => hit[3],
|
452
|
+
:deltacn => hit[deltacn_index],
|
453
|
+
:spscore => hit[2],
|
454
|
+
:sprank => hit[6],
|
455
|
+
:modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
|
456
|
+
}
|
457
|
+
unless deltacn_orig
|
458
|
+
sh_hash[:deltacnstar] =
|
459
|
+
if hits[hit_i+1].nil? # no next hit? then its deltacnstar == 1
|
460
|
+
'1'
|
461
|
+
else
|
462
|
+
'0'
|
463
|
+
end
|
464
|
+
end
|
465
|
+
search_hits[hit_i] = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
|
466
|
+
end
|
465
467
|
|
466
468
|
search_result = Sequest::PepXML::SearchResult.new
|
467
|
-
search_result.search_hits =
|
469
|
+
search_result.search_hits = search_hits
|
468
470
|
spectrum_query.search_results = [search_result]
|
469
471
|
spectrum_queries_arr[files_with_hits_index] = spectrum_query
|
470
472
|
end
|
@@ -473,56 +475,61 @@ Default_Options = {
|
|
473
475
|
pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
|
474
476
|
pepxml_obj.base_name = pipeline.msms_run_summary.base_name
|
475
477
|
pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
|
476
|
-
|
478
|
+
|
477
479
|
pepxml_obj
|
478
480
|
end
|
479
481
|
|
480
482
|
# takes an .srg or bioworks.xml file
|
481
483
|
# if possible, ensures that an mzXML file is present for each pepxml file
|
482
484
|
# :print => true, will print files
|
485
|
+
# NOTES: num_tol_term and num_missing_cleavages are both calculated from the
|
486
|
+
# sample_enzyme. Thus, a No_Enzyme search may still pass in a
|
487
|
+
# :sample_enzyme option to get these calculated.
|
483
488
|
def self.set_from_bioworks(bioworks_file, opts={})
|
484
489
|
opts = Default_Options.merge(opts)
|
485
490
|
## Create the out_path directory if necessary
|
486
491
|
|
487
|
-
|
488
|
-
|
489
|
-
end
|
490
|
-
unless File.directory? opts[:out_path]
|
491
|
-
abort "#{opts[:out_path]} must be a directory!"
|
492
|
-
end
|
493
|
-
|
494
|
-
spec_id = SpecID.new(bioworks_file)
|
495
|
-
pepxml_objs =
|
496
|
-
if spec_id.is_a? Bioworks
|
497
|
-
abort("must have opts[:params] set!") unless opts[:params]
|
498
|
-
set_from_bioworks_xml(bioworks_file, opts[:params], opts)
|
499
|
-
elsif spec_id.is_a? SRFGroup
|
500
|
-
spec_id.srfs.map do |srf|
|
501
|
-
new_from_srf(srf, opts)
|
492
|
+
unless File.exist? opts[:out_path]
|
493
|
+
FileUtils.mkpath(opts[:out_path])
|
502
494
|
end
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
pepxml_objs
|
509
|
-
|
495
|
+
unless File.directory? opts[:out_path]
|
496
|
+
abort "#{opts[:out_path]} must be a directory!"
|
497
|
+
end
|
498
|
+
|
499
|
+
spec_id = SpecID.new(bioworks_file)
|
500
|
+
pepxml_objs =
|
501
|
+
if spec_id.is_a? Bioworks
|
502
|
+
abort("must have opts[:params] set!") unless opts[:params]
|
503
|
+
set_from_bioworks_xml(bioworks_file, opts[:params], opts)
|
504
|
+
elsif spec_id.is_a? SRFGroup
|
505
|
+
spec_id.srfs.map do |srf|
|
506
|
+
new_from_srf(srf, opts)
|
507
|
+
end
|
508
|
+
else
|
509
|
+
abort "invalid object"
|
510
|
+
end
|
511
|
+
|
512
|
+
if opts[:print]
|
513
|
+
pepxml_objs.each do |obj|
|
514
|
+
obj.to_pepxml(obj.base_name + ".xml")
|
515
|
+
end
|
510
516
|
end
|
517
|
+
pepxml_objs
|
511
518
|
end
|
512
|
-
pepxml_objs
|
513
|
-
end
|
514
519
|
|
515
520
|
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
521
|
+
# Takes bioworks 3.2/3.3 xml output (with no filters)
|
522
|
+
# Returns a list of PepXML objects
|
523
|
+
# params = sequest.params file
|
524
|
+
# bioworks = bioworks.xml exported multi-consensus view file
|
525
|
+
# pepxml_version = 0 for tpp 1.2.3
|
526
|
+
# pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
|
522
527
|
def self.set_from_bioworks_xml(bioworks, params, opts={})
|
523
528
|
opts = Default_Options.merge(opts)
|
524
529
|
pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)
|
525
530
|
|
531
|
+
|
532
|
+
|
526
533
|
unless out_path
|
527
534
|
out_path = '.'
|
528
535
|
end
|
@@ -545,6 +552,13 @@ Default_Options = {
|
|
545
552
|
else ; abort "Don't recognize #{bioworks} as object or string!"
|
546
553
|
end
|
547
554
|
|
555
|
+
sample_enzyme_obj =
|
556
|
+
if opts[:sample_enzyme]
|
557
|
+
opts[:sample_enzyme]
|
558
|
+
else
|
559
|
+
params.sample_enzyme
|
560
|
+
end
|
561
|
+
|
548
562
|
#puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
|
549
563
|
|
550
564
|
## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
|
@@ -589,7 +603,7 @@ Default_Options = {
|
|
589
603
|
:ms_detector => ms_detector,
|
590
604
|
:raw_data_type => raw_data_type,
|
591
605
|
:raw_data => raw_data,
|
592
|
-
:sample_enzyme => params.sample_enzyme,
|
606
|
+
:sample_enzyme => sample_enzyme_obj, # usually, params.sample_enzyme,
|
593
607
|
:search_summary => search_summary,
|
594
608
|
})
|
595
609
|
pipeline.msms_run_summary = msms_run_summary
|
@@ -626,10 +640,11 @@ Default_Options = {
|
|
626
640
|
end
|
627
641
|
|
628
642
|
|
629
|
-
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).
|
643
|
+
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).map do |key,arr|
|
630
644
|
|
631
645
|
|
632
646
|
# Sort_by_rank and take the top hit (to mimick out2summary):
|
647
|
+
|
633
648
|
arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
|
634
649
|
top_pep = arr.pop
|
635
650
|
second_hit = arr.last # needed for deltacnstar
|
@@ -643,7 +658,7 @@ Default_Options = {
|
|
643
658
|
end
|
644
659
|
|
645
660
|
calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
|
646
|
-
|
661
|
+
|
647
662
|
# deltacn & star:
|
648
663
|
# (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
|
649
664
|
if second_hit
|
@@ -685,8 +700,8 @@ Default_Options = {
|
|
685
700
|
:tot_num_ions => tot_num_ions,
|
686
701
|
:calc_neutral_pep_mass => calc_neutral_pep_mass,
|
687
702
|
:massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
|
688
|
-
:num_tol_term =>
|
689
|
-
:num_missed_cleavages =>
|
703
|
+
:num_tol_term => sample_enzyme_obj.num_tol_term(top_pep.sequence),
|
704
|
+
:num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
|
690
705
|
:is_rejected => 0,
|
691
706
|
# These are search score attributes:
|
692
707
|
:xcorr => top_pep.xcorr,
|
@@ -697,7 +712,7 @@ Default_Options = {
|
|
697
712
|
:modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
|
698
713
|
:spectrum_query => spec_query,
|
699
714
|
})
|
700
|
-
search_result.search_hits = [search_hit]
|
715
|
+
search_result.search_hits = [search_hit] # there can be multiple search hits
|
701
716
|
spec_query.search_results = [search_result] # can be multiple search_results
|
702
717
|
spec_query
|
703
718
|
end
|
@@ -766,9 +781,8 @@ class Sequest::PepXML::SearchResult
|
|
766
781
|
attr_accessor :search_hits
|
767
782
|
|
768
783
|
# if block given, then search_hits set to return value
|
769
|
-
def initialize
|
770
|
-
|
771
|
-
else ; @search_hits = [] end
|
784
|
+
def initialize(search_hits = [])
|
785
|
+
@search_hits = search_hits
|
772
786
|
end
|
773
787
|
|
774
788
|
def to_pepxml
|
@@ -777,17 +791,6 @@ class Sequest::PepXML::SearchResult
|
|
777
791
|
end
|
778
792
|
end
|
779
793
|
|
780
|
-
def self.from_pepxml_node(node, spec_query)
|
781
|
-
self.new.from_pepxml_node(node, spec_query)
|
782
|
-
end
|
783
|
-
|
784
|
-
def from_pepxml_node(node, spec_query, msmsrun_obj)
|
785
|
-
sh_klass = msmsrun_obj.search_hit_class
|
786
|
-
@search_hits = node.children.map do |sh_n|
|
787
|
-
sh_klass.from_pepxml_node(sh_n, spec_query)
|
788
|
-
end
|
789
|
-
self
|
790
|
-
end
|
791
794
|
end
|
792
795
|
|
793
796
|
class Sequest::PepXML::SearchSummary
|
@@ -820,7 +823,11 @@ class Sequest::PepXML::SearchSummary
|
|
820
823
|
def to_pepxml
|
821
824
|
element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
|
822
825
|
search_database.to_pepxml +
|
823
|
-
|
826
|
+
if @params.enzyme =~ /^No_Enzyme/
|
827
|
+
''
|
828
|
+
else
|
829
|
+
short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini])
|
830
|
+
end +
|
824
831
|
@modifications.to_pepxml +
|
825
832
|
Sequest::PepXML::Parameters.new(@params).to_pepxml
|
826
833
|
end
|
@@ -1216,21 +1223,17 @@ class Sequest::PepXML::SpectrumQuery
|
|
1216
1223
|
end
|
1217
1224
|
end
|
1218
1225
|
|
1219
|
-
def self.from_pepxml_node(node
|
1220
|
-
self.new.from_pepxml_node(node
|
1226
|
+
def self.from_pepxml_node(node)
|
1227
|
+
self.new.from_pepxml_node(node)
|
1221
1228
|
end
|
1222
1229
|
|
1223
|
-
def from_pepxml_node(node
|
1230
|
+
def from_pepxml_node(node)
|
1224
1231
|
self[0] = node['spectrum']
|
1225
1232
|
self[1] = node['start_scan'].to_i
|
1226
1233
|
self[2] = node['end_scan'].to_i
|
1227
1234
|
self[3] = node['precursor_neutral_mass'].to_f
|
1228
1235
|
self[4] = node['index'].to_i
|
1229
1236
|
self[5] = node['assumed_charge'].to_i
|
1230
|
-
self[6] = node.children.map do |v|
|
1231
|
-
sh = Sequest::PepXML::SearchResult.new
|
1232
|
-
sh.from_pepxml_node(v, self, msmsrun_obj)
|
1233
|
-
end
|
1234
1237
|
self
|
1235
1238
|
end
|
1236
1239
|
|
@@ -1299,6 +1302,8 @@ class Sequest::PepXML::SearchHit
|
|
1299
1302
|
|
1300
1303
|
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
|
1301
1304
|
|
1305
|
+
def aaseq ; self[1] end
|
1306
|
+
def aaseq=(arg) ; self[1] = arg end
|
1302
1307
|
|
1303
1308
|
# These are all search_score elements:
|
1304
1309
|
|
@@ -1321,44 +1326,6 @@ class Sequest::PepXML::SearchHit
|
|
1321
1326
|
"#<SearchHit #{var}>"
|
1322
1327
|
end
|
1323
1328
|
|
1324
|
-
# requires Params object and full sequence (with heads and tails)
|
1325
|
-
def self.calc_num_missed_cleavages(params, sequence)
|
1326
|
-
num_missed = 0
|
1327
|
-
offset, split_after, except_before = params.enzyme_specificity
|
1328
|
-
first, middle, last = SpecID::Pep.split_sequence(sequence)
|
1329
|
-
to_regexp = "[#{split_after}]"
|
1330
|
-
if except_before.size > 0
|
1331
|
-
to_regexp << "[^#{except_before}]"
|
1332
|
-
end
|
1333
|
-
regexp = /#{to_regexp}/
|
1334
|
-
arr = middle.scan(regexp)
|
1335
|
-
num = arr.size
|
1336
|
-
if middle[-1,1] =~ regexp
|
1337
|
-
# if the regexp is a single letter (exceptions) and the last letter
|
1338
|
-
# matches, then it will count when it is not a missed cleavage
|
1339
|
-
# we can correct for this and get the right answer
|
1340
|
-
num -= 1
|
1341
|
-
else
|
1342
|
-
num
|
1343
|
-
end
|
1344
|
-
end
|
1345
|
-
|
1346
|
-
# requires Params object and full sequence (with heads and tails)
|
1347
|
-
def self.calc_num_tol_term(params, sequence)
|
1348
|
-
num_tol = 0
|
1349
|
-
offset, split_after, except_before = params.enzyme_specificity
|
1350
|
-
first, middle, last = SpecID::Pep.split_sequence(sequence)
|
1351
|
-
last_of_middle = middle[-1,1]
|
1352
|
-
first_of_middle = middle[0,1]
|
1353
|
-
if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
|
1354
|
-
num_tol += 1
|
1355
|
-
end
|
1356
|
-
if split_after.include?(last_of_middle) && !except_before.include?(last) || last == '-'
|
1357
|
-
num_tol += 1
|
1358
|
-
end
|
1359
|
-
num_tol
|
1360
|
-
end
|
1361
|
-
|
1362
1329
|
# Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
|
1363
1330
|
def self.split_ions(ions)
|
1364
1331
|
ions.split("/").map {|ion| ion.to_i }
|
@@ -1392,11 +1359,7 @@ class Sequest::PepXML::SearchHit
|
|
1392
1359
|
end
|
1393
1360
|
end
|
1394
1361
|
|
1395
|
-
def
|
1396
|
-
self.new.from_pepxml_node(node, spec_query)
|
1397
|
-
end
|
1398
|
-
|
1399
|
-
def from_pepxml_node(node, spec_query)
|
1362
|
+
def from_pepxml_node(node)
|
1400
1363
|
self[0] = node['hit_rank'].to_i
|
1401
1364
|
self[1] = node['peptide']
|
1402
1365
|
self[2] = node['peptide_prev_aa']
|
@@ -1410,24 +1373,6 @@ class Sequest::PepXML::SearchHit
|
|
1410
1373
|
self[10] = node['num_tol_term'].to_i
|
1411
1374
|
self[11] = node['num_missed_cleavages'].to_i
|
1412
1375
|
self[12] = node['is_rejected'].to_i
|
1413
|
-
if modinfo_node = node.find_first("child::modification_info")
|
1414
|
-
self[18] = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
|
1415
|
-
end
|
1416
|
-
node.find("child::search_score").each do |ss_n|
|
1417
|
-
case ss_n['name']
|
1418
|
-
when 'deltacnstar'
|
1419
|
-
self[13] = ss_n['value'].to_i
|
1420
|
-
when 'xcorr'
|
1421
|
-
self[14] = ss_n['value'].to_f
|
1422
|
-
when 'deltacn'
|
1423
|
-
self[15] = ss_n['value'].to_f
|
1424
|
-
when 'spscore'
|
1425
|
-
self[16] = ss_n['value'].to_f
|
1426
|
-
when 'sprank'
|
1427
|
-
self[17] = ss_n['value'].to_i
|
1428
|
-
end
|
1429
|
-
end
|
1430
|
-
self[19] = spec_query
|
1431
1376
|
self
|
1432
1377
|
end
|
1433
1378
|
|