mspire 0.3.1 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require 'yaml'
|
2
2
|
require 'spec_id/precision/output'
|
3
3
|
require 'table'
|
4
4
|
require 'matrix'
|
@@ -12,12 +12,18 @@ class SpecID::Precision::Prob::Output
|
|
12
12
|
# returns array of data arrays and parallel labels
|
13
13
|
def to_cols_and_labels(answer_hash)
|
14
14
|
col_labels = %w(count probability peptide)
|
15
|
+
col_labels[1] = 'q_values' if answer_hash.key?(:q_values)
|
15
16
|
|
16
17
|
cols = []
|
17
18
|
cols << answer_hash[:count]
|
18
|
-
|
19
|
+
if answer_hash.key?(:q_values)
|
20
|
+
cols << answer_hash[:q_values]
|
21
|
+
else
|
22
|
+
cols << answer_hash[:probabilities]
|
23
|
+
end
|
19
24
|
cols << answer_hash[:aaseqs]
|
20
25
|
|
26
|
+
|
21
27
|
# if there is a single modified peptide, we'll include the column
|
22
28
|
if answer_hash.key?(:modified_peptides)
|
23
29
|
cols << answer_hash[:modified_peptides]
|
@@ -1,7 +1,6 @@
|
|
1
1
|
|
2
2
|
require 'array_class'
|
3
|
-
|
4
|
-
puts( require 'spec_id/sequest/pepxml' )
|
3
|
+
require 'spec_id/sequest/pepxml'
|
5
4
|
require 'spec_id/parser/proph'
|
6
5
|
|
7
6
|
module Sequest ; end
|
@@ -13,10 +12,12 @@ module SpecID ; end
|
|
13
12
|
module SpecID::Prot ; end
|
14
13
|
module SpecID::Pep ; end
|
15
14
|
|
15
|
+
|
16
|
+
|
16
17
|
module Proph
|
17
18
|
|
18
|
-
class PepSummary
|
19
|
-
|
19
|
+
class PepSummary
|
20
|
+
include SpecID
|
20
21
|
|
21
22
|
Filetype_and_version_re_new = /version="PeptideProphet v([\d\.]+) /
|
22
23
|
|
@@ -25,7 +26,7 @@ module Proph
|
|
25
26
|
# the protein groups
|
26
27
|
# currently these are just xml nodes returned!
|
27
28
|
attr_accessor :peptideprophet_summary
|
28
|
-
attr_accessor :
|
29
|
+
attr_accessor :msms_run_summaries
|
29
30
|
attr_accessor :version
|
30
31
|
|
31
32
|
def hi_prob_best ; true end
|
@@ -51,24 +52,26 @@ module Proph
|
|
51
52
|
end
|
52
53
|
|
53
54
|
def initialize(file=nil)
|
54
|
-
@prots = nil
|
55
55
|
if file
|
56
56
|
@version = get_version(file)
|
57
|
-
|
58
|
-
SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
|
57
|
+
spec_id = SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
|
59
58
|
end
|
60
59
|
end
|
61
60
|
end
|
62
61
|
|
62
|
+
# this is a SpecID::Pep (by interface: not including stuff yet)
|
63
63
|
class PepSummary::Pep < Sequest::PepXML::SearchHit
|
64
|
-
|
64
|
+
|
65
|
+
# aaseq is defined in SearchHit
|
66
|
+
|
67
|
+
%w(probability fval ntt nmc massd prots).each do |guy|
|
65
68
|
self.add_member(guy)
|
66
69
|
end
|
67
70
|
|
68
71
|
# returns self
|
69
|
-
def from_pepxml_node(node
|
70
|
-
super(node
|
71
|
-
|
72
|
+
def from_pepxml_node(node)
|
73
|
+
super(node)
|
74
|
+
|
72
75
|
an_res = node.find_first('child::analysis_result')
|
73
76
|
pp_n = an_res.find_first('child::peptideprophet_result')
|
74
77
|
self.probability = pp_n['probability'].to_f
|
@@ -87,6 +90,16 @@ module Proph
|
|
87
90
|
self
|
88
91
|
end
|
89
92
|
end
|
93
|
+
|
94
|
+
::Proph::PepSummary::Prot = ArrayClass.new(%w(name protein_descr peps))
|
95
|
+
|
96
|
+
class PepSummary::Prot
|
97
|
+
def first_entry ; self[0] end ## name
|
98
|
+
def reference ; self[0] + ' ' + self[1] end
|
99
|
+
end
|
100
|
+
|
90
101
|
end
|
91
102
|
|
92
103
|
|
104
|
+
|
105
|
+
|
data/lib/spec_id/sequest.rb
CHANGED
@@ -1,5 +1,33 @@
|
|
1
1
|
require 'spec_id/sequest/params'
|
2
|
+
require 'hash_by'
|
3
|
+
require 'sort_by_attributes.rb'
|
2
4
|
|
3
5
|
module Sequest
|
6
|
+
|
7
|
+
# returns one array of peptide hits: indexes hits based on index_by, takes
|
8
|
+
# the uniq ones and then sorts the group by sort_by (compatible with
|
9
|
+
# sort_by_attributes) then slices from first_index to last_index
|
10
|
+
# (inclusive).
|
11
|
+
def self.other_hits(peps, first_index=1, last_index=9, index_by=[:base_name, :first_scan, :charge], sort_by=[:xcorr, {:down => :xcorr}])
|
12
|
+
all_hits = []
|
13
|
+
peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
|
14
|
+
if peps_per_scan.size >= (first_index + 1)
|
15
|
+
all_hits.push( *(peps_per_scan.uniq.sort_by_attributes(*sort_by)[first_index..last_index]) )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
all_hits.compact
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.other_hits_sorted_by_xcorr(peps, first_index, last_index, index_by=[:base_name, :first_scan, :charge])
|
22
|
+
all_hits = []
|
23
|
+
peps.hash_by(*index_by).each do |scan_key, peps_per_scan|
|
24
|
+
if peps_per_scan.size >= (first_index + 1)
|
25
|
+
all_hits.push( *(peps_per_scan.uniq.sort_by {|x| x.xcorr }.reverse[first_index..last_index]) )
|
26
|
+
end
|
27
|
+
end
|
28
|
+
all_hits.compact
|
29
|
+
|
30
|
+
end
|
31
|
+
|
4
32
|
end
|
5
33
|
|
@@ -155,25 +155,6 @@ class Sequest::PepXML::MSMSRunSummary
|
|
155
155
|
@ms_detector = node['msDetector']
|
156
156
|
@raw_data_type = node['raw_data_type']
|
157
157
|
@raw_data = node['raw_data']
|
158
|
-
|
159
|
-
sample_enzyme_n = node.find_first("child::sample_enzyme")
|
160
|
-
@sample_enzyme = SampleEnzyme.from_pepxml_node(sample_enzyme_n)
|
161
|
-
|
162
|
-
search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
|
163
|
-
spectrum_queries = search_summary_n.find("following-sibling::spectrum_query")
|
164
|
-
@spectrum_queries = spectrum_queries.map do |sq_n|
|
165
|
-
Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n, self)
|
166
|
-
end
|
167
|
-
|
168
|
-
## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
|
169
|
-
# into a better search summary object (to eventually deprecate the params object)
|
170
|
-
@search_summary = node ## in future call SearchSummary.from_pepxml_node
|
171
|
-
@peps = []
|
172
|
-
@spectrum_queries.each do |sq|
|
173
|
-
sq.search_results.each do |sr|
|
174
|
-
@peps.push( *(sr.search_hits) )
|
175
|
-
end
|
176
|
-
end
|
177
158
|
self
|
178
159
|
end
|
179
160
|
end
|
@@ -353,7 +334,13 @@ Default_Options = {
|
|
353
334
|
search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)
|
354
335
|
|
355
336
|
# create the sample enzyme from the params object:
|
356
|
-
|
337
|
+
sample_enzyme_obj =
|
338
|
+
if opts[:sample_enzyme]
|
339
|
+
opts[:sample_enzyme]
|
340
|
+
else
|
341
|
+
params.sample_enzyme
|
342
|
+
end
|
343
|
+
opts[:sample_enzyme] = sample_enzyme_obj
|
357
344
|
|
358
345
|
## Create the pepxml obj and top level objects
|
359
346
|
pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
|
@@ -390,36 +377,20 @@ Default_Options = {
|
|
390
377
|
out_files = srf.out_files
|
391
378
|
spectrum_queries_arr = Array.new(srf.dta_files.size)
|
392
379
|
files_with_hits_index = 0 ## will end up being 1 indexed
|
393
|
-
srf.dta_files.each_with_index do |dta_file,i|
|
394
|
-
next if out_files[i].num_hits == 0
|
395
|
-
files_with_hits_index += 1
|
396
|
-
|
397
|
-
# We don't need to sort the hits by xcorr since it comes pre-sorted in
|
398
|
-
# srf files!
|
399
|
-
#arr = hits.sort_by{|v| v.xcorr }
|
400
|
-
|
401
|
-
# Get proper deltacn and deltacnstar
|
402
|
-
# under new srf, deltacn is already corrected for what prophet wants,
|
403
|
-
# deltacn_orig is how to access the old one
|
404
|
-
# Prophet deltacn is not the same as the native Sequest deltacn
|
405
|
-
# It is the deltacn of the second best hit!
|
406
|
-
|
407
|
-
hits = out_files[i].hits
|
408
|
-
top_hit = hits[0]
|
409
|
-
second_hit = hits[1]
|
410
|
-
deltacnstar =
|
411
|
-
if second_hit ; '0'
|
412
|
-
else ; '1'
|
413
|
-
end
|
414
|
-
|
415
|
-
## mass calculations:
|
416
|
-
precursor_neutral_mass = dta_file.mh - h_plus
|
417
|
-
calc_neutral_pep_mass = top_hit[0] - h_plus
|
418
380
|
|
419
|
-
|
381
|
+
deltacn_orig = opts[:deltacn_orig]
|
382
|
+
deltacn_index =
|
383
|
+
if deltacn_orig ; 20
|
384
|
+
else 19
|
385
|
+
end
|
420
386
|
|
387
|
+
srf.dta_files.each_with_index do |dta_file,dta_i|
|
388
|
+
next if out_files[dta_i].num_hits == 0
|
389
|
+
files_with_hits_index += 1
|
421
390
|
|
391
|
+
precursor_neutral_mass = dta_file.mh - h_plus
|
422
392
|
|
393
|
+
(start_scan, end_scan, charge) = srf_index[dta_i]
|
423
394
|
sq_hash = {
|
424
395
|
:spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
|
425
396
|
:start_scan => start_scan,
|
@@ -432,39 +403,70 @@ Default_Options = {
|
|
432
403
|
|
433
404
|
spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)
|
434
405
|
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
406
|
+
|
407
|
+
hits = out_files[dta_i].hits
|
408
|
+
|
409
|
+
search_hits =
|
410
|
+
if opts[:all_hits]
|
411
|
+
Array.new(out_files[dta_i].num_hits) # all hits
|
412
|
+
else
|
413
|
+
Array.new(1) # top hit only
|
414
|
+
end
|
415
|
+
|
416
|
+
(0...(search_hits.size)).each do |hit_i|
|
417
|
+
hit = hits[hit_i]
|
418
|
+
# under the modified deltacn schema (like bioworks)
|
419
|
+
# Get proper deltacn and deltacnstar
|
420
|
+
# under new srf, deltacn is already corrected for what prophet wants,
|
421
|
+
# deltacn_orig_updated is how to access the old one
|
422
|
+
# Prophet deltacn is not the same as the native Sequest deltacn
|
423
|
+
# It is the deltacn of the second best hit!
|
424
|
+
|
425
|
+
## mass calculations:
|
426
|
+
calc_neutral_pep_mass = hit[0] - h_plus
|
427
|
+
|
428
|
+
|
429
|
+
sequence = hit.sequence
|
430
|
+
|
431
|
+
# NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
|
432
|
+
## THIS IS ALL INNER LOOP, so we make every effort at speed here:
|
433
|
+
(prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
|
434
|
+
# 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
|
435
|
+
|
436
|
+
sh_hash = {
|
437
|
+
:hit_rank => hit_i+1,
|
438
|
+
:peptide => pepseq,
|
439
|
+
:peptide_prev_aa => prevaa,
|
440
|
+
:peptide_next_aa => nextaa,
|
441
|
+
:protein => hit[10].first.reference.split(" ").first,
|
442
|
+
:num_tot_proteins => hit[10].size,
|
443
|
+
:num_matched_ions => hit[7],
|
444
|
+
:tot_num_ions => hit[8],
|
445
|
+
:calc_neutral_pep_mass => calc_neutral_pep_mass,
|
446
|
+
:massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
|
447
|
+
:num_tol_term => sample_enzyme_obj.num_tol_term(sequence),
|
448
|
+
:num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
|
449
|
+
:is_rejected => 0,
|
450
|
+
# These are search score attributes:
|
451
|
+
:xcorr => hit[3],
|
452
|
+
:deltacn => hit[deltacn_index],
|
453
|
+
:spscore => hit[2],
|
454
|
+
:sprank => hit[6],
|
455
|
+
:modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
|
456
|
+
}
|
457
|
+
unless deltacn_orig
|
458
|
+
sh_hash[:deltacnstar] =
|
459
|
+
if hits[hit_i+1].nil? # no next hit? then its deltacnstar == 1
|
460
|
+
'1'
|
461
|
+
else
|
462
|
+
'0'
|
463
|
+
end
|
464
|
+
end
|
465
|
+
search_hits[hit_i] = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
|
466
|
+
end
|
465
467
|
|
466
468
|
search_result = Sequest::PepXML::SearchResult.new
|
467
|
-
search_result.search_hits =
|
469
|
+
search_result.search_hits = search_hits
|
468
470
|
spectrum_query.search_results = [search_result]
|
469
471
|
spectrum_queries_arr[files_with_hits_index] = spectrum_query
|
470
472
|
end
|
@@ -473,56 +475,61 @@ Default_Options = {
|
|
473
475
|
pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
|
474
476
|
pepxml_obj.base_name = pipeline.msms_run_summary.base_name
|
475
477
|
pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
|
476
|
-
|
478
|
+
|
477
479
|
pepxml_obj
|
478
480
|
end
|
479
481
|
|
480
482
|
# takes an .srg or bioworks.xml file
|
481
483
|
# if possible, ensures that an mzXML file is present for each pepxml file
|
482
484
|
# :print => true, will print files
|
485
|
+
# NOTES: num_tol_term and num_missed_cleavages are both calculated from the
|
486
|
+
# sample_enzyme. Thus, a No_Enzyme search may still pass in a
|
487
|
+
# :sample_enzyme option to get these calculated.
|
483
488
|
def self.set_from_bioworks(bioworks_file, opts={})
|
484
489
|
opts = Default_Options.merge(opts)
|
485
490
|
## Create the out_path directory if necessary
|
486
491
|
|
487
|
-
|
488
|
-
|
489
|
-
end
|
490
|
-
unless File.directory? opts[:out_path]
|
491
|
-
abort "#{opts[:out_path]} must be a directory!"
|
492
|
-
end
|
493
|
-
|
494
|
-
spec_id = SpecID.new(bioworks_file)
|
495
|
-
pepxml_objs =
|
496
|
-
if spec_id.is_a? Bioworks
|
497
|
-
abort("must have opts[:params] set!") unless opts[:params]
|
498
|
-
set_from_bioworks_xml(bioworks_file, opts[:params], opts)
|
499
|
-
elsif spec_id.is_a? SRFGroup
|
500
|
-
spec_id.srfs.map do |srf|
|
501
|
-
new_from_srf(srf, opts)
|
492
|
+
unless File.exist? opts[:out_path]
|
493
|
+
FileUtils.mkpath(opts[:out_path])
|
502
494
|
end
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
pepxml_objs
|
509
|
-
|
495
|
+
unless File.directory? opts[:out_path]
|
496
|
+
abort "#{opts[:out_path]} must be a directory!"
|
497
|
+
end
|
498
|
+
|
499
|
+
spec_id = SpecID.new(bioworks_file)
|
500
|
+
pepxml_objs =
|
501
|
+
if spec_id.is_a? Bioworks
|
502
|
+
abort("must have opts[:params] set!") unless opts[:params]
|
503
|
+
set_from_bioworks_xml(bioworks_file, opts[:params], opts)
|
504
|
+
elsif spec_id.is_a? SRFGroup
|
505
|
+
spec_id.srfs.map do |srf|
|
506
|
+
new_from_srf(srf, opts)
|
507
|
+
end
|
508
|
+
else
|
509
|
+
abort "invalid object"
|
510
|
+
end
|
511
|
+
|
512
|
+
if opts[:print]
|
513
|
+
pepxml_objs.each do |obj|
|
514
|
+
obj.to_pepxml(obj.base_name + ".xml")
|
515
|
+
end
|
510
516
|
end
|
517
|
+
pepxml_objs
|
511
518
|
end
|
512
|
-
pepxml_objs
|
513
|
-
end
|
514
519
|
|
515
520
|
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
521
|
+
# Takes bioworks 3.2/3.3 xml output (with no filters)
|
522
|
+
# Returns a list of PepXML objects
|
523
|
+
# params = sequest.params file
|
524
|
+
# bioworks = bioworks.xml exported multi-consensus view file
|
525
|
+
# pepxml_version = 0 for tpp 1.2.3
|
526
|
+
# pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
|
522
527
|
def self.set_from_bioworks_xml(bioworks, params, opts={})
|
523
528
|
opts = Default_Options.merge(opts)
|
524
529
|
pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)
|
525
530
|
|
531
|
+
|
532
|
+
|
526
533
|
unless out_path
|
527
534
|
out_path = '.'
|
528
535
|
end
|
@@ -545,6 +552,13 @@ Default_Options = {
|
|
545
552
|
else ; abort "Don't recognize #{bioworks} as object or string!"
|
546
553
|
end
|
547
554
|
|
555
|
+
sample_enzyme_obj =
|
556
|
+
if opts[:sample_enzyme]
|
557
|
+
opts[:sample_enzyme]
|
558
|
+
else
|
559
|
+
params.sample_enzyme
|
560
|
+
end
|
561
|
+
|
548
562
|
#puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
|
549
563
|
|
550
564
|
## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
|
@@ -589,7 +603,7 @@ Default_Options = {
|
|
589
603
|
:ms_detector => ms_detector,
|
590
604
|
:raw_data_type => raw_data_type,
|
591
605
|
:raw_data => raw_data,
|
592
|
-
:sample_enzyme => params.sample_enzyme,
|
606
|
+
:sample_enzyme => sample_enzyme_obj, # usually, params.sample_enzyme,
|
593
607
|
:search_summary => search_summary,
|
594
608
|
})
|
595
609
|
pipeline.msms_run_summary = msms_run_summary
|
@@ -626,10 +640,11 @@ Default_Options = {
|
|
626
640
|
end
|
627
641
|
|
628
642
|
|
629
|
-
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).
|
643
|
+
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).map do |key,arr|
|
630
644
|
|
631
645
|
|
632
646
|
# Sort by xcorr and take the top hit (to mimic out2summary):
|
647
|
+
|
633
648
|
arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
|
634
649
|
top_pep = arr.pop
|
635
650
|
second_hit = arr.last # needed for deltacnstar
|
@@ -643,7 +658,7 @@ Default_Options = {
|
|
643
658
|
end
|
644
659
|
|
645
660
|
calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
|
646
|
-
|
661
|
+
|
647
662
|
# deltacn & star:
|
648
663
|
# (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
|
649
664
|
if second_hit
|
@@ -685,8 +700,8 @@ Default_Options = {
|
|
685
700
|
:tot_num_ions => tot_num_ions,
|
686
701
|
:calc_neutral_pep_mass => calc_neutral_pep_mass,
|
687
702
|
:massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
|
688
|
-
:num_tol_term =>
|
689
|
-
:num_missed_cleavages =>
|
703
|
+
:num_tol_term => sample_enzyme_obj.num_tol_term(top_pep.sequence),
|
704
|
+
:num_missed_cleavages => sample_enzyme_obj.num_missed_cleavages(pepseq),
|
690
705
|
:is_rejected => 0,
|
691
706
|
# These are search score attributes:
|
692
707
|
:xcorr => top_pep.xcorr,
|
@@ -697,7 +712,7 @@ Default_Options = {
|
|
697
712
|
:modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
|
698
713
|
:spectrum_query => spec_query,
|
699
714
|
})
|
700
|
-
search_result.search_hits = [search_hit]
|
715
|
+
search_result.search_hits = [search_hit] # there can be multiple search hits
|
701
716
|
spec_query.search_results = [search_result] # can be multiple search_results
|
702
717
|
spec_query
|
703
718
|
end
|
@@ -766,9 +781,8 @@ class Sequest::PepXML::SearchResult
|
|
766
781
|
attr_accessor :search_hits
|
767
782
|
|
768
783
|
# search_hits defaults to an empty array when none is passed in
|
769
|
-
def initialize
|
770
|
-
|
771
|
-
else ; @search_hits = [] end
|
784
|
+
def initialize(search_hits = [])
|
785
|
+
@search_hits = search_hits
|
772
786
|
end
|
773
787
|
|
774
788
|
def to_pepxml
|
@@ -777,17 +791,6 @@ class Sequest::PepXML::SearchResult
|
|
777
791
|
end
|
778
792
|
end
|
779
793
|
|
780
|
-
def self.from_pepxml_node(node, spec_query)
|
781
|
-
self.new.from_pepxml_node(node, spec_query)
|
782
|
-
end
|
783
|
-
|
784
|
-
def from_pepxml_node(node, spec_query, msmsrun_obj)
|
785
|
-
sh_klass = msmsrun_obj.search_hit_class
|
786
|
-
@search_hits = node.children.map do |sh_n|
|
787
|
-
sh_klass.from_pepxml_node(sh_n, spec_query)
|
788
|
-
end
|
789
|
-
self
|
790
|
-
end
|
791
794
|
end
|
792
795
|
|
793
796
|
class Sequest::PepXML::SearchSummary
|
@@ -820,7 +823,11 @@ class Sequest::PepXML::SearchSummary
|
|
820
823
|
def to_pepxml
|
821
824
|
element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
|
822
825
|
search_database.to_pepxml +
|
823
|
-
|
826
|
+
if @params.enzyme =~ /^No_Enzyme/
|
827
|
+
''
|
828
|
+
else
|
829
|
+
short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini])
|
830
|
+
end +
|
824
831
|
@modifications.to_pepxml +
|
825
832
|
Sequest::PepXML::Parameters.new(@params).to_pepxml
|
826
833
|
end
|
@@ -1216,21 +1223,17 @@ class Sequest::PepXML::SpectrumQuery
|
|
1216
1223
|
end
|
1217
1224
|
end
|
1218
1225
|
|
1219
|
-
def self.from_pepxml_node(node
|
1220
|
-
self.new.from_pepxml_node(node
|
1226
|
+
def self.from_pepxml_node(node)
|
1227
|
+
self.new.from_pepxml_node(node)
|
1221
1228
|
end
|
1222
1229
|
|
1223
|
-
def from_pepxml_node(node
|
1230
|
+
def from_pepxml_node(node)
|
1224
1231
|
self[0] = node['spectrum']
|
1225
1232
|
self[1] = node['start_scan'].to_i
|
1226
1233
|
self[2] = node['end_scan'].to_i
|
1227
1234
|
self[3] = node['precursor_neutral_mass'].to_f
|
1228
1235
|
self[4] = node['index'].to_i
|
1229
1236
|
self[5] = node['assumed_charge'].to_i
|
1230
|
-
self[6] = node.children.map do |v|
|
1231
|
-
sh = Sequest::PepXML::SearchResult.new
|
1232
|
-
sh.from_pepxml_node(v, self, msmsrun_obj)
|
1233
|
-
end
|
1234
1237
|
self
|
1235
1238
|
end
|
1236
1239
|
|
@@ -1299,6 +1302,8 @@ class Sequest::PepXML::SearchHit
|
|
1299
1302
|
|
1300
1303
|
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
|
1301
1304
|
|
1305
|
+
def aaseq ; self[1] end
|
1306
|
+
def aaseq=(arg) ; self[1] = arg end
|
1302
1307
|
|
1303
1308
|
# These are all search_score elements:
|
1304
1309
|
|
@@ -1321,44 +1326,6 @@ class Sequest::PepXML::SearchHit
|
|
1321
1326
|
"#<SearchHit #{var}>"
|
1322
1327
|
end
|
1323
1328
|
|
1324
|
-
# requires Params object and full sequence (with heads and tails)
|
1325
|
-
def self.calc_num_missed_cleavages(params, sequence)
|
1326
|
-
num_missed = 0
|
1327
|
-
offset, split_after, except_before = params.enzyme_specificity
|
1328
|
-
first, middle, last = SpecID::Pep.split_sequence(sequence)
|
1329
|
-
to_regexp = "[#{split_after}]"
|
1330
|
-
if except_before.size > 0
|
1331
|
-
to_regexp << "[^#{except_before}]"
|
1332
|
-
end
|
1333
|
-
regexp = /#{to_regexp}/
|
1334
|
-
arr = middle.scan(regexp)
|
1335
|
-
num = arr.size
|
1336
|
-
if middle[-1,1] =~ regexp
|
1337
|
-
# if the regexp is a single letter (exceptions) and the last letter
|
1338
|
-
# matches, then it will count when it is not a missed cleavage
|
1339
|
-
# we can correct for this and get the right answer
|
1340
|
-
num -= 1
|
1341
|
-
else
|
1342
|
-
num
|
1343
|
-
end
|
1344
|
-
end
|
1345
|
-
|
1346
|
-
# requires Params object and full sequence (with heads and tails)
|
1347
|
-
def self.calc_num_tol_term(params, sequence)
|
1348
|
-
num_tol = 0
|
1349
|
-
offset, split_after, except_before = params.enzyme_specificity
|
1350
|
-
first, middle, last = SpecID::Pep.split_sequence(sequence)
|
1351
|
-
last_of_middle = middle[-1,1]
|
1352
|
-
first_of_middle = middle[0,1]
|
1353
|
-
if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
|
1354
|
-
num_tol += 1
|
1355
|
-
end
|
1356
|
-
if split_after.include?(last_of_middle) && !except_before.include?(last) || last == '-'
|
1357
|
-
num_tol += 1
|
1358
|
-
end
|
1359
|
-
num_tol
|
1360
|
-
end
|
1361
|
-
|
1362
1329
|
# Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
|
1363
1330
|
def self.split_ions(ions)
|
1364
1331
|
ions.split("/").map {|ion| ion.to_i }
|
@@ -1392,11 +1359,7 @@ class Sequest::PepXML::SearchHit
|
|
1392
1359
|
end
|
1393
1360
|
end
|
1394
1361
|
|
1395
|
-
def
|
1396
|
-
self.new.from_pepxml_node(node, spec_query)
|
1397
|
-
end
|
1398
|
-
|
1399
|
-
def from_pepxml_node(node, spec_query)
|
1362
|
+
def from_pepxml_node(node)
|
1400
1363
|
self[0] = node['hit_rank'].to_i
|
1401
1364
|
self[1] = node['peptide']
|
1402
1365
|
self[2] = node['peptide_prev_aa']
|
@@ -1410,24 +1373,6 @@ class Sequest::PepXML::SearchHit
|
|
1410
1373
|
self[10] = node['num_tol_term'].to_i
|
1411
1374
|
self[11] = node['num_missed_cleavages'].to_i
|
1412
1375
|
self[12] = node['is_rejected'].to_i
|
1413
|
-
if modinfo_node = node.find_first("child::modification_info")
|
1414
|
-
self[18] = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
|
1415
|
-
end
|
1416
|
-
node.find("child::search_score").each do |ss_n|
|
1417
|
-
case ss_n['name']
|
1418
|
-
when 'deltacnstar'
|
1419
|
-
self[13] = ss_n['value'].to_i
|
1420
|
-
when 'xcorr'
|
1421
|
-
self[14] = ss_n['value'].to_f
|
1422
|
-
when 'deltacn'
|
1423
|
-
self[15] = ss_n['value'].to_f
|
1424
|
-
when 'spscore'
|
1425
|
-
self[16] = ss_n['value'].to_f
|
1426
|
-
when 'sprank'
|
1427
|
-
self[17] = ss_n['value'].to_i
|
1428
|
-
end
|
1429
|
-
end
|
1430
|
-
self[19] = spec_query
|
1431
1376
|
self
|
1432
1377
|
end
|
1433
1378
|
|