mspire 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/spec_id/bioworks.rb
CHANGED
@@ -5,10 +5,11 @@ require 'xmlparser'
|
|
5
5
|
require 'spec_id'
|
6
6
|
require 'zlib'
|
7
7
|
require 'hash_by'
|
8
|
-
require 'set_from_hash'
|
9
8
|
require 'array_class'
|
9
|
+
require 'fasta'
|
10
10
|
|
11
11
|
## have to pre-declare some guys
|
12
|
+
module ProteinReferenceable; end
|
12
13
|
module SpecID; end
|
13
14
|
module SpecID::Prot; end
|
14
15
|
module SpecID::Pep; end
|
@@ -274,7 +275,7 @@ class Bioworks::XMLParser < XMLParser
|
|
274
275
|
def endElement(name)
|
275
276
|
case name
|
276
277
|
when "peptide"
|
277
|
-
@current_obj.
|
278
|
+
@current_obj.set_from_hash_given_text(@current_hash)
|
278
279
|
when "protein"
|
279
280
|
else
|
280
281
|
@current_hash[name] = @current_data
|
@@ -293,6 +294,7 @@ module Bioworks::XML
|
|
293
294
|
end
|
294
295
|
|
295
296
|
class Bioworks::Prot
|
297
|
+
include ProteinReferenceable
|
296
298
|
include SpecID::Prot
|
297
299
|
include Bioworks::XML
|
298
300
|
|
@@ -357,20 +359,20 @@ class Bioworks::Prot
|
|
357
359
|
hash.delete("bioworksinfo")
|
358
360
|
hash["sf"] = hash.delete("Sf")
|
359
361
|
hash["pi"] = hash.delete("pI")
|
360
|
-
|
362
|
+
set_from_xml_hash(hash)
|
361
363
|
end
|
362
364
|
|
363
365
|
# changes the sf to Sf and pI to pi
|
364
366
|
def set_from_xml_hash(hash)
|
365
367
|
@reference = hash["reference"]
|
366
|
-
@protein_probability = hash["protein_probability"]
|
367
|
-
|
368
|
-
@consensus_score = hash["consensus_score"]
|
369
|
-
@sf = hash["Sf"]
|
370
|
-
@unified_score = hash["unified_score"]
|
371
|
-
@coverage = hash["coverage"]
|
372
|
-
@pi = hash["pI"]
|
373
|
-
@weight = hash["weight"]
|
368
|
+
@protein_probability = hash["protein_probability"].to_f
|
369
|
+
#@probability = @protein_probability.to_f
|
370
|
+
@consensus_score = hash["consensus_score"].to_f
|
371
|
+
@sf = hash["Sf"].to_f
|
372
|
+
@unified_score = hash["unified_score"].to_f
|
373
|
+
@coverage = hash["coverage"].to_f
|
374
|
+
@pi = hash["pI"].to_f
|
375
|
+
@weight = hash["weight"].to_f
|
374
376
|
@accession = hash["accession"]
|
375
377
|
end
|
376
378
|
end
|
@@ -392,6 +394,8 @@ class Bioworks::Pep
|
|
392
394
|
## NOTE! the mass is really the theoretical MH+!!!!
|
393
395
|
## NOTE! ALL values stored as strings, except peptide_probability!
|
394
396
|
|
397
|
+
#ions is a string 'x/y'
|
398
|
+
|
395
399
|
## other accessors:
|
396
400
|
def probability ; self[15] end
|
397
401
|
def mh ; self[1] end
|
@@ -449,14 +453,16 @@ class Bioworks::Pep
|
|
449
453
|
end
|
450
454
|
$VERBOSE = tmp_verb
|
451
455
|
|
456
|
+
undef_method :inspect
|
452
457
|
def inspect
|
453
458
|
"<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
|
454
459
|
|
455
460
|
|
456
461
|
end
|
457
462
|
|
458
|
-
|
459
|
-
|
463
|
+
# if cast == true, then all the data will be cast
|
464
|
+
def set_from_hash_given_text(hash)
|
465
|
+
self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
|
460
466
|
self.file = hash["file"]
|
461
467
|
self[15] = hash["peptide_probability"].to_f
|
462
468
|
self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
|
@@ -470,7 +476,7 @@ class Bioworks::Pep
|
|
470
476
|
hash[$1] = $2
|
471
477
|
#puts "IN PEP: " + $1 + ": " + $2
|
472
478
|
elsif line =~ @@end_pep_re
|
473
|
-
|
479
|
+
set_from_hash_given_text(hash)
|
474
480
|
#puts "SELF[12]: #{self[12]}"
|
475
481
|
#puts "SELF[12]: #{self[12]}"
|
476
482
|
break
|
@@ -0,0 +1,139 @@
|
|
1
|
+
|
2
|
+
require 'spec_id/sequest/pepxml'
|
3
|
+
require 'spec_id/mass'
|
4
|
+
|
5
|
+
# A digestor must be able to respond to these methods:
|
6
|
+
# Performs an in-silico digestion of the proteins in a fasta object and keeps
# only the peptides whose (M+H)+ mass falls inside a configured window.
#
# A digestor must be able to respond to these methods:
class Digestor

  # min_mh_mass = min molecular mass of peptide (M+H)+
  attr_accessor :min_mh_mass
  # max_mh_mass = max molecular mass of peptide (M+H)+
  attr_accessor :max_mh_mass
  # the number of allowable missed cleavages
  attr_accessor :missed_cleavages
  # sample_enzyme = SampleEnzyme object
  attr_accessor :sample_enzyme
  # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
  # In addition, the following keys (as symbols) are recognized.
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  attr_accessor :mass_hash

  # returns a list of peptide objects created from a digestion of the fasta
  # proteins using the sequest params (variable mods not supported yet)
  def self.digest(fasta_obj, params_obj)
    dig = self.new
    dig.set_from_params(params_obj)
    dig.create_peptide_hash(fasta_obj).values
  end

  def initialize
  end

  # takes a parameters object and fills in the necessary values
  # (sample_enzyme, missed_cleavages, min/max_mh_mass and mass_hash).
  #
  # Only Sequest::Params objects are recognized; anything else raises an
  # ArgumentError.  Variable modifications are not supported, so passing
  # include_variable_mods=true raises NotImplementedError.
  def set_from_params(params_obj, include_variable_mods=false)
    raise NotImplementedError, "no variable mods yet" if include_variable_mods
    unless params_obj.is_a? Sequest::Params
      raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
    end

    @sample_enzyme = params_obj.sample_enzyme
    @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
    # digest_mass_range is a space separated "min max" string
    (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
    (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
    # '1' selects monoisotopic parent masses, '0' average masses.  Any other
    # value yields nil, which Mass.add_static_masses treats like false.
    monoisotopic_parents = case params_obj.mass_type_parent
                           when '0' ; false
                           when '1' ; true
                           end

    @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
  end

  # aka 'digestion'
  # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
  # 'prots') hashed by aminoacid sequence.  The prot will be the fasta object.
  #
  # Each protein is digested with @sample_enzyme, the resulting peptides are
  # filtered through limit_sizes, and every surviving peptide sequence is
  # mapped to one shared peptide object that collects the proteins it came
  # from.
  def create_peptide_hash(fasta_obj)
    pep_aaseqs_ar = fasta_obj.map do |prot|
      @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
    end
    prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
    passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)

    pep_to_prots_hash = {}
    fasta_obj.each_with_index do |prot, i|
      passing_pep_seqs_ar[i].each do |pep_seq|
        pep_obj = pep_to_prots_hash[pep_seq]
        if pep_obj.nil?
          # first time this sequence is seen: create its peptide object
          pep_obj = SpecID::GenericPep.new
          pep_obj.prots = []
          pep_obj.aaseq = pep_seq
          pep_to_prots_hash[pep_seq] = pep_obj
        end
        pep_obj.prots << prot
      end
    end
    pep_to_prots_hash
  end

  # min max are both in terms of the M+H(+)
  #
  # h_plus:
  # On this website:
  # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
  # They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
  # (so by default the :h mass is added; pass h_plus=true to add :h_plus)
  #
  # The prot_aaseq is used if the mass_hash contains the keys
  # :add_C_term_protein or :add_N_term_protein
  # (not implemented yet: NotImplementedError is raised in that case)
  #
  # prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
  # peptides matching a protein aaseq
  # returns another parallel array holding, for each protein, only the
  # peptides with min_mh <= (M+H)+ <= max_mh
  #
  # Raises ArgumentError if a peptide contains a letter that has no entry in
  # mass_hash.
  def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
    if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
      raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
    end

    # figure out how much must be added to each peptide
    # include the h2o, the h, and N and C terminal static mods
    h_key = h_plus ? :h_plus : :h
    final_add = mass_hash[:h2o] + mass_hash[h_key]
    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
      final_add += mass_hash[sym] if mass_hash.key?(sym)
    end

    # the peptide letters are strings, the mass_hash keys are symbols
    mass_by_letter = {}
    mass_hash.each {|k,v| mass_by_letter[k.to_s] = v }

    pep_aaseqs_ar.map do |pep_aaseqs|
      pep_aaseqs.select do |aaseq|
        sum = 0.0
        aaseq.split('').each do |let|
          mass = mass_by_letter[let]
          if mass.nil?
            # fail with a useful message instead of printing debug output
            # and dying on nil arithmetic
            raise ArgumentError, "no mass known for residue #{let.inspect} in peptide #{aaseq.inspect}"
          end
          sum += mass
        end
        mh_plus = sum + final_add
        (mh_plus >= min_mh) && (mh_plus <= max_mh)
      end
    end
  end

end
|
data/lib/spec_id/mass.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
|
2
|
+
# Tables of amino acid residue and small molecule masses, plus a helper for
# layering static (fixed) modifications on top of them.
class Mass
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications

  # Monoisotopic masses keyed by one letter symbol.
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    # uncommon
    :B => 114.534935,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.95364,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.805716, # the average of the mono masses of the 20 amino acids
    :* => 118.805716, # same as X

    # elements etc.
    :h => 1.00783,
    :h_plus => 1.00728,
    :o => 15.9949146,
    :h2o => 18.01056,
  }

  # Average masses keyed by one letter symbol.
  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    # uncommon
    :B => 114.5962,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.03,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.88603, # the average of the masses of the 20 amino acids
    :* => 118.88603, # same as X

    # elements etc.
    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }

  # returns a fresh hash where it has been added to each amino acid the amount
  # specified in the array of a PepXML::Modifications object
  # (MONO is the starting point when monoisotopic is true, AVG otherwise;
  # the constants themselves are not modified).
  #
  # Each entry of static_mods must respond to 'aminoacid' and 'massdiff'.
  # Each entry of static_terminal_mods must respond to 'protein_terminus',
  # 'terminus' and 'massdiff'.  Either list may be nil, which is treated as
  # empty.
  #
  # if static_terminal_mods given than will create the following keys as
  # symbols as necessary:
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
    copy_hash = (monoisotopic ? Mass::MONO : Mass::AVG).dup

    (static_mods || []).each do |mod|
      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
    end

    # static_terminal_mods defaults to nil, so guard before iterating
    (static_terminal_mods || []).each do |mod|
      if x = mod.protein_terminus
        # its a protein terminus modification
        case x
        when 'n'
          copy_hash[:add_N_term_protein] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_protein] = mod.massdiff
        end
      else
        # its a peptide terminus modification
        case mod.terminus
        when 'n'
          copy_hash[:add_N_term_peptide] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_peptide] = mod.massdiff
        end
      end
    end
    copy_hash
  end
end
|
116
|
+
|
@@ -0,0 +1,236 @@
|
|
1
|
+
require 'xml_style_parser'
|
2
|
+
require 'spec_id/sequest/pepxml'
|
3
|
+
|
4
|
+
|
5
|
+
module SpecID ; end
|
6
|
+
module SpecID::Parser ; end
|
7
|
+
|
8
|
+
|
9
|
+
# Parses a PeptideProphet pepXML file into a spec_id object.
class SpecID::Parser::PepProph
  include XMLStyleParser

  # parse_type is stored as the name of the parsing method (@method).
  # version is the file format version; spec_id refuses versions below 3.0.
  #
  # Picks the first available XML parser among AXML and LibXML and stores a
  # proc that returns the root node of a file.  Raises NotImplementedError
  # if neither is available.
  def initialize(parse_type=:spec_id, version='3.0')
    @method = parse_type
    @version = version
    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object
  #
  # opts[:spec_id] may supply an existing object to fill; otherwise a new
  # Proph::PepSummary is created.  The object's peptideprophet_summary is set
  # to the first 'peptideprophet_summary' descendant of the root node and the
  # return value is whatever from_pepxml_node returns for the first
  # 'msms_run_summary' child.
  #
  # Raises NotImplementedError when the version is nil or below 3.0.
  def spec_id(file, opts={})
    # compare numerically: as strings, '10.0' would sort before '3.0'
    if @version.nil? or @version.to_f < 3.0
      raise NotImplementedError, "cannot do #{@version} yet"
    end
    spec_id_obj = opts[:spec_id] || Proph::PepSummary.new
    msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
    spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")

    msms_run_summary_n = msms_pipeline_analysis_n.find_first('child::msms_run_summary')
    spec_id_obj.from_pepxml_node(msms_run_summary_n)
  end

end
|
48
|
+
|
49
|
+
# Parses a ProteinProphet protein summary XML file into a spec_id object
# holding proteins, peptides and protein groups.
class SpecID::Parser::ProtProph
  include XMLStyleParser
  # separator used inside the 'unique_stripped_peptides' attribute
  Split_unique_stripped_peptides_re = /\+/

  # parse_type is the name of the method that 'parse' dispatches to.
  # version is the file format version; spec_id only accepts '4'.
  #
  # Picks the first available XML parser among AXML and LibXML and stores a
  # proc that returns the root node of a file.  Raises NotImplementedError
  # if neither is available.
  def initialize(parse_type=:spec_id, version='4')
    @method = parse_type
    @version = version

    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object
  #
  # opts[:spec_id] may supply an existing object to fill; otherwise a new
  # Proph::ProtSummary is created.  On return its peps, prots and prot_groups
  # have been set.  Raises NotImplementedError unless the version is '4'.
  def spec_id(file, opts={})
    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
    spec_id_obj = opts[:spec_id] || Proph::ProtSummary.new
    protein_summary_n = @get_root_node_from_file.call(file)

    pep_hash = {}
    prot_hash = {}
    protein_groups = []

    # get all the proteins from inside protein groups
    # The first child is taken to be the protein_summary_header and skipped
    # (could grab some of this info if we wanted...)
    header_skipped = false
    protein_summary_n.each do |child_n|
      if !header_skipped
        header_skipped = true
      elsif child_n.name == 'protein_group'
        protein_groups << get_proteins(child_n, pep_hash, prot_hash)
      end
    end

    # need to finalize hash stuff:
    # peptides may still reference parent proteins by name; swap each name
    # for its protein object now that every protein has been read
    pep_hash.each do |key, pep|
      resolved = []
      pep.prots.each do |prot_or_string|
        if prot_or_string.is_a?(Proph::Prot)
          resolved << prot_or_string
        else
          prt = prot_hash[prot_or_string]
          # a name with no protein entry is an indistinguishable protein
          # and is dropped
          resolved << prt unless prt.nil?
        end
      end
      pep.prots = resolved
    end

    spec_id_obj.peps = pep_hash.values
    spec_id_obj.prots = prot_hash.values
    spec_id_obj.prot_groups = protein_groups
    spec_id_obj
  end

  # takes a Y or N and gives true/false
  # (anything else gives nil)
  def booleanize(string)
    case string
    when 'Y'
      true
    when 'N'
      false
    else
      nil
    end
  end

  # assumes that all the rest of the nodes are protein_groups
  # pep_hash is hashed on aaseq OR modified peptide amino acid sequence (if
  # modified) + charge
  # (as far as I can tell, all protein entries are unique!)
  # prot_hash is hashed on protein name; both hashes are filled in place.
  # returns a ProtGroup object
  #
  # Raises a RuntimeError if the group holds anything but 'protein' nodes.
  def get_proteins(protein_group_node, pep_hash, prot_hash)

    protein_group_proteins = []

    protein_group_node.each do |protein_n|
      if protein_n.name != 'protein'
        raise RuntimeError, "not expecting anything but protein's, got: #{protein_n.name}"
      end
      # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description

      # INITIALIZE the protein and set key
      # (the description slot starts as nil and is filled from the
      # annotation node below)
      protein_name = protein_n['protein_name']
      peps = []
      protein = Proph::Prot.new( [protein_name, protein_n['probability'].to_f,
        protein_n['n_indistinguishable_proteins'].to_i,
        protein_n['percent_coverage'].to_f,
        protein_n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
        protein_n['group_sibling_id'], protein_n['total_number_peptides'].to_i,
        protein_n['pct_spectrum_ids'].to_f, nil,
        peps ])
      protein_group_proteins << protein
      prot_hash[protein_name] = protein

      # traverse through the peptides (and annotation)
      protein_n.each do |protein_sub_n|
        case protein_sub_n.name
        when 'annotation'
          protein.description = protein_sub_n['protein_description']
        when 'peptide'
          peptide_n = protein_sub_n
          # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots

          peptide_sequence = peptide_n['peptide_sequence']
          charge = peptide_n['charge'].to_i

          # GET list of all proteins and modifications
          # create a proteins array for each peptide
          proteins = [protein]
          mod_info = nil
          peptide_hash_string = peptide_sequence
          if peptide_n.child?
            peptide_n.each do |pep_sub_n|
              case pep_sub_n.name
              when 'peptide_parent_protein'
                # NOTE! the proteins list will have strings until the assoc.
                # prot is found!
                proteins << pep_sub_n['protein_name']
              when 'modification_info'
                masses = pep_sub_n.map do |mod_aa_mass_n|
                  Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
                end
                # modified peptides are hashed on the modified sequence
                peptide_hash_string = pep_sub_n['modified_peptide']
                mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
              end
            end
          end

          key = [peptide_hash_string, charge]
          peptide = pep_hash[key]
          if peptide.nil?
            peptide = Proph::Prot::Pep.new([peptide_sequence, charge,
              peptide_n['initial_probability'].to_f, peptide_n['nsp_adjusted_probability'].to_f,
              peptide_n['weight'].to_f, booleanize(peptide_n['is_nondegenerate_evidence']),
              peptide_n['n_enzymatic_termini'].to_i, peptide_n['n_sibling_peptides'].to_f,
              # the bin has its own attribute (was read from n_sibling_peptides)
              peptide_n['n_sibling_peptides_bin'].to_i, peptide_n['n_instances'].to_i,
              booleanize(peptide_n['is_contributing_evidence']),
              peptide_n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
            pep_hash[key] = peptide
          end
          peps << peptide
        end
      end # end protein children
    end
    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
  end

  # dispatches to the method named at construction (parse_type), passing
  # file and opts through
  def parse(file, opts={})
    send(@method, file, opts)
  end

end
|