mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/spec_id/bioworks.rb
CHANGED
|
@@ -5,10 +5,11 @@ require 'xmlparser'
|
|
|
5
5
|
require 'spec_id'
|
|
6
6
|
require 'zlib'
|
|
7
7
|
require 'hash_by'
|
|
8
|
-
require 'set_from_hash'
|
|
9
8
|
require 'array_class'
|
|
9
|
+
require 'fasta'
|
|
10
10
|
|
|
11
11
|
## have to pre-declare some guys
|
|
12
|
+
module ProteinReferenceable; end
|
|
12
13
|
module SpecID; end
|
|
13
14
|
module SpecID::Prot; end
|
|
14
15
|
module SpecID::Pep; end
|
|
@@ -274,7 +275,7 @@ class Bioworks::XMLParser < XMLParser
|
|
|
274
275
|
def endElement(name)
|
|
275
276
|
case name
|
|
276
277
|
when "peptide"
|
|
277
|
-
@current_obj.
|
|
278
|
+
@current_obj.set_from_hash_given_text(@current_hash)
|
|
278
279
|
when "protein"
|
|
279
280
|
else
|
|
280
281
|
@current_hash[name] = @current_data
|
|
@@ -293,6 +294,7 @@ module Bioworks::XML
|
|
|
293
294
|
end
|
|
294
295
|
|
|
295
296
|
class Bioworks::Prot
|
|
297
|
+
include ProteinReferenceable
|
|
296
298
|
include SpecID::Prot
|
|
297
299
|
include Bioworks::XML
|
|
298
300
|
|
|
@@ -357,20 +359,20 @@ class Bioworks::Prot
|
|
|
357
359
|
hash.delete("bioworksinfo")
|
|
358
360
|
hash["sf"] = hash.delete("Sf")
|
|
359
361
|
hash["pi"] = hash.delete("pI")
|
|
360
|
-
|
|
362
|
+
set_from_xml_hash(hash)
|
|
361
363
|
end
|
|
362
364
|
|
|
363
365
|
# changes the sf to Sf and pI to pi
|
|
364
366
|
def set_from_xml_hash(hash)
|
|
365
367
|
@reference = hash["reference"]
|
|
366
|
-
@protein_probability = hash["protein_probability"]
|
|
367
|
-
|
|
368
|
-
@consensus_score = hash["consensus_score"]
|
|
369
|
-
@sf = hash["Sf"]
|
|
370
|
-
@unified_score = hash["unified_score"]
|
|
371
|
-
@coverage = hash["coverage"]
|
|
372
|
-
@pi = hash["pI"]
|
|
373
|
-
@weight = hash["weight"]
|
|
368
|
+
@protein_probability = hash["protein_probability"].to_f
|
|
369
|
+
#@probability = @protein_probability.to_f
|
|
370
|
+
@consensus_score = hash["consensus_score"].to_f
|
|
371
|
+
@sf = hash["Sf"].to_f
|
|
372
|
+
@unified_score = hash["unified_score"].to_f
|
|
373
|
+
@coverage = hash["coverage"].to_f
|
|
374
|
+
@pi = hash["pI"].to_f
|
|
375
|
+
@weight = hash["weight"].to_f
|
|
374
376
|
@accession = hash["accession"]
|
|
375
377
|
end
|
|
376
378
|
end
|
|
@@ -392,6 +394,8 @@ class Bioworks::Pep
|
|
|
392
394
|
## NOTE! the mass is really the theoretical MH+!!!!
|
|
393
395
|
## NOTE! ALL values stored as strings, except peptide_probability!
|
|
394
396
|
|
|
397
|
+
#ions is a string 'x/y'
|
|
398
|
+
|
|
395
399
|
## other accessors:
|
|
396
400
|
def probability ; self[15] end
|
|
397
401
|
def mh ; self[1] end
|
|
@@ -449,14 +453,16 @@ class Bioworks::Pep
|
|
|
449
453
|
end
|
|
450
454
|
$VERBOSE = tmp_verb
|
|
451
455
|
|
|
456
|
+
undef_method :inspect
|
|
452
457
|
def inspect
|
|
453
458
|
"<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
|
|
454
459
|
|
|
455
460
|
|
|
456
461
|
end
|
|
457
462
|
|
|
458
|
-
|
|
459
|
-
|
|
463
|
+
# if cast == true, then all the data will be cast
|
|
464
|
+
def set_from_hash_given_text(hash)
|
|
465
|
+
self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
|
|
460
466
|
self.file = hash["file"]
|
|
461
467
|
self[15] = hash["peptide_probability"].to_f
|
|
462
468
|
self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
|
|
@@ -470,7 +476,7 @@ class Bioworks::Pep
|
|
|
470
476
|
hash[$1] = $2
|
|
471
477
|
#puts "IN PEP: " + $1 + ": " + $2
|
|
472
478
|
elsif line =~ @@end_pep_re
|
|
473
|
-
|
|
479
|
+
set_from_hash_given_text(hash)
|
|
474
480
|
#puts "SELF[12]: #{self[12]}"
|
|
475
481
|
#puts "SELF[12]: #{self[12]}"
|
|
476
482
|
break
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
|
|
2
|
+
require 'spec_id/sequest/pepxml'
|
|
3
|
+
require 'spec_id/mass'
|
|
4
|
+
|
|
5
|
+
# A digestor must be able to respond to these methods:
|
|
6
|
+
# Performs an in-silico digestion of fasta proteins into peptides, keeping
# only the peptides whose (M+H)+ mass falls inside a configured window.
class Digestor

  # min_mh_mass = min molecular mass of peptide (M+H)+
  attr_accessor :min_mh_mass
  # max_mh_mass = max molecular mass of peptide (M+H)+
  attr_accessor :max_mh_mass
  # the number of allowable missed cleavages
  attr_accessor :missed_cleavages
  # sample_enzyme = SampleEnzyme object
  attr_accessor :sample_enzyme
  # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
  # In addition, the following keys (as symbols) are recognized.
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  attr_accessor :mass_hash

  # returns a list of peptide objects created from a digestion of the fasta
  # proteins using the sequest params (variable mods not supported yet)
  def self.digest(fasta_obj, params_obj)
    dig = self.new
    dig.set_from_params(params_obj)
    dig.create_peptide_hash(fasta_obj).values
  end

  def initialize
  end

  # takes a parameters object and fills in the necessary values
  # (sample_enzyme, missed_cleavages, min/max_mh_mass and mass_hash).
  #
  # raises NotImplementedError if include_variable_mods is true
  # raises ArgumentError if params_obj is not a Sequest::Params
  def set_from_params(params_obj, include_variable_mods=false)
    raise NotImplementedError, "no variable mods yet" if include_variable_mods
    unless params_obj.is_a? Sequest::Params
      raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
    end

    @sample_enzyme = params_obj.sample_enzyme
    @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
    # digest_mass_range is a single space separated string: "min max"
    (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
    (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
    # '0' => average masses, '1' => monoisotopic masses.  Any other value
    # leaves this nil, which Mass.add_static_masses treats like false.
    monoisotopic_parents =
      case params_obj.mass_type_parent
      when '0' ; false
      when '1' ; true
      end

    @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
  end

  # aka 'digestion'
  # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
  # 'prots') hashed by aminoacid sequence.  The prot will be the fasta object.
  # A peptide produced by several proteins is created once and lists every
  # protein that yielded it.
  def create_peptide_hash(fasta_obj)
    # one array of candidate peptide sequences per protein
    pep_aaseqs_ar = fasta_obj.map do |prot|
      @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
    end
    prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
    passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)

    pep_to_prots_hash = {}
    fasta_obj.each_with_index do |prot, i|
      passing_pep_seqs_ar[i].each do |pep_seq|
        pep_obj = pep_to_prots_hash[pep_seq]
        if pep_obj.nil?
          pep_obj = SpecID::GenericPep.new
          pep_obj.prots = []
          pep_obj.aaseq = pep_seq
          pep_to_prots_hash[pep_seq] = pep_obj
        end
        pep_obj.prots << prot
      end
    end
    pep_to_prots_hash
  end

  # min max are both in terms of the M+H(+)
  #
  # h_plus:
  # On this website:
  # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
  # They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
  # Pass h_plus=true to use mass_hash[:h_plus] instead of mass_hash[:h].
  #
  # prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
  # peptides matching a protein aaseq.  (prot_aaseqs is only needed for
  # protein terminal modifications, which are not implemented yet, so it is
  # currently not read.)
  #
  # returns another parallel array holding, for each protein, the peptide
  # sequences whose (M+H)+ lies within [min_mh, max_mh] (inclusive)
  #
  # raises NotImplementedError if the mass_hash contains the keys
  # :add_C_term_protein or :add_N_term_protein
  # raises ArgumentError if a peptide contains a residue with no mass in
  # mass_hash
  def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
    if mass_hash.key?(:add_C_term_protein) || mass_hash.key?(:add_N_term_protein)
      raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
    end

    # figure out how much must be added to each peptide:
    # include the h2o, the h, and N and C terminal static mods
    h_key = h_plus ? :h_plus : :h
    final_add = mass_hash[:h2o] + mass_hash[h_key]
    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
      final_add += mass_hash[sym] if mass_hash.key?(sym)
    end

    # residues come out of the sequence as one letter strings, so index the
    # masses by string rather than by symbol
    hash_by_aa_string = {}
    mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = v }

    pep_aaseqs_ar.map do |pep_aaseqs|
      pep_aaseqs.select do |aaseq|
        sum = 0.0
        aaseq.split('').each do |let|
          residue_mass = hash_by_aa_string[let]
          if residue_mass.nil?
            # fail loudly with context instead of printing debug output and
            # then dying on a nil coercion
            raise ArgumentError, "no mass known for residue #{let.inspect} in peptide #{aaseq.inspect}"
          end
          sum += residue_mass
        end
        mh_plus = sum + final_add
        (mh_plus >= min_mh) && (mh_plus <= max_mh)
      end
    end
  end

end
|
data/lib/spec_id/mass.rb
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
|
|
2
|
+
# Tables of amino acid residue masses (plus a few elements) and a helper to
# apply static modifications to them.
class Mass
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications

  # monoisotopic residue masses
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    # uncommon
    :B => 114.534935,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.95364,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.805716,  # the average of the mono masses of the 20 amino acids
    :* => 118.805716,  # same as X

    # elements etc.
    :h => 1.00783,
    :h_plus => 1.00728,
    :o => 15.9949146,
    :h2o => 18.01056,
  }

  # average residue masses
  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    # uncommon
    :B => 114.5962,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.03,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.88603,  # the average of the masses of the 20 amino acids
    :* => 118.88603,  # same as X

    # elements etc.
    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }

  # returns a fresh hash (Mass::MONO if monoisotopic is true, otherwise
  # Mass::AVG; the constants themselves are not modified) where it has been
  # added to each amino acid the amount specified in the array of a
  # PepXML::Modifications object.  Each static mod must respond to
  # 'aminoacid' and 'massdiff'.
  #
  # if static_terminal_mods given then will create the following keys as
  # symbols as necessary:
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  # Each terminal mod must respond to 'protein_terminus', 'terminus' and
  # 'massdiff'.  When static_terminal_mods is nil (the default) no terminal
  # keys are added.
  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
    hash_to_use = monoisotopic ? Mass::MONO : Mass::AVG
    copy_hash = hash_to_use.dup
    static_mods.each do |mod|
      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
    end
    # the argument defaults to nil, so tolerate its absence
    (static_terminal_mods || []).each do |mod|
      if x = mod.protein_terminus
        # its a protein terminus modification
        case x
        when 'n'
          copy_hash[:add_N_term_protein] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_protein] = mod.massdiff
        end
      else
        # its a peptide terminus modification
        case mod.terminus
        when 'n'
          copy_hash[:add_N_term_peptide] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_peptide] = mod.massdiff
        end
      end
    end
    copy_hash
  end
end
|
|
116
|
+
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
require 'xml_style_parser'
|
|
2
|
+
require 'spec_id/sequest/pepxml'
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
module SpecID ; end
|
|
6
|
+
module SpecID::Parser ; end
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Parses a PeptideProphet pepXML file into a spec_id object.
class SpecID::Parser::PepProph
  include XMLStyleParser

  # parse_type is stored in @method and version in @version.  The first
  # entry of XMLStyleParser.available_xml_parsers that is either AXML or
  # LibXML decides how the root node of a file is obtained; if neither is
  # available a NotImplementedError is raised.
  def initialize(parse_type=:spec_id, version='3.0')
    @method = parse_type
    @version = version
    supported = %w(AXML LibXML)
    chosen = XMLStyleParser.available_xml_parsers.find {|name| supported.include?(name) }
    @get_root_node_from_file =
      case chosen
      when 'AXML'
        Proc.new {|file| AXML.parse_file(file) }
      when 'LibXML'  # LibXML is buggy on some machines...
        Proc.new {|file| XML::Document.file(file).root }
      else
        raise NotImplementedError, "Can only parse with #{supported.join(', ')} right now"
      end
  end

  # returns the spec_id object
  # opts[:spec_id] is filled in when given, otherwise a new
  # Proph::PepSummary is created.  The return value is whatever
  # from_pepxml_node returns for the first msms_run_summary child.
  # raises NotImplementedError when the version is nil or sorts before '3.0'
  def spec_id(file, opts={})
    if @version.nil? || @version < '3.0'
      raise NotImplementedError, "cannot do #{@version} yet"
    end
    spec_id_obj = opts[:spec_id] || Proph::PepSummary.new
    root_n = @get_root_node_from_file.call(file)
    spec_id_obj.peptideprophet_summary = root_n.find_first("descendant::peptideprophet_summary")

    run_summary_n = root_n.find_first('child::msms_run_summary')
    spec_id_obj.from_pepxml_node(run_summary_n)
  end

end
|
|
48
|
+
|
|
49
|
+
# Parses a ProteinProphet protXML file into a spec_id object holding
# proteins, peptides and protein groups.
class SpecID::Parser::ProtProph
  include XMLStyleParser
  Split_unique_stripped_peptides_re = /\+/

  # parse_type is stored in @method (the method 'parse' dispatches to) and
  # version in @version.  The first entry of
  # XMLStyleParser.available_xml_parsers that is either AXML or LibXML
  # decides how the root node of a file is obtained; if neither is available
  # a NotImplementedError is raised.
  def initialize(parse_type=:spec_id, version='4')
    @method = parse_type
    @version = version

    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object
  # opts[:spec_id] is filled in when given, otherwise a new
  # Proph::ProtSummary is created.  Sets 'peps', 'prots' and 'prot_groups'
  # on it.
  # raises NotImplementedError unless the version is '4'
  def spec_id(file, opts={})
    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
    spec_id_obj = opts[:spec_id] || Proph::ProtSummary.new
    protein_summary_n = @get_root_node_from_file.call(file)

    pep_hash = {}
    prot_hash = {}
    protein_groups = []

    # get all the proteins from inside protein groups
    # The first child is taken to be the protein_summary_header and is
    # skipped (could grab some of this info if we wanted...)
    header_seen = false
    protein_summary_n.each do |child_n|
      if !header_seen
        header_seen = true
      elsif child_n.name == 'protein_group'
        protein_groups << get_proteins(child_n, pep_hash, prot_hash)
      end
    end

    # need to finalize hash stuff: peptides were given protein *names* for
    # their parent proteins; swap each name for its protein object now that
    # every protein has been seen
    pep_hash.each do |k,pep|
      new_prots = []
      pep.prots.each do |prot_or_string|
        if prot_or_string.is_a?(Proph::Prot)
          new_prots << prot_or_string
        else
          prt = prot_hash[prot_or_string]
          # a nil lookup means this is an indistinguishable protein!
          # (it has no entry of its own, so it is dropped)
          new_prots << prt unless prt.nil?
        end
      end
      pep.prots = new_prots
    end

    spec_id_obj.peps = pep_hash.values
    spec_id_obj.prots = prot_hash.values
    spec_id_obj.prot_groups = protein_groups
    spec_id_obj
  end

  # takes a Y or N and gives true/false (nil for anything else)
  def booleanize(string)
    case string
    when 'Y'
      true
    when 'N'
      false
    else
      nil
    end
  end

  # assumes that all the rest of the nodes are protein_groups
  # pep_hash is hashed on aaseq OR modified peptide amino acid sequence (if
  # modified) + charge
  # prot_hash is hashed on protein name
  # (as far as I can tell, all protein entries are unique!)
  # Both hashes are filled in as a side effect.
  # returns a ProtGroup object
  def get_proteins(protein_group_node, pep_hash, prot_hash)

    protein_group_proteins = []

    protein_group_node.each do |protein_n|
      # NOTE(review): raises the Exception base class, which a bare 'rescue'
      # will not catch; kept as is so callers see the same exception class
      raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'
      # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description

      # INITIALIZE the protein and set key
      protein_name = protein_n['protein_name']
      peps = []
      # the description (nil here) is filled in from the annotation child
      protein = Proph::Prot.new( [protein_name, protein_n['probability'].to_f,
        protein_n['n_indistinguishable_proteins'].to_i,
        protein_n['percent_coverage'].to_f,
        protein_n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
        protein_n['group_sibling_id'], protein_n['total_number_peptides'].to_i,
        protein_n['pct_spectrum_ids'].to_f, nil,
        peps ])
      protein_group_proteins << protein
      prot_hash[protein_name] = protein

      # traverse through the peptides (and annotation)
      protein_n.each do |protein_sub_n|
        # create a proteins array for each peptide
        proteins = [protein]

        if protein_sub_n.name == 'annotation'
          protein.description = protein_sub_n['protein_description']
        end
        if protein_sub_n.name == 'peptide'
          peptide_n = protein_sub_n
          # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots

          peptide_sequence = peptide_n['peptide_sequence']
          charge = peptide_n['charge'].to_i

          # GET list of all proteins and modifications

          mod_info = nil
          peptide_hash_string = peptide_sequence
          if peptide_n.child?
            peptide_n.each do |pep_sub_n|
              case pep_sub_n.name
              when 'peptide_parent_protein'
                # NOTE! the proteins list will have strings until the assoc.
                # prot is found!
                proteins << pep_sub_n['protein_name']
              when 'modification_info'
                masses = pep_sub_n.map do |mod_aa_mass_n|
                  Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
                end
                # a modified peptide is hashed on its modified sequence
                peptide_hash_string = pep_sub_n['modified_peptide']
                mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
              end
            end
          end

          key = [peptide_hash_string, charge]
          peptide = pep_hash[key]
          if peptide.nil?
            # the ninth field is n_sibling_peptides_bin (see the field list
            # above), not a second copy of n_sibling_peptides
            peptide = Proph::Prot::Pep.new([peptide_sequence, charge,
              peptide_n['initial_probability'].to_f, peptide_n['nsp_adjusted_probability'].to_f,
              peptide_n['weight'].to_f, booleanize(peptide_n['is_nondegenerate_evidence']),
              peptide_n['n_enzymatic_termini'].to_i, peptide_n['n_sibling_peptides'].to_f,
              peptide_n['n_sibling_peptides_bin'].to_i, peptide_n['n_instances'].to_i,
              booleanize(peptide_n['is_contributing_evidence']),
              peptide_n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
            pep_hash[key] = peptide
          end
          peps << peptide
        end
      end # end protein children
    end
    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
  end

  # dispatches to the method named at construction (parse_type)
  def parse(file, opts)
    send(@method, file, opts)
  end

end
|