mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id/mass.rb
DELETED
|
@@ -1,179 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# Lookup tables of amino acid residue masses (plus a few element and small
# molecule masses) and class-level helpers that compute peptide masses from
# those tables.
class Mass
  # Monoisotopic masses keyed by one-letter residue symbol.
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    # uncommon
    # Written as an explicit parenthesised average (like :Z below) so that
    # operator precedence cannot turn it into D + N/2.
    :B => (115.02694 + 114.04293) / 2,  # average of aspartic acid and asparagine
    :U => 150.95364,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.805716, # the average of the mono masses of the 20 amino acids
    :* => 118.805716, # same as X
    :Z => (129.04259 + 128.05858) / 2,  # average glutamic acid and glutamine

    # elements etc.
    :h => 1.00783,
    #:h_plus => 1.00728,  # this is the mass I had
    :h_plus => 1.007276,  # this is the mass used by mascot merge.pl
    :o => 15.9949146,
    :h2o => 18.01056,
  }

  # Average masses keyed by one-letter residue symbol.
  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    # uncommon
    # Explicit parenthesised average, see the note on MONO[:B].
    :B => (115.0886 + 114.1038) / 2,  # average of aspartic acid and asparagine
    :U => 150.03,     # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.88603,  # the average of the masses of the 20 amino acids
    :* => 118.88603,  # same as X
    :Z => (129.1155 + 128.1307) / 2,  # average glutamic acid and glutamine

    # elements etc.
    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }

  # Returns a fresh hash (a copy of MONO when +monoisotopic+ is truthy,
  # otherwise of AVG) in which each static modification's massdiff has been
  # added to its amino acid. MONO and AVG themselves are not modified.
  #
  # static_mods:: enumerable of objects responding to +aminoacid+ and
  #               +massdiff+ (e.g. the array of a PepXML::Modifications
  #               object). The amino acid must already be a key of the table.
  # static_terminal_mods:: optional enumerable of objects responding to
  #               +protein_terminus+, +terminus+ and +massdiff+. nil (the
  #               default) means "no terminal modifications".
  #
  # If static_terminal_mods are given, the following keys are created (as
  # symbols) as necessary:
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
    copy_hash = (monoisotopic ? Mass::MONO : Mass::AVG).dup

    static_mods.each do |mod|
      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
    end

    # The parameter defaults to nil, so guard before iterating.
    (static_terminal_mods || []).each do |mod|
      if (protein_end = mod.protein_terminus)
        # it's a protein terminus modification
        case protein_end
        when 'n'
          copy_hash[:add_N_term_protein] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_protein] = mod.massdiff
        end
      else
        # it's a peptide terminus modification
        case mod.terminus
        when 'n'
          copy_hash[:add_N_term_peptide] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_peptide] = mod.massdiff
        end
      end
    end

    copy_hash
  end

  # Returns an array of masses parallel to the array of amino acid sequence
  # strings passed in. Each mass is the sum of the residue masses plus water
  # plus +h_plus+.
  #
  # If you want the mass with H+, then pass in the proton mass as +h_plus+.
  # The mass hash must respond to
  #     :h2o (water)
  # and contain at least the twenty amino acids (keyed by string or symbol).
  # The mass hash may contain :add_N_term_peptide or :add_C_term_peptide,
  # in which case these are added to every returned mass.
  def self.masses(aaseqs, mass_hash=Mass::MONO, h_plus=0.0)
    final_add = mass_hash[:h2o] + h_plus
    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
      final_add += mass_hash[sym] if mass_hash.key?(sym)
    end

    # Sequences are split into one-character strings, so index by string.
    hash_by_aa_string = {}
    mass_hash.each { |key, mass| hash_by_aa_string[key.to_s] = mass }

    aaseqs.map do |aaseq|
      sum = 0.0
      aaseq.split('').each do |let|
        sum += hash_by_aa_string[let]
      end
      sum + final_add
    end
  end

end
|
|
146
|
-
|
|
147
|
-
class Mass
  # Computes peptide masses from a fixed mass table. The table is re-keyed by
  # string once at construction so that repeated calls to #masses do not have
  # to rebuild it.
  class Calculator

    # mass_hash must contain :h2o or 'h2o'; that value is added to every
    # peptide to represent its tails. add_extra is added on top of that
    # (e.g., an H+).
    def initialize(mass_hash, add_extra=0.0)
      @mass_hash = mass_hash_to_s(mass_hash)
      @final_add = @mass_hash['h2o'] + add_extra
    end

    # Returns a new hash holding the same values as mass_hash with every key
    # converted via to_s.
    def mass_hash_to_s(mass_hash)
      mass_hash.inject({}) do |by_string, (key, mass)|
        by_string[key.to_s] = mass
        by_string
      end
    end

    # Returns an array of masses parallel to aaseqs (an array of amino acid
    # sequence strings). Aborts the process with a message if a sequence
    # contains a letter that is missing from the mass table.
    def masses(aaseqs)
      aaseqs.map do |aaseq|
        # start from the constant part fixed at initialization
        aaseq.split('').inject(@final_add) do |total, letter|
          unless @mass_hash.key?(letter)
            abort "LETTER not found in mass_hash: #{letter}"
          end
          total + @mass_hash[letter]
        end
      end
    end

  end
end
|
|
179
|
-
|
data/lib/spec_id/parser/proph.rb
DELETED
|
@@ -1,335 +0,0 @@
|
|
|
1
|
-
require 'xml_style_parser'
|
|
2
|
-
require 'spec_id/sequest/pepxml'
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# Make sure the SpecID and SpecID::Parser namespaces exist before the parser
# classes below are defined inside them.
module SpecID
  module Parser
  end
end
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
# Reads a pepXML file whose root holds msms_run_summary children and fills in
# a Proph::PepSummary (or a caller-supplied object) with run summaries,
# peptide hits and the proteins those hits point at.
class SpecID::Parser::PepProph
  include XMLStyleParser

  # Looks up (or creates and registers) the protein called +name+ in
  # global_prot_hash, appends search_hit to that protein's peps, and returns
  # the protein.
  def get_protein(search_hit, name, description, global_prot_hash)
    unless global_prot_hash.key?(name)
      global_prot_hash[name] = Proph::PepSummary::Prot.new([name, description, []])
    end
    protein = global_prot_hash[name]
    protein.peps << search_hit
    protein
  end

  # Remembers the parse type and version and picks the first available XML
  # backend that this parser supports. Raises NotImplementedError when none
  # of the supported backends is available.
  def initialize(parse_type=:spec_id, version='3.0')
    @method = parse_type
    @version = version
    supported = %w(AXML LibXML)
    backend = XMLStyleParser.available_xml_parsers.select { |name| supported.include?(name) }.first
    if 'AXML' == backend
      @get_root_node_from_file = Proc.new { |file| AXML.parse_file(file) }
    elsif 'LibXML' == backend # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new { |file| XML::Document.file(file).root }
    else
      raise NotImplementedError, "Can only parse with #{supported.join(', ')} right now"
    end
  end

  # Parses +file+ and returns the spec_id object.
  # Recognized opts:
  #   :spec_id           object to fill in (default: a new Proph::PepSummary)
  #   :global_prot_hash  hash of name => protein, to be shared when several
  #                      of these files are to be combined (default: {})
  # Raises NotImplementedError when the version is nil or compares (as a
  # string) below '3.0'.
  def spec_id(file, opts={})
    if @version.nil? || @version < '3.0'
      raise NotImplementedError, "cannot do #{@version} yet"
    end

    spec_id_obj = opts[:spec_id] || Proph::PepSummary.new
    global_prot_hash = opts[:global_prot_hash] || {}

    root_n = @get_root_node_from_file.call(file)
    spec_id_obj.peptideprophet_summary = root_n.find_first("descendant::peptideprophet_summary")

    run_nodes = root_n.find('child::msms_run_summary')
    spec_id_obj.msms_run_summaries = run_nodes.map do |run_n|
      parse_msms_run_summary(run_n, global_prot_hash)
    end

    # flatten every search hit of every run into a single list
    all_hits = []
    spec_id_obj.msms_run_summaries.each do |run_summary|
      run_summary.spectrum_queries.each do |query|
        query.search_results.each do |result|
          all_hits.push( *(result.search_hits) )
        end
      end
    end

    spec_id_obj.peps = all_hits
    spec_id_obj.prots = global_prot_hash.values
    spec_id_obj
  end

  # Builds and returns an msms_run_summary object from its XML node,
  # including its spectrum queries, their search results and search hits.
  # Proteins referenced by the hits are registered in global_prot_hash.
  def parse_msms_run_summary(msms_run_summary_n, global_prot_hash)
    run_summary = Sequest::PepXML::MSMSRunSummary.new
    run_summary.from_pepxml_node(msms_run_summary_n)

    enzyme_n = msms_run_summary_n.find_first("child::sample_enzyme")
    run_summary.sample_enzyme = SampleEnzyme.from_pepxml_node( enzyme_n )

    # the spectrum queries are located as siblings following search_summary,
    # which is itself located as a sibling following sample_enzyme
    summary_n = enzyme_n.find_first("following-sibling::search_summary")
    query_nodes = summary_n.find("following-sibling::spectrum_query")

    run_summary.spectrum_queries = query_nodes.map do |query_n|
      query = Sequest::PepXML::SpectrumQuery.from_pepxml_node(query_n)

      query.search_results = query_n.children.map do |result_n|
        result = Sequest::PepXML::SearchResult.new

        result.search_hits = result_n.children.map do |hit_n|
          hit = Proph::PepSummary::Pep.new  # descended from SearchHit
          hit.from_pepxml_node(hit_n)
          hit.spectrum_query = query

          # primary protein first, then any alternative proteins
          hit_prots = [ get_protein(hit, hit_n['protein'], hit_n['protein_descr'], global_prot_hash) ]
          if hit.num_tot_proteins > 1
            hit_n.find('child::alternative_protein').each do |alt_n|
              hit_prots << get_protein(hit, alt_n['protein'], alt_n['protein_descr'], global_prot_hash)
            end
          end
          hit.prots = hit_prots

          mod_n = hit_n.find_first("child::modification_info")
          if mod_n
            hit.modification_info = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(mod_n)
          end

          # search scores: only the names listed here are stored
          hit_n.find("child::search_score").each do |score_n|
            case score_n['name']
            when 'deltacnstar' then hit.deltacnstar = score_n['value'].to_i
            when 'xcorr'       then hit.xcorr = score_n['value'].to_f
            when 'deltacn'     then hit.deltacn = score_n['value'].to_f
            when 'spscore'     then hit.spscore = score_n['value'].to_f
            when 'sprank'      then hit.sprank = score_n['value'].to_i
            end
          end

          hit
        end

        result
      end

      query
    end

    ## NOTE: this is currently just the xml node (the msms_run_summary node
    ## itself)!!!! TODO: wrap everything up into a better search summary
    ## object (to eventually deprecate the params object)
    run_summary.search_summary = msms_run_summary_n
    run_summary
  end

end
|
|
147
|
-
|
|
148
|
-
# Reads a protein summary XML file (root node whose children are a header
# followed by protein_group nodes) and fills in a Proph::ProtSummary (or a
# caller-supplied object) with peptides, proteins and protein groups.
class SpecID::Parser::ProtProph
  include XMLStyleParser

  # unique_stripped_peptides is a '+'-separated list
  Split_unique_stripped_peptides_re = /\+/

  # Remembers the parse type (the method #parse dispatches to) and version,
  # and picks the first available XML backend that this parser supports.
  # Raises NotImplementedError when none of the supported backends is
  # available.
  def initialize(parse_type=:spec_id, version='4')
    @method = parse_type
    @version = version

    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      #puts "parsing with AXML (XMLParser based)" if $VERBOSE
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      #puts "parsing with LibXML" if $VERBOSE
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # Parses +file+ and returns the spec_id object (opts[:spec_id] if given,
  # otherwise a new Proph::ProtSummary) with peps, prots and prot_groups set.
  # Raises NotImplementedError unless the version is exactly '4'.
  def spec_id(file, opts={})
    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
    spec_id_obj = opts[:spec_id] || Proph::ProtSummary.new
    protein_summary_n = @get_root_node_from_file.call(file)

    pep_hash = {}
    prot_hash = {}
    protein_groups = []

    # The first child is taken to be the protein summary header and is
    # skipped (could grab some of this info if we wanted...); after that,
    # every protein_group child is parsed.
    header_pending = true
    protein_summary_n.each do |child_n|
      if header_pending
        header_pending = false
      elsif child_n.name == 'protein_group'
        protein_groups << get_proteins(child_n, pep_hash, prot_hash)
      end
    end

    # While parsing, a peptide's prots list may hold protein *names* taken
    # from peptide_parent_protein nodes. Now that every protein is known,
    # swap names for protein objects.
    pep_hash.each_value do |pep|
      new_prots = []
      pep.prots.each do |prot_or_string|
        if prot_or_string.is_a?(Proph::Prot)
          new_prots << prot_or_string
        else
          prt = prot_hash[prot_or_string]
          # a name with no protein entry is an indistinguishable protein
          # and is dropped
          new_prots << prt unless prt.nil?
        end
      end
      pep.prots = new_prots
    end

    spec_id_obj.peps = pep_hash.values
    spec_id_obj.prots = prot_hash.values
    spec_id_obj.prot_groups = protein_groups
    spec_id_obj
  end

  # takes a 'Y' or 'N' and gives true/false (nil for anything else)
  def booleanize(string)
    case string
    when 'Y'
      true
    when 'N'
      false
    else
      nil
    end
  end

  # Parses one protein_group node and returns a Proph::ProtGroup.
  # Assumes every child of the group node is a protein node and raises
  # otherwise.
  # pep_hash is keyed on [aaseq OR modified peptide sequence (if modified),
  # charge] and is filled with every peptide encountered; a peptide that is
  # already present is reused rather than rebuilt.
  # prot_hash is keyed on protein name and is filled with every protein
  # encountered (as far as I can tell, all protein entries are unique!)
  def get_proteins(protein_group_node, pep_hash, prot_hash)
    protein_group_proteins = []

    protein_group_node.each do |protein_n|
      # NOTE(review): raising bare Exception escapes a plain `rescue`;
      # kept as-is so callers that rescue Exception see no change.
      raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'

      # Prot fields, in order:
      # protein_name probability n_indistinguishable_proteins
      # percent_coverage unique_stripped_peptides group_sibling_id
      # total_number_peptides pct_spectrum_ids description peps
      protein_name = protein_n['protein_name']
      peps = []
      protein = Proph::Prot.new( [protein_name, protein_n['probability'].to_f,
                                 protein_n['n_indistinguishable_proteins'].to_i,
                                 protein_n['percent_coverage'].to_f,
                                 protein_n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
                                 protein_n['group_sibling_id'], protein_n['total_number_peptides'].to_i,
                                 protein_n['pct_spectrum_ids'].to_f, nil,
                                 peps ])
      protein_group_proteins << protein
      prot_hash[protein_name] = protein

      # traverse through the peptides (and annotation)
      protein_n.each do |protein_sub_n|
        if protein_sub_n.name == 'annotation'
          protein.description = protein_sub_n['protein_description']
        end
        next unless protein_sub_n.name == 'peptide'

        peptide_n = protein_sub_n
        peptide_sequence = peptide_n['peptide_sequence']
        charge = peptide_n['charge'].to_i

        # proteins for this peptide: the enclosing protein object plus the
        # *names* of any parent proteins (replaced by objects in spec_id)
        proteins = [protein]
        mod_info = nil
        peptide_hash_string = peptide_sequence
        if peptide_n.child?
          peptide_n.each do |pep_sub_n|
            case pep_sub_n.name
            when 'peptide_parent_protein'
              proteins << pep_sub_n['protein_name']
            when 'modification_info'
              masses = pep_sub_n.map do |mod_aa_mass_n|
                Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
              end
              # a modified peptide is keyed on its modified sequence
              peptide_hash_string = pep_sub_n['modified_peptide']
              mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
            end
          end
        end

        key = [peptide_hash_string, charge]
        unless pep_hash.key?(key)
          # Pep fields, in order:
          # peptide_sequence charge initial_probability
          # nsp_adjusted_probability weight is_nondegenerate_evidence
          # n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin
          # n_instances is_contributing_evidence calc_neutral_pep_mass
          # modification_info prots
          pep_hash[key] = Proph::Prot::Pep.new([peptide_sequence, charge,
                                               peptide_n['initial_probability'].to_f, peptide_n['nsp_adjusted_probability'].to_f,
                                               peptide_n['weight'].to_f, booleanize(peptide_n['is_nondegenerate_evidence']),
                                               peptide_n['n_enzymatic_termini'].to_i, peptide_n['n_sibling_peptides'].to_f,
                                               # read the bin from its own attribute, not from n_sibling_peptides
                                               peptide_n['n_sibling_peptides_bin'].to_i, peptide_n['n_instances'].to_i,
                                               booleanize(peptide_n['is_contributing_evidence']),
                                               peptide_n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
        end
        peps << pep_hash[key]
      end  # end protein children
    end

    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
  end

  # Dispatches to the method named by the parse type given at construction
  # (e.g. :spec_id), passing file and opts through.
  def parse(file, opts)
    send(@method, file, opts)
  end

end
|