mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/spec_id.rb
CHANGED
|
@@ -1,75 +1,19 @@
|
|
|
1
1
|
require 'ostruct'
|
|
2
2
|
require 'set'
|
|
3
3
|
require 'hash_by'
|
|
4
|
-
require 'spec_id/precision'
|
|
5
4
|
require 'roc'
|
|
6
5
|
require 'sample_enzyme' # for others
|
|
7
6
|
require 'spec_id/bioworks'
|
|
8
7
|
require 'spec_id/sequest'
|
|
9
|
-
require 'spec_id/proph'
|
|
8
|
+
require 'spec_id/proph/prot_summary'
|
|
10
9
|
require 'spec_id_xml'
|
|
10
|
+
require 'spec_id/mass'
|
|
11
|
+
require 'fasta'
|
|
12
|
+
|
|
13
|
+
module ProteinReferenceable ; end
|
|
11
14
|
|
|
12
15
|
class SampleEnzyme ; end
|
|
13
16
|
|
|
14
|
-
class Mass
|
|
15
|
-
# http://expasy.org/tools/findmod/findmod_masses.html
|
|
16
|
-
# still need to add the modifications
|
|
17
|
-
MONO = {
|
|
18
|
-
:A => 71.03711,
|
|
19
|
-
:R => 156.10111,
|
|
20
|
-
:N => 114.04293,
|
|
21
|
-
:D => 115.02694,
|
|
22
|
-
:C => 103.00919,
|
|
23
|
-
:E => 129.04259,
|
|
24
|
-
:Q => 128.05858,
|
|
25
|
-
:G => 57.02146,
|
|
26
|
-
:H => 137.05891,
|
|
27
|
-
:I => 113.08406,
|
|
28
|
-
:L => 113.08406,
|
|
29
|
-
:K => 128.09496,
|
|
30
|
-
:M => 131.04049,
|
|
31
|
-
:F => 147.06841,
|
|
32
|
-
:P => 97.05276,
|
|
33
|
-
:S => 87.03203,
|
|
34
|
-
:T => 101.04768,
|
|
35
|
-
:W => 186.07931,
|
|
36
|
-
:Y => 163.06333,
|
|
37
|
-
:V => 99.06841,
|
|
38
|
-
|
|
39
|
-
:h => 1.00783,
|
|
40
|
-
:h_plus => 1.00728,
|
|
41
|
-
:o => 15.9949146,
|
|
42
|
-
:h2o => 18.01056,
|
|
43
|
-
|
|
44
|
-
}
|
|
45
|
-
AVG = {
|
|
46
|
-
:A => 71.0788,
|
|
47
|
-
:R => 156.1875,
|
|
48
|
-
:N => 114.1038,
|
|
49
|
-
:D => 115.0886,
|
|
50
|
-
:C => 103.1388,
|
|
51
|
-
:E => 129.1155,
|
|
52
|
-
:Q => 128.1307,
|
|
53
|
-
:G => 57.0519,
|
|
54
|
-
:H => 137.1411,
|
|
55
|
-
:I => 113.1594,
|
|
56
|
-
:L => 113.1594,
|
|
57
|
-
:K => 128.1741,
|
|
58
|
-
:M => 131.1926,
|
|
59
|
-
:F => 147.1766,
|
|
60
|
-
:P => 97.1167,
|
|
61
|
-
:S => 87.0782,
|
|
62
|
-
:T => 101.1051,
|
|
63
|
-
:W => 186.2132,
|
|
64
|
-
:Y => 163.1760,
|
|
65
|
-
:V => 99.1326,
|
|
66
|
-
|
|
67
|
-
:h => 1.00794,
|
|
68
|
-
:h_plus => 1.00739,
|
|
69
|
-
:o => 15.9994,
|
|
70
|
-
:h2o => 18.01524,
|
|
71
|
-
}
|
|
72
|
-
end
|
|
73
17
|
|
|
74
18
|
module SpecID ; end
|
|
75
19
|
|
|
@@ -91,8 +35,14 @@ module SpecID
|
|
|
91
35
|
# Will return a SpecID object (really, the object corresponding to the
|
|
92
36
|
# file type which mixes in SpecID [is_a?(SpecID) == true])
|
|
93
37
|
# If no file is given, will return a GenericSpecID object.
|
|
38
|
+
# If file is an array, this is assumed to be a group of srf files which is
|
|
39
|
+
# converted into an SRFGroup Object and run.
|
|
94
40
|
def self.new(file=nil, tp=nil)
|
|
95
|
-
|
|
41
|
+
# this will need to be specialized for other groups later
|
|
42
|
+
if file.is_a?(Array)
|
|
43
|
+
# takes an array of srf filenames
|
|
44
|
+
SRFGroup.new(file)
|
|
45
|
+
elsif file
|
|
96
46
|
from_file(file, tp)
|
|
97
47
|
else
|
|
98
48
|
GenericSpecID.new
|
|
@@ -100,22 +50,27 @@ module SpecID
|
|
|
100
50
|
end
|
|
101
51
|
|
|
102
52
|
# tp = file_type
|
|
103
|
-
#
|
|
53
|
+
# a single srf file will be packaged into an SRFGroup object
|
|
104
54
|
def self.from_file(file, tp=nil)
|
|
105
55
|
obj = nil
|
|
106
56
|
unless tp
|
|
107
57
|
tp = file_type(file)
|
|
108
58
|
end
|
|
109
59
|
obj = case tp
|
|
60
|
+
when 'srf'
|
|
61
|
+
#@hi_prob_best = false
|
|
62
|
+
SRFGroup.new([file])
|
|
110
63
|
when 'srg'
|
|
111
|
-
|
|
64
|
+
#@hi_prob_best = false
|
|
112
65
|
SRFGroup.new(file)
|
|
113
66
|
when 'bioworks'
|
|
114
|
-
|
|
67
|
+
#@hi_prob_best = false
|
|
115
68
|
Bioworks.new(file)
|
|
116
69
|
when 'protproph'
|
|
117
|
-
|
|
70
|
+
#@hi_prob_best = true
|
|
118
71
|
Proph::ProtSummary.new(file)
|
|
72
|
+
when 'pepproph'
|
|
73
|
+
Proph::PepSummary.new(file)
|
|
119
74
|
else
|
|
120
75
|
abort "UNRECOGNIZED file type for #{file}"
|
|
121
76
|
end
|
|
@@ -123,9 +78,76 @@ module SpecID
|
|
|
123
78
|
end
|
|
124
79
|
|
|
125
80
|
def inspect
|
|
126
|
-
|
|
81
|
+
peps_string =
|
|
82
|
+
if peps
|
|
83
|
+
"peps(#)=#{peps.size}"
|
|
84
|
+
else
|
|
85
|
+
"peps=(nil)"
|
|
86
|
+
end
|
|
87
|
+
"<#{self.class} #{peps_string}>"
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# given some list of SpecID::Pep based objects, returns the list of proteins
|
|
91
|
+
# associated with those peptides
|
|
92
|
+
# kind must be a symbol:
|
|
93
|
+
# :no_update (current proteins are returned, but their peps attribute
|
|
94
|
+
# is not updated)
|
|
95
|
+
# :update (current proteins returned with peps attribute updated)
|
|
96
|
+
# :new (new proteins are created complete with peps attribute)
|
|
97
|
+
def self.protein_list(pephits, kind=:no_update)
|
|
98
|
+
|
|
99
|
+
orig_pephits_prts = []
|
|
100
|
+
if kind == :new
|
|
101
|
+
new_prots = {}
|
|
102
|
+
pephits.each_with_index do |pep,i|
|
|
103
|
+
orig_pephits_prts[i] = pep.prots
|
|
104
|
+
peps_new_prts = pep.prots.map do |prt|
|
|
105
|
+
if new_prots.key? prt.reference
|
|
106
|
+
already_exists = new_prots[prt.reference]
|
|
107
|
+
else
|
|
108
|
+
np = prt.dup
|
|
109
|
+
np.peps = []
|
|
110
|
+
new_prots[np.reference] = np
|
|
111
|
+
np
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
pep.prots = peps_new_prts
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
if kind == :update
|
|
119
|
+
pephits.each do |pep|
|
|
120
|
+
pep.prots.each do |prt|
|
|
121
|
+
prt.peps = []
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
prot_set = {}
|
|
127
|
+
pephits.each do |pep|
|
|
128
|
+
prts = pep.prots
|
|
129
|
+
prts.each do |prt|
|
|
130
|
+
prot_set[ prt.reference ] = prt
|
|
131
|
+
end
|
|
132
|
+
if (kind == :update || kind == :new)
|
|
133
|
+
prts.each do |prt|
|
|
134
|
+
prt.peps << pep
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
## Reset the original protein hits
|
|
140
|
+
if kind == :new
|
|
141
|
+
pephits.each_with_index do |pep,i|
|
|
142
|
+
pep.prots = orig_pephits_prts[i]
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
prot_set.values
|
|
127
147
|
end
|
|
128
148
|
|
|
149
|
+
|
|
150
|
+
|
|
129
151
|
# takes a comma separated list or array and extends the last to create an
|
|
130
152
|
# array of desired size
|
|
131
153
|
def self.extend_args(arg, desired_size)
|
|
@@ -193,13 +215,6 @@ module SpecID
|
|
|
193
215
|
prot_triplets
|
|
194
216
|
end
|
|
195
217
|
|
|
196
|
-
|
|
197
|
-
## basically, this is the command line wrapper
|
|
198
|
-
def self.precision(argv)
|
|
199
|
-
Prec.new.run_cmd_line(argv)
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
|
|
203
218
|
# returns number of true positives (array) and the specified output (as
|
|
204
219
|
# parallel array). Requires the classification method and a sorted array of
|
|
205
220
|
# tp values and an array fp values.
|
|
@@ -223,55 +238,100 @@ module SpecID
|
|
|
223
238
|
pps
|
|
224
239
|
end
|
|
225
240
|
|
|
226
|
-
def
|
|
227
|
-
|
|
228
|
-
|
|
241
|
+
def self.prots?(ar)
|
|
242
|
+
ar.first.is_a? SpecID::Prot
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
def self.peps?(ar)
|
|
246
|
+
ar.first.is_a? SpecID::Pep
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
# for older stuff
|
|
250
|
+
def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
|
|
251
|
+
objects =
|
|
252
|
+
case items
|
|
253
|
+
when :prots
|
|
254
|
+
prots
|
|
255
|
+
when :peps
|
|
256
|
+
peps
|
|
257
|
+
end
|
|
258
|
+
SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# includes the peptide hit in both
|
|
262
|
+
# returns (target, decoy)
|
|
263
|
+
# (for peps) ties can be :both, true (target wins), false (decoy wins)
|
|
264
|
+
# regardless of ties behavior, will partition out the proteins to be
|
|
265
|
+
# appropriate for the peptide
|
|
266
|
+
def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
|
|
267
|
+
if items.size == 0
|
|
268
|
+
return [[],[]]
|
|
269
|
+
elsif prots?(items)
|
|
229
270
|
myproc = proc { |prt|
|
|
230
|
-
if prt.reference =~ regex ; !
|
|
231
|
-
else ;
|
|
271
|
+
if prt.reference =~ regex ; !decoy_on_match
|
|
272
|
+
else ; decoy_on_match end
|
|
232
273
|
}
|
|
233
274
|
return classify(items, myproc)
|
|
234
|
-
|
|
275
|
+
elsif peps?(items)
|
|
235
276
|
match = [] ; nomatch = []
|
|
236
|
-
|
|
237
|
-
match_prots =
|
|
238
|
-
(hit, nohit) = pep.prots.partition do |prot|
|
|
277
|
+
items.each do |pep|
|
|
278
|
+
(match_prots, nomatch_prots) = pep.prots.partition do |prot|
|
|
239
279
|
prot.reference =~ regex
|
|
240
280
|
end
|
|
241
|
-
if
|
|
281
|
+
if match_prots.size == 0
|
|
242
282
|
nomatch << pep
|
|
243
|
-
elsif
|
|
283
|
+
elsif nomatch_prots.size == 0
|
|
244
284
|
match << pep
|
|
245
285
|
else ## both have hits
|
|
246
286
|
pep.prots = match_prots
|
|
247
287
|
nomatch_pep = pep.dup
|
|
248
288
|
nomatch_pep.prots = nomatch_prots
|
|
249
|
-
|
|
250
|
-
|
|
289
|
+
|
|
290
|
+
# resolve ties
|
|
291
|
+
case ties
|
|
292
|
+
when true
|
|
293
|
+
if decoy_on_match
|
|
294
|
+
nomatch << pep
|
|
295
|
+
else
|
|
296
|
+
match << pep
|
|
297
|
+
end
|
|
298
|
+
when false
|
|
299
|
+
if decoy_on_match
|
|
300
|
+
match << pep
|
|
301
|
+
else
|
|
302
|
+
nomatch << pep
|
|
303
|
+
end
|
|
304
|
+
when :both
|
|
305
|
+
match << pep
|
|
306
|
+
nomatch << pep
|
|
307
|
+
else ; raise ArgumentError
|
|
308
|
+
end
|
|
251
309
|
end
|
|
252
310
|
end
|
|
253
|
-
if
|
|
311
|
+
if decoy_on_match
|
|
254
312
|
return [nomatch , match]
|
|
255
313
|
else
|
|
256
314
|
return [match, nomatch]
|
|
257
315
|
end
|
|
258
316
|
else
|
|
259
|
-
|
|
317
|
+
raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
|
|
260
318
|
end
|
|
261
319
|
end
|
|
262
320
|
|
|
321
|
+
|
|
322
|
+
|
|
263
323
|
# returns [tp, fp] based on the protein prefix for items where items =
|
|
264
324
|
# (:prots|:peps)
|
|
265
325
|
# this may result in a duplication of some peptides if they match both
|
|
266
326
|
# normal and decoy proteins. In this case, the protein arrays are split,
|
|
267
327
|
# too, so that each points only to its breed of protein.
|
|
268
|
-
def
|
|
328
|
+
def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
|
|
269
329
|
if prefix
|
|
270
330
|
regex = /^#{Regexp.escape(flag)}/
|
|
271
331
|
else
|
|
272
332
|
regex = /#{Regexp.escape(flag)}/
|
|
273
333
|
end
|
|
274
|
-
classify_by_regex(items, regex,
|
|
334
|
+
classify_by_regex(items, regex, decoy_on_match)
|
|
275
335
|
end
|
|
276
336
|
|
|
277
337
|
# Returns (match, nomatch)
|
|
@@ -303,7 +363,7 @@ module SpecID
|
|
|
303
363
|
classify_item_by.call(item) ]
|
|
304
364
|
end
|
|
305
365
|
roc = ROC.new
|
|
306
|
-
tp, fp = roc.
|
|
366
|
+
tp, fp = roc.doublets_to_separate(doublets)
|
|
307
367
|
return tp, fp
|
|
308
368
|
end
|
|
309
369
|
|
|
@@ -393,11 +453,13 @@ module SpecID
|
|
|
393
453
|
end
|
|
394
454
|
File.open(file) do |fh|
|
|
395
455
|
lines = ""
|
|
396
|
-
|
|
456
|
+
8.times { lines << fh.readline }
|
|
397
457
|
if lines =~ /<bioworksinfo>/
|
|
398
458
|
return 'bioworks'
|
|
399
|
-
elsif lines =~ /<protein_summary/
|
|
459
|
+
elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
|
|
400
460
|
return 'protproph'
|
|
461
|
+
elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
|
|
462
|
+
return 'pepproph'
|
|
401
463
|
end
|
|
402
464
|
end
|
|
403
465
|
end
|
|
@@ -521,9 +583,10 @@ module SpecID
|
|
|
521
583
|
end
|
|
522
584
|
end
|
|
523
585
|
|
|
524
|
-
|
|
525
586
|
# A Generic spectraID protein
|
|
526
587
|
module SpecID::Prot
|
|
588
|
+
include ProteinReferenceable
|
|
589
|
+
|
|
527
590
|
# probability is always a float!
|
|
528
591
|
attr_accessor :probability, :reference, :peps
|
|
529
592
|
|
|
@@ -531,6 +594,14 @@ module SpecID::Prot
|
|
|
531
594
|
self.reference <=> other.reference
|
|
532
595
|
end
|
|
533
596
|
|
|
597
|
+
def inspect
|
|
598
|
+
pep_string =
|
|
599
|
+
if peps
|
|
600
|
+
", @peps(#)=#{peps.size}"
|
|
601
|
+
end
|
|
602
|
+
"<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
|
|
603
|
+
end
|
|
604
|
+
|
|
534
605
|
end
|
|
535
606
|
|
|
536
607
|
module SpecID::Pep
|
|
@@ -653,6 +724,23 @@ module SpecID::Pep
|
|
|
653
724
|
when :mmu
|
|
654
725
|
end
|
|
655
726
|
end
|
|
727
|
+
|
|
728
|
+
# calls the method associated with each key and returns the value
|
|
729
|
+
def values_at(*args)
|
|
730
|
+
args.map do |arg|
|
|
731
|
+
send(arg)
|
|
732
|
+
end
|
|
733
|
+
end
|
|
734
|
+
|
|
735
|
+
def inspect
|
|
736
|
+
|
|
737
|
+
prot_string =
|
|
738
|
+
if prots
|
|
739
|
+
", @prots(#)=#{prots.size}"
|
|
740
|
+
end
|
|
741
|
+
"<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
|
|
742
|
+
end
|
|
743
|
+
|
|
656
744
|
end
|
|
657
745
|
|
|
658
746
|
class SpecID::GenericProt
|
data/lib/spec_id_xml.rb
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# concatenation into a file
|
|
7
7
|
module SpecIDXML
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
MSial_chrs_hash = {
|
|
10
10
|
'"' => '&quot;',
|
|
11
11
|
'&' => '&amp;',
|
|
12
12
|
"'" => '&apos;',
|
|
@@ -17,8 +17,8 @@ module SpecIDXML
|
|
|
17
17
|
# substitutes special xml chars
|
|
18
18
|
def escape_special_chars(string)
|
|
19
19
|
string.split('').map do |char|
|
|
20
|
-
if
|
|
21
|
-
# if x =
|
|
20
|
+
if MSial_chrs_hash.key? char ; MSial_chrs_hash[char]
|
|
21
|
+
# if x = MSial_chrs_hash[char] ; x # <-- that's slightly slower
|
|
22
22
|
else ; char end
|
|
23
23
|
end.join
|
|
24
24
|
end
|
|
@@ -33,13 +33,13 @@ module SpecIDXML
|
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def param_xml(symbol)
|
|
37
|
-
tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
|
|
36
|
+
def param_xml(obj, symbol)
|
|
37
|
+
tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{obj.send(symbol)}" + '"/>'
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
def params_xml(*symbol_list)
|
|
40
|
+
def params_xml(obj, *symbol_list)
|
|
41
41
|
symbol_list.collect { |sy|
|
|
42
|
-
param_xml(sy)
|
|
42
|
+
param_xml(obj, sy)
|
|
43
43
|
}.join("\n") + "\n"
|
|
44
44
|
end
|
|
45
45
|
|
|
@@ -92,9 +92,7 @@ module SpecIDXML
|
|
|
92
92
|
end
|
|
93
93
|
|
|
94
94
|
def attrs_xml(list_of_symbols)
|
|
95
|
-
list_of_symbols.collect {|sy|
|
|
96
|
-
attr_xml(sy)
|
|
97
|
-
}.join(" ")
|
|
95
|
+
list_of_symbols.collect {|sy| attr_xml(sy) }.join(" ")
|
|
98
96
|
end
|
|
99
97
|
|
|
100
98
|
end
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
require 'transmem'
|
|
2
|
+
|
|
3
|
+
class Phobius ; end
|
|
4
|
+
|
|
5
|
+
# This class will probably change its interface some in the future
|
|
6
|
+
# That's the web portal
|
|
7
|
+
# http://phobius.cgb.ki.se/
|
|
8
|
+
# How to run:
|
|
9
|
+
# Select output format as 'Short'
|
|
10
|
+
# then hit 'Submit Query'
|
|
11
|
+
|
|
12
|
+
# note: to implement some of the TransmemIndex features, the update_aaseq
|
|
13
|
+
# method must be called!
|
|
14
|
+
class Phobius::Index < Hash
|
|
15
|
+
include TransmemIndex
|
|
16
|
+
|
|
17
|
+
# will update_aaseq if given a fasta_obj
|
|
18
|
+
def initialize(file, fasta_obj = nil )
|
|
19
|
+
Phobius.default_index(file, self)
|
|
20
|
+
if fasta_obj
|
|
21
|
+
update_aaseq(fasta_obj)
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# we need to match whatever function toppred uses to generate identifiers if
|
|
26
|
+
# we want derivative processes to be fast and accurate
|
|
27
|
+
def reference_to_key(reference)
|
|
28
|
+
if reference
|
|
29
|
+
if reference.size > 0
|
|
30
|
+
index = reference.index(' ')
|
|
31
|
+
string =
|
|
32
|
+
if index
|
|
33
|
+
reference[0...index]
|
|
34
|
+
else
|
|
35
|
+
reference
|
|
36
|
+
end
|
|
37
|
+
string.gsub('"','')
|
|
38
|
+
else
|
|
39
|
+
''
|
|
40
|
+
end
|
|
41
|
+
else
|
|
42
|
+
nil
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# adds an :aaseq key to each hash (necessary for avg_overlap method)
|
|
47
|
+
# these are shallow references to the aaseq in the fasta obj
|
|
48
|
+
def update_aaseq(fasta)
|
|
49
|
+
fasta.each do |prot|
|
|
50
|
+
self[reference_to_key(prot.reference)][:aaseq] = prot.aaseq
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
class Phobius
|
|
57
|
+
include TransmemIndex
|
|
58
|
+
|
|
59
|
+
# returns the default index
|
|
60
|
+
def self.default_index(file, index={})
|
|
61
|
+
parser = Phobius::Parser.new(:short)
|
|
62
|
+
parser.file_to_index(file, index)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
module Phobius::Parser
|
|
68
|
+
|
|
69
|
+
def self.new(parser_type=:short)
|
|
70
|
+
klass =
|
|
71
|
+
case parser_type
|
|
72
|
+
when :short
|
|
73
|
+
Phobius::ParserShort
|
|
74
|
+
else
|
|
75
|
+
raise ArgumentError, "don't recognize parser type: #{parser_type}"
|
|
76
|
+
end
|
|
77
|
+
klass.new
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def file_to_index(file, index={})
|
|
81
|
+
File.open(file) {|fh| to_index(fh, index) }
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Phobius::ParserShort
|
|
88
|
+
include Phobius::Parser
|
|
89
|
+
|
|
90
|
+
# takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
|
|
91
|
+
# and returns an array of hashes with the keys :start and :stop
|
|
92
|
+
def prediction_to_array(string)
|
|
93
|
+
segments = []
|
|
94
|
+
string.scan(/[io](\d+)-(\d+)/) do |m1, m2|
|
|
95
|
+
segments << { :start => m1.to_i, :stop => m2.to_i }
|
|
96
|
+
end
|
|
97
|
+
segments
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# returns a hash structure in this form: { identifier => {
|
|
101
|
+
# :num_certain_transmembrane_segments => Int,
|
|
102
|
+
# :transmembrane_segments => [:start => Int, :stop
|
|
103
|
+
# => Int] }
|
|
104
|
+
# can parse io even if there is no header to key in on.
|
|
105
|
+
def to_index(io, index={})
|
|
106
|
+
init_pos = io.pos
|
|
107
|
+
cnt = 0
|
|
108
|
+
found_header = false
|
|
109
|
+
loop do
|
|
110
|
+
if io.gets =~ /SEQENCE/
|
|
111
|
+
found_header = true
|
|
112
|
+
break
|
|
113
|
+
end
|
|
114
|
+
cnt += 1
|
|
115
|
+
break if cnt > 10
|
|
116
|
+
end
|
|
117
|
+
if !found_header
|
|
118
|
+
io.pos = init_pos
|
|
119
|
+
end
|
|
120
|
+
current_record = nil
|
|
121
|
+
io.each do |line|
|
|
122
|
+
line.chomp!
|
|
123
|
+
# grab values
|
|
124
|
+
ar = line.split(/\s+/)
|
|
125
|
+
next if ar.size != 4
|
|
126
|
+
(key, num_tms, signal_peptide, prediction) = ar
|
|
127
|
+
# cast the values
|
|
128
|
+
num_tms = num_tms.to_i
|
|
129
|
+
signal_peptide =
|
|
130
|
+
case signal_peptide
|
|
131
|
+
when 'Y'
|
|
132
|
+
true
|
|
133
|
+
when '0'
|
|
134
|
+
false
|
|
135
|
+
end
|
|
136
|
+
index[key] = {
|
|
137
|
+
:num_certain_transmembrane_segments => num_tms,
|
|
138
|
+
:signal_peptide => signal_peptide,
|
|
139
|
+
}
|
|
140
|
+
if num_tms > 0
|
|
141
|
+
index[key][:transmembrane_segments] = prediction_to_array(prediction)
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
index
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
end
|