mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/spec_id/proph.rb
CHANGED
|
@@ -1,468 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
require '
|
|
3
|
-
require '
|
|
4
|
-
require 'instance_var_set_from_hash'
|
|
5
|
-
require 'axml'
|
|
6
|
-
require 'spec_id'
|
|
2
|
+
#require 'spec_id/proph/prot_summary'
|
|
3
|
+
#require 'spec_id/proph/pep_summary'
|
|
7
4
|
|
|
8
|
-
|
|
9
|
-
module SpecID ; end
|
|
10
|
-
module SpecID::Prot ; end
|
|
11
|
-
module SpecID::Pep ; end
|
|
12
|
-
|
|
13
|
-
class Proph
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
################ --BEGIN
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class Parser
|
|
20
|
-
def root_el(file)
|
|
21
|
-
AXML.parse_file(file)
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class ProtSummary
|
|
27
|
-
include SpecID
|
|
28
|
-
|
|
29
|
-
attr_writer :prots
|
|
30
|
-
attr_accessor :prot_groups
|
|
31
|
-
|
|
32
|
-
def hi_prob_best ; true end
|
|
33
|
-
|
|
34
|
-
def initialize(file=nil)
|
|
35
|
-
@prots = nil
|
|
36
|
-
if file
|
|
37
|
-
@prot_groups = ProtSummary::Parser.new.parse_file(file)
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def prots
|
|
42
|
-
if @prots ; @prots
|
|
43
|
-
else
|
|
44
|
-
@prots = unique_prots(@prot_groups)
|
|
45
|
-
@prots
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
# returns a set of unique proteins
|
|
50
|
-
def unique_prots(prot_groups)
|
|
51
|
-
all_prots = []
|
|
52
|
-
prot_groups.each do |pg|
|
|
53
|
-
pg.prots.each do |prt|
|
|
54
|
-
all_prots << prt
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first }
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
class ProtSummary::Parser < Parser
|
|
63
|
-
attr_accessor :prot_groups
|
|
64
|
-
def initialize(file=nil, with_peps=false, tp='axml')
|
|
65
|
-
if file
|
|
66
|
-
@prot_groups = parse_file(file, with_peps, tp)
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
# returns an array of protein_groups
|
|
71
|
-
def parse_file(file, with_peps=false, tp='axml')
|
|
72
|
-
File.open(file) do |fh|
|
|
73
|
-
@prot_groups = _parse_for_prot_groups(fh, with_peps, tp)
|
|
74
|
-
end
|
|
75
|
-
@prot_groups
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
# returns an array of ProtGroup objects
|
|
79
|
-
def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
|
|
80
|
-
prtgrps = []
|
|
81
|
-
case tp
|
|
82
|
-
when 'axml'
|
|
83
|
-
root = AXML.parse(stream)
|
|
84
|
-
root.protein_group.each do |protein_group|
|
|
85
|
-
pg = ProtGroup.new(protein_group.attrs) do
|
|
86
|
-
protein_group.map do |protein|
|
|
87
|
-
Prot.new(protein.attrs)
|
|
88
|
-
end
|
|
89
|
-
end
|
|
90
|
-
prtgrps << pg
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
prtgrps
|
|
94
|
-
end
|
|
95
|
-
end # ProtSummary::Parser
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
class ProtGroup
|
|
99
|
-
attr_accessor :group_number, :probability, :prots
|
|
100
|
-
def initialize(args=nil)
|
|
101
|
-
@prots = []
|
|
102
|
-
if args
|
|
103
|
-
instance_var_set_from_hash(args)
|
|
104
|
-
end
|
|
105
|
-
if block_given?
|
|
106
|
-
@prots = yield
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
class Prot
|
|
112
|
-
include SpecID::Prot
|
|
113
|
-
|
|
114
|
-
## probability and reference accessors are inherited
|
|
115
|
-
attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description
|
|
116
|
-
|
|
117
|
-
# returns protein_name
|
|
118
|
-
def name ; @protein_name end
|
|
119
|
-
def reference ; @protein_name end
|
|
120
|
-
|
|
121
|
-
def initialize(args)
|
|
122
|
-
self.instance_var_set_from_hash(args)
|
|
123
|
-
if @probability ; @probability = @probability.to_f end
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
# def self.uniq_prots_with_prob_and_reference(file)
|
|
127
|
-
# root = Parser.root_el(file)
|
|
128
|
-
# prots = []
|
|
129
|
-
# root.protein_group.each do |group|
|
|
130
|
-
# group.protein.each do |prt|
|
|
131
|
-
# #prots << prt
|
|
132
|
-
# prots <<
|
|
133
|
-
# end
|
|
134
|
-
# end
|
|
135
|
-
#
|
|
136
|
-
# un_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
|
137
|
-
#
|
|
138
|
-
# end
|
|
139
|
-
|
|
140
|
-
def to_s
|
|
141
|
-
'<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
end # class Prot
|
|
145
|
-
|
|
146
|
-
class Pep
|
|
147
|
-
include SpecID::Pep
|
|
148
|
-
|
|
149
|
-
attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
|
|
150
|
-
attr_writer :arithmetic_avg_scan_by_parent_time
|
|
151
|
-
|
|
152
|
-
def initialize(args=nil)
|
|
153
|
-
if args
|
|
154
|
-
@sequence = args[:sequence]
|
|
155
|
-
@probability = args[:probability] ## nsp prob
|
|
156
|
-
@filenames = args[:filenames]
|
|
157
|
-
@charge = args[:charge]
|
|
158
|
-
@nsp_cutoff = args[:nsp_cutoff]
|
|
159
|
-
if args.key?(:scans)
|
|
160
|
-
@scans = args[:scans]
|
|
161
|
-
else
|
|
162
|
-
@scans = [] ## this is set later if needed
|
|
163
|
-
end
|
|
164
|
-
else
|
|
165
|
-
@scans = []
|
|
166
|
-
end
|
|
167
|
-
end
|
|
168
|
-
|
|
169
|
-
# filter peptides based on the number of scans
|
|
170
|
-
# if a peptide has more than max_dups scans, the peptide is tossed
|
|
171
|
-
# note that multiple scans that were used as a single dtafile scan
|
|
172
|
-
# will be counted as a single scan for these purposes!
|
|
173
|
-
# (easy, since they are stored as a single item in the array of scans)
|
|
174
|
-
def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
|
|
175
|
-
if max_dups
|
|
176
|
-
new_peps = []
|
|
177
|
-
peps.each do |pep|
|
|
178
|
-
unless pep.scans.size > max_dups
|
|
179
|
-
new_peps << pep
|
|
180
|
-
end
|
|
181
|
-
end
|
|
182
|
-
new_peps
|
|
183
|
-
else
|
|
184
|
-
peps.dup
|
|
185
|
-
end
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
## from the list of scans, creates a scan object whose time is the
|
|
189
|
-
## arithmetic mean of the parent scans (based on prec_inten) and whose
|
|
190
|
-
## prec_mz is the avg of all prec_mz's. num is nil, charge is the first
|
|
191
|
-
def arithmetic_avg_scan_by_parent_time
|
|
192
|
-
unless @arithmetic_avg_scan_by_parent_time
|
|
193
|
-
flat_scans = @scans.flatten
|
|
194
|
-
|
|
195
|
-
# new_prec_mz
|
|
196
|
-
prec_mz_sum = 0.0
|
|
197
|
-
prec_inten_sum = 0.0
|
|
198
|
-
times = []
|
|
199
|
-
intens = []
|
|
200
|
-
tot_inten = 0.0
|
|
201
|
-
flat_scans.each do |c|
|
|
202
|
-
prec_inten = c.prec_inten
|
|
203
|
-
prec_inten_sum += prec_inten
|
|
204
|
-
prec_mz_sum += c.prec_mz
|
|
205
|
-
tot_inten += prec_inten
|
|
206
|
-
times << c.parent.time
|
|
207
|
-
intens << prec_inten
|
|
208
|
-
end
|
|
209
|
-
new_prec_mz = prec_mz_sum / flat_scans.size
|
|
210
|
-
new_prec_inten = prec_inten_sum / flat_scans.size
|
|
211
|
-
|
|
212
|
-
fraction_inten = []
|
|
213
|
-
intens.each do |inten|
|
|
214
|
-
fraction_inten.push( inten/tot_inten )
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
new_time = 0.0
|
|
218
|
-
(0...times.size).each do |i|
|
|
219
|
-
new_time += times[i] * fraction_inten[i]
|
|
220
|
-
end
|
|
221
|
-
|
|
222
|
-
@arithmetic_avg_scan_by_parent_time = Spec::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
|
|
223
|
-
|
|
224
|
-
end
|
|
225
|
-
@arithmetic_avg_scan_by_parent_time
|
|
226
|
-
end
|
|
227
|
-
|
|
228
|
-
def to_s
|
|
229
|
-
'<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
|
|
230
|
-
end
|
|
231
|
-
|
|
232
|
-
def has_dta?(dta_filename)
|
|
233
|
-
if @filenames
|
|
234
|
-
@filenames.each do |fn|
|
|
235
|
-
if dta_filename == fn
|
|
236
|
-
return true
|
|
237
|
-
end
|
|
238
|
-
end
|
|
239
|
-
end
|
|
240
|
-
return false
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
# Given a list of peptides, returns only those unique based on
|
|
244
|
-
# sequence/charge
|
|
245
|
-
def self.uniq_by_seqcharge(peptides)
|
|
246
|
-
# @TODO: this could be done with one fewer traversals, but it is beautiful
|
|
247
|
-
peptides.hash_by(:sequence, :charge).collect do |k,v|
|
|
248
|
-
v.first
|
|
249
|
-
end
|
|
250
|
-
end
|
|
251
|
-
|
|
252
|
-
end # class Pep
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
# Class for parsing the peptide prophet output files in various ways
|
|
256
|
-
class Pep::Parser < Parser
|
|
257
|
-
|
|
258
|
-
# parse_type = "rexml" | "regex"
|
|
259
|
-
# regex's are about 50 times faster but are not guaranteed to work
|
|
260
|
-
# seq charge hash is keyed on an array -> [sequence,charge]
|
|
261
|
-
# @TODO: implement parsing on this with xmlparser
|
|
262
|
-
def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
|
|
263
|
-
seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
|
|
264
|
-
case parse_type
|
|
265
|
-
when "rexml"
|
|
266
|
-
#puts "READING: " + pep_xml_file + " ..."
|
|
267
|
-
doc = REXML::Document.new File.new(pep_xml_file)
|
|
268
|
-
|
|
269
|
-
## Create a hash of peptides based on sequence_charge (takes an array)
|
|
270
|
-
doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
|
|
271
|
-
pep_charge = result.attributes['assumed_charge']
|
|
272
|
-
filename = result.attributes['spectrum']
|
|
273
|
-
result.elements.to_a('search_hit').each do |hit|
|
|
274
|
-
pep_seq = hit.attributes['peptide']
|
|
275
|
-
seq_charge = [pep_seq, pep_charge]
|
|
276
|
-
seq_charge_hash[seq_charge] << filename
|
|
277
|
-
end
|
|
278
|
-
end
|
|
279
|
-
seq_charge_hash
|
|
280
|
-
when "regex"
|
|
281
|
-
#puts "READING: " + pep_xml_file + " ..."
|
|
282
|
-
## Create a hash of peptides based on sequence_charge (takes an array)
|
|
283
|
-
|
|
284
|
-
## file from peptideAtlas:
|
|
285
|
-
search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
|
286
|
-
search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
|
287
|
-
search_hit_regex = /<search_hit .*peptide="(\w+)" /o
|
|
288
|
-
|
|
289
|
-
peptide_h = {}
|
|
290
|
-
filename = nil
|
|
291
|
-
pep_charge = nil
|
|
292
|
-
File.open(pep_xml_file).each do |line|
|
|
293
|
-
if line =~ search_result_regex1
|
|
294
|
-
filename = $1.dup
|
|
295
|
-
pep_charge = $2.dup
|
|
296
|
-
elsif line =~ search_result_regex2
|
|
297
|
-
filename = $1.dup
|
|
298
|
-
pep_charge = $2.dup
|
|
299
|
-
end
|
|
300
|
-
if line =~ search_hit_regex
|
|
301
|
-
pep_seq = $1.dup
|
|
302
|
-
seq_charge = [pep_seq, pep_charge]
|
|
303
|
-
seq_charge_hash[seq_charge] << filename
|
|
304
|
-
end
|
|
305
|
-
end
|
|
306
|
-
end
|
|
307
|
-
seq_charge_hash
|
|
308
|
-
end
|
|
309
|
-
|
|
310
|
-
# drops all search_hits that have peptideprophet probability < min_val
|
|
311
|
-
# and drops any search_results that end up with 0 search_hits
|
|
312
|
-
def filter_by_min_pep_prob(file, outfile, min_val)
|
|
313
|
-
root = root_el(file)
|
|
314
|
-
|
|
315
|
-
d_search_hit = nil
|
|
316
|
-
d_search_result = nil
|
|
317
|
-
root.children.each do |child1|
|
|
318
|
-
if child1.name == 'msms_run_summary'
|
|
319
|
-
d_search_result = []
|
|
320
|
-
child1.children.each do |child2|
|
|
321
|
-
if child2.name == 'search_result'
|
|
322
|
-
#puts "size before: " + child2.size.to_s
|
|
323
|
-
d_search_hit = []
|
|
324
|
-
child2.children.each do |child3|
|
|
325
|
-
if child3.name == 'search_hit'
|
|
326
|
-
child3.children.each do |child4|
|
|
327
|
-
if child4.name == 'peptideprophet_result'
|
|
328
|
-
if child4.attrs["probability"].to_f < min_val
|
|
329
|
-
#puts "dropping probability: #{child4.attrs["probability"]}"
|
|
330
|
-
d_search_hit << child3
|
|
331
|
-
else
|
|
332
|
-
#puts "keeping probability: #{child4.attrs["probability"]}"
|
|
333
|
-
end
|
|
334
|
-
end
|
|
335
|
-
end
|
|
336
|
-
end
|
|
337
|
-
end
|
|
338
|
-
d_search_hit.each do |to_drop|
|
|
339
|
-
to_drop.drop
|
|
340
|
-
end
|
|
341
|
-
#puts "size after: " + child2.size.to_s
|
|
342
|
-
if child2.size == 0
|
|
343
|
-
d_search_result << child2
|
|
344
|
-
end
|
|
345
|
-
end
|
|
346
|
-
end
|
|
347
|
-
d_search_result.each do |to_drop|
|
|
348
|
-
to_drop.drop
|
|
349
|
-
end
|
|
350
|
-
end
|
|
351
|
-
end
|
|
352
|
-
|
|
353
|
-
File.open(outfile, "w") do |fh|
|
|
354
|
-
fh.print root.to_s
|
|
355
|
-
end
|
|
356
|
-
end
|
|
357
|
-
end # Pep::Parser
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
# Class for parsing the '*-prot.xml' files in different ways
|
|
361
|
-
class Prot::Parser < Parser
|
|
362
|
-
|
|
363
|
-
attr_accessor :prots
|
|
364
|
-
attr_writer :peps
|
|
365
|
-
|
|
366
|
-
def initialize
|
|
367
|
-
@prots = []
|
|
368
|
-
end
|
|
369
|
-
|
|
370
|
-
# returns all the peptides from prots
|
|
371
|
-
def peps
|
|
372
|
-
unless @peps
|
|
373
|
-
@peps = []
|
|
374
|
-
@prots.each do |prot|
|
|
375
|
-
@peps.push(*(prot.peps))
|
|
376
|
-
end
|
|
377
|
-
end
|
|
378
|
-
@peps
|
|
379
|
-
end
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
# sets and returns an array of Prot objects
|
|
383
|
-
# parse_type = "rexml" | "regex"
|
|
384
|
-
def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
|
|
385
|
-
## ensure these are all floats
|
|
386
|
-
(prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
|
|
387
|
-
cutoff.to_f
|
|
388
|
-
end
|
|
389
|
-
|
|
390
|
-
case parse_type
|
|
391
|
-
when "rexml"
|
|
392
|
-
doc = REXML::Document.new File.new(protxmlfile)
|
|
393
|
-
doc.elements.each("protein_summary/protein_group/protein") do |elem|
|
|
394
|
-
if elem.attributes['probability'].to_f >= prot_prob_cutoff
|
|
395
|
-
prob = elem.attributes['probability'].to_f
|
|
396
|
-
name= elem.attributes['protein_name']
|
|
397
|
-
curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
|
|
398
|
-
peptides = []
|
|
399
|
-
elem.elements.to_a('peptide').each do |pep|
|
|
400
|
-
if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
|
|
401
|
-
nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
|
|
402
|
-
sequence = pep.attributes['peptide_sequence']
|
|
403
|
-
charge = pep.attributes['charge']
|
|
404
|
-
pnm = pep.attributes['precursor_neutral_mass']
|
|
405
|
-
peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
|
|
406
|
-
end
|
|
407
|
-
## Only take proteins with peptides!
|
|
408
|
-
if peptides.size > 0
|
|
409
|
-
curr_prot.peps = peptides
|
|
410
|
-
@prots << curr_prot
|
|
411
|
-
end
|
|
412
|
-
end
|
|
413
|
-
end
|
|
414
|
-
end
|
|
415
|
-
when "regex"
|
|
416
|
-
prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
|
|
417
|
-
prot_prob_regex = /probability="([\d\.]+)"/o
|
|
418
|
-
pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
|
|
419
|
-
pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
|
|
420
|
-
|
|
421
|
-
curr_prot = nil
|
|
422
|
-
peptides = []
|
|
423
|
-
File.open(protxmlfile).each do |line|
|
|
424
|
-
if line =~ prot_regex
|
|
425
|
-
prob = nil
|
|
426
|
-
name = $1.dup
|
|
427
|
-
rest = $2
|
|
428
|
-
if rest =~ prot_prob_regex
|
|
429
|
-
prob = $1.dup
|
|
430
|
-
end
|
|
431
|
-
if curr_prot
|
|
432
|
-
if curr_prot.probability.to_f >= prot_prob_cutoff
|
|
433
|
-
if peptides.size > 0
|
|
434
|
-
curr_prot.peps = peptides
|
|
435
|
-
@prots.push(curr_prot)
|
|
436
|
-
end
|
|
437
|
-
end
|
|
438
|
-
end
|
|
439
|
-
curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
|
|
440
|
-
peptides = []
|
|
441
|
-
end
|
|
442
|
-
if line =~ pep_regex
|
|
443
|
-
sequence = $1.dup
|
|
444
|
-
rest = $2
|
|
445
|
-
if rest =~ pep_else_regex
|
|
446
|
-
charge = $1
|
|
447
|
-
init_prob = $2
|
|
448
|
-
nsp_prob = $3
|
|
449
|
-
if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
|
|
450
|
-
peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
|
|
451
|
-
end
|
|
452
|
-
end
|
|
453
|
-
end
|
|
454
|
-
# get the last one:
|
|
455
|
-
if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
|
|
456
|
-
curr_prot.peps = peptides
|
|
457
|
-
@prots.push(curr_prot)
|
|
458
|
-
end
|
|
459
|
-
end
|
|
460
|
-
end
|
|
461
|
-
@prots
|
|
462
|
-
end
|
|
463
|
-
|
|
464
|
-
end # Prot::Parser
|
|
465
|
-
|
|
466
|
-
################ --END
|
|
467
|
-
|
|
468
|
-
end # Proph
|
|
@@ -5,7 +5,7 @@ require 'hash_by'
|
|
|
5
5
|
require 'optparse'
|
|
6
6
|
require 'ostruct'
|
|
7
7
|
require 'spec_id'
|
|
8
|
-
require 'spec_id/precision'
|
|
8
|
+
#require 'spec_id/precision' # gone now
|
|
9
9
|
require 'gi'
|
|
10
10
|
|
|
11
11
|
#############################################################
|
|
@@ -428,7 +428,7 @@ class ProteinSummary
|
|
|
428
428
|
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
|
429
429
|
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
|
430
430
|
op.separator ""
|
|
431
|
-
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
|
431
|
+
op.separator "MSific to ProteinProphet (with no concatenated DB):"
|
|
432
432
|
op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
|
|
433
433
|
op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
|
|
434
434
|
op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v}
|