mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,484 @@
|
|
|
1
|
+
require 'rexml/document'
|
|
2
|
+
require 'hash_by'
|
|
3
|
+
require 'instance_var_set_from_hash'
|
|
4
|
+
require 'axml'
|
|
5
|
+
require 'spec_id'
|
|
6
|
+
require 'array_class'
|
|
7
|
+
|
|
8
|
+
require 'spec_id/parser/proph'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Make sure the SpecID namespace and its Prot / Pep mixin modules exist
# before the classes further down include them. If they are already defined
# (e.g. by the 'spec_id' require above) these lines simply reopen them.
module SpecID
end

module SpecID::Prot
end

module SpecID::Pep
end
|
|
14
|
+
|
|
15
|
+
module Proph
|
|
16
|
+
|
|
17
|
+
class ProtSummary
|
|
18
|
+
include SpecID
|
|
19
|
+
|
|
20
|
+
# if you get this match it's a protein prophet file and the version is the
|
|
21
|
+
# first match!
|
|
22
|
+
Filetype_and_version_re_old = /ProteinProphet_v([\.\d]+)\.dtd/ # gives 1.9 or what else?
|
|
23
|
+
Filetype_and_version_re_new = /protXML_v([\.\d]+)\.xsd/ # gives 4 right now
|
|
24
|
+
# inherits prots and peps
|
|
25
|
+
|
|
26
|
+
# the protein groups
|
|
27
|
+
attr_accessor :prot_groups
|
|
28
|
+
attr_accessor :version
|
|
29
|
+
|
|
30
|
+
# Constant flag: always answers true.
# NOTE(review): presumably tells generic SpecID code that a higher
# probability is the better score for this file type -- confirm with callers.
def hi_prob_best
  true
end
|
|
31
|
+
|
|
32
|
+
# Sniffs the version string out of the head of +file+.
#
# At most the first five lines are read; scanning stops at the first line
# that matches. On every line the newer protXML schema pattern
# (Filetype_and_version_re_new) is tried before the older ProteinProphet DTD
# pattern (Filetype_and_version_re_old).
#
# file:: path of the file to inspect
#
# Returns a copy of the first capture group of whichever pattern matched.
# Raises ArgumentError when none of the inspected lines matches.
def get_version(file)
  version = nil
  File.open(file) do |io|
    5.times do
      text = io.gets # nil once the file is exhausted; Regexp#match(nil) is nil
      hit = Filetype_and_version_re_new.match(text) ||
            Filetype_and_version_re_old.match(text)
      version = hit ? hit[1].dup : nil
      break if version
    end
  end
  return version if version
  raise(ArgumentError, "couldn't detect version in #{file}")
end
|
|
49
|
+
|
|
50
|
+
# Creates a protein summary, optionally populated from +file+.
#
# file:: optional path. When given, its version is detected with
#        get_version and the file is handed to SpecID::Parser::ProtProph,
#        with this object passed as the :spec_id option. When omitted,
#        nothing is read and only @prots is reset to nil.
def initialize(file=nil)
  @prots = nil
  return unless file

  @version = get_version(file)
  # (the older ProtSummary::Parser#parse_file route is not used here)
  prot_parser = SpecID::Parser::ProtProph.new(:spec_id)
  prot_parser.parse(file, :spec_id => self)
end
|
|
58
|
+
|
|
59
|
+
# returns a set of unique proteins
|
|
60
|
+
# Pools the proteins of every given group and keeps one per protein name.
#
# prot_groups:: collection of objects responding to +prots+; each protein
#               must respond to +protein_name+
#
# Returns an array with the first pooled protein of each distinct
# protein_name, as grouped by hash_by(:protein_name).
def unique_prots(prot_groups)
  pooled = []
  prot_groups.each do |group|
    group.prots.each { |prot| pooled.push(prot) }
  end
  by_name = pooled.hash_by(:protein_name)
  by_name.map { |_name, same_named| same_named.first }
end
|
|
69
|
+
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
class ProtSummary::Parser
|
|
73
|
+
attr_accessor :prot_groups
|
|
74
|
+
# Creates a parser and, when +file+ is given, parses it straight away.
#
# file::      optional path; nothing is parsed when it is nil/false
# with_peps:: passed through to parse_file unchanged
# tp::        parser backend name, passed through to parse_file
#
# The result of parse_file is kept in @prot_groups.
def initialize(file=nil, with_peps=false, tp='axml')
  @prot_groups = parse_file(file, with_peps, tp) if file
end
|
|
79
|
+
|
|
80
|
+
# returns an array of protein_groups
|
|
81
|
+
# Opens +file+, parses it into protein groups and remembers the result.
#
# file::      path of the file to read
# with_peps:: passed through to _parse_for_prot_groups
# tp::        parser backend name, passed through to _parse_for_prot_groups
#
# Returns whatever _parse_for_prot_groups returned for the open file handle
# (documented there as an array of protein groups); the same object is
# stored in @prot_groups. The file is closed when the block finishes.
def parse_file(file, with_peps=false, tp='axml')
  @prot_groups = File.open(file) { |io| _parse_for_prot_groups(io, with_peps, tp) }
end
|
|
87
|
+
|
|
88
|
+
# returns an array of ProtGroup objects
|
|
89
|
+
# Builds ProtGroup objects from an XML stream.
#
# stream::    the input handed to AXML.parse
# with_peps:: accepted for interface compatibility; not used by this method
# tp::        parser backend name; only 'axml' is implemented
#
# For every entry of the parsed root's +protein_group+, a ProtGroup is
# created from that entry's +attrs+, and its proteins are the Prot objects
# built from the +attrs+ of each element the entry yields.
#
# Returns the array of ProtGroup objects, or an empty array when +tp+ is
# anything other than 'axml' (nothing is parsed in that case).
def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
  collected = []
  return collected unless 'axml' == tp

  AXML.parse(stream).protein_group.each do |group_node|
    group = ProtGroup.new(group_node.attrs) {
      group_node.map { |prot_node| Prot.new(prot_node.attrs) }
    }
    collected << group
  end
  collected
end
|
|
105
|
+
end # ProtSummary::Parser
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# One protein group: a group number, a probability and its proteins.
class ProtGroup
  attr_accessor :group_number, :probability, :prots

  # args:: optional hash passed to instance_var_set_from_hash
  #        (presumably sets one instance variable per key -- helper is
  #        defined outside this file)
  #
  # If a block is given, its return value becomes the group's +prots+,
  # overriding anything set from +args+; otherwise +prots+ starts out as
  # an empty array (unless +args+ replaced it).
  def initialize(args=nil)
    @prots = []
    instance_var_set_from_hash(args) if args
    @prots = yield if block_given?
  end
end
|
|
120
|
+
|
|
121
|
+
end # Proph
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
Proph::Prot = ArrayClass.new(%w(protein_name probability n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description peps))
|
|
126
|
+
|
|
127
|
+
# note that 'description' is found in the element 'annotation', attribute 'protein_description'
|
|
128
|
+
# NOTE: unique_stripped_peptides is an array rather than a '+'-joined string
|
|
129
|
+
# Extra behaviour for the array-backed Proph::Prot record.
#
# All three readers below return element 0 of the record, which is the
# first field (protein_name) in the field list given to ArrayClass.new.
class Proph::Prot
  include SpecID::Prot

  # returns protein_name
  def name
    self[0]
  end

  # same value as #name
  def reference
    self[0]
  end

  # the name is also the first_entry
  def first_entry
    self[0]
  end
end
|
|
138
|
+
|
|
139
|
+
#def to_s
|
|
140
|
+
# '<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
|
|
141
|
+
#end
|
|
142
|
+
|
|
143
|
+
# this is a pep from a -prot.xml file
|
|
144
|
+
|
|
145
|
+
Proph::Prot::Pep = ArrayClass.new(%w(peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots))
|
|
146
|
+
|
|
147
|
+
# Extra behaviour for a peptide record taken from a -prot.xml file.
class Proph::Prot::Pep
  include SpecID::Pep

  # short names for the modification_info reader and writer
  alias_method :mod_info, :modification_info
  alias_method :mod_info=, :modification_info=

  # element 0 of the record: the first field, peptide_sequence
  def aaseq
    self[0]
  end

  # element 3 of the record: the fourth field, nsp_adjusted_probability
  def probability
    self[3]
  end
end # class Pep
|
|
157
|
+
|
|
158
|
+
=begin
|
|
159
|
+
#attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
|
|
160
|
+
#attr_writer :arithmetic_avg_scan_by_parent_time
|
|
161
|
+
|
|
162
|
+
#def initialize(args=nil)
|
|
163
|
+
# if args
|
|
164
|
+
# @sequence = args[:sequence]
|
|
165
|
+
# @probability = args[:probability] ## nsp prob
|
|
166
|
+
# @filenames = args[:filenames]
|
|
167
|
+
# @charge = args[:charge]
|
|
168
|
+
# @nsp_cutoff = args[:nsp_cutoff]
|
|
169
|
+
# if args.key?(:scans)
|
|
170
|
+
# @scans = args[:scans]
|
|
171
|
+
# else
|
|
172
|
+
# @scans = [] ## this is set later if needed
|
|
173
|
+
# end
|
|
174
|
+
# else
|
|
175
|
+
# @scans = []
|
|
176
|
+
# end
|
|
177
|
+
#end
|
|
178
|
+
|
|
179
|
+
# filter peptides based on the number of scans
|
|
180
|
+
# if a peptide has more than max_dups scans, the peptide is tossed
|
|
181
|
+
# note that multiple scans that were used as a single dtafile scan
|
|
182
|
+
# will be counted as a single scan for these purposes!
|
|
183
|
+
# (easy, since they are stored as a single item in the array of scans)
|
|
184
|
+
def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
|
|
185
|
+
if max_dups
|
|
186
|
+
new_peps = []
|
|
187
|
+
peps.each do |pep|
|
|
188
|
+
unless pep.scans.size > max_dups
|
|
189
|
+
new_peps << pep
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
new_peps
|
|
193
|
+
else
|
|
194
|
+
peps.dup
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
## from the list of scans, creates a scan object whose time is the
|
|
200
|
+
## arithmetic mean of the parent scans (based on prec_inten) and whose
|
|
201
|
+
## prec_mz is the avg of all prec_mz's. num is nil, charge is the first
|
|
202
|
+
def arithmetic_avg_scan_by_parent_time
|
|
203
|
+
unless @arithmetic_avg_scan_by_parent_time
|
|
204
|
+
flat_scans = @scans.flatten
|
|
205
|
+
|
|
206
|
+
# new_prec_mz
|
|
207
|
+
prec_mz_sum = 0.0
|
|
208
|
+
prec_inten_sum = 0.0
|
|
209
|
+
times = []
|
|
210
|
+
intens = []
|
|
211
|
+
tot_inten = 0.0
|
|
212
|
+
flat_scans.each do |c|
|
|
213
|
+
prec_inten = c.prec_inten
|
|
214
|
+
prec_inten_sum += prec_inten
|
|
215
|
+
prec_mz_sum += c.prec_mz
|
|
216
|
+
tot_inten += prec_inten
|
|
217
|
+
times << c.parent.time
|
|
218
|
+
intens << prec_inten
|
|
219
|
+
end
|
|
220
|
+
new_prec_mz = prec_mz_sum / flat_scans.size
|
|
221
|
+
new_prec_inten = prec_inten_sum / flat_scans.size
|
|
222
|
+
|
|
223
|
+
fraction_inten = []
|
|
224
|
+
intens.each do |inten|
|
|
225
|
+
fraction_inten.push( inten/tot_inten )
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
new_time = 0.0
|
|
229
|
+
(0...times.size).each do |i|
|
|
230
|
+
new_time += times[i] * fraction_inten[i]
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
@arithmetic_avg_scan_by_parent_time = MS::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
|
|
234
|
+
|
|
235
|
+
end
|
|
236
|
+
@arithmetic_avg_scan_by_parent_time
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
def to_s
|
|
240
|
+
'<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
def has_dta?(dta_filename)
|
|
244
|
+
if @filenames
|
|
245
|
+
@filenames.each do |fn|
|
|
246
|
+
if dta_filename == fn
|
|
247
|
+
return true
|
|
248
|
+
end
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
return false
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# Given a list of peptides, returns only those unique based on
|
|
256
|
+
# sequence/charge
|
|
257
|
+
def self.uniq_by_seqcharge(peptides)
|
|
258
|
+
# @TODO: this could be done with one fewer traversals, but it is beautiful
|
|
259
|
+
peptides.hash_by(:sequence, :charge).collect do |k,v|
|
|
260
|
+
v.first
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
=end
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
=begin
|
|
270
|
+
|
|
271
|
+
# Class for parsing the peptide prophet output files in various ways
|
|
272
|
+
class Proph::Pep::Parser < Parser
|
|
273
|
+
|
|
274
|
+
# parse_type = "rexml" | "regex"
|
|
275
|
+
# regex's are about 50 times faster but are not guaranteed to work
|
|
276
|
+
# seq charge hash is keyed on an array -> [sequence,charge]
|
|
277
|
+
# @TODO: implement parsing on this with xmlparser
|
|
278
|
+
def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
|
|
279
|
+
seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
|
|
280
|
+
case parse_type
|
|
281
|
+
when "rexml"
|
|
282
|
+
#puts "READING: " + pep_xml_file + " ..."
|
|
283
|
+
doc = REXML::Document.new File.new(pep_xml_file)
|
|
284
|
+
|
|
285
|
+
## Create a hash of peptides based on sequence_charge (takes an array)
|
|
286
|
+
doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
|
|
287
|
+
pep_charge = result.attributes['assumed_charge']
|
|
288
|
+
filename = result.attributes['spectrum']
|
|
289
|
+
result.elements.to_a('search_hit').each do |hit|
|
|
290
|
+
pep_seq = hit.attributes['peptide']
|
|
291
|
+
seq_charge = [pep_seq, pep_charge]
|
|
292
|
+
seq_charge_hash[seq_charge] << filename
|
|
293
|
+
end
|
|
294
|
+
end
|
|
295
|
+
seq_charge_hash
|
|
296
|
+
when "regex"
|
|
297
|
+
#puts "READING: " + pep_xml_file + " ..."
|
|
298
|
+
## Create a hash of peptides based on sequence_charge (takes an array)
|
|
299
|
+
|
|
300
|
+
## file from peptideAtlas:
|
|
301
|
+
search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
|
302
|
+
search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
|
303
|
+
search_hit_regex = /<search_hit .*peptide="(\w+)" /o
|
|
304
|
+
|
|
305
|
+
peptide_h = {}
|
|
306
|
+
filename = nil
|
|
307
|
+
pep_charge = nil
|
|
308
|
+
File.open(pep_xml_file).each do |line|
|
|
309
|
+
if line =~ search_result_regex1
|
|
310
|
+
filename = $1.dup
|
|
311
|
+
pep_charge = $2.dup
|
|
312
|
+
elsif line =~ search_result_regex2
|
|
313
|
+
filename = $1.dup
|
|
314
|
+
pep_charge = $2.dup
|
|
315
|
+
end
|
|
316
|
+
if line =~ search_hit_regex
|
|
317
|
+
pep_seq = $1.dup
|
|
318
|
+
seq_charge = [pep_seq, pep_charge]
|
|
319
|
+
seq_charge_hash[seq_charge] << filename
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
end
|
|
323
|
+
seq_charge_hash
|
|
324
|
+
end
|
|
325
|
+
|
|
326
|
+
# drops all search_hits that have peptideprophet probability < min_val
|
|
327
|
+
# and drops any search_results that end up with 0 search_hits
|
|
328
|
+
def filter_by_min_pep_prob(file, outfile, min_val)
|
|
329
|
+
root = root_el(file)
|
|
330
|
+
|
|
331
|
+
d_search_hit = nil
|
|
332
|
+
d_search_result = nil
|
|
333
|
+
root.children.each do |child1|
|
|
334
|
+
if child1.name == 'msms_run_summary'
|
|
335
|
+
d_search_result = []
|
|
336
|
+
child1.children.each do |child2|
|
|
337
|
+
if child2.name == 'search_result'
|
|
338
|
+
#puts "size before: " + child2.size.to_s
|
|
339
|
+
d_search_hit = []
|
|
340
|
+
child2.children.each do |child3|
|
|
341
|
+
if child3.name == 'search_hit'
|
|
342
|
+
child3.children.each do |child4|
|
|
343
|
+
if child4.name == 'peptideprophet_result'
|
|
344
|
+
if child4.attrs["probability"].to_f < min_val
|
|
345
|
+
#puts "dropping probability: #{child4.attrs["probability"]}"
|
|
346
|
+
d_search_hit << child3
|
|
347
|
+
else
|
|
348
|
+
#puts "keeping probability: #{child4.attrs["probability"]}"
|
|
349
|
+
end
|
|
350
|
+
end
|
|
351
|
+
end
|
|
352
|
+
end
|
|
353
|
+
end
|
|
354
|
+
d_search_hit.each do |to_drop|
|
|
355
|
+
to_drop.drop
|
|
356
|
+
end
|
|
357
|
+
#puts "size after: " + child2.size.to_s
|
|
358
|
+
if child2.size == 0
|
|
359
|
+
d_search_result << child2
|
|
360
|
+
end
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
d_search_result.each do |to_drop|
|
|
364
|
+
to_drop.drop
|
|
365
|
+
end
|
|
366
|
+
end
|
|
367
|
+
end
|
|
368
|
+
|
|
369
|
+
File.open(outfile, "w") do |fh|
|
|
370
|
+
fh.print root.to_s
|
|
371
|
+
end
|
|
372
|
+
end
|
|
373
|
+
end # Pep::Parser
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# Class for parsing the '*-prot.xml' files in different ways
|
|
377
|
+
class Proph::Prot::Parser < Parser
|
|
378
|
+
|
|
379
|
+
attr_accessor :prots
|
|
380
|
+
attr_writer :peps
|
|
381
|
+
|
|
382
|
+
def initialize
|
|
383
|
+
@prots = []
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
# returns all the peptides from prots
|
|
387
|
+
def peps
|
|
388
|
+
unless @peps
|
|
389
|
+
@peps = []
|
|
390
|
+
@prots.each do |prot|
|
|
391
|
+
@peps.push(*(prot.peps))
|
|
392
|
+
end
|
|
393
|
+
end
|
|
394
|
+
@peps
|
|
395
|
+
end
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
# sets and returns an array of Prot objects
|
|
399
|
+
# parse_type = "rexml" | "regex"
|
|
400
|
+
def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
|
|
401
|
+
## ensure these are all floats
|
|
402
|
+
(prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
|
|
403
|
+
cutoff.to_f
|
|
404
|
+
end
|
|
405
|
+
|
|
406
|
+
case parse_type
|
|
407
|
+
when "rexml"
|
|
408
|
+
doc = REXML::Document.new File.new(protxmlfile)
|
|
409
|
+
doc.elements.each("protein_summary/protein_group/protein") do |elem|
|
|
410
|
+
if elem.attributes['probability'].to_f >= prot_prob_cutoff
|
|
411
|
+
prob = elem.attributes['probability'].to_f
|
|
412
|
+
name= elem.attributes['protein_name']
|
|
413
|
+
curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
|
|
414
|
+
peptides = []
|
|
415
|
+
elem.elements.to_a('peptide').each do |pep|
|
|
416
|
+
if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
|
|
417
|
+
nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
|
|
418
|
+
sequence = pep.attributes['peptide_sequence']
|
|
419
|
+
charge = pep.attributes['charge']
|
|
420
|
+
pnm = pep.attributes['precursor_neutral_mass']
|
|
421
|
+
peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
|
|
422
|
+
end
|
|
423
|
+
## Only take proteins with peptides!
|
|
424
|
+
if peptides.size > 0
|
|
425
|
+
curr_prot.peps = peptides
|
|
426
|
+
@prots << curr_prot
|
|
427
|
+
end
|
|
428
|
+
end
|
|
429
|
+
end
|
|
430
|
+
end
|
|
431
|
+
when "regex"
|
|
432
|
+
prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
|
|
433
|
+
prot_prob_regex = /probability="([\d\.]+)"/o
|
|
434
|
+
pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
|
|
435
|
+
pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
|
|
436
|
+
|
|
437
|
+
curr_prot = nil
|
|
438
|
+
peptides = []
|
|
439
|
+
File.open(protxmlfile).each do |line|
|
|
440
|
+
if line =~ prot_regex
|
|
441
|
+
prob = nil
|
|
442
|
+
name = $1.dup
|
|
443
|
+
rest = $2
|
|
444
|
+
if rest =~ prot_prob_regex
|
|
445
|
+
prob = $1.dup
|
|
446
|
+
end
|
|
447
|
+
if curr_prot
|
|
448
|
+
if curr_prot.probability.to_f >= prot_prob_cutoff
|
|
449
|
+
if peptides.size > 0
|
|
450
|
+
curr_prot.peps = peptides
|
|
451
|
+
@prots.push(curr_prot)
|
|
452
|
+
end
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
|
|
456
|
+
peptides = []
|
|
457
|
+
end
|
|
458
|
+
if line =~ pep_regex
|
|
459
|
+
sequence = $1.dup
|
|
460
|
+
rest = $2
|
|
461
|
+
if rest =~ pep_else_regex
|
|
462
|
+
charge = $1
|
|
463
|
+
init_prob = $2
|
|
464
|
+
nsp_prob = $3
|
|
465
|
+
if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
|
|
466
|
+
peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
|
|
467
|
+
end
|
|
468
|
+
end
|
|
469
|
+
end
|
|
470
|
+
# get the last one:
|
|
471
|
+
if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
|
|
472
|
+
curr_prot.peps = peptides
|
|
473
|
+
@prots.push(curr_prot)
|
|
474
|
+
end
|
|
475
|
+
end
|
|
476
|
+
end
|
|
477
|
+
@prots
|
|
478
|
+
end
|
|
479
|
+
|
|
480
|
+
end # Prot::Parser
|
|
481
|
+
|
|
482
|
+
################ --END
|
|
483
|
+
|
|
484
|
+
=end
|