mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id.rb
DELETED
|
@@ -1,778 +0,0 @@
|
|
|
1
|
-
require 'ostruct'
|
|
2
|
-
require 'set'
|
|
3
|
-
require 'hash_by'
|
|
4
|
-
require 'roc'
|
|
5
|
-
require 'sample_enzyme' # for others
|
|
6
|
-
require 'spec_id/bioworks'
|
|
7
|
-
require 'spec_id/sequest'
|
|
8
|
-
|
|
9
|
-
require 'spec_id/proph/prot_summary'
|
|
10
|
-
require 'spec_id/proph/pep_summary'
|
|
11
|
-
|
|
12
|
-
require 'spec_id_xml'
|
|
13
|
-
require 'spec_id/sqt'
|
|
14
|
-
require 'spec_id/mass'
|
|
15
|
-
require 'fasta'
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
# Empty placeholder definitions for names used further down in this file.
module ProteinReferenceable
end

class SampleEnzyme
end

module SpecID
end

# Plain class that mixes in SpecID; SpecID.new returns an instance of it
# when no file is given.
class GenericSpecID
  include SpecID
end
|
|
27
|
-
|
|
28
|
-
module SpecID
|
|
29
|
-
MONO = Mass::MONO
|
|
30
|
-
AVG = Mass::AVG
|
|
31
|
-
|
|
32
|
-
attr_accessor :peps, :prots
|
|
33
|
-
# True if a high protein/peptide score is better than low, false otherwise
|
|
34
|
-
# This is set automatically for known file types
|
|
35
|
-
attr_accessor :hi_prob_best
|
|
36
|
-
|
|
37
|
-
# A relative pathname of the file the specid object is derived from
|
|
38
|
-
attr_accessor :filename
|
|
39
|
-
|
|
40
|
-
# tp = file_type
|
|
41
|
-
# Will return a SpecID object (really, the object corresponding to the
|
|
42
|
-
# file type which mixes in SpecID [is_a?(SpecID) == true])
|
|
43
|
-
# If no file is given, will return a GenericSpecID object.
|
|
44
|
-
# If file is an array, this is assumed to be a group of srf files which is
|
|
45
|
-
# converted into an SRFGroup object and run.
|
|
46
|
-
def self.new(file=nil, tp=nil)
  # An Array is taken to be a list of srf filenames (other kinds of groups
  # are not handled here yet).
  return SRFGroup.new(file) if file.is_a?(Array)
  # Any other non-nil file is loaded according to tp (or its detected type).
  return from_file(file, tp) if file
  # No file at all: return an empty generic object.
  GenericSpecID.new
end
|
|
57
|
-
|
|
58
|
-
# tp = file_type
|
|
59
|
-
# a single srf file will be packaged into an SRFGroup object
|
|
60
|
-
def self.from_file(file, tp=nil)
  # Detect the type from the file itself only when the caller gave none.
  tp ||= file_type(file)
  case tp
  when 'srf'       then SRFGroup.new([file])   # single srf is wrapped in a group
  when 'srg'       then SRFGroup.new(file)
  when 'bioworks'  then Bioworks.new(file)
  when 'protproph' then Proph::ProtSummary.new(file)
  when 'pepproph'  then Proph::PepSummary.new(file)
  when 'sqg'       then SQTGroup.new(file)
  when 'sqt'       then SQTGroup.new([file])   # single sqt is wrapped in a group
  else
    abort "UNRECOGNIZED file type for #{file}"
  end
end
|
|
89
|
-
|
|
90
|
-
def inspect
|
|
91
|
-
peps_string =
|
|
92
|
-
if peps
|
|
93
|
-
"peps(#)=#{peps.size}"
|
|
94
|
-
else
|
|
95
|
-
"peps=(nil)"
|
|
96
|
-
end
|
|
97
|
-
"<#{self.class} #{peps_string}>"
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
# given some list of SpecID::Pep based objects, returns the list of proteins
|
|
101
|
-
# associated with those peptides
|
|
102
|
-
# kind must be a symbol:
|
|
103
|
-
# :no_update (current proteins are returned, but their peps attribute
|
|
104
|
-
# is not updated)
|
|
105
|
-
# :update (current proteins returned with peps attribute updated)
|
|
106
|
-
# :new (new proteins are created complete with peps attribute)
|
|
107
|
-
def self.protein_list(pephits, kind=:no_update)
  fresh = (kind == :new)
  link_peps = (kind == :update || fresh)

  saved_prots = nil
  if fresh
    # Temporarily point every peptide at duplicated proteins (one duplicate
    # per reference, starting with an empty peps list), remembering the
    # original protein arrays so they can be restored afterwards.
    copies = {}
    saved_prots = pephits.map do |pep|
      originals = pep.prots
      pep.prots = originals.map do |prt|
        ref = prt.reference
        next copies[ref] if copies.key?(ref)
        copy = prt.dup
        copy.peps = []
        copies[copy.reference] = copy
      end
      originals
    end
  end

  if kind == :update
    # Wipe the existing peptide lists; they are rebuilt below.
    pephits.each do |pep|
      pep.prots.each { |prt| prt.peps = [] }
    end
  end

  # Collect one protein per reference (later ones overwrite earlier ones)
  # and, for :update/:new, attach each peptide to its proteins.
  by_reference = {}
  pephits.each do |pep|
    pep.prots.each do |prt|
      by_reference[prt.reference] = prt
      prt.peps << pep if link_peps
    end
  end

  # Give the peptides their original proteins back.
  pephits.zip(saved_prots) { |pep, prts| pep.prots = prts } if fresh

  by_reference.values
end
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# takes a comma separated list or array and extends the last to create an
|
|
162
|
-
# array of desired size
|
|
163
|
-
def self.extend_args(arg, desired_size)
  # A String is split on commas; anything else is indexed as given.
  values = arg.is_a?(String) ? arg.split(',') : arg
  # Most recent truthy value seen so far; used to fill missing slots.
  carry = values[0]
  desired_size.times.map do |idx|
    carry = values[idx] if values[idx]
    carry
  end
end
|
|
180
|
-
|
|
181
|
-
# takes an array of proteins, each having peps
|
|
182
|
-
# peptide grouping is done
|
|
183
|
-
# by-
|
|
184
|
-
# the protein with the most unique peptides ends up taking any
|
|
185
|
-
# degenerate peptides, tie goes to one with most hits total, then the one
|
|
186
|
-
# that had the top xcorr(s) (before removing any peptides). All other
|
|
187
|
-
# proteins with identical peptides will lose those peptides. So, the rich
|
|
188
|
-
# stay rich, and the poor get poorer.
|
|
189
|
-
# returns an array of triplets where each is [prot, pep_hits,
|
|
190
|
-
# uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
|
|
191
|
-
# peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
|
|
192
|
-
# update_prots == true will set each protein with the peptides found
|
|
193
|
-
def self.occams_razor(array_of_prots, update_prots=false)
  # Greedy assignment of peptide sequences to proteins.
  #
  # array_of_prots:: proteins responding to #peps; each peptide responds to
  #                  #aaseq and #xcorr.
  # update_prots::   when true, every protein that appears in the result
  #                  gets its peps replaced by the hits assigned to it.
  #
  # Returns an array of [prot, pep_hits, uniq_aaseqs] triplets, best-ranked
  # protein first. Proteins left with no unclaimed sequence are omitted.
  #
  # Proteins are ranked by (number of distinct sequences, number of hits,
  # ascending-sorted xcorrs). Only these keys are compared; remaining ties
  # go to the protein that comes first in array_of_prots. (Sorting the
  # whole record would fall through to comparing the protein objects
  # themselves on a tie, which raises for objects that do not define <=>.)
  # NOTE(review): the xcorr lists are compared smallest-first; confirm this
  # is the intended reading of "top xcorr(s)".
  ranked = array_of_prots.each_with_index.map do |prot, idx|
    pps = prot.peps
    peps_by_uniq_aaseq = pps.hash_by(:aaseq)
    uniq_aaseqs = Set.new(pps.map { |pep| pep.aaseq })
    xcorrs = pps.map { |pep| pep.xcorr }.sort
    # -idx: after the final reverse, earlier input wins among equal ranks
    [[uniq_aaseqs.size, pps.size, xcorrs, -idx], prot, uniq_aaseqs, peps_by_uniq_aaseq]
  end
  ranked = ranked.sort_by { |entry| entry.first }.reverse

  peps_found = Set.new
  prot_triplets = []
  ranked.each do |_rank, prot, uniq_aaseqs, peps_by_uniq_aaseq|
    # sequences not yet claimed by a better-ranked protein
    uniq = uniq_aaseqs - peps_found
    next if uniq.size == 0
    peps_found.merge(uniq)
    pep_hits = []
    uniq.each { |seq| pep_hits.push(*peps_by_uniq_aaseq[seq]) }
    prot_triplets << [prot, pep_hits, uniq.to_a]
    prot.peps = pep_hits if update_prots
  end
  prot_triplets
end
|
|
227
|
-
|
|
228
|
-
# returns number of true positives (array) and the specified output (as
|
|
229
|
-
# parallel array). Requires the classification method and a sorted array of
|
|
230
|
-
# tp values and an array of fp values.
|
|
231
|
-
# (This is simply a wrapper around ROC#by_tps method!)
|
|
232
|
-
def by_tps(classification_method, tp, fp)
  # Pure delegation to a fresh ROC object.
  roc = ROC.new
  roc.by_tps(classification_method, tp, fp)
end
|
|
235
|
-
|
|
236
|
-
# from the unique set of peptide hits, create a separate peptide hit for
|
|
237
|
-
# each protein reference where that peptide only references that protein
|
|
238
|
-
# e.g. pep.prots = [(a single protein)]
|
|
239
|
-
def pep_prots
  # Expands self.peps into one peptide hit per (peptide, protein) pair.
  #
  # Returns an array of duplicated peptides, each with prots set to a
  # one-element array. The peptides in self.peps are left untouched.
  # (Previously the duplicate was discarded, so the original peptide was
  # mutated and the same object was pushed once per protein.)
  pps = []
  peps.each do |pep|
    pep.prots.each do |prt|
      single = pep.dup
      single.prots = [prt]
      pps << single
    end
  end
  pps
end
|
|
250
|
-
|
|
251
|
-
def self.prots?(ar)
  # Only the first element is inspected.
  SpecID::Prot === ar.first
end
|
|
254
|
-
|
|
255
|
-
def self.peps?(ar)
  # Only the first element is inspected.
  SpecID::Pep === ar.first
end
|
|
258
|
-
|
|
259
|
-
# for older stuff
|
|
260
|
-
def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
  # Resolve the symbol to this object's own list; any other value leaves
  # objects as nil.
  objects = case items
            when :prots then prots
            when :peps  then peps
            end
  SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
end
|
|
270
|
-
|
|
271
|
-
# includes the peptide hit in both
|
|
272
|
-
# returns (target, decoy)
|
|
273
|
-
# (for peps) ties can be :both, true (target wins), false (decoy wins)
|
|
274
|
-
# regardless of ties behavior, will partition out the proteins to be
|
|
275
|
-
# appropriate for the peptide
|
|
276
|
-
def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
  # Splits proteins or peptides into [target, decoy] by matching protein
  # references against regex.
  #
  # items::          array of SpecID::Prot or SpecID::Pep based objects
  #                  (decided from the first element)
  # regex::          tested against prot.reference
  # decoy_on_match:: true if a match marks a decoy, false if it marks a target
  # ties::           what to do with a peptide that has both matching and
  #                  non-matching proteins: :both (it appears in both
  #                  lists), true (target wins) or false (decoy wins)
  #
  # For a tied peptide the protein list is split so every returned peptide
  # points only at proteins of its own kind. The given peptide object is
  # modified in place; with :both, the non-matching half is carried by a
  # duplicate.
  #
  # Raises ArgumentError for an unknown ties value (only detected when a
  # tie actually occurs) or when items holds neither proteins nor peptides.
  return [[], []] if items.size == 0

  if prots?(items)
    # Partition directly. (The instance method #classify cannot be called
    # from here and expects a symbol rather than the array itself.)
    matched, unmatched = items.partition { |prt| prt.reference =~ regex }
    return decoy_on_match ? [unmatched, matched] : [matched, unmatched]
  end

  unless peps?(items)
    raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
  end

  decoy_is_match = decoy_on_match ? true : false
  match = []
  nomatch = []
  items.each do |pep|
    match_prots, nomatch_prots = pep.prots.partition do |prot|
      prot.reference =~ regex
    end
    if match_prots.size == 0
      nomatch << pep
    elsif nomatch_prots.size == 0
      match << pep
    else
      # tie: the peptide maps to both kinds of protein
      case ties
      when :both
        nomatch_pep = pep.dup
        nomatch_pep.prots = nomatch_prots
        pep.prots = match_prots
        match << pep
        nomatch << nomatch_pep
      when true, false
        # ties == true keeps the target side, false keeps the decoy side;
        # the matching side is the decoy side exactly when decoy_on_match.
        if ties != decoy_is_match
          pep.prots = match_prots
          match << pep
        else
          pep.prots = nomatch_prots
          nomatch << pep
        end
      else
        raise ArgumentError, "ties must be :both, true or false"
      end
    end
  end

  decoy_on_match ? [nomatch, match] : [match, nomatch]
end
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
# returns [tp, fp] based on the protein prefix for items where items =
|
|
334
|
-
# (:prots|:peps)
|
|
335
|
-
# this may result in a duplication of some peptides if they match both
|
|
336
|
-
# normal and decoy proteins. In this case, the protein arrays are split,
|
|
337
|
-
# too, so that each points only to its breed of protein.
|
|
338
|
-
def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
  # The flag is matched literally; with prefix it must start a line of the
  # reference, otherwise it may occur anywhere.
  escaped = Regexp.escape(flag)
  regex = prefix ? /^#{escaped}/ : /#{escaped}/
  classify_by_regex(items, regex, decoy_on_match)
end
|
|
346
|
-
|
|
347
|
-
# Returns (match, nomatch)
|
|
348
|
-
# items = symbol (:prots, :peps)
|
|
349
|
-
# Returns two arrays, those returning true from classify_item_by and those
|
|
350
|
-
# returning false
|
|
351
|
-
def classify(items, classify_item_by)
  # items names a method on self (e.g. :prots or :peps) that returns the
  # collection; the result is [accepted, rejected], both in original order.
  send(items).partition { |it| classify_item_by.call(it) }
end
|
|
363
|
-
|
|
364
|
-
# returns two arrays, true positives and false positives (determined by proc
|
|
365
|
-
# classify_item_by) sorted by proc rank_item_by. Items will be ranked from
|
|
366
|
-
# lowest to highest based on the return value of rank_item_by. items is a
|
|
367
|
-
# symbol (:prots or :peps)
|
|
368
|
-
def rank_and_classify(items, rank_item_by, classify_item_by)
  # Build [rank, classification] pairs for every item and let ROC split
  # them into the two result lists.
  doublets = send(items).map do |item|
    [rank_item_by.call(item), classify_item_by.call(item)]
  end
  tp, fp = ROC.new.doublets_to_separate(doublets)
  [tp, fp]
end
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
# returns a proc for getting all probabilities so that an ascending sort
|
|
382
|
-
# will put the best scores first
|
|
383
|
-
def probability_proc
  # When high probabilities are best they are negated, so that an
  # ascending sort still lists the best scores first.
  return proc { |prt| prt.probability * -1 } if hi_prob_best
  proc { |prt| prt.probability }
end
|
|
391
|
-
|
|
392
|
-
def separate_by_prefix(items, fp_prefix)
  # Unfinished: the collection is fetched but nothing is done with it, and
  # fp_prefix is not used. :prots returns nil; everything else aborts.
  send(items)
  return nil if items == :prots
  abort(items == :peps ? "not implemented yet" : "no other items recognized yet")
end
|
|
402
|
-
|
|
403
|
-
# sorts the probabilities and then
|
|
404
|
-
# calcs predicted number hits and precision for protein probabilities
|
|
405
|
-
# (summing probabilities)
|
|
406
|
-
# one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
|
|
407
|
-
# called false positive rate
|
|
408
|
-
# SUM(1-probX)/#prots
|
|
409
|
-
def num_hits_and_ppv_for_protein_prophet_probabilities
  # Walk the protein probabilities from highest to lowest, keeping a
  # running sum of (1 - probability); the ppv at each step is
  # 1 - (running sum / number of proteins so far).
  one_minus_total = 0.0
  num_prots = []
  ppv = []
  descending = prots.map { |prot| prot.probability }.sort.reverse
  descending.each_with_index do |prob, idx|
    count = idx + 1
    num_prots << count
    one_minus_total += 1.0 - prob
    ppv << 1.0 - (one_minus_total / count)
  end
  [num_prots, ppv]
end
|
|
425
|
-
|
|
426
|
-
# convenience method for the common task of determining precision for
|
|
427
|
-
# proteins (with decoy proteins found by false_flag)
|
|
428
|
-
# returns (num_hits, precision)
|
|
429
|
-
def num_hits_and_ppv_for_prob(false_flag, prefix=false)
  # false_flag is matched literally against protein references, either at
  # the start of a line (prefix) or anywhere.
  escaped = Regexp.escape(false_flag)
  regex = prefix ? /^#{escaped}/ : /#{escaped}/
  ranker = probability_proc
  # true for real hits, false for decoys (reference matches the flag)
  is_real = proc { |prt| !(prt.reference =~ regex) }

  real_hits, decoy_hits = rank_and_classify(:prots, ranker, is_real)

  num_hits, _num_tps, precision = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
  [num_hits, precision]
end
|
|
446
|
-
|
|
447
|
-
# # takes the existing spec_id object and marshals it into "file.msh"
|
|
448
|
-
# # a new file will always look for a file.msh to load
|
|
449
|
-
# def marshal(force=false)
|
|
450
|
-
# if !(File.exist? @marshal_file)| force
|
|
451
|
-
# File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) }
|
|
452
|
-
# end
|
|
453
|
-
# end
|
|
454
|
-
|
|
455
|
-
# Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
|
|
456
|
-
# 'srf' if SRF file, 'srg' if search results group file.
|
|
457
|
-
def self.file_type(file)
  # Guesses the kind of search-result file.
  #
  # file:: path of the file to inspect
  #
  # Returns 'srg' or 'sqg' (decided by extension alone, the file is not
  # opened), 'srf', 'bioworks', 'protproph', 'pepproph' or 'sqt'; returns
  # nil when nothing matches.
  return 'srg' if file =~ /\.srg$/
  return 'sqg' if file =~ /\.sqg$/

  # srf files are recognized by the 7 bytes found at offset 438
  return 'srf' if IO.read(file, 7, 438) == 'Enzyme:'

  File.open(file) do |fh|
    lines = ""
    # Look at up to 8 lines. gets (rather than readline) is used so that a
    # file shorter than 8 lines does not raise EOFError.
    8.times do
      line = fh.gets
      break unless line
      lines << line
    end
    if lines =~ /<bioworksinfo>/
      return 'bioworks'
    elsif (lines =~ /<protein_summary/) &&
          ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) ||
           (lines =~ Proph::ProtSummary::Filetype_and_version_re_new))
      return 'protproph'
    elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
      return 'pepproph'
    end
    # assumes the header of a sqt file is less than 200 lines ...
    200.times do
      line = fh.gets
      break unless line
      lines << line
    end
    return 'sqt' if lines =~ /^H\tDatabase/ && lines =~ /^H\tSQTGenerator/
  end
  nil
end
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
##############################################
|
|
492
|
-
# These are pretty specific to Smriti's needs:
|
|
493
|
-
|
|
494
|
-
# Given a hash of peptide arrays by some attribute key
|
|
495
|
-
# Return two sorted arrays of sorted probabilities
|
|
496
|
-
# The first of the min and second of the best 10 of each peptide array
|
|
497
|
-
def min_and_best10(hash)
  # First list: per key, the hit with the lowest peptide_probability.
  lowest_per_key = hash.map do |_key, group|
    group.min { |a, b| a.peptide_probability <=> b.peptide_probability }
  end
  min_sorted_peps = sorted_probabilities(lowest_per_key)

  # Second list: per key, the (up to) 10 hits with the lowest
  # peptide_probability, pooled across all keys.
  best_per_key = []
  hash.each do |_key, group|
    best_per_key.concat(group.sort_by { |pep| pep.peptide_probability }.first(10))
  end
  top_10_sorted_peps = sorted_probabilities(best_per_key)

  [min_sorted_peps, top_10_sorted_peps]
end
|
|
520
|
-
|
|
521
|
-
# Returns a list of sorted probabilities given the array of peptides
|
|
522
|
-
def sorted_probabilities(peptides)
  # Reads #probability from every peptide and sorts ascending.
  probabilities = peptides.map { |pep| pep.probability }
  probabilities.sort
end
|
|
529
|
-
|
|
530
|
-
# returns a sorted list of probabilities based on all pepprots (a peptide
|
|
531
|
-
# associated with a protein)
|
|
532
|
-
def pep_probs_by_pep_prots
  # Every peptide hit counts; no grouping is applied.
  all_hits = peps
  sorted_probabilities(all_hits)
end
|
|
535
|
-
|
|
536
|
-
##########################################################################
|
|
537
|
-
# WARNING! These might be dangerous to your health if there are multiple
|
|
538
|
-
# files collected in your bioworks file
|
|
539
|
-
##########################################################################
|
|
540
|
-
|
|
541
|
-
# (prob_list_by_min, prob_list_by_best10)
|
|
542
|
-
# returns 2 sorted lists of probabilities based on:
|
|
543
|
-
# 1. best peptide hit
|
|
544
|
-
# 2. top 10 peptide hits
|
|
545
|
-
# on a per scan basis
|
|
546
|
-
# NOTE: you may want to hash on base_name first!
|
|
547
|
-
def pep_probs_by_scan
  # Group hits by (first_scan, last_scan), then summarize each group.
  min_and_best10(peps.hash_by(:first_scan, :last_scan))
end
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
#(prob_list_by_min, prob_list_by_best10)
|
|
554
|
-
# same as pep_probs_by_scan but per charge state
|
|
555
|
-
# NOTE: you may want to hash on base_name first!
|
|
556
|
-
def pep_probs_by_scan_charge
  # Group hits by (first_scan, last_scan, charge), then summarize each group.
  min_and_best10(peps.hash_by(:first_scan, :last_scan, :charge))
end
|
|
560
|
-
|
|
561
|
-
# (prob_list_by_min)
|
|
562
|
-
# hashes on seq-charge and returns the sorted list of probabilities of top
|
|
563
|
-
# hit per seq-charge
|
|
564
|
-
# NOTE: you may want to hash on base_name first!
|
|
565
|
-
def pep_probs_by_seq_charge
  # Returns the sorted list of probabilities of the lowest-probability
  # peptide found for each distinct (sequence, charge) pair in +peps+.
  groups = peps.hash_by(:sequence, :charge)

  # Select on #probability: it is the accessor SpecID::Pep declares and the
  # one sorted_probabilities reads below. The previous selection used
  # #peptide_probability, which generic peptides do not respond to.
  best_per_group = groups.map do |_key, group|
    group.min_by { |pep| pep.probability }
  end

  sorted_probabilities(best_per_group)
end
|
|
572
|
-
|
|
573
|
-
##########################################################################
|
|
574
|
-
# USE these if you have multiple files in your bioworks.xml file
|
|
575
|
-
##########################################################################
|
|
576
|
-
# (prob_list_by_min, prob_list_by_best10)
|
|
577
|
-
# returns 2 sorted lists of probabilities based on:
|
|
578
|
-
# 1. best peptide hit
|
|
579
|
-
# 2. top 10 peptide hits
|
|
580
|
-
# on a per scan basis
|
|
581
|
-
# NOTE: base_name is already part of the hash key here.
|
|
582
|
-
def pep_probs_by_bn_scan
  # Key the peptides on base_name plus scan window and hand the resulting
  # hash straight to min_and_best10.
  min_and_best10(peps.hash_by(:base_name, :first_scan, :last_scan))
end
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
#(prob_list_by_min, prob_list_by_best10)
|
|
589
|
-
# same as pep_probs_by_bn_scan but per charge state
|
|
590
|
-
# NOTE: base_name is already part of the hash key here.
|
|
591
|
-
def pep_probs_by_bn_scan_charge
  # Key the peptides on base_name, scan window and charge state, and hand
  # the resulting hash straight to min_and_best10.
  min_and_best10(peps.hash_by(:base_name, :first_scan, :last_scan, :charge))
end
|
|
595
|
-
|
|
596
|
-
# (prob_list_by_min)
|
|
597
|
-
# hashes on seq-charge and returns the sorted list of probabilities of top
|
|
598
|
-
# hit per seq-charge
|
|
599
|
-
# NOTE: base_name is already part of the hash key here.
|
|
600
|
-
def pep_probs_by_bn_seq_charge
  # Returns the sorted list of probabilities of the lowest-probability
  # peptide found for each distinct (base_name, sequence, charge) triple in
  # +peps+.
  groups = peps.hash_by(:base_name, :sequence, :charge)

  # Select on #probability: it is the accessor SpecID::Pep declares and the
  # one sorted_probabilities reads below. The previous selection used
  # #peptide_probability, which generic peptides do not respond to.
  best_per_group = groups.map do |_key, group|
    group.min_by { |pep| pep.probability }
  end

  sorted_probabilities(best_per_group)
end
|
|
607
|
-
end
|
|
608
|
-
|
|
609
|
-
# A Generic spectraID protein
|
|
610
|
-
module SpecID::Prot
  include ProteinReferenceable

  # probability is always a float!
  attr_accessor :probability, :reference, :peps

  # Orders proteins by comparing their references.
  def <=>(other)
    reference <=> other.reference
  end

  # Compact description: class, probability, reference and, when peps is
  # set, the number of peptides.
  def inspect
    pep_string = peps ? ", @peps(#)=#{peps.size}" : nil
    "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
  end
end
|
|
629
|
-
|
|
630
|
-
module SpecID::Pep

  # Matches every character that is not an uppercase letter, '.' or '-'.
  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  attr_accessor :prots
  attr_accessor :probability
  # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
  # or last.
  attr_accessor :sequence

  # the basic amino acid sequence (no leading or trailing '.' or amino acids)
  # should not contain any special symbols, etc.
  attr_accessor :aaseq
  attr_accessor :charge

  # Removes nonstandard chars with Non_standard_amino_acid_char_re;
  # preserves A-Z and '.' and '-'. Returns a new string.
  def self.remove_non_amino_acids(sequence)
    sequence.gsub(Non_standard_amino_acid_char_re, '')
  end

  # remove_non_amino_acids followed by split_sequence.
  def self.prepare_sequence(val)
    split_sequence(remove_non_amino_acids(val))
  end

  # Orders peptides by comparing their aaseq values.
  def <=>(other)
    aaseq <=> other.aaseq
  end

  # Returns [prev, peptide, next] from sequence. Parse errors return
  # [nil, nil, nil].
  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
  #   PEPTIDE.A    # -> -, PEPTIDE, A
  #   A.PEPTIDE    # -> A, PEPTIDE, -
  #   PEPTIDE      # -> nil,nil,nil
  # Any other number of dot-separated pieces (zero, or more than three) is
  # also treated as a parse error.
  def self.split_sequence(val)
    pieces = val.split('.')
    case pieces.size
    when 3
      pieces
    when 2
      if pieces[0].size > 1 ## N termini: first piece is the peptide
        ['-', pieces[0], pieces[1]]
      else ## C termini: first piece is the preceding residue
        [pieces[0], pieces[1], '-']
      end
    else
      # A single piece, no pieces, or more than three pieces cannot be
      # interpreted; report them uniformly as a parse error.
      [nil, nil, nil]
    end
  end

  # Strips nonstandard characters and returns only the peptide portion of
  # sequence (the middle of three pieces, the longer-first or second of two
  # pieces, or the whole string when there are no dots).
  #
  # NOTE(review): any other shape terminates the process through Kernel#abort;
  # raising an exception would be friendlier for library callers, but the
  # exit behavior is kept because callers may depend on it.
  def self.sequence_to_aaseq(sequence)
    pieces = remove_non_amino_acids(sequence).split('.')
    case pieces.size
    when 3
      pieces[1]
    when 2
      ## N termini when the first piece is longer than one residue,
      ## otherwise C termini
      pieces[0].size > 1 ? pieces[0] : pieces[1]
    when 1
      pieces[0] ## which is the peptide itself
    else
      abort "bad peptide sequence: #{sequence}"
    end
  end

  # This will rapidly determine the list of proteins for which given
  # peptides belong. It is meant to be low level and fast (eventually),
  # so it asks for the data in a format amenable to this.
  # Returns a mirror array of peptide_strings_list where each entry is an
  # array of the proteins from fasta_obj.prots whose aaseq contains that
  # peptide string.
  def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
    prots = fasta_obj.prots
    # Read every protein sequence once and keep it paired with its protein.
    prot_seqs = prots.map { |prot| prot.aaseq }
    prot_seq_pairs = prots.zip(prot_seqs)

    peptide_strings_list.map do |pep_seq|
      matching = prot_seq_pairs.select { |_prot, prot_seq| prot_seq.include?(pep_seq) }
      matching.map { |prot, _prot_seq| prot }
    end
  end

  # units can be :mmu, :amu, :ppm
  #
  # NOTE: unfinished. Every branch below is empty, so the method currently
  # returns nil for any unit.
  def mass_accuracy(pep, unit=:ppm, mono=true)
    # 10^6 * deltam accuracy/ m[measured]
    # i.e., theoretical mass 1000, measured 999.9:  100ppm
    # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
    # pep.mass is the theoretical M+H of the peptide
    # this assumes that the deltacn value we're being told is correct, but I
    # have my suspicions (since the <mass> value is not accurate...)

    ######## TO COMPLETE (and add to spec_id..?)
    case unit
    when :ppm
    when :amu
    when :mmu
    end
  end

  # calls the method associated with each key and returns the values, in
  # the order the keys were given
  def values_at(*args)
    args.map { |arg| send(arg) }
  end

  # Compact description: class, probability, sequence, aaseq, charge and,
  # when prots is set, the number of proteins.
  def inspect
    prot_string = prots ? ", @prots(#)=#{prots.size}" : nil
    "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
  end

end
|
|
768
|
-
|
|
769
|
-
class SpecID::GenericProt
  # Adds nothing of its own; all behavior comes from the SpecID::Prot mixin.
  include SpecID::Prot
end
|
|
772
|
-
|
|
773
|
-
class SpecID::GenericPep
  # Adds nothing of its own; all behavior comes from the SpecID::Pep mixin.
  include SpecID::Pep
end
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|