mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,226 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
## The yeast Scal db mean background is: 0.00984
|
4
|
-
## The yeast Cysteine background freq is: 0.0131986582396467
|
5
|
-
# Line-oriented patterns: the first captures the peptide sequence of a
# <search_hit>, the second captures the probability of a <peptideprophet_result>.
pep_seq_re = /<search_hit .* peptide="(\w+)"/o
pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o

# Exactly three arguments are required; otherwise print usage and abort.
unless ARGV.size == 3
  puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml",
       " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
  abort
end
|
13
|
-
|
14
|
-
# Writes a plot description file to +base_toplot+.
#
# The file starts with the literal line 'XYData', followed by +base+, +title+,
# +xaxis+ and +yaxis+ (one per line).  Then, for every array of keys in +cats+,
# it writes the keys joined by " & " and, for each key, the values of
# hash[key] joined by single spaces.
#
# Returns +cats+ (the value of the File.open block).
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
  File.open(base_toplot, "w") do |out|
    ['XYData', base, title, xaxis, yaxis].each { |header| out.puts header }
    cats.each do |series_keys|
      out.puts series_keys.join(" & ")
      series_keys.each { |key| out.puts hash[key].join(" ") }
    end
  end
end
|
29
|
-
|
30
|
-
############################################################################
|
31
|
-
#### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
|
32
|
-
#### CHANGE HIM THERE (eventually we need to put him in a lib file)
|
33
|
-
# (actual # with cys, expected # with cys, total#peptides,
|
34
|
-
# mean_fraction_of_cysteines_true, std)
|
35
|
-
# PepHit(C) = Peptide containing cysteine
|
36
|
-
# # Total PepHit(C) # Observed Bad Pep (C)
|
37
|
-
# ------------------ proportional_to ----------------------
|
38
|
-
# # Total PepHit # Total Bad PepHit (X)
|
39
|
-
# Estimates a false positive rate from the cysteine-containing peptide count.
#
# ac_num_with_cys::        observed number of peptides containing cysteine
# exp_num_with_cys::       expected number of peptides containing cysteine
# total_peptides::         total number of peptides considered
# mean_fraction_true_cys:: optional; when given, exp_num_with_cys times this
#                          fraction is subtracted from the observed count
#                          (clamped at 0.0) before scaling
# std_fraction_true_cys::  accepted but not used by this method
#
# Returns [fpr, total_number_false].
def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  # the number of bona fide BAD cysteine hits
  bad_cys_hits = ac_num_with_cys
  if mean_fraction_true_cys
    bad_cys_hits = bad_cys_hits - (exp_num_with_cys * mean_fraction_true_cys)
  end
  bad_cys_hits = 0.0 if bad_cys_hits < 0.0

  total_number_false = (bad_cys_hits * total_peptides).to_f / exp_num_with_cys
  [total_number_false / total_peptides, total_number_false]
end
|
50
|
-
############################################################################
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
# Command-line arguments: the two frequencies arrive as strings and are
# converted to floats; +file+ is the path of the XML file to scan.
(cysteine_background_freq, background_freq, file) = ARGV
cysteine_background_freq = cysteine_background_freq.to_f
background_freq = background_freq.to_f

# seq_probs collects [peptide_sequence, probability] pairs in file order.
# A probability line is attached to the most recently seen search hit.
seq_probs = []
last_seq_prob = nil
File.open(file) do |fh|
  fh.each do |line|
    if line =~ pep_seq_re
      last_seq_prob = [$1, nil]
      seq_probs << last_seq_prob
    elsif line =~ pep_prob_re && last_seq_prob
      # guard: a probability seen before any search hit has nothing to attach to
      last_seq_prob[1] = $1.to_f
    end
  end
end

# Hits that never received a probability would make the later sort_by on the
# probability raise (nil is not comparable with Float), so drop them loudly.
incomplete, seq_probs = seq_probs.partition { |(_seq, prob)| prob.nil? }
unless incomplete.empty?
  warn "skipping #{incomplete.size} search hit(s) with no peptideprophet probability"
end
|
73
|
-
|
74
|
-
#seq_probs.each do |seq|
|
75
|
-
# if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
|
76
|
-
# abort "BAD PARSING!!"
|
77
|
-
# end
|
78
|
-
#end
|
79
|
-
amino_acid_as_st = 'C'
one_minus_freq = 1.0 - cysteine_background_freq

# highest probability first
sorted = seq_probs.sort_by { |entry| entry[1] }.reverse

## running totals over the peptides traversed so far
actual_cys_containing_peps = 0
expected_cys_containing_peps = 0.0
current_sum_one_minus_prob = 0.0

## tabulate: one value per traversed peptide in each series
pep_cnts, probs, prob_fprs, prob_tps, cys_fprs, cys_tps, fpr_diff = Array.new(7) { [] }

sorted.each_with_index do |(pep, prob), idx|
  pep_cnt = idx + 1

  ## Cysteine FPR: ##
  # Expected: chance that a peptide of this length contains the residue
  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
  # Actual:
  actual_cys_containing_peps += 1 if pep.include?(amino_acid_as_st)
  cys_fpr, total_num_false_by_cys =
    fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)

  ## FPR by prob: ##
  # SUM(1-probX)/#peps
  current_sum_one_minus_prob += 1.0 - prob
  prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt

  ## GRAB the data:
  pep_cnts << pep_cnt
  probs << prob
  prob_fprs << prob_estimated_fpr
  prob_tps << (pep_cnt.to_f - current_sum_one_minus_prob)
  cys_fprs << cys_fpr
  cys_tps << (pep_cnt.to_f - total_num_false_by_cys)
  fpr_diff << (prob_estimated_fpr - cys_fpr)
end
|
135
|
-
|
136
|
-
# Series name => values, looked up by plot() through the names listed in cats.
hash = {
  'pep_cnts'  => pep_cnts,
  'probs'     => probs,
  'prob_fprs' => prob_fprs,
  'prob_tps'  => prob_tps,
  'cys_fprs'  => cys_fprs,
  'cys_tps'   => cys_tps,
  'fpr_diff'  => fpr_diff,
}

real_base = file.sub(/\.xml/,'')

# One row per output file: [file suffix, title, x axis, y axis, series pairs].
# Each is written to "<real_base>.<suffix>.to_plot", in this order.
[
  ["tps_vs_fpr",
   "Peptide Prophet FPR Estimation (bg: #{background_freq})",
   "TPs", "FPR",
   [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]],
  ["num_pep_hits_vs_fpr",
   "Peptide Prophet FPR Estimation (bg: #{background_freq})",
   "num peptide hits", "FPR",
   [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]],
  ["num_pep_hits_vs_fpr_diff",
   "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})",
   "num peptide hits", "FPR diff (prob - cysteine)",
   [['pep_cnts', 'fpr_diff']]],
  ["prob_vs_fpr_diff",
   "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})",
   "peptide probability", "FPR diff (prob - cysteine)",
   [['probs', 'fpr_diff']]],
].each do |suffix, title, xaxis, yaxis, cats|
  base = "#{real_base}.#{suffix}"
  plot("#{base}.to_plot", base, title, xaxis, yaxis, hash, cats)
end
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
=begin
|
194
|
-
|
195
|
-
returns [number_of_prots, actual_fpr]
|
196
|
-
def num_prots_above_fpr(prots, desired_fpr)
|
197
|
-
current_fpr_rate_percent = 0.0
|
198
|
-
previous_fpr_rate_percent = 0.0
|
199
|
-
current_sum_one_minus_prob = 0.0
|
200
|
-
proteins_within_fpr = 0
|
201
|
-
actual_fpr = nil
|
202
|
-
already_found = false
|
203
|
-
prot_cnt = 0
|
204
|
-
prots.each do |prot|
|
205
|
-
prot_cnt += 1
|
206
|
-
# SUM(1-probX)/#prots
|
207
|
-
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
208
|
-
current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
209
|
-
|
210
|
-
if current_fpr_rate_percent > desired_fpr && !already_found
|
211
|
-
actual_fpr = previous_fpr_rate_percent
|
212
|
-
proteins_within_fpr = prot_cnt
|
213
|
-
already_found = true
|
214
|
-
end
|
215
|
-
previous_fpr_rate_percent = current_fpr_rate_percent
|
216
|
-
end
|
217
|
-
[proteins_within_fpr, actual_fpr]
|
218
|
-
end
|
219
|
-
|
220
|
-
=end
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
@@ -1,56 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'table'
|
5
|
-
|
6
|
-
require 'ms/gradient_program'
|
7
|
-
|
8
|
-
# Output settings, adjusted by the -d option below.
delimiter = "\t"
table_format = false

opts = OptionParser.new do |op|
  op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
  op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |choice|
    case choice
    when 'space'  then delimiter = " "
    when 'tab'    then delimiter = "\t"
    when 'format' then table_format = true
    else abort "don't recognize #{choice}"
    end
  end
end

opts.parse!

# No input files: show the option summary and stop.
if ARGV.empty?
  puts opts
  exit
end

# Read every file first (keyed by path), then print.
sets_of_tables = {}
ARGV.each do |path|
  sets_of_tables[path] = File.open(path) { |fh| GradientProgram.all_from_handle(fh) }
end

sets_of_tables.each do |path, programs|
  puts "FILE: #{path}"
  programs.each do |gp|
    puts "PUMP_TYPE: #{gp.pump_type}"
    col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]
    # one row per time point: time, the percentages, then the flow rate
    rows = gp.time_points.map { |tp| [tp.time, *tp.percentages, tp.flow_rate] }
    table = Table.new(rows, nil, col_labels)
    puts(table_format ? table.to_formatted_string : table.to_s(delimiter))
  end
end
|
@@ -1,137 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'vec'
|
4
|
-
|
5
|
-
# For the SCer yeast db and orbi mudpit7, the mean_actual_vs_expected fraction
|
6
|
-
# is 0.0101409563168847
|
7
|
-
|
8
|
-
# <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
|
9
|
-
|
10
|
-
# Writes a plot description file to +base_toplot+ and then runs the external
# `plot.rb` command on it.
#
# The file starts with the literal line 'XYData', followed by +base+, +title+,
# +xaxis+ and +yaxis+ (one per line).  Then, for every array of keys in +cats+,
# it writes the keys joined by " & " and, for each key, the values of
# hash[key] joined by single spaces.
#
# Returns the result of Kernel#system (true, false, or nil when the command
# could not be executed).
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
  File.open(base_toplot, "w") do |fh|
    fh.puts 'XYData'
    fh.puts base
    fh.puts title
    fh.puts xaxis
    fh.puts yaxis
    cats.each do |ar|
      fh.puts ar.join(" & ")
      ar.each do |a|
        fh.puts hash[a].join(" ")
      end
    end
  end
  # Pass the arguments separately so the path reaches plot.rb as ONE argument:
  # interpolating it into a single command string splits paths containing
  # spaces and lets shell metacharacters in the file name be interpreted.
  system "plot.rb", "-w", "lp", "--eps_png", "--noenhanced", base_toplot
end
|
26
|
-
|
27
|
-
# Captures, per <peptide> line: (1) sequence, (2) initial probability,
# (3) nsp adjusted probability.  Note the charge must be a single digit.
peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o

abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml" unless ARGV.size == 2

(cysteine_background_freq, file) = ARGV

# each pep = [nsp_prob, init_prob, SEQUENCE]
peps = []
File.open(file) do |fh|
  fh.each do |line|
    peps << [$3.to_f, $2.to_f, $1] if line =~ peptide_re
  end
end

amino_acid_as_st = 'C'
one_minus_freq = 1.0 - cysteine_background_freq.to_f

# running totals over the peptides traversed so far
actual_cys_containing_peps = 0
expected_cys_containing_peps = 0.0

# series gathered for plotting (one value per traversed peptide)
the_probs = []
the_fractions = []
special_probs = []

# Rank by the combined score (3 * nsp_prob) + init_prob, highest first.
ranked = peps.sort_by { |nsp_prob, init_prob, _seq| (3.0 * nsp_prob) + init_prob }.reverse
ranked.each do |nsp_prob, init_prob, pep|
  ## Cysteine counts: ##
  # Expected: chance that a peptide of this length contains the residue
  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
  # Actual:
  actual_cys_containing_peps += 1 if pep.include?(amino_acid_as_st)

  # gather data to plot
  the_probs << nsp_prob
  special_probs << ((3.0 * nsp_prob) + init_prob)
  the_fractions << (actual_cys_containing_peps.to_f / expected_cys_containing_peps)
end

# Series name => values, looked up by plot() through the names listed in cats.
hash = {
  'probs' => the_probs,
  'fractions' => the_fractions,
  'special_probs' => special_probs,
}

real_base = file.sub(/\.xml/,'')
|
99
|
-
|
100
|
-
|
101
|
-
=begin
|
102
|
-
## PROB VS FPR DIFF
|
103
|
-
base = real_base.dup
|
104
|
-
base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
|
105
|
-
base_toplot = base + '.to_plot'
|
106
|
-
title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
|
107
|
-
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
108
|
-
yaxis = "fraction with cysteines (actual/expected)"
|
109
|
-
cats = [['probs', 'fractions']]
|
110
|
-
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
111
|
-
=end
|
112
|
-
|
113
|
-
|
114
|
-
=begin
|
115
|
-
## PROB VS FPR DIFF
|
116
|
-
base = real_base.dup
|
117
|
-
base << "." << "prob_vs_actual_expected_fraction"
|
118
|
-
base_toplot = base + '.to_plot'
|
119
|
-
title = "peptide prob vs fraction with cysteines (actual/expected)"
|
120
|
-
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
121
|
-
yaxis = "fraction with cysteines (actual/expected)"
|
122
|
-
cats = [['probs', 'fractions']]
|
123
|
-
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
124
|
-
=end
|
125
|
-
|
126
|
-
## SPECIAL PROB VS FPR DIFF
|
127
|
-
# Written to "<real_base>.special_prob_vs_actual_expected_fraction.to_plot".
base = "#{real_base}.special_prob_vs_actual_expected_fraction"
plot(
  "#{base}.to_plot",
  base,
  "peptide prob (special) vs fraction with cysteines (actual/expected)",
  "(3 * nsp_prob) + init_prob",
  "fraction with cysteines (actual/expected)",
  hash,
  [['special_probs', 'fractions']]
)
|
135
|
-
|
136
|
-
|
137
|
-
|
@@ -1,136 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
# Here is what I plotted: take each id'd pep-prot id'd on the tophit scans -- you will likely have the same pep-prot id'd on multiple scans -- plot the top probability of each such pep-prot.
|
4
|
-
# There are 43 such id'd peptides for Sashimi, whereas SEQUEST id's about 66. So you'll have 66 (1-p-values) to plot, I had 43. Similarly for OMICS.
|
5
|
-
|
6
|
-
require 'spec_id'
|
7
|
-
require 'fasta'
|
8
|
-
require 'optparse'
|
9
|
-
require 'ostruct'
|
10
|
-
|
11
|
-
# returns an accession number if available, or the entire reference (less the
|
12
|
-
# starting '>'
|
13
|
-
# Returns a copy of the first ACC_REGEX capture found in the protein's header,
# or, when the header does not match, the header with a leading '>' removed
# and trailing whitespace stripped.
def get_fasta_accession(fasta_prot)
  header = fasta_prot.header
  found = ACC_REGEX.match(header)
  return found[1].dup if found
  header.sub(/^>/, '').rstrip
end
|
21
|
-
|
22
|
-
# returns the accession number from a reference, or the complete reference
|
23
|
-
# Returns a copy of the first ACC_REGEX capture found in pep.prot.reference,
# or the reference with trailing whitespace stripped when it does not match.
def accession_from_ref(pep)
  reference = pep.prot.reference
  found = ACC_REGEX.match(reference)
  found ? found[1].dup : reference.rstrip
end
|
31
|
-
|
32
|
-
# Returns pep.prot.accession, unless it is nil/false, the string '0' or the
# number 0, in which case the accession is derived via accession_from_ref.
def get_pep_prot_accession(pep)
  accession = pep.prot.accession
  usable = accession && !(accession == '0' || accession == 0)
  usable ? accession : accession_from_ref(pep)
end
|
40
|
-
|
41
|
-
#####################################################################
|
42
|
-
# MAIN
|
43
|
-
#####################################################################
|
44
|
-
|
45
|
-
opt = OpenStruct.new
opt.p = 'prob'
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
  op.separator " [prints to stdout tab delimited table]"
  op.on('-t', '--ties', 'allow ties on best hit') { |flag| opt.t = flag }
  op.on('-p', '--param <s>', 'param: (xcorr | prob)') { |name| opt.p = name }
end
opts.parse!

if ARGV.size < 2
  puts opts
  exit
end

# param: the peptide attribute used as the score.
# best:  which end of an ascending sort holds the best score.
param, best =
  case opt.p
  when 'prob'  then [:peptide_probability, :first]
  when 'xcorr' then [:xcorr, :last]
  else abort "incorrect param: #{opt.p}"
  end

############################
# GLOBALS
DELIM = "\t"
ACC_REGEX = /\|(.*?)\|/o
############################

bioworks, fasta_file = ARGV
score_of = lambda { |pep| pep.send(param).to_f }

gi_nums = Fasta.new.read_file(fasta_file).prots.map { |prot| get_fasta_accession(prot) }
peptides = SpecID.new(bioworks).peps

## Get the best peptide(s) per scan
# A scan whose best score is shared by several peptides contributes all of
# them when --ties is given, and nothing at all otherwise.
top_peps_per_scan = []
peptides.hash_by(:base_name, :first_scan).each do |_bn_scan, scan_peps|
  ranked = scan_peps.sort_by(&score_of)
  leader = best == :first ? ranked.shift : ranked.pop
  tied = ranked.select { |pep| score_of.call(pep) == score_of.call(leader) }
  if tied.empty?
    top_peps_per_scan << leader
  elsif opt.t
    top_peps_per_scan.push(leader, *tied)
  end
end

## Get the best scoring peptide per peptide/prot from list of best
## peptides/scan
by_seq_and_prot = top_peps_per_scan.hash_by { |pep| [pep.sequence, get_pep_prot_accession(pep)] }
top_pep_seq_prots = by_seq_and_prot.map { |_key, group| group.sort_by(&score_of).send(best) }

## sort the peptides by best score
sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by(&score_of)
sorted_top_pep_seq_prots.reverse! if best == :last

## print the score against the running number of tps
puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
tps = 0
sorted_top_pep_seq_prots.each do |pep|
  accession = get_pep_prot_accession(pep)
  next unless gi_nums.include?(accession)
  tps += 1
  puts [tps.to_s, pep.send(param), pep.sequence, accession, pep.xcorr].join(DELIM)
end
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
@@ -1,44 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'rexml/document'
|
4
|
-
|
5
|
-
# No input files: print usage and stop.
if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ...",
       "outputs a .csv file"
  exit
end
|
10
|
-
|
11
|
-
# Plain record with three read/write attributes: name, pi and ni.
class Protein
  attr_accessor :name, :pi, :ni

  def initialize(name, pi, ni)
    @name = name
    @pi = pi
    @ni = ni
  end
end
|
17
|
-
|
18
|
-
# Collects a Protein for every "protein" start tag it is notified of.
# Any other message sent to it is accepted and ignored.
class Listener
  attr_accessor :proteins

  def initialize
    @proteins = []
  end

  # For a "protein" tag, builds a Protein from the 'protein_name',
  # 'probability' (as float) and 'total_number_peptides' (as integer)
  # attributes and appends it to +proteins+.  Other tags are ignored.
  def tag_start(name, attrs)
    return unless name == "protein"
    @proteins << Protein.new(attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
  end

  # swallow every other callback
  def method_missing(*args) ; end
end
|
35
|
-
|
36
|
-
# Parses every input file and writes one tab-delimited line per protein
# (name, pi, ni) to output.csv, sorted descending by [pi, ni, name] within
# each input file.
#
# The output file is opened once, outside the loop: reopening it with 'w' for
# every input truncated it each time, so only the last file's rows survived.
File.open("output.csv", 'w') do |out|
  ARGV.each do |file|
    listener = Listener.new
    # block form so the input handle is closed once parsing finishes
    File.open(file) { |xml| REXML::Document.parse_stream(xml, listener) }
    listener.proteins.sort_by { |prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
      out.puts [protein.name, protein.pi, protein.ni].join("\t")
    end
  end
end
|
data/script/histogram_probs.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'vec'
|
4
|
-
require 'spec_id'
|
5
|
-
require 'optparse'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'set'
|
8
|
-
|
9
|
-
|
10
|
-
# Option defaults:
#   opt.p - list of false-positive protein header prefixes
#   opt.b - number of histogram bins
opt = OpenStruct.new
opt.p = ["INV_"]
opt.b = 50
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} [-d -b bins -p prefix[,...]] file ..."
  op.on_head "\noutputs 'histogram.toplot'\n(then) % plot.rb -w lp --yrange n1: --noenhanced histogram.toplot\n"
  op.on("-p", "--prefix PREFIX", "(comma sep list) FP protein header prefix (def: #{opt.p})") {|v| opt.p = v.split(',')}
  op.on("-b", "--bins NUM_BINS", "number of histogram bins (def: #{opt.b})") {|v| opt.b = v.to_i}
  # -d is a plain flag (v is true).  It used to be a copy of the -b handler,
  # which overwrote the bin count and raised on true.to_i.
  # NOTE(review): nothing visible in this script reads opt.d yet.
  op.on("-d", "--diff", "plots TP - FP") {|v| opt.d = v}
end
opts.parse!

# No input files: show usage and stop instead of writing an empty histogram.
if ARGV.size < 1
  puts opts
  exit
end
|
25
|
-
|
26
|
-
outfile = 'histogram.toplot'
dtype = 'XYData'
outfile_base = 'histogram'
title = 'histogram of protein probabilities'
xaxis = 'probability'
yaxis = 'frequency'

files = ARGV.to_a

# Block form so the output handle is closed even if a file fails to load.
File.open(outfile, "w") do |out|
  [dtype, outfile_base, title, xaxis, yaxis].each do |header_line|
    out.puts header_line
  end

  files.each_with_index do |file, i|
    fp = VecD.new; tp = VecD.new
    bio = SpecID.new(file)
    # Prefixes pair up with files by position.  When there are more files than
    # prefixes, reuse the last prefix: a nil prefix turns the pattern into /^/,
    # which matches every protein and counts them all as false positives.
    prefix = opt.p[i] || opt.p.last
    re = /^#{prefix}/
    # proteins whose reference starts with the prefix are false positives;
    # both groups collect log10 of the protein probability
    bio.prots.each do |prot|
      if prot.reference =~ re
        fp << Math.log10(prot.probability)
      else
        tp << Math.log10(prot.probability)
      end
    end
    if fp.size == 0 then puts "NO FALSE POSITIVES FOUND! Your prefix is probably wrong ;)" end
    label = file
    t_bin, t_freq = tp.histogram(opt.b)
    f_bin, f_freq = fp.histogram(opt.b)
    out.puts 'TP ' + label
    out.puts t_bin.to_s
    out.puts t_freq.to_s
    out.puts 'FP ' + label
    out.puts f_bin.to_s
    out.puts f_freq.to_s
  end
end
|