mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/sample_enzyme.rb
DELETED
@@ -1,160 +0,0 @@
|
|
1
|
-
|
2
|
-
module SpecIDXML; end
|
3
|
-
|
4
|
-
require 'strscan'
|
5
|
-
|
6
|
-
require 'spec_id_xml'
|
7
|
-
require 'spec_id'
|
8
|
-
|
9
|
-
|
10
|
-
# Describes a proteolytic enzyme by the residues it cleaves after (+cut+),
# the residues that block cleavage when they follow a cut site (+no_cut+),
# and the terminal +sense+ ('C' or 'N').  Only C-terminal sense is
# implemented by the digestion and counting methods.
class SampleEnzyme
  include SpecIDXML

  attr_accessor :name
  # amino acids after which to cleave
  attr_accessor :cut
  # cleave at 'cut' amino acids UNLESS it is followed by 'no_cut'
  attr_accessor :no_cut
  # 'C' or 'N'
  attr_accessor :sense

  # Currently, recognize:
  #   trypsin
  # For other enzymes, you must set :cut, :no_cut, :name, and :sense
  # (the new object is yielded if a block is given so they can be set there).
  #
  # An unrecognized +name+ is simply stored; it no longer raises
  # NoMethodError, so a custom enzyme can be named up front and configured
  # in the block.
  def initialize(name=nil)
    @num_missed_cleavages_regex = nil
    @num_missed_cleavages_key = nil
    @sense = nil
    @cut = nil
    @no_cut = nil
    @name = name
    if @name
      # set the values if we recognize this name
      setter = "set_#{@name}"
      send(setter) if respond_to?(setter, true)
    end
    yield(self) if block_given?
  end

  # Configures this enzyme as trypsin: cleaves C-terminal to K or R unless
  # the next residue is P.
  def set_trypsin
    @sense = 'C'
    @cut = 'KR'
    @no_cut = 'P'
  end

  # Serializes via the element_xml / short_element_xml helpers (expected to
  # come from SpecIDXML): a sample_enzyme element carrying :name that wraps
  # a specificity element carrying :cut, :no_cut and :sense.
  def to_pepxml
    element_xml(:sample_enzyme, [:name]) do
      short_element_xml(:specificity, [:cut, :no_cut, :sense])
    end
  end

  # Fills name/cut/no_cut/sense from +node+ ('name' attribute) and its
  # first child ('cut', 'no_cut', 'sense' attributes).
  # returns self
  def from_pepxml_node(node)
    self.name = node['name']
    ch = node.child
    self.cut = ch['cut']
    self.no_cut = ch['no_cut']
    self.sense = ch['sense']
    self
  end

  # Builds a new enzyme from a pepxml node (see #from_pepxml_node).
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Convenience: tryptic digest of +string+ (see #digest).
  def self.tryptic(string, missed_cleavages=0)
    self.new("trypsin").digest(string, missed_cleavages)
  end

  # takes an amino acid sequence (e.g., -.PEPTIDK.L)
  # returns the number of missed cleavages
  #
  # Every cut residue that is followed by a residue not listed in no_cut
  # counts as one missed cleavage.  The following residue is checked with a
  # lookahead rather than consumed, so adjacent cut residues ("KKA" => 2)
  # are each counted.  A cut residue in the final position is not counted.
  #
  # Raises NotImplementedError for N-terminal sense and ArgumentError when
  # no cut residues are defined.
  def num_missed_cleavages(aaseq)
    raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
    regex = missed_cleavage_regex
    num = aaseq.scan(regex).size
    # only reachable when no_cut is empty: a terminal cut residue is the
    # peptide's own cleavage site, not a missed one
    num -= 1 if aaseq[-1,1] =~ regex
    num
  end

  # requires full sequence (with heads and tails)
  #
  # Returns 0, 1 or 2: the number of peptide termini consistent with this
  # enzyme's specificity.  A flanking '-' always counts as consistent.
  # The sequence is split with SpecID::Pep.split_sequence into
  # [residue before, peptide, residue after].
  def num_tol_term(sequence)
    raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
    no_cut = @no_cut || ''
    first, middle, last = SpecID::Pep.split_sequence(sequence)
    last_of_middle = middle[-1,1]
    first_of_middle = middle[0,1]
    num_tol = 0
    # N-terminal side: preceded by a cut residue and not blocked by our first residue
    num_tol += 1 if (@cut.include?(first) && !no_cut.include?(first_of_middle)) || first == '-'
    # C-terminal side: ends in a cut residue and not blocked by the following residue
    num_tol += 1 if (@cut.include?(last_of_middle) && !no_cut.include?(last)) || last == '-'
    num_tol
  end

  # returns all peptides of missed cleavages <= 'missed_cleavages'
  # so 2 missed cleavages will return all no missed cleavage peptides
  # all 1 missed cleavages and all 2 missed cleavages.
  # Peptides with the most missed cleavages come first.
  #
  # options: currently unused (kept for interface compatibility).
  #
  # An empty or nil no_cut means "always cleave after a cut residue";
  # previously that raised RegexpError (empty char-class).  no_cut is
  # treated as a literal list of residues, as in #num_tol_term.
  #
  # Raises NotImplementedError for N-terminal sense and ArgumentError when
  # no cut residues are defined.
  def digest(string, missed_cleavages=0, options={})
    raise NotImplementedError if @sense == 'N'
    cut_regex = Regexp.new(cut_char_class)
    blockers = @no_cut.to_s
    scanner = StringScanner.new(string)
    peps = []
    current_pep = ''.dup
    until scanner.eos?
      m = scanner.scan_until(cut_regex)
      if m ## found a cut point
        current_pep << m
        # is the next amino acid a no_cut?  (nil at end of string)
        next_aa = scanner.check(/./m)
        unless next_aa && blockers.include?(next_aa)
          # cut it
          peps << current_pep
          current_pep = ''.dup
        end
      else ## didn't find a cut point: the remainder is the last peptide
        current_pep << scanner.rest
        peps << current_pep
        break
      end
    end
    ## grab each set of missed cleavages from num down to 0
    missed_cleavages.downto(0).flat_map do |num_mc|
      get_missed_cleavages(peps, num_mc)
    end
  end

  # takes an array of peptides and returns an array containing 'num' missed
  # cleavages
  # DOES NOT contain peptides that contain < num of missed cleavages
  # (i.e., will not return missed cleavages of 1 or 2 if num == 3)
  def get_missed_cleavages(ar_of_peptide_seqs, num)
    (0...(ar_of_peptide_seqs.size - num)).map do |i|
      ar_of_peptide_seqs[i,num+1].join
    end
  end

  private

  # Regex character class matching any single cut residue, e.g. "[KR]".
  def cut_char_class
    residues = @cut.to_s
    raise ArgumentError, "enzyme has no 'cut' residues defined" if residues.empty?
    "[#{Regexp.escape(residues)}]"
  end

  # Memoized regex for #num_missed_cleavages.  The cache is keyed on the
  # current cut/no_cut values so it is rebuilt after either one changes.
  def missed_cleavage_regex
    key = "#{@cut}\0#{@no_cut}"
    unless @num_missed_cleavages_regex && @num_missed_cleavages_key == key
      pattern = cut_char_class
      blockers = @no_cut.to_s
      pattern += "(?=[^#{Regexp.escape(blockers)}])" unless blockers.empty?
      @num_missed_cleavages_regex = Regexp.new(pattern)
      @num_missed_cleavages_key = key
    end
    @num_missed_cleavages_regex
  end
end
|
data/lib/scan_i.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
|
2
|
-
# http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
|
3
|
-
class String
  # Returns the start indices of every non-overlapping occurrence of the
  # string +seq+ in self, searching left to right.
  #
  #   'abcabc'.scan_i('bc')  # => [1, 4]
  #   'aaaa'.scan_i('aa')    # => [0, 2]
  #
  # An empty +seq+ matches at every position (0..length), mirroring
  # String#scan; previously it looped forever because the search position
  # never advanced.
  def scan_i seq
    # always move forward by at least one character
    step = [seq.length, 1].max
    hits = []
    pos = 0
    while (i = index(seq, pos))
      hits << i
      pos = i + step
    end
    hits
  end

  #def scan_enum seq
  #  self.enum_for(:scan, seq).map do
  #    $~.offset(0)[0]
  #  end
  #end
end
|
21
|
-
|
data/lib/spec_id/aa_freqs.rb
DELETED
@@ -1,170 +0,0 @@
|
|
1
|
-
require 'fasta'
|
2
|
-
|
3
|
-
module SpecID ; end

# Amino acid frequency statistics: observed residue frequencies and the
# expected number of peptides containing a given residue.
class SpecID::AAFreqs
  # hash by capital one-letter amino acid symbols giving the frequency of
  # seeing that amino acid.  Frequencies should add to 1.
  attr_accessor :aafreqs

  # fasta is fasta object! (only +fasta.prots+ is used here)
  # Without an argument, aafreqs is left unset.
  def initialize(fasta=nil)
    @aafreqs = calculate_frequencies(fasta.prots) if fasta
  end

  # takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
  #
  # The result is keyed by Symbol and always has entries for :A..:Z and
  # :*.  Any other character met in a sequence gets its own entry instead
  # of raising.  With no residues at all every frequency is 0.0 rather
  # than NaN.
  def calculate_frequencies(objs)
    counts = {}
    (('A'..'Z').to_a << '*').each { |aa| counts[aa] = 0 }
    total_aas = 0
    objs.each do |obj|
      aaseq = obj.aaseq
      total_aas += aaseq.size
      aaseq.each_char { |aa| counts[aa] = counts.fetch(aa, 0) + 1 }
    end
    # normalize by total amount and key by symbol.  A fresh hash is built
    # because adding keys to a hash while iterating over it raises.
    freqs = {}
    counts.each do |aa, count|
      freqs[aa.to_sym] = total_aas.zero? ? 0.0 : count.to_f / total_aas
    end
    freqs
  end

  # The expected probability for seeing that amino acid in a given length.
  # This calculates a lookup table (array) from 0 to max_length of the
  # probability of seeing at least one amino acid (given its frequency, where
  # frequency is from 0 to 1)
  def self.probability_of_length_table(frequency, max_length)
    one_minus_freq = 1.0 - frequency.to_f
    (0..max_length).map { |len| 1.0 - (one_minus_freq**len) }
  end

  # takes an array of peptide strings
  # gives the actual number of peptides with at least one
  # gives the expected number of peptides given the frequency of
  # amino_acid in @aafreqs and each peptide's length.
  # currently ONLY takes at_least = 1
  # depends on @aafreqs
  # returns two numbers in array [actual, expected]
  # expected is a Float!!!
  #
  # Raises NotImplementedError if at_least > 1 and ArgumentError if no
  # frequency is known for amino_acid.
  def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
    if at_least > 1
      raise NotImplementedError, "can only do at_least=1 right now!"
    end
    freq = @aafreqs && @aafreqs[amino_acid.to_sym]
    raise ArgumentError, "no frequency known for amino acid #{amino_acid}" if freq.nil?
    one_minus_freq = 1.0 - freq
    amino_acid_as_st = amino_acid.to_s
    actual = 0
    expected = 0.0
    peptide_aaseqs.each do |pep|
      # P(at least one in n residues) = 1 - (1 - freq)^n
      expected += (1.0 - (one_minus_freq**pep.size))
      actual += 1 if pep.include?(amino_acid_as_st)
    end
    [actual, expected]
  end

  # pep_objs is an enumerable of objects responding to :aaseq
  # also takes a hash of peptides keyed on :aaseq (the keys are used)
  # NOTE: overwrites @aafreqs[:C] with cyst_freq.
  def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
    seqs =
      if pep_objs.is_a? Hash
        pep_objs.keys
      else
        pep_objs.map { |v| v.aaseq }
      end
    @aafreqs ||= {}
    @aafreqs[:C] = cyst_freq
    actual_and_expected_number(seqs, :C, 1)
  end

  ##
  # Legacy Perl reference implementation, kept verbatim for comparison.
=begin

    foreach my $pep (@$peps) {
      unless ($pep->prob() >= $prob_cutoff) {next;}
      my %freq = ();
      my $aa = $pep->AA_sequence();
      my $len = length($aa);

      ## EXPECTED probability for each length
      for (my $i = 0; $i < 20; $i++) {
        ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
        $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
      }
      ## FILTER any peptides we've already seen
      if ($seen{$aa}) { next; }
      else { $seen{$aa}++; }

      ## Fill in these values with zeroes:
      for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }

      ## get the frequencies for each AA in each peptide:
      for (my $i = 0; $i < $len; $i++) {
        my $let = substr($aa, $i, 1);
        $tot_freq{$let}++;
        $pepc[$cnt][$an{$let}]++;
      }
      $cnt++;
    }

    ##############################################################
    # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
    ##############################################################

    ## What is the percentage of peptides containing at least 1 cysteine?
    my $atleast = 1;

    my @has;
    ## initialize
    for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
    my $tot = scalar(@pepc);
    foreach my $pep (@pepc) {
      for (my $index = 0; $index < 20; $index++) {
        if ($pep->[$index] >= $atleast) {
          $has[$index]++;
        }
      }
    }


    my @exp_sum = (); ## The total number of peptides I'd expect
    ## WE simply add up the peptides' probabilities
    ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
    foreach my $pep (@expected) {
      for (my $i = 0; $i < 20; $i++) {
        $exp_sum[$i] += $pep->[$i];
      }
    }

    my @obs = map { $_/$tot } @has;
    my @exp = map { $_/$tot } @exp_sum;
    print STDERR "*********************************************\n";
    print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
    print "[AA] [Observed] [Predicted]\n";
    for (my $i = 0; $i < 20; $i++) {
      print "$AA[$i] $obs[$i] $exp[$i]\n";
    }
    print STDERR "*********************************************\n";



=end

end
|