mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/sample_enzyme.rb
DELETED
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
module SpecIDXML; end
|
|
3
|
-
|
|
4
|
-
require 'strscan'
|
|
5
|
-
|
|
6
|
-
require 'spec_id_xml'
|
|
7
|
-
require 'spec_id'
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
# Describes a proteolytic enzyme (which residues it cleaves at, which
# following residues block cleavage, and on which side it cuts) and offers
# helpers for in-silico digestion and for counting missed cleavages.
class SampleEnzyme
  include SpecIDXML

  attr_accessor :name
  # amino acids after which to cleave
  attr_accessor :cut
  # cleave at 'cut' amino acids UNLESS it is followed by 'no_cut'
  attr_accessor :no_cut
  # 'C' or 'N'
  attr_accessor :sense

  # Currently, recognize:
  #   trypsin
  # For other enzymes, you must set :cut, :no_cut, :name, and :sense
  # (through the accessors or through the block, which is yielded self).
  #
  # An unrecognized name is kept as-is and simply leaves the other
  # attributes unset, so that they can be filled in afterwards.
  def initialize(name=nil)
    @num_missed_cleavages_regex = nil
    @num_missed_cleavages_regex_key = nil
    @sense = nil
    @cut = nil
    @no_cut = nil
    @name = name
    if @name
      # set the values if (and only if) we recognize this name
      setter = "set_#{@name}"
      send(setter) if respond_to?(setter, true)
    end
    yield(self) if block_given?
  end

  # Configures the object as trypsin: C-terminal cleavage after K or R,
  # unless followed by P.
  def set_trypsin
    @sense = 'C'
    @cut = 'KR'
    @no_cut = 'P'
  end

  # Serializes as a sample_enzyme element containing a specificity element.
  # NOTE(review): element_xml / short_element_xml are not defined in this
  # file; presumably they come from the SpecIDXML mixin -- confirm.
  def to_pepxml
    element_xml(:sample_enzyme, [:name]) do
      short_element_xml(:specificity, [:cut, :no_cut, :sense])
    end
  end

  # Fills name, cut, no_cut and sense from +node+ (read via node['name'])
  # and from its child (read via ['cut'], ['no_cut'], ['sense']).
  # returns self
  def from_pepxml_node(node)
    self.name = node['name']
    ch = node.child
    self.cut = ch['cut']
    self.no_cut = ch['no_cut']
    self.sense = ch['sense']
    self
  end

  # Builds a new object from a pepxml node (see #from_pepxml_node).
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # takes an amino acid sequence
  # returns the number of missed cleavages, i.e. the number of 'cut'
  # residues that are followed by another residue which is not a 'no_cut'
  # residue.  A 'cut' residue in the very last position is never counted.
  #
  # Adjacent sites are counted individually ("KKA" has two for trypsin),
  # which agrees with what #digest produces.
  #
  # NOTE(review): the original comment gave '-.PEPTIDK.L' as an example
  # input, but every character (including '.') is treated as a residue
  # here, so a bare sequence appears to be expected -- confirm with callers.
  #
  # Raises NotImplementedError for N terminal sense.
  def num_missed_cleavages(aaseq)
    raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
    aaseq.scan(missed_cleavage_regex).size
  end

  # requires full sequence (with heads and tails)
  # Returns how many of the two peptide termini (0, 1 or 2) are consistent
  # with this enzyme's specificity; a '-' flank always counts.
  # NOTE(review): relies on SpecID::Pep.split_sequence (defined elsewhere)
  # returning [previous residue, peptide, next residue] -- confirm.
  #
  # Raises NotImplementedError for N terminal sense.
  def num_tol_term(sequence)
    raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
    no_cut = @no_cut || ''
    num_tol = 0
    first, middle, last = SpecID::Pep.split_sequence(sequence)
    last_of_middle = middle[-1,1]
    first_of_middle = middle[0,1]
    if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
      num_tol += 1
    end
    if ( @cut.include?(last_of_middle) && !no_cut.include?(last) ) || last == '-'
      num_tol += 1
    end
    num_tol
  end

  # returns all peptides of missed cleavages <= 'missed_cleavages'
  # so 2 missed cleavages will return all no missed cleavage peptides
  # all 1 missed cleavages and all 2 missed cleavages.
  # Peptides with the most missed cleavages come first in the result.
  #
  # A nil or empty 'no_cut' means that nothing blocks cleavage.
  #
  # options: currently unused
  #
  # Raises NotImplementedError for N terminal sense.
  def digest(string, missed_cleavages=0, options={})
    raise NotImplementedError if @sense == 'N'
    scanner = StringScanner.new(string)
    cut_regex = Regexp.new("[#{@cut}]")
    blockers = @no_cut.to_s
    # an empty character class is not a valid regex, so only build it when
    # there really are blocking residues
    no_cut_regex = blockers.empty? ? nil : Regexp.new("[#{blockers}]")
    peps = []
    current_pep = ''
    until scanner.eos?
      chunk = scanner.scan_until(cut_regex)
      if chunk  ## found a cut point
        current_pep << chunk
        # is the next amino acid a no_cut?  (peek/rest are used so that only
        # the scanner's own offsets are involved)
        unless no_cut_regex && scanner.peek(1) =~ no_cut_regex
          # cut it
          peps << current_pep
          current_pep = ''
        end
      else      ## didn't find a cut point: the remainder is the last peptide
        current_pep << scanner.rest
        peps << current_pep
        break
      end
    end
    ## LOOP through and grab each set of missed cleavages from num down to 0
    all_sets_of_peps = []
    missed_cleavages.downto(0) do |num_mc|
      all_sets_of_peps.concat(get_missed_cleavages(peps, num_mc))
    end
    all_sets_of_peps
  end

  # takes an array of peptides and returns an array containing 'num' missed
  # cleavages (each result is num+1 consecutive peptides joined together)
  # DOES NOT contain peptides that contain < num of missed cleavages
  # (i.e., will not return missed cleavages of 1 or 2 if num == 3)
  def get_missed_cleavages(ar_of_peptide_seqs, num)
    (0...(ar_of_peptide_seqs.size - num)).to_a.map do |i|
      ar_of_peptide_seqs[i,num+1].join
    end
  end

  # Digests +string+ with trypsin (see #digest).
  def self.tryptic(string, missed_cleavages=0)
    self.new("trypsin").digest(string, missed_cleavages)
  end

  # Returns the regex matching one missed cleavage site.  The regex is
  # cached, but rebuilt whenever 'cut' or 'no_cut' have changed since it was
  # built.  A lookahead is used so that the residue after a site is not
  # consumed and can itself be recognized as a site.
  def missed_cleavage_regex
    key = [@cut.to_s.dup, @no_cut.to_s.dup]
    unless @num_missed_cleavages_regex && @num_missed_cleavages_regex_key == key
      lookahead =
        if key[1].empty?
          '(?!\z)'               # any following residue will do
        else
          "(?=[^#{key[1]}])"     # must be followed by a non-blocking residue
        end
      @num_missed_cleavages_regex = Regexp.new("[#{key[0]}]" + lookahead)
      @num_missed_cleavages_regex_key = key
    end
    @num_missed_cleavages_regex
  end
  private :missed_cleavage_regex

end
|
data/lib/scan_i.rb
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
|
|
3
|
-
class String
  # Returns an array with the start index of every non-overlapping
  # occurrence of the string +seq+ in self, from left to right.
  #
  #   "abcabc".scan_i("bc")  # => [1, 4]
  #   "aaaa".scan_i("aa")    # => [0, 2]
  #
  # An empty +seq+ matches at every position, including the end of the
  # string ("abc".scan_i("") # => [0, 1, 2, 3]).
  def scan_i seq
    pos = 0
    ndx = []
    # always move forward by at least one character; otherwise an empty
    # +seq+ would be found at the same position forever
    step = [seq.length, 1].max
    while i = index(seq, pos)
      ndx << i
      pos = i + step
    end
    ndx
  end

  #def scan_enum seq
  #  self.enum_for(:scan, seq).map do
  #    $~.offset(0)[0]
  #  end
  #end
end
|
|
21
|
-
|
data/lib/spec_id/aa_freqs.rb
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
require 'fasta'
|
|
2
|
-
|
|
3
|
-
module SpecID ; end

# Amino acid frequencies of a set of sequences, plus helpers to compare the
# observed number of peptides containing a given amino acid against the
# number expected from its frequency.
class SpecID::AAFreqs
  # hash by capital one-letter amino acid symbols giving the frequency of
  # seeing that amino acid.  Frequencies should add to 1.
  attr_accessor :aafreqs

  # fasta is fasta object!  (it must respond to :prots)
  # With no argument, aafreqs is left unset.
  def initialize(fasta=nil)
    if fasta
      @aafreqs = calculate_frequencies(fasta.prots)
    end
  end

  # takes an enumerable of objects responding to :aaseq and creates an
  # aafreqs hash (Symbol => Float).
  #
  # 'A'..'Z' and '*' are always present (0.0 if never seen); any other
  # character found in a sequence gets its own entry.  If there are no
  # residues at all every frequency is 0.0.
  def calculate_frequencies(objs)
    counts = {}
    ('A'..'Z').each do |letter|
      counts[letter] = 0
    end
    counts['*'] = 0
    total_aas = 0
    objs.each do |obj|
      aaseq = obj.aaseq
      total_aas += aaseq.size
      aaseq.split('').each do |letter|
        # residues outside of the preset alphabet start counting from zero
        counts[letter] = (counts[letter] || 0) + 1
      end
    end
    # normalize by total amount and key by symbol.  A second hash is built
    # because keys must not be added to a hash while iterating over it.
    freqs = {}
    counts.each do |letter, cnt|
      freqs[letter.to_sym] = (total_aas == 0) ? 0.0 : (cnt.to_f / total_aas)
    end
    freqs
  end

  # The expected probability for seeing that amino acid in a given length.
  # This calculates a lookup table (array) from 0 to max_length of the
  # probability of seeing at least one amino acid (given its frequency, where
  # frequency is from 0 to 1)
  def self.probability_of_length_table(frequency, max_length)
    one_minus_freq = 1.0 - frequency.to_f
    (0..max_length).map do |len|
      1.0 - (one_minus_freq**len)
    end
  end

  # takes an array of peptide strings
  # gives the actual number of peptides with at least one
  # gives the expected number of peptides given the frequency of the amino
  # acid and the length of each peptide.
  # currently ONLY takes at_least = 1 (raises NotImplementedError otherwise)
  # depends on @aafreqs
  # returns two numbers in array [actual, expected]
  # expected is a Float!!!
  def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
    if at_least > 1
      raise NotImplementedError, "can only do at_least=1 right now!"
    end
    one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
    amino_acid_as_st = amino_acid.to_s
    actual = 0
    expected = 0.0
    peptide_aaseqs.each do |pep|
      # chance of at least one occurrence in pep.size independent draws
      expected += (1.0 - (one_minus_freq**pep.size))
      actual += 1 if pep.include?(amino_acid_as_st)
    end
    [actual, expected]
  end

  # pep_objs respond to :aaseq
  # also takes a hash of peptides keyed on aaseq
  # NOTE: overwrites the stored frequency of :C with cyst_freq (creating
  # the aafreqs hash if necessary).
  # returns [actual, expected] (see actual_and_expected_number)
  def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
    seqs =
      if pep_objs.is_a? Hash
        pep_objs.keys
      else
        pep_objs.map do |v|
          v.aaseq
        end
      end
    @aafreqs ||= {}
    @aafreqs[:C] = cyst_freq
    actual_and_expected_number(seqs, :C, 1)
  end

  ##
  ## Legacy Perl implementation, kept for reference only (never executed):
=begin

  foreach my $pep (@$peps) {
    unless ($pep->prob() >= $prob_cutoff) {next;}
    my %freq = ();
    my $aa = $pep->AA_sequence();
    my $len = length($aa);

    ## EXPECTED probability for each length
    for (my $i = 0; $i < 20; $i++) {
      ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
      $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
    }
    ## FILTER any peptides we've already seen
    if ($seen{$aa}) { next; }
    else { $seen{$aa}++; }

    ## Fill in these values with zeroes:
    for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }

    ## get the frequencies for each AA in each peptide:
    for (my $i = 0; $i < $len; $i++) {
      my $let = substr($aa, $i, 1);
      $tot_freq{$let}++;
      $pepc[$cnt][$an{$let}]++;
    }
    $cnt++;
  }

  ##############################################################
  # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
  ##############################################################

  ## What is the percentage of peptides containing at least 1 cysteine?
  my $atleast = 1;

  my @has;
  ## initialize
  for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
  my $tot = scalar(@pepc);
  foreach my $pep (@pepc) {
    for (my $index = 0; $index < 20; $index++) {
      if ($pep->[$index] >= $atleast) {
        $has[$index]++;
      }
    }
  }


  my @exp_sum = (); ## The total number of peptides I'd expect
  ## WE simply add up the peptides' probabilities
  ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
  foreach my $pep (@expected) {
    for (my $i = 0; $i < 20; $i++) {
      $exp_sum[$i] += $pep->[$i];
    }
  }

  my @obs = map { $_/$tot } @has;
  my @exp = map { $_/$tot } @exp_sum;
  print STDERR "*********************************************\n";
  print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
  print "[AA] [Observed] [Predicted]\n";
  for (my $i = 0; $i < 20; $i++) {
    print "$AA[$i] $obs[$i] $exp[$i]\n";
  }
  print STDERR "*********************************************\n";



=end

end
|