mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/lib/fasta.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'sample_enzyme'
|
1
2
|
|
2
3
|
class String
|
3
4
|
|
@@ -7,8 +8,10 @@ class String
|
|
7
8
|
end
|
8
9
|
end
|
9
10
|
|
11
|
+
# modifies and returns self
|
10
12
|
def shuffle!
|
11
13
|
each_index {|j| i = rand(size-j); self[j], self[j+i] = self[j+i], self[j]}
|
14
|
+
self
|
12
15
|
end
|
13
16
|
|
14
17
|
def shuffle
|
@@ -44,6 +47,7 @@ class Fasta
|
|
44
47
|
# Checks that the first character per line is '>' or character class [A-Za-z*]
|
45
48
|
# returns a fasta object for stringing commands
|
46
49
|
def read_file(fn)
|
50
|
+
first_char_re = /[A-Za-z*]/o
|
47
51
|
obj = nil
|
48
52
|
regex = /(\r\n)|\n/o
|
49
53
|
fh = File.new(fn).binmode
|
@@ -57,7 +61,7 @@ class Fasta
|
|
57
61
|
obj = Prot.new
|
58
62
|
@prots << obj
|
59
63
|
obj.header = line.dup
|
60
|
-
elsif first_char =~
|
64
|
+
elsif first_char =~ first_char_re
|
61
65
|
obj.aaseq << line.chomp
|
62
66
|
else
|
63
67
|
raise "Line not in fasta format (between arrows): -->#{line}<--"
|
@@ -105,26 +109,40 @@ class Fasta
|
|
105
109
|
# returns a new fasta object using some fraction of proteins randomly
|
106
110
|
# selected (fraction may be > 1). Always rounds up. Will not choose a
|
107
111
|
# protein twice unless all other proteins have been chosen
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
112
|
+
#
|
113
|
+
# prefix_proc ensures that a unique header is given even if multiple
|
114
|
+
# fractions of proteins are being created
|
115
|
+
# fraction_cnt = (prot_cnt/num_prots).floor.to_i
|
116
|
+
# so for the first n proteins, it will be 0,
|
117
|
+
# for the second n proteins it will be 1, etc.
|
118
|
+
# e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
|
119
|
+
# would give headers like this: >f0_<some_real_header>,
|
120
|
+
# >f1_<some_real_header>, ...
|
121
|
+
def fraction_of_prots(fraction=1, prefix_proc=nil)
|
122
|
+
new_num = (fraction.to_f * self.prots.size).ceil
|
123
|
+
arr = []
|
124
|
+
orig_num_prots = @prots.size
|
125
|
+
|
126
|
+
# initialize
|
127
|
+
new_prots = @prots.map {|prt| prt.dup }
|
128
|
+
frac_cnt = 0
|
129
|
+
ind_cnt = 0
|
130
|
+
prt_cnt = orig_num_prots
|
131
|
+
while ind_cnt < new_num
|
132
|
+
arr << new_prots.delete_at(rand(new_prots.size))
|
133
|
+
if prefix_proc
|
134
|
+
prefix = prefix_proc.call(frac_cnt)
|
135
|
+
arr.last.header_prefix!(prefix)
|
117
136
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
if hash.size == 0 then hash = Hash[*arr] end
|
137
|
+
prt_cnt -= 1 # index
|
138
|
+
if prt_cnt == 0
|
139
|
+
frac_cnt += 1
|
140
|
+
new_prots = @prots.map {|prt| prt.dup }
|
141
|
+
prt_cnt = orig_num_prots
|
124
142
|
end
|
125
|
-
|
143
|
+
ind_cnt += 1
|
126
144
|
end
|
127
|
-
fasta_fraction
|
145
|
+
fasta_fraction = Fasta.new(arr)
|
128
146
|
end
|
129
147
|
|
130
148
|
# Convenience method for modifying some fraction of the proteins of a file
|
@@ -204,18 +222,32 @@ class Fasta
|
|
204
222
|
other
|
205
223
|
end
|
206
224
|
|
225
|
+
# method = :shuffle! | :reverse!
|
226
|
+
def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
|
227
|
+
if tryptic_peptides
|
228
|
+
@prots.each {|prot| prot.tryptic_peptides!( method_as_symbol) }
|
229
|
+
else
|
230
|
+
@prots.each {|prot| prot.aaseq!(method_as_symbol) }
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
207
234
|
# shuffles the aa sequence of each protein (each protein within itself)
|
208
235
|
def aaseq_shuffle!
|
209
|
-
@prots.each
|
210
|
-
prot.shuffle!
|
211
|
-
end
|
236
|
+
@prots.each {|prot| prot.shuffle! }
|
212
237
|
end
|
213
238
|
|
214
239
|
# inverts (reverses) the aa sequence of each protein (each protein within itself)
|
215
240
|
def aaseq_invert!
|
216
|
-
@prots.each
|
217
|
-
|
218
|
-
|
241
|
+
@prots.each {|prot| prot.invert! }
|
242
|
+
end
|
243
|
+
|
244
|
+
|
245
|
+
def aaseq_invert_tryptic_peptides!
|
246
|
+
@prots.each {|prot| prot.invert_tryptic_peptides! }
|
247
|
+
end
|
248
|
+
|
249
|
+
def aaseq_shuffle_tryptic_peptides!
|
250
|
+
@prots.each {|prot| prot.invert_tryptic_peptides! }
|
219
251
|
end
|
220
252
|
|
221
253
|
def header_prefix!(prefix)
|
@@ -264,6 +296,37 @@ class Fasta::Prot
|
|
264
296
|
end
|
265
297
|
end
|
266
298
|
|
299
|
+
# convenience
|
300
|
+
def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
|
301
|
+
def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end
|
302
|
+
|
303
|
+
# modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
|
304
|
+
# [cuts after K or R but not if followed by a P]
|
305
|
+
# if method_as_symbol = :reverse
|
306
|
+
# :reverse | :shuffle OR :reverse! | :shuffle!
|
307
|
+
# aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
|
308
|
+
# -> 'CBAKEDCREDKEGDKYXWRRKEDR'
|
309
|
+
def tryptic_peptides!(method_as_symbol)
|
310
|
+
peps = SampleEnzyme.tryptic(@aaseq)
|
311
|
+
ends_in_RK = /[KR]/o
|
312
|
+
|
313
|
+
## if the last peptide doesn't end in R or K we want to flip it completely
|
314
|
+
last_pep_special = nil
|
315
|
+
if peps.last[-1,1] !~ /[KR]/
|
316
|
+
last_pep_special = peps.pop
|
317
|
+
end
|
318
|
+
rev_peps = peps.map{|pep| pep[0..-2].send(method_as_symbol) << pep[-1]}
|
319
|
+
if last_pep_special
|
320
|
+
rev_peps << last_pep_special.send(method_as_symbol)
|
321
|
+
end
|
322
|
+
@aaseq = rev_peps.join
|
323
|
+
end
|
324
|
+
|
325
|
+
# takes :reverse! | :shuffle!
|
326
|
+
def aaseq!(method_as_symbol)
|
327
|
+
@aaseq.send(method_as_symbol)
|
328
|
+
end
|
329
|
+
|
267
330
|
def invert!
|
268
331
|
@aaseq.reverse!
|
269
332
|
end
|
@@ -323,3 +386,4 @@ end
|
|
323
386
|
# end
|
324
387
|
# end
|
325
388
|
#end
|
389
|
+
|
data/lib/gi.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'rexml/document'
|
3
|
+
require 'rexml/streamlistener'
|
4
|
+
|
5
|
+
$ANNOTS = []
|
6
|
+
|
7
|
+
class GIListener
|
8
|
+
include REXML
|
9
|
+
include StreamListener
|
10
|
+
|
11
|
+
attr_accessor :annotations
|
12
|
+
|
13
|
+
def initialize
|
14
|
+
@get_title = false
|
15
|
+
@annotations = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def tag_start(name, attributes)
|
19
|
+
#puts "NAME" + name
|
20
|
+
#p attributes
|
21
|
+
if name == "Item" && attributes["Name"] == "Title"
|
22
|
+
@get_title = true
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def text(text)
|
27
|
+
#puts "TEXT: " + text + @get_title.to_s
|
28
|
+
if @get_title
|
29
|
+
#puts "GETTING TITLE!"
|
30
|
+
@annotations.push text.chomp
|
31
|
+
@get_title = false
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
class GI
|
40
|
+
BATCH_SIZE = 500
|
41
|
+
# takes an array of gi numbers and returns an array of annotations
|
42
|
+
# This allows use of the batch search mode on NCBI
|
43
|
+
def self.gi2annot(list_of_gi_numbers)
|
44
|
+
loop do
|
45
|
+
batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
|
46
|
+
if batch.size == 0 then break end
|
47
|
+
string = batch.join(",")
|
48
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
|
49
|
+
#puts url
|
50
|
+
annots = []
|
51
|
+
open(url) do |handle|
|
52
|
+
annots = parse_etool_output(handle)
|
53
|
+
end
|
54
|
+
annots
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
protected
|
59
|
+
# Returns a list of Annotation strings
|
60
|
+
def self.parse_etool_output(handle)
|
61
|
+
listener = GIListener.new
|
62
|
+
parser = REXML::Parsers::StreamParser.new(handle, listener)
|
63
|
+
parser.parse
|
64
|
+
listener.annotations
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
=begin
|
73
|
+
|
74
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
75
|
+
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
|
76
|
+
<eSummaryResult>
|
77
|
+
|
78
|
+
<DocSum>
|
79
|
+
<Id>24115498</Id>
|
80
|
+
<Item Name="Caption" Type="String">NP_710008</Item>
|
81
|
+
<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
|
82
|
+
<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
|
83
|
+
<Item Name="Gi" Type="Integer">24115498</Item>
|
84
|
+
<Item Name="CreateDate" Type="String">2002/10/16</Item>
|
85
|
+
|
86
|
+
<Item Name="UpdateDate" Type="String">2006/04/03</Item>
|
87
|
+
<Item Name="Flags" Type="Integer">512</Item>
|
88
|
+
<Item Name="TaxId" Type="Integer">198214</Item>
|
89
|
+
<Item Name="Status" Type="String">live</Item>
|
90
|
+
<Item Name="ReplacedBy" Type="String"></Item>
|
91
|
+
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
92
|
+
</DocSum>
|
93
|
+
|
94
|
+
|
95
|
+
<DocSum>
|
96
|
+
<Id>434011</Id>
|
97
|
+
<Item Name="Caption" Type="String">CAA24741</Item>
|
98
|
+
|
99
|
+
<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
|
100
|
+
<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
|
101
|
+
<Item Name="Gi" Type="Integer">434011</Item>
|
102
|
+
<Item Name="CreateDate" Type="String">1983/12/06</Item>
|
103
|
+
<Item Name="UpdateDate" Type="String">2005/04/18</Item>
|
104
|
+
<Item Name="Flags" Type="Integer">0</Item>
|
105
|
+
<Item Name="TaxId" Type="Integer">562</Item>
|
106
|
+
<Item Name="Status" Type="String">live</Item>
|
107
|
+
<Item Name="ReplacedBy" Type="String"></Item>
|
108
|
+
|
109
|
+
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
110
|
+
</DocSum>
|
111
|
+
|
112
|
+
</eSummaryResult>
|
113
|
+
|
114
|
+
=end
|
data/lib/roc.rb
CHANGED
@@ -6,18 +6,22 @@
|
|
6
6
|
# receiver-operator-characteristics, precision-recall, etc.. Some definitions
|
7
7
|
# from (Davis & Goadrich. Proceedings of the 23rd
|
8
8
|
# International Conference on Machine Learning, Pittsburgh, PA, 2006):
|
9
|
-
# Recall = TP/(TP+FN)
|
10
|
-
# Precision = TP/(TP+FP)
|
9
|
+
# Recall = TP/(TP+FN) [aka, Sensitivity]
|
10
|
+
# Precision = TP/(TP+FP) [aka, Positive Predictive Value]
|
11
11
|
# True Positive Rate = TP/(TP+FN)
|
12
12
|
# False Positive Rate = FP/(FP+TN)
|
13
13
|
#
|
14
14
|
# Keys to some abbreviations used in this class:
|
15
|
+
# pred = number predicted to be correct
|
15
16
|
# tps = number of true positives
|
16
|
-
#
|
17
|
-
#
|
17
|
+
# ppv = positive predictive value
|
18
|
+
# om_ppv = one minus positive predictive value = FP/(TP+FP)
|
18
19
|
#
|
19
20
|
# NOTE: this class assumes that lower scores are better. Negate your scores
|
20
21
|
# if this is not the case.
|
22
|
+
#
|
23
|
+
# For estimation of false positive rates using a decoy database strategy, see
|
24
|
+
# the DecoyROC class.
|
21
25
|
class ROC
|
22
26
|
|
23
27
|
|
@@ -38,82 +42,84 @@ class ROC
|
|
38
42
|
area
|
39
43
|
end
|
40
44
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
def
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
_tps_calc(tp, fp, prc)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Returns (num_true_positives(ints), false_positive_rate(floats))
|
61
|
-
# calculated as ( FP/(FP+TP) ) as a function of number of true positives
|
62
|
-
# true positive values that are equal will cause jumps in the array values
|
63
|
-
# of true positives returned
|
64
|
-
# e.g. tps = [1,2,4] # -> jumps from 2 to 4
|
65
|
-
def tps_and_fpr2(tp, fp)
|
66
|
-
prc = proc {|tp_i,fp_i| (fp_i).to_f/((tp_i+1).to_f + fp_i.to_f) }
|
67
|
-
_tps_calc(tp, fp, prc)
|
45
|
+
# given an array of doublets where each doublet is a value and a boolean,
|
46
|
+
# sorts the list and divides it into two arrays (tps, fps) of the values.
|
47
|
+
# The output can then be fed into many of the other routines.
|
48
|
+
def prep_list(list)
|
49
|
+
tp = []; fp = []
|
50
|
+
list.each do |dbl|
|
51
|
+
if dbl[1]
|
52
|
+
tp << dbl
|
53
|
+
else
|
54
|
+
fp << dbl
|
55
|
+
end
|
56
|
+
end
|
57
|
+
[tp,fp].collect do |arr|
|
58
|
+
arr.collect! {|dbl| dbl[0] }
|
59
|
+
arr.sort
|
60
|
+
end
|
68
61
|
end
|
69
62
|
|
70
63
|
# Base function for tps calculations
|
71
|
-
def
|
64
|
+
def tps_and_ppv(tp, fp)
|
72
65
|
tp_i = 0
|
73
66
|
fp_i = 0
|
74
67
|
x = []
|
75
68
|
y = []
|
69
|
+
num_tps = 0
|
76
70
|
|
77
71
|
while tp_i < tp.size
|
78
72
|
while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
|
79
73
|
fp_i += 1
|
80
74
|
end
|
81
75
|
unless tp[tp_i] == tp[tp_i+1]
|
82
|
-
|
83
|
-
|
84
|
-
|
76
|
+
# get the correct number of each
|
77
|
+
num_tps = tp_i + 1
|
78
|
+
num_fps = fp_i
|
79
|
+
|
80
|
+
x << num_tps
|
81
|
+
y << num_tps.to_f/(num_tps+num_fps)
|
82
|
+
|
85
83
|
end
|
86
84
|
tp_i += 1
|
87
85
|
end
|
88
86
|
return x, y
|
89
87
|
end
|
88
|
+
end
|
90
89
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
x, y = tps_and_fpr2(tp,fp)
|
96
|
-
y.collect! {|v| v*2 }
|
97
|
-
return x, y
|
98
|
-
end
|
90
|
+
# For calculating precision given lists of hits and decoy hits. The hits are
|
91
|
+
# assumed to have false positives within them that can be estimated from the
|
92
|
+
# number of decoy hits at the same rate
|
93
|
+
class DecoyROC < ROC
|
99
94
|
|
100
|
-
#
|
101
|
-
#
|
102
|
-
#
|
103
|
-
def
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
95
|
+
# returns the [num_hits, num_tps, precision] as a function of true
|
96
|
+
# positives. Method will return precisely what is calculated (meaning some
|
97
|
+
# answers may seem bizarre if you have better decoy hits than real).
|
98
|
+
def pred_and_tps_and_ppv(hits, decoy_hits)
|
99
|
+
hits_i = 0
|
100
|
+
decoy_i = 0
|
101
|
+
|
102
|
+
num_hits_ar = []
|
103
|
+
num_tps_ar = []
|
104
|
+
ppv_ar = []
|
105
|
+
|
106
|
+
while hits_i < hits.size
|
107
|
+
while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
|
108
|
+
decoy_i += 1
|
110
109
|
end
|
110
|
+
unless hits[hits_i] == hits[hits_i+1]
|
111
|
+
## determine the number of false positives
|
112
|
+
tot_num_hits = hits_i+1
|
113
|
+
num_tps = tot_num_hits - decoy_i
|
114
|
+
|
115
|
+
num_hits_ar << tot_num_hits
|
116
|
+
num_tps_ar << num_tps
|
117
|
+
ppv_ar << ( num_tps.to_f/tot_num_hits )
|
118
|
+
|
119
|
+
end
|
120
|
+
hits_i += 1
|
111
121
|
end
|
112
|
-
[
|
113
|
-
arr.collect! {|dbl| dbl[0] }
|
114
|
-
arr.sort
|
115
|
-
end
|
122
|
+
[num_hits_ar, num_tps_ar, ppv_ar]
|
116
123
|
end
|
117
124
|
|
118
|
-
|
119
125
|
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'fasta'
|
2
|
+
|
3
|
+
class SpecID::AAFreqs
|
4
|
+
# a fasta object
|
5
|
+
attr_accessor :fasta
|
6
|
+
# hash by capital one-letter amino acid symbols giving the frequency of
|
7
|
+
# seeing that amino acid. Frequencies should add to 1.
|
8
|
+
attr_accessor :aafreqs
|
9
|
+
|
10
|
+
def initialize(fasta_file=nil)
|
11
|
+
if fasta_file
|
12
|
+
@fasta = Fasta.new.read_file(fasta_file)
|
13
|
+
@aafreqs = calculate_frequencies(@fasta)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# creates an aafreqs hash based on fasta object
|
18
|
+
def calculate_frequencies(fasta)
|
19
|
+
hash = {}
|
20
|
+
total_aas = 0
|
21
|
+
('A'..'Z').each do |x|
|
22
|
+
hash[x] = 0
|
23
|
+
end
|
24
|
+
hash['*'] = 0
|
25
|
+
fasta.prots.each do |prot|
|
26
|
+
aaseq = prot.aaseq
|
27
|
+
total_aas += aaseq.size
|
28
|
+
aaseq.split('').each do |x|
|
29
|
+
hash[x] += 1
|
30
|
+
end
|
31
|
+
end
|
32
|
+
# normalize by total amount:
|
33
|
+
hash.each do |k,v|
|
34
|
+
hash[k] = hash[k].to_f / total_aas
|
35
|
+
end
|
36
|
+
# convert all strings to symbols:
|
37
|
+
hash.each do |k,v|
|
38
|
+
hash[k.to_sym] = hash.delete(k)
|
39
|
+
end
|
40
|
+
hash
|
41
|
+
end
|
42
|
+
|
43
|
+
# The expected probability for seeing that amino acid in a given length.
|
44
|
+
# This calculates a lookup table (array) from 0 to max_length of the
|
45
|
+
# probability of seeing at least one amino acid (given its frequency, where
|
46
|
+
# frequency is from 0 to 1)
|
47
|
+
def self.probability_of_length_table(frequency, max_length)
|
48
|
+
one_minus_freq = 1.0 - frequency.to_f
|
49
|
+
lookup = Array.new(max_length + 1)
|
50
|
+
(0..max_length).each do |len|
|
51
|
+
lookup[len] = 1.0 - (one_minus_freq**len);
|
52
|
+
end
|
53
|
+
lookup
|
54
|
+
end
|
55
|
+
|
56
|
+
# takes an array of peptide strings
|
57
|
+
# gives the actual number of peptides with at least one occurrence of amino_acid
|
58
|
+
# gives the expected number of peptides given the probabilities in the
|
59
|
+
# length lookup table.
|
60
|
+
# currently ONLY takes at_least = 1
|
61
|
+
# depends on @aafreqs
|
62
|
+
# returns two numbers in array [actual, expected]
|
63
|
+
# expected is a Float!!!
|
64
|
+
def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
|
65
|
+
one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
|
66
|
+
amino_acid_as_st = amino_acid.to_s
|
67
|
+
probs = []
|
68
|
+
actual = 0
|
69
|
+
expected = 0.0
|
70
|
+
peptide_aaseqs.each do |pep|
|
71
|
+
expected += (1.0 - (one_minus_freq**pep.size))
|
72
|
+
if pep.include?(amino_acid_as_st)
|
73
|
+
actual += 1
|
74
|
+
end
|
75
|
+
end
|
76
|
+
[actual, expected]
|
77
|
+
end
|
78
|
+
|
79
|
+
# pep_objs respond to sequence?
|
80
|
+
def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
|
81
|
+
@aafreqs ||= {}
|
82
|
+
@aafreqs[:C] = cyst_freq
|
83
|
+
seqs = pep_objs.map do |v|
|
84
|
+
if v.sequence =~ /\.([\w\*]+)\./
|
85
|
+
$1
|
86
|
+
else
|
87
|
+
abort v.sequence.to_s + " could not be matched!"
|
88
|
+
end
|
89
|
+
end
|
90
|
+
actual_and_expected_number(seqs, :C, 1)
|
91
|
+
end
|
92
|
+
|
93
|
+
##
|
94
|
+
=begin
|
95
|
+
|
96
|
+
foreach my $pep (@$peps) {
|
97
|
+
unless ($pep->prob() >= $prob_cutoff) {next;}
|
98
|
+
my %freq = ();
|
99
|
+
my $aa = $pep->AA_sequence();
|
100
|
+
my $len = length($aa);
|
101
|
+
|
102
|
+
## EXPECTED probability for each length
|
103
|
+
for (my $i = 0; $i < 20; $i++) {
|
104
|
+
## rolling at least one 6 in n rolls is 1 - (5/6)^n.
|
105
|
+
$expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
|
106
|
+
}
|
107
|
+
## FILTER any peptides we've already seen
|
108
|
+
if ($seen{$aa}) { next; }
|
109
|
+
else { $seen{$aa}++; }
|
110
|
+
|
111
|
+
## Fill in these values with zeroes:
|
112
|
+
for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
|
113
|
+
|
114
|
+
## get the frequencies for each AA in each peptide:
|
115
|
+
for (my $i = 0; $i < $len; $i++) {
|
116
|
+
my $let = substr($aa, $i, 1);
|
117
|
+
$tot_freq{$let}++;
|
118
|
+
$pepc[$cnt][$an{$let}]++;
|
119
|
+
}
|
120
|
+
$cnt++;
|
121
|
+
}
|
122
|
+
|
123
|
+
##############################################################
|
124
|
+
# ANALYSIS 2: Fraction of Peptides containing X Amino Acid
|
125
|
+
##############################################################
|
126
|
+
|
127
|
+
## What is the percentage of peptides containing at least 1 cysteine?
|
128
|
+
my $atleast = 1;
|
129
|
+
|
130
|
+
my @has;
|
131
|
+
## initialize
|
132
|
+
for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
|
133
|
+
my $tot = scalar(@pepc);
|
134
|
+
foreach my $pep (@pepc) {
|
135
|
+
for (my $index = 0; $index < 20; $index++) {
|
136
|
+
if ($pep->[$index] >= $atleast) {
|
137
|
+
$has[$index]++;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
}
|
141
|
+
|
142
|
+
|
143
|
+
my @exp_sum = (); ## The total number of peptides I'd expect
|
144
|
+
## WE simply add up the peptides' probabilities
|
145
|
+
## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
|
146
|
+
foreach my $pep (@expected) {
|
147
|
+
for (my $i = 0; $i < 20; $i++) {
|
148
|
+
$exp_sum[$i] += $pep->[$i];
|
149
|
+
}
|
150
|
+
}
|
151
|
+
|
152
|
+
my @obs = map { $_/$tot } @has;
|
153
|
+
my @exp = map { $_/$tot } @exp_sum;
|
154
|
+
print STDERR "*********************************************\n";
|
155
|
+
print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
|
156
|
+
print "[AA] [Observed] [Predicted]\n";
|
157
|
+
for (my $i = 0; $i < 20; $i++) {
|
158
|
+
print "$AA[$i] $obs[$i] $exp[$i]\n";
|
159
|
+
}
|
160
|
+
print STDERR "*********************************************\n";
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
=end
|
165
|
+
|
166
|
+
end
|
data/lib/spec_id/bioworks.rb
CHANGED
@@ -15,12 +15,15 @@ module SpecIDXML; end
|
|
15
15
|
class SpecID::Bioworks
|
16
16
|
# Regular expressions
|
17
17
|
@@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
|
18
|
+
@@modifications_re = /<modifications>(.*)<\/modifications>/o
|
18
19
|
@@protein_re = /<protein>/o
|
19
20
|
@@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
|
20
21
|
@@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
|
21
22
|
|
22
23
|
|
23
24
|
attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
|
25
|
+
# a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
|
26
|
+
attr_accessor :modifications
|
24
27
|
attr_writer :peps
|
25
28
|
|
26
29
|
def hi_prob_best ; false end
|
@@ -196,6 +199,7 @@ class SpecID::Bioworks
|
|
196
199
|
@global_filename = @origfilename.gsub(File.extname(@origfilename), "")
|
197
200
|
end
|
198
201
|
@version = get_regex_val(fh, @@bioworksinfo_re)
|
202
|
+
@modifications = get_regex_val(fh, @@modifications_re)
|
199
203
|
@prots = get_prots(fh, self)
|
200
204
|
fh.close
|
201
205
|
end
|
@@ -456,7 +460,7 @@ class SpecID::Bioworks::Pep < Array
|
|
456
460
|
first_scan = first_scan[0]
|
457
461
|
last_scan = first_scan
|
458
462
|
end
|
459
|
-
|
463
|
+
[base_name, first_scan, last_scan]
|
460
464
|
end
|
461
465
|
|
462
466
|
def file=(arg)
|