mspire 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/lib/fasta.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'sample_enzyme'
|
1
2
|
|
2
3
|
class String
|
3
4
|
|
@@ -7,8 +8,10 @@ class String
|
|
7
8
|
end
|
8
9
|
end
|
9
10
|
|
11
|
+
# modifies and returns self
|
10
12
|
def shuffle!
|
11
13
|
each_index {|j| i = rand(size-j); self[j], self[j+i] = self[j+i], self[j]}
|
14
|
+
self
|
12
15
|
end
|
13
16
|
|
14
17
|
def shuffle
|
@@ -44,6 +47,7 @@ class Fasta
|
|
44
47
|
# Checks that the first character per line is '>' or character class [A-Za-z*]
|
45
48
|
# returns a fasta object for stringing commands
|
46
49
|
def read_file(fn)
|
50
|
+
first_char_re = /[A-Za-z*]/o
|
47
51
|
obj = nil
|
48
52
|
regex = /(\r\n)|\n/o
|
49
53
|
fh = File.new(fn).binmode
|
@@ -57,7 +61,7 @@ class Fasta
|
|
57
61
|
obj = Prot.new
|
58
62
|
@prots << obj
|
59
63
|
obj.header = line.dup
|
60
|
-
elsif first_char =~
|
64
|
+
elsif first_char =~ first_char_re
|
61
65
|
obj.aaseq << line.chomp
|
62
66
|
else
|
63
67
|
raise "Line not in fasta format (between arrows): -->#{line}<--"
|
@@ -105,26 +109,40 @@ class Fasta
|
|
105
109
|
# returns a new fasta object using some fraction of proteins randomly
|
106
110
|
# selected (fraction may be > 1). Always rounds up. Will not choose a
|
107
111
|
# protein twice unless all other proteins have been chosen
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
112
|
+
#
|
113
|
+
# fraction_prefix ensures that a unique header is given even if multiple
|
114
|
+
# fraction of proteins are being created
|
115
|
+
# fraction_cnt = (prot_cnt/num_prots).floor.to_i
|
116
|
+
# so for the first n proteins, it will be 0,
|
117
|
+
# the 2n proteins will be 1, etc.
|
118
|
+
# e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
|
119
|
+
# would give headers like this: >f0_<some_real_header>,
|
120
|
+
# >f1_<some_real_header>, ...
|
121
|
+
def fraction_of_prots(fraction=1, prefix_proc=nil)
|
122
|
+
new_num = (fraction.to_f * self.prots.size).ceil
|
123
|
+
arr = []
|
124
|
+
orig_num_prots = @prots.size
|
125
|
+
|
126
|
+
# initialize
|
127
|
+
new_prots = @prots.map {|prt| prt.dup }
|
128
|
+
frac_cnt = 0
|
129
|
+
ind_cnt = 0
|
130
|
+
prt_cnt = orig_num_prots
|
131
|
+
while ind_cnt < new_num
|
132
|
+
arr << new_prots.delete_at(rand(new_prots.size))
|
133
|
+
if prefix_proc
|
134
|
+
prefix = prefix_proc.call(frac_cnt)
|
135
|
+
arr.last.header_prefix!(prefix)
|
117
136
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
if hash.size == 0 then hash = Hash[*arr] end
|
137
|
+
prt_cnt -= 1 # index
|
138
|
+
if prt_cnt == 0
|
139
|
+
frac_cnt += 1
|
140
|
+
new_prots = @prots.map {|prt| prt.dup }
|
141
|
+
prt_cnt = orig_num_prots
|
124
142
|
end
|
125
|
-
|
143
|
+
ind_cnt += 1
|
126
144
|
end
|
127
|
-
fasta_fraction
|
145
|
+
fasta_fraction = Fasta.new(arr)
|
128
146
|
end
|
129
147
|
|
130
148
|
# Convenience method for modifying some fraction of the proteins of a file
|
@@ -204,18 +222,32 @@ class Fasta
|
|
204
222
|
other
|
205
223
|
end
|
206
224
|
|
225
|
+
# method = :shuffle! | :reverse!
|
226
|
+
def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
|
227
|
+
if tryptic_peptides
|
228
|
+
@prots.each {|prot| prot.tryptic_peptides!( method_as_symbol) }
|
229
|
+
else
|
230
|
+
@prots.each {|prot| prot.aaseq!(method_as_symbol) }
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
207
234
|
# shuffles the aa sequence of each protein (each protein within itself)
|
208
235
|
def aaseq_shuffle!
|
209
|
-
@prots.each
|
210
|
-
prot.shuffle!
|
211
|
-
end
|
236
|
+
@prots.each {|prot| prot.shuffle! }
|
212
237
|
end
|
213
238
|
|
214
239
|
# shuffles the aa sequence of each protein (each protein within itself)
|
215
240
|
def aaseq_invert!
|
216
|
-
@prots.each
|
217
|
-
|
218
|
-
|
241
|
+
@prots.each {|prot| prot.invert! }
|
242
|
+
end
|
243
|
+
|
244
|
+
|
245
|
+
def aaseq_invert_tryptic_peptides!
|
246
|
+
@prots.each {|prot| prot.invert_tryptic_peptides! }
|
247
|
+
end
|
248
|
+
|
249
|
+
def aaseq_shuffle_tryptic_peptides!
|
250
|
+
@prots.each {|prot| prot.invert_tryptic_peptides! }
|
219
251
|
end
|
220
252
|
|
221
253
|
def header_prefix!(prefix)
|
@@ -264,6 +296,37 @@ class Fasta::Prot
|
|
264
296
|
end
|
265
297
|
end
|
266
298
|
|
299
|
+
# convenience
|
300
|
+
def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
|
301
|
+
def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end
|
302
|
+
|
303
|
+
# modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
|
304
|
+
# [cuts after K or R but not if followed by a P]
|
305
|
+
# if method_as_symbol = :reverse
|
306
|
+
# :reverse | :shuffle OR :reverse! | :shuffle!
|
307
|
+
# aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
|
308
|
+
# -> 'ABCKCDERDEKDGEKWXYRRKDER'
|
309
|
+
def tryptic_peptides!(method_as_symbol)
|
310
|
+
peps = SampleEnzyme.tryptic(@aaseq)
|
311
|
+
ends_in_RK = /[KR]/o
|
312
|
+
|
313
|
+
## if the last peptide doesn't end in R or K we want to flip it completely
|
314
|
+
last_pep_special = nil
|
315
|
+
if peps.last[-1,1] !~ /[KR]/
|
316
|
+
last_pep_special = peps.pop
|
317
|
+
end
|
318
|
+
rev_peps = peps.map{|pep| pep[0..-2].send(method_as_symbol) << pep[-1]}
|
319
|
+
if last_pep_special
|
320
|
+
rev_peps << last_pep_special.send(method_as_symbol)
|
321
|
+
end
|
322
|
+
@aaseq = rev_peps.join
|
323
|
+
end
|
324
|
+
|
325
|
+
# takes :reverse! | :shuffle!
|
326
|
+
def aaseq!(method_as_symbol)
|
327
|
+
@aaseq.send(method_as_symbol)
|
328
|
+
end
|
329
|
+
|
267
330
|
def invert!
|
268
331
|
@aaseq.reverse!
|
269
332
|
end
|
@@ -323,3 +386,4 @@ end
|
|
323
386
|
# end
|
324
387
|
# end
|
325
388
|
#end
|
389
|
+
|
data/lib/gi.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'rexml/document'
|
3
|
+
require 'rexml/streamlistener'
|
4
|
+
|
5
|
+
$ANNOTS = []
|
6
|
+
|
7
|
+
class GIListener
|
8
|
+
include REXML
|
9
|
+
include StreamListener
|
10
|
+
|
11
|
+
attr_accessor :annotations
|
12
|
+
|
13
|
+
def initialize
|
14
|
+
@get_title = false
|
15
|
+
@annotations = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def tag_start(name, attributes)
|
19
|
+
#puts "NAME" + name
|
20
|
+
#p attributes
|
21
|
+
if name == "Item" && attributes["Name"] == "Title"
|
22
|
+
@get_title = true
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def text(text)
|
27
|
+
#puts "TEXT: " + text + @get_title.to_s
|
28
|
+
if @get_title
|
29
|
+
#puts "GETTING TITLE!"
|
30
|
+
@annotations.push text.chomp
|
31
|
+
@get_title = false
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
class GI
|
40
|
+
BATCH_SIZE = 500
|
41
|
+
# takes an array of gi numbers and returns an array of annotation
|
42
|
+
# This allows use of the batch search mode on NCBI
|
43
|
+
def self.gi2annot(list_of_gi_numbers)
|
44
|
+
loop do
|
45
|
+
batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
|
46
|
+
if batch.size == 0 then break end
|
47
|
+
string = batch.join(",")
|
48
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
|
49
|
+
#puts url
|
50
|
+
annots = []
|
51
|
+
open(url) do |handle|
|
52
|
+
annots = parse_etool_output(handle)
|
53
|
+
end
|
54
|
+
annots
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
protected
|
59
|
+
# Returns a list of Annotation strings
|
60
|
+
def self.parse_etool_output(handle)
|
61
|
+
listener = GIListener.new
|
62
|
+
parser = REXML::Parsers::StreamParser.new(handle, listener)
|
63
|
+
parser.parse
|
64
|
+
listener.annotations
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
=begin
|
73
|
+
|
74
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
75
|
+
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
|
76
|
+
<eSummaryResult>
|
77
|
+
|
78
|
+
<DocSum>
|
79
|
+
<Id>24115498</Id>
|
80
|
+
<Item Name="Caption" Type="String">NP_710008</Item>
|
81
|
+
<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
|
82
|
+
<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
|
83
|
+
<Item Name="Gi" Type="Integer">24115498</Item>
|
84
|
+
<Item Name="CreateDate" Type="String">2002/10/16</Item>
|
85
|
+
|
86
|
+
<Item Name="UpdateDate" Type="String">2006/04/03</Item>
|
87
|
+
<Item Name="Flags" Type="Integer">512</Item>
|
88
|
+
<Item Name="TaxId" Type="Integer">198214</Item>
|
89
|
+
<Item Name="Status" Type="String">live</Item>
|
90
|
+
<Item Name="ReplacedBy" Type="String"></Item>
|
91
|
+
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
92
|
+
</DocSum>
|
93
|
+
|
94
|
+
|
95
|
+
<DocSum>
|
96
|
+
<Id>434011</Id>
|
97
|
+
<Item Name="Caption" Type="String">CAA24741</Item>
|
98
|
+
|
99
|
+
<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
|
100
|
+
<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
|
101
|
+
<Item Name="Gi" Type="Integer">434011</Item>
|
102
|
+
<Item Name="CreateDate" Type="String">1983/12/06</Item>
|
103
|
+
<Item Name="UpdateDate" Type="String">2005/04/18</Item>
|
104
|
+
<Item Name="Flags" Type="Integer">0</Item>
|
105
|
+
<Item Name="TaxId" Type="Integer">562</Item>
|
106
|
+
<Item Name="Status" Type="String">live</Item>
|
107
|
+
<Item Name="ReplacedBy" Type="String"></Item>
|
108
|
+
|
109
|
+
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
110
|
+
</DocSum>
|
111
|
+
|
112
|
+
</eSummaryResult>
|
113
|
+
|
114
|
+
=end
|
data/lib/roc.rb
CHANGED
@@ -6,18 +6,22 @@
|
|
6
6
|
# receiver-operator-characteristics, precision-recall, etc.. Some definitions
|
7
7
|
# from (Davis & Goadrich. Proceedings of the 23rd
|
8
8
|
# International Conference on Machine Learning, Pittsburgh, PA, 2006):
|
9
|
-
# Recall = TP/(TP+FN)
|
10
|
-
# Precision = TP/(TP+FP)
|
9
|
+
# Recall = TP/(TP+FN) [aka, Sensitivity]
|
10
|
+
# Precision = TP/(TP+FP) [aka, Positive Predictive Value]
|
11
11
|
# True Positive Rate = TP/(TP+FN)
|
12
12
|
# False Positive Rate = FP/(FP+TN)
|
13
13
|
#
|
14
14
|
# Keys to some abbreviations used in this class:
|
15
|
+
# pred = number predicted to be correct
|
15
16
|
# tps = number of true positives
|
16
|
-
#
|
17
|
-
#
|
17
|
+
# ppv = positive predictive value
|
18
|
+
# om_ppv = one minus positive predictive value = FP/(TP+FP)
|
18
19
|
#
|
19
20
|
# NOTE: this class assumes that lower scores are better. Negate your scores
|
20
21
|
# if this is not the case.
|
22
|
+
#
|
23
|
+
# For estimation of false positive rates using a decoy database strategy, see
|
24
|
+
# the DecoyROC class.
|
21
25
|
class ROC
|
22
26
|
|
23
27
|
|
@@ -38,82 +42,84 @@ class ROC
|
|
38
42
|
area
|
39
43
|
end
|
40
44
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
def
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
_tps_calc(tp, fp, prc)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Returns (num_true_positives(ints), false_positive_rate(floats))
|
61
|
-
# calculated as ( FP/(FP+TP) ) as a function of number of true positives
|
62
|
-
# true positive values that are equal will cause jumps in the array values
|
63
|
-
# of true positives returned
|
64
|
-
# e.g. tps = [1,2,4] # -> jumps from 2 to 4
|
65
|
-
def tps_and_fpr2(tp, fp)
|
66
|
-
prc = proc {|tp_i,fp_i| (fp_i).to_f/((tp_i+1).to_f + fp_i.to_f) }
|
67
|
-
_tps_calc(tp, fp, prc)
|
45
|
+
# given an array of doublets where each doublet is a value and a boolean,
|
46
|
+
# sorts the list and divides it into two arrays (tps, fps) of the values.
|
47
|
+
# The output can then be fed into many of the other routines.
|
48
|
+
def prep_list(list)
|
49
|
+
tp = []; fp = []
|
50
|
+
list.each do |dbl|
|
51
|
+
if dbl[1]
|
52
|
+
tp << dbl
|
53
|
+
else
|
54
|
+
fp << dbl
|
55
|
+
end
|
56
|
+
end
|
57
|
+
[tp,fp].collect do |arr|
|
58
|
+
arr.collect! {|dbl| dbl[0] }
|
59
|
+
arr.sort
|
60
|
+
end
|
68
61
|
end
|
69
62
|
|
70
63
|
# Base function for tps calculations
|
71
|
-
def
|
64
|
+
def tps_and_ppv(tp, fp)
|
72
65
|
tp_i = 0
|
73
66
|
fp_i = 0
|
74
67
|
x = []
|
75
68
|
y = []
|
69
|
+
num_tps = 0
|
76
70
|
|
77
71
|
while tp_i < tp.size
|
78
72
|
while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
|
79
73
|
fp_i += 1
|
80
74
|
end
|
81
75
|
unless tp[tp_i] == tp[tp_i+1]
|
82
|
-
|
83
|
-
|
84
|
-
|
76
|
+
# get the correct number of each
|
77
|
+
num_tps = tp_i + 1
|
78
|
+
num_fps = fp_i
|
79
|
+
|
80
|
+
x << num_tps
|
81
|
+
y << num_tps.to_f/(num_tps+num_fps)
|
82
|
+
|
85
83
|
end
|
86
84
|
tp_i += 1
|
87
85
|
end
|
88
86
|
return x, y
|
89
87
|
end
|
88
|
+
end
|
90
89
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
x, y = tps_and_fpr2(tp,fp)
|
96
|
-
y.collect! {|v| v*2 }
|
97
|
-
return x, y
|
98
|
-
end
|
90
|
+
# For calculating precision given lists of hits and decoy hits. The hits are
|
91
|
+
# assumed to have false positives within them that can be estimated from the
|
92
|
+
# number of decoy hits at the same rate
|
93
|
+
class DecoyROC < ROC
|
99
94
|
|
100
|
-
#
|
101
|
-
#
|
102
|
-
#
|
103
|
-
def
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
95
|
+
# returns the [num_hits, num_tps, precision] as a function of true
|
96
|
+
# positives. Method will return precisely what is calculated (meaning some
|
97
|
+
# answers may seem bizarre if you have better decoy hits than real).
|
98
|
+
def pred_and_tps_and_ppv(hits, decoy_hits)
|
99
|
+
hits_i = 0
|
100
|
+
decoy_i = 0
|
101
|
+
|
102
|
+
num_hits_ar = []
|
103
|
+
num_tps_ar = []
|
104
|
+
ppv_ar = []
|
105
|
+
|
106
|
+
while hits_i < hits.size
|
107
|
+
while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
|
108
|
+
decoy_i += 1
|
110
109
|
end
|
110
|
+
unless hits[hits_i] == hits[hits_i+1]
|
111
|
+
## determine the number of false positives
|
112
|
+
tot_num_hits = hits_i+1
|
113
|
+
num_tps = tot_num_hits - decoy_i
|
114
|
+
|
115
|
+
num_hits_ar << tot_num_hits
|
116
|
+
num_tps_ar << num_tps
|
117
|
+
ppv_ar << ( num_tps.to_f/tot_num_hits )
|
118
|
+
|
119
|
+
end
|
120
|
+
hits_i += 1
|
111
121
|
end
|
112
|
-
[
|
113
|
-
arr.collect! {|dbl| dbl[0] }
|
114
|
-
arr.sort
|
115
|
-
end
|
122
|
+
[num_hits_ar, num_tps_ar, ppv_ar]
|
116
123
|
end
|
117
124
|
|
118
|
-
|
119
125
|
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'fasta'
|
2
|
+
|
3
|
+
class SpecID::AAFreqs
|
4
|
+
# a fasta object
|
5
|
+
attr_accessor :fasta
|
6
|
+
# hash by capital one-letter amino acid symbols giving the frequency of
|
7
|
+
# seeing that amino acid. Frequencies should add to 1.
|
8
|
+
attr_accessor :aafreqs
|
9
|
+
|
10
|
+
def initialize(fasta_file=nil)
|
11
|
+
if fasta_file
|
12
|
+
@fasta = Fasta.new.read_file(fasta_file)
|
13
|
+
@aafreqs = calculate_frequencies(@fasta)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# creates an aafreqs hash based on fasta object
|
18
|
+
def calculate_frequencies(fasta)
|
19
|
+
hash = {}
|
20
|
+
total_aas = 0
|
21
|
+
('A'..'Z').each do |x|
|
22
|
+
hash[x] = 0
|
23
|
+
end
|
24
|
+
hash['*'] = 0
|
25
|
+
fasta.prots.each do |prot|
|
26
|
+
aaseq = prot.aaseq
|
27
|
+
total_aas += aaseq.size
|
28
|
+
aaseq.split('').each do |x|
|
29
|
+
hash[x] += 1
|
30
|
+
end
|
31
|
+
end
|
32
|
+
# normalize by total amount:
|
33
|
+
hash.each do |k,v|
|
34
|
+
hash[k] = hash[k].to_f / total_aas
|
35
|
+
end
|
36
|
+
# convert all strings to symbols:
|
37
|
+
hash.each do |k,v|
|
38
|
+
hash[k.to_sym] = hash.delete(k)
|
39
|
+
end
|
40
|
+
hash
|
41
|
+
end
|
42
|
+
|
43
|
+
# The expected probability for seeing that amino acid in a given length.
|
44
|
+
# This calculates a lookup table (array) from 0 to highest_length of the
|
45
|
+
# probability of seeing at least one amino acid (given its frequency, where
|
46
|
+
# frequency is from 0 to 1)
|
47
|
+
def self.probability_of_length_table(frequency, max_length)
|
48
|
+
one_minus_freq = 1.0 - frequency.to_f
|
49
|
+
lookup = Array.new(max_length + 1)
|
50
|
+
(0..max_length).each do |len|
|
51
|
+
lookup[len] = 1.0 - (one_minus_freq**len);
|
52
|
+
end
|
53
|
+
lookup
|
54
|
+
end
|
55
|
+
|
56
|
+
# takes an array of peptide strings
|
57
|
+
# gives the actual number of peptides with at least one
|
58
|
+
# gives the expected number of peptides given the probabilities in the
|
59
|
+
# length lookup table.
|
60
|
+
# currently ONLY takes at_least = 1
|
61
|
+
# depends on @aafreqs
|
62
|
+
# returns two numbers in array [actual, expected]
|
63
|
+
# expected is a Float!!!
|
64
|
+
def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
|
65
|
+
one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
|
66
|
+
amino_acid_as_st = amino_acid.to_s
|
67
|
+
probs = []
|
68
|
+
actual = 0
|
69
|
+
expected = 0.0
|
70
|
+
peptide_aaseqs.each do |pep|
|
71
|
+
expected += (1.0 - (one_minus_freq**pep.size))
|
72
|
+
if pep.include?(amino_acid_as_st)
|
73
|
+
actual += 1
|
74
|
+
end
|
75
|
+
end
|
76
|
+
[actual, expected]
|
77
|
+
end
|
78
|
+
|
79
|
+
# pep_objs respond to sequence?
|
80
|
+
def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
|
81
|
+
@aafreqs ||= {}
|
82
|
+
@aafreqs[:C] = cyst_freq
|
83
|
+
seqs = pep_objs.map do |v|
|
84
|
+
if v.sequence =~ /\.([\w\*]+)\./
|
85
|
+
$1
|
86
|
+
else
|
87
|
+
abort v.sequence.to_s + " could not be matched!"
|
88
|
+
end
|
89
|
+
end
|
90
|
+
actual_and_expected_number(seqs, :C, 1)
|
91
|
+
end
|
92
|
+
|
93
|
+
##
|
94
|
+
=begin
|
95
|
+
|
96
|
+
foreach my $pep (@$peps) {
|
97
|
+
unless ($pep->prob() >= $prob_cutoff) {next;}
|
98
|
+
my %freq = ();
|
99
|
+
my $aa = $pep->AA_sequence();
|
100
|
+
my $len = length($aa);
|
101
|
+
|
102
|
+
## EXPECTED probability for each length
|
103
|
+
for (my $i = 0; $i < 20; $i++) {
|
104
|
+
## rolling at least one 6 in n rolls is 1 - (5/6)^n.
|
105
|
+
$expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
|
106
|
+
}
|
107
|
+
## FILTER any peptides we've already seen
|
108
|
+
if ($seen{$aa}) { next; }
|
109
|
+
else { $seen{$aa}++; }
|
110
|
+
|
111
|
+
## Fill in these values with zeroes:
|
112
|
+
for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
|
113
|
+
|
114
|
+
## get the frequencies for each AA in each peptide:
|
115
|
+
for (my $i = 0; $i < $len; $i++) {
|
116
|
+
my $let = substr($aa, $i, 1);
|
117
|
+
$tot_freq{$let}++;
|
118
|
+
$pepc[$cnt][$an{$let}]++;
|
119
|
+
}
|
120
|
+
$cnt++;
|
121
|
+
}
|
122
|
+
|
123
|
+
##############################################################
|
124
|
+
# ANALYSIS 2: Fraction of Peptides containing X Amino Acid
|
125
|
+
##############################################################
|
126
|
+
|
127
|
+
## What is the percentage of peptides containing at least 1 cysteine?
|
128
|
+
my $atleast = 1;
|
129
|
+
|
130
|
+
my @has;
|
131
|
+
## initialize
|
132
|
+
for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
|
133
|
+
my $tot = scalar(@pepc);
|
134
|
+
foreach my $pep (@pepc) {
|
135
|
+
for (my $index = 0; $index < 20; $index++) {
|
136
|
+
if ($pep->[$index] >= $atleast) {
|
137
|
+
$has[$index]++;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
}
|
141
|
+
|
142
|
+
|
143
|
+
my @exp_sum = (); ## The total number of peptides I'd expect
|
144
|
+
## WE simply add up the peptides' probabilities
|
145
|
+
## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
|
146
|
+
foreach my $pep (@expected) {
|
147
|
+
for (my $i = 0; $i < 20; $i++) {
|
148
|
+
$exp_sum[$i] += $pep->[$i];
|
149
|
+
}
|
150
|
+
}
|
151
|
+
|
152
|
+
my @obs = map { $_/$tot } @has;
|
153
|
+
my @exp = map { $_/$tot } @exp_sum;
|
154
|
+
print STDERR "*********************************************\n";
|
155
|
+
print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
|
156
|
+
print "[AA] [Observed] [Predicted]\n";
|
157
|
+
for (my $i = 0; $i < 20; $i++) {
|
158
|
+
print "$AA[$i] $obs[$i] $exp[$i]\n";
|
159
|
+
}
|
160
|
+
print STDERR "*********************************************\n";
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
=end
|
165
|
+
|
166
|
+
end
|
data/lib/spec_id/bioworks.rb
CHANGED
@@ -15,12 +15,15 @@ module SpecIDXML; end
|
|
15
15
|
class SpecID::Bioworks
|
16
16
|
# Regular expressions
|
17
17
|
@@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
|
18
|
+
@@modifications_re = /<modifications>(.*)<\/modifications>/o
|
18
19
|
@@protein_re = /<protein>/o
|
19
20
|
@@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
|
20
21
|
@@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
|
21
22
|
|
22
23
|
|
23
24
|
attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
|
25
|
+
# a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
|
26
|
+
attr_accessor :modifications
|
24
27
|
attr_writer :peps
|
25
28
|
|
26
29
|
def hi_prob_best ; false end
|
@@ -196,6 +199,7 @@ class SpecID::Bioworks
|
|
196
199
|
@global_filename = @origfilename.gsub(File.extname(@origfilename), "")
|
197
200
|
end
|
198
201
|
@version = get_regex_val(fh, @@bioworksinfo_re)
|
202
|
+
@modifications = get_regex_val(fh, @@modifications_re)
|
199
203
|
@prots = get_prots(fh, self)
|
200
204
|
fh.close
|
201
205
|
end
|
@@ -456,7 +460,7 @@ class SpecID::Bioworks::Pep < Array
|
|
456
460
|
first_scan = first_scan[0]
|
457
461
|
last_scan = first_scan
|
458
462
|
end
|
459
|
-
|
463
|
+
[base_name, first_scan, last_scan]
|
460
464
|
end
|
461
465
|
|
462
466
|
def file=(arg)
|