mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/lib/fasta.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'sample_enzyme'
1
2
 
2
3
  class String
3
4
 
@@ -7,8 +8,10 @@ class String
7
8
  end
8
9
  end
9
10
 
11
+ # modifies and returns self
10
12
  def shuffle!
11
13
  each_index {|j| i = rand(size-j); self[j], self[j+i] = self[j+i], self[j]}
14
+ self
12
15
  end
13
16
 
14
17
  def shuffle
@@ -44,6 +47,7 @@ class Fasta
44
47
  # Checks that the first character per line is '>' or character class [A-Za-z*]
45
48
  # returns a fasta object for stringing commands
46
49
  def read_file(fn)
50
+ first_char_re = /[A-Za-z*]/o
47
51
  obj = nil
48
52
  regex = /(\r\n)|\n/o
49
53
  fh = File.new(fn).binmode
@@ -57,7 +61,7 @@ class Fasta
57
61
  obj = Prot.new
58
62
  @prots << obj
59
63
  obj.header = line.dup
60
- elsif first_char =~ /[A-Za-z*]/
64
+ elsif first_char =~ first_char_re
61
65
  obj.aaseq << line.chomp
62
66
  else
63
67
  raise "Line not in fasta format (between arrows): -->#{line}<--"
@@ -105,26 +109,40 @@ class Fasta
105
109
  # returns a new fasta object using some fraction of proteins randomly
106
110
  # selected (fraction may be > 1). Always rounds up. Will not choose a
107
111
  # protein twice unless all other proteins have been chosen
108
- def fraction_of_prots(fraction=1)
109
- fasta_fraction = nil
110
- if fraction == 1
111
- fasta_fraction = self.dup
112
- else
113
- new_num = (fraction.to_f * self.prots.size).ceil
114
- arr = []
115
- prots.each_with_index do |prot,i|
116
- arr << i << prot
112
+ #
113
+ # prefix_proc ensures that a unique header is given even if multiple
114
+ # fractions of proteins are being created
115
+ # fraction_cnt = (prot_cnt/num_prots).floor.to_i
116
+ # so for the first n proteins, it will be 0,
117
+ # the second n proteins will be 1, etc.
118
+ # e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
119
+ # would give headers like this: >f0_<some_real_header>,
120
+ # >f1_<some_real_header>, ...
121
+ def fraction_of_prots(fraction=1, prefix_proc=nil)
122
+ new_num = (fraction.to_f * self.prots.size).ceil
123
+ arr = []
124
+ orig_num_prots = @prots.size
125
+
126
+ # initialize
127
+ new_prots = @prots.map {|prt| prt.dup }
128
+ frac_cnt = 0
129
+ ind_cnt = 0
130
+ prt_cnt = orig_num_prots
131
+ while ind_cnt < new_num
132
+ arr << new_prots.delete_at(rand(new_prots.size))
133
+ if prefix_proc
134
+ prefix = prefix_proc.call(frac_cnt)
135
+ arr.last.header_prefix!(prefix)
117
136
  end
118
- hash = Hash[*arr]
119
- size = prots.size
120
- new_arr = []
121
- while new_arr.size <= new_num
122
- new_arr.push( hash.delete( rand(hash.size/2) ) )
123
- if hash.size == 0 then hash = Hash[*arr] end
137
+ prt_cnt -= 1 # index
138
+ if prt_cnt == 0
139
+ frac_cnt += 1
140
+ new_prots = @prots.map {|prt| prt.dup }
141
+ prt_cnt = orig_num_prots
124
142
  end
125
- fasta_fraction = Fasta.new(new_arr)
143
+ ind_cnt += 1
126
144
  end
127
- fasta_fraction
145
+ fasta_fraction = Fasta.new(arr)
128
146
  end
129
147
 
130
148
  # Convenience method for modifying some fraction of the proteins of a file
@@ -204,18 +222,32 @@ class Fasta
204
222
  other
205
223
  end
206
224
 
225
+ # method = :shuffle! | :reverse!
226
+ def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
227
+ if tryptic_peptides
228
+ @prots.each {|prot| prot.tryptic_peptides!( method_as_symbol) }
229
+ else
230
+ @prots.each {|prot| prot.aaseq!(method_as_symbol) }
231
+ end
232
+ end
233
+
207
234
  # shuffles the aa sequence of each protein (each protein within itself)
208
235
  def aaseq_shuffle!
209
- @prots.each do |prot|
210
- prot.shuffle!
211
- end
236
+ @prots.each {|prot| prot.shuffle! }
212
237
  end
213
238
 
214
239
  # inverts (reverses) the aa sequence of each protein (each protein within itself)
215
240
  def aaseq_invert!
216
- @prots.each do |prot|
217
- prot.invert!
218
- end
241
+ @prots.each {|prot| prot.invert! }
242
+ end
243
+
244
+
245
+ def aaseq_invert_tryptic_peptides!
246
+ @prots.each {|prot| prot.invert_tryptic_peptides! }
247
+ end
248
+
249
+ def aaseq_shuffle_tryptic_peptides!
250
+ @prots.each {|prot| prot.invert_tryptic_peptides! }
219
251
  end
220
252
 
221
253
  def header_prefix!(prefix)
@@ -264,6 +296,37 @@ class Fasta::Prot
264
296
  end
265
297
  end
266
298
 
299
+ # convenience
300
+ def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
301
+ def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end
302
+
303
+ # modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
304
+ # [cuts after K or R but not if followed by a P]
305
+ # if method_as_symbol = :reverse
306
+ # :reverse | :shuffle OR :reverse! | :shuffle!
307
+ # aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
308
+ # -> 'CBAKEDCREDKEGDKYXWRRKEDR'
309
+ def tryptic_peptides!(method_as_symbol)
310
+ peps = SampleEnzyme.tryptic(@aaseq)
311
+ ends_in_RK = /[KR]/o
312
+
313
+ ## if the last peptide doesn't end in R or K we want to flip it completely
314
+ last_pep_special = nil
315
+ if peps.last[-1,1] !~ /[KR]/
316
+ last_pep_special = peps.pop
317
+ end
318
+ rev_peps = peps.map{|pep| pep[0..-2].send(method_as_symbol) << pep[-1]}
319
+ if last_pep_special
320
+ rev_peps << last_pep_special.send(method_as_symbol)
321
+ end
322
+ @aaseq = rev_peps.join
323
+ end
324
+
325
+ # takes :reverse! | :shuffle!
326
+ def aaseq!(method_as_symbol)
327
+ @aaseq.send(method_as_symbol)
328
+ end
329
+
267
330
  def invert!
268
331
  @aaseq.reverse!
269
332
  end
@@ -323,3 +386,4 @@ end
323
386
  # end
324
387
  # end
325
388
  #end
389
+
data/lib/gi.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'open-uri'
2
+ require 'rexml/document'
3
+ require 'rexml/streamlistener'
4
+
5
+ $ANNOTS = []
6
+
7
+ class GIListener
8
+ include REXML
9
+ include StreamListener
10
+
11
+ attr_accessor :annotations
12
+
13
+ def initialize
14
+ @get_title = false
15
+ @annotations = []
16
+ end
17
+
18
+ def tag_start(name, attributes)
19
+ #puts "NAME" + name
20
+ #p attributes
21
+ if name == "Item" && attributes["Name"] == "Title"
22
+ @get_title = true
23
+ end
24
+ end
25
+
26
+ def text(text)
27
+ #puts "TEXT: " + text + @get_title.to_s
28
+ if @get_title
29
+ #puts "GETTING TITLE!"
30
+ @annotations.push text.chomp
31
+ @get_title = false
32
+ end
33
+ end
34
+
35
+ end
36
+
37
+
38
+
39
+ class GI
40
+ BATCH_SIZE = 500
41
+ # takes an array of gi numbers and returns an array of annotations
42
+ # This allows use of the batch search mode on NCBI
43
+ def self.gi2annot(list_of_gi_numbers)
44
+ loop do
45
+ batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
46
+ if batch.size == 0 then break end
47
+ string = batch.join(",")
48
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
49
+ #puts url
50
+ annots = []
51
+ open(url) do |handle|
52
+ annots = parse_etool_output(handle)
53
+ end
54
+ annots
55
+ end
56
+ end
57
+
58
+ protected
59
+ # Returns a list of Annotation strings
60
+ def self.parse_etool_output(handle)
61
+ listener = GIListener.new
62
+ parser = REXML::Parsers::StreamParser.new(handle, listener)
63
+ parser.parse
64
+ listener.annotations
65
+ end
66
+
67
+
68
+ end
69
+
70
+
71
+
72
+ =begin
73
+
74
+ <?xml version="1.0" encoding="ISO-8859-1"?>
75
+ <!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
76
+ <eSummaryResult>
77
+
78
+ <DocSum>
79
+ <Id>24115498</Id>
80
+ <Item Name="Caption" Type="String">NP_710008</Item>
81
+ <Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
82
+ <Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
83
+ <Item Name="Gi" Type="Integer">24115498</Item>
84
+ <Item Name="CreateDate" Type="String">2002/10/16</Item>
85
+
86
+ <Item Name="UpdateDate" Type="String">2006/04/03</Item>
87
+ <Item Name="Flags" Type="Integer">512</Item>
88
+ <Item Name="TaxId" Type="Integer">198214</Item>
89
+ <Item Name="Status" Type="String">live</Item>
90
+ <Item Name="ReplacedBy" Type="String"></Item>
91
+ <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
92
+ </DocSum>
93
+
94
+
95
+ <DocSum>
96
+ <Id>434011</Id>
97
+ <Item Name="Caption" Type="String">CAA24741</Item>
98
+
99
+ <Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
100
+ <Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
101
+ <Item Name="Gi" Type="Integer">434011</Item>
102
+ <Item Name="CreateDate" Type="String">1983/12/06</Item>
103
+ <Item Name="UpdateDate" Type="String">2005/04/18</Item>
104
+ <Item Name="Flags" Type="Integer">0</Item>
105
+ <Item Name="TaxId" Type="Integer">562</Item>
106
+ <Item Name="Status" Type="String">live</Item>
107
+ <Item Name="ReplacedBy" Type="String"></Item>
108
+
109
+ <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
110
+ </DocSum>
111
+
112
+ </eSummaryResult>
113
+
114
+ =end
data/lib/roc.rb CHANGED
@@ -6,18 +6,22 @@
6
6
  # receiver-operator-characteristics, precision-recall, etc. Some definitions
7
7
  # from (Davis & Goadrich. Proceedings of the 23rd
8
8
  # International Conference on Machine Learning, Pittsburgh, PA, 2006):
9
- # Recall = TP/(TP+FN)
10
- # Precision = TP/(TP+FP)
9
+ # Recall = TP/(TP+FN) [aka, Sensitivity]
10
+ # Precision = TP/(TP+FP) [aka, Positive Predictive Value]
11
11
  # True Positive Rate = TP/(TP+FN)
12
12
  # False Positive Rate = FP/(FP+TN)
13
13
  #
14
14
  # Keys to some abbreviations used in this class:
15
+ # pred = number predicted to be correct
15
16
  # tps = number of true positives
16
- # fpr = false positive rate
17
- # fpr2 = false positive rate calculated as: FP/(FP+TP)
17
+ # ppv = positive predictive value
18
+ # om_ppv = one minus positive predictive value = FP/(TP+FP)
18
19
  #
19
20
  # NOTE: this class assumes that lower scores are better. Negate your scores
20
21
  # if this is not the case.
22
+ #
23
+ # For estimation of false positive rates using a decoy database strategy, see
24
+ # the DecoyROC class.
21
25
  class ROC
22
26
 
23
27
 
@@ -38,82 +42,84 @@ class ROC
38
42
  area
39
43
  end
40
44
 
41
- # Returns (#tp, #yval) where #tp = number of true positives and yval is the
42
- # type of classification analysis (as symbol) (accepts: precision, fpr2,
43
- # fpr2_times2)
44
- def by_tps(yval, tp, fp)
45
- new_method = "tps_and_#{yval}".to_sym
46
- send(new_method, tp, fp)
47
- end
48
-
49
- # Returns (num_true_positives(ints), precision_arr(floats))
50
- # gives the precision TP/(TP+FP) as a function of number of true positives.
51
- # True positive values that are equal will cause jumps in the array values
52
- # of true positives returned. If false negatives are known, then a
53
- # recall-precision plot could be made (recall is TP/(TP+FN).
54
- # e.g. tps = [1,2,4] # -> jumps from 2 to 4
55
- def tps_and_precision(tp, fp)
56
- prc = proc {|tp_i, fp_i| (tp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f) }
57
- _tps_calc(tp, fp, prc)
58
- end
59
-
60
- # Returns (num_true_positives(ints), false_positive_rate(floats))
61
- # calculated as ( FP/(FP+TP) ) as a function of number of true positives
62
- # true positive values that are equal will cause jumps in the array values
63
- # of true positives returned
64
- # e.g. tps = [1,2,4] # -> jumps from 2 to 4
65
- def tps_and_fpr2(tp, fp)
66
- prc = proc {|tp_i,fp_i| (fp_i).to_f/((tp_i+1).to_f + fp_i.to_f) }
67
- _tps_calc(tp, fp, prc)
45
+ # given an array of doublets where each doublet is a value and a boolean,
46
+ # sorts the list and divides it into two arrays (tps, fps) of the values.
47
+ # The output can then be fed into many of the other routines.
48
+ def prep_list(list)
49
+ tp = []; fp = []
50
+ list.each do |dbl|
51
+ if dbl[1]
52
+ tp << dbl
53
+ else
54
+ fp << dbl
55
+ end
56
+ end
57
+ [tp,fp].collect do |arr|
58
+ arr.collect! {|dbl| dbl[0] }
59
+ arr.sort
60
+ end
68
61
  end
69
62
 
70
63
  # Base function for tps calculations
71
- def _tps_calc(tp, fp, prc)
64
+ def tps_and_ppv(tp, fp)
72
65
  tp_i = 0
73
66
  fp_i = 0
74
67
  x = []
75
68
  y = []
69
+ num_tps = 0
76
70
 
77
71
  while tp_i < tp.size
78
72
  while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
79
73
  fp_i += 1
80
74
  end
81
75
  unless tp[tp_i] == tp[tp_i+1]
82
- x << tp_i+1
83
- #y << (fp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f)
84
- y << prc.call(tp_i, fp_i)
76
+ # get the correct number of each
77
+ num_tps = tp_i + 1
78
+ num_fps = fp_i
79
+
80
+ x << num_tps
81
+ y << num_tps.to_f/(num_tps+num_fps)
82
+
85
83
  end
86
84
  tp_i += 1
87
85
  end
88
86
  return x, y
89
87
  end
88
+ end
90
89
 
91
- # Calculates the fpr based on Peng et. al. J. Proteome Res. 2003, 2, 43-50.
92
- # fpr = 2[#rev/(#rev+#real) == 2[FP/(FP+TP)]
93
- # This merely multiplies the fpr by 2.
94
- def tps_and_fpr2_times2(tp, fp)
95
- x, y = tps_and_fpr2(tp,fp)
96
- y.collect! {|v| v*2 }
97
- return x, y
98
- end
90
+ # For calculating precision given lists of hits and decoy hits. The hits are
91
+ # assumed to have false positives within them that can be estimated from the
92
+ # number of decoy hits at the same rate
93
+ class DecoyROC < ROC
99
94
 
100
- # given an array of doublets where each doublet is a value and a boolean,
101
- # sorts the list and divides it into two arrays (tps, fps) of the values.
102
- # The output can then be fed into many of the other routines.
103
- def prep_list(list)
104
- tp = []; fp = []
105
- list.each do |dbl|
106
- if dbl[1]
107
- tp << dbl
108
- else
109
- fp << dbl
95
+ # returns the [num_hits, num_tps, precision] as a function of true
96
+ # positives. Method will return precisely what is calculated (meaning some
97
+ # answers may seem bizarre if you have better decoy hits than real).
98
+ def pred_and_tps_and_ppv(hits, decoy_hits)
99
+ hits_i = 0
100
+ decoy_i = 0
101
+
102
+ num_hits_ar = []
103
+ num_tps_ar = []
104
+ ppv_ar = []
105
+
106
+ while hits_i < hits.size
107
+ while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
108
+ decoy_i += 1
110
109
  end
110
+ unless hits[hits_i] == hits[hits_i+1]
111
+ ## estimate the number of true positives: total hits minus decoy hits
112
+ tot_num_hits = hits_i+1
113
+ num_tps = tot_num_hits - decoy_i
114
+
115
+ num_hits_ar << tot_num_hits
116
+ num_tps_ar << num_tps
117
+ ppv_ar << ( num_tps.to_f/tot_num_hits )
118
+
119
+ end
120
+ hits_i += 1
111
121
  end
112
- [tp,fp].collect do |arr|
113
- arr.collect! {|dbl| dbl[0] }
114
- arr.sort
115
- end
122
+ [num_hits_ar, num_tps_ar, ppv_ar]
116
123
  end
117
124
 
118
-
119
125
  end
@@ -0,0 +1,166 @@
1
+ require 'fasta'
2
+
3
+ class SpecID::AAFreqs
4
+ # a fasta object
5
+ attr_accessor :fasta
6
+ # hash by capital one-letter amino acid symbols giving the frequency of
7
+ # seeing that amino acid. Frequencies should add to 1.
8
+ attr_accessor :aafreqs
9
+
10
+ def initialize(fasta_file=nil)
11
+ if fasta_file
12
+ @fasta = Fasta.new.read_file(fasta_file)
13
+ @aafreqs = calculate_frequencies(@fasta)
14
+ end
15
+ end
16
+
17
+ # creates an aafreqs hash based on fasta object
18
+ def calculate_frequencies(fasta)
19
+ hash = {}
20
+ total_aas = 0
21
+ ('A'..'Z').each do |x|
22
+ hash[x] = 0
23
+ end
24
+ hash['*'] = 0
25
+ fasta.prots.each do |prot|
26
+ aaseq = prot.aaseq
27
+ total_aas += aaseq.size
28
+ aaseq.split('').each do |x|
29
+ hash[x] += 1
30
+ end
31
+ end
32
+ # normalize by total amount:
33
+ hash.each do |k,v|
34
+ hash[k] = hash[k].to_f / total_aas
35
+ end
36
+ # convert all strings to symbols:
37
+ hash.each do |k,v|
38
+ hash[k.to_sym] = hash.delete(k)
39
+ end
40
+ hash
41
+ end
42
+
43
+ # The expected probability for seeing that amino acid in a given length.
44
+ # This calculates a lookup table (array) from 0 to max_length of the
45
+ # probability of seeing at least one amino acid (given its frequency, where
46
+ # frequency is from 0 to 1)
47
+ def self.probability_of_length_table(frequency, max_length)
48
+ one_minus_freq = 1.0 - frequency.to_f
49
+ lookup = Array.new(max_length + 1)
50
+ (0..max_length).each do |len|
51
+ lookup[len] = 1.0 - (one_minus_freq**len);
52
+ end
53
+ lookup
54
+ end
55
+
56
+ # takes an array of peptide strings
57
+ # gives the actual number of peptides with at least one amino_acid residue
58
+ # gives the expected number of peptides given the probabilities in the
59
+ # length lookup table.
60
+ # currently ONLY takes at_least = 1
61
+ # depends on @aafreqs
62
+ # returns two numbers in array [actual, expected]
63
+ # expected is a Float!!!
64
+ def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
65
+ one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
66
+ amino_acid_as_st = amino_acid.to_s
67
+ probs = []
68
+ actual = 0
69
+ expected = 0.0
70
+ peptide_aaseqs.each do |pep|
71
+ expected += (1.0 - (one_minus_freq**pep.size))
72
+ if pep.include?(amino_acid_as_st)
73
+ actual += 1
74
+ end
75
+ end
76
+ [actual, expected]
77
+ end
78
+
79
+ # pep_objs must respond to #sequence
80
+ def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
81
+ @aafreqs ||= {}
82
+ @aafreqs[:C] = cyst_freq
83
+ seqs = pep_objs.map do |v|
84
+ if v.sequence =~ /\.([\w\*]+)\./
85
+ $1
86
+ else
87
+ abort v.sequence.to_s + " could not be matched!"
88
+ end
89
+ end
90
+ actual_and_expected_number(seqs, :C, 1)
91
+ end
92
+
93
+ ##
94
+ =begin
95
+
96
+ foreach my $pep (@$peps) {
97
+ unless ($pep->prob() >= $prob_cutoff) {next;}
98
+ my %freq = ();
99
+ my $aa = $pep->AA_sequence();
100
+ my $len = length($aa);
101
+
102
+ ## EXPECTED probability for each length
103
+ for (my $i = 0; $i < 20; $i++) {
104
+ ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
105
+ $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
106
+ }
107
+ ## FILTER any peptides we've already seen
108
+ if ($seen{$aa}) { next; }
109
+ else { $seen{$aa}++; }
110
+
111
+ ## Fill in these values with zeroes:
112
+ for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
113
+
114
+ ## get the frequencies for each AA in each peptide:
115
+ for (my $i = 0; $i < $len; $i++) {
116
+ my $let = substr($aa, $i, 1);
117
+ $tot_freq{$let}++;
118
+ $pepc[$cnt][$an{$let}]++;
119
+ }
120
+ $cnt++;
121
+ }
122
+
123
+ ##############################################################
124
+ # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
125
+ ##############################################################
126
+
127
+ ## What is the percentage of peptides containing at least 1 cysteine?
128
+ my $atleast = 1;
129
+
130
+ my @has;
131
+ ## initialize
132
+ for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
133
+ my $tot = scalar(@pepc);
134
+ foreach my $pep (@pepc) {
135
+ for (my $index = 0; $index < 20; $index++) {
136
+ if ($pep->[$index] >= $atleast) {
137
+ $has[$index]++;
138
+ }
139
+ }
140
+ }
141
+
142
+
143
+ my @exp_sum = (); ## The total number of peptides I'd expect
144
+ ## WE simply add up the peptides' probabilities
145
+ ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
146
+ foreach my $pep (@expected) {
147
+ for (my $i = 0; $i < 20; $i++) {
148
+ $exp_sum[$i] += $pep->[$i];
149
+ }
150
+ }
151
+
152
+ my @obs = map { $_/$tot } @has;
153
+ my @exp = map { $_/$tot } @exp_sum;
154
+ print STDERR "*********************************************\n";
155
+ print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
156
+ print "[AA] [Observed] [Predicted]\n";
157
+ for (my $i = 0; $i < 20; $i++) {
158
+ print "$AA[$i] $obs[$i] $exp[$i]\n";
159
+ }
160
+ print STDERR "*********************************************\n";
161
+
162
+
163
+
164
+ =end
165
+
166
+ end
@@ -15,12 +15,15 @@ module SpecIDXML; end
15
15
  class SpecID::Bioworks
16
16
  # Regular expressions
17
17
  @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
18
+ @@modifications_re = /<modifications>(.*)<\/modifications>/o
18
19
  @@protein_re = /<protein>/o
19
20
  @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
20
21
  @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
21
22
 
22
23
 
23
24
  attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
25
+ # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
26
+ attr_accessor :modifications
24
27
  attr_writer :peps
25
28
 
26
29
  def hi_prob_best ; false end
@@ -196,6 +199,7 @@ class SpecID::Bioworks
196
199
  @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
197
200
  end
198
201
  @version = get_regex_val(fh, @@bioworksinfo_re)
202
+ @modifications = get_regex_val(fh, @@modifications_re)
199
203
  @prots = get_prots(fh, self)
200
204
  fh.close
201
205
  end
@@ -456,7 +460,7 @@ class SpecID::Bioworks::Pep < Array
456
460
  first_scan = first_scan[0]
457
461
  last_scan = first_scan
458
462
  end
459
- return base_name, first_scan, last_scan
463
+ [base_name, first_scan, last_scan]
460
464
  end
461
465
 
462
466
  def file=(arg)