mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/lib/fasta.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'sample_enzyme'
1
2
 
2
3
  class String
3
4
 
@@ -7,8 +8,10 @@ class String
7
8
  end
8
9
  end
9
10
 
11
+ # modifies and returns self
10
12
  def shuffle!
11
13
  each_index {|j| i = rand(size-j); self[j], self[j+i] = self[j+i], self[j]}
14
+ self
12
15
  end
13
16
 
14
17
  def shuffle
@@ -44,6 +47,7 @@ class Fasta
44
47
  # Checks that the first character per line is '>' or character class [A-Za-z*]
45
48
  # returns a fasta object for stringing commands
46
49
  def read_file(fn)
50
+ first_char_re = /[A-Za-z*]/o
47
51
  obj = nil
48
52
  regex = /(\r\n)|\n/o
49
53
  fh = File.new(fn).binmode
@@ -57,7 +61,7 @@ class Fasta
57
61
  obj = Prot.new
58
62
  @prots << obj
59
63
  obj.header = line.dup
60
- elsif first_char =~ /[A-Za-z*]/
64
+ elsif first_char =~ first_char_re
61
65
  obj.aaseq << line.chomp
62
66
  else
63
67
  raise "Line not in fasta format (between arrows): -->#{line}<--"
@@ -105,26 +109,40 @@ class Fasta
105
109
  # returns a new fasta object using some fraction of proteins randomly
106
110
  # selected (fraction may be > 1). Always rounds up. Will not choose a
107
111
  # protein twice unless all other proteins have been chosen
108
- def fraction_of_prots(fraction=1)
109
- fasta_fraction = nil
110
- if fraction == 1
111
- fasta_fraction = self.dup
112
- else
113
- new_num = (fraction.to_f * self.prots.size).ceil
114
- arr = []
115
- prots.each_with_index do |prot,i|
116
- arr << i << prot
112
+ #
113
+ # prefix_proc ensures that a unique header is given even if multiple
114
+ # fraction of proteins are being created
115
+ # fraction_cnt = (prot_cnt/num_prots).floor.to_i
116
+ # so for the first n proteins, it will be 0,
117
+ # the 2n proteins will be 1, etc.
118
+ # e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
119
+ # would give headers like this: >f0_<some_real_header>,
120
+ # >f1_<some_real_header>, ...
121
+ def fraction_of_prots(fraction=1, prefix_proc=nil)
122
+ new_num = (fraction.to_f * self.prots.size).ceil
123
+ arr = []
124
+ orig_num_prots = @prots.size
125
+
126
+ # initialize
127
+ new_prots = @prots.map {|prt| prt.dup }
128
+ frac_cnt = 0
129
+ ind_cnt = 0
130
+ prt_cnt = orig_num_prots
131
+ while ind_cnt < new_num
132
+ arr << new_prots.delete_at(rand(new_prots.size))
133
+ if prefix_proc
134
+ prefix = prefix_proc.call(frac_cnt)
135
+ arr.last.header_prefix!(prefix)
117
136
  end
118
- hash = Hash[*arr]
119
- size = prots.size
120
- new_arr = []
121
- while new_arr.size <= new_num
122
- new_arr.push( hash.delete( rand(hash.size/2) ) )
123
- if hash.size == 0 then hash = Hash[*arr] end
137
+ prt_cnt -= 1 # index
138
+ if prt_cnt == 0
139
+ frac_cnt += 1
140
+ new_prots = @prots.map {|prt| prt.dup }
141
+ prt_cnt = orig_num_prots
124
142
  end
125
- fasta_fraction = Fasta.new(new_arr)
143
+ ind_cnt += 1
126
144
  end
127
- fasta_fraction
145
+ fasta_fraction = Fasta.new(arr)
128
146
  end
129
147
 
130
148
  # Convenience method for modifying some fraction of the proteins of a file
@@ -204,18 +222,32 @@ class Fasta
204
222
  other
205
223
  end
206
224
 
225
+ # method = :shuffle! | :reverse!
226
+ def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
227
+ if tryptic_peptides
228
+ @prots.each {|prot| prot.tryptic_peptides!( method_as_symbol) }
229
+ else
230
+ @prots.each {|prot| prot.aaseq!(method_as_symbol) }
231
+ end
232
+ end
233
+
207
234
  # shuffles the aa sequence of each protein (each protein within itself)
208
235
  def aaseq_shuffle!
209
- @prots.each do |prot|
210
- prot.shuffle!
211
- end
236
+ @prots.each {|prot| prot.shuffle! }
212
237
  end
213
238
 
214
239
  # inverts (reverses) the aa sequence of each protein (each protein within itself)
215
240
  def aaseq_invert!
216
- @prots.each do |prot|
217
- prot.invert!
218
- end
241
+ @prots.each {|prot| prot.invert! }
242
+ end
243
+
244
+
245
+ def aaseq_invert_tryptic_peptides!
246
+ @prots.each {|prot| prot.invert_tryptic_peptides! }
247
+ end
248
+
249
# Shuffles the residues inside each tryptic peptide of every protein by
# delegating to Prot#shuffle_tryptic_peptides!.
# (Previously this called invert_tryptic_peptides!, so "shuffle" silently
# reversed instead.)
# Returns @prots (the return value of each).
def aaseq_shuffle_tryptic_peptides!
  @prots.each {|prot| prot.shuffle_tryptic_peptides! }
end
220
252
 
221
253
  def header_prefix!(prefix)
@@ -264,6 +296,37 @@ class Fasta::Prot
264
296
  end
265
297
  end
266
298
 
299
+ # convenience
300
+ def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
301
+ def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end
302
+
303
# Rearranges the residues of each tryptic peptide, as given by
# SampleEnzyme.tryptic(@aaseq), and stores the re-joined result in @aaseq.
# [SampleEnzyme.tryptic is expected to cut after K or R but not if followed
# by a P]
#
# method_as_symbol is sent to each peptide fragment (a String):
#     :reverse | :shuffle  OR  :reverse! | :shuffle!
#
# The C-terminal residue of every peptide is held in place and only the
# residues before it are rearranged.  If the final peptide does not end in
# K or R it is rearranged in its entirety.
#
# e.g., with method_as_symbol = :reverse (given the cut rule above):
#     aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
#          -> 'CBAKEDCREDKEGDKYXWRRKEDR'
#
# Returns the new @aaseq.  When the digest yields no peptides @aaseq is
# returned untouched (this used to raise NoMethodError on nil).
def tryptic_peptides!(method_as_symbol)
  peps = SampleEnzyme.tryptic(@aaseq)
  return @aaseq if peps.empty?

  ends_in_RK = /[KR]/o

  ## if the last peptide doesn't end in R or K we want to flip it completely
  last_pep_special = nil
  if peps.last[-1,1] !~ ends_in_RK
    last_pep_special = peps.pop
  end

  # pep[-1,1] is a one-character String on every Ruby version (pep[-1] is a
  # Fixnum on 1.8)
  new_peps = peps.map {|pep| pep[0..-2].send(method_as_symbol) << pep[-1,1] }
  new_peps << last_pep_special.send(method_as_symbol) if last_pep_special
  @aaseq = new_peps.join
end
324
+
325
+ # takes :reverse! | :shuffle!
326
+ def aaseq!(method_as_symbol)
327
+ @aaseq.send(method_as_symbol)
328
+ end
329
+
267
330
  def invert!
268
331
  @aaseq.reverse!
269
332
  end
@@ -323,3 +386,4 @@ end
323
386
  # end
324
387
  # end
325
388
  #end
389
+
data/lib/gi.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'open-uri'
2
+ require 'rexml/document'
3
+ require 'rexml/streamlistener'
4
+
5
+ $ANNOTS = []
6
+
7
# REXML stream listener that collects the text of every
# <Item Name="Title" ...> element it sees, in document order, into
# #annotations.
class GIListener
  include REXML
  include REXML::StreamListener

  # Array of title strings collected so far (one per Title item)
  attr_accessor :annotations

  def initialize
    @get_title = false
    @annotations = []
  end

  # Arms title capture when a Title item opens.
  def tag_start(name, attributes)
    @get_title = true if name == "Item" && attributes["Name"] == "Title"
  end

  # Records the first text node seen after a Title item opened.
  def text(text)
    return unless @get_title
    @annotations.push text.chomp
    @get_title = false
  end

  # If a Title item closes before any text was seen (an empty title), record
  # an empty string.  Otherwise the capture flag would stay armed and the
  # next unrelated text node would be stored as the title, putting the
  # annotations out of step with the ids that were requested.
  def tag_end(name)
    return unless @get_title
    @annotations.push ""
    @get_title = false
  end

end
36
+
37
+
38
+
39
# Looks up protein annotations (titles) for GI numbers through the NCBI
# eutils esummary service.
class GI
  # maximum number of ids sent in a single esummary request
  BATCH_SIZE = 500
  ESUMMARY_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id="

  # takes an array of gi numbers and returns an array of annotation strings
  # (the annotations of every batch, concatenated in request order).
  # This allows use of the batch search mode on NCBI: the ids are sent
  # BATCH_SIZE at a time.
  #
  # The given array is not modified.  An empty array gives an empty array
  # without contacting NCBI.
  #
  # (Previously this method always returned nil because the per-batch result
  # was discarded, sent BATCH_SIZE + 1 ids per request, and emptied the
  # caller's array.)
  def self.gi2annot(list_of_gi_numbers)
    annots = []
    start = 0
    while start < list_of_gi_numbers.size
      batch = list_of_gi_numbers[start, BATCH_SIZE]
      annots.concat( fetch_annotations(ESUMMARY_URL + batch.join(",")) )
      start += BATCH_SIZE
    end
    annots
  end

  # Fetches a single esummary url and returns its list of annotation strings.
  # Uses URI#open (from open-uri) rather than Kernel#open, which no longer
  # opens urls on current rubies.
  def self.fetch_annotations(url)
    URI.parse(url).open {|handle| parse_etool_output(handle) }
  end

  # Returns a list of Annotation strings
  # NOTE: visibility keywords such as `protected` do not apply to `def self.`
  # methods, so this has always been publicly callable; it is left that way.
  def self.parse_etool_output(handle)
    listener = GIListener.new
    parser = REXML::Parsers::StreamParser.new(handle, listener)
    parser.parse
    listener.annotations
  end

end
69
+
70
+
71
+
72
+ =begin
73
+
74
+ <?xml version="1.0" encoding="ISO-8859-1"?>
75
+ <!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
76
+ <eSummaryResult>
77
+
78
+ <DocSum>
79
+ <Id>24115498</Id>
80
+ <Item Name="Caption" Type="String">NP_710008</Item>
81
+ <Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
82
+ <Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
83
+ <Item Name="Gi" Type="Integer">24115498</Item>
84
+ <Item Name="CreateDate" Type="String">2002/10/16</Item>
85
+
86
+ <Item Name="UpdateDate" Type="String">2006/04/03</Item>
87
+ <Item Name="Flags" Type="Integer">512</Item>
88
+ <Item Name="TaxId" Type="Integer">198214</Item>
89
+ <Item Name="Status" Type="String">live</Item>
90
+ <Item Name="ReplacedBy" Type="String"></Item>
91
+ <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
92
+ </DocSum>
93
+
94
+
95
+ <DocSum>
96
+ <Id>434011</Id>
97
+ <Item Name="Caption" Type="String">CAA24741</Item>
98
+
99
+ <Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
100
+ <Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
101
+ <Item Name="Gi" Type="Integer">434011</Item>
102
+ <Item Name="CreateDate" Type="String">1983/12/06</Item>
103
+ <Item Name="UpdateDate" Type="String">2005/04/18</Item>
104
+ <Item Name="Flags" Type="Integer">0</Item>
105
+ <Item Name="TaxId" Type="Integer">562</Item>
106
+ <Item Name="Status" Type="String">live</Item>
107
+ <Item Name="ReplacedBy" Type="String"></Item>
108
+
109
+ <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
110
+ </DocSum>
111
+
112
+ </eSummaryResult>
113
+
114
+ =end
data/lib/roc.rb CHANGED
@@ -6,18 +6,22 @@
6
6
  # receiver-operator-characteristics, precision-recall, etc.. Some definitions
7
7
  # from (Davis & Goadrich. Proceedings of the 23rd
8
8
  # International Conference on Machine Learning, Pittsburgh, PA, 2006):
9
- # Recall = TP/(TP+FN)
10
- # Precision = TP/(TP+FP)
9
+ # Recall = TP/(TP+FN) [aka, Sensitivity]
10
+ # Precision = TP/(TP+FP) [aka, Positive Predictive Value]
11
11
  # True Positive Rate = TP/(TP+FN)
12
12
  # False Positive Rate = FP/(FP+TN)
13
13
  #
14
14
  # Keys to some abbreviations used in this class:
15
+ # pred = number predicted to be correct
15
16
  # tps = number of true positives
16
- # fpr = false positive rate
17
- # fpr2 = false positive rate calculated as: FP/(FP+TP)
17
+ # ppv = positive predictive value
18
+ # om_ppv = one minus positive predictive value = FP/(TP+FP)
18
19
  #
19
20
  # NOTE: this class assumes that lower scores are better. Negate your scores
20
21
  # if this is not the case.
22
+ #
23
+ # For estimation of false positive rates using a decoy database strategy, see
24
+ # the DecoyROC class.
21
25
  class ROC
22
26
 
23
27
 
@@ -38,82 +42,84 @@ class ROC
38
42
  area
39
43
  end
40
44
 
41
- # Returns (#tp, #yval) where #tp = number of true positives and yval is the
42
- # type of classification analysis (as symbol) (accepts: precision, fpr2,
43
- # fpr2_times2)
44
- def by_tps(yval, tp, fp)
45
- new_method = "tps_and_#{yval}".to_sym
46
- send(new_method, tp, fp)
47
- end
48
-
49
- # Returns (num_true_positives(ints), precision_arr(floats))
50
- # gives the precision TP/(TP+FP) as a function of number of true positives.
51
- # True positive values that are equal will cause jumps in the array values
52
- # of true positives returned. If false negatives are known, then a
53
- # recall-precision plot could be made (recall is TP/(TP+FN).
54
- # e.g. tps = [1,2,4] # -> jumps from 2 to 4
55
- def tps_and_precision(tp, fp)
56
- prc = proc {|tp_i, fp_i| (tp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f) }
57
- _tps_calc(tp, fp, prc)
58
- end
59
-
60
- # Returns (num_true_positives(ints), false_positive_rate(floats))
61
- # calculated as ( FP/(FP+TP) ) as a function of number of true positives
62
- # true positive values that are equal will cause jumps in the array values
63
- # of true positives returned
64
- # e.g. tps = [1,2,4] # -> jumps from 2 to 4
65
- def tps_and_fpr2(tp, fp)
66
- prc = proc {|tp_i,fp_i| (fp_i).to_f/((tp_i+1).to_f + fp_i.to_f) }
67
- _tps_calc(tp, fp, prc)
45
+ # given an array of doublets where each doublet is a value and a boolean,
46
+ # sorts the list and divides it into two arrays (tps, fps) of the values.
47
+ # The output can then be fed into many of the other routines.
48
+ def prep_list(list)
49
+ tp = []; fp = []
50
+ list.each do |dbl|
51
+ if dbl[1]
52
+ tp << dbl
53
+ else
54
+ fp << dbl
55
+ end
56
+ end
57
+ [tp,fp].collect do |arr|
58
+ arr.collect! {|dbl| dbl[0] }
59
+ arr.sort
60
+ end
68
61
  end
69
62
 
70
63
  # Returns (num_true_positives(ints), ppv(floats)) where ppv = TP/(TP+FP)
71
- def _tps_calc(tp, fp, prc)
64
+ def tps_and_ppv(tp, fp)
72
65
  tp_i = 0
73
66
  fp_i = 0
74
67
  x = []
75
68
  y = []
69
+ num_tps = 0
76
70
 
77
71
  while tp_i < tp.size
78
72
  while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
79
73
  fp_i += 1
80
74
  end
81
75
  unless tp[tp_i] == tp[tp_i+1]
82
- x << tp_i+1
83
- #y << (fp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f)
84
- y << prc.call(tp_i, fp_i)
76
+ # get the correct number of each
77
+ num_tps = tp_i + 1
78
+ num_fps = fp_i
79
+
80
+ x << num_tps
81
+ y << num_tps.to_f/(num_tps+num_fps)
82
+
85
83
  end
86
84
  tp_i += 1
87
85
  end
88
86
  return x, y
89
87
  end
88
+ end
90
89
 
91
- # Calculates the fpr based on Peng et. al. J. Proteome Res. 2003, 2, 43-50.
92
- # fpr = 2[#rev/(#rev+#real) == 2[FP/(FP+TP)]
93
- # This merely multiplies the fpr by 2.
94
- def tps_and_fpr2_times2(tp, fp)
95
- x, y = tps_and_fpr2(tp,fp)
96
- y.collect! {|v| v*2 }
97
- return x, y
98
- end
90
+ # For calculating precision given lists of hits and decoy hits. The hits are
91
+ # assumed to have false positives within them that can be estimated from the
92
+ # number of decoy hits at the same rate
93
+ class DecoyROC < ROC
99
94
 
100
- # given an array of doublets where each doublet is a value and a boolean,
101
- # sorts the list and divides it into two arrays (tps, fps) of the values.
102
- # The output can then be fed into many of the other routines.
103
- def prep_list(list)
104
- tp = []; fp = []
105
- list.each do |dbl|
106
- if dbl[1]
107
- tp << dbl
108
- else
109
- fp << dbl
95
+ # returns the [num_hits, num_tps, precision] as a function of true
96
+ # positives. Method will return precisely what is calculated (meaning some
97
+ # answers may seem bizarre if you have better decoy hits than real).
98
+ def pred_and_tps_and_ppv(hits, decoy_hits)
99
+ hits_i = 0
100
+ decoy_i = 0
101
+
102
+ num_hits_ar = []
103
+ num_tps_ar = []
104
+ ppv_ar = []
105
+
106
+ while hits_i < hits.size
107
+ while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
108
+ decoy_i += 1
110
109
  end
110
+ unless hits[hits_i] == hits[hits_i+1]
111
+ ## determine the number of false positives
112
+ tot_num_hits = hits_i+1
113
+ num_tps = tot_num_hits - decoy_i
114
+
115
+ num_hits_ar << tot_num_hits
116
+ num_tps_ar << num_tps
117
+ ppv_ar << ( num_tps.to_f/tot_num_hits )
118
+
119
+ end
120
+ hits_i += 1
111
121
  end
112
- [tp,fp].collect do |arr|
113
- arr.collect! {|dbl| dbl[0] }
114
- arr.sort
115
- end
122
+ [num_hits_ar, num_tps_ar, ppv_ar]
116
123
  end
117
124
 
118
-
119
125
  end
@@ -0,0 +1,166 @@
1
+ require 'fasta'
2
+
3
+ class SpecID::AAFreqs
4
+ # a fasta object
5
+ attr_accessor :fasta
6
+ # hash by capital one-letter amino acid symbols giving the frequency of
7
+ # seeing that amino acid. Frequencies should add to 1.
8
+ attr_accessor :aafreqs
9
+
10
+ def initialize(fasta_file=nil)
11
+ if fasta_file
12
+ @fasta = Fasta.new.read_file(fasta_file)
13
+ @aafreqs = calculate_frequencies(@fasta)
14
+ end
15
+ end
16
+
17
# creates an aafreqs hash based on fasta object
#
# fasta must respond to prots, and each prot to aaseq (a String).
# Returns a hash keyed by one-letter symbols (:A .. :Z and :*) whose values
# are count/total_residues as Floats.  Every capital letter and '*' is
# always present (0.0 if never seen).
#
# Any other character met in a sequence (e.g. a lowercase letter, which
# Fasta#read_file accepts) is tallied under its own symbol instead of
# raising on a missing key.
# NOTE(review): confirm whether lowercase residues should rather be folded
# into their capital letter.
#
# The result is built in a fresh hash; the old code added and deleted keys
# of the hash it was iterating, which raises on ruby >= 1.9.
def calculate_frequencies(fasta)
  counts = {}
  ('A'..'Z').each {|aa| counts[aa] = 0 }
  counts['*'] = 0

  total_aas = 0
  fasta.prots.each do |prot|
    aaseq = prot.aaseq
    total_aas += aaseq.size
    aaseq.split('').each do |aa|
      counts[aa] = (counts[aa] || 0) + 1
    end
  end

  # normalize by total amount and key by symbol:
  freqs = {}
  counts.each do |aa, cnt|
    freqs[aa.to_sym] = cnt.to_f / total_aas
  end
  freqs
end
42
+
43
+ # The expected probability for seeing that amino acid in a given length.
44
+ # This calculates a lookup table (array) from 0 to highest_length of the
45
+ # probability of seeing at least one amino acid (given its frequency, where
46
+ # frequency is from 0 to 1)
47
+ def self.probability_of_length_table(frequency, max_length)
48
+ one_minus_freq = 1.0 - frequency.to_f
49
+ lookup = Array.new(max_length + 1)
50
+ (0..max_length).each do |len|
51
+ lookup[len] = 1.0 - (one_minus_freq**len);
52
+ end
53
+ lookup
54
+ end
55
+
56
# takes an array of peptide strings and returns two numbers in an array,
# [actual, expected]:
#   actual   = the number of peptides containing at least one amino_acid
#              (Integer)
#   expected = the number of peptides expected to contain at least one,
#              i.e. the sum over peptides of 1 - (1 - freq)**length
#              (a Float!!!)
# depends on @aafreqs (freq is @aafreqs[amino_acid.to_sym])
# currently ONLY supports at_least = 1: the at_least argument is accepted
# but not consulted.
def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
  one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
  amino_acid_as_st = amino_acid.to_s
  actual = 0
  expected = 0.0
  peptide_aaseqs.each do |pep|
    # probability of seeing the amino acid at least once in pep.size draws
    expected += (1.0 - (one_minus_freq**pep.size))
    actual += 1 if pep.include?(amino_acid_as_st)
  end
  [actual, expected]
end
78
+
79
+ # pep_objs respond to sequence?
80
+ def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
81
+ @aafreqs ||= {}
82
+ @aafreqs[:C] = cyst_freq
83
+ seqs = pep_objs.map do |v|
84
+ if v.sequence =~ /\.([\w\*]+)\./
85
+ $1
86
+ else
87
+ abort v.sequence.to_s + " could not be matched!"
88
+ end
89
+ end
90
+ actual_and_expected_number(seqs, :C, 1)
91
+ end
92
+
93
+ ##
94
+ =begin
95
+
96
+ foreach my $pep (@$peps) {
97
+ unless ($pep->prob() >= $prob_cutoff) {next;}
98
+ my %freq = ();
99
+ my $aa = $pep->AA_sequence();
100
+ my $len = length($aa);
101
+
102
+ ## EXPECTED probability for each length
103
+ for (my $i = 0; $i < 20; $i++) {
104
+ ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
105
+ $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
106
+ }
107
+ ## FILTER any peptides we've already seen
108
+ if ($seen{$aa}) { next; }
109
+ else { $seen{$aa}++; }
110
+
111
+ ## Fill in these values with zeroes:
112
+ for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
113
+
114
+ ## get the frequencies for each AA in each peptide:
115
+ for (my $i = 0; $i < $len; $i++) {
116
+ my $let = substr($aa, $i, 1);
117
+ $tot_freq{$let}++;
118
+ $pepc[$cnt][$an{$let}]++;
119
+ }
120
+ $cnt++;
121
+ }
122
+
123
+ ##############################################################
124
+ # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
125
+ ##############################################################
126
+
127
+ ## What is the percentage of peptides containing at least 1 cysteine?
128
+ my $atleast = 1;
129
+
130
+ my @has;
131
+ ## initialize
132
+ for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
133
+ my $tot = scalar(@pepc);
134
+ foreach my $pep (@pepc) {
135
+ for (my $index = 0; $index < 20; $index++) {
136
+ if ($pep->[$index] >= $atleast) {
137
+ $has[$index]++;
138
+ }
139
+ }
140
+ }
141
+
142
+
143
+ my @exp_sum = (); ## The total number of peptides I'd expect
144
+ ## WE simply add up the peptides' probabilities
145
+ ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
146
+ foreach my $pep (@expected) {
147
+ for (my $i = 0; $i < 20; $i++) {
148
+ $exp_sum[$i] += $pep->[$i];
149
+ }
150
+ }
151
+
152
+ my @obs = map { $_/$tot } @has;
153
+ my @exp = map { $_/$tot } @exp_sum;
154
+ print STDERR "*********************************************\n";
155
+ print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
156
+ print "[AA] [Observed] [Predicted]\n";
157
+ for (my $i = 0; $i < 20; $i++) {
158
+ print "$AA[$i] $obs[$i] $exp[$i]\n";
159
+ }
160
+ print STDERR "*********************************************\n";
161
+
162
+
163
+
164
+ =end
165
+
166
+ end
@@ -15,12 +15,15 @@ module SpecIDXML; end
15
15
  class SpecID::Bioworks
16
16
  # Regular expressions
17
17
  @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
18
+ @@modifications_re = /<modifications>(.*)<\/modifications>/o
18
19
  @@protein_re = /<protein>/o
19
20
  @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
20
21
  @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
21
22
 
22
23
 
23
24
  attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
25
+ # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
26
+ attr_accessor :modifications
24
27
  attr_writer :peps
25
28
 
26
29
  def hi_prob_best ; false end
@@ -196,6 +199,7 @@ class SpecID::Bioworks
196
199
  @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
197
200
  end
198
201
  @version = get_regex_val(fh, @@bioworksinfo_re)
202
+ @modifications = get_regex_val(fh, @@modifications_re)
199
203
  @prots = get_prots(fh, self)
200
204
  fh.close
201
205
  end
@@ -456,7 +460,7 @@ class SpecID::Bioworks::Pep < Array
456
460
  first_scan = first_scan[0]
457
461
  last_scan = first_scan
458
462
  end
459
- return base_name, first_scan, last_scan
463
+ [base_name, first_scan, last_scan]
460
464
  end
461
465
 
462
466
  def file=(arg)