mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255)
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/sample_enzyme.rb DELETED
@@ -1,160 +0,0 @@
1
-
2
- module SpecIDXML; end
3
-
4
- require 'strscan'
5
-
6
- require 'spec_id_xml'
7
- require 'spec_id'
8
-
9
-
10
- class SampleEnzyme
11
- include SpecIDXML
12
-
13
- attr_accessor :name
14
- # amino acids after which to cleave
15
- attr_accessor :cut
16
- # cleave at 'cut' amino acids UNLESS it is followed by 'no_cut'
17
- attr_accessor :no_cut
18
- # 'C' or 'N'
19
- attr_accessor :sense
20
-
21
- # Currently, recognize:
22
- # trypsin
23
- # For other enzymes, you must set :cut, :no_cut, :name, and :sense
24
- # will yield the object if you want to set the values that way
25
- def initialize(name=nil)
26
- @num_missed_cleavages_regex = nil
27
- @sense = nil
28
- @cut = nil
29
- @no_cut = nil
30
- @name = name
31
- if @name
32
- # set the values if we recognize this name
33
- send("set_#{@name}".to_sym)
34
- end
35
- if block_given?
36
- yield(self)
37
- end
38
- end
39
-
40
- def set_trypsin
41
- @sense = 'C'
42
- @cut = 'KR'
43
- @no_cut = 'P'
44
- end
45
-
46
- def to_pepxml
47
- element_xml(:sample_enzyme, [:name]) do
48
- short_element_xml(:specificity, [:cut, :no_cut, :sense])
49
- end
50
- end
51
-
52
- # returns self
53
- def from_pepxml_node(node)
54
- self.name = node['name']
55
- ch = node.child
56
- self.cut = ch['cut']
57
- self.no_cut= ch['no_cut']
58
- self.sense = ch['sense']
59
- self
60
- end
61
-
62
- def self.from_pepxml_node(node)
63
- self.new.from_pepxml_node(node)
64
- end
65
-
66
- # takes an amino acid sequence (e.g., -.PEPTIDK.L)
67
- # returns the number of missed cleavages
68
- def num_missed_cleavages(aaseq)
69
- raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
70
- @num_missed_cleavages_regex =
71
- if @num_missed_cleavages_regex ; @num_missed_cleavages_regex
72
- else
73
- regex_string = "[#{@cut}]"
74
- if @no_cut and @no_cut != ''
75
- regex_string << "[^#{@no_cut}]"
76
- end
77
- /#{regex_string}/
78
- end
79
- arr = aaseq.scan(@num_missed_cleavages_regex)
80
- num = arr.size
81
- if aaseq[-1,1] =~ @num_missed_cleavages_regex
82
- num -= 1
83
- end
84
- num
85
- end
86
-
87
- # requires full sequence (with heads and tails)
88
- def num_tol_term(sequence)
89
- raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
90
- no_cut = @no_cut || ''
91
- num_tol = 0
92
- first, middle, last = SpecID::Pep.split_sequence(sequence)
93
- last_of_middle = middle[-1,1]
94
- first_of_middle = middle[0,1]
95
- if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
96
- num_tol += 1
97
- end
98
- if @cut.include?(last_of_middle) && !no_cut.include?(last) || last == '-'
99
- num_tol += 1
100
- end
101
- num_tol
102
- end
103
-
104
- # returns all peptides of missed cleavages <= 'missed_cleavages'
105
- # so 2 missed cleavages will return all no missed cleavage peptides
106
- # all 1 missed cleavages and all 2 missed cleavages.
107
- # options:
108
- def digest(string, missed_cleavages=0, options={})
109
- raise NotImplementedError if @sense == 'N'
110
- s = StringScanner.new(string)
111
- no_cut_regex = Regexp.new("[#{@no_cut}]")
112
- regex = Regexp.new("[#{@cut}]")
113
- peps = []
114
- last_pos = 0
115
- current_pep = ''
116
- loop do
117
- if s.eos?
118
- break
119
- end
120
- m = s.scan_until(regex)
121
- if m ## found a cut point
122
- last_pos = s.pos
123
- # is the next amino acid a no_cut?
124
- if string[s.pos,1] =~ no_cut_regex
125
- current_pep << m
126
- else
127
- # cut it
128
- current_pep << m
129
- peps << current_pep
130
- current_pep = ''
131
- end
132
- else ## didn't find a cut point
133
- current_pep << string[last_pos..-1]
134
- peps << current_pep
135
- break
136
- end
137
- end
138
- ## LOOP through and grab each set of missed cleavages from num down to 0
139
- all_sets_of_peps = []
140
- (0..missed_cleavages).to_a.reverse.each do |num_mc|
141
- all_sets_of_peps.push( *(get_missed_cleavages(peps, num_mc)) )
142
- end
143
- all_sets_of_peps
144
- end
145
-
146
- # takes an array of peptides and returns an array containing 'num' missed
147
- # cleavages
148
- # DOES NOT contain peptides that contain < num of missed cleavages
149
- # (i.e., will not return missed cleaveages of 1 or 2 if num == 3
150
- def get_missed_cleavages(ar_of_peptide_seqs, num)
151
- (0...(ar_of_peptide_seqs.size - num)).to_a.map do |i|
152
- ar_of_peptide_seqs[i,num+1].join
153
- end
154
- end
155
-
156
- def self.tryptic(string, missed_cleavages=0)
157
- self.new("trypsin").digest(string, missed_cleavages)
158
- end
159
-
160
- end
data/lib/scan_i.rb DELETED
@@ -1,21 +0,0 @@
1
-
2
- # http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
3
- class String
4
- def scan_i seq
5
- pos=0
6
- ndx=[]
7
- slen = seq.length
8
- while i=index(seq,pos)
9
- ndx << i
10
- pos = i + slen
11
- end
12
- ndx
13
- end
14
-
15
- #def scan_enum seq
16
- # self.enum_for(:scan, seq).map do
17
- # $~.offset(0)[0]
18
- # end
19
- #end
20
- end
21
-
data/lib/spec_id/aa_freqs.rb DELETED
@@ -1,170 +0,0 @@
1
- require 'fasta'
2
-
3
- module SpecID ; end
4
-
5
- class SpecID::AAFreqs
6
- # hash by capital one-letter amino acid symbols giving the frequency of
7
- # seeing that amino acid. Frequencies should add to 1.
8
- attr_accessor :aafreqs
9
-
10
- # fasta is fasta object!
11
- def initialize(fasta=nil)
12
- if fasta
13
- @aafreqs = calculate_frequencies(fasta.prots)
14
- end
15
- end
16
-
17
- # takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
18
- def calculate_frequencies(objs)
19
- hash = {}
20
- total_aas = 0
21
- ('A'..'Z').each do |x|
22
- hash[x] = 0
23
- end
24
- hash['*'] = 0
25
- objs.each do |obj|
26
- aaseq = obj.aaseq
27
- total_aas += aaseq.size
28
- aaseq.split('').each do |x|
29
- hash[x] += 1
30
- end
31
- end
32
- # normalize by total amount:
33
- hash.each do |k,v|
34
- hash[k] = hash[k].to_f / total_aas
35
- end
36
- # convert all strings to symbols:
37
- hash.each do |k,v|
38
- hash[k.to_sym] = hash.delete(k)
39
- end
40
- hash
41
- end
42
-
43
- # The expected probability for seeing that amino acid in a given length.
44
- # This calculates a lookup table (array) from 0 to highest_length of the
45
- # probability of seeing at least one amino acid (given its frequency, where
46
- # frequency is from 0 to 1)
47
- def self.probability_of_length_table(frequency, max_length)
48
- one_minus_freq = 1.0 - frequency.to_f
49
- lookup = Array.new(max_length + 1)
50
- (0..max_length).each do |len|
51
- lookup[len] = 1.0 - (one_minus_freq**len);
52
- end
53
- lookup
54
- end
55
-
56
- # takes an array of peptide strings
57
- # gives the actual number of peptides with at least one
58
- # gives the expected number of peptides given the probabilities in the
59
- # length lookup table.
60
- # currently ONLY takes at_least = 1
61
- # depends on @aafreqs
62
- # returns two numbers in array [actual, expected]
63
- # expected is a Float!!!
64
- def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
65
- if at_least > 1
66
- raise NotImplementedError, "can only do at_least=1 right now!"
67
- end
68
- one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
69
- amino_acid_as_st = amino_acid.to_s
70
- probs = []
71
- actual = 0
72
- expected = 0.0
73
- peptide_aaseqs.each do |pep|
74
- expected += (1.0 - (one_minus_freq**pep.size))
75
- if pep.include?(amino_acid_as_st)
76
- actual += 1
77
- end
78
- end
79
- [actual, expected]
80
- end
81
-
82
- # pep_objs respond to sequence?
83
- # also takes a hash of peptides keyed on :aaseq
84
- def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
85
- if pep_objs.is_a? Hash
86
- seqs = pep_objs.keys
87
- else
88
- seqs = pep_objs.map do |v|
89
- v.aaseq
90
- end
91
- end
92
- @aafreqs ||= {}
93
- @aafreqs[:C] = cyst_freq
94
- actual_and_expected_number(seqs, :C, 1)
95
- end
96
-
97
- ##
98
- =begin
99
-
100
- foreach my $pep (@$peps) {
101
- unless ($pep->prob() >= $prob_cutoff) {next;}
102
- my %freq = ();
103
- my $aa = $pep->AA_sequence();
104
- my $len = length($aa);
105
-
106
- ## EXPECTED probability for each length
107
- for (my $i = 0; $i < 20; $i++) {
108
- ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
109
- $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
110
- }
111
- ## FILTER any peptides we've already seen
112
- if ($seen{$aa}) { next; }
113
- else { $seen{$aa}++; }
114
-
115
- ## Fill in these values with zeroes:
116
- for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
117
-
118
- ## get the frequencies for each AA in each peptide:
119
- for (my $i = 0; $i < $len; $i++) {
120
- my $let = substr($aa, $i, 1);
121
- $tot_freq{$let}++;
122
- $pepc[$cnt][$an{$let}]++;
123
- }
124
- $cnt++;
125
- }
126
-
127
- ##############################################################
128
- # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
129
- ##############################################################
130
-
131
- ## What is the percentage of peptides containing at least 1 cysteine?
132
- my $atleast = 1;
133
-
134
- my @has;
135
- ## initialize
136
- for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
137
- my $tot = scalar(@pepc);
138
- foreach my $pep (@pepc) {
139
- for (my $index = 0; $index < 20; $index++) {
140
- if ($pep->[$index] >= $atleast) {
141
- $has[$index]++;
142
- }
143
- }
144
- }
145
-
146
-
147
- my @exp_sum = (); ## The total number of peptides I'd expect
148
- ## WE simply add up the peptides' probabilities
149
- ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
150
- foreach my $pep (@expected) {
151
- for (my $i = 0; $i < 20; $i++) {
152
- $exp_sum[$i] += $pep->[$i];
153
- }
154
- }
155
-
156
- my @obs = map { $_/$tot } @has;
157
- my @exp = map { $_/$tot } @exp_sum;
158
- print STDERR "*********************************************\n";
159
- print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
160
- print "[AA] [Observed] [Predicted]\n";
161
- for (my $i = 0; $i < 20; $i++) {
162
- print "$AA[$i] $obs[$i] $exp[$i]\n";
163
- }
164
- print STDERR "*********************************************\n";
165
-
166
-
167
-
168
- =end
169
-
170
- end