mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/sample_enzyme.rb DELETED
@@ -1,160 +0,0 @@
1
-
2
- module SpecIDXML; end
3
-
4
- require 'strscan'
5
-
6
- require 'spec_id_xml'
7
- require 'spec_id'
8
-
9
-
10
- class SampleEnzyme
11
- include SpecIDXML
12
-
13
- attr_accessor :name
14
- # amino acids after which to cleave
15
- attr_accessor :cut
16
- # cleave at 'cut' amino acids UNLESS it is followed by 'no_cut'
17
- attr_accessor :no_cut
18
- # 'C' or 'N'
19
- attr_accessor :sense
20
-
21
- # Currently, recognize:
22
- # trypsin
23
- # For other enzymes, you must set :cut, :no_cut, :name, and :sense
24
- # will yield the object if you want to set the values that way
25
- def initialize(name=nil)
26
- @num_missed_cleavages_regex = nil
27
- @sense = nil
28
- @cut = nil
29
- @no_cut = nil
30
- @name = name
31
- if @name
32
- # set the values if we recognize this name
33
- send("set_#{@name}".to_sym)
34
- end
35
- if block_given?
36
- yield(self)
37
- end
38
- end
39
-
40
- def set_trypsin
41
- @sense = 'C'
42
- @cut = 'KR'
43
- @no_cut = 'P'
44
- end
45
-
46
- def to_pepxml
47
- element_xml(:sample_enzyme, [:name]) do
48
- short_element_xml(:specificity, [:cut, :no_cut, :sense])
49
- end
50
- end
51
-
52
- # returns self
53
- def from_pepxml_node(node)
54
- self.name = node['name']
55
- ch = node.child
56
- self.cut = ch['cut']
57
- self.no_cut= ch['no_cut']
58
- self.sense = ch['sense']
59
- self
60
- end
61
-
62
- def self.from_pepxml_node(node)
63
- self.new.from_pepxml_node(node)
64
- end
65
-
66
- # takes an amino acid sequence (e.g., -.PEPTIDK.L)
67
- # returns the number of missed cleavages
68
- def num_missed_cleavages(aaseq)
69
- raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
70
- @num_missed_cleavages_regex =
71
- if @num_missed_cleavages_regex ; @num_missed_cleavages_regex
72
- else
73
- regex_string = "[#{@cut}]"
74
- if @no_cut and @no_cut != ''
75
- regex_string << "[^#{@no_cut}]"
76
- end
77
- /#{regex_string}/
78
- end
79
- arr = aaseq.scan(@num_missed_cleavages_regex)
80
- num = arr.size
81
- if aaseq[-1,1] =~ @num_missed_cleavages_regex
82
- num -= 1
83
- end
84
- num
85
- end
86
-
87
- # requires full sequence (with heads and tails)
88
- def num_tol_term(sequence)
89
- raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
90
- no_cut = @no_cut || ''
91
- num_tol = 0
92
- first, middle, last = SpecID::Pep.split_sequence(sequence)
93
- last_of_middle = middle[-1,1]
94
- first_of_middle = middle[0,1]
95
- if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
96
- num_tol += 1
97
- end
98
- if @cut.include?(last_of_middle) && !no_cut.include?(last) || last == '-'
99
- num_tol += 1
100
- end
101
- num_tol
102
- end
103
-
104
- # returns all peptides of missed cleavages <= 'missed_cleavages'
105
- # so 2 missed cleavages will return all no missed cleavage peptides
106
- # all 1 missed cleavages and all 2 missed cleavages.
107
- # options:
108
- def digest(string, missed_cleavages=0, options={})
109
- raise NotImplementedError if @sense == 'N'
110
- s = StringScanner.new(string)
111
- no_cut_regex = Regexp.new("[#{@no_cut}]")
112
- regex = Regexp.new("[#{@cut}]")
113
- peps = []
114
- last_pos = 0
115
- current_pep = ''
116
- loop do
117
- if s.eos?
118
- break
119
- end
120
- m = s.scan_until(regex)
121
- if m ## found a cut point
122
- last_pos = s.pos
123
- # is the next amino acid a no_cut?
124
- if string[s.pos,1] =~ no_cut_regex
125
- current_pep << m
126
- else
127
- # cut it
128
- current_pep << m
129
- peps << current_pep
130
- current_pep = ''
131
- end
132
- else ## didn't find a cut point
133
- current_pep << string[last_pos..-1]
134
- peps << current_pep
135
- break
136
- end
137
- end
138
- ## LOOP through and grab each set of missed cleavages from num down to 0
139
- all_sets_of_peps = []
140
- (0..missed_cleavages).to_a.reverse.each do |num_mc|
141
- all_sets_of_peps.push( *(get_missed_cleavages(peps, num_mc)) )
142
- end
143
- all_sets_of_peps
144
- end
145
-
146
- # takes an array of peptides and returns an array containing 'num' missed
147
- # cleavages
148
- # DOES NOT contain peptides that contain < num of missed cleavages
149
- # (i.e., will not return missed cleaveages of 1 or 2 if num == 3
150
- def get_missed_cleavages(ar_of_peptide_seqs, num)
151
- (0...(ar_of_peptide_seqs.size - num)).to_a.map do |i|
152
- ar_of_peptide_seqs[i,num+1].join
153
- end
154
- end
155
-
156
- def self.tryptic(string, missed_cleavages=0)
157
- self.new("trypsin").digest(string, missed_cleavages)
158
- end
159
-
160
- end
data/lib/scan_i.rb DELETED
@@ -1,21 +0,0 @@
1
-
2
- # http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
3
- class String
4
- def scan_i seq
5
- pos=0
6
- ndx=[]
7
- slen = seq.length
8
- while i=index(seq,pos)
9
- ndx << i
10
- pos = i + slen
11
- end
12
- ndx
13
- end
14
-
15
- #def scan_enum seq
16
- # self.enum_for(:scan, seq).map do
17
- # $~.offset(0)[0]
18
- # end
19
- #end
20
- end
21
-
data/lib/spec_id/aa_freqs.rb DELETED
@@ -1,170 +0,0 @@
1
- require 'fasta'
2
-
3
- module SpecID ; end
4
-
5
- class SpecID::AAFreqs
6
- # hash by capital one-letter amino acid symbols giving the frequency of
7
- # seeing that amino acid. Frequencies should add to 1.
8
- attr_accessor :aafreqs
9
-
10
- # fasta is fasta object!
11
- def initialize(fasta=nil)
12
- if fasta
13
- @aafreqs = calculate_frequencies(fasta.prots)
14
- end
15
- end
16
-
17
- # takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
18
- def calculate_frequencies(objs)
19
- hash = {}
20
- total_aas = 0
21
- ('A'..'Z').each do |x|
22
- hash[x] = 0
23
- end
24
- hash['*'] = 0
25
- objs.each do |obj|
26
- aaseq = obj.aaseq
27
- total_aas += aaseq.size
28
- aaseq.split('').each do |x|
29
- hash[x] += 1
30
- end
31
- end
32
- # normalize by total amount:
33
- hash.each do |k,v|
34
- hash[k] = hash[k].to_f / total_aas
35
- end
36
- # convert all strings to symbols:
37
- hash.each do |k,v|
38
- hash[k.to_sym] = hash.delete(k)
39
- end
40
- hash
41
- end
42
-
43
- # The expected probability for seeing that amino acid in a given length.
44
- # This calculates a lookup table (array) from 0 to highest_length of the
45
- # probability of seeing at least one amino acid (given its frequency, where
46
- # frequency is from 0 to 1)
47
- def self.probability_of_length_table(frequency, max_length)
48
- one_minus_freq = 1.0 - frequency.to_f
49
- lookup = Array.new(max_length + 1)
50
- (0..max_length).each do |len|
51
- lookup[len] = 1.0 - (one_minus_freq**len);
52
- end
53
- lookup
54
- end
55
-
56
- # takes an array of peptide strings
57
- # gives the actual number of peptides with at least one
58
- # gives the expected number of peptides given the probabilities in the
59
- # length lookup table.
60
- # currently ONLY takes at_least = 1
61
- # depends on @aafreqs
62
- # returns two numbers in array [actual, expected]
63
- # expected is a Float!!!
64
- def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
65
- if at_least > 1
66
- raise NotImplementedError, "can only do at_least=1 right now!"
67
- end
68
- one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
69
- amino_acid_as_st = amino_acid.to_s
70
- probs = []
71
- actual = 0
72
- expected = 0.0
73
- peptide_aaseqs.each do |pep|
74
- expected += (1.0 - (one_minus_freq**pep.size))
75
- if pep.include?(amino_acid_as_st)
76
- actual += 1
77
- end
78
- end
79
- [actual, expected]
80
- end
81
-
82
- # pep_objs respond to sequence?
83
- # also takes a hash of peptides keyed on :aaseq
84
- def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
85
- if pep_objs.is_a? Hash
86
- seqs = pep_objs.keys
87
- else
88
- seqs = pep_objs.map do |v|
89
- v.aaseq
90
- end
91
- end
92
- @aafreqs ||= {}
93
- @aafreqs[:C] = cyst_freq
94
- actual_and_expected_number(seqs, :C, 1)
95
- end
96
-
97
- ##
98
- =begin
99
-
100
- foreach my $pep (@$peps) {
101
- unless ($pep->prob() >= $prob_cutoff) {next;}
102
- my %freq = ();
103
- my $aa = $pep->AA_sequence();
104
- my $len = length($aa);
105
-
106
- ## EXPECTED probability for each length
107
- for (my $i = 0; $i < 20; $i++) {
108
- ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
109
- $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
110
- }
111
- ## FILTER any peptides we've already seen
112
- if ($seen{$aa}) { next; }
113
- else { $seen{$aa}++; }
114
-
115
- ## Fill in these values with zeroes:
116
- for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
117
-
118
- ## get the frequencies for each AA in each peptide:
119
- for (my $i = 0; $i < $len; $i++) {
120
- my $let = substr($aa, $i, 1);
121
- $tot_freq{$let}++;
122
- $pepc[$cnt][$an{$let}]++;
123
- }
124
- $cnt++;
125
- }
126
-
127
- ##############################################################
128
- # ANALYSIS 2: Fraction of Peptides containing X Amino Acid
129
- ##############################################################
130
-
131
- ## What is the percentage of peptides containing at least 1 cysteine?
132
- my $atleast = 1;
133
-
134
- my @has;
135
- ## initialize
136
- for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
137
- my $tot = scalar(@pepc);
138
- foreach my $pep (@pepc) {
139
- for (my $index = 0; $index < 20; $index++) {
140
- if ($pep->[$index] >= $atleast) {
141
- $has[$index]++;
142
- }
143
- }
144
- }
145
-
146
-
147
- my @exp_sum = (); ## The total number of peptides I'd expect
148
- ## WE simply add up the peptides' probabilities
149
- ## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
150
- foreach my $pep (@expected) {
151
- for (my $i = 0; $i < 20; $i++) {
152
- $exp_sum[$i] += $pep->[$i];
153
- }
154
- }
155
-
156
- my @obs = map { $_/$tot } @has;
157
- my @exp = map { $_/$tot } @exp_sum;
158
- print STDERR "*********************************************\n";
159
- print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
160
- print "[AA] [Observed] [Predicted]\n";
161
- for (my $i = 0; $i < 20; $i++) {
162
- print "$AA[$i] $obs[$i] $exp[$i]\n";
163
- }
164
- print STDERR "*********************************************\n";
165
-
166
-
167
-
168
- =end
169
-
170
- end