mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,368 @@
1
+ require 'transmem'
2
+ require 'xml_style_parser'
3
+
4
# Forward declaration so the nested TopPred::Index class can be opened before
# the body of TopPred itself is defined further down.
class TopPred ; end

# A Hash that fills itself from a toppred output file (see #initialize).
#
# Obtaining toppred output from the web portal
# (http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html):
#   * uncheck 'Produce hydrophobicity graph image (-g)'
#   * choose 'Xml' or 'New: new text' output
#   * type in your email, then hit 'Run toppred'
#
# NOTE: this class will probably change its interface some in the future.
class TopPred::Index < Hash
  include TransmemIndex

  # we need to match whatever function toppred uses to generate identifiers if
  # we want derivative processes to be fast and accurate
  #
  # Takes the text of +reference+ up to (not including) its first space and
  # replaces every character outside [0-9a-zA-Z] with '_'.
  # Returns nil when +reference+ is nil/false.
  def reference_to_key(reference)
    return nil unless reference
    space_index = reference.index(' ')
    first_word = space_index ? reference[0...space_index] : reference
    first_word.gsub(/[^0-9a-zA-Z]/,'_')
  end

  # file = path to a toppred output file
  # kind = how to build the index; only :default is supported (anything else
  # aborts the process).  The index is populated in place by
  # TopPred.default_index.
  def initialize(file, kind=:default)
    case kind
    when :default
      TopPred.default_index(file, self)
    else
      abort "can't do #{kind}"
    end
  end
end
48
+
49
class TopPred
  include TransmemIndex

  # Builds the default index for +file+: sniffs the file type, instantiates
  # the matching parser and lets it fill +index+ (a fresh Hash unless one is
  # given).  Returns whatever the parser's file_to_index returns.
  def self.default_index(file, index={})
    parser_type = TopPred::Parser.filetype(file)
    parser = TopPred::Parser.new(parser_type)
    parser.file_to_index(file, index)
  end

end
58
+
59
module TopPred::Parser
  # Looks only at the first line of +file+.
  # returns :xml or :text (nil if the first line matches neither)
  def self.filetype(file)
    File.open(file) do |fh|
      first_line = fh.gets
      case first_line
      when /<\?xml version.*>/ then :xml
      when /Algorithm specific/ then :text
      end
    end
  end

  # Factory: returns a new parser object for the given type.
  # parser_type = :xml or :text (anything else aborts the process)
  def self.new(parser_type=:xml)
    case parser_type
    when :xml
      TopPred::Parser_XML.new
    when :text
      TopPred::Parser_Text.new
    else
      abort "don't recognize parser type: #{parser_type}"
    end
  end

  # Opens +file+ and hands the filehandle to the including parser's to_index.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end

  # Adds the amino acid subsequence covered by each segment to that segment.
  #
  # segments is either a list of arrays [prob, first, last] (the subsequence
  # is pushed on the end, giving [prob, first, last, aaseq]) or a list of
  # hashes with :start and :stop (the subsequence is stored under :aaseq).
  # The style is decided by looking at the first segment only.
  # first/last (start/stop) are '1' indexed and inclusive.
  # returns segments (modified in place)
  def add_sequences_to_segments(segments, aaseq)
    array_style = segments.first.is_a?(Array)
    segments.each do |seg|
      if array_style
        seg.push( aaseq[seg[1] - 1, (seg[2] - seg[1]) + 1] )
      else
        seg[:aaseq] = aaseq[seg[:start] - 1, (seg[:stop] - seg[:start]) + 1]
      end
    end
    segments
  end

end
117
+
118
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  # Returns a new parser instance of the class picked by
  # XMLStyleParser.choose_parser for this module and +meth+.
  #
  # +meth+ is remembered on the returned parser (not on this module) so that
  # #parse, which runs on the parser instance, knows which method to dispatch
  # to.
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    parser.instance_variable_set(:@method, meth)
    parser
  end

  # Calls the method chosen at construction time with +file+.
  def parse(file)
    send(@method, file)
  end
end
132
+
133
class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser

  # Example of one index entry (YAML rendering).  NOTE: to_index below does
  # not set best_structure_probability.
  #
  #   YAL010C:
  #     num_putative_transmembrane_segments: 1
  #     aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
  #     best_structure_probability: 1.0
  #     transmembrane_segments:
  #     - aaseq: SLGAEFWLGLVSLSPGCSTTL
  #       stop: 252
  #       start: 232
  #       probability: 1.0
  #     num_certain_transmembrane_segments: 1
  #     num_found: 2

  # Reads toppred XML from +io+ and adds one entry per <toppred> node to
  # +index+, keyed by the 'id' attribute of its <sequence> node.
  # Aborts the process (with a message) if the document does not have the
  # expected structure.
  # returns index
  def to_index(io, index = {})
    get_root_node_from_io(io) do |toppreds_n|

      if toppreds_n.name != 'toppreds'
        abort "toppred xml: expected root node 'toppreds' but found '#{toppreds_n.name}'"
      end
      toppreds_n.find('child::toppred').each do |toppred_n|
        att_hash = {}
        sequence_n = toppred_n.find_first('child::sequence')
        index[sequence_n['id']] = att_hash
        att_hash[:aaseq] = sequence_n.content.gsub(/[\s\n]/,'')
        # sanity check: the declared size must match the sequence we read
        if att_hash[:aaseq].size != sequence_n['size'].to_i
          abort "toppred xml: sequence '#{sequence_n['id']}' has #{att_hash[:aaseq].size} residues but declares size #{sequence_n['size']}"
        end
        tmsummary_n = sequence_n.find_first('following-sibling::tmsummary')

        num_found = tmsummary_n['segments'].to_i
        att_hash[:num_found] = num_found
        next unless num_found > 0

        # tally segments by type; anything not 'certain' counts as putative
        num_certain_transmembrane_segments = 0
        num_putative_transmembrane_segments = 0
        tmsummary_n.find('child::segment').each do |segment_n|
          if segment_n.name != 'segment'
            abort "toppred xml: expected node 'segment' but found '#{segment_n.name}'"
          end
          case segment_n['type']
          when 'certain'
            num_certain_transmembrane_segments += 1
          else # putative
            num_putative_transmembrane_segments += 1
          end
        end
        att_hash[:num_putative_transmembrane_segments] = num_putative_transmembrane_segments
        att_hash[:num_certain_transmembrane_segments] = num_certain_transmembrane_segments

        topologies_n = tmsummary_n.next
        if topologies_n.name != 'topologies'
          abort "toppred xml: expected node 'topologies' after tmsummary but found '#{topologies_n.name}'"
        end
        # get the top probability topology:
        top_prob_topology_n = topologies_n.find('child::topology').to_a.max {|a,b| a['prob'].to_f <=> b['prob'].to_f }
        tmsegments = []
        top_prob_topology_n.find('child::tmsegment').each do |tmsegment_n|
          tmhash = {}
          tmhash[:start] = tmsegment_n['start'].to_i
          tmhash[:stop] = tmsegment_n['stop'].to_i
          ## WARNING! it appears the probability is broken on xml output!!
          tmhash[:probability] = tmsegment_n['prob'].to_f
          tmsegments << tmhash
        end
        add_sequences_to_segments(tmsegments, att_hash[:aaseq])
        att_hash[:transmembrane_segments] = tmsegments
      end
    end
    index
  end

end
204
+
205
class TopPred::Parser_Text
  include TopPred::Parser

  # Parses toppred 'new text' output read from io.
  #
  # returns a hash structure in this form: {identifier => {aaseq => String,
  # num_found: Int, num_certain_transmembrane_segments => Int,
  # num_putative_transmembrane_segments => Int, best_structure_probability =>
  # Float, transmembrane_segments => [probability => Float, start => Int, stop
  # => Int, aaseq => String] } }
  def to_index(io, index={})
    record = nil

    io.each do |line|
      if line =~ /^Sequence : (.*?) +\(/
        # a new protein record starts here
        identifier = $1.dup
        record = {}
        index[identifier] = record
        record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, record)
      elsif line =~ /^HEADER\s+START\s+STOP/
        # first of the candidate structures for the current record
        best = top_structure( read_structures(io) )
        record[:best_structure_probability] = best[:probability]
        record[:transmembrane_segments] = best[:tm]
        add_sequences_to_segments(record[:transmembrane_segments], record[:aaseq])
        segment_arrays_to_hashes(record[:transmembrane_segments])
      end
    end
    index
  end

  private

  # returns a list of all structures given a filehandle starting just after
  # the first "HEADER START STOP ..." line.  Keeps reading structures for as
  # long as the line following a structure is another HEADER line.
  def read_structures(fh)
    structures = [ read_structure(fh) ]
    until fh.eof?
      break unless fh.readline =~ /^HEADER\s+START\s+STOP/
      structures << read_structure(fh)
    end
    structures
  end

  # returns a hash with key :probability (third whitespace separated field
  # of the next line) and key :tm containing an array of arrays:
  # [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    first_line_fields = fh.readline.split(/\s+/)
    probability = first_line_fields[2].to_f
    { :probability => probability, :tm => read_segments(fh) }
  end

  # returns an array of arrays of transmembrane segments: [prob(Float),
  # start(Int), stop(Int)], one per TRANSMEM line
  # returns after seeing '//'
  def read_segments(fh)
    segments = []
    fh.each do |line|
      if line =~ /^TRANSMEM/
        # fields: header, start, stop, len, prob
        fields = line.split(/\s+/)
        segments << [fields[4].to_f, fields[1].to_i, fields[2].to_i]
      elsif line =~ %r{//}
        break
      end
    end
    segments
  end

  # returns the top probability structure (first on tie)
  def top_structure(list)
    best = list.first
    best_prob = best[:probability]
    list.each do |candidate|
      next unless candidate[:probability] > best_prob
      best = candidate
      best_prob = candidate[:probability]
    end
    best
  end

  # concatenates lines until the first line containing no word character
  # or '*'
  def read_aaseq(fh)
    aaseq = ''
    fh.each do |line|
      line.chomp!
      break unless line =~ /[\w\*]/
      aaseq << line
    end
    aaseq
  end

  # converts (in place) each [prob, start, stop, aaseq] array into a hash
  def segment_arrays_to_hashes(list)
    list.map! do |(prob, start, stop, aaseq)|
      { :probability => prob, :start => start, :stop => stop, :aaseq => aaseq }
    end
  end

  # returns [certain, putative]
  # expects first line to be a tm segment; stops at the first blank line
  def num_certain_putative(fh)
    counts = { 'Certain' => 0, 'Putative' => 0 }
    fh.each do |line|
      certainty = line.chomp.split(/\s+/).last
      break unless certainty
      counts[certainty] += 1 if counts.key?(certainty)
    end
    [counts['Certain'], counts['Putative']]
  end

  # fills in :num_found and, when segments were found, the certain/putative
  # counts of rec
  def read_segment_summary(fh, rec)
    fh.each do |line|
      if line =~ /Found: (.*?) segments/
        found = $1.to_i
        rec[:num_found] = found
        break if found == 0
      elsif line =~ /Helix\s+Begin/
        certain, putative = num_certain_putative(fh)
        rec[:num_certain_transmembrane_segments] = certain
        rec[:num_putative_transmembrane_segments] = putative
        break
      end
    end
  end
end
346
+
347
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # Parses +io+ with XML::Parser and calls the block with the document root.
  # Returns the value of the block.
  def get_root_node_from_io(io, &block)
    # turn off warnings because this doesn't seem to work:
    # XML::Parser.default_load_external_dtd = false
    # (There is a warning about not finding DTD)
    xml_parser_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      doc = XML::Parser.io(io).parse
      block.call(doc.root)
    ensure
      # reset the warning level of XML::Parser even if parsing or the block
      # raised, since this is a global setting
      XML::Parser.default_warnings = xml_parser_warnings
    end
  end
end
361
+
362
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # Parses +io+ with ::AXML and calls the block with the result of the parse.
  def get_root_node_from_io(io, &block)
    block.call(::AXML.parse(io))
  end
end
368
+
data/lib/transmem.rb ADDED
@@ -0,0 +1,157 @@
1
+
2
# A transmemIndex is a hash that takes a fasta reference as key and returns
# a structured hash containing the transmembrane information.
module TransmemIndex

  # Scans +file+ line by line and stops at the first recognized marker.
  # returns :toppred or :phobius (nil if no marker is found)
  def self.filetype(file)
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/
          # NOTE(review): the spelling is presumably that of the phobius
          # header itself -- confirm before "correcting" it
          return :phobius
        when / 0 +0 i/
          # if they don't have the headers, this will pick it up if they have
          # a single prot without tm or signal peptide.  (any number of
          # spaces is accepted between the two zero columns)
          return :phobius
        when /Algorithm specific parameters/
          return :toppred # New text
        when /<parameters>/
          return :toppred # XML
        end
      end
    end
    nil
  end

  def reference_to_key(reference)
    # needs to be subclassed or written
  end

  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  # raises ArgumentError if the file type is not recognized
  def self.new(file, fasta=nil)
    case x = filetype(file)
    when :toppred
      require 'transmem/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmem/phobius'
      # warn "WARNING: You have NO fasta object with Phobius based TransmemIndex! (which needs one to do proper indexing!)" unless fasta
      Phobius::Index.new(file, fasta)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when an entry has no :num_certain_transmembrane_segments)
  def num_certain_index
    hash = {}
    self.each do |k,v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # tp = :number or :fraction which is the fraction of the sequence size
  # returns the average number of overlapping amino acids with transmembrane
  # segments (averaged over every occurrence of sequence in the protein)
  # returns 0.0 if there are no occurrences or no transmembrane segments
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless self.key?(key)
    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.size == 0
    avg_num = numbers.inject(0) {|memo,num| num + memo }.to_f / numbers.size
    if tp == :fraction
      avg_num / sequence.size
    else
      avg_num
    end
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments, one entry per
  # occurrence of sequence in tmhash[:aaseq].
  # uses tmhash[:transmembrane_segments] (hashes with 1-indexed :start and
  # :stop); if that key is missing, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    if tmhash.key? :transmembrane_segments
      ranges = tmhash[:transmembrane_segments].map do |tmseg|
        Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
      end
      num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
    else
      []
    end
  end

  # For every (non-overlapping, left to right) occurrence of substring in
  # full_sequence, returns the number of its characters that fall inside any
  # of the given ranges.
  # ranges should be 0 indexed!!!
  # the span includes the 'stop' position i.e., full_sequence[start..stop]
  # returns [] if there are no ranges or if substring is empty.
  def num_overlapping_chars(full_sequence, ranges, substring)
    slen = substring.size
    # an empty substring matches everywhere without advancing, so bail out
    return [] if ranges.size == 0 || slen == 0

    substring_ranges = []
    pos = 0
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i+slen-1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      overlap = 0
      ranges.each do |tm|
        # size of the intersection of two inclusive ranges; this also covers
        # a substring that completely spans the transmembrane segment
        shared = [sb.last, tm.last].min - [sb.first, tm.first].max + 1
        overlap += shared if shared > 0
      end
      overlap
    end
  end

end
151
+
152
+
153
+ #substring_ranges = full_sequence.enum_for(:scan, substring).map do
154
+ # (ofirst, olast) = $~.offset(0)
155
+ # Range.new(ofirst, olast - 1)
156
+ # end
157
+
@@ -0,0 +1,135 @@
1
+ require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
2
+ require 'validator/digestion_based'
3
+ require 'fasta'
4
+ require 'spec_id/aa_freqs'
5
+
6
# Constraints on aaseq attribute of peptides (the bare amino acid sequence)
# works by calculating amino acid frequencies in the fasta file used.
class Validator::AA < Validator::DigestionBased
  include Precision::Calculator

  # the amino acid being tested for, stored as a String (e.g. 'C')
  attr_accessor :constraint

  # it is a false hit if the amino acid is located in the peptide
  attr_accessor :false_if_found

  # if given, the frequency of the amino acid is used to estimate the false to
  # total ratio based on the pephits given for pephit_precision.
  # see #set_frequency to derive a frequency from a fasta object
  attr_accessor :frequency

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
    :false_if_found => true,
  } )

  # Splits peps on whether their aaseq contains the constraint.
  # returns tp, fp
  def partition(peps)
    (found, not_found) = peps.partition do |pep|
      pep.aaseq.include?(@constraint)
    end
    if @false_if_found
      [not_found, found]
    else
      [found, not_found]
    end
  end

  # takes a fasta object and sets the frequency based on constraint.
  # constraint is one acceptable to initialize!
  # returns self
  def set_frequency(fasta_obj)
    table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
    @frequency = table[@constraint.to_sym]
    self
  end

  # right now only accepts single amino acids as constraints (as a string,
  # e.g. 'C', or symbol, e.g. :C)
  # options:
  # :frequency OR :false_to_total_ratio should be used (NOT both)
  # :frequency => Float, if the frequency of the amino acid is known (see
  # #set_frequency)
  # :false_to_total_ratio => if a true digestion was already performed (see
  # Validator::AA.calc_false_to_total_ratio -- not defined in this class,
  # presumably inherited; TODO confirm)
  # :false_if_found => it is a false positive if the amino acid is found.
  # :background => the background level of amino acid Float
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    opts = DEFAULTS.merge(options)
    (@frequency, @false_to_total_ratio, @false_if_found, @background) = opts.values_at(:frequency, :false_to_total_ratio, :false_if_found, :background)
  end

  # Uses @frequency when set, otherwise defers to the superclass (which needs
  # @false_to_total_ratio).
  # if expected is 0 then will return precision = 1.0
  # raises ArgumentError if neither @frequency nor @false_to_total_ratio is set
  def pephit_precision(peps)
    if @frequency
      (actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
      if expected == 0.0
        1.0
      else
        pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
      end
    elsif @false_to_total_ratio
      super(peps)
    else
      raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
    end
  end

  # returns (Actual(Int), Expected(Float)) based on how many peptides have at
  # least one amino_acid, the frequency it is observed in background (then we
  # can look at the size of each peptide and determine the likelihood of
  # having the peptide with at least one amino acid).
  # amino_acid should be a string (e.g., 'C')
  def at_least_one(amino_acid, freq, amino_acid_seqs)
    one_minus_freq = 1.0 - freq
    actual = 0
    expected = 0.0
    amino_acid_seqs.each do |aaseq|
      # probability that a peptide of this length has at least one amino_acid
      expected += (1.0 - (one_minus_freq**aaseq.size))
      actual += 1 if aaseq.include?(amino_acid)
    end
    [actual, expected]
  end


  # given: (actual # with 'AA', expected # with 'AA', total#peptides,
  # background)
  #
  # PepHit('AA') = Peptide containing at least one 'AA'
  # # expected PepHit('AA')                 # observed Bad Pep ('AA')
  # -----------------------  proportional_to  -------------------------
  # # total PepHits                         # Total Bad PepHit
  #
  # returns the precision
  # the background correction factor will not reduce the actual count of
  # peptides to < 0. One can still get negative precision scores, however,
  # depending on the other variables.
  # background is the number of peptides with the amino acid in the purest
  # sample over the total number of peps.  A nil background is treated as
  # 0.0 (as to_param_string does).
  # side effect: stores actual / total_peps in @calculated_background
  #---
  # this is thoroughly explained in my 2007_09 presentations (inkscape)
  #+++
  def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
    background ||= 0.0
    actual = actual.to_f
    @calculated_background = actual / total_peps
    actual -= (total_peps * background)
    # We were doing it compared to the number expected.. but this is more
    # clear
    # actual/false_hits = expected/total_peps_passing
    # false_hits = (total_peps_passing * actual) / expected
    actual = 0.0 if actual < 0.0
    total_number_false = (actual * total_peps).to_f / expected
    #fppr = total_number_false / total_peps
    (total_peps - total_number_false) / total_peps
  end

  # one line summary of this validator's parameters
  def to_param_string
    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
  end
end
135
+