mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id_xml.rb DELETED
@@ -1,99 +0,0 @@
1
-
2
# I would prefer to call this SpecID::XML, but doing so produces the warning
# "toplevel constant XML referenced by SpecID::XML" (seen at
# lib/spec_id/bioworks.rb:412). A flat module name works around that for now.
#
# Mixin with helpers for hand-building indented XML strings.
# Any major xml element returns a newline at the end for simple concatenation
# into a file. Indentation depth is kept in the global $DEPTH, which is shared
# by every object that mixes this module in.
module SpecIDXML

  # Characters that must be escaped in XML text, mapped to their entities.
  MSial_chrs_hash = {
    '"' => '&quot;',
    '&' => '&amp;',
    "'" => '&apos;',
    '<' => '&lt;',
    '>' => '&gt;',
  }.freeze

  # Substitutes special xml chars (the keys of MSial_chrs_hash) with their
  # entities and returns a new string; all other characters pass through.
  def escape_special_chars(string)
    string.gsub(/["&'<>]/) { |char| MSial_chrs_hash[char] }
  end

  # Current nesting depth; incremented while the body of an element is built.
  $DEPTH = 0

  # Returns one tab character per level of the current $DEPTH.
  def tabs
    "\t" * $DEPTH
  end

  # Returns a <parameter/> element (no trailing newline) whose name is
  # +symbol+ and whose value is the result of sending +symbol+ to +obj+.
  def param_xml(obj, symbol)
    "#{tabs}<parameter name=\"#{symbol}\" value=\"#{obj.send(symbol)}\"/>"
  end

  # Returns one param_xml line per symbol, each terminated by a newline.
  def params_xml(obj, *symbol_list)
    symbol_list.map { |sy| param_xml(obj, sy) }.join("\n") + "\n"
  end

  # Returns a self-closing element whose attributes are read from self by
  # calling each method named in +att_list+.
  def short_element_xml(element, att_list)
    "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
  end

  # Returns a self-closing element using a preformatted attribute string.
  def short_element_xml_and_att_string(element, att_string)
    "#{tabs}<#{element} #{att_string}/>\n"
  end

  # Returns a self-closing element with one attribute per instance variable.
  # displays all *instance_variables* (does not call methods!)
  def short_element_xml_from_instance_vars(element_name)
    string = instance_variables.map do |v|
      # to_s so this works whether instance_variables yields strings or symbols
      "#{v.to_s[1..-1]}=\"#{instance_variable_get(v)}\""
    end.join(' ')
    "#{tabs}<#{element_name} #{string}/>\n"
  end

  # Takes an element name and returns <element>...</element> with no
  # attributes. The optional block is called one level deeper and must return
  # the string to place between the tags.
  def element_xml_no_atts(element, &block)
    nested_element_xml(element, element, &block)
  end

  # Takes an element name and a list of attribute symbols (read from self) and
  # returns the open/close pair wrapped around the string the block returns.
  def element_xml(element, att_list, &block)
    nested_element_xml("#{element} #{attrs_xml(att_list)}", element, &block)
  end

  # element as symbol and att_string as preformatted attributes;
  # takes a block returning the inner content.
  def element_xml_and_att_string(element, att_string, &block)
    nested_element_xml("#{element} #{att_string}", element, &block)
  end

  # Returns symbol="value" where value is the result of calling +symbol+ on self.
  def attr_xml(symbol)
    "#{symbol}=\"#{send(symbol)}\""
  end

  # Returns the attr_xml of every symbol, separated by single spaces.
  def attrs_xml(list_of_symbols)
    list_of_symbols.map { |sy| attr_xml(sy) }.join(" ")
  end

  private

  # Shared body of the element_xml* methods: writes the opening tag at the
  # current depth, evaluates the block one level deeper, then writes the
  # closing tag. $DEPTH is restored even if the block raises, so a failure
  # inside a nested element cannot leave later output mis-indented.
  def nested_element_xml(open_tag, element)
    start = "#{tabs}<#{open_tag}>\n"
    $DEPTH += 1
    begin
      middle = block_given? ? yield : ''
    ensure
      $DEPTH -= 1
    end
    start + middle + "#{tabs}</#{element}>\n"
  end

end
99
-
data/lib/transmem/phobius.rb DELETED
@@ -1,147 +0,0 @@
1
- require 'transmem'
2
-
3
# Forward declaration: lets Phobius::Index be defined before the main
# Phobius class body.
class Phobius
end
4
-
5
# This class will probably change its interface some in the future.
# The web portal is:
#   http://phobius.cgb.ki.se/
# How to run:
#   Select output format as 'Short'
#   then hit 'Submit Query'
#
# note: to implement some of the TransmemIndex features, the update_aaseq
# method must be called!
class Phobius::Index < Hash
  include TransmemIndex

  # Fills self from +file+ via Phobius.default_index.
  # Will also update_aaseq if given a fasta_obj.
  def initialize(file, fasta_obj = nil)
    Phobius.default_index(file, self)
    update_aaseq(fasta_obj) if fasta_obj
  end

  # we need to match whatever function toppred uses to generate identifiers if
  # we want derivative processes to be fast and accurate
  #
  # Returns nil for a nil/false reference and '' for an empty one; otherwise
  # the text before the first space with every double quote removed.
  def reference_to_key(reference)
    return nil unless reference
    return '' unless reference.size > 0

    space_at = reference.index(' ')
    first_token = space_at ? reference[0...space_at] : reference
    first_token.gsub('"', '')
  end

  # adds an :aaseq key to each hash (necessary for avg_overlap method)
  # these are shallow references to the aaseq in the fasta obj
  # Every protein's key must already be present in self.
  def update_aaseq(fasta)
    fasta.each do |prot|
      record = self[reference_to_key(prot.reference)]
      record[:aaseq] = prot.aaseq
    end
  end

end
55
-
56
class Phobius
  include TransmemIndex

  # Returns the default index: +file+ parsed with the :short parser into
  # +index+ (a fresh Hash unless one is supplied).
  def self.default_index(file, index={})
    Phobius::Parser.new(:short).file_to_index(file, index)
  end

end
66
-
67
# Factory and shared behaviour for Phobius output parsers.
module Phobius::Parser

  # Returns a new parser object for +parser_type+. Only :short is recognized;
  # anything else raises ArgumentError.
  def self.new(parser_type=:short)
    unless parser_type == :short
      raise ArgumentError, "don't recognize parser type: #{parser_type}"
    end
    Phobius::ParserShort.new
  end

  # Opens +file+ and hands the filehandle to the including class's to_index,
  # returning whatever to_index returns.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end

end
85
-
86
-
87
# Parser for the Phobius 'Short' output format.
class Phobius::ParserShort
  include Phobius::Parser

  # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
  # and returns an array of hashes with the keys :start and :stop
  def prediction_to_array(string)
    string.scan(/[io](\d+)-(\d+)/).map do |first, last|
      { :start => first.to_i, :stop => last.to_i }
    end
  end

  # returns a hash structure in this form: { identifier => {
  #   :num_certain_transmembrane_segments => Int,
  #   :signal_peptide => true/false/nil,
  #   :transmembrane_segments => [:start => Int, :stop => Int] } }
  # (:transmembrane_segments is only set when the segment count is > 0)
  # can parse io even if there is no header to key in on.
  def to_index(io, index={})
    rewind_to = io.pos
    # Look for the header within the first 11 lines. The pattern is spelled
    # SEQENCE exactly as in the original code.
    # NOTE(review): presumably this matches Phobius' own header spelling --
    # confirm before "correcting" it.
    header_seen = (1..11).any? { io.gets =~ /SEQENCE/ }
    # No header: start over so the lines consumed above are parsed as data.
    io.pos = rewind_to unless header_seen

    signal_flags = { 'Y' => true, '0' => false }
    io.each do |line|
      fields = line.chomp.split(/\s+/)
      # a data row has exactly: key, TM count, signal-peptide flag, prediction
      next unless fields.size == 4
      key, tm_count, sp_flag, prediction = fields
      tm_count = tm_count.to_i
      record = {
        :num_certain_transmembrane_segments => tm_count,
        # nil when the flag is neither 'Y' nor '0'
        :signal_peptide => signal_flags[sp_flag],
      }
      index[key] = record
      if tm_count > 0
        record[:transmembrane_segments] = prediction_to_array(prediction)
      end
    end
    index
  end

end
data/lib/transmem/toppred.rb DELETED
@@ -1,368 +0,0 @@
1
- require 'transmem'
2
- require 'xml_style_parser'
3
-
4
# Forward declaration: lets TopPred::Index be defined before the main
# TopPred class body.
class TopPred
end
5
-
6
-
7
class TopPred::Index < Hash
  include TransmemIndex

  # we need to match whatever function toppred uses to generate identifiers if
  # we want derivative processes to be fast and accurate
  #
  # Returns nil for a nil/false reference; otherwise the text before the first
  # space with every character outside 0-9, a-z, A-Z replaced by '_'.
  def reference_to_key(reference)
    return nil unless reference

    space_at = reference.index(' ')
    head = space_at ? reference[0...space_at] : reference
    head ? head.gsub(/[^0-9a-zA-Z]/, '_') : nil
  end

  # Fills self from +file+ via TopPred.default_index. Only kind :default is
  # supported; any other kind aborts the process.
  def initialize(file, kind=:default)
    abort "can't do #{kind}" unless kind == :default
    TopPred.default_index(file, self)
  end

  # This class will probably change its interface some in the future.
  # The web portal is:
  #   http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
  # How to run:
  #   uncheck 'Produce hydrophobicity graph image (-g)'
  #   choose 'Xml' or 'New: new text' output
  #   type in your email, then hit 'Run toppred'
end
48
-
49
class TopPred
  include TransmemIndex

  # Returns the default index: detects the file type, builds the matching
  # parser and parses +file+ into +index+ (a fresh Hash unless supplied).
  def self.default_index(file, index={})
    kind = TopPred::Parser.filetype(file)
    TopPred::Parser.new(kind).file_to_index(file, index)
  end

end
58
-
59
# Factory and shared behaviour for TopPred output parsers.
module TopPred::Parser
  # Inspects the first line of +file+.
  # returns :xml or :text (nil when the first line matches neither)
  def self.filetype(file)
    first_line = File.open(file) { |fh| fh.gets }
    case first_line
    when /<\?xml version.*>/ then :xml
    when /Algorithm specific/ then :text
    end
  end

  # type = :xml or :text; any other type aborts the process.
  def self.new(parser_type=:xml)
    klass =
      if parser_type == :xml
        TopPred::Parser_XML
      elsif parser_type == :text
        TopPred::Parser_Text
      else
        abort "don't recognize parser type: #{parser_type}"
      end
    klass.new
  end

  # Opens +file+ and hands the filehandle to the including class's to_index.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end

  # Adds each segment's amino-acid subsequence to the segment, in place.
  # Each segment is either an array [prob, first, last] (the sequence is
  # pushed on, giving [prob, first, last, aaseq]) or a hash with :start and
  # :stop (the sequence is stored under :aaseq). first/last are '1' indexed
  # and inclusive. The style of the first segment decides how all are
  # treated. Returns +segments+.
  def add_sequences_to_segments(segments, aaseq)
    array_style = segments.first.is_a?(Array)
    segments.each do |seg|
      if array_style
        seg.push(aaseq[seg[1] - 1, (seg[2] - seg[1]) + 1])
      else
        seg[:aaseq] = aaseq[seg[:start] - 1, (seg[:stop] - seg[:start]) + 1]
      end
    end
    segments
  end

end
117
-
118
# XML flavour of the TopPred parser. The concrete parser class is picked by
# XMLStyleParser.choose_parser.
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  # Builds a parser for +meth+ (the method that #parse will dispatch to).
  #
  # The method name is stored on the returned parser object so that #parse
  # can read it. Previously it was only stored on this module, leaving the
  # parser's own @method nil so that #parse could never dispatch.
  # NOTE(review): assumes choose_parser returns a class whose instances mix
  # in this module -- confirm against XMLStyleParser.
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    parser.instance_variable_set(:@method, meth)
    # module-level copy retained in case anything inspects it
    @method = meth
    parser
  end

  # Calls the method chosen at construction time with +file+.
  def parse(file)
    send(@method, file)
  end
end
132
-
133
# DOM-style TopPred XML parser. Subclasses supply get_root_node_from_io.
class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser

  # Original author's example (apparently one index entry shown as YAML):
  #
  #   YAL010C:
  #     num_putative_transmembrane_segments: 1
  #     aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
  #     best_structure_probability: 1.0
  #     transmembrane_segments:
  #     - aaseq: SLGAEFWLGLVSLSPGCSTTL
  #       stop: 252
  #       start: 232
  #       probability: 1.0
  #     num_certain_transmembrane_segments: 1
  #     num_found: 2

  # Parses the <toppreds> document read from +io+ into +index+ (keyed by the
  # sequence id) and returns the index. Aborts the process when the document
  # does not have the expected structure.
  def to_index(io, index = {})
    get_root_node_from_io(io) do |root|

      abort if root.name != 'toppreds'
      root.find('child::toppred').each do |toppred|
        entry = {}
        sequence = toppred.find_first('child::sequence')
        index[sequence['id']] = entry
        # strip all whitespace, then check against the declared length
        entry[:aaseq] = sequence.content.gsub(/[\s\n]/,'')
        abort if entry[:aaseq].size != sequence['size'].to_i
        summary = sequence.find_first('following-sibling::tmsummary')

        entry[:num_found] = summary['segments'].to_i
        # nothing more is recorded for sequences without segments
        next unless entry[:num_found] > 0

        certain = 0
        putative = 0
        summary.find('child::segment').each do |segment|
          abort if segment.name != 'segment'
          if 'certain' == segment['type']
            certain += 1
          else # putative
            putative += 1
          end
        end
        entry[:num_putative_transmembrane_segments] = putative
        entry[:num_certain_transmembrane_segments] = certain

        topologies = summary.next
        abort if topologies.name != 'topologies'
        # get the top probability topology:
        best_topology = topologies.find('child::topology').to_a.max do |a, b|
          a['prob'].to_f <=> b['prob'].to_f
        end
        tmsegments = []
        best_topology.find('child::tmsegment').each do |tmsegment|
          tmsegments << {
            :start => tmsegment['start'].to_i,
            :stop => tmsegment['stop'].to_i,
            ## WARNING! it appears the probability is broken on xml output!!
            :probability => tmsegment['prob'].to_f,
          }
        end
        add_sequences_to_segments(tmsegments, entry[:aaseq])
        entry[:transmembrane_segments] = tmsegments
      end
    end
    index
  end

end
204
-
205
# Parser for TopPred's text output.
class TopPred::Parser_Text
  include TopPred::Parser

  # returns a hash structure in this form: {identifier => {aaseq => String,
  # num_found: Int, num_certain_transmembrane_segments => Int,
  # num_putative_transmembrane_segments => Int, best_structure_probability =>
  # Float, transmembrane_segments => [probability => Float, start => Int, stop
  # => Int, aaseq => String] } }
  def to_index(io, index={})
    record = nil

    io.each do |line|
      case line
      when /^Sequence : (.*?) +\(/
        # a new record starts: sequence first, then the segment summary
        identifier = Regexp.last_match(1).dup
        record = index[identifier] = {}
        record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, record)
      when /^HEADER\s+START\s+STOP/
        # structures belong to the most recently seen sequence
        best = top_structure(read_structures(io))
        record[:best_structure_probability] = best[:probability]
        segments = record[:transmembrane_segments] = best[:tm]
        add_sequences_to_segments(segments, record[:aaseq])
        segment_arrays_to_hashes(segments)
      end
    end
    index
  end

  private

  # returns a list of all structures given a filehandle starting just after
  # the first "HEADER START STOP ..." line
  def read_structures(fh)
    structures = []
    loop do
      structures << read_structure(fh)
      # stop at end of input, or when the next line is not another header
      break if fh.eof? || fh.readline !~ /^HEADER\s+START\s+STOP/
    end
    structures
  end

  # returns a hash with key :probability and key :tm contains an array of
  # arrays: [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    # the probability is the third whitespace-separated field of the first line
    first_line = fh.readline
    {
      :probability => first_line.split(/\s+/)[2].to_f,
      :tm => read_segments(fh),
    }
  end

  # returns an array of arrays of transmembrane segments: [prob(Float),
  # start(Int), stop(Int)]
  # returns after seeing '//'
  def read_segments(fh)
    terminator = /#{Regexp.escape('//')}/
    segments = []
    fh.each do |line|
      if line =~ /^TRANSMEM/
        # fields: header, start, stop, len, prob
        fields = line.split(/\s+/)
        segments << [fields[4].to_f, fields[1].to_i, fields[2].to_i]
      elsif line =~ terminator
        break
      end
    end
    segments
  end

  # returns the top probability structure (first on tie)
  def top_structure(list)
    best = list.first
    best_prob = best[:probability]
    list.each do |candidate|
      next unless candidate[:probability] > best_prob
      best = candidate
      best_prob = candidate[:probability]
    end
    best
  end

  # Concatenates (chomped) lines until the first line that contains neither
  # a word character nor '*'.
  def read_aaseq(fh)
    aaseq = ''
    fh.each do |line|
      line.chomp!
      break if line !~ /[\w\*]/
      aaseq << line
    end
    aaseq
  end

  # Replaces, in place, each [prob, start, stop, aaseq] array with a hash
  # keyed by :probability, :start, :stop and :aaseq.
  def segment_arrays_to_hashes(list)
    list.map! do |probability, start, stop, aaseq|
      { :probability => probability,
        :start => start,
        :stop => stop,
        :aaseq => aaseq,
      }
    end
  end

  # returns [certain, putative]
  # expects first line to be a tm segment; stops at the first line with no
  # fields (e.g. a blank line)
  def num_certain_putative(fh)
    certain = 0
    putative = 0
    fh.each do |line|
      certainty = line.chomp.split(/\s+/).last
      break unless certainty
      case certainty
      when 'Certain' then certain += 1
      when 'Putative' then putative += 1
      end
    end
    [certain, putative]
  end

  # Reads the "Found: N segments" line into rec[:num_found]; when N is not
  # zero, continues to the "Helix Begin" table and stores the certain and
  # putative counts in +rec+.
  def read_segment_summary(fh, rec)
    fh.each do |line|
      case line
      when /Found: (.*?) segments/
        rec[:num_found] = Regexp.last_match(1).to_i
        break if rec[:num_found] == 0
      when /Helix\s+Begin/
        certain, putative = num_certain_putative(fh)
        rec[:num_certain_transmembrane_segments] = certain
        rec[:num_putative_transmembrane_segments] = putative
        break
      end
    end
  end
end
346
-
347
# DOM parser backed by XML::Parser.
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # Parses +io+ and calls +block+ with the document's root node, returning the
  # block's value.
  #
  # Parser warnings are switched off while parsing because this doesn't seem
  # to work:
  #   XML::Parser.default_load_external_dtd = false
  # (There is a warning about not finding DTD)
  # The previous warning setting is restored in an ensure clause, so a parse
  # error or an exception raised by the block cannot leave warnings disabled
  # for the rest of the process.
  def get_root_node_from_io(io, &block)
    saved_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      block.call(XML::Parser.io(io).parse.root)
    ensure
      # reset the warning level of XML::Parser:
      XML::Parser.default_warnings = saved_warnings
    end
  end
end
361
-
362
# DOM parser backed by AXML.
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # Parses +io+ with ::AXML and calls +block+ with the result, returning the
  # block's value.
  def get_root_node_from_io(io, &block)
    block.call(::AXML.parse(io))
  end
end
368
-