mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id_xml.rb DELETED
@@ -1,99 +0,0 @@
1
-
2
- # I would prefer to call this SpecID::XML, but I keep getting an error:
3
- # /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
4
- # constant XML referenced by SpecID::XML' This works around that for now.
5
- # Any major xml elements should return a newline at the end for simple
6
- # concatenation into a file
7
- module SpecIDXML
8
-
9
- MSial_chrs_hash = {
10
- '"' => '&quot;',
11
- '&' => '&amp;',
12
- "'" => '&apos;',
13
- '<' => '&lt;',
14
- '>' => '&gt;',
15
- }
16
-
17
- # substitutes special xml chars
18
- def escape_special_chars(string)
19
- string.split('').map do |char|
20
- if MSial_chrs_hash.key? char ; MSial_chrs_hash[char]
21
- # if x = MSial_chrs_hash[char] ; x # <-- that's slightly slower
22
- else ; char end
23
- end.join
24
- end
25
-
26
- $DEPTH = 0
27
-
28
- def tabs
29
- # this is ugly
30
- string = ""
31
- $DEPTH.times { string << "\t" }
32
- string
33
- end
34
-
35
-
36
- def param_xml(obj, symbol)
37
- tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{obj.send(symbol)}" + '"/>'
38
- end
39
-
40
- def params_xml(obj, *symbol_list)
41
- symbol_list.collect { |sy|
42
- param_xml(obj, sy)
43
- }.join("\n") + "\n"
44
- end
45
-
46
- def short_element_xml(element, att_list)
47
- "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
48
- end
49
-
50
- def short_element_xml_and_att_string(element, att_string)
51
- "#{tabs}<#{element} #{att_string}/>\n"
52
- end
53
-
54
- # requires that obj have attribute '@xml_element_name'
55
- # displays all *instance_variables* (does not call methods!)
56
- def short_element_xml_from_instance_vars(element_name)
57
- string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
58
- "#{tabs}<#{element_name} #{string}/>\n"
59
- end
60
-
61
- # takes an element as a symbol and returns the
62
- def element_xml_no_atts(element)
63
- start = "#{tabs}<#{element}>\n"
64
- $DEPTH += 1
65
- if block_given? ; middle = yield else ; middle = '' end
66
- $DEPTH -= 1
67
- start + middle + "#{tabs}</#{element}>\n"
68
- end
69
-
70
- # takes an element as a symbol and returns the
71
- def element_xml(element, att_list)
72
-
73
- start = "#{tabs}<#{element} #{attrs_xml(att_list)}>\n"
74
- $DEPTH += 1
75
- if block_given? ; middle = yield else ; middle = '' end
76
- $DEPTH -= 1
77
- start + middle + "#{tabs}</#{element}>\n"
78
- end
79
-
80
- # element as symbol and att_string as attributes
81
- # takes a block of whatever
82
- def element_xml_and_att_string(element, att_string)
83
- start = "#{tabs}<#{element} #{att_string}>\n"
84
- $DEPTH += 1
85
- if block_given? ; middle = yield else ; middle = '' end
86
- $DEPTH -= 1
87
- start + middle + "#{tabs}</#{element}>\n"
88
- end
89
-
90
- def attr_xml(symbol)
91
- "#{symbol}=\"#{send(symbol)}\""
92
- end
93
-
94
- def attrs_xml(list_of_symbols)
95
- list_of_symbols.collect {|sy| attr_xml(sy) }.join(" ")
96
- end
97
-
98
- end
99
-
data/lib/transmem/phobius.rb DELETED
@@ -1,147 +0,0 @@
1
- require 'transmem'
2
-
3
- class Phobius ; end
4
-
5
- # This class will probably change its interface some in the future
6
- # That's the web portal
7
- # http://phobius.cgb.ki.se/
8
- # How to run:
9
- # Select output format as 'Short'
10
- # then hit 'Submit Query'
11
-
12
- # note: to implement some of the TransmemIndex features, the update_aaseq
13
- # method must be called!
14
- class Phobius::Index < Hash
15
- include TransmemIndex
16
-
17
- # will update_aaseq if given a fasta_obj
18
- def initialize(file, fasta_obj = nil )
19
- Phobius.default_index(file, self)
20
- if fasta_obj
21
- update_aaseq(fasta_obj)
22
- end
23
- end
24
-
25
- # we need to match whatever function toppred uses to generate identifiers if
26
- # we want derivative processes to be fast and accurate
27
- def reference_to_key(reference)
28
- if reference
29
- if reference.size > 0
30
- index = reference.index(' ')
31
- string =
32
- if index
33
- reference[0...index]
34
- else
35
- reference
36
- end
37
- string.gsub('"','')
38
- else
39
- ''
40
- end
41
- else
42
- nil
43
- end
44
- end
45
-
46
- # adds an :aaseq key to each hash (necessary for avg_overlap method)
47
- # these are shallow references to the aaseq in the fasta obj
48
- def update_aaseq(fasta)
49
- fasta.each do |prot|
50
- self[reference_to_key(prot.reference)][:aaseq] = prot.aaseq
51
- end
52
- end
53
-
54
- end
55
-
56
- class Phobius
57
- include TransmemIndex
58
-
59
- # returns the default index
60
- def self.default_index(file, index={})
61
- parser = Phobius::Parser.new(:short)
62
- parser.file_to_index(file, index)
63
- end
64
-
65
- end
66
-
67
- module Phobius::Parser
68
-
69
- def self.new(parser_type=:short)
70
- klass =
71
- case parser_type
72
- when :short
73
- Phobius::ParserShort
74
- else
75
- raise ArgumentError, "don't recognize parser type: #{parser_type}"
76
- end
77
- klass.new
78
- end
79
-
80
- def file_to_index(file, index={})
81
- File.open(file) {|fh| to_index(fh, index) }
82
- end
83
-
84
- end
85
-
86
-
87
- class Phobius::ParserShort
88
- include Phobius::Parser
89
-
90
- # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
91
- # and returns an array of hashes with the keys :start and :stop
92
- def prediction_to_array(string)
93
- segments = []
94
- string.scan(/[io](\d+)-(\d+)/) do |m1, m2|
95
- segments << { :start => m1.to_i, :stop => m2.to_i }
96
- end
97
- segments
98
- end
99
-
100
- # returns a hash structure in this form: { identifier => {
101
- # :num_certain_transmembrane_segments => Int,
102
- # :transmembrane_segments => [:start => Int, :stop
103
- # => Int] }
104
- # can parse io even if there is no header to key in on.
105
- def to_index(io, index={})
106
- init_pos = io.pos
107
- cnt = 0
108
- found_header = false
109
- loop do
110
- if io.gets =~ /SEQENCE/
111
- found_header = true
112
- break
113
- end
114
- cnt += 1
115
- break if cnt > 10
116
- end
117
- if !found_header
118
- io.pos = init_pos
119
- end
120
- current_record = nil
121
- io.each do |line|
122
- line.chomp!
123
- # grab values
124
- ar = line.split(/\s+/)
125
- next if ar.size != 4
126
- (key, num_tms, signal_peptide, prediction) = ar
127
- # cast the values
128
- num_tms = num_tms.to_i
129
- signal_peptide =
130
- case signal_peptide
131
- when 'Y'
132
- true
133
- when '0'
134
- false
135
- end
136
- index[key] = {
137
- :num_certain_transmembrane_segments => num_tms,
138
- :signal_peptide => signal_peptide,
139
- }
140
- if num_tms > 0
141
- index[key][:transmembrane_segments] = prediction_to_array(prediction)
142
- end
143
- end
144
- index
145
- end
146
-
147
- end
data/lib/transmem/toppred.rb DELETED
@@ -1,368 +0,0 @@
1
- require 'transmem'
2
- require 'xml_style_parser'
3
-
4
- class TopPred ; end
5
-
6
-
7
- class TopPred::Index < Hash
8
- include TransmemIndex
9
-
10
- # we need to match whatever function toppred uses to generate identifiers if
11
- # we want derivative processes to be fast and accurate
12
- def reference_to_key(reference)
13
- if reference
14
- ri = reference.index(' ')
15
- frst =
16
- if ri
17
- reference[0...reference.index(' ')]
18
- else
19
- reference
20
- end
21
- if frst
22
- frst.gsub(/[^0-9a-zA-Z]/,'_')
23
- else
24
- nil
25
- end
26
- else
27
- nil
28
- end
29
- end
30
-
31
- def initialize(file, kind=:default)
32
- case kind
33
- when :default
34
- TopPred.default_index(file, self)
35
- else
36
- abort "can't do #{kind}"
37
- end
38
- end
39
-
40
- # This class will probably change its interface some in the future
41
- # That's the web portal
42
- # http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
43
- # How to run:
44
- # uncheck 'Produce hydrophobicity graph image (-g)'
45
- # choose 'Xml' or 'New: new text' output
46
- # type in your email, then hit 'Run toppred'
47
- end
48
-
49
- class TopPred
50
- include TransmemIndex
51
-
52
- # returns the default index
53
- def self.default_index(file, index={})
54
- TopPred::Parser.new(TopPred::Parser.filetype(file)).file_to_index(file, index)
55
- end
56
-
57
- end
58
-
59
- module TopPred::Parser
60
- # returns :xml or :text
61
- def self.filetype(file)
62
- File.open(file) do |fh|
63
- case fh.gets
64
- when /<\?xml version.*>/
65
- :xml
66
- when /Algorithm specific/
67
- :text
68
- else
69
- nil
70
- end
71
- end
72
- end
73
-
74
- # type = :xml or :text
75
- def self.new(parser_type=:xml)
76
- klass =
77
- case parser_type
78
- when :xml
79
- TopPred::Parser_XML
80
- when :text
81
- TopPred::Parser_Text
82
- else
83
- abort "don't recognize parser type: #{parser_type}"
84
- end
85
- klass.new
86
- end
87
-
88
- def file_to_index(file, index={})
89
- File.open(file) {|fh| to_index(fh, index) }
90
- end
91
-
92
- # where each segment = [prob, first, last] and aaseq is a string each
93
- # segment may also be a hash => first, last, probability (adding key
94
- # 'aaseq')
95
- # first/last '1' indexed returns segments where each is [prob,
96
- # first, last, aaseq] or hash (above)
97
- def add_sequences_to_segments(segments, aaseq)
98
- if segments.first.is_a? Array
99
- segments.each do |seg|
100
- first_index = seg[1] - 1
101
- length = (seg[2] - seg[1]) + 1
102
- seg.push( aaseq[first_index, length] )
103
- end
104
- else
105
- segments.each do |seg|
106
- first_index = seg[:start] - 1
107
- length = (seg[:stop] - seg[:start]) + 1
108
- seg[:aaseq] = ( aaseq[first_index, length] )
109
- end
110
- end
111
- segments
112
- end
113
-
114
-
115
-
116
- end
117
-
118
- module TopPred::Parser_XML
119
- include TopPred::Parser
120
- include XMLStyleParser
121
-
122
- def self.new(meth=:to_index)
123
- parser = XMLStyleParser.choose_parser(self, meth).new
124
- @method = meth
125
- parser
126
- end
127
-
128
- def parse(file)
129
- send(@method, file)
130
- end
131
- end
132
-
133
- class TopPred::Parser_XML::DOM
134
- include TopPred::Parser_XML
135
- include XMLStyleParser
136
-
137
- =begin
138
- YAL010C:
139
- num_putative_transmembrane_segments: 1
140
- aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
141
- best_structure_probability: 1.0
142
- transmembrane_segments:
143
- - aaseq: SLGAEFWLGLVSLSPGCSTTL
144
- stop: 252
145
- start: 232
146
- probability: 1.0
147
- num_certain_transmembrane_segments: 1
148
- num_found: 2
149
- =end
150
-
151
- # should return a index
152
- def to_index(io, index = {})
153
- get_root_node_from_io(io) do |toppreds_n|
154
-
155
- abort if toppreds_n.name != 'toppreds'
156
- toppreds_n.find('child::toppred').each do |toppred_n|
157
- att_hash = {}
158
- sequence_n = toppred_n.find_first('child::sequence')
159
- index[sequence_n['id']] = att_hash
160
- att_hash[:aaseq] = sequence_n.content.gsub(/[\s\n]/,'')
161
- abort if att_hash[:aaseq].size != sequence_n['size'].to_i
162
- tmsummary_n = sequence_n.find_first('following-sibling::tmsummary')
163
-
164
- num_found = tmsummary_n['segments'].to_i
165
- att_hash[:num_found] = num_found
166
- if num_found > 0
167
-
168
- num_certain_transmembrane_segments = 0
169
- num_putative_transmembrane_segments = 0
170
- tmsummary_n.find('child::segment').each do |segment_n|
171
- abort if segment_n.name != 'segment'
172
- case segment_n['type']
173
- when 'certain'
174
- num_certain_transmembrane_segments += 1
175
- else # putative
176
- num_putative_transmembrane_segments += 1
177
- end
178
- end
179
- att_hash[:num_putative_transmembrane_segments] = num_putative_transmembrane_segments
180
- att_hash[:num_certain_transmembrane_segments] = num_certain_transmembrane_segments
181
-
182
- topologies_n = tmsummary_n.next
183
- abort if topologies_n.name != 'topologies'
184
- # get the top probability topology:
185
- top_prob_topology_n = topologies_n.find('child::topology').to_a.max {|a,b| a['prob'].to_f <=> b['prob'].to_f }
186
- tmsegments = []
187
- top_prob_topology_n.find('child::tmsegment').each do |tmsegment_n|
188
- tmhash = {}
189
- tmhash[:start] = tmsegment_n['start'].to_i
190
- tmhash[:stop] = tmsegment_n['stop'].to_i
191
- ## WARNING! it appears the probability is broken on xml output!!
192
- tmhash[:probability] = tmsegment_n['prob'].to_f
193
- tmsegments << tmhash
194
- end
195
- add_sequences_to_segments(tmsegments, att_hash[:aaseq])
196
- att_hash[:transmembrane_segments] = tmsegments
197
- end
198
- end
199
- end
200
- index
201
- end
202
-
203
- end
204
-
205
- class TopPred::Parser_Text
206
- include TopPred::Parser
207
-
208
-
209
- # returns a hash structure in this form: {identifier => {aaseq => String,
210
- # num_found: Int, num_certain_transmembrane_segments => Int,
211
- # num_putative_transmembrane_segments => Int, best_structure_probability =>
212
- # Float, transmembrane_segments => [probability => Float, start => Int, stop
213
- # => Int, aaseq => String] } }
214
- def to_index(io, index={})
215
- current_record = nil
216
-
217
- io.each do |line|
218
- if line =~ /^Sequence : (.*?) +\(/
219
- current_identifier = $1.dup
220
- index[current_identifier] = {}
221
- current_record = index[current_identifier]
222
- current_record[:aaseq] = read_aaseq(io)
223
- read_segment_summary(io, current_record)
224
- elsif line =~ /^HEADER\s+START\s+STOP/
225
- top_struc = top_structure( read_structures(io) )
226
- current_record[:best_structure_probability] = top_struc[:probability]
227
- current_record[:transmembrane_segments] = top_struc[:tm]
228
- add_sequences_to_segments(current_record[:transmembrane_segments], current_record[:aaseq])
229
- segment_arrays_to_hashes(current_record[:transmembrane_segments])
230
- end
231
- end
232
- index
233
- end
234
-
235
- private
236
-
237
- # returns a list of all structures given a filehandle starting just after
238
- # the first "HEADER START STOP ..." line
239
- def read_structures(fh)
240
- structures = []
241
- loop do
242
- structures.push( read_structure(fh) )
243
- break if fh.eof?
244
- line = fh.readline
245
- unless line =~ /^HEADER\s+START\s+STOP/
246
- break
247
- end
248
- end
249
- structures
250
- end
251
-
252
- # returns a hash with key :probability and key :tm contains an array of
253
- # arrays: [prob(Float), start(Int), stop(Int)]
254
- def read_structure(fh)
255
- structure = {}
256
- # READ the first line
257
- line = fh.readline
258
- structure[:probability] = line.split(/\s+/)[2].to_f
259
- structure[:tm] = read_segments(fh)
260
- structure
261
- end
262
-
263
- # returns an array of arrays of transmembrane segments: [prob(Float),
264
- # start(Int), stop(Int)]
265
- # returns after seeing '//'
266
- def read_segments(fh)
267
- segments = []
268
- st = Regexp.escape('//') ; end_regex = /#{st}/
269
- fh.each do |line|
270
- if line =~ /^TRANSMEM/
271
- (header, start, stop, len, prob) = line.split(/\s+/)[0,5]
272
- segments << [prob.to_f, start.to_i, stop.to_i]
273
- elsif line =~ end_regex
274
- break
275
- end
276
- end
277
- segments
278
- end
279
-
280
- # returns the top probability structure (first on tie)
281
- def top_structure(list)
282
- top_prob = list.first[:probability]
283
- top_struc = list.first
284
- list.each do |st|
285
- if st[:probability] > top_prob
286
- top_struc = st
287
- top_prob = st[:probability]
288
- end
289
- end
290
- top_struc
291
- end
292
-
293
- def read_aaseq(fh)
294
- aaseq = ''
295
- fh.each do |line|
296
- line.chomp!
297
- unless line =~ /[\w\*]/
298
- break
299
- end
300
- aaseq << line
301
- end
302
- aaseq
303
- end
304
-
305
- def segment_arrays_to_hashes(list)
306
- list.map! do |ar|
307
- { :probability => ar[0],
308
- :start => ar[1],
309
- :stop => ar[2],
310
- :aaseq => ar[3],
311
- }
312
- end
313
- end
314
-
315
- # returns [certain, putative]
316
- # expects first line to be a tm segment
317
- def num_certain_putative(fh)
318
- certain = 0
319
- putative = 0
320
- fh.each do |line|
321
- certainty = line.chomp.split(/\s+/).last
322
- if !certainty
323
- break
324
- else
325
- certain += 1 if certainty == 'Certain'
326
- putative += 1 if certainty == 'Putative'
327
- end
328
- end
329
- [certain, putative]
330
- end
331
-
332
- def read_segment_summary(fh, rec)
333
- fh.each do |line|
334
- if line =~ /Found: (.*?) segments/
335
- rec[:num_found] = $1.to_i
336
- break if rec[:num_found] == 0
337
- elsif line =~ /Helix\s+Begin/
338
- (cert, putat) = num_certain_putative(fh)
339
- rec[:num_certain_transmembrane_segments] = cert
340
- rec[:num_putative_transmembrane_segments] = putat
341
- break
342
- end
343
- end
344
- end
345
- end
346
-
347
- class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
348
- def get_root_node_from_io(io, &block)
349
- # turn off warnings because this doesn't seem to work:
350
- # XML::Parser.default_load_external_dtd = false
351
- # (There is a warning about not finding DTD)
352
- xml_parser_warnings = XML::Parser.default_warnings
353
- XML::Parser.default_warnings = false
354
- doc = XML::Parser.io(io).parse
355
- root = doc.root
356
- block.call(root)
357
- # reset the warning level of XML::Parser:
358
- XML::Parser.default_warnings = xml_parser_warnings
359
- end
360
- end
361
-
362
- class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
363
- def get_root_node_from_io(io, &block)
364
- root = ::AXML.parse(io)
365
- block.call(root)
366
- end
367
- end
368
-