bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,50 @@
1
+ #
2
+ # bio/util/color_scheme/nucleotide.rb - Color codings for nucleotides
3
+ #
4
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: nucleotide.rb,v 1.2 2005/12/13 14:58:07 trevor Exp $
8
+ #
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+ #
29
+
30
+ require 'bio/util/color_scheme'
31
+
32
module Bio::ColorScheme
  # Color codings for nucleotides: a fixed table of six-digit hex color
  # strings keyed by the uppercase letters A, C, G, T and U.
  #
  # The table is stored in the class-level instance variable @colors.
  # NOTE(review): lookups are presumably served by the Simple superclass
  # (bio/util/color_scheme), which is not visible here -- confirm.
  class Nucleotide < Simple

    #########
    protected
    #########

    # Any key that is not listed falls back to white through the
    # Hash default value.
    @colors = Hash.new('FFFFFF').update({
      'A' => '64F73F',
      'C' => 'FFB340',
      'G' => 'EB413C',
      'T' => '3C88EE',
      'U' => '3C88EE'
    })

  end

  # Short aliases for the same class.
  Nuc = Nucleotide
  NA = Nuc
end
@@ -0,0 +1,78 @@
1
+ #
2
+ # bio/util/color_scheme/strand.rb - Color codings for strand propensity
3
+ #
4
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: strand.rb,v 1.3 2005/12/13 14:58:07 trevor Exp $
8
+ #
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+ #
29
+
30
+ require 'bio/util/color_scheme'
31
+
32
module Bio::ColorScheme
  # Color codings for strand propensity.
  #
  # Each residue letter carries a numeric score in @scores; at load time
  # every score is converted into a hex color string and stored in @colors.
  # NOTE(review): score_to_percent and rgb_percent_to_hex come from the
  # Score superclass (bio/util/color_scheme), which is not visible here.
  class Strand < Score

    #########
    protected
    #########

    # Convert a score into a hex color string.
    #
    # The score is first mapped onto a fraction by score_to_percent
    # (using +min+ and +max+); that fraction is passed as the first two
    # arguments of rgb_percent_to_hex and its complement as the third.
    # Presumably the arguments are red, green, blue -- confirm against Score.
    def self.score_to_rgb_hex(score, min, max)
      fraction = score_to_percent(score, min, max)
      rgb_percent_to_hex(fraction, fraction, 1.0 - fraction)
    end

    # Bounds handed to score_to_rgb_hex for every residue.
    @min = 0.37
    @max = 1.7

    @scores = {
      'A' => 0.83,
      'C' => 1.19,
      'D' => 0.54,
      'E' => 0.37,
      'F' => 1.38,
      'G' => 0.75,
      'H' => 0.87,
      'I' => 1.6,
      'K' => 0.74,
      'L' => 1.3,
      'M' => 1.05,
      'N' => 0.89,
      'P' => 0.55,
      'Q' => 1.1,
      'R' => 0.93,
      'S' => 0.75,
      'T' => 1.19,
      'U' => 0.0,
      'V' => 1.7,
      'W' => 1.37,
      'Y' => 1.47,

      'B' => 0.72,
      'X' => 1.0,
      'Z' => 0.74,
    }

    # Unlisted keys return white by default.
    @colors = Hash.new('FFFFFF')
    @scores.each_pair do |residue, value|
      @colors[residue] = score_to_rgb_hex(value, @min, @max)
    end

  end
end
@@ -0,0 +1,69 @@
1
+ #
2
+ # bio/util/color_scheme/taylor.rb - Taylor color codings for amino acids
3
+ #
4
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: taylor.rb,v 1.2 2005/12/13 14:58:07 trevor Exp $
8
+ #
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+ #
29
+
30
+ require 'bio/util/color_scheme'
31
+
32
module Bio::ColorScheme
  # Taylor color codings for amino acids: a fixed table of six-digit hex
  # color strings keyed by uppercase one-letter residue codes.
  #
  # The table is stored in the class-level instance variable @colors.
  # NOTE(review): lookups are presumably served by the Simple superclass
  # (bio/util/color_scheme), which is not visible here -- confirm.
  class Taylor < Simple

    #########
    protected
    #########

    # Any key that is not listed falls back to white through the
    # Hash default value.
    @colors = Hash.new('FFFFFF').update({
      'A' => 'CCFF00',
      'C' => 'FFFF00',
      'D' => 'FF0000',
      'E' => 'FF0066',
      'F' => '00FF66',
      'G' => 'FF9900',
      'H' => '0066FF',
      'I' => '66FF00',
      'K' => '6600FF',
      'L' => '33FF00',
      'M' => '00FF00',
      'N' => 'CC00FF',
      'P' => 'FFCC00',
      'Q' => 'FF00CC',
      'R' => '0000FF',
      'S' => 'FF3300',
      'T' => 'FF6600',
      'U' => 'FFFFFF',
      'V' => '99FF00',
      'W' => '00CCFF',
      'Y' => '00FFCC',

      # B, X and Z are explicitly white.
      'B' => 'FFFFFF',
      'X' => 'FFFFFF',
      'Z' => 'FFFFFF'
    })

  end
end
@@ -0,0 +1,78 @@
1
+ #
2
+ # bio/util/color_scheme/turn.rb - Color codings for turn propensity
3
+ #
4
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: turn.rb,v 1.2 2005/12/13 14:58:07 trevor Exp $
8
+ #
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+ #
29
+
30
+ require 'bio/util/color_scheme'
31
+
32
module Bio::ColorScheme
  # Color codings for turn propensity.
  #
  # Each residue letter carries a numeric score in @scores; at load time
  # every score is converted into a hex color string and stored in @colors.
  # NOTE(review): score_to_percent and rgb_percent_to_hex come from the
  # Score superclass (bio/util/color_scheme), which is not visible here.
  class Turn < Score

    #########
    protected
    #########

    # Convert a score into a hex color string.
    #
    # The score is first mapped onto a fraction by score_to_percent
    # (using +min+ and +max+); that fraction is passed as the first
    # argument of rgb_percent_to_hex and its complement as the second and
    # third. Presumably the arguments are red, green, blue -- confirm
    # against Score.
    def self.score_to_rgb_hex(score, min, max)
      fraction = score_to_percent(score, min, max)
      complement = 1.0 - fraction
      rgb_percent_to_hex(fraction, complement, 1.0 - fraction)
    end

    # Bounds handed to score_to_rgb_hex for every residue.
    @min = 0.47
    @max = 1.56

    @scores = {
      'A' => 0.66,
      'C' => 1.19,
      'D' => 1.46,
      'E' => 0.74,
      'F' => 0.6,
      'G' => 1.56,
      'H' => 0.95,
      'I' => 0.47,
      'K' => 1.01,
      'L' => 0.59,
      'M' => 0.6,
      'N' => 1.56,
      'P' => 1.52,
      'Q' => 0.98,
      'R' => 0.95,
      'S' => 1.43,
      'T' => 0.96,
      'U' => 0,
      'V' => 0.5,
      'W' => 0.96,
      'Y' => 1.14,

      'B' => 1.51,
      'X' => 1.0,
      'Z' => 0.86,
    }

    # Unlisted keys return white by default.
    @colors = Hash.new('FFFFFF')
    @scores.each_pair do |residue, value|
      @colors[residue] = score_to_rgb_hex(value, @min, @max)
    end

  end
end
@@ -0,0 +1,69 @@
1
+ #
2
+ # bio/util/color_scheme/zappo.rb - Zappo color codings for amino acids
3
+ #
4
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: zappo.rb,v 1.2 2005/12/13 14:58:07 trevor Exp $
8
+ #
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+ #
29
+
30
+ require 'bio/util/color_scheme'
31
+
32
module Bio::ColorScheme
  # Zappo color codings for amino acids: a fixed table of six-digit hex
  # color strings keyed by uppercase one-letter residue codes.
  #
  # The table is stored in the class-level instance variable @colors.
  # NOTE(review): lookups are presumably served by the Simple superclass
  # (bio/util/color_scheme), which is not visible here -- confirm.
  class Zappo < Simple

    #########
    protected
    #########

    # Any key that is not listed falls back to white through the
    # Hash default value.
    @colors = Hash.new('FFFFFF').update({
      'A' => 'FFAFAF',
      'C' => 'FFFF00',
      'D' => 'FF0000',
      'E' => 'FF0000',
      'F' => 'FFC800',
      'G' => 'FF00FF',
      'H' => 'FF0000',
      'I' => 'FFAFAF',
      'K' => '6464FF',
      'L' => 'FFAFAF',
      'M' => 'FFAFAF',
      'N' => '00FF00',
      'P' => 'FF00FF',
      'Q' => '00FF00',
      'R' => '6464FF',
      'S' => '00FF00',
      'T' => '00FF00',
      'U' => 'FFFFFF',
      'V' => 'FFAFAF',
      'W' => 'FFC800',
      'Y' => 'FFC800',

      # B, X and Z are explicitly white.
      'B' => 'FFFFFF',
      'X' => 'FFFFFF',
      'Z' => 'FFFFFF'
    })

  end
end
@@ -0,0 +1,337 @@
1
+ module Bio
2
+
3
+ #
4
+ # bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
5
+ #
6
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
7
+ # License:: LGPL
8
+ #
9
+ # $Id: contingency_table.rb,v 1.2 2005/12/13 14:58:37 trevor Exp $
10
+ #
11
+ #
12
+ #--
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27
+ #
28
+ #++
29
+ #
30
+ #
31
+
32
+ =begin rdoc
33
+ bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
34
+
35
+ == Synopsis
36
+
37
+ The Bio::ContingencyTable class provides basic statistical contingency table
38
+ analysis for two positions within aligned sequences.
39
+
40
+ When ContingencyTable is instantiated the set of characters in the aligned sequences may be
41
+ passed to it as an array. This is important since it uses these characters
42
+ to create the table's rows and columns. If this array is not passed it will
43
+ use its default of an amino acid and nucleotide alphabet in lowercase along with the
44
+ clustal spacer '-'.
45
+
46
+ To get data from the table the most used functions will be chi_square and contingency_coefficient:
47
+ ctable = Bio::ContingencyTable.new()
48
+ ctable.table['a']['t'] += 1
49
+ # .. put more values into the table
50
+ puts ctable.chi_square
51
+ puts ctable.contingency_coefficient # between 0.0 and 1.0
52
+
53
+ The contingency_coefficient represents the degree of correlation of change between two
54
+ sequence positions in a multiple-sequence alignment. 0.0 indicates no correlation, 1.0 is the
55
+ maximum correlation.
56
+
57
+
58
+ == Further Reading
59
+
60
+ * http://en.wikipedia.org/wiki/Contingency_table
61
+ * http://www.physics.csbsju.edu/stats/exact.details.html
62
+ * Numerical Recipes in C by Press, Flannery, Teukolsky, and Vetterling
63
+
64
+
65
+ == Usage
66
+
67
+ What follows is an example of ContingencyTable in typical usage analyzing results from a clustal alignment.
68
+
69
+ require 'bio'
70
+ require 'bio/util/contingency_table'
71
+
72
+ seqs = {}
73
+ max_length = 0
74
+ Bio::ClustalW::Report.new( IO.read('sample.aln') ).to_a.each do |entry|
75
+ data = entry.data.strip
76
+ seqs[entry.definition] = data.downcase
77
+ max_length = data.size if max_length == 0
78
+ raise "Aligned sequences must be the same length!" unless data.size == max_length
79
+ end
80
+
81
+ VERBOSE = true
82
+ puts "i\tj\tchi_square\tcontingency_coefficient" if VERBOSE
83
+ correlations = {}
84
+
85
+ 0.upto(max_length - 1) do |i|
86
+ (i+1).upto(max_length - 1) do |j|
87
+ ctable = Bio::ContingencyTable.new()
88
+ seqs.each_value { |seq| ctable.table[ seq[i].chr ][ seq[j].chr ] += 1 }
89
+
90
+ chi_square = ctable.chi_square
91
+ contingency_coefficient = ctable.contingency_coefficient
92
+ puts [(i+1), (j+1), chi_square, contingency_coefficient].join("\t") if VERBOSE
93
+
94
+ correlations["#{i+1},#{j+1}"] = contingency_coefficient
95
+ correlations["#{j+1},#{i+1}"] = contingency_coefficient # Both ways are accurate
96
+ end
97
+ end
98
+
99
+ require 'yaml'
100
+ File.open('results.yml', 'a+') { |f| f.puts correlations.to_yaml }
101
+
102
+
103
+ == Tutorial
104
+
105
+ ContingencyTable returns the statistical significance of change between two positions in an alignment.
106
+ If you would like to see how every possible combination of positions in your alignment compares to one another
107
+ you must set this up yourself. Hopefully the provided examples will help you get started without
108
+ too much trouble.
109
+
110
+ def lite_example(sequences, max_length, characters)
111
+
112
+ %w{i j chi_square contingency_coefficient}.each { |x| print x.ljust(12) }
113
+ puts
114
+
115
+ 0.upto(max_length - 1) do |i|
116
+ (i+1).upto(max_length - 1) do |j|
117
+ ctable = Bio::ContingencyTable.new( characters )
118
+ sequences.each do |seq|
119
+ i_char = seq[i].chr
120
+ j_char = seq[j].chr
121
+ ctable.table[i_char][j_char] += 1
122
+ end
123
+ chi_square = ctable.chi_square
124
+ contingency_coefficient = ctable.contingency_coefficient
125
+ [(i+1), (j+1), chi_square, contingency_coefficient].each { |x| print x.to_s.ljust(12) }
126
+ puts
127
+ end
128
+ end
129
+
130
+ end
131
+
132
+ allowed_letters = Array.new
133
+ allowed_letters = 'abcdefghijk'.split('')
134
+
135
+ seqs = Array.new
136
+ seqs << 'abcde'
137
+ seqs << 'abcde'
138
+ seqs << 'aacje'
139
+ seqs << 'aacae'
140
+
141
+ length_of_every_sequence = seqs[0].size # 5 letters long
142
+
143
+ lite_example(seqs, length_of_every_sequence, allowed_letters)
144
+
145
+
146
+ Producing the following results:
147
+
148
+ i j chi_square contingency_coefficient
149
+ 1 2 0.0 0.0
150
+ 1 3 0.0 0.0
151
+ 1 4 0.0 0.0
152
+ 1 5 0.0 0.0
153
+ 2 3 0.0 0.0
154
+ 2 4 4.0 0.707106781186548
155
+ 2 5 0.0 0.0
156
+ 3 4 0.0 0.0
157
+ 3 5 0.0 0.0
158
+ 4 5 0.0 0.0
159
+
160
+ The position i=2 and j=4 has a high contingency coefficient indicating that the changes at these
161
+ positions are related. Note that i and j are arbitrary, this could be represented as i=4 and j=2
162
+ since they both refer to position two and position four in the alignment. Here are some more examples:
163
+
164
+ seqs = Array.new
165
+ seqs << 'abcde'
166
+ seqs << 'abcde'
167
+ seqs << 'aacje'
168
+ seqs << 'aacae'
169
+ seqs << 'akcfe'
170
+ seqs << 'akcfe'
171
+
172
+ length_of_every_sequence = seqs[0].size # 5 letters long
173
+
174
+ lite_example(seqs, length_of_every_sequence, allowed_letters)
175
+
176
+
177
+ Results:
178
+
179
+ i j chi_square contingency_coefficient
180
+ 1 2 0.0 0.0
181
+ 1 3 0.0 0.0
182
+ 1 4 0.0 0.0
183
+ 1 5 0.0 0.0
184
+ 2 3 0.0 0.0
185
+ 2 4 12.0 0.816496580927726
186
+ 2 5 0.0 0.0
187
+ 3 4 0.0 0.0
188
+ 3 5 0.0 0.0
189
+ 4 5 0.0 0.0
190
+
191
+ Here we can see that the strength of the correlation of change has increased when more data is added with correlated changes at the same positions.
192
+
193
+ seqs = Array.new
194
+ seqs << 'abcde'
195
+ seqs << 'abcde'
196
+ seqs << 'kacje' # changed first letter
197
+ seqs << 'aacae'
198
+ seqs << 'akcfa' # changed last letter
199
+ seqs << 'akcfe'
200
+
201
+ length_of_every_sequence = seqs[0].size # 5 letters long
202
+
203
+ lite_example(seqs, length_of_every_sequence, allowed_letters)
204
+
205
+
206
+ Results:
207
+
208
+ i j chi_square contingency_coefficient
209
+ 1 2 2.4 0.534522483824849
210
+ 1 3 0.0 0.0
211
+ 1 4 6.0 0.707106781186548
212
+ 1 5 0.24 0.196116135138184
213
+ 2 3 0.0 0.0
214
+ 2 4 12.0 0.816496580927726
215
+ 2 5 2.4 0.534522483824849
216
+ 3 4 0.0 0.0
217
+ 3 5 0.0 0.0
218
+ 4 5 2.4 0.534522483824849
219
+
220
+ With random changes it becomes more difficult to identify correlated changes, yet positions two
221
+ and four still have the highest correlation as indicated by the contingency coefficient. The
222
+ best way to improve the accuracy of your results, as is often the case with statistics, is to
223
+ increase the sample size.
224
+
225
+
226
+ == A Note on Efficiency
227
+
228
+ ContingencyTable is slow. It involves many calculations for even a seemingly small five-string data set.
229
+ Even worse, it's very dependent on matrix traversal, and this is done with two dimensional hashes which
230
+ dashes any hope of decent speed.
231
+
232
+ Finally, half of the matrix is redundant and positions could be summed with their companion position to reduce
233
+ calculations. For example the positions (5,2) and (2,5) could both have their values added together and
234
+ just stored in (2,5) while (5,2) could be an illegal position. Also, positions (1,1), (2,2), (3,3), etc.
235
+ will never be used.
236
+
237
+ The purpose of this package is flexibility and education. The code is short and to the point in
238
+ aims of achieving that purpose. If the BioRuby project moves towards C extensions in the future a
239
+ professional caliber version will likely be created.
240
+
241
+
242
+ == Author
243
+ Trevor Wennblom <trevor@corevx.com>
244
+
245
+
246
+ == Copyright
247
+ Copyright (C) 2005 Trevor Wennblom
248
+ Licensed under the same terms as BioRuby.
249
+
250
+ =end
251
+
252
class ContingencyTable
  # Since we're making this math-notation friendly here is the layout of @table:
  # * @table[row][column]
  # * @table[i][j]
  # * @table[y][x]
  #
  # Every cell holds a count; callers increment cells directly, e.g.
  #   ctable.table['a']['t'] += 1
  attr_accessor :table

  # The symbols used as both the row keys and the column keys.
  attr_reader :characters

  # Create a ContingencyTable that has characters_in_sequences.size rows and
  # characters_in_sequences.size columns for each row, with every cell
  # starting at 0.
  #
  # characters_in_sequences:: Array of symbols (usually one-character
  #                           Strings). When nil, a default lowercase
  #                           alphabet plus '-' is used.
  def initialize(characters_in_sequences = nil)
    @characters = ( characters_in_sequences or %w{a c d e f g h i k l m n p q r s t v w y - x u} )
    @table = {}
    @characters.each do |row_key|
      # Each row gets its own Hash so that rows never share cells.
      row = {}
      @characters.each { |column_key| row[column_key] = 0 }
      @table[row_key] = row
    end
  end

  # Report the sum of all values in a given row
  def row_sum(i)
    @table[i].values.inject(0) { |sum, count| sum + count }
  end

  # Report the sum of all values in a given column
  def column_sum(j)
    @table.values.inject(0) { |sum, row| sum + row[j] }
  end

  # Report the sum of all values in all columns.
  #
  # * This is the same thing as asking for the sum of all values in the table.
  #
  def column_sum_all
    @characters.inject(0) { |sum, j| sum + column_sum(j) }
  end

  # Report the sum of all values in all rows.
  #
  # * This is the same thing as asking for the sum of all values in the table.
  #
  def row_sum_all
    @characters.inject(0) { |sum, i| sum + row_sum(i) }
  end
  alias table_sum_all row_sum_all

  # Expected count for cell (i, j) under independence of rows and columns:
  #
  #   e_ij = (r_i / N) * c_j
  #
  # where r_i is the row sum, c_j the column sum and N the table total.
  # Returns 0.0 for an empty table (N == 0) instead of NaN.
  def expected(i, j)
    expected_count(row_sum(i), column_sum(j), table_sum_all)
  end

  # Report the chi square of the entire table
  #
  # Always returns a Float; an empty table yields 0.0.
  def chi_square
    # The marginal sums do not change while we walk the cells, so compute
    # them once instead of once per cell.
    grand_total = table_sum_all
    row_totals = {}
    column_totals = {}
    @characters.each do |key|
      row_totals[key] = row_sum(key)
      column_totals[key] = column_sum(key)
    end

    total = 0.0
    @characters.each do |i|   # Loop through every row in the ContingencyTable
      @characters.each do |j| # Loop through every column in the ContingencyTable
        eij = expected_count(row_totals[i], column_totals[j], grand_total)
        total += chi_square_term(@table[i][j], eij)
      end
    end
    total
  end

  # Report the chi square relation of two elements in the table
  #
  # Returns 0 when the expected count for the cell is 0.
  def chi_square_element(i, j)
    chi_square_term(@table[i][j], expected(i, j))
  end

  # Report the contingency coefficient of the table
  #
  #   C = sqrt( chi_square / (N + chi_square) )
  #
  # Returns 0.0 for an empty table, where the ratio would otherwise be 0/0.
  def contingency_coefficient
    c_s = chi_square
    denominator = table_sum_all + c_s
    return 0.0 if denominator == 0
    Math.sqrt(c_s / denominator)
  end

  private

  # Expected count from already-computed marginal sums; 0.0 when the table
  # is empty so that callers never see a 0/0 division.
  def expected_count(row_total, column_total, grand_total)
    return 0.0 if grand_total == 0
    (row_total.to_f / grand_total) * column_total
  end

  # One cell's contribution to chi square: (observed - expected)^2 / expected.
  def chi_square_term(observed, eij)
    return 0 if eij == 0
    ( observed - eij )**2 / eij
  end

end
337
+ end