bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # gbtab2mysql.rb - load tab delimited GenBank data files into MySQL
4
+ #
5
+ # Copyright (C) 2002 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: gbtab2mysql.rb,v 1.3 2002/06/25 19:30:26 k Exp $
22
+ #
23
+
24
+ require 'dbi'
25
+
26
+ $schema_ent = <<END
27
+ id varchar(16) NOT NULL PRIMARY KEY,
28
+ nalen integer,
29
+ strand varchar(5),
30
+ natype varchar(5),
31
+ circular varchar(10),
32
+ division varchar(5),
33
+ date varchar(12),
34
+ definition varchar(255),
35
+ accession varchar(30),
36
+ versions varchar(30),
37
+ keywords varchar(255),
38
+ segment varchar(255),
39
+ source varchar(255),
40
+ organism varchar(255),
41
+ taxonomy varchar(255),
42
+ comment text,
43
+ basecount varchar(255),
44
+ origin varchar(255),
45
+ KEY (nalen),
46
+ KEY (division),
47
+ KEY (accession),
48
+ KEY (organism),
49
+ KEY (taxonomy)
50
+ END
51
+
52
+ $schema_ft = <<END
53
+ id varchar(16) NOT NULL,
54
+ num integer,
55
+ feature varchar(30),
56
+ position text,
57
+ span_min integer,
58
+ span_max integer,
59
+ qualifier varchar(30),
60
+ value text,
61
+ KEY (id),
62
+ KEY (num),
63
+ KEY (feature),
64
+ KEY (span_min),
65
+ KEY (span_max),
66
+ KEY (qualifier)
67
+ END
68
+
69
+ $schema_ref = <<END
70
+ id varchar(16) NOT NULL,
71
+ num integer,
72
+ authors text,
73
+ title text,
74
+ journal text,
75
+ medline varchar(255),
76
+ pubmed varchar(255),
77
+ KEY (id),
78
+ KEY (medline),
79
+ KEY (pubmed)
80
+ END
81
+
82
+ $schema_seq = <<END
83
+ id varchar(16) NOT NULL,
84
+ num integer,
85
+ naseq mediumtext,
86
+ KEY (id)
87
+ END
88
+
89
+
90
# Create the entry, feature, reference and sequence tables for one
# GenBank division unless they already exist.
#
# dbh   :: database handle; only #execute is called on it
# table :: base table name; "ft", "ref" and "seq" are appended to it for
#          the feature, reference and sequence tables
#
# Returns the value of the last dbh.execute call.
def create_table(dbh, table)
  $stderr.puts("create tables on #{table}") if $DEBUG

  # [table name suffix, column definitions] in creation order
  layouts = [
    ['',    $schema_ent],
    ['ft',  $schema_ft],
    ['ref', $schema_ref],
    ['seq', $schema_seq]
  ]

  layouts.map { |suffix, columns|
    dbh.execute("CREATE TABLE IF NOT EXISTS #{table}#{suffix} ( #{columns} )")
  }.last
end
102
+
103
+
104
# Bulk-load the four tab-delimited files derived from one GenBank flat
# file into the matching tables.
#
# dbh   :: database handle; only #execute is called on it
# base  :: file name stem; "<base>.seq.{ent,ft,ref,seq}.tab" are loaded
# table :: base table name; "ft", "ref" and "seq" are appended to it
#
# Returns the value of the last dbh.execute call.
def load_tab(dbh, base, table)
  $stderr.puts("load #{base} into #{table}") if $DEBUG

  # [file kind, table name suffix] in load order
  targets = [['ent', ''], ['ft', 'ft'], ['ref', 'ref'], ['seq', 'seq']]

  targets.map { |kind, suffix|
    dbh.execute("LOAD DATA LOCAL INFILE '#{base}.seq.#{kind}.tab' INTO TABLE #{table}#{suffix}")
  }.last
end
116
+
117
+
118
# Create the MERGE tables "ent", "ft", "ref" and "seq" that union the
# per-division tables.
#
# dbh    :: database handle; only #execute is called on it
# tables :: base names of the per-division tables to merge
#
# Returns nil without touching the database when +tables+ is empty,
# otherwise the value of the last dbh.execute call.
#
# NOTE(review): TYPE=MERGE is the historical spelling of ENGINE=MERGE;
# confirm which one the target MySQL server accepts.
def merge_table(dbh, tables)
  # An empty list used to produce "UNION=(  )" for ent and the
  # self-referencing "UNION=( ft )" etc. for the other tables.
  return nil if tables.empty?

  # [merge table name / member suffix, column definitions]
  layouts = [
    ['',    $schema_ent],
    ['ft',  $schema_ft],
    ['ref', $schema_ref],
    ['seq', $schema_seq]
  ]

  layouts.map { |suffix, columns|
    merged  = suffix.empty? ? 'ent' : suffix
    members = tables.map { |name| "#{name}#{suffix}" }.join(', ')
    dbh.execute("CREATE TABLE IF NOT EXISTS #{merged} ( #{columns} )" +
                " TYPE=MERGE UNION=( #{members} )")
  }.last
end
132
+
133
+
134
# Main: load every *.seq-derived tab file in the current directory into
# the "genbank" database, then build the merge tables.  Start and end
# times are written to stderr.
$stderr.puts Time.now

DBI.connect('dbi:Mysql:genbank:localhost', 'root') do |dbh|
  # tables created during this run; only these are handed to merge_table
  tables = []

  Dir.glob("*.seq").sort.each do |gbk|
    base = File.basename(gbk, '.seq')

    div = base[/gb.../]       # "gb" plus the next three characters
    num = base[/\d+/].to_i    # first run of digits; 0 when there is none

    # Files numbered above 20 go to a numbered table, one per 20 files;
    # the rest share the plain division table.
    table = num > 20 ? format("%s%d", div, (num - 1) / 20 + 1) : div

    unless dbh.tables.include?(table)
      create_table(dbh, table)
      tables.push(table)
    end

    load_tab(dbh, base, table)
  end

  merge_table(dbh, tables)
end

$stderr.puts Time.now
160
+
161
+
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # genes2nuc.rb - convert KEGG/GENES entry into FASTA format (nuc)
4
+ #
5
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # $Id: genes2nuc.rb,v 0.4 2002/06/23 20:21:56 k Exp $
18
+ #
19
+
20
+ require 'bio/db/kegg/genes'
21
+ require 'bio/extend'
22
+
23
+ include Bio
24
+
25
# Read KEGG/GENES entries from ARGF one delimiter-terminated record at a
# time and print each one that has a nucleotide sequence in FASTA form.
while (entry = gets(KEGG::GENES::DELIMITER))
  genes = KEGG::GENES.new(entry)

  # entries without nucleotide sequence are skipped
  unless genes.nalen == 0
    puts ">#{genes.entry_id} #{genes.definition}"
    # fold comes from bio/extend; presumably 72 = 60 residues + 12 columns
    # of indent, matching the GENES flat-file layout -- TODO confirm
    puts genes.naseq.fold(60 + 12, 12)
  end
end
33
+
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # genes2pep.rb - convert KEGG/GENES entry into FASTA format (pep)
4
+ #
5
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # $Id: genes2pep.rb,v 0.4 2002/06/23 20:21:56 k Exp $
18
+ #
19
+
20
+ require 'bio/db/kegg/genes'
21
+ require 'bio/extend'
22
+
23
+ include Bio
24
+
25
# Read KEGG/GENES entries from ARGF one delimiter-terminated record at a
# time and print each one that has an amino acid sequence in FASTA form.
while (entry = gets(KEGG::GENES::DELIMITER))
  genes = KEGG::GENES.new(entry)

  # entries without amino acid sequence are skipped
  unless genes.aalen == 0
    puts ">#{genes.entry_id} #{genes.definition}"
    # fold comes from bio/extend; presumably 72 = 60 residues + 12 columns
    # of indent, matching the GENES flat-file layout -- TODO confirm
    puts genes.aaseq.fold(60 + 12, 12)
  end
end
33
+
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # genes2tab.rb - convert KEGG/GENES into tab delimited data for MySQL
4
+ #
5
+ # Usage:
6
+ #
7
+ # % genes2tab.rb /bio/db/kegg/genes/e.coli > genes_eco.tab
8
+ #
9
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
10
+ #
11
+ # This program is free software; you can redistribute it and/or modify
12
+ # it under the terms of the GNU General Public License as published by
13
+ # the Free Software Foundation; either version 2 of the License, or
14
+ # (at your option) any later version.
15
+ #
16
+ # This program is distributed in the hope that it will be useful,
17
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ # GNU General Public License for more details.
20
+ #
21
+ # $Id: genes2tab.rb,v 0.5 2002/06/23 20:21:56 k Exp $
22
+ #
23
+
24
+ require 'bio/db/kegg/genes'
25
+
26
+ include Bio
27
+
28
# Convert each KEGG/GENES entry read from ARGF into one tab-delimited
# line whose column order matches the CREATE TABLE statement documented
# at the end of this file.
while (entry = gets(KEGG::GENES::DELIMITER))
  genes = KEGG::GENES.new(entry)

  db = genes.dblinks.inspect

  # A codon usage table is written only when all 64 codons are present;
  # otherwise the literal \N is written (MySQL's NULL marker for LOAD DATA).
  cu = genes.codon_usage.length == 64 ? genes.codon_usage.join(' ') : '\N'

  columns = [
    genes.entry_id, genes.division, genes.organism, genes.name,
    genes.definition, genes.keggclass, genes.position,
    db, cu,
    genes.aalen, genes.aaseq,
    genes.nalen, genes.naseq
  ]

  puts columns.join("\t")
end
59
+
60
+ =begin
61
+
62
+ CREATE DATABASE IF NOT EXISTS db_name;
63
+ CREATE TABLE IF NOT EXISTS db_name.genes (
64
+ id varchar(30) not NULL, # ENTRY ID
65
+ division varchar(30), # CDS, tRNA etc.
66
+ organism varchar(255),
67
+ gene varchar(255),
68
+ definition varchar(255),
69
+ keggclass varchar(255),
70
+ position varchar(255),
71
+ dblinks varchar(255),
72
+ codon_usage text,
73
+ aalen integer,
74
+ aaseq text,
75
+ nalen integer,
76
+ naseq text
77
+ );
78
+ LOAD DATA LOCAL INFILE 'genes.tab' INTO TABLE db_name.genes;
79
+
80
+ =end
81
+
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # genome2rb.rb - used to generate contents of the bio/data/keggorg.rb
4
+ #
5
+ # Usage:
6
+ #
7
+ # % genome2rb.rb genome | sort
8
+ #
9
+ # Copyright (C) 2002 KATAYAMA Toshiaki <k@bioruby.org>
10
+ #
11
+ # This program is free software; you can redistribute it and/or modify
12
+ # it under the terms of the GNU General Public License as published by
13
+ # the Free Software Foundation; either version 2 of the License, or
14
+ # (at your option) any later version.
15
+ #
16
+ # This program is distributed in the hope that it will be useful,
17
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ # GNU General Public License for more details.
20
+ #
21
+ # $Id: genome2rb.rb,v 1.1 2002/03/04 08:14:45 katayama Exp $
22
+ #
23
+
24
+ require 'bio'
25
+
26
# Print one Ruby hash-literal line per KEGG/GENOME entry read from ARGF:
# entry id => [ name, definition ].
Bio::FlatFile.new(Bio::KEGG::GENOME, ARGF).each do |genome|
  puts " '%s' => [ '%s', '%s' ]," % [genome.entry_id, genome.name, genome.definition]
end
29
+
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # genome2tab.rb - convert KEGG/GENOME into tab delimited data for MySQL
4
+ #
5
+ # Usage:
6
+ #
7
+ # % genome2tab.rb /bio/db/kegg/genome/genome > genome.tab
8
+ #
9
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
10
+ #
11
+ # This program is free software; you can redistribute it and/or modify
12
+ # it under the terms of the GNU General Public License as published by
13
+ # the Free Software Foundation; either version 2 of the License, or
14
+ # (at your option) any later version.
15
+ #
16
+ # This program is distributed in the hope that it will be useful,
17
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ # GNU General Public License for more details.
20
+ #
21
+ # $Id: genome2tab.rb,v 0.5 2002/06/23 20:21:56 k Exp $
22
+ #
23
+
24
+ require 'bio/db/kegg/genome'
25
+
26
+ include Bio
27
+
28
# Convert each KEGG/GENOME entry read from ARGF into one tab-delimited
# line whose column order matches the CREATE TABLE statement documented
# at the end of this file.
while (entry = gets(KEGG::GENOME::DELIMITER))
  genome = KEGG::GENOME.new(entry)

  # references and chromosomes are stored as their inspect strings
  ref = genome.references.inspect
  chr = genome.chromosomes.inspect

  columns = [
    genome.entry_id, genome.name, genome.definition,
    genome.taxid, genome.taxonomy, genome.comment,
    ref, chr,
    genome.nalen, genome.num_gene, genome.num_rna,
    genome.gc, genome.genomemap
  ]

  puts columns.join("\t")
end
54
+
55
+ =begin
56
+
57
+ CREATE DATABASE IF NOT EXISTS db_name;
58
+ CREATE TABLE IF NOT EXISTS db_name.genome (
59
+ id varchar(30) not NULL,
60
+ name varchar(80),
61
+ definition varchar(255),
62
+ taxid varchar(30),
63
+ taxonomy varchar(255),
64
+ comment varchar(255),
65
+ reference text,
66
+ chromosome text,
67
+ nalen integer,
68
+ num_gene integer,
69
+ num_rna integer,
70
+ gc float,
71
+ genomemap varchar(30),
72
+ );
73
+ LOAD DATA LOCAL INFILE 'genome.tab' INTO TABLE db_name.genome;
74
+
75
+ =end
76
+
data/sample/goslim.rb ADDED
@@ -0,0 +1,311 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # goslim.rb - making a GO slim histgram
4
+ #
5
+ # Usage:
6
+ #
7
+ # % goslim.rb -p process.ontology -f function.ontology \
8
+ # -c component.ontology -s goslim_goa.2002 -g gene_association.mgi \
9
+ # -o mgi -r
10
+ # % R < mgi.R
11
+ # % gv mgi.pdf
12
+ #
13
+ # Copyright (C) 2003 Mitsuteru C. Nakao <n@bioruby.org>
14
+ #
15
+ # This program is free software; you can redistribute it and/or modify
16
+ # it under the terms of the GNU General Public License as published by
17
+ # the Free Software Foundation; either version 2 of the License, or
18
+ # (at your option) any later version.
19
+ #
20
+ # This program is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU General Public License for more details.
24
+ #
25
+ # $Id: goslim.rb,v 1.3 2003/05/13 10:45:42 n Exp $
26
+ #
27
+
28
+
29
+
30
+ SCRIPT_VERSION = '$Id: goslim.rb,v 1.3 2003/05/13 10:45:42 n Exp $'
31
+
32
# Help text printed for -h/--help and when no input option is given.
# Backslashes are doubled so that the shell line continuations and the
# \d in the regexp are printed literally instead of being consumed as
# string escapes.
USAGE = "#{__FILE__} - GO slim
Usage:
  #{__FILE__} -p process.ontology -f function.ontology \\
    -c component.ontology -g gene_association.mgi -s goslim_goa.2002 \\
    -o goslim.uniqued.out -r

  #{__FILE__} -p process.ontology -f function.ontology \\
    -c component.ontology -l gene_association.list -s goslim_goa.2002 \\
    -o mgi.out -r

  #{__FILE__} -p process.ontology -f function.ontology \\
    -c component.ontology -g gene_association.mgi -s goslim_goa.2002 >\\
    go_goslim.paired.list



Options:
  -p,--process <go/ontology/process.ontology>
  -f,--function <go/ontology/function.ontology>
  -c,--component <go/ontology/component.ontology>
  -g,--ga <go/gene-associations/gene_association.someone>
  -l,--galist <a GO_ID list>
  -s,--goslim <go/GO_slim/goslim_someone>
  -o,--output <file_name> -- output file name.
  -r,--r_script -- Writing a R script in <file_name>.R to plot a barplot.
  -h,--help
  -v,--version

Format:
  GO ID list: /^GO:\\d{7}/ for each line

Mitsuteru C. Nakao <n@bioruby.org>
"
65
+
66
+
67
+
68
+ require 'getoptlong'
69
# Command line parsing.  Every option is stored in a global named after
# its long form: --process => $OPT_PROCESS, --r_script => $OPT_R_SCRIPT ...
# Flags without an argument are stored as an empty string (still truthy).
parser = GetoptLong.new
parser.set_options(
  ['--process',   '-p', GetoptLong::REQUIRED_ARGUMENT],
  ['--function',  '-f', GetoptLong::REQUIRED_ARGUMENT],
  ['--component', '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--ga',        '-g', GetoptLong::REQUIRED_ARGUMENT],
  ['--galist',    '-l', GetoptLong::REQUIRED_ARGUMENT],
  ['--goslim',    '-s', GetoptLong::REQUIRED_ARGUMENT],
  ['--output',    '-o', GetoptLong::REQUIRED_ARGUMENT],
  ['--r_script',  '-r', GetoptLong::NO_ARGUMENT],
  ['--help',      '-h', GetoptLong::NO_ARGUMENT],
  ['--version',   '-v', GetoptLong::NO_ARGUMENT])

begin
  parser.each_option do |name, arg|
    # Only the variable name is built into the evaluated source, and it
    # comes from the fixed option table above.  The value is referenced
    # through the block-local `arg` rather than spliced into a quoted
    # literal, so a quote or backslash in an argument can neither break
    # the assignment nor inject code.
    eval "$OPT_#{name.sub(/^--/, '').gsub(/-/, '_').upcase} = arg"
  end
rescue
  # GetoptLong has already reported the problem on stderr
  exit(1)
end

if $OPT_VERSION
  puts SCRIPT_VERSION
  exit(0)
end

# Usage is shown on request, or when none of the ontology /
# gene-association inputs was given at all.
# NOTE(review): a partially specified command line still gets past this
# check and fails later when a missing file is read.
if $OPT_HELP || !($OPT_PROCESS || $OPT_FUNCTION || $OPT_COMPONENT ||
                  $OPT_GA || $OPT_GALIST)
  puts USAGE
  exit(0)
end
100
+
101
+
102
+
103
+
104
+ # subroutines
105
+
106
# Build the R script that draws the GO slim bar plot.
#
# datname :: name of the tab-delimited data file; the script reads it and
#            writes the plot to "<datname>.pdf"
#
# Returns the script source as a String.
def slim2r(datname)
  <<END_OF_R_SCRIPT
# usage: % R --vanilla < #{datname}.R
data <- read.delim2('#{datname}')
dat <- data$count
names(dat) <- paste(data$GO.Term, dat)
# set graphc format
pdf('#{datname}.pdf')
#postscript('#{datname}.ps')
# outside margins
par(mai = c(1,2.8,1,0.7))
barplot(dat,
cex.names = 0.6, # row names font size
las = 2, # set horizontal row names
horiz = T, # set horizontal
main = 'GO slim', # main title
# set color schema, proc, blue(3); func, red(2); comp, green(4)
col = cbind(c(data$aspect == 'process'),
c(data$aspect == 'function'),
c(data$aspect == 'component')) %*% c(4,2,3)) # color
dev.off()
END_OF_R_SCRIPT
end
128
+
129
+
130
+ # build GOslim uniqued list
131
# Count, per GO slim term, how many gene-association ids are connected to
# it in the ontology graph.
#
# ontology :: object answering goid2term(id) and bfs_shortest_path(from, to)
# slim_ids :: GO slim ids to tally
# tmp      :: result hash; tmp[aspect] is replaced by { term => count }
# ga       :: gene-association GO ids
# aspect   :: key under which the tally is stored (also used in the warning)
#
# Ids without a term are skipped.  When the path search raises NameError
# the term is dropped from the tally and a warning goes to stderr.
def slim(ontology, slim_ids, tmp, ga, aspect)
  tally = tmp[aspect] = Hash.new(0)

  slim_ids.each do |slim_id|
    term = ontology.goid2term(slim_id)
    next unless term

    tally[term] = 0

    ga.each do |gaid|
      begin
        path = ontology.bfs_shortest_path(slim_id, gaid)
        # first element is truthy only when a path was found
        tally[term] += 1 if path[0]
      rescue NameError
        $stderr.puts "Warnning: GO:#{slim_id} (#{term}) doesn't exist in the #{aspect}.ontology."
        tally.delete(term)
        break
      end
    end
  end
end
153
+
154
+
155
+ # build GO-GOslim uniqued list
156
# Collect, per GO slim term, the slim ids that map to it and the
# gene-association ids connected to them in the ontology graph.
#
# ontology :: object answering goid2term(id) and bfs_shortest_path(from, to)
# slim_ids :: GO slim ids to look up
# tmp      :: result hash; tmp[aspect] is replaced by
#             { term => { 'GOslim' => [slim ids], 'GO' => [ga ids] } }
# ga       :: gene-association GO ids
# aspect   :: key under which the result is stored
#
# Ids without a term are skipped.  A NameError from the path search ends
# the scan of +ga+ for that slim id; hits collected so far are kept.
def slim2(ontology, slim_ids, tmp, ga, aspect)
  by_term = tmp[aspect] = Hash.new

  slim_ids.each do |slim_id|
    term = ontology.goid2term(slim_id)
    next unless term

    # first sighting of a term creates its record
    record = (by_term[term] ||= { 'GOslim' => [], 'GO' => [] })
    record['GOslim'] << slim_id unless record['GOslim'].index(slim_id)

    ga.each do |gaid|
      begin
        path = ontology.bfs_shortest_path(slim_id, gaid)
        # first element is truthy only when a path was found
        record['GO'] << gaid if path[0]
      rescue NameError
        break
      end
    end
  end
end
183
+
184
+
185
+
186
+ #
187
+ # main
188
+ #
189
+
190
+ require 'bio/db/go'
191
+
192
# The three GO aspects, in processing order, and the id of each
# aspect's root term.
aspects = %w[process function component]
rootids = {
  'process'   => '0008150',
  'function'  => '0003674',
  'component' => '0005575'
}

# files open

# input files keyed by role; roles whose option was not given stay nil
files = {
  'process'   => $OPT_PROCESS,
  'function'  => $OPT_FUNCTION,
  'component' => $OPT_COMPONENT,
  'ga'        => $OPT_GA,      # gene-association
  'list'      => $OPT_GALIST,  # gene-association list
  'slim'      => $OPT_GOSLIM   # GO slim
}

ios = {}
files.each do |role, path|
  ios[role] = File.open(path) unless path.nil?
end

# With -o the table goes to <output> and the R script to <output>.R;
# without it both go to standard output.
if $OPT_OUTPUT
  ios['output']   = File.new($OPT_OUTPUT, "w+")
  ios['r_script'] = File.new("#{$OPT_OUTPUT}.R", "w+")
else
  ios['output']   = $stdout
  ios['r_script'] = $stdout
end
220
+
221
+
222
+ # start
223
+
224
+ # ontology
225
+ ontology = {}
226
+ aspects.each {|aspect|
227
+ ontology[aspect] = Bio::GO::Ontology.new(ios[aspect].read)
228
+ }
229
+
230
+
231
+ # GO slim
232
+ goslim = Bio::GO::Ontology.new(ios['slim'].read)
233
+
234
+ # assign a aspect to terms in the GO slim.
235
+ slim_ids = Hash.new([])
236
+ goslim.to_list.map {|ent| ent.node }.flatten.uniq.each {|goid|
237
+ rootids.each {|aspect, rootid|
238
+ begin
239
+ a,b = ontology[aspect].bfs_shortest_path(rootid, goid)
240
+ slim_ids[aspect] << goid
241
+ rescue NameError
242
+ $stderr.puts "Error: (#{rootid}, #{goid})"
243
+ end
244
+ }
245
+ }
246
+
247
+
248
+
249
+
250
+ # gene-associations
251
+
252
# GO ids to classify: taken from a gene-association file (-g) or from a
# plain list with one "GO:nnnnnnn" id at the start of each line (-l).
ga_ids = []
if $OPT_GA
  ga = Bio::GO::GeneAssociation.parser(ios['ga'].read)
  ga_ids = ga.map {|ent| ent.goid }
elsif $OPT_GALIST
  # only the seven digits are kept; lines that do not match are ignored
  while (line = ios['list'].gets)
    ga_ids << $1 if /^GO:(\d{7})/ =~ line
  end
else
  puts "Error: -l or -g options"
  exit
end
268
+
269
+
270
+ # count number
271
+
272
# per-aspect result of slim2: { term => { 'GOslim' => [...], 'GO' => [...] } }
count = Hash.new(0)

aspects.each do |aspect|
  slim2(ontology[aspect], slim_ids[aspect], count, ga_ids, aspect)
end


# output

# With -r and -o a per-term count table is written (the input of the R
# script); otherwise one line per GO id / GO slim term pair.
r_mode = $OPT_R_SCRIPT && $OPT_OUTPUT

header = if r_mode
           ['aspect', 'count', 'GO Term']
         else
           ['aspect', 'GO ID', 'GOslim Term', 'GOslim ID']
         end
tmp = [header.join("\t")]

['component', 'function', 'process'].each do |aspect|
  # terms with the most hits first
  ranked = count[aspect].sort {|a, b| b[1]['GO'].size <=> a[1]['GO'].size }

  ranked.each do |term, value|
    next if term == ""

    if r_mode
      tmp << [aspect, value['GO'].size, term].join("\t")
    else
      slim_column = value['GOslim'].map {|e| "GO:#{e}" }.join(' ')
      value['GO'].each do |goid|
        tmp << [aspect, "GO:#{goid}", term, slim_column].join("\t")
      end
    end
  end
end
ios['output'].puts tmp.join("\n")


ios['r_script'].puts slim2r($OPT_OUTPUT) if r_mode
309
+
310
+
311
+ #