bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # color_scheme_na.rb - A Bio::ColorScheme demo script for Nucleic Acids
4
+ # sequences.
5
+ #
6
+ # Usage:
7
+ #
8
+ # % ruby color_scheme_na.rb > cs-seq-fna.html
9
+ #
10
+ # % cat seq.fna
11
+ # >DNA_sequence
12
+ # acgtgtgtcatgctagtcgatcgtactagtcgtagctagtca
13
+ # % ruby color_scheme_na.rb seq.fna > colored-seq-fna.html
14
+ #
15
+ #
16
+ # Copyright (C) 2005 Mitsuteru C. Nakao <n@bioruby.org>
17
+ #
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2 of the License, or
21
+ # (at your option) any later version.
22
+ #
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+ #
28
+ # $Id: color_scheme_na.rb,v 1.1 2005/10/31 07:39:13 nakao Exp $
29
+ #
30
+
31
+ require 'bio'
32
+
33
+
34
# Line-folding helper for the HTML sequence view.
#
# i::     running count of residues emitted so far
# width:: number of residues per displayed line (default 80)
#
# Returns an HTML line break whenever +i+ is a multiple of +width+,
# and an empty string otherwise.
def br(i, width = 80)
  (i % width).zero? ? "<br\n>" : ""
end
39
+
40
+
41
# Builds the HTML paragraph for one sequence.
#
# seq:: sequence to render; it is walked byte by byte
# cs::  color lookup, indexed with the one-character string of each residue
#
# Every residue is wrapped in a <span> whose background is the looked-up
# color, and br() is consulted after each residue for line folding.
# Returns the complete <p>...</p> string.
def display(seq, cs)
  closing = '</span>'
  count   = 0
  cells   = []

  seq.each_byte do |byte|
    residue = byte.chr
    opening = %Q(<span style="background:\##{cs[residue]};">)
    count  += 1
    cells << (opening + residue + closing + br(count))
  end

  '<p style="font-family: monospace">' + cells.join + '</p>'
end
54
+
55
+
56
# Renders both sequences with the color scheme named +scheme+.
#
# scheme:: name of a constant under Bio::ColorScheme (e.g. 'Zappo')
# naseq::  first sequence handed to display()
# aaseq::  second sequence handed to display()
#
# Returns an Array holding, in order: an opening <div>, an <h3> containing
# the scheme's string form, the concatenated HTML of both sequences, and the
# closing </div>.
def display_scheme(scheme, naseq, aaseq)
  # Resolve the scheme by constant lookup.  Unlike eval on an interpolated
  # string, const_get cannot execute arbitrary code if the name is ever
  # taken from outside the script.
  cs = Bio::ColorScheme.const_get(scheme)

  html = [naseq, aaseq].map { |seq| display(seq, cs) }.join

  ['<div>', "<h3>#{cs}</h3>", html, '</div>']
end
65
+
66
+
67
+
68
# Pick the nucleotide sequence: the FASTA file named as the first command
# line argument, or a randomized 100-base demo sequence when none is given.
if fna = ARGV.shift
  # File.read closes the file itself; File.open(...).read left it open.
  naseq = Bio::FastaFormat.new(File.read(fna)).naseq
else
  naseq = Bio::Sequence::NA.new('acgtu' * 20).randomize
end

# The amino acid view is the translation of whichever sequence was chosen.
aaseq = naseq.translate

title = 'Bio::ColorScheme for DNA sequences'

# The title belongs in the document <head>; <header> is a body element.
doc = ['<html>',
       '<head>', '<title>', title, '</title>', '</head>',
       '<body>', '<h1>', title, '</h1>']

# "Simple colors" section: the nucleotide scheme on the nucleic acid
# sequence, then the simple amino acid schemes on the translation.  The
# section is closed once, after all three schemes, so that every <div>
# opened here has exactly one matching </div>.
doc << ['<div>', '<h2>', 'Simple colors', '</h2>']
['Nucleotide'].each do |scheme|
  doc << display_scheme(scheme, naseq, "")
end
['Zappo', 'Taylor'].each do |scheme|
  doc << display_scheme(scheme, "", aaseq)
end
doc << ['</div>']

# "Score colors" section: score-based schemes on the translation.
doc << ['<div>', '<h2>', 'Score colors', '</h2>']
['Buried', 'Helix', 'Hydropathy', 'Strand', 'Turn'].each do |score|
  doc << display_scheme(score, "", aaseq)
end
doc << ['</div>']

# puts flattens the nested arrays and writes one element per line.
puts doc + ['</body>', '</html>']
data/sample/dbget ADDED
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # dbget - DBGET client
4
+ #
5
+ # Interface to GenomeNet DBGET system - http://www.genome.jp/dbget/
6
+ #
7
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
8
+ #
9
+ # This program is free software; you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with this program; if not, write to the Free Software
21
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
+ #
23
+ # $Id: dbget,v 1.7 2004/08/24 00:09:24 k Exp $
24
+ #
25
+
26
+ require "bio/io/dbget"
27
+
28
# The DBGET command is the name this script was invoked under
# (e.g. $PATH/bget db entry).  When it was invoked as plain "dbget",
# the command is the first argument instead (e.g. $PATH/dbget bget db entry).
command = File.basename($PROGRAM_NAME)
command = ARGV.shift if command == "dbget"

# All remaining arguments form the query string; print the result.
print Bio::DBGET.dbget(command, ARGV.join(" "))
37
+
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # fasta2tab.rb - convert FASTA (-m 6) output into tab delimited data for MySQL
4
+ #
5
+ # Usage:
6
+ #
7
+ # % fasta2tab.rb FASTA-output-file[s] > fasta_results.tab
8
+ # % mysql < fasta_results.sql (use sample at the end of this file)
9
+ #
10
+ # Format accepted:
11
+ #
12
+ # % fasta3[3][_t] -Q -H -m 6 query.f target.f ktup > FASTA-output-file
13
+ #
14
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
15
+ #
16
+ # This program is free software; you can redistribute it and/or modify
17
+ # it under the terms of the GNU General Public License as published by
18
+ # the Free Software Foundation; either version 2 of the License, or
19
+ # (at your option) any later version.
20
+ #
21
+ # This program is distributed in the hope that it will be useful,
22
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
23
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24
+ # GNU General Public License for more details.
25
+ #
26
+ # $Id: fasta2tab.rb,v 0.1 2001/06/21 08:21:58 katayama Exp $
27
+ #
28
+
29
# Reads FASTA (-m 6) output line by line and prints one tab-delimited row
# per hit: input file name, query length, target name, target length, and
# eleven score columns.

# Length captured from the most recent query line; nil until one is seen.
q_len = nil

while line = gets

  # query
  q_len = $1 if line =~ /^\S+: (\d+) aa$/

  # each hit
  next unless line =~ /^>>([^>]\S+).*\((\d+) aa\)$/
  target = $1
  t_len  = $2

  # The two lines following a hit header carry the scores.  A report that
  # ends right after a header used to raise NoMethodError on nil; report it
  # and stop instead.
  score_line = gets
  align_line = gets
  if score_line.nil? || align_line.nil?
    warn "#{$FILENAME}: truncated record for hit #{target}"
    break
  end

  # _ marks fields that are skipped
  _, _, initn, _, init1, _, opt, _, zscore, _, bits, _, evalue =
    score_line.split(/\s+/)
  _, _, sw, ident, _, ugident, _, _, overlap, _, _, lap =
    align_line.split(/\s+/)

  # pick up values, keeping only digits and the characters . : e -
  # (to_s turns a field missing from a short line into an empty column
  # instead of raising on nil)
  values = [
    initn, init1, opt, zscore, bits, evalue,
    sw, ident, ugident, overlap, lap
  ].map { |v| v.to_s.tr('^0-9.:e\-', '') }

  # query-hit pair, then the values
  print "#{$FILENAME}\t#{q_len}\t#{target}\t#{t_len}"
  values.each { |v| print "\t#{v}" }
  print "\n"

end
75
+
76
+ =begin MySQL fasta_results.sql sample
77
+
78
+ CREATE DATABASE IF NOT EXISTS db_name;
79
+ CREATE TABLE IF NOT EXISTS db_name.table_name (
80
+ query varchar(25) not NULL,
81
+ q_len integer unsigned default 0,
82
+ target varchar(25) not NULL,
83
+ t_len integer unsigned default 0,
84
+ initn integer unsigned default 0,
85
+ init1 integer unsigned default 0,
86
+ opt integer unsigned default 0,
87
+ zscore float default 0.0,
88
+ bits float default 0.0,
89
+ evalue float default 0.0,
90
+ sw integer unsigned default 0,
91
+ ident float default 0.0,
92
+ ugident float default 0.0,
93
+ overlap integer unsigned default 0,
94
+ lap_at varchar(25) default NULL
95
+ );
96
+ LOAD DATA LOCAL INFILE 'fasta_results.tab' INTO TABLE db_name.table_name;
97
+
98
+ =end
99
+
data/sample/fsplit.rb ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # fsplit.rb - split FASTA file by each n entries
4
+ #
5
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # $Id: fsplit.rb,v 0.1 2001/06/21 08:22:29 katayama Exp $
18
+ #
19
+
20
# A usable invocation has exactly two arguments, the first being a positive
# number of entries per output file.  Zero would divide by zero below and a
# negative count yields meaningless file numbers, so both are treated as a
# usage error.
count = ARGV.length == 2 ? ARGV.shift.to_i : 0

if count < 1

  print <<-USAGE
fsplit.rb - split FASTA file by each n entries

Usage :

% ./fsplit.rb 2000 seq.f

This will produce seq.f.1, seq.f.2, ... with containing 2000 sequences
in each file.

USAGE
  exit 1

end

entries = 0    # number of '>' header lines seen so far
out     = nil  # output file currently being written

begin
  while line = gets
    if line =~ /^>/
      # Every +count+ entries, switch to the next numbered output file.
      # The previous file is closed first so that a large input does not
      # keep one descriptor open per chunk.
      if entries % count == 0
        out.close if out
        out = File.new("#{$FILENAME}.#{entries / count + 1}", "w+")
      end
      entries += 1
    end

    # Lines ahead of the first header belong to no entry and are skipped
    # (there is no output file yet).
    out.print line if out
  end
ensure
  out.close if out
end
51
+
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # gb2fasta.rb - convert GenBank entry into FASTA format (nuc)
4
+ #
5
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
6
+ # Copyright (C) 2002 Yoshinori K. Okuji <o@bioruby.org>
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # $Id: gb2fasta.rb,v 0.5 2002/07/23 04:51:24 k Exp $
19
+ #
20
+
21
+ require 'bio/io/flatfile'
22
+ require 'bio/db/genbank'
23
+
24
include Bio

# Read GenBank entries from the files named on the command line (or from
# standard input) and print each one as FASTA folded at 70 columns.
flatfile = FlatFile.new(GenBank, ARGF)

while entry = flatfile.next_entry
  header = "gb:#{entry.entry_id} #{entry.definition}"
  print entry.seq.to_fasta(header, 70)
end
31
+
data/sample/gb2tab.rb ADDED
@@ -0,0 +1,325 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # gb2tab.rb - convert GenBank into tab delimited data for MySQL
4
+ #
5
+ # Usage:
6
+ #
7
+ # % gb2tab.rb gb*.seq
8
+ #
9
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
10
+ #
11
+ # This program is free software; you can redistribute it and/or modify
12
+ # it under the terms of the GNU General Public License as published by
13
+ # the Free Software Foundation; either version 2 of the License, or
14
+ # (at your option) any later version.
15
+ #
16
+ # This program is distributed in the hope that it will be useful,
17
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ # GNU General Public License for more details.
20
+ #
21
+ # $Id: gb2tab.rb,v 0.11 2002/04/22 09:10:10 k Exp $
22
+ #
23
+
24
+ require 'bio'
25
+
26
# Converts each GenBank file named on the command line into four
# tab-delimited files next to it: <file>.ent.tab, <file>.ft.tab,
# <file>.ref.tab and <file>.seq.tab.  Start and end times go to stderr.

$stderr.puts Time.now

# Largest number of bases written into a single row of the .seq.tab file.
maxlen = 16 * 10 ** 6

ARGV.each do |gbkfile|

  handles = []

  begin
    # File.open is used instead of Kernel#open because Kernel#open runs a
    # subprocess when the name starts with "|", and these names come
    # straight from the command line.
    handles << (gbk = File.open(gbkfile))
    handles << (ent = File.open("#{gbkfile}.ent.tab", "w"))
    handles << (ft  = File.open("#{gbkfile}.ft.tab", "w"))
    handles << (ref = File.open("#{gbkfile}.ref.tab", "w"))
    handles << (seq = File.open("#{gbkfile}.seq.tab", "w"))

    while entry = gbk.gets(Bio::GenBank::DELIMITER)

      gb = Bio::GenBank.new(entry)

      ### MAIN BODY: one row per entry

      ary = [
        gb.entry_id,
        gb.nalen,
        gb.strand,
        gb.natype,
        gb.circular,
        gb.division,
        gb.date,
        gb.definition,
        gb.accession,
        gb.versions.inspect,
        gb.keywords.inspect,
        gb.segment.inspect,
        gb.common_name,
        gb.organism,
        gb.taxonomy,
        gb.comment,
        gb.basecount.inspect,
        gb.origin,
      ]

      ent.puts ary.join("\t")

      ### FEATURES: one row per qualifier, or a single row with empty
      ### qualifier/value columns when a feature has no qualifiers

      num = 0

      gb.features.each do |f|
        num += 1

        span_min, span_max = f.locations.span

        common = [
          gb.entry_id,
          num,
          f.feature,
          f.position,
          span_min,
          span_max,
        ]

        if f.qualifiers.empty?
          ft.puts((common + ['', '']).join("\t"))
        else
          f.each do |q|
            ft.puts((common + [q.qualifier, q.value]).join("\t"))
          end
        end

      end

      ### REFERENCE: one row per reference

      num = 0

      gb.references.each do |r|
        num += 1

        ary = [
          gb.entry_id,
          num,
          r.authors.inspect,
          r.title,
          r.journal,
          r.medline,
          r.pubmed,
        ]

        ref.puts ary.join("\t")
      end

      ### SEQUENCE: the sequence cut into numbered rows of at most
      ### maxlen bases

      num = 0

      0.step(gb.nalen, maxlen) do |i|
        num += 1

        ary = [
          gb.entry_id,
          num,
          gb.naseq[i, maxlen]
        ]

        seq.puts ary.join("\t")

      end

    end

  ensure
    # Close whatever was opened, even when opening or parsing raised.
    handles.each { |io| io.close }
  end

end

$stderr.puts Time.now
154
+
155
+ =begin
156
+
157
+ Example usage in zsh:
158
+
159
+ % gb2tab.rb *.seq
160
+ % for i in *.seq
161
+ > do
162
+ > base=`basename $i .seq`
163
+ > ruby -pe "gsub(/%HOGE%/,'$base')" gb2tab.sql | mysql
164
+ > done
165
+
166
+ gb2tab.sql:
167
+
168
+ CREATE DATABASE IF NOT EXISTS genbank;
169
+ USE genbank;
170
+
171
+ CREATE TABLE IF NOT EXISTS %HOGE% (
172
+ id varchar(16) NOT NULL PRIMARY KEY,
173
+ nalen integer,
174
+ strand varchar(5),
175
+ natype varchar(5),
176
+ circular varchar(10),
177
+ division varchar(5),
178
+ date varchar(12),
179
+ definition varchar(255),
180
+ accession varchar(30),
181
+ versions varchar(30),
182
+ keywords varchar(255),
183
+ segment varchar(255),
184
+ source varchar(255),
185
+ organism varchar(255),
186
+ taxonomy varchar(255),
187
+ comment text,
188
+ basecount varchar(255),
189
+ origin varchar(255),
190
+ KEY (nalen),
191
+ KEY (division),
192
+ KEY (accession),
193
+ KEY (organism),
194
+ KEY (taxonomy)
195
+ );
196
+ LOAD DATA LOCAL INFILE '%HOGE%.seq.ent.tab' INTO TABLE %HOGE%;
197
+
198
+ CREATE TABLE IF NOT EXISTS %HOGE%ft (
199
+ id varchar(16) NOT NULL,
200
+ num integer,
201
+ feature varchar(30),
202
+ position text,
203
+ span_min integer,
204
+ span_max integer,
205
+ qualifier varchar(30),
206
+ value text,
207
+ KEY (id),
208
+ KEY (num),
209
+ KEY (feature),
210
+ KEY (span_min),
211
+ KEY (span_max),
212
+ KEY (qualifier)
213
+ );
214
+ LOAD DATA LOCAL INFILE '%HOGE%.seq.ft.tab' INTO TABLE %HOGE%ft;
215
+
216
+ CREATE TABLE IF NOT EXISTS %HOGE%ref (
217
+ id varchar(16) NOT NULL,
218
+ num integer,
219
+ authors text,
220
+ title text,
221
+ journal text,
222
+ medline varchar(255),
223
+ pubmed varchar(255),
224
+ KEY (id),
225
+ KEY (medline),
226
+ KEY (pubmed)
227
+ );
228
+ LOAD DATA LOCAL INFILE '%HOGE%.seq.ref.tab' INTO TABLE %HOGE%ref;
229
+
230
+ CREATE TABLE IF NOT EXISTS %HOGE%seq (
231
+ id varchar(16) NOT NULL,
232
+ num integer,
233
+ naseq mediumtext,
234
+ KEY (id)
235
+ );
236
+ LOAD DATA LOCAL INFILE '%HOGE%.seq.seq.tab' INTO TABLE %HOGE%seq;
237
+
238
+
239
+ gbmerge.sql sample:
240
+
241
+ CREATE TABLE IF NOT EXISTS ent (
242
+ id varchar(16) NOT NULL PRIMARY KEY,
243
+ nalen integer,
244
+ strand varchar(5),
245
+ natype varchar(5),
246
+ circular varchar(10),
247
+ division varchar(5),
248
+ date varchar(12),
249
+ definition varchar(255),
250
+ accession varchar(30),
251
+ versions varchar(30),
252
+ keywords varchar(255),
253
+ segment varchar(255),
254
+ source varchar(255),
255
+ organism varchar(255),
256
+ taxonomy varchar(255),
257
+ comment text,
258
+ basecount varchar(255),
259
+ origin varchar(255),
260
+ KEY (nalen),
261
+ KEY (division),
262
+ KEY (accession),
263
+ KEY (organism),
264
+ KEY (taxonomy)
265
+ ) ENGINE=MERGE UNION=(
266
+ gbbct1,
267
+ gbbct2,
268
+ ..., # list all of your entry tables here
269
+ gbvrt
270
+ );
271
+
272
+ CREATE TABLE IF NOT EXISTS ft (
273
+ id varchar(16) NOT NULL,
274
+ num integer,
275
+ feature varchar(30),
276
+ position text,
277
+ span_min integer,
278
+ span_max integer,
279
+ qualifier varchar(30),
280
+ value text,
281
+ KEY (id),
282
+ KEY (num),
283
+ KEY (feature),
284
+ KEY (span_min),
285
+ KEY (span_max),
286
+ KEY (qualifier)
287
+ ) ENGINE=MERGE UNION=(
288
+ gbbct1ft,
289
+ gbbct2ft,
290
+ ..., # list all of your ft tables here
291
+ gbvrtft
292
+ );
293
+
294
+ CREATE TABLE IF NOT EXISTS ref (
295
+ id varchar(16) NOT NULL,
296
+ num integer,
297
+ authors text,
298
+ title text,
299
+ journal text,
300
+ medline varchar(255),
301
+ pubmed varchar(255),
302
+ KEY (id),
303
+ KEY (medline),
304
+ KEY (pubmed)
305
+ ) ENGINE=MERGE UNION=(
306
+ gbbct1ref,
307
+ gbbct2ref,
308
+ ..., # list all of your ref tables here
309
+ gbvrtref
310
+ );
311
+
312
+ CREATE TABLE IF NOT EXISTS seq (
313
+ id varchar(16) NOT NULL,
314
+ num integer,
315
+ naseq mediumtext,
316
+ KEY (id)
317
+ ) ENGINE=MERGE UNION=(
318
+ gbbct1seq,
319
+ gbbct2seq,
320
+ ..., # list all of your seq tables here
321
+ gbvrtseq
322
+ );
323
+
324
+ =end
325
+