bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# color_scheme_na.rb - A Bio::ColorScheme demo script for Nucleic Acids
|
|
4
|
+
# sequences.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
#
|
|
8
|
+
# % ruby color_scheme_na.rb > cs-seq-fna.html
|
|
9
|
+
#
|
|
10
|
+
# % cat seq.fna
|
|
11
|
+
# >DNA_sequence
|
|
12
|
+
# acgtgtgtcatgctagtcgatcgtactagtcgtagctagtca
|
|
13
|
+
# % ruby color_scheme_na.rb seq.fna > colored-seq-fna.html
|
|
14
|
+
#
|
|
15
|
+
#
|
|
16
|
+
# Copyright (C) 2005 Mitsuteru C. Nakao <n@bioruby.org>
|
|
17
|
+
#
|
|
18
|
+
# This program is free software; you can redistribute it and/or modify
|
|
19
|
+
# it under the terms of the GNU General Public License as published by
|
|
20
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
21
|
+
# (at your option) any later version.
|
|
22
|
+
#
|
|
23
|
+
# This program is distributed in the hope that it will be useful,
|
|
24
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
25
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
26
|
+
# GNU General Public License for more details.
|
|
27
|
+
#
|
|
28
|
+
# $Id: color_scheme_na.rb,v 1.1 2005/10/31 07:39:13 nakao Exp $
|
|
29
|
+
#
|
|
30
|
+
|
|
31
|
+
require 'bio'
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Line-folding helper for the HTML output.
#
# i     - 1-based count of residues emitted so far.
# width - number of residues per output row (default 80).
#
# Returns a "<br>" tag (written with an embedded newline so the generated
# HTML source wraps as well) when i is a multiple of width, otherwise an
# empty string.
def br(i, width = 80)
  (i % width).zero? ? "<br\n>" : ""
end
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Renders one sequence as a monospace HTML paragraph in which every
# residue is wrapped in a <span> whose background colour comes from cs.
#
# seq - sequence to render; iterated byte by byte.
# cs  - colour lookup indexed by a one-character string (cs[char]); the
#       value is interpolated after '#' in the CSS background property.
#
# Returns the HTML paragraph as a String.
def display(seq, cs)
  # Build into one mutable buffer: the previous `html += ...` copied the
  # whole accumulated string for every residue (quadratic in the length).
  html = String.new('<p style="font-family: monospace">')
  position = 0
  seq.each_byte do |byte|
    residue = byte.chr
    position += 1
    html << %Q(<span style="background:\##{cs[residue]};">) << residue << '</span>'
    # br yields a line break after every 80th residue, "" otherwise.
    html << br(position)
  end
  html << '</p>'
end
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Renders both sequences with one colour scheme.
#
# scheme - name of a constant defined under Bio::ColorScheme
#          (the script passes names such as 'Nucleotide' or 'Zappo').
# naseq  - first sequence to render (callers pass "" to skip it).
# aaseq  - second sequence to render (callers pass "" to skip it).
#
# Returns an Array of HTML fragments: an opening <div>, an <h3> heading
# containing the scheme object's string form, the rendered paragraphs,
# and the closing </div>.
def display_scheme(scheme, naseq, aaseq)
  # Look the scheme up as a constant instead of eval-ing an interpolated
  # string, so the name can never be executed as arbitrary Ruby code.
  cs = Bio::ColorScheme.const_get(scheme)
  html = [naseq, aaseq].map { |seq| display(seq, cs) }.join
  ['<div>', "<h3>#{cs}</h3>", html, '</div>']
end
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Choose the nucleotide sequence: read it from the FASTA file named on the
# command line, or fall back to a shuffled demo sequence.
if fna = ARGV.shift
  # File.read closes the file itself; File.open(...).read left the handle
  # open until garbage collection.
  naseq = Bio::FastaFormat.new(File.read(fna)).naseq
else
  naseq = Bio::Sequence::NA.new('acgtu' * 20).randomize
end
aaseq = naseq.translate

title = 'Bio::ColorScheme for DNA sequences'
# The document metadata element is <head>; <header> is not valid there.
doc = ['<html>',
       '<head>', '<title>', title, '</title>', '</head>',
       '<body>', '<h1>', title, '</h1>']

# "Simple colors" section: the nucleotide scheme on the NA sequence, then
# the residue-class schemes on its translation. The section is closed
# exactly once (a second, unmatched </div> used to be emitted here).
doc << ['<div>', '<h2>', 'Simple colors', '</h2>']
['Nucleotide'].each do |scheme|
  doc << display_scheme(scheme, naseq, "")
end
['Zappo', 'Taylor'].each do |scheme|
  doc << display_scheme(scheme, "", aaseq)
end
doc << ['</div>']

# "Score colors" section: score-based schemes on the translation.
doc << ['<div>', '<h2>', 'Score colors', '</h2>']
['Buried', 'Helix', 'Hydropathy', 'Strand', 'Turn'].each do |score|
  doc << display_scheme(score, "", aaseq)
end
doc << ['</div>']

# puts flattens the nested arrays and prints one fragment per line.
puts doc + ['</body>', '</html>']
|
data/sample/dbget
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# dbget - DBGET client
|
|
4
|
+
#
|
|
5
|
+
# Interface to GenomeNet DBGET system - http://www.genome.jp/dbget/
|
|
6
|
+
#
|
|
7
|
+
# Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
|
|
8
|
+
#
|
|
9
|
+
# This program is free software; you can redistribute it and/or modify
|
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
|
11
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
12
|
+
# (at your option) any later version.
|
|
13
|
+
#
|
|
14
|
+
# This program is distributed in the hope that it will be useful,
|
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17
|
+
# GNU General Public License for more details.
|
|
18
|
+
#
|
|
19
|
+
# You should have received a copy of the GNU General Public License
|
|
20
|
+
# along with this program; if not, write to the Free Software
|
|
21
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
22
|
+
#
|
|
23
|
+
# $Id: dbget,v 1.7 2004/08/24 00:09:24 k Exp $
|
|
24
|
+
#
|
|
25
|
+
|
|
26
|
+
require "bio/io/dbget"
|
|
27
|
+
|
|
28
|
+
# The DBGET command is taken from the name this script was invoked as
# (e.g. a link named "bget"); when invoked as plain "dbget", the command
# is the first argument instead (e.g. "dbget bget db entry").
command = File.basename($0)
if command == "dbget"
  command = ARGV.shift
end

# Everything left on the command line forms the query string.
query = ARGV.join(" ")

# Run the query and print the result.
print Bio::DBGET.dbget(command, query)
|
|
37
|
+
|
data/sample/fasta2tab.rb
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# fasta2tab.rb - convert FASTA (-m 6) output into tab delimited data for MySQL
|
|
4
|
+
#
|
|
5
|
+
# Usage:
|
|
6
|
+
#
|
|
7
|
+
# % fasta2tab.rb FASTA-output-file[s] > fasta_results.tab
|
|
8
|
+
# % mysql < fasta_results.sql (use sample at the end of this file)
|
|
9
|
+
#
|
|
10
|
+
# Format accepted:
|
|
11
|
+
#
|
|
12
|
+
# % fasta3[3][_t] -Q -H -m 6 query.f target.f ktup > FASTA-output-file
|
|
13
|
+
#
|
|
14
|
+
# Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
|
|
15
|
+
#
|
|
16
|
+
# This program is free software; you can redistribute it and/or modify
|
|
17
|
+
# it under the terms of the GNU General Public License as published by
|
|
18
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
19
|
+
# (at your option) any later version.
|
|
20
|
+
#
|
|
21
|
+
# This program is distributed in the hope that it will be useful,
|
|
22
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
23
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
24
|
+
# GNU General Public License for more details.
|
|
25
|
+
#
|
|
26
|
+
# $Id: fasta2tab.rb,v 0.1 2001/06/21 08:21:58 katayama Exp $
|
|
27
|
+
#
|
|
28
|
+
|
|
29
|
+
# Scan FASTA (-m 6) output from the files named on the command line (or
# stdin) and print one tab-delimited row per query/hit pair:
#
#   file name, query length, target id, target length, initn, init1, opt,
#   zscore, bits, evalue, sw, ident, ugident, overlap, lap
q_len = nil

while line = gets

  # query: remember the query length for the hits that follow
  q_len = $1 if line =~ /^\S+: (\d+) aa$/

  # each hit starts with a ">>target ... (NNN aa)" header
  next unless line =~ /^>>([^>]\S+).*\((\d+) aa\)$/
  target = $1
  t_len  = $2

  # The two lines after the header carry the statistics. Stop cleanly
  # if the input ends here instead of failing on nil.split.
  score_line = gets
  align_line = gets
  if score_line.nil? || align_line.nil?
    warn "#{$FILENAME}: truncated record for #{target}"
    break
  end

  # _ marks label words that are skipped
  _, _, initn, _, init1, _, opt, _, zscore, _, bits, _, evalue =
    score_line.split(/\s+/)
  _, _, sw, ident, _, ugident, _, _, overlap, _, _, lap =
    align_line.split(/\s+/)

  # Keep only digits, '.', ':', 'e' and '-' in each value. to_s turns a
  # missing field into an empty column rather than raising on nil.
  values = [
    initn, init1, opt, zscore, bits, evalue,
    sw, ident, ugident, overlap, lap
  ].map { |v| v.to_s.tr('^0-9.:e\-', '') }

  # query-hit pair followed by the picked values
  puts(([$FILENAME, q_len, target, t_len] + values).join("\t"))

end
|
|
75
|
+
|
|
76
|
+
=begin MySQL fasta_results.sql sample
|
|
77
|
+
|
|
78
|
+
CREATE DATABASE IF NOT EXISTS db_name;
|
|
79
|
+
CREATE TABLE IF NOT EXISTS db_name.table_name (
|
|
80
|
+
query varchar(25) not NULL,
|
|
81
|
+
q_len integer unsigned default 0,
|
|
82
|
+
target varchar(25) not NULL,
|
|
83
|
+
t_len integer unsigned default 0,
|
|
84
|
+
initn integer unsigned default 0,
|
|
85
|
+
init1 integer unsigned default 0,
|
|
86
|
+
opt integer unsigned default 0,
|
|
87
|
+
zscore float default 0.0,
|
|
88
|
+
bits float default 0.0,
|
|
89
|
+
evalue float default 0.0,
|
|
90
|
+
sw integer unsigned default 0,
|
|
91
|
+
ident float default 0.0,
|
|
92
|
+
ugident float default 0.0,
|
|
93
|
+
overlap integer unsigned default 0,
|
|
94
|
+
lap_at varchar(25) default NULL
|
|
95
|
+
);
|
|
96
|
+
LOAD DATA LOCAL INFILE 'fasta_results.tab' INTO TABLE db_name.table_name;
|
|
97
|
+
|
|
98
|
+
=end
|
|
99
|
+
|
data/sample/fsplit.rb
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# fsplit.rb - split FASTA file by each n entries
|
|
4
|
+
#
|
|
5
|
+
# Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
|
|
6
|
+
#
|
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
10
|
+
# (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# $Id: fsplit.rb,v 0.1 2001/06/21 08:22:29 katayama Exp $
|
|
18
|
+
#
|
|
19
|
+
|
|
20
|
+
# Exactly two arguments are required: the number of entries per output
# file and the FASTA file to split.
if ARGV.length != 2

  print <<-USAGE
fsplit.rb - split FASTA file by each n entries

Usage :

% ./fsplit.rb 2000 seq.f

This will produce seq.f.1, seq.f.2, ... with containing 2000 sequences
in each file.

  USAGE
  exit 1

end

count = ARGV.shift.to_i

# A zero or non-numeric count would raise ZeroDivisionError below.
if count <= 0
  $stderr.puts "fsplit.rb: the number of entries per file must be a positive integer"
  exit 1
end

i   = -1   # zero-based index of the current entry
out = nil  # output file currently being written

begin
  while line = gets
    if line =~ /^>/
      i += 1
      if i % count == 0
        # Close the finished chunk before starting the next one;
        # previously every chunk file stayed open until exit.
        out.close if out
        out = File.new("#{$FILENAME}.#{i / count + 1}", "w+")
      end
    end
    # Lines before the first '>' header belong to no entry and are skipped.
    out.print line if out
  end
ensure
  out.close if out
end
|
|
51
|
+
|
data/sample/gb2fasta.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# gb2fasta.rb - convert GenBank entry into FASTA format (nuc)
|
|
4
|
+
#
|
|
5
|
+
# Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
|
|
6
|
+
# Copyright (C) 2002 Yoshinori K. Okuji <o@bioruby.org>
|
|
7
|
+
#
|
|
8
|
+
# This program is free software; you can redistribute it and/or modify
|
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
11
|
+
# (at your option) any later version.
|
|
12
|
+
#
|
|
13
|
+
# This program is distributed in the hope that it will be useful,
|
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
# GNU General Public License for more details.
|
|
17
|
+
#
|
|
18
|
+
# $Id: gb2fasta.rb,v 0.5 2002/07/23 04:51:24 k Exp $
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
require 'bio/io/flatfile'
|
|
22
|
+
require 'bio/db/genbank'
|
|
23
|
+
|
|
24
|
+
include Bio

# Read GenBank entries from the files named on the command line (or
# stdin) and print each one as FASTA, folded at 70 columns, with a
# "gb:<entry id> <definition>" header line.
flatfile = FlatFile.new(GenBank, ARGF)

while (entry = flatfile.next_entry)
  sequence = entry.seq
  header   = "gb:#{entry.entry_id} #{entry.definition}"
  print sequence.to_fasta(header, 70)
end
|
|
31
|
+
|
data/sample/gb2tab.rb
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# gb2tab.rb - convert GenBank into tab delimited data for MySQL
|
|
4
|
+
#
|
|
5
|
+
# Usage:
|
|
6
|
+
#
|
|
7
|
+
# % gb2tab.rb gb*.seq
|
|
8
|
+
#
|
|
9
|
+
# Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
|
|
10
|
+
#
|
|
11
|
+
# This program is free software; you can redistribute it and/or modify
|
|
12
|
+
# it under the terms of the GNU General Public License as published by
|
|
13
|
+
# the Free Software Foundation; either version 2 of the License, or
|
|
14
|
+
# (at your option) any later version.
|
|
15
|
+
#
|
|
16
|
+
# This program is distributed in the hope that it will be useful,
|
|
17
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+
# GNU General Public License for more details.
|
|
20
|
+
#
|
|
21
|
+
# $Id: gb2tab.rb,v 0.11 2002/04/22 09:10:10 k Exp $
|
|
22
|
+
#
|
|
23
|
+
|
|
24
|
+
require 'bio'
|
|
25
|
+
|
|
26
|
+
# Start and end times are written to stderr as a crude progress report.
$stderr.puts Time.now

# Largest number of bases written to a single row of the sequence table.
maxlen = 16 * 10 ** 6

# For every GenBank file on the command line write four tab-delimited
# files next to it: <file>.ent.tab (entries), <file>.ft.tab (features),
# <file>.ref.tab (references) and <file>.seq.tab (sequence chunks).
ARGV.each do |gbkfile|

  handles = []
  begin
    # File.open rather than Kernel#open: a file name starting with "|"
    # must never be run as a command.
    handles << (gbk = File.open(gbkfile))
    handles << (ent = File.open("#{gbkfile}.ent.tab", "w"))
    handles << (ft  = File.open("#{gbkfile}.ft.tab", "w"))
    handles << (ref = File.open("#{gbkfile}.ref.tab", "w"))
    handles << (seq = File.open("#{gbkfile}.seq.tab", "w"))

    while entry = gbk.gets(Bio::GenBank::DELIMITER)

      gb = Bio::GenBank.new(entry)

      ### MAIN BODY

      ary = [
        gb.entry_id,
        gb.nalen,
        gb.strand,
        gb.natype,
        gb.circular,
        gb.division,
        gb.date,
        gb.definition,
        gb.accession,
        gb.versions.inspect,
        gb.keywords.inspect,
        gb.segment.inspect,
        gb.common_name,
        gb.organism,
        gb.taxonomy,
        gb.comment,
        gb.basecount.inspect,
        gb.origin,
      ]

      ent.puts ary.join("\t")

      ### FEATURES

      num = 0

      gb.features.each do |f|
        num += 1

        span_min, span_max = f.locations.span

        # Columns shared by every row of this feature.
        common = [
          gb.entry_id,
          num,
          f.feature,
          f.position,
          span_min,
          span_max,
        ]

        if f.qualifiers.empty?
          # One row with empty qualifier/value columns.
          ft.puts((common + ['', '']).join("\t"))
        else
          # One row per qualifier.
          f.each do |q|
            ft.puts((common + [q.qualifier, q.value]).join("\t"))
          end
        end
      end

      ### REFERENCE

      num = 0

      gb.references.each do |r|
        num += 1

        ary = [
          gb.entry_id,
          num,
          r.authors.inspect,
          r.title,
          r.journal,
          r.medline,
          r.pubmed,
        ]

        ref.puts ary.join("\t")
      end

      ### SEQUENCE

      num = 0

      0.step(gb.nalen, maxlen) do |i|
        # When the length is an exact multiple of maxlen the last step
        # lands on the end of the sequence and would add an empty row.
        # The first row is always written, even for an empty sequence.
        break if i > 0 && i >= gb.nalen

        num += 1

        ary = [
          gb.entry_id,
          num,
          gb.naseq[i, maxlen]
        ]

        seq.puts ary.join("\t")
      end

    end

  ensure
    # Close whatever was opened, even if parsing raised part-way through.
    handles.each { |io| io.close }
  end

end

$stderr.puts Time.now
|
|
154
|
+
|
|
155
|
+
=begin
|
|
156
|
+
|
|
157
|
+
Example usage in zsh:
|
|
158
|
+
|
|
159
|
+
% gb2tab.rb *.seq
|
|
160
|
+
% for i in *.seq
|
|
161
|
+
> do
|
|
162
|
+
> base=`basename $i .seq`
|
|
163
|
+
> ruby -pe "gsub(/%HOGE%/,'$base')" gb2tab.sql | mysql
|
|
164
|
+
> done
|
|
165
|
+
|
|
166
|
+
gb2tab.sql:
|
|
167
|
+
|
|
168
|
+
CREATE DATABASE IF NOT EXISTS genbank;
|
|
169
|
+
USE genbank;
|
|
170
|
+
|
|
171
|
+
CREATE TABLE IF NOT EXISTS %HOGE% (
|
|
172
|
+
id varchar(16) NOT NULL PRIMARY KEY,
|
|
173
|
+
nalen integer,
|
|
174
|
+
strand varchar(5),
|
|
175
|
+
natype varchar(5),
|
|
176
|
+
circular varchar(10),
|
|
177
|
+
division varchar(5),
|
|
178
|
+
date varchar(12),
|
|
179
|
+
definition varchar(255),
|
|
180
|
+
accession varchar(30),
|
|
181
|
+
versions varchar(30),
|
|
182
|
+
keywords varchar(255),
|
|
183
|
+
segment varchar(255),
|
|
184
|
+
source varchar(255),
|
|
185
|
+
organism varchar(255),
|
|
186
|
+
taxonomy varchar(255),
|
|
187
|
+
comment text,
|
|
188
|
+
basecount varchar(255),
|
|
189
|
+
origin varchar(255),
|
|
190
|
+
KEY (nalen),
|
|
191
|
+
KEY (division),
|
|
192
|
+
KEY (accession),
|
|
193
|
+
KEY (organism),
|
|
194
|
+
KEY (taxonomy)
|
|
195
|
+
);
|
|
196
|
+
LOAD DATA LOCAL INFILE '%HOGE%.seq.ent.tab' INTO TABLE %HOGE%;
|
|
197
|
+
|
|
198
|
+
CREATE TABLE IF NOT EXISTS %HOGE%ft (
|
|
199
|
+
id varchar(16) NOT NULL,
|
|
200
|
+
num integer,
|
|
201
|
+
feature varchar(30),
|
|
202
|
+
position text,
|
|
203
|
+
span_min integer,
|
|
204
|
+
span_max integer,
|
|
205
|
+
qualifier varchar(30),
|
|
206
|
+
value text,
|
|
207
|
+
KEY (id),
|
|
208
|
+
KEY (num),
|
|
209
|
+
KEY (feature),
|
|
210
|
+
KEY (span_min),
|
|
211
|
+
KEY (span_max),
|
|
212
|
+
KEY (qualifier)
|
|
213
|
+
);
|
|
214
|
+
LOAD DATA LOCAL INFILE '%HOGE%.seq.ft.tab' INTO TABLE %HOGE%ft;
|
|
215
|
+
|
|
216
|
+
CREATE TABLE IF NOT EXISTS %HOGE%ref (
|
|
217
|
+
id varchar(16) NOT NULL,
|
|
218
|
+
num integer,
|
|
219
|
+
authors text,
|
|
220
|
+
title text,
|
|
221
|
+
journal text,
|
|
222
|
+
medline varchar(255),
|
|
223
|
+
pubmed varchar(255),
|
|
224
|
+
KEY (id),
|
|
225
|
+
KEY (medline),
|
|
226
|
+
KEY (pubmed)
|
|
227
|
+
);
|
|
228
|
+
LOAD DATA LOCAL INFILE '%HOGE%.seq.ref.tab' INTO TABLE %HOGE%ref;
|
|
229
|
+
|
|
230
|
+
CREATE TABLE IF NOT EXISTS %HOGE%seq (
|
|
231
|
+
id varchar(16) NOT NULL,
|
|
232
|
+
num integer,
|
|
233
|
+
naseq mediumtext,
|
|
234
|
+
KEY (id)
|
|
235
|
+
);
|
|
236
|
+
LOAD DATA LOCAL INFILE '%HOGE%.seq.seq.tab' INTO TABLE %HOGE%seq;
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
gbmerge.sql sample:
|
|
240
|
+
|
|
241
|
+
CREATE TABLE IF NOT EXISTS ent (
|
|
242
|
+
id varchar(16) NOT NULL PRIMARY KEY,
|
|
243
|
+
nalen integer,
|
|
244
|
+
strand varchar(5),
|
|
245
|
+
natype varchar(5),
|
|
246
|
+
circular varchar(10),
|
|
247
|
+
division varchar(5),
|
|
248
|
+
date varchar(12),
|
|
249
|
+
definition varchar(255),
|
|
250
|
+
accession varchar(30),
|
|
251
|
+
versions varchar(30),
|
|
252
|
+
keywords varchar(255),
|
|
253
|
+
segment varchar(255),
|
|
254
|
+
source varchar(255),
|
|
255
|
+
organism varchar(255),
|
|
256
|
+
taxonomy varchar(255),
|
|
257
|
+
comment text,
|
|
258
|
+
basecount varchar(255),
|
|
259
|
+
origin varchar(255),
|
|
260
|
+
KEY (nalen),
|
|
261
|
+
KEY (division),
|
|
262
|
+
KEY (accession),
|
|
263
|
+
KEY (organism),
|
|
264
|
+
KEY (taxonomy)
|
|
265
|
+
) TYPE=MERGE UNION=(
|
|
266
|
+
gbbct1,
|
|
267
|
+
gbbct2,
|
|
268
|
+
..., # list up all tables by yourself
|
|
269
|
+
gbvrt
|
|
270
|
+
);
|
|
271
|
+
|
|
272
|
+
CREATE TABLE IF NOT EXISTS ft (
|
|
273
|
+
id varchar(16) NOT NULL,
|
|
274
|
+
num integer,
|
|
275
|
+
feature varchar(30),
|
|
276
|
+
position text,
|
|
277
|
+
span_min integer,
|
|
278
|
+
span_max integer,
|
|
279
|
+
qualifier varchar(30),
|
|
280
|
+
value text,
|
|
281
|
+
KEY (id),
|
|
282
|
+
KEY (num),
|
|
283
|
+
KEY (feature),
|
|
284
|
+
KEY (span_min),
|
|
285
|
+
KEY (span_max),
|
|
286
|
+
KEY (qualifier)
|
|
287
|
+
) TYPE=MERGE UNION=(
|
|
288
|
+
gbbct1ft,
|
|
289
|
+
gbbct2ft,
|
|
290
|
+
..., # list up all ft tables by yourself
|
|
291
|
+
gbvrtft
|
|
292
|
+
);
|
|
293
|
+
|
|
294
|
+
CREATE TABLE IF NOT EXISTS ref (
|
|
295
|
+
id varchar(16) NOT NULL,
|
|
296
|
+
num integer,
|
|
297
|
+
authors text,
|
|
298
|
+
title text,
|
|
299
|
+
journal text,
|
|
300
|
+
medline varchar(255),
|
|
301
|
+
pubmed varchar(255),
|
|
302
|
+
KEY (id),
|
|
303
|
+
KEY (medline),
|
|
304
|
+
KEY (pubmed)
|
|
305
|
+
) TYPE=MERGE UNION=(
|
|
306
|
+
gbbct1ref,
|
|
307
|
+
gbbct2ref,
|
|
308
|
+
..., # list up all ref tables by yourself
|
|
309
|
+
gbvrtref
|
|
310
|
+
);
|
|
311
|
+
|
|
312
|
+
CREATE TABLE IF NOT EXISTS seq (
|
|
313
|
+
id varchar(16) NOT NULL,
|
|
314
|
+
num integer,
|
|
315
|
+
naseq mediumtext,
|
|
316
|
+
KEY (id)
|
|
317
|
+
) TYPE=MERGE UNION=(
|
|
318
|
+
gbbct1seq,
|
|
319
|
+
gbbct2seq,
|
|
320
|
+
..., # list up all seq tables by yourself
|
|
321
|
+
gbvrtseq
|
|
322
|
+
);
|
|
323
|
+
|
|
324
|
+
=end
|
|
325
|
+
|