bio 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
#
|
2
|
+
# bio/io/fastacmd.rb - NCBI fastacmd wrapper class
|
3
|
+
#
|
4
|
+
# Copyright (C) 2005 Shuji SHIGENOBU <shige@nibb.ac.jp>
|
5
|
+
# Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
|
6
|
+
#
|
7
|
+
# This library is free software; you can redistribute it and/or
|
8
|
+
# modify it under the terms of the GNU Lesser General Public
|
9
|
+
# License as published by the Free Software Foundation; either
|
10
|
+
# version 2 of the License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This library is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
15
|
+
# Lesser General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Lesser General Public
|
18
|
+
# License along with this library; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
20
|
+
#
|
21
|
+
# $Id: fastacmd.rb,v 1.8 2005/09/26 13:00:08 k Exp $
|
22
|
+
#
|
23
|
+
|
24
|
+
require 'bio/db/fasta'
|
25
|
+
require 'bio/io/flatfile'
|
26
|
+
require 'bio/command'
|
27
|
+
|
28
|
+
module Bio
  class Blast

    # Wrapper that runs the external +fastacmd+ program against a database
    # and parses whatever it prints as FASTA entries.
    #
    # Command execution goes through call_command_local, which is expected
    # to come from Bio::Command::Tools (not visible in this file).
    class Fastacmd

      include Enumerable
      include Bio::Command::Tools

      # database:: value passed to the command's -d option
      # fastacmd:: name (or path) of the executable to run
      # errorlog:: not assigned in this class -- presumably maintained by
      #            Bio::Command::Tools; confirm before relying on it
      attr_accessor :database, :fastacmd, :errorlog

      # Remembers the database to query; the executable defaults to
      # 'fastacmd' and can be changed through the accessor.
      def initialize(db)
        @database = db
        @fastacmd = 'fastacmd'
      end

      # Looks up a single entry_id and returns the first parsed entry
      # (nil when the command produced no entries).
      def get_by_id(entry_id)
        fetch(entry_id).shift
      end

      # Looks up one entry_id, or several given as anything that responds
      # to join (joined with ","), and returns an Array of the
      # Bio::FastaFormat entries parsed from the command output.
      def fetch(list)
        entry_id = list.respond_to?(:join) ? list.join(",") : list

        call_command_local([ @fastacmd, '-d', @database, '-s', entry_id ]) do |inn, out|
          # Nothing is written to the child process.
          inn.close_write
          Bio::FlatFile.new(Bio::FastaFormat, out).to_a
        end
      end

      # Runs the command with "-D T" and yields every parsed entry in turn.
      # Returns self.
      def each_entry
        call_command_local([ @fastacmd, '-d', @database, '-D', 'T' ]) do |inn, out|
          inn.close_write
          Bio::FlatFile.open(Bio::FastaFormat, out) do |flatfile|
            flatfile.each_entry { |entry| yield entry }
          end
        end
        self
      end
      alias each each_entry

    end

  end
end
|
80
|
+
|
81
|
+
|
82
|
+
if __FILE__ == $0

  # Manual demo: ruby fastacmd.rb [database [entry_id]]
  database = ARGV.shift || "/db/myblastdb"
  entry_id = ARGV.shift || "sp:128U_DROME"
  ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]

  fastacmd = Bio::Blast::Fastacmd.new(database)

  ### Retrieve one sequence

  # Fastacmd#get_by_id(entry_id) returns a single Bio::FastaFormat object.
  entry = fastacmd.get_by_id(entry_id)
  p entry

  # A Bio::FastaFormat object prints as a FASTA-format string with puts.
  puts entry

  # Fastacmd#fetch(entry_id) returns an Array of Bio::FastaFormat objects,
  # even when only one entry is found.
  p fastacmd.fetch(entry_id)

  ### Retrieve more sequences

  # Fastacmd#fetch also accepts a list of entry ids.
  p fastacmd.fetch(ent_list)

  # The result is an Array, so it can be iterated.
  fastacmd.fetch(ent_list).each { |fasta| puts fasta }

  ### Iterate over all entries

  # Fastacmd#each walks every sequence in the database.
  fastacmd.each { |fasta| p [ fasta.definition[0..30], fasta.seq.size ] }

end
|
123
|
+
|
data/lib/bio/io/fetch.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#
|
2
|
+
# = bio/io/fetch.rb - BioFetch access module
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2002, 2005
|
5
|
+
# Toshiaki Katayama <k@bioruby.org>
|
6
|
+
# License:: LGPL
|
7
|
+
#
|
8
|
+
# $Id: fetch.rb,v 1.4 2005/12/18 15:58:42 k Exp $
|
9
|
+
#
|
10
|
+
#--
|
11
|
+
#
|
12
|
+
# This library is free software; you can redistribute it and/or
|
13
|
+
# modify it under the terms of the GNU Lesser General Public
|
14
|
+
# License as published by the Free Software Foundation; either
|
15
|
+
# version 2 of the License, or (at your option) any later version.
|
16
|
+
#
|
17
|
+
# This library is distributed in the hope that it will be useful,
|
18
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
19
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
20
|
+
# Lesser General Public License for more details.
|
21
|
+
#
|
22
|
+
# You should have received a copy of the GNU Lesser General Public
|
23
|
+
# License along with this library; if not, write to the Free Software
|
24
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
25
|
+
#
|
26
|
+
#++
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'uri'
|
30
|
+
require 'net/http'
|
31
|
+
|
32
|
+
module Bio

  # Minimal client for a BioFetch server: every request is an HTTP POST of
  # a small query string to the server URL given at construction time.
  class Fetch

    # Create a new Bio::Fetch server object.
    # Use Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') to connect
    # to the EBI BioFetch server.
    #
    # Only the host, port and path components of +url+ are kept.
    def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
      _scheme, _user, @host, @port, _registry, @path, = URI.split(url)
    end

    # Default database name used by get_by_id and formats.
    attr_accessor :database

    # Get a raw database entry by id from the default database
    # (mainly used by Bio::Registry).
    def get_by_id(id)
      fetch(@database, id)
    end

    # Fetch a database entry as specified by database (db), entry id (id),
    # 'raw' text or 'html' (style), and format.  When using BioRuby's
    # BioFetch server, the value for format should not be set.
    #
    # Returns the response body as a String.
    def fetch(db, id, style = 'raw', format = nil)
      data = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
      data.push("format=#{format}") if format
      post_query(data.join('&'))
    end

    # Shortcut for BioRuby's own BioFetch server (the default URL): fetch an
    # entry without creating a server object yourself.
    def self.query(*args)
      self.new.fetch(*args)
    end

    # What databases are available?  Returns the response body.
    def databases
      post_query("info=dbs")
    end

    # What formats does the database have?  Returns the response body, or
    # nil when neither an argument nor a default database is set.
    def formats(database = @database)
      post_query("info=formats;db=#{database}") if database
    end

    # How many entries can be retrieved simultaneously?
    # Returns the response body.
    def maxids
      post_query("info=maxids")
    end

    private

    # POSTs +query+ to the configured server and returns the body text.
    #
    # Net::HTTP#post returns a single response object on current Ruby, so
    # the old "response, body = post(...)" destructuring left the body nil.
    # The "response, =" form also copes with the [response, body] pair that
    # old net/http compatibility modes returned.
    def post_query(query)
      response, = Net::HTTP.new(@host, @port).post(@path, query)
      response.body
    end

  end

end # module Bio
|
95
|
+
|
96
|
+
|
97
|
+
|
98
|
+
if __FILE__ == $0

  # Manual demo against live servers (needs network access).
  # bfserv = Bio::Fetch.new('http://www.ebi.ac.uk:80/cgi-bin/dbfetch')
  bfserv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')

  # Tests 1 and 2: the same EMBL entry in both supported styles.
  ['raw', 'html'].each_with_index do |style, index|
    puts "# test #{index + 1}"
    puts bfserv.fetch('embl', 'J00231', style)
  end

  # Tests 3 and 4: the class-level shortcut, which talks to the default server.
  puts "# test 3"
  puts Bio::Fetch.query('genbank', 'J00231')
  puts "# test 4"
  puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')

end
|
113
|
+
|
114
|
+
|
@@ -0,0 +1,496 @@
|
|
1
|
+
#
|
2
|
+
# = bio/io/flatfile.rb - flatfile access wrapper class
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2001, 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
|
5
|
+
# License:: LGPL
|
6
|
+
#
|
7
|
+
#--
|
8
|
+
# This library is free software; you can redistribute it and/or
|
9
|
+
# modify it under the terms of the GNU Lesser General Public
|
10
|
+
# License as published by the Free Software Foundation; either
|
11
|
+
# version 2 of the License, or (at your option) any later version.
|
12
|
+
#
|
13
|
+
# This library is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# Lesser General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU Lesser General Public
|
19
|
+
# License along with this library; if not, write to the Free Software
|
20
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
21
|
+
#++
|
22
|
+
#
|
23
|
+
# $Id: flatfile.rb,v 1.41 2005/11/01 15:34:45 ngoto Exp $
|
24
|
+
#
|
25
|
+
# Bio::FlatFile is a helper and wrapper class to read a biological data file.
|
26
|
+
# It acts like an IO object.
|
27
|
+
# It can automatically detect data format, and users do not need to tell
|
28
|
+
# the class what the data is.
|
29
|
+
#
|
30
|
+
|
31
|
+
module Bio
|
32
|
+
|
33
|
+
# Bio::FlatFile is a helper and wrapper class to read a biological data file.
|
34
|
+
# It acts like an IO object.
|
35
|
+
# It can automatically detect data format, and users do not need to tell
|
36
|
+
# the class what the data is.
|
37
|
+
class FlatFile
|
38
|
+
|
39
|
+
include Enumerable
|
40
|
+
|
41
|
+
# Creates a new Bio::FlatFile object to read a file or a stream
|
42
|
+
# which contains +dbclass+ data.
|
43
|
+
#
|
44
|
+
# +dbclass+ should be a class (or module) or nil.
|
45
|
+
# e.g. Bio::GenBank, Bio::FastaFormat.
|
46
|
+
#
|
47
|
+
# If +file+ is a filename (which doesn't have gets method),
|
48
|
+
# the method opens a local file named +file+
|
49
|
+
# with 'File.open(filename, mode, perm)'.
|
50
|
+
#
|
51
|
+
# When nil is given to dbclass, trying to determine database class
|
52
|
+
# (file format) automatically. If fails to determine, dbclass is
|
53
|
+
# set to nil and FlatFile#next_entry works same as IO#gets when
|
54
|
+
# raw = true. It is recommended to set dbclass using
|
55
|
+
# FlatFile#dbclass= method if fails to determine automatically.
|
56
|
+
#
|
57
|
+
# * Example 1
|
58
|
+
# Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
|
59
|
+
# * Example 2
|
60
|
+
# Bio::FlatFile.open(nil, "embl/est_hum17.dat")
|
61
|
+
# * Example 3
|
62
|
+
# Bio::FlatFile.open(Bio::GenBank, $stdin)
|
63
|
+
#
|
64
|
+
# If it is called with block, the block will be executed with
|
65
|
+
# a newly opened Bio::FlatFile instance object. If filename
|
66
|
+
# is given, the file is automatically closed when leaving the block.
|
67
|
+
#
|
68
|
+
# * Example 4
|
69
|
+
# Bio::FlatFile.open(nil, 'test4.fst') do |ff|
|
70
|
+
# ff.each { |e| print e.definition, "\n" }
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
def self.open(dbclass, file, *arg)
  # Leading non-Hash arguments are the mode/perm values for File.open;
  # collection stops at the first Hash (or nil/false).  Whatever is left
  # in +arg+ is handed on to FlatFile.new.
  openmode = []
  openmode << arg.shift while arg[0] && !arg[0].is_a?(Hash)

  if file.respond_to?(:gets)
    # +file+ is already a stream: wrap it; it is not closed here.
    flatfile = self.new(dbclass, file, *arg)
    return flatfile unless block_given?
    yield flatfile
  elsif block_given?
    # +file+ is a filename: File.open closes it when the block finishes.
    File.open(file, *openmode) { |fobj| yield self.new(dbclass, fobj, *arg) }
  else
    # +file+ is a filename and no block was given: the caller must close.
    self.new(dbclass, File.open(file, *openmode), *arg)
  end
end
|
98
|
+
|
99
|
+
# Same as Bio::FlatFile.open(nil, filename_or_stream, mode, perm, options).
|
100
|
+
#
|
101
|
+
# * Example 1
|
102
|
+
# Bio::FlatFile.auto(ARGF)
|
103
|
+
# * Example 2
|
104
|
+
# Bio::FlatFile.auto("embl/est_hum17.dat")
|
105
|
+
# * Example 3
|
106
|
+
# Bio::FlatFile.auto(IO.popen("gzip -dc nc1101.flat.gz"))
|
107
|
+
#
|
108
|
+
def self.auto(*arg, &block)
  # nil as the database class requests format autodetection.
  open(nil, *arg, &block)
end
|
111
|
+
|
112
|
+
# Same as FlatFile.auto(filename_or_stream, *arg).to_a
|
113
|
+
# (It might be OBSOLETED in the future.)
|
114
|
+
def self.to_a(*arg)
  auto(*arg) do |flatfile|
    # Without a detected format there is no parser to build entries with.
    if flatfile.dbclass
      flatfile.to_a
    else
      raise 'cannot determine file format'
    end
  end
end
|
120
|
+
|
121
|
+
# Same as FlatFile.open, except that 'stream' should be an opened
|
122
|
+
# stream object (IO, File, ..., anything that has the 'gets' method).
|
123
|
+
#
|
124
|
+
# * Example 1
|
125
|
+
# Bio::FlatFile.new(Bio::GenBank, ARGF)
|
126
|
+
# * Example 2
|
127
|
+
# Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
|
128
|
+
#
|
129
|
+
# +options+ should be a hash (or nil). It will be OBSOLETED!!
|
130
|
+
# Available options are below:
|
131
|
+
# [<tt>:raw</tt>] if true, "raw mode" (same as #raw=true).
|
132
|
+
# default: false (not "raw mode").
|
133
|
+
#
|
134
|
+
# * Example 3
|
135
|
+
# Bio::FlatFile.new(nil, $stdin, :raw=>true)
|
136
|
+
# * Example 3 in old style (deprecated)
|
137
|
+
# Bio::FlatFile.new(nil, $stdin, true)
|
138
|
+
#
|
139
|
+
def initialize(dbclass, stream, options = nil)
|
140
|
+
# 2nd arg: IO object
|
141
|
+
@io = stream
|
142
|
+
# 3rd arg: options (nil or a Hash)
|
143
|
+
self.raw = false
|
144
|
+
if options.is_a?(Hash) then
|
145
|
+
self.raw = options[:raw] if options.has_key?(:raw)
|
146
|
+
else
|
147
|
+
self.raw = options
|
148
|
+
end
|
149
|
+
# initialize prefetch buffer
|
150
|
+
@prefetch = ''
|
151
|
+
# 1st arg: database class (or file format autodetection)
|
152
|
+
if dbclass then
|
153
|
+
self.dbclass = dbclass
|
154
|
+
else
|
155
|
+
autodetect
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
# IO object in the flatfile object.
|
160
|
+
attr_reader :io
|
161
|
+
|
162
|
+
# Get next entry.
|
163
|
+
# Reads the next record (up to the current record separator) and returns it:
# the raw String in raw mode, otherwise a new @dbclass object built from it.
# Returns nil at end of input.  The raw text stays available via entry_raw.
def next_entry
  @entry_raw = gets(@rs)
  return nil unless @entry_raw
  return @entry_raw if raw

  entry = @dbclass.new(@entry_raw)

  # Some parsers report trailing text that belongs to the following
  # record.  The optional interface is probed with respond_to? instead of
  # rescuing NameError, so a genuine NameError raised inside a parser is
  # no longer silently swallowed.
  overrun = entry.respond_to?(:entry_overrun) ? entry.entry_overrun : nil
  if overrun
    # Cut the overrun off the end of this record and push it back so the
    # next call starts with it.
    @entry_raw[-(overrun.length), overrun.length] = ''
    ungets(overrun)
  end
  entry
end
|
182
|
+
|
183
|
+
# Returns the last raw entry as a string.
|
184
|
+
attr_reader :entry_raw
|
185
|
+
|
186
|
+
# Iterates over each entry in the flatfile.
|
187
|
+
#
|
188
|
+
# * Example
|
189
|
+
# include Bio
|
190
|
+
# ff = FlatFile.open(GenBank, "genbank/gbhtg14.seq")
|
191
|
+
# ff.each_entry do |x|
|
192
|
+
# puts x.definition
|
193
|
+
# end
|
194
|
+
def each_entry
  # Pull entries one at a time until next_entry reports end of input.
  entry = self.next_entry
  while entry
    yield entry
    entry = self.next_entry
  end
end
alias each each_entry
|
200
|
+
|
201
|
+
# Resets file pointer to the start of the flatfile.
|
202
|
+
# (similar to IO#rewind)
|
203
|
+
def rewind
  status = @io.rewind
  # Anything read ahead belongs to the old position; discard it.
  @prefetch = ''
  status
end
|
208
|
+
|
209
|
+
# Closes input stream.
|
210
|
+
# (similar to IO#close)
|
211
|
+
def close
  # Delegates to the wrapped stream; the read-ahead buffer is left as is.
  @io.close
end
|
214
|
+
|
215
|
+
# Returns current position of input stream.
|
216
|
+
# If the input stream is not a normal file,
|
217
|
+
# the result is not guaranteed.
|
218
|
+
# It is similar to IO#pos.
|
219
|
+
# Note that it will not be equal to io.pos,
|
220
|
+
# because FlatFile#autodetect may pre-read some lines.
|
221
|
+
# Returns the logical read position: the underlying stream's position
# minus whatever is still waiting in the read-ahead buffer.
def pos
  # The stream position is a byte offset, so the buffer must be measured
  # in bytes as well; String#size counts characters on Ruby >= 1.9 and
  # would give a wrong answer for multibyte data.  Fall back to size on
  # interpreters without bytesize (there size is already a byte count).
  buffered = @prefetch.respond_to?(:bytesize) ? @prefetch.bytesize : @prefetch.size
  @io.pos - buffered
end
|
224
|
+
|
225
|
+
# (Not recommended to use it.)
|
226
|
+
# Sets position of input stream.
|
227
|
+
# If the input stream is not a normal file,
|
228
|
+
# the result is not guaranteed.
|
229
|
+
# It is similar to IO#pos=.
|
230
|
+
# Note that it will not be equal to io.pos=,
|
231
|
+
# because FlatFile#autodetect may pre-read some lines.
|
232
|
+
def pos=(p)
  result = (@io.pos = p)
  # Read-ahead data no longer matches the new position.
  @prefetch = ''
  result
end
|
237
|
+
|
238
|
+
# Returns true if input stream is end-of-file.
|
239
|
+
# Otherwise, returns false.
|
240
|
+
# (Similar to IO#eof?, but may not be equal to io.eof?,
|
241
|
+
# because FlatFile#autodetect may pre-read some lines.)
|
242
|
+
def eof?
  # Buffered data still counts as unread input.
  return false unless @prefetch.empty?
  @io.eof?
end
|
249
|
+
|
250
|
+
# Similar to IO#gets.
|
251
|
+
# Internal use only. Users should not call it directly.
|
252
|
+
# Similar to IO#gets, but serves data from the read-ahead buffer first.
#
# io_rs:: record separator; nil reads everything that is left, '' splits
#         buffered data at the first blank line ("\n\n").
#
# Returns the next record including its separator, or whatever
# @io.gets returns when nothing is buffered (nil at end of input).
#
# The separator is located with a plain substring search.  The previous
# code built a Regexp with Regexp.escape(io_rs, 'n') and a three-argument
# Regexp.new, neither of which is accepted by current Ruby.
def gets(io_rs = $/)
  # Nothing buffered: behave exactly like the underlying stream.
  return @io.gets(io_rs) unless @prefetch.size > 0

  if io_rs == nil
    # Slurp mode: buffered data plus the rest of the stream.
    r = @prefetch + @io.gets(nil).to_s
    @prefetch = ''
    return r
  end

  sep = (io_rs == '' ? "\n\n" : io_rs)
  cut = @prefetch.index(sep)
  unless cut
    # The buffer ends mid-record: extend it with one more record from the
    # stream and look again.
    @prefetch << @io.gets(io_rs).to_s
    cut = @prefetch.index(sep)
  end

  if cut
    stop = cut + sep.length
    r = @prefetch[0, stop]
    @prefetch = @prefetch[stop..-1].to_s
  else
    # No separator even after reading more (end of input): return it all.
    r = @prefetch
    @prefetch = ''
  end
  r
end
|
286
|
+
|
287
|
+
# Unread read data.
|
288
|
+
# Internal use only. Users must not call it.
|
289
|
+
def ungets(str)
  # Pushed-back text goes in front so it is the next thing read.
  buffered = @prefetch
  @prefetch = str + buffered
  nil
end
|
293
|
+
|
294
|
+
# Similar to IO#getc.
|
295
|
+
# Internal use only. Users should not call it directly.
|
296
|
+
def getc
  # With an empty buffer, read straight from the stream.
  return @io.getc unless @prefetch.size > 0

  # Otherwise take the first element off the front of the buffer.
  head = @prefetch[0]
  @prefetch = @prefetch[1..-1]
  head
end
|
305
|
+
|
306
|
+
# Similar to IO#ungetc.
|
307
|
+
# Internal use only. Users should not call it.
|
308
|
+
def ungetc(c)
  # "%c" turns +c+ into one character, which is put back at the front.
  @prefetch = format("%c", c) + @prefetch
  nil
end
|
312
|
+
|
313
|
+
# If true is given, the next_entry method returns
|
314
|
+
# an entry as raw text, whereas if false, it returns a parsed object.
|
315
|
+
def raw=(bool)
  # Normalise any truthy/falsy value to a strict true or false.
  @raw = !!bool
end
|
318
|
+
|
319
|
+
# If true, raw mode.
|
320
|
+
attr_reader :raw
|
321
|
+
|
322
|
+
# Sets database class. Please use only if autodetect fails.
|
323
|
+
def dbclass=(k)
  # Any falsy value is stored as nil.
  @dbclass = k || nil
  # Records are split on the class's DELIMITER constant; without a class
  # the stream is read using the default record separator.
  @rs = @dbclass ? @dbclass::DELIMITER : $/
end
|
332
|
+
|
333
|
+
# Returns database class which is automatically detected or
|
334
|
+
# given in FlatFile#initialize.
|
335
|
+
attr_reader :dbclass
|
336
|
+
|
337
|
+
# Performs determination of database class (file format).
|
338
|
+
# Pre-reads +lines+ lines for format determination (default 31 lines).
|
339
|
+
# If fails, returns nil or false. Otherwise, returns database class.
|
340
|
+
#
|
341
|
+
# The method can be called anytime if you want (but not recommended).
|
342
|
+
# This might be useful if the input file is a mixture of multiple data formats.
|
343
|
+
def autodetect(lines = 31)
  detected = nil
  1.upto(lines) do
    line = @io.gets
    next unless line

    # Every line read ahead is kept so no input is lost to later reads.
    @prefetch << line
    # Blank lines add nothing new to test against.
    next unless line.strip.size > 0

    # Detection always looks at everything buffered so far.
    detected = self.class.autodetect(@prefetch)
    next unless detected

    self.dbclass = detected
    return detected
  end

  # Nothing matched: make sure the class and record separator are reset,
  # unless a class had already been set earlier.
  self.dbclass = nil unless dbclass
  detected
end
|
360
|
+
|
361
|
+
# Detects database class (== file format) of given file.
|
362
|
+
# If fails to determine, returns nil.
|
363
|
+
# Detects the database class (== file format) of the given file.
# Returns nil (or false) when the format cannot be determined.
#
# A filename is opened through the block form of open, so the file is
# closed even if detection raises; previously the handle leaked in that
# case.  If a stream object is passed instead of a filename, the old
# behaviour is kept: detect, then close the stream.
def self.autodetect_file(filename)
  if filename.respond_to?(:gets)
    ff = self.new(nil, filename)
    detected = ff.dbclass
    ff.close
    detected
  else
    self.open(nil, filename) { |ff| ff.dbclass }
  end
end
|
369
|
+
|
370
|
+
# Detects database class (== file format) of given input stream.
|
371
|
+
# If fails to determine, returns nil.
|
372
|
+
# Caution: the method reads some data from the input stream,
|
373
|
+
# and the data will be lost.
|
374
|
+
def self.autodetect_stream(io)
  # Building a FlatFile with a nil class runs autodetection; the wrapper
  # itself is discarded and the stream is left open.
  self.new(nil, io).dbclass
end
|
379
|
+
|
380
|
+
# Detects database class (== file format) of given string.
|
381
|
+
# If fails to determine, returns false or nil.
|
382
|
+
def self.autodetect(text)
  # Loaded lazily so the parser classes named below are available.
  require 'bio'

  # The first matching pattern wins, so the order of the clauses matters.
  # NOTE(review): the patterns are copied verbatim from this listing;
  # confirm the spacing inside them against the upstream file.
  case text
  # GenBank-style flat files and MEDLINE
  when /^LOCUS .+ bp .*[a-z]*[DR]?NA/ then Bio::GenBank
  when /^LOCUS .+ aa .+/ then Bio::GenPept
  when /^UI \- [0-9]+$/ then Bio::MEDLINE

  # EMBL-style "ID"/"AC" headers
  when /^ID .+\; .*(DNA|RNA|XXX)\;/ then Bio::EMBL
  when /^ID .+\; *PRT\;/ then Bio::SPTR
  when /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ then Bio::PROSITE
  when /^AC [-A-Za-z0-9_\.]+$/ then Bio::TRANSFAC

  # AAindex: the "H" line alone does not tell the two variants apart
  when /^H [-A-Z0-9_\.]+$/
    case text
    when /^M [rc]/ then Bio::AAindex2
    when /^I A\/L/ then Bio::AAindex1
    else false #fail to determine
    end

  when /^CODE [0-9]+$/ then Bio::LITDB
  when /^Entry [A-Z0-9]+/ then Bio::KEGG::BRITE

  # KEGG "ENTRY" records
  when /^ENTRY .+ KO\s*$/ then Bio::KEGG::KO
  when /^ENTRY .+ Glycan\s*$/ then Bio::KEGG::GLYCAN
  when /^ENTRY .+ (CDS|gene|.*RNA) / then Bio::KEGG::GENES
  when /^ENTRY EC [0-9\.]+$/ then Bio::KEGG::ENZYME
  when /^ENTRY C[A-Za-z0-9\._]+$/ then Bio::KEGG::COMPOUND
  when /^ENTRY R[A-Za-z0-9\._]+$/ then Bio::KEGG::REACTION
  when /^ENTRY [a-z]+$/ then Bio::KEGG::GENOME

  # FANTOM MaXML: the captured word selects the class
  when /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/
    case $1
    when 'clusters' then Bio::FANTOM::MaXML::Cluster
    when 'sequences' then Bio::FANTOM::MaXML::Sequence
    else nil #unknown
    end

  when /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ then Bio::PDB

  when /^CLUSTAL .*\(.*\).*sequence +alignment/ then Bio::ClustalW::Report

  # BLAST reports: XML first, then the WashU patterns, then the default ones
  when /\<\!DOCTYPE BlastOutput PUBLIC / then Bio::Blast::Report

  when /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ then Bio::Blast::WU::Report
  when /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ then Bio::Blast::WU::Report_TBlast

  when /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ then Bio::Blast::Default::Report
  when /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ then Bio::Blast::Default::Report_TBlast

  # Other tool reports
  when /^psLayout version \d+\s*$/ then Bio::Blat::Report
  when /^\-\-SPIDEY version .+\-\-$/ then Bio::Spidey::Report

  when /^HMMER +\d+\./ then Bio::HMMER::Report

  when /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ then Bio::Sim4::Report

  # ">" headers: NBRF, sequence FASTA, or numeric FASTA
  when /^>.+$/
    case text
    when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ then Bio::NBRF
    when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ then Bio::FastaFormat
    when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ then Bio::FastaNumericFormat
    else false #fail to determine
    end

  else
    nil #not found
  end
end
|
484
|
+
|
485
|
+
end #class FlatFile
|
486
|
+
|
487
|
+
end #module Bio
|
488
|
+
|
489
|
+
|
490
|
+
if __FILE__ == $0
  # Manual check: ruby flatfile.rb <class expression> <filename>
  if ARGV.size == 2
    require 'bio'
    # NOTE(review): the first argument is passed to eval, so it executes
    # arbitrary Ruby; only run this with trusted command-line input.
    dbclass = eval(ARGV.shift)
    filename = ARGV.shift
    p Bio::FlatFile.open(dbclass, filename).next_entry
  end
end
|
496
|
+
|