bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
data/lib/bio/data/na.rb
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/data/na.rb - Nucleic Acids
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001, 2005
|
|
5
|
+
# Toshiaki Katayama <k@bioruby.org>
|
|
6
|
+
# License:: LGPL
|
|
7
|
+
#
|
|
8
|
+
# $Id: na.rb,v 0.19 2005/12/10 18:14:22 k Exp $
|
|
9
|
+
#
|
|
10
|
+
# == Synopsis
|
|
11
|
+
#
|
|
12
|
+
# Bio::NucleicAcid class contains data related to nucleic acids.
|
|
13
|
+
#
|
|
14
|
+
# == Usage
|
|
15
|
+
#
|
|
16
|
+
# Examples:
|
|
17
|
+
#
|
|
18
|
+
# require 'bio'
|
|
19
|
+
#
|
|
20
|
+
# puts "### na = Bio::NucleicAcid.new"
|
|
21
|
+
# na = Bio::NucleicAcid.new
|
|
22
|
+
#
|
|
23
|
+
# puts "# na.to_re('yrwskmbdhvnatgc')"
|
|
24
|
+
# p na.to_re('yrwskmbdhvnatgc')
|
|
25
|
+
#
|
|
26
|
+
# puts "# Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')"
|
|
27
|
+
# p Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')
|
|
28
|
+
#
|
|
29
|
+
# puts "# na.weight('A')"
|
|
30
|
+
# p na.weight('A')
|
|
31
|
+
#
|
|
32
|
+
# puts "# Bio::NucleicAcid.weight('A')"
|
|
33
|
+
# p Bio::NucleicAcid.weight('A')
|
|
34
|
+
#
|
|
35
|
+
# puts "# na.weight('atgc')"
|
|
36
|
+
# p na.weight('atgc')
|
|
37
|
+
#
|
|
38
|
+
# puts "# Bio::NucleicAcid.weight('atgc')"
|
|
39
|
+
# p Bio::NucleicAcid.weight('atgc')
|
|
40
|
+
#
|
|
41
|
+
#--
|
|
42
|
+
#
|
|
43
|
+
# This library is free software; you can redistribute it and/or
|
|
44
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
45
|
+
# License as published by the Free Software Foundation; either
|
|
46
|
+
# version 2 of the License, or (at your option) any later version.
|
|
47
|
+
#
|
|
48
|
+
# This library is distributed in the hope that it will be useful,
|
|
49
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
50
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
51
|
+
# Lesser General Public License for more details.
|
|
52
|
+
#
|
|
53
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
54
|
+
# License along with this library; if not, write to the Free Software
|
|
55
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
56
|
+
#
|
|
57
|
+
#++
|
|
58
|
+
#
|
|
59
|
+
|
|
60
|
+
module Bio
|
|
61
|
+
|
|
62
|
+
# Bio::NucleicAcid bundles reference data about nucleic acids (IUPAC
# ambiguity codes, base names and molecular weights) together with a few
# helper methods.  Everything lives in the Data module, which is both
# included and extended below so that each helper is callable on an
# instance as well as on the class itself.
class NucleicAcid

  module Data

    # IUPAC code
    # * Faisst and Meyer (Nucleic Acids Res. 20:3-26, 1992)
    # * http://www.ncbi.nlm.nih.gov/collab/FT/
    #
    # Lower case keys map a base or ambiguity code to a regexp fragment;
    # upper case keys map a code to a human readable name.
    # NOTE(review): IUPAC nomenclature appears to define M as 'aMino';
    # the value 'aroMatic' is kept unchanged here -- confirm before altering.

    NAMES = {

      'y' => '[tc]',
      'r' => '[ag]',
      'w' => '[at]',
      's' => '[gc]',
      'k' => '[tg]',
      'm' => '[ac]',

      'b' => '[tgc]',
      'd' => '[atg]',
      'h' => '[atc]',
      'v' => '[agc]',

      'n' => '[atgc]',

      'a' => 'a',
      't' => 't',
      'g' => 'g',
      'c' => 'c',
      'u' => 'u',

      'A' => 'Adenine',
      'T' => 'Thymine',
      'G' => 'Guanine',
      'C' => 'Cytosine',
      'U' => 'Uracil',

      'Y' => 'pYrimidine',
      'R' => 'puRine',
      'W' => 'Weak',
      'S' => 'Strong',
      'K' => 'Keto',
      'M' => 'aroMatic',

      'B' => 'not A',
      'D' => 'not C',
      'H' => 'not G',
      'V' => 'not T',
    }

    WEIGHT = {

      # Calculated by BioPerl's Bio::Tools::SeqStats.pm :-)

      'a' => 135.15,
      't' => 126.13,
      'g' => 151.15,
      'c' => 111.12,
      'u' => 112.10,

      :adenine => 135.15,
      :thymine => 126.13,
      :guanine => 151.15,
      :cytosine => 111.12,
      :uracil => 112.10,

      :deoxyribose_phosphate => 196.11,
      :ribose_phosphate => 212.11,

      :hydrogen => 1.00794,
      :water => 18.015,

    }

    # Returns molecular weight information.
    #
    # * Without an argument (or with nil/false) the whole WEIGHT table is
    #   returned.
    # * With an argument of length 0 or 1 the WEIGHT entry for its
    #   down-cased string form is returned (nil when there is none).
    # * With a longer sequence the weight of the polymer is computed: each
    #   base contributes its own weight plus the sugar-phosphate weight
    #   minus two hydrogens, and one water is subtracted per bond between
    #   neighbouring residues.  The ribose phosphate weight is used when
    #   +rna+ is true, the deoxyribose phosphate weight otherwise.
    #
    # Raises a RuntimeError when a longer sequence contains a character
    # with no WEIGHT entry.
    def weight(x = nil, rna = nil)
      return WEIGHT unless x
      return WEIGHT[x.to_s.downcase] unless x.length > 1

      phosphate = WEIGHT[rna ? :ribose_phosphate : :deoxyribose_phosphate]
      hydrogen  = WEIGHT[:hydrogen]
      water     = WEIGHT[:water]

      total = 0.0
      x.each_byte do |byte|
        base = byte.chr.downcase
        unit = WEIGHT[base]
        raise "Error: invalid nucleic acid '#{base}'" unless unit
        # Same expression and evaluation order as before, so the floating
        # point result does not drift.
        total += unit + phosphate - hydrogen * 2
      end
      total - water * (x.length - 1)
    end

    # Looks up +x+ in the NAMES table exactly as given (no case folding).
    def [](x)
      NAMES[x]
    end

    # backward compatibility
    #
    # Returns the whole NAMES table.  Also available as +na+.
    def names
      NAMES
    end
    alias na names

    # Returns the long name for a code; +x+ is converted to an upper case
    # String first, so name('a') and name(:a) both give 'Adenine'.
    def name(x)
      NAMES[x.to_s.upcase]
    end

    # Builds a Regexp from a sequence that may contain IUPAC ambiguity
    # codes.  Characters without a NAMES entry become '.', and every 't'
    # is turned into 'u' when +rna+ is true.
    #
    # The conversion works on a down-cased copy, so the argument is never
    # modified (frozen strings are accepted) and upper case input is
    # treated as bases instead of being replaced by the long names stored
    # under the upper case keys.
    def to_re(seq, rna = false)
      str = seq.to_s.downcase.gsub(/[^atgcu]/) { |base|
        NAMES[base] || '.'
      }
      str = str.tr("t", "u") if rna
      Regexp.new(str)
    end

  end


  # as instance methods
  include Data

  # as class methods
  extend Data

end
|
|
199
|
+
|
|
200
|
+
end # module Bio
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# Demo run when this file is executed directly: prints each sample call
# followed by the inspected result, first through an instance and then
# through the class.
if $0 == __FILE__

  puts "### na = Bio::NucleicAcid.new"
  na = Bio::NucleicAcid.new

  samples = [
    ["# na.to_re('yrwskmbdhvnatgc')",
     lambda { na.to_re('yrwskmbdhvnatgc') }],
    ["# Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')",
     lambda { Bio::NucleicAcid.to_re('yrwskmbdhvnatgc') }],
    ["# na.weight('A')",
     lambda { na.weight('A') }],
    ["# Bio::NucleicAcid.weight('A')",
     lambda { Bio::NucleicAcid.weight('A') }],
    ["# na.weight('atgc')",
     lambda { na.weight('atgc') }],
    ["# Bio::NucleicAcid.weight('atgc')",
     lambda { Bio::NucleicAcid.weight('atgc') }],
  ]

  samples.each do |label, call|
    puts label
    p call.call
  end

end
|
data/lib/bio/db.rb
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db.rb - common API for database parsers
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001, 2002, 2005
|
|
5
|
+
# KATAYAMA Toshiaki <k@bioruby.org>
|
|
6
|
+
# License:: LGPL
|
|
7
|
+
#
|
|
8
|
+
# $Id: db.rb,v 0.31 2005/12/07 11:23:51 k Exp $
|
|
9
|
+
#
|
|
10
|
+
# == On-demand parsing and cache
|
|
11
|
+
#
|
|
12
|
+
# The flatfile parsers (sub classes of the Bio::DB) split the original entry
|
|
13
|
+
# into a Hash and store the hash in the @orig instance variable. To parse
|
|
14
|
+
# in detail is delayed until the method is called which requires a further
|
|
15
|
+
# parsing of a content of the @orig hash. Fully parsed data is cached in
|
|
16
|
+
# another hash, @data, separately.
|
|
17
|
+
#
|
|
18
|
+
# == Guidelines for developers creating a new database class
|
|
19
|
+
#
|
|
20
|
+
# --- Bio::DB.new(entry)
|
|
21
|
+
#
|
|
22
|
+
# The 'new' method should accept the entire entry in one String and
|
|
23
|
+
# return the parsed database object.
|
|
24
|
+
#
|
|
25
|
+
# --- Bio::DB#entry_id
|
|
26
|
+
#
|
|
27
|
+
# Database classes should implement the following methods if appropriate:
|
|
28
|
+
#
|
|
29
|
+
# * entry_id
|
|
30
|
+
# * definition
|
|
31
|
+
#
|
|
32
|
+
# Every sub class should define the following constants if appropriate:
|
|
33
|
+
#
|
|
34
|
+
# * DELIMITER (RS)
|
|
35
|
+
# * entry separator of the flatfile of the database.
|
|
36
|
+
# * RS (= record separator) is an alias for the DELIMITER in short.
|
|
37
|
+
#
|
|
38
|
+
# * TAGSIZE
|
|
39
|
+
# * length of the tag field in the FORTRAN-like format.
|
|
40
|
+
#
|
|
41
|
+
# |<- tag ->||<- data ---->|
|
|
42
|
+
# ENTRY_ID A12345
|
|
43
|
+
# DEFINITION Hoge gene of the Pokemonia pikachuae
|
|
44
|
+
#
|
|
45
|
+
# === Template of the sub class
|
|
46
|
+
#
|
|
47
|
+
# module Bio
|
|
48
|
+
# class Hoge < DB
|
|
49
|
+
#
|
|
50
|
+
# DELIMITER = RS = "\n//\n"
|
|
51
|
+
# TAGSIZE = 12 # You can omit this line if not needed
|
|
52
|
+
#
|
|
53
|
+
# def initialize(entry)
|
|
54
|
+
# end
|
|
55
|
+
#
|
|
56
|
+
# def entry_id
|
|
57
|
+
# end
|
|
58
|
+
#
|
|
59
|
+
# end # class Hoge
|
|
60
|
+
# end # module Bio
|
|
61
|
+
#
|
|
62
|
+
# === Recommended method names for sub classes
|
|
63
|
+
#
|
|
64
|
+
# In general, the method name should be in the singular form when it returns
|
|
65
|
+
# an Object (including the case when the Object is a String), and should be
|
|
66
|
+
# in the plural form when it returns an Array of such Objects. It depends on the
|
|
67
|
+
# database class which form of the method name can be used.
|
|
68
|
+
#
|
|
69
|
+
# For example, GenBank has several REFERENCE fields in one entry, so define
|
|
70
|
+
# Bio::GenBank#references and this method should return an Array of the
|
|
71
|
+
# Reference objects. On the other hand, MEDLINE has one REFERENCE information
|
|
72
|
+
# per one entry, so define Bio::MEDLINE#reference method and this should
|
|
73
|
+
# return a Reference object.
|
|
74
|
+
#
|
|
75
|
+
# The method names used in the sub classes should be taken from the following
|
|
76
|
+
# list if appropriate:
|
|
77
|
+
#
|
|
78
|
+
# --- entry_id #=> String
|
|
79
|
+
#
|
|
80
|
+
# The entry identifier.
|
|
81
|
+
#
|
|
82
|
+
# --- definition #=> String
|
|
83
|
+
#
|
|
84
|
+
# The description of the entry.
|
|
85
|
+
#
|
|
86
|
+
# --- reference #=> Bio::Reference
|
|
87
|
+
# --- references #=> Array of Bio::Reference
|
|
88
|
+
#
|
|
89
|
+
# The reference field(s) of the entry.
|
|
90
|
+
#
|
|
91
|
+
# --- dblink #=> String
|
|
92
|
+
# --- dblinks #=> Array of String
|
|
93
|
+
#
|
|
94
|
+
# The link(s) to the other database entry.
|
|
95
|
+
#
|
|
96
|
+
# --- naseq #=> Bio::Sequence::NA
|
|
97
|
+
#
|
|
98
|
+
# The DNA/RNA sequence of the entry.
|
|
99
|
+
#
|
|
100
|
+
# --- nalen #=> Integer
|
|
101
|
+
#
|
|
102
|
+
# The length of the DNA/RNA sequence of the entry.
|
|
103
|
+
#
|
|
104
|
+
# --- aaseq #=> Bio::Sequence::AA
|
|
105
|
+
#
|
|
106
|
+
# The amino acid sequence of the entry.
|
|
107
|
+
#
|
|
108
|
+
# --- aalen #=> Integer
|
|
109
|
+
#
|
|
110
|
+
# The length of the amino acid sequence of the entry.
|
|
111
|
+
#
|
|
112
|
+
# --- seq #=> Bio::Sequence::NA or Bio::Sequence::AA
|
|
113
|
+
#
|
|
114
|
+
# Returns an appropriate sequence object.
|
|
115
|
+
#
|
|
116
|
+
# --- position #=> String
|
|
117
|
+
#
|
|
118
|
+
# The position of the sequence in the entry or in the genome (depends on
|
|
119
|
+
# the database).
|
|
120
|
+
#
|
|
121
|
+
# --- locations #=> Bio::Locations
|
|
122
|
+
#
|
|
123
|
+
# Returns Bio::Locations.new(position).
|
|
124
|
+
#
|
|
125
|
+
# --- division #=> String
|
|
126
|
+
#
|
|
127
|
+
# The sub division name of the database.
|
|
128
|
+
#
|
|
129
|
+
# * Example:
|
|
130
|
+
# * EST, VRL etc. for GenBank
|
|
131
|
+
# * PATTERN, RULE etc. for PROSITE
|
|
132
|
+
#
|
|
133
|
+
# --- date #=> String
|
|
134
|
+
#
|
|
135
|
+
# The date of the entry.
|
|
136
|
+
# Should we use Date (by ParseDate) instead of String?
|
|
137
|
+
#
|
|
138
|
+
# --- gene #=> String
|
|
139
|
+
# --- genes #=> Array of String
|
|
140
|
+
#
|
|
141
|
+
# The name(s) of the gene.
|
|
142
|
+
#
|
|
143
|
+
# --- organism #=> String
|
|
144
|
+
#
|
|
145
|
+
# The name of the organism.
|
|
146
|
+
#
|
|
147
|
+
#--
|
|
148
|
+
#
|
|
149
|
+
# This library is free software; you can redistribute it and/or
|
|
150
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
151
|
+
# License as published by the Free Software Foundation; either
|
|
152
|
+
# version 2 of the License, or (at your option) any later version.
|
|
153
|
+
#
|
|
154
|
+
# This library is distributed in the hope that it will be useful,
|
|
155
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
156
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
157
|
+
# Lesser General Public License for more details.
|
|
158
|
+
#
|
|
159
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
160
|
+
# License along with this library; if not, write to the Free Software
|
|
161
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
162
|
+
#
|
|
163
|
+
#++
|
|
164
|
+
#
|
|
165
|
+
|
|
166
|
+
require 'bio/sequence'
|
|
167
|
+
require 'bio/reference'
|
|
168
|
+
require 'bio/feature'
|
|
169
|
+
|
|
170
|
+
module Bio
|
|
171
|
+
|
|
172
|
+
# Base class of the flatfile database parsers.  The methods below read
# three instance variables that this class itself never assigns:
# @orig (Hash of raw fields keyed by tag), @data (Hash used as a cache of
# parsed fields) and @tagsize (width of the tag column).
class DB

  # Opens +filename+ via Bio::FlatFile.open, passing the receiving class
  # as the first argument; +mode+ and the block are forwarded unchanged.
  def self.open(filename, *mode, &block)
    Bio::FlatFile.open(self, filename, *mode, &block)
  end

  # Returns an entry identifier as a String.  This method must be
  # implemented in every database class by overriding this method;
  # the default raises NotImplementedError.
  def entry_id
    raise NotImplementedError
  end

  # Returns a list of the top level tags of the entry as an Array of String.
  def tags
    @orig.keys
  end

  # Returns true or false - whether the entry contains the field of the
  # given tag name.
  def exists?(tag)
    @orig.include?(tag)
  end

  # Returns an intact field of the tag as a String.
  def get(tag)
    @orig[tag]
  end

  # Similar to the get method, however, fetch returns the content of the
  # field without its tag and with extra white space stripped.  The first
  # +skip+ lines of the field are dropped before the tag column (up to
  # @tagsize characters at the start of every line) is removed.
  def fetch(tag, skip = 0)
    field = @orig[tag].split(/\n/, skip + 1).last.to_s
    truncate(field.gsub(/^.{0,#{@tagsize}}/,''))
  end


  private

  # Returns a String in which successive white spaces are replaced by one
  # space and which is stripped at both ends.  Returns "" for nil/false.
  def truncate(str)
    if str
      str.gsub(/\s+/, ' ').strip
    else
      ""
    end
  end

  # Returns the tag name of the field as a String: the first @tagsize
  # characters, stripped.  Returns "" for nil/false.
  def tag_get(str)
    if str
      str[0,@tagsize].strip
    else
      ""
    end
  end

  # Returns a String of the field without a tag name.  The first @tagsize
  # characters are removed from +str+ in place and +str+ itself is
  # returned.  Returns "" for nil/false.
  def tag_cut(str)
    if str
      str[0,@tagsize] = ''
      # The element assignment above evaluates to the assigned '' rather
      # than to the receiver, so the shortened string has to be returned
      # explicitly; otherwise every caller receives an empty String.
      str
    else
      ""
    end
  end

  # Returns the content of the field as a String like the fetch method.
  # Furthermore, field_fetch stores the result in the @data hash and
  # reuses it on later calls (the +skip+ of a later call is then ignored).
  def field_fetch(tag, skip = 0)
    unless @data[tag]
      @data[tag] = fetch(tag, skip)
    end
    return @data[tag]
  end

  # Returns an Array containing each line of the field without a tag.
  # lines_fetch also stores the result in the @data hash.
  def lines_fetch(tag)
    unless @data[tag]
      @data[tag] = get(tag).split(/\n/).map{ |l| tag_cut(l) }
    end
    @data[tag]
  end

end # class DB
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# Holds an NCBI style (GenBank, KEGG etc.) entry.
class NCBIDB < DB

  autoload :Common, 'bio/db/genbank/common'

  # Takes the whole entry as a String and the width of the tag column as
  # an Integer.  The stripped entry is split roughly by entry2hash into
  # @orig; @data starts out empty and is filled as fields get parsed.
  def initialize(entry, tagsize)
    @tagsize = tagsize                        # needed by entry2hash below
    @orig    = entry2hash(entry.strip)        # Hash of the original entry
    @data    = {}                             # Hash of the parsed entry
  end

  private

  # Cuts an entry into an Array of Strings, one per top level tag.  A new
  # piece starts wherever a line begins with a letter or a slash.
  def toptag2array(str)
    marker = "\001"
    marked = str.gsub(/\n([A-Za-z\/])/, "\n#{marker}\\1")
    marked.split(marker)
  end

  # Cuts a field into an Array of Strings, one per sub tag.  A new piece
  # starts wherever a line begins with 1 to @tagsize-1 white space
  # characters followed by a non-space character.
  def subtag2array(str)
    marker = "\001"
    marked = str.gsub(/\n(\s{1,#{@tagsize-1}}\S)/, "\n#{marker}\\1")
    marked.split(marker)
  end

  # Returns the contents of the entry as a Hash keyed by top level tag.
  # Pieces sharing a tag are concatenated in order of appearance; the
  # Hash returns '' for unknown keys.
  def entry2hash(entry)
    table = Hash.new('')
    toptag2array(entry).each do |chunk|
      key = tag_get(chunk)
      table[key] += chunk
    end
    table
  end

end # class NCBIDB
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# Class for KEGG databases.  A plain subclass of NCBIDB that adds no
# behaviour of its own.
class KEGGDB < NCBIDB; end
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# Holds an EMBL style (EMBL, TrEMBL, Swiss-Prot etc.) entry.
class EMBLDB < DB

  autoload :Common, 'bio/db/embl/common'

  # Takes the whole entry as a String and the width of the tag column as
  # an Integer.  The stripped entry is split roughly by entry2hash into
  # @orig; @data starts out empty and is filled as fields get parsed.
  def initialize(entry, tagsize)
    @tagsize = tagsize                        # needed by entry2hash below
    @orig    = entry2hash(entry.strip)        # Hash of the original entry
    @data    = {}                             # Hash of the parsed entry
  end

  private

  # Returns the contents of the entry as a Hash keyed by line tag.  Lines
  # tagged 'XX' are dropped, and every tag made of 'R' plus at least one
  # more character is filed under the single key 'R' (reference lines).
  # The Hash returns '' for unknown keys.
  def entry2hash(entry)
    table = Hash.new('')
    entry.each_line do |line|
      key = tag_get(line)
      unless key == 'XX'
        key = 'R' if key =~ /^R./       # Reference lines
        table[key] += line
      end
    end
    table
  end

end # class EMBLDB
|
|
338
|
+
|
|
339
|
+
end # module Bio
|
|
340
|
+
|