bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bio/db/genbank/common.rb - Common methods for GenBank style database classes
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2004 KATAYAMA Toshiaki <k@bioruby.org>
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# $Id: common.rb,v 1.9 2005/12/07 11:23:51 k Exp $
|
|
21
|
+
#
|
|
22
|
+
|
|
23
|
+
require 'bio/db'
|
|
24
|
+
|
|
25
|
+
module Bio
|
|
26
|
+
class NCBIDB
|
|
27
|
+
module Common
|
|
28
|
+
|
|
29
|
+
# Entry separator of the flat file: a line that holds only "//".
RS = "\n//\n"
DELIMITER = RS

# Second argument handed to the parent initializer
# (presumably the width of the leading tag column -- confirm in bio/db).
TAGSIZE = 12

# Creates the entry object by forwarding the raw entry text together
# with TAGSIZE to the superclass initializer.
def initialize(entry)
  super(entry, TAGSIZE)
end
|
|
35
|
+
|
|
36
|
+
# LOCUS -- Locus class must be defined in child classes
|
|
37
|
+
|
|
38
|
+
# DEFINITION
|
|
39
|
+
# DEFINITION -- returns what field_fetch yields for the DEFINITION record.
def definition
  return field_fetch('DEFINITION')
end
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ACCESSION
|
|
45
|
+
# ACCESSION -- returns the accession numbers listed in the ACCESSION
# record, in record order, as an Array of Strings.
#
# The earlier implementation split the single accession derived from the
# VERSION record, so it could never return more than one element.  When
# the ACCESSION record is missing or empty, that old value is still used
# as a fallback so existing callers keep getting the same answer.
def accessions
  listed = field_fetch('ACCESSION').to_s.strip.split(/\s+/)
  return listed unless listed.empty?
  accession.split(/\s+/)
end
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# VERSION
|
|
51
|
+
# VERSION -- the VERSION record split on whitespace; the Array is cached
# in @data['VERSION'] after the first call.
def versions
  @data['VERSION'] = fetch('VERSION').split(/\s+/) unless @data['VERSION']
  @data['VERSION']
end

# First token of the VERSION record ("ACCESSION.VERSION"), or "" when
# the record has no tokens.
def acc_version
  leading = versions.first
  leading.to_s
end

# Text of acc_version before the first dot ("" when acc_version is empty).
def accession
  pieces = acc_version.split(/\./)
  pieces[0].to_s
end

# Text of acc_version after the last dot, converted with to_i
# (so 0 when there is nothing numeric to convert).
def version
  pieces = acc_version.split(/\./)
  pieces[-1].to_i
end
|
|
66
|
+
|
|
67
|
+
# Returns the second whitespace-separated token of the VERSION record
# (the "GI:#######" String), or nil when the record has no second token.
#
# The earlier `versions.last` returned the "ACCESSION.VERSION" token
# itself for records whose VERSION line carries no GI part.
def gi
  versions[1]
end
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# NID
|
|
73
|
+
# NID -- returns what field_fetch yields for the NID record.
def nid
  return field_fetch('NID')
end
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# KEYWORDS
|
|
79
|
+
# KEYWORDS -- the KEYWORDS record with one trailing "." removed, split
# on "; " into an Array of Strings (cached in @data['KEYWORDS']).
def keywords
  unless @data['KEYWORDS']
    text = fetch('KEYWORDS')
    @data['KEYWORDS'] = text.chomp('.').split(/; /)
  end
  @data['KEYWORDS']
end
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# SEGMENT
|
|
85
|
+
# SEGMENT -- every run of digits found in the SEGMENT record, joined
# with "/" (e.g. "1 of 6" becomes "1/6"); cached in @data['SEGMENT'].
def segment
  unless @data['SEGMENT']
    numbers = fetch('SEGMENT').scan(/\d+/)
    @data['SEGMENT'] = numbers.join("/")
  end
  @data['SEGMENT']
end
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# SOURCE
|
|
91
|
+
# SOURCE -- parses the SOURCE record into a Hash with the keys
# 'common_name', 'organism' and 'taxonomy' (any other key reads as '').
# The Hash is built once and cached in @data['SOURCE'].
def source
  return @data['SOURCE'] if @data['SOURCE']

  # Text before the 'ORGANISM' marker is the common name part,
  # text after it holds the organism name followed by the lineage.
  name, org = get('SOURCE').split('ORGANISM')
  org = "" unless org

  # The lineage starts at the first token ending in ";"; failing that,
  # at the first token ending in "." (rs:NC_001741).
  hit = org.match(/\S+;/) || org.match(/\S+\./)
  if hit
    species = hit.pre_match
    lineage = hit[0] + hit.post_match
  else
    species = org
    lineage = ''
  end

  record = {
    'common_name' => truncate(tag_cut(name)),
    'organism'    => truncate(species),
    'taxonomy'    => truncate(lineage),
  }
  record.default = ''
  @data['SOURCE'] = record
end
|
|
114
|
+
|
|
115
|
+
# Shortcut for source['common_name'].
def common_name
  parsed = source
  parsed['common_name']
end
# vernacular_name is another name for common_name.
alias vernacular_name common_name

# Shortcut for source['organism'].
def organism
  parsed = source
  parsed['organism']
end

# Shortcut for source['taxonomy'].
def taxonomy
  parsed = source
  parsed['taxonomy']
end
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# REFERENCE
|
|
130
|
+
# REFERENCE -- builds one Reference per top-level REFERENCE block and
# caches them, wrapped in a References object, in @data['REFERENCE'].
#
# Without a block the References object is returned; with a block each
# reference is yielded in turn.
def references
  unless @data['REFERENCE']
    list = []
    toptag2array(get('REFERENCE')).each do |ref|
      info = Hash.new('')
      subtag2array(ref).each do |field|
        case tag_get(field)
        when /AUTHORS/
          # "A, B and C" -> ["A", "B", "C"]; only the last comma-separated
          # chunk is checked for the " and " joiner.
          names = truncate(tag_cut(field)).split(/, /)
          names[-1] = names[-1].split(/\s+and\s+/) if names[-1]
          info['authors'] = names.flatten.map { |n| n.sub(/,/, ', ') }
        when /TITLE/
          info['title'] = truncate(tag_cut(field)) + '.'
        when /JOURNAL/
          text = truncate(tag_cut(field))
          # "Name VOL (ISSUE), FROM-TO (YEAR)" is taken apart; any other
          # layout is stored unparsed under 'journal'.
          parts = /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/.match(text)
          if parts
            info['journal'] = parts[1]
            info['volume']  = parts[2]
            info['issue']   = parts[3]
            info['pages']   = parts[4]
            info['year']    = parts[5]
          else
            info['journal'] = text
          end
        when /MEDLINE/
          info['medline'] = truncate(tag_cut(field))
        when /PUBMED/
          info['pubmed'] = truncate(tag_cut(field))
        end
      end
      list << Reference.new(info)
    end
    @data['REFERENCE'] = References.new(list)
  end

  return @data['REFERENCE'] unless block_given?
  @data['REFERENCE'].each { |r| yield r }
end
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# COMMENT
|
|
177
|
+
# COMMENT -- returns what field_fetch yields for the COMMENT record.
def comment
  return field_fetch('COMMENT')
end
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# FEATURES
|
|
183
|
+
# FEATURES -- parses the feature table into a Features object, cached
# in @data['FEATURES'].
#
# Each feature is first collected as [ key, location, qualifier, ... ]
# and then converted by parse_qualifiers.  Without a block the Features
# object is returned; with a block each feature is yielded in turn.
#
# Compared with the earlier version, qualifier or continuation lines that
# appear before any feature key are skipped instead of raising
# NoMethodError on nil, and an unterminated quote no longer carries over
# into the next feature.
def features
  unless @data['FEATURES']
    ary = []
    in_quote = false
    get('FEATURES').each_line do |line|
      next if line =~ /^FEATURES/

      # feature type (source, CDS, ...)
      head = line[0,20].to_s.strip

      # feature value (position or /qualifier=)
      body = line[20,60].to_s.chomp

      # sub-array [ feature type, position, /q="data", ... ]
      if line =~ /^ {5}\S/
        ary.push([ head, body ])
        # a new feature key always ends any quote left open before it
        in_quote = false

      # nothing to attach this line to yet (malformed or partial table)
      elsif ary.empty?
        next

      # feature qualifier start (/q="data..., /q="data...", /q=data, /q)
      elsif body =~ /^ \// and not in_quote		# gb:IRO125195
        ary.last.push(body)

        # flag for open quote (/q="data...)
        if body =~ /="/ and body !~ /"$/
          in_quote = true
        end

      # feature qualifier continued (...data..., ...data...")
      else
        ary.last.last << body

        # flag for closing quote (/q="data... lines ...")
        if body =~ /"$/
          in_quote = false
        end
      end
    end

    ary.collect! do |subary|
      parse_qualifiers(subary)
    end

    @data['FEATURES'] = Features.new(ary)
  end
  if block_given?
    @data['FEATURES'].each do |f|
      yield f
    end
  else
    @data['FEATURES']
  end
end
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# ORIGIN
|
|
237
|
+
# ORIGIN -- returns the text of the ORIGIN line (after tag_cut and
# truncate) and, as a side effect of the first call, stores the sequence
# that follows it in @data['SEQUENCE'] with digits, blanks, tabs, line
# breaks and "/" removed.
def origin
  return @data['ORIGIN'] if @data['ORIGIN']

  # first line is the ORIGIN header, the remainder is the sequence block
  header, residues = get('ORIGIN').split("\n", 2)
  residues = residues || ""
  @data['ORIGIN'] = truncate(tag_cut(header))
  @data['SEQUENCE'] = residues.delete("0-9 \t\n\r/")
  @data['ORIGIN']
end
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
### private methods
|
|
249
|
+
|
|
250
|
+
private
|
|
251
|
+
|
|
252
|
+
# Converts one [ feature type, position, qualifier, ... ] array into a
# Feature object.  The first two elements are removed from +ary+ (the
# array is modified in place); whitespace is stripped from the position.
#
# Qualifier values: 'translation' becomes a Sequence::AA, 'codon_start'
# is converted with to_i, and any other qualifier with an empty value
# is stored as true.
def parse_qualifiers(ary)
  feature = Feature.new

  feature.feature = ary.shift
  feature.position = ary.shift.gsub(/\s/, '')

  ary.each do |text|
    # /name, /name=value and /name="value" are all accepted
    next unless text =~ %r{/([^=]+)=?"?([^"]*)"?}
    key = $1
    val = $2

    if key == 'translation'
      val = Sequence::AA.new(val)
    elsif key == 'codon_start'
      val = val.to_i
    elsif val.empty?
      val = true
    end

    feature.append(Feature::Qualifier.new(key, val))
  end

  feature
end
|
|
277
|
+
|
|
278
|
+
end # Common
|
|
279
|
+
end # GenBank
|
|
280
|
+
end # Bio
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
=begin
|
|
284
|
+
|
|
285
|
+
= Bio::NCBIDB::Common
|
|
286
|
+
|
|
287
|
+
This module defines a common framework among GenBank, GenPept, RefSeq, and
|
|
288
|
+
DDBJ. For more details, see the documentation in each genbank/*.rb file.
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
== SEE ALSO
|
|
292
|
+
|
|
293
|
+
* ((<URL:ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt>))
|
|
294
|
+
* ((<URL:http://www.ncbi.nlm.nih.gov/collab/FT/index.html>))
|
|
295
|
+
|
|
296
|
+
=end
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bio/db/genbank/ddbj.rb - DDBJ database class
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2000-2004 KATAYAMA Toshiaki <k@bioruby.org>
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# $Id: ddbj.rb,v 1.7 2005/09/09 16:02:04 ngoto Exp $
|
|
21
|
+
#
|
|
22
|
+
|
|
23
|
+
require 'bio/db/genbank/genbank'
|
|
24
|
+
|
|
25
|
+
module Bio
|
|
26
|
+
|
|
27
|
+
# DDBJ entry class.  It adds no parsing of its own: the DDBJ flat-file
# format is treated as identical to GenBank, so everything is inherited.
class DDBJ < GenBank

  # Load bio/io/ddbjxml lazily, the first time DDBJ::XML is referenced.
  autoload :XML, 'bio/io/ddbjxml'

end
|
|
33
|
+
|
|
34
|
+
end # Bio
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bio/db/genbank/genbank.rb - GenBank database class
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2000-2005 KATAYAMA Toshiaki <k@bioruby.org>
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# $Id: genbank.rb,v 0.38 2005/12/07 11:23:51 k Exp $
|
|
21
|
+
#
|
|
22
|
+
|
|
23
|
+
require 'bio/db'
|
|
24
|
+
require 'bio/db/genbank/common'
|
|
25
|
+
|
|
26
|
+
module Bio
|
|
27
|
+
class GenBank < NCBIDB
|
|
28
|
+
|
|
29
|
+
include Bio::NCBIDB::Common
|
|
30
|
+
|
|
31
|
+
# LOCUS
|
|
32
|
+
# Holds the fixed-column fields of a GenBank LOCUS line.
#
# An empty line leaves every attribute nil.  Lines longer than 75
# characters are read with the column layout used after Rel 126.0,
# shorter ones with the older layout.  Columns that lie beyond the end of
# a short (incomplete) line now read as "" (or 0 for the length) instead
# of raising NoMethodError on nil.
class Locus
  def initialize(locus_line)
    if locus_line.empty?
      # do nothing (just for empty or incomplete entry string)
    elsif locus_line.length > 75 			# after Rel 126.0
      @entry_id = column(locus_line, 12..27)
      @length   = locus_line[29..39].to_i
      @strand   = column(locus_line, 44..46)
      @natype   = column(locus_line, 47..52)
      @circular = column(locus_line, 55..62)
      @division = column(locus_line, 63..66)
      @date     = column(locus_line, 68..78)
    else
      @entry_id = column(locus_line, 12..21)
      @length   = locus_line[22..29].to_i
      @strand   = column(locus_line, 33..35)
      @natype   = column(locus_line, 36..39)
      @circular = column(locus_line, 42..51)
      @division = column(locus_line, 52..54)
      @date     = column(locus_line, 62..72)
    end
  end
  attr_accessor :entry_id, :length, :strand, :natype, :circular,
    :division, :date

  private

  # Returns the stripped text in +range+ of +line+; "" when the range
  # starts past the end of the line (String#[] returns nil there).
  def column(line, range)
    line[range].to_s.strip
  end
end
|
|
57
|
+
|
|
58
|
+
# LOCUS -- the LOCUS record wrapped in a Locus object
# (created on first use and cached in @data['LOCUS']).
def locus
  @data['LOCUS'] = Locus.new(get('LOCUS')) unless @data['LOCUS']
  @data['LOCUS']
end

# The methods below simply forward to the attribute of the same name
# on the Locus object.

def entry_id
  locus.entry_id
end

def length
  locus.length
end

def circular
  locus.circular
end

def division
  locus.division
end

def date
  locus.date
end

def strand
  locus.strand
end

def natype
  locus.natype
end
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ORIGIN
|
|
72
|
+
# ORIGIN -- the sequence stored in @data['SEQUENCE'] as a
# Bio::Sequence::NA.  The origin method is called first when the
# sequence has not been extracted yet.
def seq
  origin unless @data['SEQUENCE']
  Bio::Sequence::NA.new(@data['SEQUENCE'])
end
# naseq is another name for seq; nalen is another name for length.
alias naseq seq
alias nalen length

# Length of the sequence object returned by seq.
def seq_len
  sequence = seq
  sequence.length
end
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# FEATURES
|
|
87
|
+
# FEATURES -- yields every feature whose type is 'CDS'.
def each_cds
  features.each do |entry|
    yield(entry) if entry.feature == 'CDS'
  end
end

# FEATURES -- yields every feature whose type is 'gene'.
def each_gene
  features.each do |entry|
    yield(entry) if entry.feature == 'gene'
  end
end
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# BASE COUNT : obsoleted after GenBank release 138.0
|
|
105
|
+
# BASE COUNT : obsoleted after GenBank release 138.0
#
# Parses the BASE COUNT record into a Hash of base letter => count
# (unknown letters read as 0) and caches it in @data['BASE COUNT'].
#
# base:: optional base letter; matched case-insensitively.
#
# Returns the whole Hash when +base+ is nil or false, otherwise the
# count for that base.
#
# The argument is no longer modified: the earlier `base.downcase!`
# changed the caller's String in place and raised on frozen strings.
def basecount(base = nil)
  unless @data['BASE COUNT']
    hash = Hash.new(0)
    get('BASE COUNT').scan(/(\d+) (\w)/).each do |c, b|
      hash[b] = c.to_i
    end
    @data['BASE COUNT'] = hash
  end

  if base
    @data['BASE COUNT'][base.to_s.downcase]
  else
    @data['BASE COUNT']
  end
end
|
|
121
|
+
|
|
122
|
+
end # GenBank
|
|
123
|
+
end # Bio
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Demo run when this file is executed directly: parses an entry (read
# from the files named on the command line, or fetched through
# Bio::Fetch when no argument is given) and prints every accessor.
if __FILE__ == $0

  # use pp for output when it is available
  begin
    require 'pp'
    alias p pp
  rescue LoadError
  end

  puts "### GenBank"
  if ARGV.size > 0
    gb = Bio::GenBank.new(ARGF.read)
  else
    require 'bio/io/fetch'
    gb = Bio::GenBank.new(Bio::Fetch.query('gb', 'LPATOVGNS'))
  end

  # LOCUS accessors are printed with a per-method caption
  puts "## LOCUS"
  %w( locus entry_id nalen strand natype circular division date ).each do |name|
    puts "# GenBank.#{name}"
    p gb.send(name)
  end

  # remaining records: [ heading, [ [ method, argument... ], ... ] ]
  sections = [
    [ 'DEFINITION', [ ['definition'] ] ],
    [ 'ACCESSION',  [ ['accession'] ] ],
    [ 'VERSION',    [ ['versions'], ['version'], ['gi'] ] ],
    [ 'NID',        [ ['nid'] ] ],
    [ 'KEYWORDS',   [ ['keywords'] ] ],
    [ 'SEGMENT',    [ ['segment'] ] ],
    [ 'SOURCE',     [ ['source'], ['common_name'], ['vernacular_name'],
                      ['organism'], ['taxonomy'] ] ],
    [ 'REFERENCE',  [ ['references'] ] ],
    [ 'COMMENT',    [ ['comment'] ] ],
    [ 'FEATURES',   [ ['features'] ] ],
    [ 'BASE COUNT', [ ['basecount'], ['basecount', 'a'], ['basecount', 'A'] ] ],
    [ 'ORIGIN',     [ ['origin'], ['naseq'] ] ],
  ]
  sections.each do |heading, calls|
    puts "## #{heading}"
    calls.each do |call|
      p gb.send(*call)
    end
  end

end
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
=begin
|
|
210
|
+
|
|
211
|
+
= Bio::GenBank
|
|
212
|
+
|
|
213
|
+
=== Initialize
|
|
214
|
+
|
|
215
|
+
--- Bio::GenBank.new(entry)
|
|
216
|
+
|
|
217
|
+
=== LOCUS
|
|
218
|
+
|
|
219
|
+
--- Bio::GenBank#locus -> Bio::GenBank::Locus
|
|
220
|
+
|
|
221
|
+
Returns contents of the LOCUS record as a Bio::GenBank::Locus object.
|
|
222
|
+
|
|
223
|
+
--- Bio::GenBank#entry_id -> String
|
|
224
|
+
--- Bio::GenBank#nalen -> Fixnum
|
|
225
|
+
--- Bio::GenBank#strand -> String
|
|
226
|
+
--- Bio::GenBank#natype -> String
|
|
227
|
+
--- Bio::GenBank#circular -> String
|
|
228
|
+
--- Bio::GenBank#division -> String
|
|
229
|
+
--- Bio::GenBank#date -> String
|
|
230
|
+
|
|
231
|
+
Access methods for the contents of the LOCUS record.
|
|
232
|
+
|
|
233
|
+
=== DEFINITION
|
|
234
|
+
|
|
235
|
+
--- Bio::GenBank#definition -> String
|
|
236
|
+
|
|
237
|
+
Returns contents of the DEFINITION record as a String.
|
|
238
|
+
|
|
239
|
+
=== ACCESSION
|
|
240
|
+
|
|
241
|
+
--- Bio::GenBank#accessions -> Array
|
|
242
|
+
|
|
243
|
+
Returns contents of the ACCESSION record as an Array.
|
|
244
|
+
|
|
245
|
+
=== VERSION
|
|
246
|
+
|
|
247
|
+
--- Bio::GenBank#versions -> Array
|
|
248
|
+
|
|
249
|
+
Returns contents of the VERSION record as an Array of Strings.
|
|
250
|
+
|
|
251
|
+
--- Bio::GenBank#acc_version -> String
|
|
252
|
+
--- Bio::GenBank#accession -> String
|
|
253
|
+
--- Bio::GenBank#version -> Fixnum
|
|
254
|
+
--- Bio::GenBank#gi -> String
|
|
255
|
+
|
|
256
|
+
Access methods for the contents of the VERSION record.
|
|
257
|
+
|
|
258
|
+
The 'acc_version' method returns the first part of the VERSION record
|
|
259
|
+
as an "ACCESSION.VERSION" String, the 'accession' method returns the ACCESSION
|
|
260
|
+
part of the acc_version, 'version' method returns the VERSION part of the
|
|
261
|
+
acc_version as a Fixnum, and the 'gi' method returns the second part of
|
|
262
|
+
the VERSION record as a "GI:#######" String.
|
|
263
|
+
|
|
264
|
+
=== NID
|
|
265
|
+
|
|
266
|
+
--- Bio::GenBank#nid -> String
|
|
267
|
+
|
|
268
|
+
Returns contents of the NID record as a String.
|
|
269
|
+
|
|
270
|
+
=== KEYWORDS
|
|
271
|
+
|
|
272
|
+
--- Bio::GenBank#keywords -> Array
|
|
273
|
+
|
|
274
|
+
Returns contents of the KEYWORDS record as an Array of Strings.
|
|
275
|
+
|
|
276
|
+
=== SEGMENT
|
|
277
|
+
|
|
278
|
+
--- Bio::GenBank#segment -> String
|
|
279
|
+
|
|
280
|
+
Returns contents of the SEGMENT record as a "m/n" form String.
|
|
281
|
+
|
|
282
|
+
=== SOURCE
|
|
283
|
+
|
|
284
|
+
--- Bio::GenBank#source -> Hash
|
|
285
|
+
|
|
286
|
+
Returns contents of the SOURCE record as a Hash.
|
|
287
|
+
|
|
288
|
+
--- Bio::GenBank#common_name -> String
|
|
289
|
+
--- Bio::GenBank#vernacular_name -> String
|
|
290
|
+
--- Bio::GenBank#organism -> String
|
|
291
|
+
--- Bio::GenBank#taxonomy -> String
|
|
292
|
+
|
|
293
|
+
Access methods for the contents of the SOURCE record.
|
|
294
|
+
|
|
295
|
+
The 'common_name' method is the same as source['common_name'].
|
|
296
|
+
The 'vernacular_name' method is an alias for the 'common_name'.
|
|
297
|
+
The 'organism' method is the same as source['organism'].
|
|
298
|
+
The 'taxonomy' method is the same as source['taxonomy'].
|
|
299
|
+
|
|
300
|
+
=== REFERENCE
|
|
301
|
+
|
|
302
|
+
--- Bio::GenBank#references -> Array
|
|
303
|
+
|
|
304
|
+
Returns contents of the REFERENCE records as an Array of Bio::Reference
|
|
305
|
+
objects.
|
|
306
|
+
|
|
307
|
+
=== COMMENT
|
|
308
|
+
|
|
309
|
+
--- Bio::GenBank#comment -> String
|
|
310
|
+
|
|
311
|
+
Returns contents of the COMMENT record as a String.
|
|
312
|
+
|
|
313
|
+
=== FEATURES
|
|
314
|
+
|
|
315
|
+
--- Bio::GenBank#features -> Bio::Features
|
|
316
|
+
|
|
317
|
+
Returns contents of the FEATURES record as a Bio::Features object.
|
|
318
|
+
|
|
319
|
+
--- Bio::GenBank#each_cds -> Array
|
|
320
|
+
|
|
321
|
+
Iterate only for the 'CDS' portion of the Bio::Features.
|
|
322
|
+
|
|
323
|
+
--- Bio::GenBank#each_gene -> Array
|
|
324
|
+
|
|
325
|
+
Iterate only for the 'gene' portion of the Bio::Features.
|
|
326
|
+
|
|
327
|
+
=== BASE COUNT
|
|
328
|
+
|
|
329
|
+
--- Bio::GenBank#basecount(base = nil) -> Hash or Fixnum
|
|
330
|
+
|
|
331
|
+
Returns the BASE COUNT as a Hash. When the base is specified, returns
|
|
332
|
+
count of the base as a Fixnum. The base can be one of 'a', 't', 'g',
|
|
333
|
+
'c', and 'o' (others).
|
|
334
|
+
|
|
335
|
+
=== ORIGIN
|
|
336
|
+
|
|
337
|
+
--- Bio::GenBank#origin -> String
|
|
338
|
+
|
|
339
|
+
Returns contents of the ORIGIN record as a String.
|
|
340
|
+
|
|
341
|
+
--- Bio::GenBank#naseq -> Bio::Sequence::NA
|
|
342
|
+
--- Bio::GenBank#seq -> Bio::Sequence::NA
|
|
343
|
+
|
|
344
|
+
Returns DNA sequence in the ORIGIN record as a Bio::Sequence::NA object.
|
|
345
|
+
|
|
346
|
+
== SEE ALSO
|
|
347
|
+
|
|
348
|
+
* ((<URL:ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt>))
|
|
349
|
+
* ((<URL:http://www.ncbi.nlm.nih.gov/collab/FT/index.html>))
|
|
350
|
+
|
|
351
|
+
=end
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
|