bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/aaindex.rb - AAindex database class
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001 KAWASHIMA Shuichi <s@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
# $Id: aaindex.rb,v 1.16 2005/12/18 15:58:41 k Exp $
|
|
8
|
+
#
|
|
9
|
+
# == Description
|
|
10
|
+
# Classes for Amino Acid Index Database (AAindex and AAindex2).
|
|
11
|
+
# * AAindex Manual: http://www.genome.jp/dbget-bin/show_man?aaindex
|
|
12
|
+
#
|
|
13
|
+
# == Examples
|
|
14
|
+
# aax1 = Bio::AAindex1.new("PRAM900102.aaindex1")
|
|
15
|
+
# aax1.entry_id
|
|
16
|
+
# aax1.index
|
|
17
|
+
#
|
|
18
|
+
# aax2 = Bio::AAindex2.new("HENS920102.aaindex2")
|
|
19
|
+
# aax2.entry_id
|
|
20
|
+
# aax2.matrix
|
|
21
|
+
# aax2.matrix[2,2]
|
|
22
|
+
#
|
|
23
|
+
# == References
|
|
24
|
+
# * http://www.genome.jp/aaindex/
|
|
25
|
+
#
|
|
26
|
+
#--
|
|
27
|
+
#
|
|
28
|
+
# This library is free software; you can redistribute it and/or
|
|
29
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
30
|
+
# License as published by the Free Software Foundation; either
|
|
31
|
+
# version 2 of the License, or (at your option) any later version.
|
|
32
|
+
#
|
|
33
|
+
# This library is distributed in the hope that it will be useful,
|
|
34
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
35
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
36
|
+
# Lesser General Public License for more details.
|
|
37
|
+
#
|
|
38
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
39
|
+
# License along with this library; if not, write to the Free Software
|
|
40
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
41
|
+
#
|
|
42
|
+
#++
|
|
43
|
+
#
|
|
44
|
+
|
|
45
|
+
require "bio/db"
|
|
46
|
+
require "matrix"
|
|
47
|
+
|
|
48
|
+
module Bio
|
|
49
|
+
|
|
50
|
+
# Shared base class for AAindex entries (see AAindex1 and AAindex2).
#
# The single-letter field tags used below follow the AAindex flat-file
# layout; see the AAindex manual for the meaning of each tag.
class AAindex < KEGGDB

  # Entry delimiter: a line consisting of "//".
  DELIMITER = "\n//\n"

  # Record separator; same value as DELIMITER.
  RS = DELIMITER

  # Tag width passed to the superclass constructor (Bio::DB API).
  TAGSIZE = 2

  # Hands the raw entry text to the superclass together with TAGSIZE.
  def initialize(entry)
    super(entry, TAGSIZE)
  end

  # Simple readers, each returning field_fetch(<tag>) for its field:
  #
  #   entry_id   -> 'H'
  #   definition -> 'D'
  #   dblinks    -> 'R'
  #   author     -> 'A'
  #   title      -> 'T'
  #   journal    -> 'J'
  [
    [:entry_id,   'H'],
    [:definition, 'D'],
    [:dblinks,    'R'],
    [:author,     'A'],
    [:title,      'T'],
    [:journal,    'J']
  ].each do |reader, tag|
    define_method(reader) { field_fetch(tag) }
  end

  # Returns the '*' field through the inherited +get+ (unlike the readers
  # above, this one does not go through +field_fetch+).
  def comment
    get('*')
  end

end
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# An AAindex1 entry: one numeric value per amino acid.
class AAindex1 < AAindex

  # Passes the raw entry text straight to AAindex#initialize.
  def initialize(entry)
    super(entry)
  end

  # Returns the content of the 'C' field via field_fetch.
  def correlation_coefficient
    field_fetch('C')
  end

  # Returns a Hash mapping each one-letter amino acid code
  # (A R N D C Q E G H I L K M F P S T W Y V, in that order) to the
  # corresponding value of the 'I' field.
  #
  # +type+ selects the representation of the values:
  #
  # :string::  the token exactly as written in the entry
  # :float::   the token converted with String#to_f (default)
  # :zscore::  (value - mean) / sd, see the note on +sd+ below
  # :integer:: the value scaled by 10 ** (largest number of characters found
  #            after a decimal point) and rounded to the nearest Integer
  #
  # Any other +type+ yields an empty Hash.
  #
  # Raises RuntimeError when the field does not contain exactly 20 tokens.
  def index(type = :float)
    aa = %w( A R N D C Q E G H I L K M F P S T W Y V )
    # presumably the second argument skips the label line of the I field --
    # confirm against Bio::DB#field_fetch
    values = field_fetch('I', 1).split(' ')

    if values.size != 20
      raise "Invalid format in #{entry_id} : #{values.inspect}"
    end

    case type
    when :zscore
      floats = values.map { |v| v.to_f }
      mean = floats.inject(0.0) { |sum, v| sum + v } / floats.size
      # NOTE(review): this is the square root of the *sum* of squared
      # deviations (not divided by the number of values), so the result is
      # not a textbook z-score. Left unchanged so callers keep the numbers
      # they already get -- confirm whether that is intended.
      sd = Math.sqrt(floats.inject(0.0) { |sum, v| sum + (v - mean) ** 2 })
    when :integer
      # Largest count of characters following a decimal point. Tokens
      # without a '.' count as zero instead of raising NoMethodError on nil.
      figure = 0
      values.each do |v|
        figure = [ figure, (v[/\..*/] || '.').length - 1 ].max
      end
      scale = 10 ** figure
    end

    hash = {}

    aa.each_with_index do |code, i|
      case type
      when :string
        hash[code] = values[i]
      when :float
        hash[code] = values[i].to_f
      when :zscore
        hash[code] = (values[i].to_f - mean) / sd
      when :integer
        # round, not to_i: 4.35 * 100 is 434.99999999999994 in binary
        # floating point and would otherwise be truncated to 434.
        hash[code] = (values[i].to_f * scale).round
      end
    end
    return hash
  end

end
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# An AAindex2 entry: a matrix of values indexed by amino acid pairs.
class AAindex2 < AAindex

  # Hash of one-letter amino acid code => row/column position. It is filled
  # in by #old_matrix and is nil until #old_matrix has been called.
  #
  # (The original code called attr_reader from inside #old_matrix, where
  # self is an instance rather than a Module, so #old_matrix always raised
  # NoMethodError.)
  attr_reader :aa

  # Passes the raw entry text straight to AAindex#initialize.
  def initialize(entry)
    super(entry)
  end

  # Returns the row labels of the 'M' field as an Array of single
  # characters, or nil when the label line does not match.
  def rows
    label_data
    @rows
  end

  # Returns the column labels of the 'M' field as an Array of single
  # characters, or nil when the label line does not match.
  def cols
    label_data
    @cols
  end

  # Returns the numeric part of the 'M' field as a Matrix of Floats, one
  # matrix row per text line.
  #
  # When the data is strictly lower-triangular (line k holds exactly k + 1
  # values) the upper half is filled in by symmetry, so the result is a
  # full square matrix instead of a ragged one. Any other layout is passed
  # to Matrix unchanged. A missing data section yields an empty Matrix.
  def matrix
    ma = []
    label_data.to_s.each_line do |line|
      ma.push(line.strip.split(/\s+/).map { |x| x.to_f })
    end

    triangular = ma.size > 1
    ma.each_with_index do |row, r|
      triangular = false unless row.size == r + 1
    end

    if triangular
      last = ma.size - 1
      ma.each_with_index do |row, r|
        # row c (c > r) always holds index r, because it has c + 1 entries
        (r + 1).upto(last) { |c| row[c] = ma[c][r] }
      end
    end

    Matrix[*ma]
  end

  # Parser for the layout used by AAindex <= ver 5.0, where the values live
  # in the 'I' field.
  #
  # For the 20-letter label "ARNDCQEGHILKMFPSTWYV" it records the position
  # of every letter in #aa and returns a symmetric 20x20 Matrix of Floats
  # built from the values read as a packed lower triangle. Missing values
  # become 0.0 (nil.to_f).
  #
  # Raises NotImplementedError for the two gapped label layouts and returns
  # nil when no layout matches.
  def old_matrix # for AAindex <= ver 5.0
    @aa = {} # used to determine row/column of the aa

    field = field_fetch('I')

    case field
    when / (ARNDCQEGHILKMFPSTWYV)\s+(.*)/ # 20x19/2 matrix
      aalist = $1
      values = $2.split(/\s+/)

      aalist.split('').each_with_index do |code, i|
        @aa[code] = i
      end

      ma = Array.new(20) { Array.new(20) } # 2D array of 20x(20)

      0.upto(19) do |i|
        i.upto(19) do |j|
          # element (j, i) of a packed lower triangle, mirrored to (i, j)
          ma[i][j] = values[i + j*(j+1)/2].to_f
          ma[j][i] = ma[i][j]
        end
      end
      Matrix[*ma]

    when / -ARNDCQEGHILKMFPSTWYV / # 21x20/2 matrix (with gap)
      raise NotImplementedError
    when / ACDEFGHIKLMNPQRSTVWYJ- / # 21x21 matrix (with gap)
      raise NotImplementedError
    end
  end

  private

  # Splits the 'M' field into its first line (the label) and the rest (the
  # data). When the label has the form "M rows = ..., cols = ..." the two
  # label strings are stored, split into single characters, in @rows and
  # @cols. Returns the data part (nil when the field has no second line).
  def label_data
    label, data = get('M').split("\n", 2)
    if /M rows = (\S+), cols = (\S+)/.match(label)
      rows, cols = $1, $2
      @rows = rows.split('')
      @cols = cols.split('')
    end
    return data
  end

end
|
|
247
|
+
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# Demo, run only when this file is executed directly: fetches one AAindex1
# and one AAindex2 entry through Bio::Fetch and prints their fields.
if $0 == __FILE__
  require 'bio/io/fetch'

  # readers available on both entry classes
  shared = [:entry_id, :definition, :dblinks, :author, :title, :journal]

  puts "### AAindex1 (PRAM900102)"
  aax1 = Bio::AAindex1.new(Bio::Fetch.query('aaindex', 'PRAM900102', 'raw'))
  (shared + [:correlation_coefficient, :index]).each do |reader|
    p aax1.send(reader)
  end

  puts "### AAindex2 (HENS920102)"
  aax2 = Bio::AAindex2.new(Bio::Fetch.query('aaindex', 'HENS920102', 'raw'))
  (shared + [:rows, :cols, :matrix]).each do |reader|
    p aax2.send(reader)
  end
  p aax2.matrix[2,2]
  p aax2.matrix.determinant
  p aax2.matrix.rank
  p aax2.matrix.transpose
end
|
|
280
|
+
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/embl.rb - Common methods for EMBL style database classes
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
# $Id: common.rb,v 1.8 2005/11/02 07:30:14 nakao Exp $
|
|
8
|
+
#
|
|
9
|
+
# == EMBL style databases class
|
|
10
|
+
#
|
|
11
|
+
# This module defines a common framework among EMBL, SWISS-PROT, TrEMBL.
|
|
12
|
+
# For more details, see the documentation in each of the embl/*.rb libraries.
|
|
13
|
+
#
|
|
14
|
+
# EMBL style format:
|
|
15
|
+
# ID - identification (begins each entry; 1 per entry)
|
|
16
|
+
# AC - accession number (>=1 per entry)
|
|
17
|
+
# SV - sequence version (1 per entry)
|
|
18
|
+
# DT - date (2 per entry)
|
|
19
|
+
# DE - description (>=1 per entry)
|
|
20
|
+
# KW - keyword (>=1 per entry)
|
|
21
|
+
# OS - organism species (>=1 per entry)
|
|
22
|
+
# OC - organism classification (>=1 per entry)
|
|
23
|
+
# OG - organelle (0 or 1 per entry)
|
|
24
|
+
# RN - reference number (>=1 per entry)
|
|
25
|
+
# RC - reference comment (>=0 per entry)
|
|
26
|
+
# RP - reference positions (>=1 per entry)
|
|
27
|
+
# RX - reference cross-reference (>=0 per entry)
|
|
28
|
+
# RA - reference author(s) (>=1 per entry)
|
|
29
|
+
# RG - reference group (>=0 per entry)
|
|
30
|
+
# RT - reference title (>=1 per entry)
|
|
31
|
+
# RL - reference location (>=1 per entry)
|
|
32
|
+
# DR - database cross-reference (>=0 per entry)
|
|
33
|
+
# FH - feature table header (0 or 2 per entry)
|
|
34
|
+
# FT - feature table data (>=0 per entry)
|
|
35
|
+
# CC - comments or notes (>=0 per entry)
|
|
36
|
+
# XX - spacer line (many per entry)
|
|
37
|
+
# SQ - sequence header (1 per entry)
|
|
38
|
+
# bb - (blanks) sequence data (>=1 per entry)
|
|
39
|
+
# // - termination line (ends each entry; 1 per entry)
|
|
40
|
+
#
|
|
41
|
+
#
|
|
42
|
+
# == Example
|
|
43
|
+
#
|
|
44
|
+
# require 'bio/db/embl/common'
|
|
45
|
+
# module Bio
|
|
46
|
+
# class NEWDB < EMBLDB
|
|
47
|
+
# include Bio::EMBLDB::Common
|
|
48
|
+
# end
|
|
49
|
+
# end
|
|
50
|
+
#
|
|
51
|
+
#--
|
|
52
|
+
#
|
|
53
|
+
# This library is free software; you can redistribute it and/or
|
|
54
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
55
|
+
# License as published by the Free Software Foundation; either
|
|
56
|
+
# version 2 of the License, or (at your option) any later version.
|
|
57
|
+
#
|
|
58
|
+
# This library is distributed in the hope that it will be useful,
|
|
59
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
60
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
61
|
+
# Lesser General Public License for more details.
|
|
62
|
+
#
|
|
63
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
64
|
+
# License along with this library; if not, write to the Free Software
|
|
65
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
66
|
+
#
|
|
67
|
+
#++
|
|
68
|
+
#
|
|
69
|
+
|
|
70
|
+
require 'bio/db'
|
|
71
|
+
require 'bio/reference'
|
|
72
|
+
|
|
73
|
+
module Bio
|
|
74
|
+
class EMBLDB
|
|
75
|
+
# Accessors shared by the EMBL style entry classes.
#
# The methods rely on +get+, +fetch+, +field_fetch+ and the @data cache,
# none of which are defined in this module; they are expected to come from
# the class that includes it.
module Common

  # Entry terminator: a line consisting of "//".
  DELIMITER = "\n//\n"
  # Record separator; same value as DELIMITER.
  RS = DELIMITER
  # Tag width passed to the superclass constructor.
  TAGSIZE = 5

  # Hands the raw entry text to the superclass together with TAGSIZE.
  def initialize(entry)
    super(entry, TAGSIZE)
  end

  # Returns an Array of the accession numbers in the AC lines.
  #
  # AC Line
  #   "AC A12345; B23456;"
  #   AC [AC1;]+
  #
  # Accession numbers format:
  #   1       2     3          4          5          6
  #   [O,P,Q] [0-9] [A-Z, 0-9] [A-Z, 0-9] [A-Z, 0-9] [0-9]
  def ac
    # split on single blanks, then drop the first ';' of every token
    @data['AC'] ||= field_fetch('AC').split(/ /).map { |e| e.sub(/;/, '') }
  end
  alias accessions ac

  # Returns the first accession number in the AC lines.
  def accession
    ac.first
  end

  # Returns the String in the DE line.
  #
  # DE Line
  def de
    @data['DE'] ||= fetch('DE')
  end
  alias description de
  alias definition de # API

  # Returns the contents of the OS line.
  # * os      -> Array of <OS Hash>, where <OS Hash> is:
  #     [{'name'=>'(Human)', 'os'=>'Homo sapiens'},
  #      {'name'=>'(Rat)',   'os'=>'Rattus norveticus'}]
  #   'name' is the parenthesised part including its parentheses, or nil
  #   when the item has none.
  # * os[0]['name'] => "(Human)"
  # * os(0)         => "Homo sapiens (Human)"
  #   (nil when there is no item at that position)
  #
  # Earlier code built the os(num) String with broken interpolation and
  # then discarded it, so os(num) returned the whole Array.
  #
  # OS Line; organism species (>=1)
  #   "OS Trifolium repens (white clover)"
  #
  #   OS Genus species (name).
  #   OS Genus species (name0) (name1).
  #   OS Genus species (name0), G s0 (name0), and G s (name1).
  #
  # Raises RuntimeError when an item does not look like an organism name.
  def os(num = nil)
    unless @data['OS']
      list = []
      fetch('OS').split(/, and|, /).each do |item|
        if item =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
          org = $1
          item =~ /(\(.+\))/ # $1 is nil when there is no "(...)" part
          list.push({'name' => $1, 'os' => org})
        else
          raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
        end
      end
      @data['OS'] = list
    end
    return @data['OS'] unless num

    # EX. "Trifolium repens (white clover)"
    entry = @data['OS'][num]
    return nil unless entry
    [entry['os'], entry['name']].compact.join(' ')
  end

  # Returns the contents of the OG line.
  # * og -> [ <organelle String>* ]
  #
  # OG Line; organelle (0 or 1/entry)
  #   OG Plastid; Chloroplast.
  #   OG Mitochondrion.
  #   OG Plasmid sym pNGR234a.
  #   OG Plastid; Cyanelle.
  #   OG Plasmid pSymA (megaplasmid 1).
  #   OG Plasmid pNRC100, Plasmid pNRC200, and Plasmid pHH1.
  def og
    unless @data['OG']
      list = []
      if get('OG').size > 0
        # Work on copies (no in-place edits of the fetched String) and turn
        # every ';' into ',' -- replacing only the first one left later
        # items glued together.
        text = fetch('OG').sub(/\.$/, '').sub(/ and/, '').gsub(/;/, ',')
        list = text.split(',').map { |item| item.strip }
      end
      @data['OG'] = list
    end
    @data['OG']
  end

  # Returns the contents of the OC line.
  # * oc -> [ <organism class String>* ]
  #
  # A NameError (which includes NoMethodError) raised while parsing is
  # swallowed; nothing is cached in that case and nil is returned.
  #
  # OC Line; organism classification (>=1)
  #   OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;
  #   OC Theileria.
  def oc
    unless @data['OC']
      begin
        # drop the final character (the closing '.'), then split on ';'
        classes = fetch('OC').sub(/.$/,'').split(/;/)
        @data['OC'] = classes.map { |e| e.strip }
      rescue NameError
        nil
      end
    end
    @data['OC']
  end

  # Returns the keywords in the KW line ([] when there is no KW line).
  # * kw -> [ <keyword>* ]
  #
  # KW Line; keyword (>=1)
  #   KW [Keyword;]+
  def kw
    unless @data['KW']
      if get('KW').size > 0
        # drop the final character (the closing '.'), then split on ';'
        @data['KW'] = fetch('KW').sub(/.$/,'').split(/;/).map { |e| e.strip }
      else
        @data['KW'] = []
      end
    end
    @data['KW']
  end
  alias keywords kw

  # Returns the contents of the R lines.
  # * ref -> [ <reference information Hash>* ]
  #   where <reference information Hash> is:
  #     {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
  #      'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
  #
  # Continuation lines of one tag are joined with a blank. Each value is
  # then stripped and loses one leading '"', one trailing ';' and one
  # trailing '"'.
  #
  # R Lines
  # * RN RC RP RX RA RT RL RG
  #
  # Raises RuntimeError on a line that is not an R line.
  def ref
    unless @data['R']
      ary = []
      get('R').split(/\nRN /).each do |str|
        raw = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
               'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
        # the split above consumed the "RN " prefix of every chunk but the
        # first, so put it back
        str = 'RN ' + str unless /^RN / =~ str
        str.split("\n").each do |line|
          # Accept any run of blanks after the tag (TAGSIZE is 5, i.e. the
          # tag column is wider than tag + one blank), so padding does not
          # end up inside the value.
          if /^(R[NPXARLCTG]) +(.+)/ =~ line
            raw[$1] += $2 + ' '
          else
            raise "Invalid format in R lines, \n[#{line}]\n"
          end
        end
        raw.each_key do |tag|
          raw[tag] = raw[tag].strip.sub(/^"/,'').sub(/;$/,'').sub(/"$/,'')
        end
        ary.push(raw)
      end
      @data['R'] = ary
    end
    @data['R']
  end

  # Returns a References object built from #ref, holding one Reference per
  # reference block.
  #
  # Keys handed to Reference.new: 'authors' (Array, RA split on ", "),
  # 'title' (RT), 'journal' / 'volume' / 'issue' / 'pages' / 'year' (parsed
  # from RL when it has the form "Journal VOL (ISSUE), PAGES (YEAR)",
  # otherwise the whole RL value goes to 'journal'), and one lower-cased
  # key per RX cross-reference, e.g. 'medline', 'pubmed', 'doi'.
  def references
    unless @data['references']
      ary = self.ref.map do |ent|
        hash = Hash.new('')
        ent.each do |key, value|
          case key
          when 'RA'
            hash['authors'] = value.split(/, /)
          when 'RT'
            hash['title'] = value
          when 'RL'
            if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
              hash['journal'] = $1
              hash['volume']  = $2
              hash['issue']   = $3
              hash['pages']   = $4
              hash['year']    = $5
            else
              hash['journal'] = value
            end
          when 'RX' # PUBMED, MEDLINE, DOI
            # Items end with ". "; splitting on every '.' would cut
            # identifiers that contain periods (e.g. a DOI) into pieces.
            value.split(/\. /).each do |item|
              tag, xref = item.split(/; /).map { |i| i.strip.sub(/\.\z/, '') }
              next if tag.nil? || tag.empty? # ignore blank items
              hash[ tag.downcase ] = xref
            end
          end
        end
        Reference.new(hash)
      end
      @data['references'] = References.new(ary)
    end
    @data['references']
  end

  # Returns the contents of the DR lines.
  # * dr -> { <database name> => [ [<identifier>, ...], ... ] }
  # * dr {|dbname, list| } yields every pair instead (and returns what
  #   Hash#each returns)
  #
  # DR Line; database cross-reference (>=0)
  # one cross-reference per line
  #   "DR database_identifier; primary_identifier; secondary_identifier."
  def dr
    unless @data['DR']
      table = {}
      self.get('DR').split(/\n/).each do |line|
        # drop the tag and the final character (the closing '.')
        fields = line.sub(/^DR /,'').sub(/.$/,'').strip.split(/;[ ]/)
        dbname = fields.shift
        table[dbname] ||= []
        table[dbname].push(fields)
      end
      @data['DR'] = table
    end
    if block_given?
      @data['DR'].each do |k, v|
        yield(k, v)
      end
    else
      @data['DR']
    end
  end

end # module Common
|
|
330
|
+
end # class EMBLDB
|
|
331
|
+
end # module Bio
|
|
332
|
+
|