bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/embl/embl.rb - EMBL database class
|
|
3
|
+
#
|
|
4
|
+
#
|
|
5
|
+
# Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
|
|
6
|
+
# License:: LGPL
|
|
7
|
+
#
|
|
8
|
+
# $Id: embl.rb,v 1.25 2005/11/02 07:30:14 nakao Exp $
|
|
9
|
+
#
|
|
10
|
+
# == EMBL database entry
|
|
11
|
+
#
|
|
12
|
+
#
|
|
13
|
+
#
|
|
14
|
+
# == Example
|
|
15
|
+
#
|
|
16
|
+
# emb = Bio::EMBL.new($<.read)
|
|
17
|
+
# emb.entry_id
|
|
18
|
+
# emb.each_cds do |cds|
|
|
19
|
+
# cds
|
|
20
|
+
# end
|
|
21
|
+
# emb.seq
|
|
22
|
+
#
|
|
23
|
+
#--
|
|
24
|
+
#
|
|
25
|
+
# This library is free software; you can redistribute it and/or
|
|
26
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
27
|
+
# License as published by the Free Software Foundation; either
|
|
28
|
+
# version 2 of the License, or (at your option) any later version.
|
|
29
|
+
#
|
|
30
|
+
# This library is distributed in the hope that it will be useful,
|
|
31
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
32
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
33
|
+
# Lesser General Public License for more details.
|
|
34
|
+
#
|
|
35
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
36
|
+
# License along with this library; if not, write to the Free Software
|
|
37
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
38
|
+
#
|
|
39
|
+
#++
|
|
40
|
+
#
|
|
41
|
+
|
|
42
|
+
require 'bio/db'
|
|
43
|
+
require 'bio/db/embl/common'
|
|
44
|
+
|
|
45
|
+
module Bio
|
|
46
|
+
class EMBL < EMBLDB
|
|
47
|
+
include Bio::EMBLDB::Common
|
|
48
|
+
|
|
49
|
+
# returns contents in the ID line.
|
|
50
|
+
# * Bio::EMBL#id_line -> <ID Hash>
|
|
51
|
+
# where <ID Hash> is:
|
|
52
|
+
# {'ENTRY_NAME' => String, 'MOLECULE_TYPE' => String, 'DIVISION' => String,
|
|
53
|
+
# 'SEQUENCE_LENGTH' => Int}
|
|
54
|
+
#
|
|
55
|
+
# ID Line
|
|
56
|
+
# "ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."
|
|
57
|
+
#
|
|
58
|
+
# DATA_CLASS = ['standard']
|
|
59
|
+
#
|
|
60
|
+
# MOLECULE_TYPE: DNA RNA XXX
|
|
61
|
+
#
|
|
62
|
+
# Code ( DIVISION )
|
|
63
|
+
# EST (ESTs)
|
|
64
|
+
# PHG (Bacteriophage)
|
|
65
|
+
# FUN (Fungi)
|
|
66
|
+
# GSS (Genome survey)
|
|
67
|
+
# HTC (High Throughput cDNAs)
|
|
68
|
+
# HTG (HTGs)
|
|
69
|
+
# HUM (Human)
|
|
70
|
+
# INV (Invertebrates)
|
|
71
|
+
# ORG (Organelles)
|
|
72
|
+
# MAM (Other Mammals)
|
|
73
|
+
# VRT (Other Vertebrates)
|
|
74
|
+
# PLN (Plants)
|
|
75
|
+
# PRO (Prokaryotes)
|
|
76
|
+
# ROD (Rodents)
|
|
77
|
+
# SYN (Synthetic)
|
|
78
|
+
# STS (STSs)
|
|
79
|
+
# UNC (Unclassified)
|
|
80
|
+
# VRL (Viruses)
|
|
81
|
+
#
|
|
82
|
+
def id_line(key=nil)
  # Parse the ID line on first use and memoize the Hash in @data['ID'].
  @data['ID'] ||= begin
    fields = fetch('ID').split(/; +/)
    # The first field carries both the entry name and the data class.
    entry_name, data_class = fields[0].split(/ +/)
    {
      'ENTRY_NAME'      => entry_name,
      'DATA_CLASS'      => data_class,
      'MOLECULE_TYPE'   => fields[1],
      'DIVISION'        => fields[2],
      # e.g. "166 BP." -> 166
      'SEQUENCE_LENGTH' => fields[3].strip.split(' ').first.to_i
    }
  end

  # Without a key the whole Hash is returned, otherwise the single value.
  return @data['ID'] unless key
  @data['ID'][key]
end
|
|
100
|
+
|
|
101
|
+
# returns ENTRY_NAME in the ID line.
|
|
102
|
+
# * Bio::EMBL#entry -> String
|
|
103
|
+
def entry
  # Shortcut for the 'ENTRY_NAME' value of the parsed ID line.
  name = id_line('ENTRY_NAME')
  name
end
alias entry_name entry
alias entry_id entry
|
|
108
|
+
|
|
109
|
+
# returns MOLECULE_TYPE in the ID line.
|
|
110
|
+
# * Bio::EMBL#molecule -> String
|
|
111
|
+
def molecule
  # Shortcut for the 'MOLECULE_TYPE' value of the parsed ID line.
  type = id_line('MOLECULE_TYPE')
  type
end
alias molecule_type molecule
|
|
115
|
+
|
|
116
|
+
# returns DIVISION in the ID line.
|
|
117
|
+
# * Bio::EMBL#division -> String
|
|
118
|
+
def division
  # Shortcut for the 'DIVISION' value of the parsed ID line.
  code = id_line('DIVISION')
  code
end
|
|
121
|
+
|
|
122
|
+
# returns SEQUENCE_LENGTH in the ID line.
|
|
123
|
+
# * Bio::EMBL#sequence_length -> Int
|
|
124
|
+
def sequence_length
  # Shortcut for the 'SEQUENCE_LENGTH' value (an Integer) of the parsed ID line.
  length = id_line('SEQUENCE_LENGTH')
  length
end
alias seqlen sequence_length
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# AC Line
|
|
131
|
+
# "AC A12345; B23456;"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# returns the version information in the sequence version (SV) line.
|
|
135
|
+
# * Bio::EMBL#sv -> Accession.Version in String
|
|
136
|
+
# * Bio::EMBL#version -> version number in Int
|
|
137
|
+
#
|
|
138
|
+
# SV Line; sequence version (1/entry)
|
|
139
|
+
# SV Accession.Version
|
|
140
|
+
def sv
  # "Accession.Version" text of the SV line with the first ';' removed.
  raw = field_fetch('SV')
  raw.sub(/;/, '')
end
|
|
143
|
+
def version
  # The part after the first '.' in "Accession.Version", as an Integer
  # (0 when there is no version part, because nil.to_i is 0).
  _accession, number = sv.split(".")
  number.to_i
end
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# returns contents in the date (DT) line.
|
|
149
|
+
# * Bio::EMBL#dt -> <DT Hash>
|
|
150
|
+
# where <DT Hash> is:
|
|
151
|
+
# {}
|
|
152
|
+
# * Bio::EMBL#dt(key) -> String
|
|
153
|
+
# keys: 'created' and 'updated'
|
|
154
|
+
#
|
|
155
|
+
# DT Line; date (2/entry)
|
|
156
|
+
# Returns the dates recorded on the DT lines.
#
# key:: optional Hash key, 'created' or 'updated'.
#
# Without a key the memoized Hash {'created' => String, 'updated' => String}
# is returned; with a key only that value. A DT line that is absent from the
# entry yields nil for its key instead of raising NoMethodError.
def dt(key=nil)
  unless @data['DT']
    # Each raw line still carries its two-letter line code ("DT "); remove the
    # first such code and the surrounding blanks.
    created, updated = get('DT').to_s.split(/\n/).map do |line|
      line.sub(/\w{2} /, '').strip
    end
    @data['DT'] = {'created' => created, 'updated' => updated}
  end

  if key
    @data['DT'][key]
  else
    @data['DT']
  end
end
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
##
|
|
174
|
+
# DE Line; description (>=1)
|
|
175
|
+
#
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
##
|
|
179
|
+
# KW Line; keyword (>=1)
|
|
180
|
+
# KW [Keyword;]+
|
|
181
|
+
#
|
|
182
|
+
# Bio::EMBLDB#kw -> Array
|
|
183
|
+
# #keywords -> Array
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
##
|
|
187
|
+
# OS Line; organism species (>=1)
|
|
188
|
+
# OS Genus species (name)
|
|
189
|
+
# "OS Trifolium repens (white clover)"
|
|
190
|
+
#
|
|
191
|
+
# Bio::EMBLDB#os -> Array
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
##
|
|
195
|
+
# OC Line; organism classification (>=1)
|
|
196
|
+
#
|
|
197
|
+
# Bio::EMBLDB#oc -> Array
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
##
|
|
201
|
+
# OG Line; organella (0 or 1/entry)
|
|
202
|
+
# ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
|
|
203
|
+
# or a plasmid name (e.g. "Plasmid pBR322").
|
|
204
|
+
#
|
|
205
|
+
# Bio::EMBLDB#og -> String
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
##
|
|
209
|
+
# R Lines
|
|
210
|
+
# RN RC RP RX RA RT RL
|
|
211
|
+
#
|
|
212
|
+
# Bio::EMBLDB#ref
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
##
|
|
216
|
+
# DR Line; database cross-reference (>=0)
|
|
217
|
+
# "DR database_identifier; primary_identifier; secondary_identifier."
|
|
218
|
+
#
|
|
219
|
+
# Bio::EMBLDB#dr
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# returns feature table header (String) in the feature header (FH) line.
|
|
223
|
+
#
|
|
224
|
+
# FH Line; feature table header (0 or 2)
|
|
225
|
+
def fh
  # Text of the feature table header (FH) lines, as returned by fetch.
  header = fetch('FH')
  header
end
|
|
228
|
+
|
|
229
|
+
# returns contents in the feature table (FT) lines.
|
|
230
|
+
# * Bio::EMBL#ft -> Bio::Features
|
|
231
|
+
# * Bio::EMBL#ft {} -> {|Bio::Feature| }
|
|
232
|
+
#
|
|
233
|
+
# same as features method in bio/db/genbank.rb
|
|
234
|
+
#
|
|
235
|
+
# FT Line; feature table data (>=0)
|
|
236
|
+
# Returns the feature table (FT lines) as a Features object, or yields each
# feature when a block is given. The parsed table is memoized in @data['FT'].
#
# Parsing collects one Array per feature key:
#   [ feature, position, ' /qualifier="value"', ... ]
# and hands each of them to parse_qualifiers.
def ft
  unless @data['FT']
    rows = Array.new
    in_quote = false

    # to_s: an entry without any FT line must give an empty table, not raise.
    @orig['FT'].to_s.each_line do |line|
      next if line =~ /^FEATURES/

      # Columns 21.. hold the location or qualifier text; short lines have none.
      body = (line[20,60] || '').chomp

      if line =~ /^FT {3}(\S+)/
        rows.push([ $1, body ]) # start of a new feature: [ key, position ]
      elsif rows.empty?
        next                    # qualifier/continuation before any feature key
      elsif body =~ /^ \// and not in_quote
        rows.last.push(body)    # /q="data..., /q=data, /q

        # An opening quote that is not closed on the same line starts a
        # multi-line value; following lines belong to it even if they begin
        # with '/'.
        if body =~ /="/ and body !~ /"$/
          in_quote = true
        end
      else
        rows.last.last << body  # ...data..., ...data..."

        if body =~ /"$/
          in_quote = false
        end
      end
    end

    rows.map! do |row|
      parse_qualifiers(row)
    end

    @data['FT'] = Features.new(rows)
  end

  if block_given?
    @data['FT'].each do |feature|
      yield feature
    end
  else
    @data['FT']
  end
end
alias features ft
|
|
279
|
+
|
|
280
|
+
# iterates on CDS features in the FT lines.
|
|
281
|
+
def each_cds
  # Walk the whole feature table and pass on only the 'CDS' features.
  ft.each do |candidate|
    next unless candidate.feature == 'CDS'
    yield candidate
  end
end
|
|
288
|
+
|
|
289
|
+
# iterates on gene features in the FT lines.
|
|
290
|
+
def each_gene
  # Walk the whole feature table and pass on only the 'gene' features.
  ft.each do |candidate|
    next unless candidate.feature == 'gene'
    yield candidate
  end
end
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# returns comment text in the comments (CC) line.
|
|
300
|
+
#
|
|
301
|
+
# CC Line; comments or notes (>=0)
|
|
302
|
+
def cc
  # Raw text of the comment (CC) lines, as returned by get.
  comment = get('CC')
  comment
end
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
##
|
|
308
|
+
# XX Line; spacer line (many)
|
|
309
|
+
# def nxx
|
|
310
|
+
# end
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
# returns sequence header information in the sequence header (SQ) line.
|
|
314
|
+
# * Bio::EMBL#sq -> <SQ Hash>
|
|
315
|
+
# where <SQ Hash> is:
|
|
316
|
+
# {'ntlen' => Int, 'other' => Int,
|
|
317
|
+
# 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
|
|
318
|
+
# * Bio::EMBL#sq(base) -> <base content in Int>
|
|
319
|
+
# * Bio::EMBL#sq[base] -> <base content in Int>
|
|
320
|
+
#
|
|
321
|
+
# SQ Line; sequence header (1/entry)
|
|
322
|
+
# SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
|
|
323
|
+
# Returns the base composition given on the SQ line.
#
# base:: optional base name ('a', 'c', 'g', 't', also 'ntlen' or 'other');
#        upper case and Symbols are accepted.
#
# Without an argument the memoized Hash
#   {'ntlen' => Int, 'other' => Int, 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
# is returned; with an argument only that count. When the SQ line does not
# match the expected layout every count is 0.
def sq(base = nil)
  unless @data['SQ']
    layout = /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/
    found  = layout.match(fetch('SQ').to_s)
    # Capture order: length, A, C, G, T, other.
    ntlen, a, c, g, t, other = found ? found.captures.map { |n| n.to_i } : [0] * 6
    @data['SQ'] = {'ntlen' => ntlen, 'other' => other,
                   'a' => a, 'c' => c, 'g' => g, 't' => t}
  end

  if base
    @data['SQ'][base.to_s.downcase]
  else
    @data['SQ']
  end
end
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# returns the nucleotide sequence in this entry.
|
|
342
|
+
# * Bio::EMBL#seq -> Bio::Sequence::NA
|
|
343
|
+
#
|
|
344
|
+
# @orig[''] as sequence
|
|
345
|
+
# bb Line; (blanks) sequence data (>=1)
|
|
346
|
+
def seq
  # The sequence block is stored under the empty tag; drop the blanks and
  # the position counters before wrapping it.
  residues = fetch('').gsub(/[ \d]/, '')
  Sequence::NA.new(residues)
end
alias naseq seq
alias ntseq seq
|
|
351
|
+
|
|
352
|
+
# // Line; termination line (end; 1/entry)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
### private methods
|
|
356
|
+
|
|
357
|
+
private
|
|
358
|
+
|
|
359
|
+
##
|
|
360
|
+
# same as Bio::GenBank#parse_qualifiers(feature)
|
|
361
|
+
def parse_qualifiers(ary)
  # ary is [ feature key, position, qualifier text, ... ]; the first two
  # elements are consumed (shifted off) here.
  feature = Feature.new
  feature.feature  = ary.shift
  feature.position = ary.shift.gsub(/\s/, '')

  ary.each do |text|
    # /name, /name=value or /name="value"
    next unless text =~ %r{/([^=]+)=?"?([^"]*)"?}
    name  = $1
    value = $2

    # A qualifier without a value acts as a flag.
    value = true if value.empty?

    if name == 'translation'
      value = Sequence::AA.new(value.gsub(/\s/, ''))
    elsif name == 'codon_start'
      value = value.to_i
    end

    feature.append(Feature::Qualifier.new(name, value))
  end

  feature
end
|
|
388
|
+
|
|
389
|
+
end
|
|
390
|
+
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
# Self-test: when this file is run as a script, read EMBL entries from
# ARGF (record separator Bio::EMBL::RS) and print the result of each accessor.
if __FILE__ == $0
  while ent = $<.gets(Bio::EMBL::RS)
    puts "\n ==> e = Bio::EMBL.new(ent) "
    e = Bio::EMBL.new(ent)

    puts "\n ==> e.entry_id "
    p e.entry_id
    puts "\n ==> e.id_line "
    p e.id_line
    # id_line stores the molecule type under 'MOLECULE_TYPE'; the former
    # key 'molecule' never exists in that Hash and always printed nil.
    puts "\n ==> e.id_line('MOLECULE_TYPE') "
    p e.id_line('MOLECULE_TYPE')
    puts "\n ==> e.molecule "
    p e.molecule
    puts "\n ==> e.ac "
    p e.ac
    puts "\n ==> e.sv "
    p e.sv
    puts "\n ==> e.dt "
    p e.dt
    puts "\n ==> e.dt('created') "
    p e.dt('created')
    puts "\n ==> e.de "
    p e.de
    puts "\n ==> e.kw "
    p e.kw
    puts "\n ==> e.os "
    p e.os
    puts "\n ==> e.oc "
    p e.oc
    puts "\n ==> e.og "
    p e.og
    puts "\n ==> e.ref "
    p e.ref
    puts "\n ==> e.dr "
    p e.dr
    puts "\n ==> e.ft "
    p e.ft
    puts "\n ==> e.each_cds {|c| p c}"
    p e.each_cds {|c| p c }
    puts "\n ==> e.sq "
    p e.sq
    puts "\n ==> e.sq('a') "
    p e.sq('a')
    # NOTE(review): gc is not defined in this file; confirm it is provided
    # by a superclass or mixin.
    puts "\n ==> e.gc"
    p e.gc
    puts "\n ==> e.seq "
    p e.seq
  end

end
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
|
|
@@ -0,0 +1,954 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/embl/sptr.rb - UniProt/SwissProt and TrEMBL database class
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
# $Id: sptr.rb,v 1.29 2005/11/02 07:30:14 nakao Exp $
|
|
8
|
+
#
|
|
9
|
+
# == UniProtKB/SwissProt and TrEMBL
|
|
10
|
+
#
|
|
11
|
+
# See the SWISS-PROT document file SPECLIST.TXT.
|
|
12
|
+
#
|
|
13
|
+
# == Example
|
|
14
|
+
#
|
|
15
|
+
#--
|
|
16
|
+
#
|
|
17
|
+
# This library is free software; you can redistribute it and/or
|
|
18
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
19
|
+
# License as published by the Free Software Foundation; either
|
|
20
|
+
# version 2 of the License, or (at your option) any later version.
|
|
21
|
+
#
|
|
22
|
+
# This library is distributed in the hope that it will be useful,
|
|
23
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
24
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
25
|
+
# Lesser General Public License for more details.
|
|
26
|
+
#
|
|
27
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
28
|
+
# License along with this library; if not, write to the Free Software
|
|
29
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
30
|
+
#
|
|
31
|
+
#++
|
|
32
|
+
#
|
|
33
|
+
|
|
34
|
+
require 'bio/db'
|
|
35
|
+
require 'bio/db/embl/common'
|
|
36
|
+
|
|
37
|
+
module Bio
|
|
38
|
+
|
|
39
|
+
# Parser class for UniProtKB/SwissProt and TrEMBL database entry
|
|
40
|
+
class SPTR < EMBLDB
|
|
41
|
+
include Bio::EMBLDB::Common
|
|
42
|
+
|
|
43
|
+
@@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
|
|
44
|
+
@@data_class = ["STANDARD", "PRELIMINARY"]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# returns a Hash of the ID line.
|
|
48
|
+
# returns a content (Int or String) of the ID line by a given key.
|
|
49
|
+
# Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
|
|
50
|
+
#
|
|
51
|
+
# ID Line
|
|
52
|
+
# "ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
|
|
53
|
+
#
|
|
54
|
+
# ENTRY_NAME := "#{X}_#{Y}"
|
|
55
|
+
# X =~ /[A-Z0-9]{1,5}/ # The protein name.
|
|
56
|
+
# Y =~ /[A-Z0-9]{1,5}/ # The biological source of the protein.
|
|
57
|
+
# MOLECULE_TYPE := 'PRT' =~ /\w{3}/
|
|
58
|
+
# SEQUENCE_LENGTH =~ /\d+ AA/
|
|
59
|
+
def id_line(key = nil)
  # Parse the raw ID line on first use and memoize the Hash in @data['ID'].
  @data['ID'] ||= begin
    # Column 0 is the "ID" line code itself.
    cols = @orig['ID'].split(/ +/)
    {
      'ENTRY_NAME'      => cols[1],
      'DATA_CLASS'      => cols[2].sub(/;/, ''),
      'MOLECULE_TYPE'   => cols[3].sub(/;/, ''),
      'SEQUENCE_LENGTH' => cols[4].to_i
    }
  end

  # Hash without a key, single value (String/Int) with one.
  key ? @data['ID'][key] : @data['ID']
end
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# returns a ENTRY_NAME in the ID line.
|
|
80
|
+
#
|
|
81
|
+
# A short-cut for Bio::SPTR#id_line('ENTRY_NAME').
|
|
82
|
+
def entry_id
  # Shortcut for the 'ENTRY_NAME' value of the parsed ID line.
  name = id_line('ENTRY_NAME')
  name
end
alias entry_name entry_id
alias entry entry_id
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# returns a MOLECULE_TYPE in the ID line.
|
|
90
|
+
#
|
|
91
|
+
# A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
|
|
92
|
+
def molecule
  # Shortcut for the 'MOLECULE_TYPE' value of the parsed ID line.
  type = id_line('MOLECULE_TYPE')
  type
end
alias molecule_type molecule
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# returns a SEQUENCE_LENGTH in the ID line.
|
|
99
|
+
#
|
|
100
|
+
# A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
|
|
101
|
+
def sequence_length
  # Shortcut for the 'SEQUENCE_LENGTH' value (an Integer) of the parsed ID line.
  length = id_line('SEQUENCE_LENGTH')
  length
end
alias aalen sequence_length
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Bio::EMBLDB::Common#ac -> ary
|
|
108
|
+
# #accessions -> ary
|
|
109
|
+
# #accession -> String (accessions.first)
|
|
110
|
+
@@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# returns a Hash of information in the DT lines.
|
|
115
|
+
# hash keys:
|
|
116
|
+
# ['created', 'sequence', 'annotation']
|
|
117
|
+
# also Symbols acceptable (ASAP):
|
|
118
|
+
# [:created, :sequence, :annotation]
|
|
119
|
+
#
|
|
120
|
+
# returns a String of information in the DT lines by a given key..
|
|
121
|
+
#
|
|
122
|
+
# DT Line; date (3/entry)
|
|
123
|
+
# DT DD-MMM-YYYY (rel. NN, Created)
|
|
124
|
+
# DT DD-MMM-YYYY (rel. NN, Last sequence update)
|
|
125
|
+
# DT DD-MMM-YYYY (rel. NN, Last annotation update)
|
|
126
|
+
# Returns the dates recorded on the DT lines.
#
# key:: optional Hash key: 'created', 'sequence' or 'annotation'; the
#       equivalent Symbols (:created, :sequence, :annotation) are accepted.
#
# Without a key the memoized Hash is returned; with a key only that String.
# A DT line that is absent from the entry yields nil for its key instead of
# raising NoMethodError.
def dt(key = nil)
  unless @data['DT']
    # Each raw line still carries its two-letter line code ("DT "); remove the
    # first such code and the surrounding blanks.
    dates = get('DT').to_s.split(/\n/).map do |line|
      line.sub(/\w{2} /, '').strip
    end
    @data['DT'] = {
      'created'    => dates[0],
      'sequence'   => dates[1],
      'annotation' => dates[2]
    }
  end

  if key
    @data['DT'][key.to_s]
  else
    @data['DT']
  end
end
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# returns the proposed official name of the protein.
|
|
145
|
+
#
|
|
146
|
+
# DE Line; description (>=1)
|
|
147
|
+
# "DE #{OFFICIAL_NAME} (#{SYNONYM})"
|
|
148
|
+
# "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
|
|
149
|
+
# OFFICIAL_NAME 1/entry
|
|
150
|
+
# SYNONYM >=0
|
|
151
|
+
# CONTAINS >=0
|
|
152
|
+
def protein_name
  de_line = fetch('DE')
  return "" unless de_line

  # Everything before the first '[' (the "contains" part is dropped).
  head = de_line[/^[^\[]*/]
  # The official name ends where the first parenthesised synonym starts.
  official = head[/^[^(]*/].strip
  official << ' (Fragment)' if head =~ /fragment/i
  official
end
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# returns an array of synonyms (unofficial names).
|
|
164
|
+
#
|
|
165
|
+
# synonyms are each placed in () following the official name on the DE line.
|
|
166
|
+
def synonyms
  de_line = fetch('DE')
  return Array.new unless de_line

  # Drop the bracketed "contains" part, then collect every "(..." group.
  without_contains = de_line.sub(/\[.*\]/, '')
  groups = without_contains.scan(/\([^)]+/)

  # "(Fragment" is a flag, not a synonym; [1..-1] removes the leading '('.
  groups.reject { |group| group =~ /fragment/i }.map { |group| group[1..-1].strip }
end
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# returns gene names in the GN line.
|
|
181
|
+
#
|
|
182
|
+
# New UniProt/SwissProt format:
|
|
183
|
+
# * Bio::SPTR#gn -> [ <gene record>* ]
|
|
184
|
+
# where <gene record> is:
|
|
185
|
+
# { :name => '...',
|
|
186
|
+
# :synonyms => [ 's1', 's2', ... ],
|
|
187
|
+
# :loci => [ 'l1', 'l2', ... ],
|
|
188
|
+
# :orfs => [ 'o1', 'o2', ... ]
|
|
189
|
+
# }
|
|
190
|
+
#
|
|
191
|
+
# Old format:
|
|
192
|
+
# * Bio::SPTR#gn -> Array # AND
|
|
193
|
+
# * Bio::SPTR#gn[0] -> Array # OR
|
|
194
|
+
#
|
|
195
|
+
# GN Line: Gene name(s) (>=0, optional)
|
|
196
|
+
def gn
  # Reuse the parsed value when one of the parsers has already stored it.
  cached = @data['GN']
  return cached if cached

  # A "Name=" token marks the structured UniProt layout; anything else is
  # handled by the old-style parser.
  if /Name=/ === fetch('GN')
    gn_uniprot_parser
  else
    gn_old_parser
  end
end
|
|
206
|
+
|
|
207
|
+
# returns contents in the old style GN line.
|
|
208
|
+
# GN Line: Gene name(s) (>=0, optional)
|
|
209
|
+
# GN HNS OR DRDX OR OSMZ OR BGLY.
|
|
210
|
+
# GN CECA1 AND CECA2.
|
|
211
|
+
# GN CECA1 AND (HOGE OR FUGA).
|
|
212
|
+
#
|
|
213
|
+
# GN NAME1 [(AND|OR) NAME]+.
|
|
214
|
+
#
|
|
215
|
+
# Bio::SPTR#gn -> Array # AND
|
|
216
|
+
# #gn[0] -> Array # OR
|
|
217
|
+
# #gene_names -> Array
|
|
218
|
+
def gn_old_parser
  groups = Array.new
  if get('GN').size > 0
    # " AND " separates genes; " OR " separates the alternative names of one
    # gene; the trailing period and the grouping parentheses are dropped.
    and_terms = fetch('GN').sub(/\.$/, '').split(/ AND /)
    groups = and_terms.map do |term|
      term.gsub(/\(|\)/, '').split(/ OR /).map { |name| name.strip }
    end
  end
  # Memoize and return the Array of Arrays.
  @data['GN'] = groups
end
private :gn_old_parser
|
|
231
|
+
|
|
232
|
+
# returns contents in the structured GN line.
|
|
233
|
+
# The new format of the GN line is:
|
|
234
|
+
# GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
|
|
235
|
+
# GN ORFNames=[, ...];
|
|
236
|
+
#
|
|
237
|
+
# * Bio::SPTR#gn -> [ <gene record>* ]
|
|
238
|
+
# where <gene record> is:
|
|
239
|
+
# { :name => '...',
|
|
240
|
+
# :synonyms => [ 's1', 's2', ... ],
|
|
241
|
+
# :loci => [ 'l1', 'l2', ... ],
|
|
242
|
+
# :orfs => [ 'o1', 'o2', ... ]
|
|
243
|
+
# }
|
|
244
|
+
# Parses the structured (UniProt style) GN line and memoizes the result in
# @data['GN'].
#
# Returns an Array with one Hash per gene record:
#   { :name => String, :synonyms => Array, :loci => Array, :orfs => Array }
def gn_uniprot_parser
  @data['GN'] = Array.new
  gn_line = fetch('GN').strip

  # Records are separated by the stand-alone word "and". Whitespace is
  # required on both sides so that names containing "and" (e.g. "hand2")
  # are not torn apart.
  gn_line.split(/\s+and\s+/).each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}

    # each_line(';') replaces String#each(';'), which Ruby 1.9 removed.
    record.each_line(';') do |element|
      # $' is the text after the matched token; chomp(';') removes only the
      # terminator, so an unterminated last element keeps its final character.
      case element
      when /Name=/ then
        gene_hash[:name] = $'.chomp(';')
      when /Synonyms=/ then
        gene_hash[:synonyms] = $'.chomp(';').split(/\s*,\s*/)
      when /OrderedLocusNames=/ then
        gene_hash[:loci] = $'.chomp(';').split(/\s*,\s*/)
      when /ORFNames=/ then
        gene_hash[:orfs] = $'.chomp(';').split(/\s*,\s*/)
      end
    end

    @data['GN'] << gene_hash
  end

  return @data['GN']
end
private :gn_uniprot_parser
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# returns a Array of gene names in the GN line.
|
|
270
|
+
# Returns an Array of gene names taken from the GN line.
#
# For the structured format this is the :name of every gene record; for the
# old format it is the first group of alternative names. An entry without
# gene names gives an empty Array (never nil), so callers may always treat
# the result as an Array.
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  if records.first.class == Hash then
    records.collect { |element| element[:name] }
  else
    records.first || []
  end
end
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# returns a String of the first gene name in the GN line.
|
|
281
|
+
# Returns the first gene name in the GN line as a String, or nil when the
# entry carries no gene name (gene_names may yield nil or an empty Array).
def gene_name
  names = gene_names
  names ? names.first : nil
end
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# returns an Array of Hashes, or a String of the OS line when an index is given.
|
|
287
|
+
# * Bio::EMBLDB#os -> Array
|
|
288
|
+
# [{'name' => '(Human)', 'os' => 'Homo sapiens'},
|
|
289
|
+
# {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
|
|
290
|
+
# * Bio::SPTR#os[0] -> Hash
|
|
291
|
+
# {'name' => "(Human)", 'os' => 'Homo sapiens'}
|
|
292
|
+
# * Bio::SPTR#os[0]['name'] -> "(Human)"
|
|
293
|
+
# * Bio::SPTR#os(0) -> "Homo sapiens (Human)"
|
|
294
|
+
#
|
|
295
|
+
# OS Line; organism species (>=1)
|
|
296
|
+
# OS Genus species (name).
|
|
297
|
+
# OS Genus species (name0) (name1).
|
|
298
|
+
# OS Genus species (name0) (name1).
|
|
299
|
+
# OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
|
|
300
|
+
# OS Homo sapiens (Human), and Rattus norvegicus (Rat)
|
|
301
|
+
# Organism species from the OS line.
#
# Without an argument returns the cached Array of
# {'name' => "(common name)" or nil, 'os' => "Genus species"} Hashes;
# with an index returns "Genus species (common name)" for that organism.
# Raises RuntimeError when an organism chunk cannot be recognised.
def os(num = nil)
  @data['OS'] ||= fetch('OS').split(/, and|, /).map { |chunk|
    species = chunk[/([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/, 1]
    unless species
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
    end
    # The parenthesised part is optional; it stays nil when absent.
    {'name' => chunk[/(\(.+\))/, 1], 'os' => species}
  }

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
end
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# Bio::EMBLDB::Common#og -> Array
|
|
326
|
+
# OG Line; organella (0 or 1/entry)
|
|
327
|
+
# ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
|
|
328
|
+
# or a plasmid name (e.g. "Plasmid pBR322").
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# Bio::EMBLDB::Common#oc -> Array
|
|
332
|
+
# OC Line; organism classification (>=1)
|
|
333
|
+
# "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
|
|
334
|
+
# "OC Theileria."
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
# returns a Hash of organism taxonomy cross-references.
|
|
339
|
+
# * Bio::SPTR#ox -> Hash
|
|
340
|
+
# {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
|
|
341
|
+
#
|
|
342
|
+
# OX Line; organism taxonomy cross-reference (>=1 per entry)
|
|
343
|
+
# OX NCBI_TaxID=1234;
|
|
344
|
+
# OX NCBI_TaxID=1234, 2345, 3456, 4567;
|
|
345
|
+
# Organism taxonomy cross-references from the OX line, as a Hash mapping
# each database name to an Array of identifier Strings. The result is
# cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  xrefs = Hash.new
  fetch('OX').sub(/\.$/,'').split(/;/).each do |item|
    db, refs = item.strip.split(/=/)
    xrefs[db] = refs.split(/, */)
  end
  @data['OX'] = xrefs
  return @data['OX']
end
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# Bio::EMBLDB::Common#ref -> Array
|
|
360
|
+
# R Lines
|
|
361
|
+
# RN RC RP RX RA RT RL
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# List of CC-line topic names (the TOPIC in "CC   -!- TOPIC: ...").
# NOTE(review): this list is not referenced in the visible part of the file,
# and it lacks 'INTERACTION', which #cc handles explicitly -- confirm whether
# it is still used and meant to be complete.
@@cc_topics = ['ALTERNATIVE PRODUCTS','CATALYTIC ACTIVITY','CAUTION',
'COFACTOR','DATABASE','DEVELOPMENTAL STAGE','DISEASE','DOMAIN',
'ENZYME REGULATION','FUNCTION','INDUCTION','MASS SPECTROMETRY',
'MISCELLANEOUS','PATHWAY','PHARMACEUTICAL','POLYMORPHISM','PTM',
'SIMILARITY','SUBCELLULAR LOCATION','SUBUNIT','TISSUE SPECIFICITY']
|
|
369
|
+
# returns contents in the CC lines.
|
|
370
|
+
# * Bio::SPTR#cc -> Hash
|
|
371
|
+
|
|
372
|
+
# * Bio::SPTR#cc(Int) -> String
|
|
373
|
+
# returns an Array of contents in the TOPIC string.
|
|
374
|
+
# * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
|
|
375
|
+
#
|
|
376
|
+
# returns contents of the "ALTERNATIVE PRODUCTS".
|
|
377
|
+
# * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
|
|
378
|
+
# {'Event' => str,
|
|
379
|
+
# 'Named isoforms' => int,
|
|
380
|
+
# 'Comment' => str,
|
|
381
|
+
# 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
|
|
382
|
+
#
|
|
383
|
+
# CC -!- ALTERNATIVE PRODUCTS:
|
|
384
|
+
# CC Event=Alternative splicing; Named isoforms=15;
|
|
385
|
+
# ...
|
|
386
|
+
# CC placentae isoforms. All tissues differentially splice exon 13;
|
|
387
|
+
# CC Name=A; Synonyms=no del;
|
|
388
|
+
# CC IsoId=P15529-1; Sequence=Displayed;
|
|
389
|
+
#
|
|
390
|
+
# returns contents of the "DATABASE".
|
|
391
|
+
# * Bio::SPTR#cc('DATABASE') -> Array
|
|
392
|
+
# [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
|
|
393
|
+
#
|
|
394
|
+
# CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
|
|
395
|
+
#
|
|
396
|
+
# returns contents of the "MASS SPECTROMETRY".
|
|
397
|
+
# * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
|
|
398
|
+
# [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
|
|
399
|
+
#
|
|
400
|
+
# MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
|
|
401
|
+
#
|
|
402
|
+
# CC lines (>=0, optional)
|
|
403
|
+
# CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
|
|
404
|
+
# CC IN LIVER, KIDNEY, LUNG AND BRAIN.
|
|
405
|
+
#
|
|
406
|
+
# CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
|
|
407
|
+
# CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
|
|
408
|
+
# Returns the contents of the CC (comment) lines.
#
# tag::
#   * nil                    -> Hash {topic => [text, ...]} of every topic
#   * 'ALTERNATIVE PRODUCTS' -> Hash with 'Event', 'Named isoforms',
#                               'Comment' and 'Variants' (Array of Hash)
#   * 'DATABASE'             -> Array of {'NAME','NOTE','WWW','FTP'} Hashes
#   * 'MASS SPECTROMETRY'    -> Array of {'MW','MW_ERR','METHOD','RANGE'}
#                               Hashes (the historical misspelling
#                               'MASS SPECTOROMETRY' is still accepted)
#   * 'INTERACTION'          -> Array of interaction Hashes
#   * any other String       -> Array of raw texts for that topic, or nil
#
# Raises RuntimeError when a comment block does not start with "TOPIC: ".
def cc(tag = nil)
  unless @data['CC']
    cc = Hash.new
    cmt = '-' * (77 - 4 + 1)   # separator line preceding the copyright notice
    dlm = /-!- /

    return cc if get('CC').size == 0 # 12KD_MYCSM has no CC lines.

    begin
      fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key  = $1
          body = $2.gsub(/- (?!AND)/,'-')   # re-join words hyphenated at a line break
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '',
                 tmp, '', '', fetch('CC'),''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this clause also covers a
      # nil receiver (e.g. nothing before the separator line); the separate
      # `rescue NoMethodError` that used to follow could never run.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  case tag
  when 'ALTERNATIVE PRODUCTS'
    # Array#to_s returns the inspect form since Ruby 1.9; join keeps the
    # plain concatenation the parser expects (and yields '' for a missing topic).
    ap = Array(@data['CC']['ALTERNATIVE PRODUCTS']).join

    # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
    tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants' => []}

    tmp['Event']          = $1 if /Event=(.+?);/ =~ ap
    tmp['Named isoforms'] = $1 if /Named isoforms=(\S+?);/ =~ ap
    tmp['Comment']        = $1 if /Comment=(.+?);/m =~ ap
    ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
      tmp['Variants'] << cc_ap_variants_parse(ent)
    end
    return tmp

  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries = @data['CC']['DATABASE']
    return entries unless entries

    return entries.map { |e|
      db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop only a real terminating period (the text may carry trailing
      # blanks), instead of blindly removing the last character.
      e.strip.sub(/\.\z/,'').split(/;/).each do |line|
        case line
        when /NAME=(.+)/
          db['NAME'] = $1
        when /NOTE=(.+)/
          db['NOTE'] = $1
        when /WWW="(.+)"/
          db['WWW'] = $1
        when /FTP="(.+)"/
          db['FTP'] = $1
        end
      end
      db
    }

  when 'MASS SPECTROMETRY', 'MASS SPECTOROMETRY'
    # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
    # The misspelled tag is kept as an alias for existing callers; the data
    # is always stored under the topic name as it appears in the entry.
    ms = @data['CC']['MASS SPECTROMETRY']
    return ms unless ms

    return ms.map { |m|
      mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
      m.strip.sub(/\.\z/,'').split(/;/).each do |line|
        case line
        when /MW=(.+)/
          mass['MW'] = $1.to_f
        when /MW_ERR=(.+)/
          mass['MW_ERR'] = $1.to_f
        when /METHOD="?([^"]+)"?/   # values are normally unquoted
          mass['METHOD'] = $1.to_s
        when /RANGE="?(\d+-\d+)"?/
          mass['RANGE'] = $1 # RANGE class ?
        end
      end
      mass
    }

  when 'INTERACTION'
    return cc_interaction_parse(Array(@data['CC']['INTERACTION']).join)

  when nil
    return @data['CC']

  else
    return @data['CC'][tag]
  end
end
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
# Turns one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" isoform
# description into a Hash keyed by field name. The 'Sequence' value is
# returned as an Array (comma-separated list); other values stay Strings.
def cc_ap_variants_parse(ent)
  variant = {}
  ent.split(/; /).each do |field|
    key, value = field.split(/=/)
    value = value.sub(/;/,'').split(/, /) if key == 'Sequence'
    variant[key] = value
  end
  variant
end
|
|
539
|
+
private :cc_ap_variants_parse
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# returns contents of a line of the CC INTERACTION section.
|
|
543
|
+
#
|
|
544
|
+
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
|
545
|
+
# Extracts every "partner; NbExp=n; IntAct=acc1, acc2;" triple from the
# INTERACTION text and returns an Array of Hashes with the keys
# :partner_id, :nbexp (both Strings) and :intact_acc (Array of Strings).
def cc_interaction_parse(str)
  str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/).collect do |partner, count, accessions|
    {:partner_id => partner.strip,
     :nbexp      => count.strip,
     :intact_acc => accessions.split(', ')}
  end
end
|
|
553
|
+
private :cc_interaction_parse
|
|
554
|
+
|
|
555
|
+
# returns databases cross-references in the DR lines.
|
|
556
|
+
# * Bio::EMBLDB#dr -> Hash w/in Array
|
|
557
|
+
#
|
|
558
|
+
# DR Line; database cross-reference (>=0)
|
|
559
|
+
# one cross-reference per line
|
|
560
|
+
# DR database_identifier; primary_identifier; secondary_identifier.
|
|
561
|
+
# Database identifiers that may appear in the first field of a DR line.
# ('MGD' and 'MIM' were previously fused into 'MGDMIM' by a missing comma:
# adjacent string literals are concatenated by Ruby.)
@@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
  'ECOGENE',
  'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
  'MAIZE-2DPAGE','MENDEL','MGD','MIM','PDB','PFAM','PIR','PRINTS',
  'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
  'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
|
|
567
|
+
|
|
568
|
+
# Bio::EMBLDB::Common#kw - Array
|
|
569
|
+
# #keywords -> Array
|
|
570
|
+
#
|
|
571
|
+
# KW Line; keyword (>=1)
|
|
572
|
+
# KW [Keyword;]+
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
# returns contents of the feature table.
|
|
576
|
+
# * Bio::SPTR#ft -> Hash
|
|
577
|
+
# {'feature_name' => [{'From' => str, 'To' => str,
|
|
578
|
+
# 'Description' => str, 'FTId' => str}],...}
|
|
579
|
+
#
|
|
580
|
+
# returns an Array of the information about the feature_name in the feature table.
|
|
581
|
+
# * Bio::SPTR#ft(feature_name) -> Array of Hash
|
|
582
|
+
# [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
|
|
583
|
+
#
|
|
584
|
+
# FT Line; feature table data (>=0, optional)
|
|
585
|
+
#
|
|
586
|
+
# Col Data item
|
|
587
|
+
# ----- -----------------
|
|
588
|
+
# 1- 2 FT
|
|
589
|
+
# 6-13 Feature name
|
|
590
|
+
# 15-20 `FROM' endpoint
|
|
591
|
+
# 22-27 `TO' endpoint
|
|
592
|
+
# 35-75 Description (>=0 per key)
|
|
593
|
+
# ----- -----------------
|
|
594
|
+
# Parses the FT (feature table) lines once, caches the result in
# @data['FT'], and returns either the whole table or one feature's entries.
#
# feature_name:: nil returns the Hash {feature name => [entry, ...]};
#                otherwise the Array stored under that name (nil if absent).
#
# Each entry is a Hash with 'From' and 'To' (Integers via to_i),
# 'Description', 'diff' ([original, swap] or []) and 'FTId'.
# Any exception raised while scanning the lines is re-raised as a
# RuntimeError whose message contains the entry id and the raw FT text.
def ft(feature_name = nil)
  unless @data['FT']
    table = Hash.new()
    last_feature = nil

    begin
      get('FT').split(/\n/).each {|line|

        # 0-based slice 5..12 corresponds to columns 6-13 (feature name).
        feature = line[5..12].strip

        if feature == '' and line[34..74]
          # Continuation line: no feature name, but description text present.
          # NOTE(review): raises if the previous entry's 'Description' is nil
          # (feature line without description columns) -- it is then reported
          # through the rescue below as an invalid FT line.
          tmp = ' ' + line[34..74].strip
          table[last_feature].last['Description'] << tmp

          # Only a continuation line ending in '.' falls through to the
          # variant analysis below.
          next unless /\.$/ =~ line
        else
          # New feature line: slices 14..19 / 21..26 / 34..74 are the
          # FROM, TO and description columns.
          from = line[14..19].strip
          to = line[21..26].strip
          desc = line[34..74].strip if line[34..74]

          table[feature] = [] unless table[feature]
          table[feature] << {
            'From' => from.to_i,
            'To' => to.to_i,
            'Description' => desc,
            'diff' => [],
            'FTId' => nil }
          last_feature = feature
          # A feature line never reaches the variant analysis below; 'diff'
          # and 'FTId' are therefore only filled in after a continuation line.
          next
        end

        case last_feature
        when 'VARSPLIC', 'VARIANT', 'CONFLICT'
          if /FTId=(.+?)\./ =~ line # version 41 >
            # Move the feature identifier out of the description text.
            ftid = $1
            table[last_feature].last['FTId'] = ftid
            table[last_feature].last['Description'].sub!(/ \/FTId=#{ftid}./,'')
          end

          # Derive the [original, swap] residue pair from the description.
          case table[last_feature].last['Description']
          when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
            original = $1
            swap = $2
            original = original.gsub(/ /,'').strip
            swap = swap.gsub(/ /,'').strip
          when /Missing/i
            # Deleted region: the original residues are read from the sequence.
            original = seq.subseq(table[last_feature].last['From'],
                                  table[last_feature].last['To'])
            swap = ''
          else
            raise line
          end
          table[last_feature].last['diff'] = [original, swap]
        end
      }

    rescue
      raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n" +
            "'#{self.get('FT')}'\n"
    end

    # Second pass: tidy the description texts in place.
    table.each_key do |k|
      table[k].each do |e|
        if / -> / =~ e['Description']
          # Remove blanks inside the upper-case runs on both sides of "->".
          pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
          e['Description'].sub!(pattern) {
            a = $1
            b = $2
            a.gsub(/ /,'') + " -> " + b.gsub(/ /,'')
          }
        end
        if /- [\w\d]/ =~ e['Description']
          # Close up "x- y" to "x-y", except when the hyphen is followed by " AND".
          e['Description'].gsub!(/([\w\d]- [\w\d]+)/) {
            a = $1
            if /- AND/ =~ a
              a
            else
              a.sub(/ /,'')
            end
          }
        end
      end
    end
    @data['FT'] = table
  end

  if feature_name
    @data['FT'][feature_name]
  else
    @data['FT']
  end
end
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
# returns a Hash of contents of the SQ lines.
|
|
689
|
+
# * Bio::SPTR#sq -> hsh
|
|
690
|
+
#
|
|
691
|
+
# returns a value of a key given in the SQ lines.
|
|
692
|
+
# * Bio::SPTR#sq(key) -> int or str
|
|
693
|
+
# * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length', 'CRC64']
|
|
694
|
+
#
|
|
695
|
+
# SQ Line; sequence header (1/entry)
|
|
696
|
+
# SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
|
|
697
|
+
# SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
|
|
698
|
+
#
|
|
699
|
+
# MW, Dalton unit.
|
|
700
|
+
# CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
|
|
701
|
+
# Sequence header (SQ line) information.
#
# Without a key returns the cached Hash
# {'aalen' => Integer, 'MW' => Integer, 'CRC64' => String}.
# With a key, anything matching mw/molecular/weight selects the molecular
# weight, anything matching len/length/AA selects the length, and any other
# key is looked up in the Hash directly.
# Raises RuntimeError when the SQ line does not have the expected layout.
def sq(key = nil)
  if !@data['SQ']
    unless fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
    length, weight, checksum = $1.to_i, $2.to_i, $3
    @data['SQ'] = { 'aalen' => length, 'MW' => weight, 'CRC64' => checksum }
  end

  header = @data['SQ']
  return header unless key

  if [/mw/, /molecular/, /weight/].any? { |re| re === key }
    header['MW']
  elsif [/len/, /length/, /AA/].any? { |re| re === key }
    header['aalen']
  else
    header[key]
  end
end
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
# returns a Bio::Sequence::AA of the amino acid sequence.
|
|
726
|
+
# * Bio::SPTR#seq -> Bio::Sequence::AA
|
|
727
|
+
#
|
|
728
|
+
# blank Line; sequence data (>=1)
|
|
729
|
+
# Amino acid sequence of the entry as a Sequence::AA object. Blanks and
# digit runs are removed from the raw sequence block; the object is built
# once and cached under the empty-string key of @data.
def seq
  @data[''] ||= Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
end
|
|
735
|
+
alias aaseq seq
|
|
736
|
+
|
|
737
|
+
end # class SPTR
|
|
738
|
+
|
|
739
|
+
end # module Bio
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
if __FILE__ == $0
  # Self-test: prints every accessor of Bio::SPTR for each entry read from
  # the files given on the command line (or from standard input).
  #
  # Usage: ruby __FILE__ uniprot_sprot.dat
  # Usage: ruby __FILE__ uniprot_sprot.dat | egrep '^RuntimeError'

  begin
    require 'pp'
    # Pretty-print the results below when pp is available. (The alias was
    # written the other way round, which replaced pp by p and had no effect
    # on this script.)
    alias p pp
  rescue LoadError
  end

  # Evaluates +cmd+ (Ruby source referring to $ent) and prints its value.
  # When +tag+ is given, the raw lines for that tag are printed first.
  # A RuntimeError raised by the parser is reported instead of aborting.
  def cmd(cmd, tag = nil, ent = $ent)
    puts " ==> #{cmd} "
    puts Bio::SPTR.new(ent).get(tag) if tag
    begin
      p eval(cmd)
    rescue RuntimeError
      puts "RuntimeError(#{Bio::SPTR.new(ent).entry_id}): #{$!} "
    end
    puts
  end


  while $ent = $<.gets(Bio::SPTR::RS)

    cmd "Bio::SPTR.new($ent).entry_id"

    cmd "Bio::SPTR.new($ent).id_line", 'ID'
    cmd "Bio::SPTR.new($ent).entry"
    cmd "Bio::SPTR.new($ent).entry_name"
    cmd "Bio::SPTR.new($ent).molecule"
    cmd "Bio::SPTR.new($ent).sequence_length"

    cmd "Bio::SPTR.new($ent).ac", 'AC'
    cmd "Bio::SPTR.new($ent).accession"


    cmd "Bio::SPTR.new($ent).gn", 'GN'
    cmd "Bio::SPTR.new($ent).gene_name"
    cmd "Bio::SPTR.new($ent).gene_names"

    cmd "Bio::SPTR.new($ent).dt", "DT"
    ['created','annotation','sequence'].each do |key|
      cmd "Bio::SPTR.new($ent).dt('#{key}')"
    end

    cmd "Bio::SPTR.new($ent).de", 'DE'
    cmd "Bio::SPTR.new($ent).definition"
    cmd "Bio::SPTR.new($ent).protein_name"
    cmd "Bio::SPTR.new($ent).synonyms"

    cmd "Bio::SPTR.new($ent).kw", 'KW'

    cmd "Bio::SPTR.new($ent).os", 'OS'

    cmd "Bio::SPTR.new($ent).oc", 'OC'

    cmd "Bio::SPTR.new($ent).og", 'OG'

    cmd "Bio::SPTR.new($ent).ox", 'OX'

    cmd "Bio::SPTR.new($ent).ref", 'R'

    cmd "Bio::SPTR.new($ent).cc", 'CC'
    cmd "Bio::SPTR.new($ent).cc('ALTERNATIVE PRODUCTS')"
    cmd "Bio::SPTR.new($ent).cc('DATABASE')"
    cmd "Bio::SPTR.new($ent).cc('MASS SPECTROMETRY')"

    cmd "Bio::SPTR.new($ent).dr", 'DR'

    cmd "Bio::SPTR.new($ent).ft", 'FT'
    cmd "Bio::SPTR.new($ent).ft['DOMAIN']"

    cmd "Bio::SPTR.new($ent).sq", "SQ"
    cmd "Bio::SPTR.new($ent).seq"
  end

end
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
=begin
|
|
822
|
+
|
|
823
|
+
= Bio::SPTR < Bio::DB
|
|
824
|
+
|
|
825
|
+
Class for a entry in the SWISS-PROT/TrEMBL database.
|
|
826
|
+
|
|
827
|
+
* ((<URL:http://www.ebi.ac.uk/swissprot/>))
|
|
828
|
+
* ((<URL:http://www.ebi.ac.uk/trembl/>))
|
|
829
|
+
* ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
--- Bio::SPTR.new(a_sp_entry)
|
|
833
|
+
|
|
834
|
+
=== ID line (Identification)
|
|
835
|
+
|
|
836
|
+
--- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
|
|
837
|
+
'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
|
|
838
|
+
--- Bio::SPTR#id_line(key) -> str
|
|
839
|
+
|
|
840
|
+
key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
|
|
841
|
+
|
|
842
|
+
--- Bio::SPTR#entry_id -> str
|
|
843
|
+
--- Bio::SPTR#molecule -> str
|
|
844
|
+
--- Bio::SPTR#sequence_length -> int
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
=== AC lines (Accession number)
|
|
848
|
+
|
|
849
|
+
--- Bio::SPTR#ac -> ary
|
|
850
|
+
--- Bio::SPTR#accessions -> ary
|
|
851
|
+
--- Bio::SPTR#accession -> accessions.first
|
|
852
|
+
|
|
853
|
+
|
|
854
|
+
=== GN line (Gene name(s))
|
|
855
|
+
|
|
856
|
+
--- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
|
|
857
|
+
--- Bio::SPTR#gene_name -> str
|
|
858
|
+
--- Bio::SPTR#gene_names -> [str] or [str]
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
=== DT lines (Date)
|
|
862
|
+
|
|
863
|
+
--- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
|
|
864
|
+
--- Bio::SPTR#dt(key) -> str
|
|
865
|
+
|
|
866
|
+
key := (created|annotation|sequence)
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
=== DE lines (Description)
|
|
870
|
+
|
|
871
|
+
--- Bio::SPTR#de -> str
|
|
872
|
+
#definition -> str
|
|
873
|
+
|
|
874
|
+
--- Bio::SPTR#protein_name
|
|
875
|
+
|
|
876
|
+
Returns the proposed official name of the protein
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
--- Bio::SPTR#synonyms
|
|
880
|
+
|
|
881
|
+
Returns an array of synonyms (unofficial names)
|
|
882
|
+
|
|
883
|
+
=== KW lines (Keyword)
|
|
884
|
+
|
|
885
|
+
--- Bio::SPTR#kw -> ary
|
|
886
|
+
|
|
887
|
+
=== OS lines (Organism species)
|
|
888
|
+
|
|
889
|
+
--- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
|
|
890
|
+
|
|
891
|
+
=== OC lines (organism classification)
|
|
892
|
+
|
|
893
|
+
--- Bio::SPTR#oc -> ary
|
|
894
|
+
|
|
895
|
+
=== OG line (Organella)
|
|
896
|
+
|
|
897
|
+
--- Bio::SPTR#og -> ary
|
|
898
|
+
|
|
899
|
+
=== OX line (Organism taxonomy cross-reference)
|
|
900
|
+
|
|
901
|
+
--- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
|
|
902
|
+
|
|
903
|
+
=== RN RC RP RX RA RT RL RG lines (Reference)
|
|
904
|
+
|
|
905
|
+
--- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RG' => str},...]
|
|
906
|
+
|
|
907
|
+
=== DR lines (Database cross-reference)
|
|
908
|
+
|
|
909
|
+
--- Bio::SPTR#dr -> {'EMBL' => ary, ...}
|
|
910
|
+
|
|
911
|
+
=== FT lines (Feature table data)
|
|
912
|
+
|
|
913
|
+
--- Bio::SPTR#ft -> hsh
|
|
914
|
+
|
|
915
|
+
=== SQ lines (Sequence header and data)
|
|
916
|
+
|
|
917
|
+
--- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
|
|
918
|
+
--- Bio::SPTR#sq(key) -> int or str
|
|
919
|
+
|
|
920
|
+
key := (aalen|MW|CRC64)
|
|
921
|
+
|
|
922
|
+
--- Bio::SPTR#seq -> Bio::Sequence::AA
|
|
923
|
+
#aaseq -> Bio::Sequence::AA
|
|
924
|
+
|
|
925
|
+
=end
|
|
926
|
+
|
|
927
|
+
# Content Occurrence in an entry
|
|
928
|
+
# ---- --------------------------- --------------------------------
|
|
929
|
+
# ID - identification (begins each entry; 1 per entry)
|
|
930
|
+
# AC - accession number(s) (>=1 per entry)
|
|
931
|
+
# DT - date (3 per entry)
|
|
932
|
+
# DE - description (>=1 per entry)
|
|
933
|
+
# GN - gene name(s) (>=0 per entry; optional)
|
|
934
|
+
# OS - organism species (>=1 per entry)
|
|
935
|
+
# OG - organelle (0 or 1 per entry; optional)
|
|
936
|
+
# OC - organism classification (>=1 per entry)
|
|
937
|
+
# OX - organism taxonomy x-ref (>=1 per entry)
|
|
938
|
+
# RN - reference number (>=1 per entry)
|
|
939
|
+
# RP - reference positions (>=1 per entry)
|
|
940
|
+
# RC - reference comment(s) (>=0 per entry; optional)
|
|
941
|
+
# RX - reference cross-reference(s) (>=0 per entry; optional)
|
|
942
|
+
# RA - reference author(s) (>=1 per entry)
|
|
943
|
+
# RT - reference title (>=0 per entry; optional)
|
|
944
|
+
# RL - reference location (>=1 per entry)
|
|
945
|
+
# CC - comments or notes (>=0 per entry; optional)
|
|
946
|
+
# DR - database cross-references (>=0 per entry; optional)
|
|
947
|
+
# KW - keywords (>=1 per entry)
|
|
948
|
+
# FT - feature table data (>=0 per entry; optional)
|
|
949
|
+
# SQ - sequence header (1 per entry)
|
|
950
|
+
# - (blanks) The sequence data (>=1 per entry)
|
|
951
|
+
# // - termination line (ends each entry; 1 per entry)
|
|
952
|
+
# ---- --------------------------- --------------------------------
|
|
953
|
+
|
|
954
|
+
|