RubyGems - bio - Versions diffs - 0.7.0 - Mend

bio 0.7.0

Files changed (201) hide show

data/bin/bioruby +107 -0
data/bin/br_biofetch.rb +59 -0
data/bin/br_bioflat.rb +294 -0
data/bin/br_biogetseq.rb +57 -0
data/bin/br_pmfetch.rb +431 -0
data/doc/BioRuby.rd.ja +225 -0
data/doc/Changes-0.7.rd +236 -0
data/doc/Design.rd.ja +341 -0
data/doc/KEGG_API.rd +1437 -0
data/doc/KEGG_API.rd.ja +1399 -0
data/doc/TODO.rd.ja +138 -0
data/doc/Tutorial.rd +1138 -0
data/doc/Tutorial.rd.ja +2110 -0
data/etc/bioinformatics/seqdatabase.ini +210 -0
data/lib/bio.rb +256 -0
data/lib/bio/alignment.rb +1906 -0
data/lib/bio/appl/bl2seq/report.rb +350 -0
data/lib/bio/appl/blast.rb +269 -0
data/lib/bio/appl/blast/format0.rb +1402 -0
data/lib/bio/appl/blast/format8.rb +95 -0
data/lib/bio/appl/blast/report.rb +652 -0
data/lib/bio/appl/blast/rexml.rb +151 -0
data/lib/bio/appl/blast/wublast.rb +553 -0
data/lib/bio/appl/blast/xmlparser.rb +222 -0
data/lib/bio/appl/blat/report.rb +392 -0
data/lib/bio/appl/clustalw.rb +191 -0
data/lib/bio/appl/clustalw/report.rb +154 -0
data/lib/bio/appl/emboss.rb +68 -0
data/lib/bio/appl/fasta.rb +262 -0
data/lib/bio/appl/fasta/format10.rb +428 -0
data/lib/bio/appl/fasta/format6.rb +37 -0
data/lib/bio/appl/genscan/report.rb +570 -0
data/lib/bio/appl/hmmer.rb +129 -0
data/lib/bio/appl/hmmer/report.rb +556 -0
data/lib/bio/appl/mafft.rb +222 -0
data/lib/bio/appl/mafft/report.rb +119 -0
data/lib/bio/appl/psort.rb +555 -0
data/lib/bio/appl/psort/report.rb +473 -0
data/lib/bio/appl/sim4.rb +134 -0
data/lib/bio/appl/sim4/report.rb +501 -0
data/lib/bio/appl/sosui/report.rb +166 -0
data/lib/bio/appl/spidey/report.rb +604 -0
data/lib/bio/appl/targetp/report.rb +283 -0
data/lib/bio/appl/tmhmm/report.rb +238 -0
data/lib/bio/command.rb +166 -0
data/lib/bio/data/aa.rb +354 -0
data/lib/bio/data/codontable.rb +740 -0
data/lib/bio/data/na.rb +226 -0
data/lib/bio/db.rb +340 -0
data/lib/bio/db/aaindex.rb +280 -0
data/lib/bio/db/embl/common.rb +332 -0
data/lib/bio/db/embl/embl.rb +446 -0
data/lib/bio/db/embl/sptr.rb +954 -0
data/lib/bio/db/embl/swissprot.rb +32 -0
data/lib/bio/db/embl/trembl.rb +31 -0
data/lib/bio/db/embl/uniprot.rb +32 -0
data/lib/bio/db/fantom.rb +604 -0
data/lib/bio/db/fasta.rb +869 -0
data/lib/bio/db/genbank/common.rb +299 -0
data/lib/bio/db/genbank/ddbj.rb +34 -0
data/lib/bio/db/genbank/genbank.rb +354 -0
data/lib/bio/db/genbank/genpept.rb +73 -0
data/lib/bio/db/genbank/refseq.rb +31 -0
data/lib/bio/db/gff.rb +106 -0
data/lib/bio/db/go.rb +497 -0
data/lib/bio/db/kegg/brite.rb +51 -0
data/lib/bio/db/kegg/cell.rb +88 -0
data/lib/bio/db/kegg/compound.rb +130 -0
data/lib/bio/db/kegg/enzyme.rb +125 -0
data/lib/bio/db/kegg/expression.rb +173 -0
data/lib/bio/db/kegg/genes.rb +293 -0
data/lib/bio/db/kegg/genome.rb +362 -0
data/lib/bio/db/kegg/glycan.rb +213 -0
data/lib/bio/db/kegg/keggtab.rb +418 -0
data/lib/bio/db/kegg/kgml.rb +299 -0
data/lib/bio/db/kegg/ko.rb +178 -0
data/lib/bio/db/kegg/reaction.rb +97 -0
data/lib/bio/db/litdb.rb +131 -0
data/lib/bio/db/medline.rb +317 -0
data/lib/bio/db/nbrf.rb +199 -0
data/lib/bio/db/pdb.rb +38 -0
data/lib/bio/db/pdb/atom.rb +60 -0
data/lib/bio/db/pdb/chain.rb +117 -0
data/lib/bio/db/pdb/model.rb +106 -0
data/lib/bio/db/pdb/pdb.rb +1682 -0
data/lib/bio/db/pdb/residue.rb +122 -0
data/lib/bio/db/pdb/utils.rb +234 -0
data/lib/bio/db/prosite.rb +616 -0
data/lib/bio/db/rebase.rb +417 -0
data/lib/bio/db/transfac.rb +387 -0
data/lib/bio/feature.rb +201 -0
data/lib/bio/io/brdb.rb +103 -0
data/lib/bio/io/das.rb +471 -0
data/lib/bio/io/dbget.rb +212 -0
data/lib/bio/io/ddbjxml.rb +614 -0
data/lib/bio/io/fastacmd.rb +123 -0
data/lib/bio/io/fetch.rb +114 -0
data/lib/bio/io/flatfile.rb +496 -0
data/lib/bio/io/flatfile/bdb.rb +266 -0
data/lib/bio/io/flatfile/index.rb +1308 -0
data/lib/bio/io/flatfile/indexer.rb +778 -0
data/lib/bio/io/higet.rb +92 -0
data/lib/bio/io/keggapi.rb +863 -0
data/lib/bio/io/pubmed.rb +189 -0
data/lib/bio/io/registry.rb +308 -0
data/lib/bio/io/soapwsdl.rb +114 -0
data/lib/bio/io/sql.rb +428 -0
data/lib/bio/location.rb +650 -0
data/lib/bio/pathway.rb +991 -0
data/lib/bio/reference.rb +308 -0
data/lib/bio/sequence.rb +593 -0
data/lib/bio/shell.rb +51 -0
data/lib/bio/shell/core.rb +512 -0
data/lib/bio/shell/plugin/codon.rb +228 -0
data/lib/bio/shell/plugin/entry.rb +85 -0
data/lib/bio/shell/plugin/flatfile.rb +119 -0
data/lib/bio/shell/plugin/keggapi.rb +187 -0
data/lib/bio/shell/plugin/midi.rb +448 -0
data/lib/bio/shell/plugin/obda.rb +63 -0
data/lib/bio/shell/plugin/seq.rb +238 -0
data/lib/bio/shell/session.rb +214 -0
data/lib/bio/util/color_scheme.rb +214 -0
data/lib/bio/util/color_scheme/buried.rb +78 -0
data/lib/bio/util/color_scheme/helix.rb +78 -0
data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
data/lib/bio/util/color_scheme/strand.rb +78 -0
data/lib/bio/util/color_scheme/taylor.rb +69 -0
data/lib/bio/util/color_scheme/turn.rb +78 -0
data/lib/bio/util/color_scheme/zappo.rb +69 -0
data/lib/bio/util/contingency_table.rb +337 -0
data/lib/bio/util/sirna.rb +306 -0
data/lib/bioruby.rb +34 -0
data/sample/biofetch.rb +475 -0
data/sample/color_scheme_na.rb +99 -0
data/sample/dbget +37 -0
data/sample/fasta2tab.rb +99 -0
data/sample/fsplit.rb +51 -0
data/sample/gb2fasta.rb +31 -0
data/sample/gb2tab.rb +325 -0
data/sample/gbtab2mysql.rb +161 -0
data/sample/genes2nuc.rb +33 -0
data/sample/genes2pep.rb +33 -0
data/sample/genes2tab.rb +81 -0
data/sample/genome2rb.rb +29 -0
data/sample/genome2tab.rb +76 -0
data/sample/goslim.rb +311 -0
data/sample/gt2fasta.rb +47 -0
data/sample/pmfetch.rb +42 -0
data/sample/pmsearch.rb +42 -0
data/sample/psortplot_html.rb +222 -0
data/sample/ssearch2tab.rb +96 -0
data/sample/tdiary.rb +158 -0
data/sample/tfastx2tab.rb +100 -0
data/sample/vs-genes.rb +212 -0
data/test/data/SOSUI/sample.report +11 -0
data/test/data/TMHMM/sample.report +21 -0
data/test/data/blast/eco:b0002.faa +15 -0
data/test/data/blast/eco:b0002.faa.m0 +128 -0
data/test/data/blast/eco:b0002.faa.m7 +65 -0
data/test/data/blast/eco:b0002.faa.m8 +1 -0
data/test/data/embl/AB090716.embl +65 -0
data/test/data/genscan/sample.report +63 -0
data/test/data/prosite/prosite.dat +2233 -0
data/test/data/refseq/nm_126355.entret +64 -0
data/test/data/uniprot/p53_human.uniprot +1456 -0
data/test/runner.rb +10 -0
data/test/unit/bio/appl/blast/test_report.rb +427 -0
data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
data/test/unit/bio/appl/genscan/test_report.rb +195 -0
data/test/unit/bio/appl/sosui/test_report.rb +94 -0
data/test/unit/bio/appl/targetp/test_report.rb +159 -0
data/test/unit/bio/appl/test_blast.rb +159 -0
data/test/unit/bio/appl/test_fasta.rb +142 -0
data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
data/test/unit/bio/data/test_aa.rb +103 -0
data/test/unit/bio/data/test_codontable.rb +120 -0
data/test/unit/bio/data/test_na.rb +89 -0
data/test/unit/bio/db/embl/test_common.rb +130 -0
data/test/unit/bio/db/embl/test_embl.rb +227 -0
data/test/unit/bio/db/embl/test_sptr.rb +268 -0
data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
data/test/unit/bio/db/kegg/test_genes.rb +58 -0
data/test/unit/bio/db/test_fasta.rb +263 -0
data/test/unit/bio/db/test_gff.rb +140 -0
data/test/unit/bio/db/test_prosite.rb +1450 -0
data/test/unit/bio/io/test_ddbjxml.rb +87 -0
data/test/unit/bio/io/test_soapwsdl.rb +45 -0
data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
data/test/unit/bio/test_alignment.rb +1028 -0
data/test/unit/bio/test_command.rb +71 -0
data/test/unit/bio/test_db.rb +109 -0
data/test/unit/bio/test_feature.rb +128 -0
data/test/unit/bio/test_location.rb +51 -0
data/test/unit/bio/test_pathway.rb +485 -0
data/test/unit/bio/test_sequence.rb +386 -0
data/test/unit/bio/test_shell.rb +31 -0
data/test/unit/bio/util/test_color_scheme.rb +45 -0
data/test/unit/bio/util/test_contingency_table.rb +106 -0
data/test/unit/bio/util/test_sirna.rb +258 -0
metadata +295 -0

data/lib/bio/db/embl/embl.rb ADDED Viewed

@@ -0,0 +1,446 @@
+#
+# = bio/db/embl/embl.rb - EMBL database class
+#
+#
+# Copyright::   Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
+# License::     LGPL
+#
+# $Id: embl.rb,v 1.25 2005/11/02 07:30:14 nakao Exp $
+#
+# == EMBL database entry
+#
+#
+#
+# == Example
+#
+# emb = Bio::EMBL.new($<.read)
+# emb.entry_id
+# emb.each_cds do |cds|
+#   cds
+# end
+# emb.seq
+#
+#--
+#
+#  This library is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2 of the License, or (at your option) any later version.
+#
+#  This library is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with this library; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+#
+#++
+#
+require 'bio/db'
+require 'bio/db/embl/common'
+module Bio
+class EMBL < EMBLDB
+  include Bio::EMBLDB::Common
+  # returns contents in the ID line.
+  # * Bio::EMBL#id_line -> <ID Hash>
+  # where <ID Hash> is:
+  #  {'ENTRY_NAME' => String, 'MOLECULE_TYPE' => String, 'DIVISION' => String,
+  #   'SEQUENCE_LENGTH' => Int}
+  #
+  # ID Line
+  #  "ID  ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."
+  #
+  # DATA_CLASS = ['standard']
+  #
+  # MOLECULE_TYPE: DNA RNA XXX
+  #
+  # Code ( DIVISION )
+  #  EST (ESTs)
+  #  PHG (Bacteriophage)
+  #  FUN (Fungi)
+  #  GSS (Genome survey)
+  #  HTC (High Throughput cDNAs)
+  #  HTG (HTGs)
+  #  HUM (Human)
+  #  INV (Invertebrates)
+  #  ORG (Organelles)
+  #  MAM (Other Mammals)
+  #  VRT (Other Vertebrates)
+  #  PLN (Plants)
+  #  PRO (Prokaryotes)
+  #  ROD (Rodents)
+  #  SYN (Synthetic)
+  #  STS (STSs)
+  #  UNC (Unclassified)
+  #  VRL (Viruses)
+  #
+  def id_line(key=nil)
+    unless @data['ID']
+      tmp = Hash.new
+      idline = fetch('ID').split(/; +/)
+      tmp['ENTRY_NAME'], tmp['DATA_CLASS'] = idline[0].split(/ +/)
+      tmp['MOLECULE_TYPE'] = idline[1]
+      tmp['DIVISION'] = idline[2]
+      tmp['SEQUENCE_LENGTH'] = idline[3].strip.split(' ').first.to_i
+      @data['ID'] = tmp
+    end
+    if key
+      @data['ID'][key]
+    else
+      @data['ID']
+    end
+  end
+  # returns ENTRY_NAME in the ID line.
+  # * Bio::EMBL#entry -> String
+  def entry
+    id_line('ENTRY_NAME')
+  end
+  alias entry_name entry
+  alias entry_id entry
+  # returns MOLECULE_TYPE in the ID line.
+  # * Bio::EMBL#molecule -> String
+  def molecule
+    id_line('MOLECULE_TYPE')
+  end
+  alias molecule_type molecule
+  # returns DIVISION in the ID line.
+  # * Bio::EMBL#division -> String
+  def division
+    id_line('DIVISION')
+  end
+  # returns SEQUENCE_LENGTH in the ID line.
+  # * Bio::EMBL#sequence_length -> Int
+  def sequence_length
+    id_line('SEQUENCE_LENGTH')
+  end
+  alias seqlen sequence_length
+  # AC Line
+  # "AC   A12345; B23456;"
+  # returns the version information in the sequence version (SV) line.
+  # * Bio::EMBL#sv -> Accession.Version in String
+  # * Bio::EMBL#version -> version number in Int
+  #
+  # SV Line; sequence version (1/entry)
+  #  SV    Accession.Version
+  def sv
+    field_fetch('SV').sub(/;/,'')
+  end
+  def version
+    sv.split(".")[1].to_i
+  end
+  # returns contents in the date (DT) line.
+  # * Bio::EMBL#dt  -> <DT Hash>
+  # where <DT Hash> is:
+  #  {'created' => String, 'updated' => String}
+  # * Bio::EMBL#dt(key)  -> String
+  # keys: 'created' and 'updated'
+  #
+  # DT Line; date (2/entry)
+  def dt(key=nil)
+    unless @data['DT']
+      tmp = Hash.new
+      dt_line = self.get('DT').split(/\n/)
+      tmp['created'] = dt_line[0].sub(/\w{2}   /,'').strip
+      tmp['updated'] = dt_line[1].sub(/\w{2}   /,'').strip
+      @data['DT'] = tmp
+    end
+    if key
+      @data['DT'][key]
+    else
+      @data['DT']
+    end
+  end
+  ##
+  # DE Line; description (>=1)
+  #
+  ##
+  # KW Line; keyword (>=1)
+  # KW   [Keyword;]+
+  #
+  # Bio::EMBLDB#kw  -> Array
+  #            #keywords  -> Array
+  ##
+  # OS Line; organism species (>=1)
+  # OS   Genus species (name)
+  # "OS   Trifolium repens (white clover)"
+  #
+  # Bio::EMBLDB#os  -> Array
+  ##
+  # OC Line; organism classification (>=1)
+  #
+  # Bio::EMBLDB#oc  -> Array
+  ##
+  # OG Line; organella (0 or 1/entry)
+  # ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
+  #  or a plasmid name (e.g. "Plasmid pBR322").
+  #
+  # Bio::EMBLDB#og  -> String
+  ##
+  # R Lines
+  # RN RC RP RX RA RT RL
+  #
+  # Bio::EMBLDB#ref
+  ##
+  # DR Line; database cross-reference (>=0)
+  # "DR  database_identifier; primary_identifier; secondary_identifier."
+  #
+  # Bio::EMBLDB#dr
+  # returns feature table header (String) in the feature header (FH) line.
+  #
+  # FH Line; feature table header (0 or 2)
+  def fh
+    fetch('FH')
+  end
+  # returns contents in the feature table (FT) lines.
+  # * Bio::EMBL#ft -> Bio::Features
+  # * Bio::EMBL#ft {} -> {|Bio::Feature| }
+  #
+  # same as features method in bio/db/genbank.rb
+  #
+  # FT Line; feature table data (>=0)
+  def ft
+    unless @data['FT']
+      @data['FT'] = Array.new
+      ary = Array.new
+      in_quote = false
+      @orig['FT'].each_line do |line|
+        next if line =~ /^FEATURES/
+        head = line[0,20].strip  # feature key (source, CDS, ...)
+        body = line[20,60].chomp # feature value (position, /qualifier=)
+        if line =~ /^FT {3}(\S+)/
+          ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
+        elsif body =~ /^ \// and not in_quote
+          ary.last.push(body)    # /q="data..., /q=data, /q
+          if body =~ /=" / and body !~ /"$/
+            in_quote = true
+          end
+        else
+          ary.last.last << body # ...data..., ...data..."
+          if body =~ /"$/
+            in_quote = false
+          end
+        end
+      end
+      ary.map! do |subary|
+        parse_qualifiers(subary)
+      end
+      @data['FT'] = Features.new(ary)
+    end
+    if block_given?
+      @data['FT'].each do |feature|
+        yield feature
+      end
+    else
+      @data['FT']
+    end
+  end
+  alias features ft
+  # iterates on CDS features in the FT lines.
+  def each_cds
+    ft.each do |cds_feature|
+      if cds_feature.feature == 'CDS'
+        yield cds_feature
+      end
+    end
+  end
+  # iterates on gene features in the FT lines.
+  def each_gene
+    ft.each do |gene_feature|
+      if gene_feature.feature == 'gene'
+        yield gene_feature
+      end
+    end
+  end
+  # returns comment text in the comments (CC) line.
+  #
+  # CC Line; comments or notes (>=0)
+  def cc
+    get('CC')
+  end
+  ##
+  # XX Line; spacer line (many)
+  #  def nxx
+  #  end
+  # returns sequence header information in the sequence header (SQ) line.
+  # * Bio::EMBL#sq  -> <SQ Hash>
+  # where <SQ Hash> is:
+  #     {'ntlen' => Int, 'other' => Int,
+  #      'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
+  # * Bio::EMBL#sq(base)  -> <base content in Int>
+  # * Bio::EMBL#sq[base]  -> <base content in Int>
+  #
+  # SQ Line; sequence header (1/entry)
+  #  SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
+  def sq(base = nil)
+    unless @data['SQ']
+      fetch('SQ') =~ \
+             /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/
+      @data['SQ'] = {'ntlen' => $1.to_i, 'other' => $6.to_i,
+                     'a' => $2.to_i, 'c' => $3.to_i , 'g' => $4.to_i, 't' => $5.to_i}
+    else
+      @data['SQ']
+    end
+    if base
+      @data['SQ'][base.downcase]
+    else
+      @data['SQ']
+    end
+  end
+  # returns the nucleotide sequence in this entry.
+  # * Bio::EMBL#seq  -> Bio::Sequence::NA
+  #
+  # @orig[''] as sequence
+  # bb Line; (blanks) sequence data (>=1)
+  def seq
+    Sequence::NA.new( fetch('').gsub(/ /,'').gsub(/\d+/,'') )
+  end
+  alias naseq seq
+  alias ntseq seq
+  # // Line; termination line (end; 1/entry)
+  ### private methods
+  private
+  ##
+  # same as Bio::GenBank#parse_qualifiers(feature)
+  def parse_qualifiers(ary)
+    feature = Feature.new
+    feature.feature = ary.shift
+    feature.position = ary.shift.gsub(/\s/, '')
+    ary.each do |f|
+      if f =~ %r{/([^=]+)=?"?([^"]*)"?}
+        qualifier, value = $1, $2
+        if value.empty?
+          value = true
+        end
+        case qualifier
+        when 'translation'
+          value = Sequence::AA.new(value.gsub(/\s/, ''))
+        when 'codon_start'
+          value = value.to_i
+        end
+        feature.append(Feature::Qualifier.new(qualifier, value))
+      end
+    end
+    return feature
+  end
+end
+end
+if __FILE__ == $0
+  while ent = $<.gets(Bio::EMBL::RS)
+    puts "\n ==> e = Bio::EMBL.new(ent) "
+    e = Bio::EMBL.new(ent)
+    puts "\n ==> e.entry_id "
+    p e.entry_id
+    puts "\n ==> e.id_line "
+    p e.id_line
+    puts "\n ==> e.id_line('molecule') "
+    p e.id_line('molecule')
+    puts "\n ==> e.molecule "
+    p e.molecule
+    puts "\n ==> e.ac "
+    p e.ac
+    puts "\n ==> e.sv "
+    p e.sv
+    puts "\n ==> e.dt "
+    p e.dt
+    puts "\n ==> e.dt('created') "
+    p e.dt('created')
+    puts "\n ==> e.de "
+    p e.de
+    puts "\n ==> e.kw "
+    p e.kw
+    puts "\n ==> e.os "
+    p e.os
+    puts "\n ==> e.oc "
+    p e.oc
+    puts "\n ==> e.og "
+    p e.og
+    puts "\n ==> e.ref "
+    p e.ref
+    puts "\n ==> e.dr "
+    p e.dr
+    puts "\n ==> e.ft "
+    p e.ft
+    puts "\n ==> e.each_cds {|c| p c}"
+    p e.each_cds {|c| p c }
+    puts "\n ==> e.sq "
+    p e.sq
+    puts "\n ==> e.sq('a') "
+    p e.sq('a')
+    puts "\n ==> e.gc"
+    p e.gc
+    puts "\n ==> e.seq "
+    p e.seq
+  end
+end

data/lib/bio/db/embl/sptr.rb ADDED Viewed

@@ -0,0 +1,954 @@
+#
+# = bio/db/embl/sptr.rb - UniProt/SwissProt and TrEMBL database class
+#
+# Copyright::   Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
+# License::     LGPL
+#
+# $Id: sptr.rb,v 1.29 2005/11/02 07:30:14 nakao Exp $
+#
+# == UniProtKB/SwissProt and TrEMBL
+#
+# See the SWISS-PROT document file SPECLIST.TXT.
+#
+# == Example
+#
+#--
+#
+#  This library is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2 of the License, or (at your option) any later version.
+#
+#  This library is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with this library; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+#
+#++
+#
+require 'bio/db'
+require 'bio/db/embl/common'
+module Bio
+# Parser class for UniProtKB/SwissProt and TrEMBL database entry
+class SPTR < EMBLDB
+  include Bio::EMBLDB::Common
+  @@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
+  @@data_class = ["STANDARD", "PRELIMINARY"]
+  # returns a Hash of the ID line.
+  # returns a content (Int or String) of the ID line by a given key.
+  # Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
+  #
+  # ID Line
+  # "ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
+  #
+  #   ENTRY_NAME := "#{X}_#{Y}"
+  #     X =~ /[A-Z0-9]{1,5}/ # The protein name.
+  #     Y =~ /[A-Z0-9]{1,5}/ # The biological source of the protein.
+  #   MOLECULE_TYPE := 'PRT' =~ /\w{3}/
+  #   SEQUENCE_LENGTH =~ /\d+ AA/
+  def id_line(key = nil)
+    unless @data['ID']
+      tmp = Hash.new
+      a = @orig['ID'].split(/ +/)
+      tmp['ENTRY_NAME']      = a[1]
+      tmp['DATA_CLASS']      = a[2].sub(/;/,'')
+      tmp['MOLECULE_TYPE']   = a[3].sub(/;/,'')
+      tmp['SEQUENCE_LENGTH'] = a[4].to_i
+      @data['ID'] = tmp
+    end
+    if key
+      @data['ID'][key] # String/Int
+    else
+      @data['ID']      # Hash
+    end
+  end
+  # returns a ENTRY_NAME in the ID line.
+  #
+  # A short-cut for Bio::SPTR#id_line('ENTRY_NAME').
+  def entry_id
+    id_line('ENTRY_NAME')
+  end
+  alias entry_name entry_id
+  alias entry entry_id
+  # returns a MOLECULE_TYPE in the ID line.
+  #
+  # A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
+  def molecule
+    id_line('MOLECULE_TYPE')
+  end
+  alias molecule_type molecule
+  # returns a SEQUENCE_LENGTH in the ID line.
+  #
+  # A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
+  def sequence_length
+    id_line('SEQUENCE_LENGTH')
+  end
+  alias aalen sequence_length
+  # Bio::EMBLDB::Common#ac  -> ary
+  #                  #accessions  -> ary
+  #                  #accession  -> String (accessions.first)
+  @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
+  # returns a Hash of information in the DT lines.
+  #  hash keys:
+  #    ['created', 'sequence', 'annotation']
+  #  also Symbols acceptable (ASAP):
+  #    [:created, :sequence, :annotation]
+  #
+  # returns a String of information in the DT lines by a given key.
+  #
+  # DT Line; date (3/entry)
+  #  DT DD-MMM-YYYY (rel. NN, Created)
+  #  DT DD-MMM-YYYY (rel. NN, Last sequence update)
+  #  DT DD-MMM-YYYY (rel. NN, Last annotation update)
+  def dt(key = nil)
+    unless @data['DT']
+      tmp = Hash.new
+      a = self.get('DT').split(/\n/)
+      tmp['created']    = a[0].sub(/\w{2}   /,'').strip
+      tmp['sequence']   = a[1].sub(/\w{2}   /,'').strip
+      tmp['annotation'] = a[2].sub(/\w{2}   /,'').strip
+      @data['DT'] = tmp
+    end
+    if key
+      @data['DT'][key]
+    else
+      @data['DT']
+    end
+  end
+  # returns the proposed official name of the protein.
+  #
+  # DE Line; description (>=1)
+  #  "DE #{OFFICIAL_NAME} (#{SYNONYM})"
+  #  "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
+  #  OFFICIAL_NAME  1/entry
+  #  SYNONYM        >=0
+  #  CONTAINS       >=0
+  def protein_name
+    name = ""
+    if de_line = fetch('DE') then
+      str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
+      name = str[/^[^(]*/].strip
+      name << ' (Fragment)' if str =~ /fragment/i
+    end
+    return name
+  end
+  # returns an array of synonyms (unofficial names).
+  #
+  # synonyms are each placed in () following the official name on the DE line.
+  def synonyms
+    ary = Array.new
+    if de_line = fetch('DE') then
+      line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ].  That's the "contains" part
+      line.scan(/\([^)]+/) do |synonym|
+        unless synonym =~ /fragment/i then
+          ary << synonym[1..-1].strip # index to remove the leading (
+        end
+      end
+    end
+    return ary
+  end
+  # returns gene names in the GN line.
+  #
+  # New UniProt/SwissProt format:
+  # * Bio::SPTR#gn -> [ <gene record>* ]
+  # where <gene record> is:
+  #                    { :name => '...',
+  #                      :synonyms => [ 's1', 's2', ... ],
+  #                      :loci   => [ 'l1', 'l2', ... ],
+  #                      :orfs     => [ 'o1', 'o2', ... ]
+  #                    }
+  #
+  # Old format:
+  # * Bio::SPTR#gn -> Array      # AND
+  # * Bio::SPTR#gn[0] -> Array   # OR
+  #
+  # GN Line: Gene name(s) (>=0, optional)
+  def gn
+    return @data['GN'] if @data['GN']
+    case fetch('GN')
+    when /Name=/ then
+      return gn_uniprot_parser
+    else
+      return gn_old_parser
+    end
+  end
+  # returns contents in the old style GN line.
+  # GN Line: Gene name(s) (>=0, optional)
+  #  GN   HNS OR DRDX OR OSMZ OR BGLY.
+  #  GN   CECA1 AND CECA2.
+  #  GN   CECA1 AND (HOGE OR FUGA).
+  #
+  #  GN NAME1 [(AND|OR) NAME]+.
+  #
+  # Bio::SPTR#gn -> Array      # AND
+  #          #gn[0] -> Array   # OR
+  #          #gene_names -> Array
+  def gn_old_parser
+    names = Array.new
+    if get('GN').size > 0
+      names = fetch('GN').sub(/\.$/,'').split(/ AND /)
+      names.map! { |synonyms|
+        synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
+          e.strip
+        }
+      }
+    end
+    return @data['GN'] = names
+  end
+  private :gn_old_parser
+  # returns contents in the structured GN line.
+  # The new format of the GN line is:
+  #  GN   Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
+  #  GN   ORFNames=[, ...];
+  #
+  # * Bio::SPTR#gn -> [ <gene record>* ]
+  # where <gene record> is:
+  #                    { :name => '...',
+  #                      :synonyms => [ 's1', 's2', ... ],
+  #                      :loci   => [ 'l1', 'l2', ... ],
+  #                      :orfs     => [ 'o1', 'o2', ... ]
+  #                    }
+  def gn_uniprot_parser
+    @data['GN'] = Array.new
+    gn_line = fetch('GN').strip
+    records = gn_line.split(/\s*and\s*/)
+    records.each do |record|
+      gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
+      record.each(';') do |element|
+        case element
+        when /Name=/ then
+          gene_hash[:name] = $'[0..-2]
+        when /Synonyms=/ then
+          gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
+        when /OrderedLocusNames=/ then
+          gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
+        when /ORFNames=/ then
+          gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
+        end
+      end
+      @data['GN'] << gene_hash
+    end
+    return @data['GN']
+  end
+  private :gn_uniprot_parser
+  # returns a Array of gene names in the GN line.
+  def gene_names
+    gn # set @data['GN'] if it hasn't been already done
+    if @data['GN'].first.class == Hash then
+      @data['GN'].collect { |element| element[:name] }
+    else
+      @data['GN'].first
+    end
+  end
+  # returns a String of the first gene name in the GN line.
+  def gene_name
+    gene_names.first
+  end
+  # returns a Array of Hashs or a String of the OS line when a key given.
+  # * Bio::EMBLDB#os  -> Array
+  #  [{'name' => '(Human)', 'os' => 'Homo sapiens'},
+  #   {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
+  # * Bio::SPTR#os[0] -> Hash
+  #  {'name' => "(Human)", 'os' => 'Homo sapiens'}
+  # * Bio::SPTR#os[0]['name'] -> "(Human)"
+  # * Bio::SPTR#os(0) -> "Homo sapiens (Human)"
+  #
+  # OS Line; organism species (>=1)
+  #  OS   Genus species (name).
+  #  OS   Genus species (name0) (name1).
+  #  OS   Genus species (name0) (name1).
+  #  OS   Genus species (name0), G s0 (name0), and G s (name0) (name1).
+  #  OS   Homo sapiens (Human), and Rattus norvegicus (Rat)
+  def os(num = nil)
+    # Parse the OS text once into an Array of {'name', 'os'} Hashes.
+    if not @data['OS']
+      @data['OS'] = fetch('OS').split(/, and|, /).map do |part|
+        unless part =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
+          raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
+        end
+        organism = $1
+        # The parenthesised part; $1 is nil when there is none.
+        part =~ /(\(.+\))/
+        {'name' => $1, 'os' => organism}
+      end
+    end
+    return @data['OS'] unless num
+    # EX. "Trifolium repens (white clover)"
+    record = @data['OS'][num]
+    return "#{record['os']} #{record['name']}"
+  end
+  # Bio::EMBLDB::Common#og -> Array
+  # OG Line; organella (0 or 1/entry)
+  # ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
+  #  or a plasmid name (e.g. "Plasmid pBR322").
+  # Bio::EMBLDB::Common#oc -> Array
+  # OC Line; organism classification (>=1)
+  # "OC   Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
+  # "OC   Theileria."
+  # returns a Hash of organism taxonomy cross-references.
+  # * Bio::SPTR#ox -> Hash
+  #  {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
+  #
+  # OX Line; organism taxonomy cross-reference (>=1 per entry)
+  #  OX   NCBI_TaxID=1234;
+  #  OX   NCBI_TaxID=1234, 2345, 3456, 4567;
+  def ox
+    unless @data['OX']
+      tmp = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
+      hsh = Hash.new
+      tmp.each do |e|
+        db,refs = e.split(/=/)
+        hsh[db] = refs.split(/, */)
+      end
+      @data['OX'] = hsh
+    end
+    return @data['OX']
+  end
+  # Bio::EMBLDB::Common#ref -> Array
+  # R Lines
+  # RN RC RP RX RA RT RL
+  @@cc_topics = ['ALTERNATIVE PRODUCTS','CATALYTIC ACTIVITY','CAUTION',
+    'COFACTOR','DATABASE','DEVELOPMENTAL STAGE','DISEASE','DOMAIN',
+    'ENZYME REGULATION','FUNCTION','INDUCTION','MASS SPECTROMETRY',
+    'MISCELLANEOUS','PATHWAY','PHARMACEUTICAL','POLYMORPHISM','PTM',
+    'SIMILARITY','SUBCELLULAR LOCATION','SUBUNIT','TISSUE SPECIFICITY']
+  # returns contents in the CC lines.
+  # * Bio::SPTR#cc -> Hash
+  # * Bio::SPTR#cc(Int) -> String
+  # returns an Array of contents in the TOPIC string.
+  # * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
+  #
+  # returns contents of the "ALTERNATIVE PRODUCTS".
+  # * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
+  #  {'Event' => str,
+  #   'Named isoforms' => int,
+  #   'Comment' => str,
+  #   'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
+  #
+  #  CC   -!- ALTERNATIVE PRODUCTS:
+  #  CC       Event=Alternative splicing; Named isoforms=15;
+  #  ...
+  #  CC         placentae isoforms. All tissues differentially splice exon 13;
+  #  CC       Name=A; Synonyms=no del;
+  #  CC         IsoId=P15529-1; Sequence=Displayed;
+  #
+  # returns contents of the "DATABASE".
+  # * Bio::SPTR#cc('DATABASE') -> Array
+  #  [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
+  #
+  #  CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
+  #
+  # returns contents of the "MASS SPECTROMETRY".
+  # * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
+  #  [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
+  #
+  #  MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
+  #
+  # CC lines (>=0, optional)
+  #  CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
+  #  CC       IN LIVER, KIDNEY, LUNG AND BRAIN.
+  #
+  #  CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
+  #  CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
+  def cc(tag = nil)
+    # Parses the CC lines into {TOPIC => [text, ...]} on first use and caches
+    # the Hash in @data['CC'].
+    #
+    # tag == nil                    -> the whole Hash
+    # tag == 'ALTERNATIVE PRODUCTS' -> Hash (Event, Named isoforms, Comment, Variants)
+    # tag == 'DATABASE'             -> Array of Hashes (NAME, NOTE, WWW, FTP), or nil
+    # tag == 'MASS SPECTROMETRY'    -> Array of Hashes (MW, MW_ERR, METHOD, RANGE), or nil
+    # tag == 'INTERACTION'          -> result of cc_interaction_parse
+    # any other tag                 -> Array of raw texts for that topic, or nil
+    #
+    # An entry without CC lines returns an empty Hash whatever the tag.
+    unless @data['CC']
+      cc  = Hash.new
+      # A run of 74 dashes; only the text before the first such run is parsed.
+      # NOTE(review): presumably the rule that opens the copyright notice -- confirm.
+      cmt = '-' * (77 - 4 + 1)
+      dlm = /-!- /
+      return cc if get('CC').size == 0 # 12KD_MYCSM has no CC lines.
+      begin
+        fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
+          if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
+            key  = $1
+            # Turn "- " into "-" unless "AND" follows.
+            # NOTE(review): presumably rejoins words hyphenated at a line break -- confirm.
+            body = $2.gsub(/- (?!AND)/,'-')
+            (cc[key] ||= []).push(body)
+          else
+            raise ["Error: [#{entry_id}]: CC Lines", '',
+                   tmp, '', '', fetch('CC'),''].join("\n")
+          end
+        end
+      rescue NameError
+        # NoMethodError is a subclass of NameError, so it is handled here as
+        # well; the separate "rescue NoMethodError" clause could never run.
+        if fetch('CC') == ''
+          return {}
+        else
+          raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
+                 "\n'#{self.get('CC')}'\n", "(#{$!})"].join
+        end
+      end
+      @data['CC'] = cc
+    end
+    case tag
+    when 'ALTERNATIVE PRODUCTS'
+      # Array#to_s joins the elements only on Ruby 1.8; join gives the same
+      # text on every Ruby version (an absent topic yields "").
+      ap = Array(@data['CC']['ALTERNATIVE PRODUCTS']).join
+      # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
+      tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants'  => []}
+      if /Event=(.+?);/ =~ ap
+        tmp['Event'] = $1
+      end
+      if /Named isoforms=(\S+?);/ =~ ap
+        tmp['Named isoforms'] = $1
+      end
+      if /Comment=(.+?);/m =~ ap
+        tmp['Comment'] = $1
+      end
+      ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
+        tmp['Variants'] << cc_ap_variants_parse(ent)
+      end
+      return tmp
+    when 'DATABASE'
+      # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
+      tmp = Array.new
+      entries = @data['CC']['DATABASE']
+      return entries unless entries
+      entries.each do |e|
+        db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
+        # Drop the final character (the closing '.') before splitting on ';'.
+        e.sub(/.$/,'').split(/;/).each do |line|
+          case line
+          when /NAME=(.+)/
+            db['NAME'] = $1
+          when /NOTE=(.+)/
+            db['NOTE'] = $1
+          when /WWW="(.+)"/
+            db['WWW'] = $1
+          when /FTP="(.+)"/
+            db['FTP'] = $1
+          end
+        end
+        tmp.push(db)
+      end
+      return tmp
+    when 'MASS SPECTROMETRY', 'MASS SPECTOROMETRY'
+      # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
+      #
+      # The misspelled tag is still accepted for old callers, but the data is
+      # read from the real topic name.  Before, this branch could only be
+      # reached with the misspelling and then always found nothing.
+      tmp = Array.new
+      ms = @data['CC']['MASS SPECTROMETRY']
+      return ms unless ms
+      ms.each do |m|
+        mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
+        m.sub(/.$/,'').split(/;/).each do |line|
+          case line
+          when /MW=(.+)/
+            mass['MW'] = $1.to_f
+          when /MW_ERR=(.+)/
+            mass['MW_ERR'] = $1.to_f
+          when /METHOD="?([^"]+)"?/
+            # Quotes are optional: the documented format has none.
+            mass['METHOD'] = $1.to_s
+          when /RANGE="?(\d+-\d+)"?/
+            mass['RANGE'] = $1          # RANGE class ?
+          end
+        end
+        tmp.push(mass)
+      end
+      return tmp
+    when 'INTERACTION'
+      return cc_interaction_parse(Array(@data['CC']['INTERACTION']).join)
+    when nil
+      return @data['CC']
+    else
+      return @data['CC'][tag]
+    end
+  end
+  def cc_ap_variants_parse(ent)
+    hsh = {}
+    ent.split(/; /).map {|e| e.split(/=/) }.each do |e|
+      case e[0]
+      when 'Sequence'
+        e[1] = e[1].sub(/;/,'').split(/, /)
+      end
+      hsh[e[0]] = e[1]
+    end
+    return hsh
+  end
+  private :cc_ap_variants_parse
+  # returns the contents of a line of the CC INTERACTION section.
+  #
+  #  CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
+  def cc_interaction_parse(str)
+    it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
+    it.map {|ent|
+      {:partner_id => ent[0].strip,
+       :nbexp => ent[1].strip,
+       :intact_acc => ent[2].split(', ') }
+    }
+  end
+  private :cc_interaction_parse
+  # returns databases cross-references in the DR lines.
+  # * Bio::EMBLDB#dr  -> Hash w/in Array
+  #
+  # DR Line; database cross-reference (>=0)
+  # one cross-reference per line
+  #  DR  database_identifier; primary_identifier; secondary_identifier.
+  # Database identifiers that may appear as the first field of a DR line.
+  # ('MGD' and 'MIM' are separate entries; without the comma between them the
+  # adjacent literals were concatenated into a single 'MGDMIM' string.)
+  @@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
+    'ECOGENE',
+    'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
+    'MAIZE-2DPAGE','MENDEL','MGD','MIM','PDB','PFAM','PIR','PRINTS',
+    'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
+    'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
+  # Bio::EMBLDB::Common#kw - Array
+  #                    #keywords  -> Array
+  #
+  # KW Line; keyword (>=1)
+  # KW   [Keyword;]+
+  # returns the contents of the feature table.
+  # * Bio::SPTR#ft -> Hash
+  #  {'feature_name' => [{'From' => str, 'To' => str,
+  #                       'Description' => str, 'FTId' => str}],...}
+  #
+  # returns an Array of the information about the feature_name in the feature table.
+  # * Bio::SPTR#ft(feature_name) -> Array of Hash
+  #  [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
+  #
+  # FT Line; feature table data (>=0, optional)
+  #
+  #  Col     Data item
+  #  -----   -----------------
+  #   1- 2   FT
+  #   6-13   Feature name
+  #  15-20   `FROM' endpoint
+  #  22-27   `TO' endpoint
+  #  35-75   Description (>=0 per key)
+  #  -----   -----------------
+  # Builds (once) and returns the feature table parsed from the FT lines.
+  #
+  # feature_name:: optional feature key; when given, only the Array of
+  #                records stored under that key is returned (nil if none).
+  #
+  # Every record is a Hash with the keys 'From' and 'To' (converted with
+  # to_i), 'Description', 'diff' and 'FTId'. The table is cached in
+  # @data['FT']. Any error raised while reading the lines is re-raised as a
+  # RuntimeError whose message contains the raw FT lines.
+  def ft(feature_name = nil)
+    unless @data['FT']
+      table        = Hash.new()
+      last_feature = nil
+      begin
+        get('FT').split(/\n/).each {|line|
+          feature = line[5..12].strip   # columns 6-13: feature name
+          if feature == '' and line[34..74]
+            # Continuation line: append its text to the description of the
+            # most recent record.
+            tmp = ' ' + line[34..74].strip
+            table[last_feature].last['Description'] << tmp
+            # Only a continuation line ending in '.' goes on to the
+            # 'diff' handling below.
+            next unless /\.$/ =~ line
+          else
+            # Line that opens a new record.
+            from = line[14..19].strip
+            to   = line[21..26].strip
+            desc = line[34..74].strip if line[34..74]
+            table[feature] = [] unless table[feature]
+            table[feature] << {
+              'From'        => from.to_i,
+              'To'          => to.to_i,
+              'Description' => desc,
+              'diff'        => [],
+              'FTId'        => nil }
+            last_feature = feature
+            # NOTE(review): because of this next, a record that fits on one
+            # line never reaches the 'diff' handling and keeps 'diff' == []
+            # - confirm this is intended.
+            next
+          end
+          case last_feature
+          when 'VARSPLIC', 'VARIANT', 'CONFLICT'
+            # Move a trailing "/FTId=..." out of the description into 'FTId'.
+            if /FTId=(.+?)\./ =~ line   # version 41 >
+              ftid = $1
+              table[last_feature].last['FTId'] = ftid
+              table[last_feature].last['Description'].sub!(/ \/FTId=#{ftid}./,'')
+            end
+            # Work out the [original, replacement] pair for 'diff'.
+            case table[last_feature].last['Description']
+            when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
+              original = $1
+              swap = $2
+              original = original.gsub(/ /,'').strip
+              swap = swap.gsub(/ /,'').strip
+            when /Missing/i
+              # The original residues are read from the sequence itself.
+              original = seq.subseq(table[last_feature].last['From'],
+                                    table[last_feature].last['To'])
+              swap = ''
+            else
+              # Caught by the rescue below and re-raised with more context.
+              raise line
+            end
+            table[last_feature].last['diff'] = [original, swap]
+          end
+        }
+      rescue
+        raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n" +
+                  "'#{self.get('FT')}'\n"
+      end
+      # Second pass: tidy up the descriptions in place.
+      table.each_key do |k|
+        table[k].each do |e|
+          if / -> / =~ e['Description']
+            # Remove the spaces inside both sides of the first "A -> B".
+            pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
+            e['Description'].sub!(pattern) {
+              a = $1
+              b = $2
+              a.gsub(/ /,'') + " -> " + b.gsub(/ /,'')
+            }
+          end
+          if /- [\w\d]/ =~ e['Description']
+            # Remove the space after a hyphen, except in front of "AND".
+            e['Description'].gsub!(/([\w\d]- [\w\d]+)/) {
+              a = $1
+              if /- AND/ =~ a
+                a
+              else
+                a.sub(/ /,'')
+              end
+            }
+          end
+        end
+      end
+      @data['FT'] = table
+    end
+    if feature_name
+      @data['FT'][feature_name]
+    else
+      @data['FT']
+    end
+  end
+  # returns a Hash of the contents of the SQ line.
+  # * Bio::SPTR#sq  -> hsh
+  #
+  # returns the value of a key given in the SQ line.
+  # * Bio::SPTR#sq(key)  -> int or str
+  # * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length', 'CRC64']
+  #
+  # SQ Line; sequence header (1/entry)
+  #  SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
+  #  SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;
+  #
+  # MW, Dalton unit.
+  # CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
+  def sq(key = nil)
+    unless @data['SQ']
+      if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
+        @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
+      else
+        raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
+      end
+    end
+    if key
+      case key
+      when /mw/, /molecular/, /weight/
+        @data['SQ']['MW']
+      when /len/, /length/, /AA/
+        @data['SQ']['aalen']
+      else
+        @data['SQ'][key]
+      end
+    else
+      @data['SQ']
+    end
+  end
+  # returns a Bio::Sequence::AA of the amino acid sequence.
+  # * Bio::SPTR#seq -> Bio::Sequence::AA
+  #
+  # blank Line; sequence data (>=1)
+  def seq
+    unless @data['']
+      @data[''] = Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
+    end
+    return @data['']
+  end
+  alias aaseq seq
+end # class SPTR
+end # module Bio
+if __FILE__ == $0
+  # Usage: ruby __FILE__ uniprot_sprot.dat
+  # Usage: ruby __FILE__ uniprot_sprot.dat | egrep '^RuntimeError'
+  begin
+    require 'pp'
+    alias pp p
+  rescue LoadError
+  end
+  def cmd(cmd, tag = nil, ent = $ent)
+    puts " ==> #{cmd} "
+    puts Bio::SPTR.new(ent).get(tag) if tag
+    begin
+      p eval(cmd)
+    rescue RuntimeError
+      puts "RuntimeError(#{Bio::SPTR.new($ent).entry_id})}: #{$!} "
+    end
+    puts
+  end
+  while $ent = $<.gets(Bio::SPTR::RS)
+    cmd "Bio::SPTR.new($ent).entry_id"
+    cmd "Bio::SPTR.new($ent).id_line", 'ID'
+    cmd "Bio::SPTR.new($ent).entry"
+    cmd "Bio::SPTR.new($ent).entry_name"
+    cmd "Bio::SPTR.new($ent).molecule"
+    cmd "Bio::SPTR.new($ent).sequence_length"
+    cmd "Bio::SPTR.new($ent).ac", 'AC'
+    cmd "Bio::SPTR.new($ent).accession"
+    cmd "Bio::SPTR.new($ent).gn", 'GN'
+    cmd "Bio::SPTR.new($ent).gene_name"
+    cmd "Bio::SPTR.new($ent).gene_names"
+    cmd "Bio::SPTR.new($ent).dt", "DT"
+    ['created','annotation','sequence'].each do |key|
+      cmd "Bio::SPTR.new($ent).dt('#{key}')"
+    end
+    cmd "Bio::SPTR.new($ent).de", 'DE'
+    cmd "Bio::SPTR.new($ent).definition"
+    cmd "Bio::SPTR.new($ent).protein_name"
+    cmd "Bio::SPTR.new($ent).synonyms"
+    cmd "Bio::SPTR.new($ent).kw", 'KW'
+    cmd "Bio::SPTR.new($ent).os", 'OS'
+    cmd "Bio::SPTR.new($ent).oc", 'OC'
+    cmd "Bio::SPTR.new($ent).og", 'OG'
+    cmd "Bio::SPTR.new($ent).ox", 'OX'
+    cmd "Bio::SPTR.new($ent).ref", 'R'
+    cmd "Bio::SPTR.new($ent).cc", 'CC'
+    cmd "Bio::SPTR.new($ent).cc('ALTERNATIVE PRODUCTS')"
+    cmd "Bio::SPTR.new($ent).cc('DATABASE')"
+    cmd "Bio::SPTR.new($ent).cc('MASS SPECTOMETRY')"
+    cmd "Bio::SPTR.new($ent).dr", 'DR'
+    cmd "Bio::SPTR.new($ent).ft", 'FT'
+    cmd "Bio::SPTR.new($ent).ft['DOMAIN']"
+    cmd "Bio::SPTR.new($ent).sq", "SQ"
+    cmd "Bio::SPTR.new($ent).seq"
+  end
+end
+=begin
+= Bio::SPTR < Bio::DB
+Class for an entry in the SWISS-PROT/TrEMBL database.
+  * ((<URL:http://www.ebi.ac.uk/swissprot/>))
+  * ((<URL:http://www.ebi.ac.uk/trembl/>))
+  * ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
+--- Bio::SPTR.new(a_sp_entry)
+=== ID line (Identification)
+--- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
+                          'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
+--- Bio::SPTR#id_line(key) -> str
+       key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
+--- Bio::SPTR#entry_id -> str
+--- Bio::SPTR#molecule -> str
+--- Bio::SPTR#sequence_length -> int
+=== AC lines (Accession number)
+--- Bio::SPTR#ac -> ary
+--- Bio::SPTR#accessions -> ary
+--- Bio::SPTR#accession -> accessions.first
+=== GN line (Gene name(s))
+--- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
+--- Bio::SPTR#gene_name -> str
+--- Bio::SPTR#gene_names -> [str] or [str]
+=== DT lines (Date)
+--- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
+--- Bio::SPTR#dt(key) -> str
+      key := (created|annotation|sequence)
+=== DE lines (Description)
+--- Bio::SPTR#de -> str
+             #definition -> str
+--- Bio::SPTR#protein_name
+      Returns the proposed official name of the protein
+--- Bio::SPTR#synonyms
+      Returns an array of synonyms (unofficial names)
+=== KW lines (Keyword)
+--- Bio::SPTR#kw -> ary
+=== OS lines (Organism species)
+--- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
+=== OC lines (organism classification)
+--- Bio::SPTR#oc -> ary
+=== OG line (Organelle)
+--- Bio::SPTR#og -> ary
+=== OX line (Organism taxonomy cross-reference)
+--- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
+=== RN RC RP RX RA RT RL RG lines (Reference)
+--- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RG' => str},...]
+=== DR lines (Database cross-reference)
+--- Bio::SPTR#dr -> {'EMBL' => ary, ...}
+=== FT lines (Feature table data)
+--- Bio::SPTR#ft -> hsh
+=== SQ lines (Sequence header and data)
+--- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
+--- Bio::SPTR#sq(key) -> int or str
+          key := (aalen|MW|CRC64)
+--- Bio::SPTR#seq -> Bio::Sequence::AA
+             #aaseq -> Bio::Sequence::AA
+=end
+  #      Content                      Occurrence in an entry
+  # ---- ---------------------------  --------------------------------
+  # ID - identification               (begins each entry; 1 per entry)
+  # AC - accession number(s)          (>=1 per entry)
+  # DT - date                         (3 per entry)
+  # DE - description                  (>=1 per entry)
+  # GN - gene name(s)                 (>=0 per entry; optional)
+  # OS - organism species             (>=1 per entry)
+  # OG - organelle                    (0 or 1 per entry; optional)
+  # OC - organism classification      (>=1 per entry)
+  # OX - organism taxonomy x-ref      (>=1 per entry)
+  # RN - reference number             (>=1 per entry)
+  # RP - reference positions          (>=1 per entry)
+  # RC - reference comment(s)         (>=0 per entry; optional)
+  # RX - reference cross-reference(s) (>=0 per entry; optional)
+  # RA - reference author(s)          (>=1 per entry)
+  # RT - reference title              (>=0 per entry; optional)
+  # RL - reference location           (>=1 per entry)
+  # CC - comments or notes            (>=0 per entry; optional)
+  # DR - database cross-references    (>=0 per entry; optional)
+  # KW - keywords                     (>=1 per entry)
+  # FT - feature table data           (>=0 per entry; optional)
+  # SQ - sequence header              (1 per entry)
+  #    - (blanks) The sequence data   (>=1 per entry)
+  # // - termination line             (ends each entry; 1 per entry)
+  # ---- ---------------------------  --------------------------------