bio 1.4.3.0001 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +39 -33
- data/BSDL +22 -0
- data/COPYING +2 -2
- data/COPYING.ja +36 -36
- data/ChangeLog +2404 -1025
- data/KNOWN_ISSUES.rdoc +15 -55
- data/README.rdoc +17 -23
- data/RELEASE_NOTES.rdoc +246 -183
- data/Rakefile +3 -2
- data/bin/br_biofetch.rb +29 -5
- data/bioruby.gemspec +15 -32
- data/bioruby.gemspec.erb +10 -20
- data/doc/ChangeLog-1.4.3 +1478 -0
- data/doc/RELEASE_NOTES-1.4.3.rdoc +204 -0
- data/doc/Tutorial.rd +0 -6
- data/doc/Tutorial.rd.html +7 -12
- data/doc/Tutorial.rd.ja +960 -1064
- data/doc/Tutorial.rd.ja.html +977 -1067
- data/gemfiles/Gemfile.travis-jruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-jruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-rbx +13 -0
- data/gemfiles/Gemfile.travis-ruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-ruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-ruby2.2 +9 -0
- data/lib/bio.rb +10 -43
- data/lib/bio/alignment.rb +8 -14
- data/lib/bio/appl/blast.rb +1 -2
- data/lib/bio/appl/blast/format0.rb +18 -7
- data/lib/bio/appl/blast/remote.rb +0 -9
- data/lib/bio/appl/blast/report.rb +1 -1
- data/lib/bio/appl/clustalw/report.rb +3 -1
- data/lib/bio/appl/genscan/report.rb +1 -2
- data/lib/bio/appl/iprscan/report.rb +1 -2
- data/lib/bio/appl/meme/mast.rb +4 -4
- data/lib/bio/appl/meme/mast/report.rb +1 -1
- data/lib/bio/appl/paml/codeml.rb +2 -2
- data/lib/bio/appl/paml/codeml/report.rb +1 -0
- data/lib/bio/appl/paml/common.rb +1 -1
- data/lib/bio/appl/sosui/report.rb +1 -2
- data/lib/bio/command.rb +62 -2
- data/lib/bio/data/aa.rb +13 -31
- data/lib/bio/data/codontable.rb +1 -2
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +1 -0
- data/lib/bio/db/biosql/sequence.rb +1 -1
- data/lib/bio/db/embl/common.rb +1 -1
- data/lib/bio/db/embl/embl.rb +5 -4
- data/lib/bio/db/embl/format_embl.rb +3 -3
- data/lib/bio/db/embl/sptr.rb +9 -1444
- data/lib/bio/db/embl/swissprot.rb +12 -29
- data/lib/bio/db/embl/trembl.rb +13 -30
- data/lib/bio/db/embl/uniprot.rb +12 -29
- data/lib/bio/db/embl/uniprotkb.rb +1455 -0
- data/lib/bio/db/fasta.rb +17 -0
- data/lib/bio/db/fasta/defline.rb +1 -3
- data/lib/bio/db/fastq.rb +1 -1
- data/lib/bio/db/genbank/ddbj.rb +9 -5
- data/lib/bio/db/genbank/refseq.rb +11 -3
- data/lib/bio/db/gff.rb +3 -4
- data/lib/bio/db/go.rb +5 -6
- data/lib/bio/db/kegg/module.rb +4 -5
- data/lib/bio/db/kegg/pathway.rb +4 -5
- data/lib/bio/db/kegg/reaction.rb +1 -1
- data/lib/bio/db/nexus.rb +3 -2
- data/lib/bio/db/pdb/pdb.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +82 -59
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +1 -2
- data/lib/bio/db/sanger_chromatogram/chromatogram.rb +1 -2
- data/lib/bio/db/transfac.rb +1 -1
- data/lib/bio/io/das.rb +40 -41
- data/lib/bio/io/fastacmd.rb +0 -16
- data/lib/bio/io/fetch.rb +111 -55
- data/lib/bio/io/flatfile/buffer.rb +4 -5
- data/lib/bio/io/hinv.rb +2 -3
- data/lib/bio/io/ncbirest.rb +43 -6
- data/lib/bio/io/pubmed.rb +76 -81
- data/lib/bio/io/togows.rb +33 -10
- data/lib/bio/map.rb +1 -1
- data/lib/bio/pathway.rb +1 -1
- data/lib/bio/sequence/compat.rb +1 -1
- data/lib/bio/sequence/na.rb +63 -12
- data/lib/bio/shell.rb +0 -2
- data/lib/bio/shell/core.rb +5 -6
- data/lib/bio/shell/interface.rb +3 -4
- data/lib/bio/shell/irb.rb +1 -2
- data/lib/bio/shell/plugin/entry.rb +2 -3
- data/lib/bio/shell/plugin/seq.rb +7 -6
- data/lib/bio/shell/setup.rb +1 -2
- data/lib/bio/tree.rb +2 -2
- data/lib/bio/util/contingency_table.rb +0 -2
- data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +2 -2
- data/lib/bio/util/sirna.rb +76 -16
- data/lib/bio/version.rb +8 -9
- data/sample/benchmark_clustalw_report.rb +47 -0
- data/sample/biofetch.rb +248 -151
- data/setup.rb +6 -7
- data/test/data/clustalw/example1-seqnos.aln +58 -0
- data/test/network/bio/appl/blast/test_remote.rb +1 -15
- data/test/network/bio/appl/test_blast.rb +0 -12
- data/test/network/bio/io/test_pubmed.rb +49 -0
- data/test/network/bio/io/test_togows.rb +0 -1
- data/test/network/bio/test_command.rb +65 -2
- data/test/unit/bio/appl/bl2seq/test_report.rb +0 -1
- data/test/unit/bio/appl/blast/test_report.rb +110 -48
- data/test/unit/bio/appl/clustalw/test_report.rb +67 -51
- data/test/unit/bio/appl/sim4/test_report.rb +46 -17
- data/test/unit/bio/appl/test_blast.rb +2 -2
- data/test/unit/bio/db/embl/test_embl.rb +0 -1
- data/test/unit/bio/db/embl/test_embl_rel89.rb +0 -1
- data/test/unit/bio/db/embl/{test_sptr.rb → test_uniprotkb.rb} +111 -115
- data/test/unit/bio/db/embl/{test_uniprot_new_part.rb → test_uniprotkb_new_part.rb} +11 -11
- data/test/unit/bio/db/genbank/test_genbank.rb +10 -4
- data/test/unit/bio/db/pdb/test_pdb.rb +14 -8
- data/test/unit/bio/db/test_fasta.rb +41 -1
- data/test/unit/bio/db/test_fastq.rb +14 -4
- data/test/unit/bio/db/test_gff.rb +2 -2
- data/test/unit/bio/db/test_phyloxml.rb +30 -30
- data/test/unit/bio/db/test_phyloxml_writer.rb +2 -2
- data/test/unit/bio/io/flatfile/test_autodetection.rb +1 -2
- data/test/unit/bio/io/flatfile/test_buffer.rb +7 -1
- data/test/unit/bio/io/flatfile/test_splitter.rb +1 -1
- data/test/unit/bio/io/test_togows.rb +3 -2
- data/test/unit/bio/sequence/test_dblink.rb +1 -1
- data/test/unit/bio/sequence/test_na.rb +3 -1
- data/test/unit/bio/test_alignment.rb +1 -2
- data/test/unit/bio/test_command.rb +5 -4
- data/test/unit/bio/test_db.rb +4 -2
- data/test/unit/bio/test_pathway.rb +25 -10
- data/test/unit/bio/util/test_sirna.rb +22 -22
- metadata +656 -1430
- data/doc/KEGG_API.rd +0 -1843
- data/doc/KEGG_API.rd.ja +0 -1834
- data/extconf.rb +0 -2
- data/lib/bio/appl/blast/ddbj.rb +0 -131
- data/lib/bio/db/kegg/taxonomy.rb +0 -280
- data/lib/bio/io/dbget.rb +0 -194
- data/lib/bio/io/ddbjrest.rb +0 -344
- data/lib/bio/io/ddbjxml.rb +0 -458
- data/lib/bio/io/ebisoap.rb +0 -158
- data/lib/bio/io/ensembl.rb +0 -229
- data/lib/bio/io/higet.rb +0 -73
- data/lib/bio/io/keggapi.rb +0 -363
- data/lib/bio/io/ncbisoap.rb +0 -156
- data/lib/bio/io/soapwsdl.rb +0 -119
- data/lib/bio/shell/plugin/keggapi.rb +0 -181
- data/lib/bio/shell/plugin/soap.rb +0 -87
- data/sample/dbget +0 -37
- data/sample/demo_ddbjxml.rb +0 -212
- data/sample/demo_kegg_taxonomy.rb +0 -92
- data/sample/demo_keggapi.rb +0 -502
- data/sample/psortplot_html.rb +0 -214
- data/test/network/bio/io/test_ddbjrest.rb +0 -47
- data/test/network/bio/io/test_ensembl.rb +0 -230
- data/test/network/bio/io/test_soapwsdl.rb +0 -53
- data/test/unit/bio/io/test_ddbjxml.rb +0 -81
- data/test/unit/bio/io/test_ensembl.rb +0 -111
- data/test/unit/bio/io/test_soapwsdl.rb +0 -33
@@ -1,41 +1,24 @@
|
|
1
1
|
#
|
2
|
-
# = bio/db/embl/swissprot.rb - SwissProt database class
|
2
|
+
# = bio/db/embl/swissprot.rb - (deprecated) SwissProt database class
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C)
|
4
|
+
# Copyright:: Copyright (C) 2013 BioRuby Project
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: swissprot.rb,v 1.7 2007/04/05 23:35:40 trevor Exp $
|
8
|
-
#
|
9
7
|
|
10
|
-
|
8
|
+
warn "Bio::SwissProt is deprecated. Use Bio::UniProtKB."
|
11
9
|
|
12
10
|
module Bio
|
13
11
|
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
# This class holds name space for SwissProt specific methods.
|
18
|
-
#
|
19
|
-
# SwissProt (before UniProtKB/SwissProt) specific methods are defined in
|
20
|
-
# this class. Shared methods for UniProtKB/SwissProt and TrEMBL classes
|
21
|
-
# are defined in Bio::SPTR class.
|
22
|
-
#
|
23
|
-
# == Examples
|
24
|
-
#
|
25
|
-
# str = File.read("p53_human.swiss")
|
26
|
-
# obj = Bio::SwissProt.new(str)
|
27
|
-
# obj.entry_id #=> "P53_HUMAN"
|
28
|
-
#
|
29
|
-
# == Referencees
|
30
|
-
#
|
31
|
-
# * Swiss-Prot Protein knowledgebase
|
32
|
-
# http://au.expasy.org/sprot/
|
33
|
-
#
|
34
|
-
# * Swiss-Prot Protein Knowledgebase User Manual
|
35
|
-
# http://au.expasy.org/sprot/userman.html
|
36
|
-
#
|
12
|
+
require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
|
13
|
+
|
14
|
+
# Bio::SwissProt is deprecated. Use Bio::UniProtKB.
|
37
15
|
class SwissProt < SPTR
|
38
|
-
|
16
|
+
|
17
|
+
# Bio::SwissProt is deprecated. Use Bio::UniProtKB.
|
18
|
+
def initialize(str)
|
19
|
+
warn "Bio::SwissProt is deprecated. Use Bio::UniProtKB."
|
20
|
+
super(str)
|
21
|
+
end
|
39
22
|
end
|
40
23
|
|
41
24
|
end
|
data/lib/bio/db/embl/trembl.rb
CHANGED
@@ -1,41 +1,24 @@
|
|
1
1
|
#
|
2
|
-
# = bio/db/embl/trembl.rb - TrEMBL database class
|
2
|
+
# = bio/db/embl/trembl.rb - (deprecated) TrEMBL database class
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C)
|
4
|
+
# Copyright:: Copyright (C) 2013 BioRuby Project
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: trembl.rb,v 1.7 2007/04/05 23:35:40 trevor Exp $
|
8
|
-
#
|
9
7
|
|
10
|
-
|
8
|
+
warn "Bio::TrEMBL is deprecated. Use Bio::UniProtKB."
|
11
9
|
|
12
10
|
module Bio
|
13
11
|
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
# str = File.read("Q2UNG2_ASPOR.trembl")
|
26
|
-
# obj = Bio::TrEMBL.new(str)
|
27
|
-
# obj.entry_id #=> "Q2UNG2_ASPOR"
|
28
|
-
#
|
29
|
-
# == Referencees
|
30
|
-
#
|
31
|
-
# * TrEMBL Computer-annotated supplement to Swiss-Prot
|
32
|
-
# http://au.expasy.org/sprot/
|
33
|
-
#
|
34
|
-
# * TrEMBL Computer-annotated supplement to Swiss-Prot User Manual
|
35
|
-
# http://au.expasy.org/sprot/userman.html
|
36
|
-
#
|
37
|
-
class TrEMBL < SPTR
|
38
|
-
# Nothing to do (TrEMBL format is abstracted in SPTR)
|
12
|
+
require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
|
13
|
+
|
14
|
+
# Bio::TrEMBL is deprecated. Use Bio::UniProtKB.
|
15
|
+
class TrEMBL < UniProtKB
|
16
|
+
|
17
|
+
# Bio::TrEMBL is deprecated. Use Bio::UniProtKB.
|
18
|
+
def initialize(str)
|
19
|
+
warn "Bio::TrEMBL is deprecated. Use Bio::UniProtKB."
|
20
|
+
super(str)
|
21
|
+
end
|
39
22
|
end
|
40
23
|
|
41
24
|
end
|
data/lib/bio/db/embl/uniprot.rb
CHANGED
@@ -1,42 +1,25 @@
|
|
1
1
|
#
|
2
2
|
# = bio/db/embl/uniprot.rb - UniProt database class
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C)
|
4
|
+
# Copyright:: Copyright (C) 2013 BioRuby Project
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: uniprot.rb,v 1.5 2007/04/05 23:35:40 trevor Exp $
|
8
7
|
#
|
9
8
|
|
10
|
-
|
9
|
+
warn "Bio::UniProt is an alias of Bio::UniProtKB. Please use Bio::UniProtKB. Bio::UniProt may be deprecated in the future." if $VERBOSE
|
11
10
|
|
12
11
|
module Bio
|
13
12
|
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
|
24
|
-
#
|
25
|
-
# str = File.read("p53_human.swiss")
|
26
|
-
# obj = Bio::UniProt.new(str)
|
27
|
-
# obj.entry_id #=> "P53_HUMAN"
|
28
|
-
#
|
29
|
-
# == Referencees
|
30
|
-
#
|
31
|
-
# * UniProt
|
32
|
-
# http://uniprot.org/
|
33
|
-
#
|
34
|
-
# * The UniProtKB/SwissProt/TrEMBL User Manual
|
35
|
-
# http://www.expasy.org/sprot/userman.html
|
36
|
-
#
|
37
|
-
class UniProt < SPTR
|
38
|
-
# Nothing to do (UniProt format is abstracted in SPTR)
|
39
|
-
end
|
13
|
+
require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
|
14
|
+
|
15
|
+
# Bio::UniProt is changed to an alias of Bio::UniProtKB.
|
16
|
+
# Please use Bio::UniProtKB.
|
17
|
+
# Bio::UniProt may be deprecated in the future.
|
18
|
+
#
|
19
|
+
# Note that Bio::SPTR has been renamed to Bio::UniProtKB and
|
20
|
+
# is also an alias of Bio::UniProtKB.
|
21
|
+
#
|
22
|
+
UniProt = UniProtKB
|
40
23
|
|
41
24
|
end
|
42
25
|
|
@@ -0,0 +1,1455 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/embl/uniprotkb.rb - UniProtKB data parser class
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
|
5
|
+
# License:: The Ruby License
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# == Description
|
9
|
+
#
|
10
|
+
# See Bio::UniProtKB documents.
|
11
|
+
#
|
12
|
+
|
13
|
+
require 'bio/db'
|
14
|
+
require 'bio/db/embl/common'
|
15
|
+
|
16
|
+
module Bio
|
17
|
+
|
18
|
+
# == Description
|
19
|
+
#
|
20
|
+
# Parser class for UniProtKB/SwissProt and TrEMBL database entry.
|
21
|
+
#
|
22
|
+
# See the UniProtKB document files and manuals.
|
23
|
+
#
|
24
|
+
# == Examples
|
25
|
+
#
|
26
|
+
# str = File.read("p53_human.swiss")
|
27
|
+
# obj = Bio::UniProtKB.new(str)
|
28
|
+
# obj.entry_id #=> "P53_HUMAN"
|
29
|
+
#
|
30
|
+
# == References
|
31
|
+
#
|
32
|
+
# * The UniProt Knowledgebase (UniProtKB)
|
33
|
+
# http://www.uniprot.org/help/uniprotkb
|
34
|
+
#
|
35
|
+
# * The Universal Protein Resource (UniProt)
|
36
|
+
# http://uniprot.org/
|
37
|
+
#
|
38
|
+
# * The UniProtKB/SwissProt/TrEMBL User Manual
|
39
|
+
# http://www.uniprot.org/docs/userman.html
|
40
|
+
#
|
41
|
+
class UniProtKB < EMBLDB
|
42
|
+
include Bio::EMBLDB::Common
|
43
|
+
|
44
|
+
@@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
|
45
|
+
@@data_class = ["STANDARD", "PRELIMINARY"]
|
46
|
+
|
47
|
+
# returns a Hash of the ID line.
|
48
|
+
#
|
49
|
+
# returns a content (Int or String) of the ID line by a given key.
|
50
|
+
# Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
|
51
|
+
#
|
52
|
+
# === ID Line (since UniProtKB release 9.0 of 31-Oct-2006)
|
53
|
+
# ID P53_HUMAN Reviewed; 393 AA.
|
54
|
+
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
|
55
|
+
#
|
56
|
+
# === Examples
|
57
|
+
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed",
|
58
|
+
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}
|
59
|
+
#
|
60
|
+
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
61
|
+
#
|
62
|
+
#
|
63
|
+
# === ID Line (older style)
|
64
|
+
# ID P53_HUMAN STANDARD; PRT; 393 AA.
|
65
|
+
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
|
66
|
+
#
|
67
|
+
# === Examples
|
68
|
+
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD",
|
69
|
+
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}
|
70
|
+
#
|
71
|
+
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
72
|
+
#
|
73
|
+
# Returns the parsed ID line as a Hash, or the value stored under +key+
# when +key+ is given.
#
# The Hash has the keys 'ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE' and
# 'SEQUENCE_LENGTH' and is cached in @data['ID'].  'MOLECULE_TYPE' is nil
# when the fifth space-separated token of the raw line is "AA." (the layout
# without a molecule type column).
def id_line(key = nil)
  return id_line[key] if key

  cached = @data['ID']
  return cached if cached

  fields = @orig['ID'].split(/ +/)
  without_molecule_column = (fields[4].to_s.chomp == 'AA.')

  if without_molecule_column
    # after UniProtKB release 9.0 of 31-Oct-2006
    # (http://www.uniprot.org/docs/sp_news.htm)
    mol_type = nil
    seq_len  = fields[3].to_i
  else
    mol_type = fields[3].sub(/;/,'')
    seq_len  = fields[4].to_i
  end

  @data['ID'] = {
    'ENTRY_NAME'      => fields[1],
    'DATA_CLASS'      => fields[2].sub(/;/,''),
    'MOLECULE_TYPE'   => mol_type,
    'SEQUENCE_LENGTH' => seq_len
  }
end
|
94
|
+
|
95
|
+
|
96
|
+
# returns a ENTRY_NAME in the ID line.
|
97
|
+
#
|
98
|
+
# Returns the 'ENTRY_NAME' value of the ID line (delegates to #id_line).
def entry_id
  field = 'ENTRY_NAME'
  id_line(field)
end
|
101
|
+
alias entry_name entry_id
|
102
|
+
alias entry entry_id
|
103
|
+
|
104
|
+
|
105
|
+
# returns a MOLECULE_TYPE in the ID line.
|
106
|
+
#
|
107
|
+
# A short-cut for Bio::UniProtKB#id_line('MOLECULE_TYPE').
|
108
|
+
# Returns the 'MOLECULE_TYPE' value of the ID line (delegates to #id_line).
def molecule
  field = 'MOLECULE_TYPE'
  id_line(field)
end
|
111
|
+
alias molecule_type molecule
|
112
|
+
|
113
|
+
|
114
|
+
# returns a SEQUENCE_LENGTH in the ID line.
|
115
|
+
#
|
116
|
+
# A short-cut for Bio::UniProtKB#id_line('SEQUENCE_LENGTH').
|
117
|
+
# Returns the 'SEQUENCE_LENGTH' value of the ID line (delegates to #id_line).
def sequence_length
  field = 'SEQUENCE_LENGTH'
  id_line(field)
end
|
120
|
+
alias aalen sequence_length
|
121
|
+
|
122
|
+
|
123
|
+
# Bio::EMBLDB::Common#ac -> ary
|
124
|
+
# #accessions -> ary
|
125
|
+
# #accession -> String (accessions.first)
|
126
|
+
@@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
# returns a Hash of information in the DT lines.
|
131
|
+
# hash keys:
|
132
|
+
# ['created', 'sequence', 'annotation']
|
133
|
+
#--
|
134
|
+
# also Symbols acceptable (ASAP):
|
135
|
+
# [:created, :sequence, :annotation]
|
136
|
+
#++
|
137
|
+
#
|
138
|
+
# Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is
|
139
|
+
# changed, and the word "annotation" is no longer used in DT lines.
|
140
|
+
# Despite the change, the word "annotation" is still used for keeping
|
141
|
+
# compatibility.
|
142
|
+
#
|
143
|
+
# returns a String of information in the DT lines by a given key.
|
144
|
+
#
|
145
|
+
# === DT Line; date (3/entry)
|
146
|
+
# DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.)
|
147
|
+
# DT DD-MMM-YYY (sequence version NN)
|
148
|
+
# DT DD-MMM-YYY (entry version NN)
|
149
|
+
#
|
150
|
+
# The format has been changed in UniProtKB release 7.0 of 07-Feb-2006.
|
151
|
+
# Below is the older format.
|
152
|
+
#
|
153
|
+
# === Old format of DT Line; date (3/entry)
|
154
|
+
# DT DD-MMM-YYY (rel. NN, Created)
|
155
|
+
# DT DD-MMM-YYY (rel. NN, Last sequence update)
|
156
|
+
# DT DD-MMM-YYY (rel. NN, Last annotation update)
|
157
|
+
# Returns the DT lines as a Hash with the keys 'created', 'sequence' and
# 'annotation' (first, second and third DT line respectively), or the
# value stored under +key+ when +key+ is given.
#
# The Hash is cached in @data['DT'].  Each value is the line with its
# leading two-character tag removed and surrounding whitespace stripped.
def dt(key = nil)
  return dt[key] if key

  cached = @data['DT']
  return cached if cached

  lines = self.get('DT').split(/\n/)
  # drops the leading tag from one DT line and trims the remainder
  without_tag = lambda { |line| line.sub(/\w{2} /,'').strip }

  @data['DT'] = {
    'created'    => without_tag.call(lines[0]),
    'sequence'   => without_tag.call(lines[1]),
    'annotation' => without_tag.call(lines[2])
  }
end
|
168
|
+
|
169
|
+
|
170
|
+
# (private) parses DE line (description lines)
|
171
|
+
# since UniProtKB release 14.0 of 22-Jul-2008
|
172
|
+
#
|
173
|
+
# Return array containing array.
|
174
|
+
#
|
175
|
+
# http://www.uniprot.org/docs/sp_news.htm
|
176
|
+
# (private) Parses DE (description) lines written in the structured
# format used since UniProtKB release 14.0 of 22-Jul-2008.
#
# Returns nil when +str+ contains no line of the form
# "DE <RecName|AltName|SubName>: ", i.e. it is not in the new format.
# Otherwise returns an Array with one Array per category line:
#
#   [ 'RecName', ['Full', '...'], ['Short', '...'], ... ]
#   [ 'Includes' ]  or  [ 'Contains' ]
#   [ 'Flags', ['Precursor', 'Fragment'] ]
#
# The number of spaces between the "DE" tag and the category name is not
# significant.  Lines that cannot be interpreted are skipped with a
# warning.
def parse_DE_line_rel14(str)
  # Returns if it is not the new format since Rel.14
  return nil unless /^DE +(RecName|AltName|SubName)\: / =~ str

  ret = []
  cur = nil   # category Array that "Subcat=desc" pairs are appended to
  str.each_line do |line|
    case line
    when /^DE +(Includes|Contains)\: *$/
      # section marker: recorded on its own, following lines start afresh
      ret.push [ $1 ]
      cur = nil
      next
    when /^DE *(RecName|AltName|SubName)\: +(.*)/
      category = $1
      subcat_and_desc = $2
      cur = [ category ]
      ret.push cur
    when /^DE *(Flags)\: +(.*)/
      category = $1
      flags = $2.strip.split(/\s*\;\s*/)
      ret.push [ category, flags ]
      cur = nil
      next
    when /^DE *(.*)/
      # continuation line: belongs to the current category, if any
      subcat_and_desc = $1
    else
      warn "Warning: skipped DE line in unknown format: #{line.inspect}"
      next
    end

    case subcat_and_desc
    when nil
      # does nothing
    when /\A([^\=]+)\=(.*)/
      subcat = $1
      desc = $2
      desc.sub!(/\;\s*\z/, '')
      unless cur
        warn "Warning: unknown category in DE line: #{line.inspect}"
        cur = [ '' ]
        ret.push cur
      end
      cur.push [ subcat, desc ]
    else
      warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
    end
  end
  ret
end
|
229
|
+
private :parse_DE_line_rel14
|
230
|
+
|
231
|
+
# returns the proposed official name of the protein.
|
232
|
+
# Returns a String.
|
233
|
+
#
|
234
|
+
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has
|
235
|
+
# been changed. The method returns the full name which is taken from
|
236
|
+
# "RecName: Full=" or "SubName: Full=" line normally in the beginning of
|
237
|
+
# the DE lines.
|
238
|
+
# Unlike parser for old format, no special treatments for fragment or
|
239
|
+
# precursor.
|
240
|
+
#
|
241
|
+
# For old format, the method parses the DE lines and returns the protein
|
242
|
+
# name as a String.
|
243
|
+
#
|
244
|
+
# === DE Line; description (>=1)
|
245
|
+
# "DE #{OFFICIAL_NAME} (#{SYNONYM})"
|
246
|
+
# "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
|
247
|
+
# OFFICIAL_NAME 1/entry
|
248
|
+
# SYNONYM >=0
|
249
|
+
# CONTAINS >=0
|
250
|
+
# Returns the protein name as a String.
#
# When the DE lines parse as the structured format (see
# parse_DE_line_rel14), the value of the first "Full" sub-entry found
# under a 'RecName' or 'SubName' category is returned, or "" when there is
# none.  Otherwise the old format is assumed: the text before the first
# "[" and before the first "(" is returned, with " (Fragment)" appended
# when that leading part mentions "fragment" (case-insensitive).
def protein_name
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  de_records = @data['DE']

  unless de_records
    # old format (before Rel. 13.x)
    old_name = ""
    raw_de = fetch('DE')
    if raw_de
      head = raw_de[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
      old_name = head[/^[^(]*/].strip
      old_name << ' (Fragment)' if head =~ /fragment/i
    end
    return old_name
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  full_name = nil
  de_records.each do |record|
    next unless record[0] == 'RecName' || record[0] == 'SubName'
    full_pair = record[1..-1].find { |pair| pair[0] == 'Full' }
    if full_pair
      full_name = full_pair[1]
      break
    end
  end
  full_name.to_s
end
|
277
|
+
|
278
|
+
|
279
|
+
# returns synonyms (unofficial and/or alternative names).
|
280
|
+
# Returns an Array containing String objects.
|
281
|
+
#
|
282
|
+
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has
|
283
|
+
# been changed. The method returns the full or short names which are
|
284
|
+
# taken from "RecName: Short=", "RecName: EC=", and AltName lines,
|
285
|
+
# except after "Contains:" or "Includes:".
|
286
|
+
# For keeping compatibility with old format parser, "RecName: EC=N.N.N.N"
|
287
|
+
# is reported as "EC N.N.N.N".
|
288
|
+
# In addition, to prevent confusion, "Allergen=" and "CD_antigen="
|
289
|
+
# prefixes are added for the corresponding fields.
|
290
|
+
#
|
291
|
+
# For old format, the method parses the DE lines and returns synonyms.
|
292
|
+
# synonyms are each placed in () following the official name on the DE line.
|
293
|
+
# Returns synonyms (unofficial and/or alternative names) as an Array of
# Strings.
#
# Structured DE format: every sub-entry value under 'RecName', 'SubName'
# or 'AltName' that differs from #protein_name is collected, stopping at
# the first 'Includes' or 'Contains' section.  'EC' values are reported as
# "EC N.N.N.N"; 'Allergen' and 'CD_antigen' values keep their key as a
# "Key=value" prefix.
#
# Old DE format: every parenthesized part outside "[...]" is collected,
# except those mentioning "fragment" (case-insensitive).
def synonyms
  ary = Array.new
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']
  if parsed_de_line then
    # since UniProtKB release 14.0 of 22-Jul-2008
    # looked up once: the official name does not change while iterating
    official_name = self.protein_name
    parsed_de_line.each do |a|
      case a[0]
      when 'Includes', 'Contains'
        break #the each loop
      when 'RecName', 'SubName', 'AltName'
        a[1..-1].each do |b|
          value = b[1]
          next if !value or value == official_name
          name = case b[0]
                 when 'EC'
                   "EC " + value
                 when 'Allergen', 'CD_antigen'
                   b[0] + '=' + value
                 else
                   value
                 end
          ary.push name
        end
      end #case a[0]
    end #parsed_de_line.each
  else
    # old format (before Rel. 13.x)
    if de_line = fetch('DE') then
      line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ].  That's the "contains" part
      line.scan(/\([^)]+/) do |synonym|
        unless synonym =~ /fragment/i then
          ary << synonym[1..-1].strip # index to remove the leading (
        end
      end
    end
  end
  return ary
end
|
332
|
+
|
333
|
+
|
334
|
+
# returns gene names in the GN line.
|
335
|
+
#
|
336
|
+
# New UniProt/SwissProt format:
|
337
|
+
# * Bio::UniProtKB#gn -> [ <gene record>* ]
|
338
|
+
# where <gene record> is:
|
339
|
+
# { :name => '...',
|
340
|
+
# :synonyms => [ 's1', 's2', ... ],
|
341
|
+
# :loci => [ 'l1', 'l2', ... ],
|
342
|
+
# :orfs => [ 'o1', 'o2', ... ]
|
343
|
+
# }
|
344
|
+
#
|
345
|
+
# Old format:
|
346
|
+
# * Bio::UniProtKB#gn -> Array # AND
|
347
|
+
# * Bio::UniProtKB#gn[0] -> Array # OR
|
348
|
+
#
|
349
|
+
# === GN Line: Gene name(s) (>=0, optional)
|
350
|
+
# Returns the parsed GN line(s), cached in @data['GN'].
#
# When the GN text contains any of "Name=", "ORFNames=",
# "OrderedLocusNames=" or "Synonyms=", it is parsed by gn_uniprot_parser;
# otherwise by gn_old_parser.
def gn
  return @data['GN'] if @data['GN']

  structured_markers = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
  gn_text = fetch('GN')
  if structured_markers.any? { |marker| marker === gn_text }
    @data['GN'] = gn_uniprot_parser
  else
    @data['GN'] = gn_old_parser
  end
  @data['GN']
end
|
361
|
+
|
362
|
+
|
363
|
+
# returns contents in the old style GN line.
|
364
|
+
# === GN Line: Gene name(s) (>=0, optional)
|
365
|
+
# GN HNS OR DRDX OR OSMZ OR BGLY.
|
366
|
+
# GN CECA1 AND CECA2.
|
367
|
+
# GN CECA1 AND (HOGE OR FUGA).
|
368
|
+
#
|
369
|
+
# GN NAME1 [(AND|OR) NAME]+.
|
370
|
+
#
|
371
|
+
# Bio::UniProtKB#gn -> Array # AND
|
372
|
+
# #gn[0] -> Array # OR
|
373
|
+
# #gene_names -> Array
|
374
|
+
# Parses an old-style GN line and stores the result in @data['GN'].
#
# The text is split on " AND " into groups; each group has its
# parentheses removed and is split on " OR " into stripped names.  The
# result is an Array of Arrays of Strings (empty when there is no GN
# content).
def gn_old_parser
  groups = Array.new
  if get('GN').size > 0
    and_parts = fetch('GN').sub(/\.$/,'').split(/ AND /)
    groups = and_parts.map do |group|
      group.gsub(/\(|\)/,'').split(/ OR /).map { |gene| gene.strip }
    end
  end
  @data['GN'] = groups
end
|
386
|
+
private :gn_old_parser
|
387
|
+
|
388
|
+
# returns contents in the structured GN line.
|
389
|
+
# The new format of the GN line is:
|
390
|
+
# GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
|
391
|
+
# GN ORFNames=[, ...];
|
392
|
+
#
|
393
|
+
# * Bio::UniProtKB#gn -> [ <gene record>* ]
|
394
|
+
# where <gene record> is:
|
395
|
+
# { :name => '...',
|
396
|
+
# :synonyms => [ 's1', 's2', ... ],
|
397
|
+
# :loci => [ 'l1', 'l2', ... ],
|
398
|
+
# :orfs => [ 'o1', 'o2', ... ]
|
399
|
+
# }
|
400
|
+
# Parses the structured GN line(s) and stores the result in @data['GN'].
#
# Returns an Array with one Hash per gene record:
#   { :name => '...', :synonyms => [...], :loci => [...], :orfs => [...] }
#
# Gene records are separated by a standalone word "and" surrounded by
# whitespace, so an "and" embedded in a gene name (e.g. "Sandman") does
# not split the record.
# NOTE(review): assumes fetch('GN') joins the GN lines with whitespace so
# that the separator appears as " and " -- confirm against Bio::DB#fetch.
def gn_uniprot_parser
  @data['GN'] = Array.new
  gn_line = fetch('GN').strip
  records = gn_line.split(/\s+and\s+/)
  records.each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    record.each_line(';') do |element|
      # The value is whatever follows "Key="; the terminating ';' is
      # removed only when present so that an unterminated last element
      # does not lose its final character.
      case element
      when /Name=/ then
        gene_hash[:name] = $'.chomp(';')
      when /Synonyms=/ then
        gene_hash[:synonyms] = $'.chomp(';').split(/\s*,\s*/)
      when /OrderedLocusNames=/ then
        gene_hash[:loci] = $'.chomp(';').split(/\s*,\s*/)
      when /ORFNames=/ then
        gene_hash[:orfs] = $'.chomp(';').split(/\s*,\s*/)
      end
    end
    @data['GN'] << gene_hash
  end
  return @data['GN']
end
|
422
|
+
private :gn_uniprot_parser
|
423
|
+
|
424
|
+
|
425
|
+
# returns an Array of gene names in the GN line.
|
426
|
+
# Returns gene names from the parsed GN data.
#
# Calls #gn first so that @data['GN'] is populated.  When the first
# element of @data['GN'] is a Hash, the :name of every record is returned;
# otherwise the first element itself is returned (nil when @data['GN'] is
# empty).
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  first_record = records.first
  return first_record unless first_record.class == Hash
  records.collect { |record| record[:name] }
end
|
434
|
+
|
435
|
+
|
436
|
+
# returns a String of the first gene name in the GN line.
|
437
|
+
# Returns the first element of #gene_names, or nil when #gene_names
# returns nil.
def gene_name
  names = self.gene_names
  return nil unless names
  names.first
end
|
440
|
+
|
441
|
+
|
442
|
+
# returns an Array of Hashes, or a String of the OS line when an index is given.
|
443
|
+
# * Bio::EMBLDB#os -> Array
|
444
|
+
# [{'name' => '(Human)', 'os' => 'Homo sapiens'},
|
445
|
+
# {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
|
446
|
+
# * Bio::UniProtKB#os[0] -> Hash
|
447
|
+
# {'name' => "(Human)", 'os' => 'Homo sapiens'}
|
448
|
+
# * Bio::UniProtKB#os[0]['name'] -> "(Human)"
|
449
|
+
# * Bio::UniProtKB#os(0) -> "Homo sapiens (Human)"
|
450
|
+
#
|
451
|
+
# === OS Line; organism species (>=1)
|
452
|
+
# OS Genus species (name).
|
453
|
+
# OS Genus species (name0) (name1).
|
454
|
+
# OS Genus species (name0) (name1).
|
455
|
+
# OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
|
456
|
+
# OS Homo sapiens (Human), and Rattus norvegicus (Rat)
|
457
|
+
# OS Hippotis sp. Clark and Watts 825.
|
458
|
+
# OS unknown cyperaceous sp.
|
459
|
+
# Returns the parsed OS line.
#
# Without an argument: an Array of Hashes {'name' => ..., 'os' => ...},
# one per organism, where 'os' is the organism text and 'name' is the
# parenthesized part (nil when there is none).  The Array is cached in
# @data['OS'].
#
# With +num+: the String "<os> <name>" of the organism at that index.
#
# Raises RuntimeError when a part of the OS line does not look like an
# organism name.
def os(num = nil)
  if !@data['OS']
    organisms = Array.new
    fetch('OS').split(/, and|, /).each do |chunk|
      unless chunk =~ /(\w+ *[\w \:\'\+\-\.]+[\w\.])/
        raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
      end
      species = $1
      # a failed match leaves $1 nil, which becomes the 'name' value
      chunk =~ /(\(.+\))/
      organisms.push({'name' => $1, 'os' => species})
    end
    @data['OS'] = organisms
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
end
|
481
|
+
|
482
|
+
|
483
|
+
# Bio::EMBLDB::Common#og -> Array
|
484
|
+
# OG Line; organella (0 or 1/entry)
|
485
|
+
# ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
|
486
|
+
# or a plasmid name (e.g. "Plasmid pBR322").
|
487
|
+
|
488
|
+
|
489
|
+
# Bio::EMBLDB::Common#oc -> Array
|
490
|
+
# OC Line; organism classification (>=1)
|
491
|
+
# "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
|
492
|
+
# "OC Theileria."
|
493
|
+
|
494
|
+
|
495
|
+
|
496
|
+
# returns a Hash of organism taxonomy cross-references.
|
497
|
+
# * Bio::UniProtKB#ox -> Hash
|
498
|
+
# {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
|
499
|
+
#
|
500
|
+
# === OX Line; organism taxonomy cross-reference (>=1 per entry)
|
501
|
+
# OX NCBI_TaxID=1234;
|
502
|
+
# OX NCBI_TaxID=1234, 2345, 3456, 4567;
|
503
|
+
# Returns the OX line as a Hash mapping a database name to an Array of
# identifier Strings, e.g. {'NCBI_TaxID' => ['1234', '2345']}.
# The Hash is cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  xrefs = Hash.new
  fetch('OX').sub(/\.$/,'').split(/;/).each do |item|
    db, refs = item.strip.split(/=/)
    xrefs[db] = refs.split(/, */)
  end
  @data['OX'] = xrefs
  return @data['OX']
end
|
515
|
+
|
516
|
+
# === The OH Line;
|
517
|
+
#
|
518
|
+
# OH NCBI_TaxID=TaxID; HostName.
|
519
|
+
# http://br.expasy.org/sprot/userman.html#OH_line
|
520
|
+
# Returns the OH line(s) as an Array of Hashes
# {'NCBI_TaxID' => String, 'HostName' => String or nil}, cached in
# @data['OH'].
#
# Raises ArgumentError when a host entry contains no "NCBI_TaxID=<digits>;".
def oh
  return @data['OH'] if @data['OH']

  @data['OH'] = fetch('OH').split("\. ").map { |host|
    unless host =~ /NCBI_TaxID=(\d+);/
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end
    taxid = $1

    host_name = nil
    if host =~ /NCBI_TaxID=\d+; (.+)/
      host_name = $1
      host_name.sub!(/\.$/, '')   # drop the terminating period, if any
    end

    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  }
  @data['OH']
end
|
541
|
+
|
542
|
+
|
543
|
+
|
544
|
+
# Bio::EMBLDB::Common#ref -> Array
|
545
|
+
# R Lines
|
546
|
+
# RN RC RP RX RA RT RL
|
547
|
+
|
548
|
+
# returns contents in the R lines.
|
549
|
+
# * Bio::EMBLDB::Common#ref -> [ <reference information Hash>* ]
|
550
|
+
# where <reference information Hash> is:
|
551
|
+
# {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
|
552
|
+
# 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
|
553
|
+
#
|
554
|
+
# R Lines
|
555
|
+
# * RN RC RP RX RA RT RL RG
|
556
|
+
# Returns the reference (R*) lines as an Array of Hashes, one per
# reference, cached in @data['R'].
#
# Each Hash has the keys 'RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL' and
# 'RG'.  The text of all lines sharing a tag is concatenated (each piece
# followed by a space) and then post-processed by the matching set_R*
# method.
#
# Raises RuntimeError when a line does not start with one of the R tags.
def ref
  return @data['R'] if @data['R']

  tags = ['RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL', 'RG']
  chunks = [get('R').split(/\nRN /)].flatten

  @data['R'] = chunks.map { |chunk|
    record = {}
    tags.each { |tag| record[tag] = '' }

    # splitting removed the leading tag from every chunk but the first
    chunk = 'RN ' + chunk unless /^RN / =~ chunk

    chunk.split("\n").each do |line|
      unless /^(R[NPXARLCTG]) (.+)/ =~ line
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      record[$1] += $2 + ' '
    end

    # set_RN, set_RC, ... set_RG, applied in tag order
    tags.each { |tag| record[tag] = __send__("set_#{tag}", record[tag]) }

    record
  }
  @data['R']
end
|
586
|
+
|
587
|
+
# Returns the RN text with leading and trailing whitespace removed.
def set_RN(data)
  trimmed = data.strip
  trimmed
end
|
590
|
+
|
591
|
+
# Parses the RC (reference comment) text into an Array of Hashes
# {'Token' => ..., 'Text' => ...}.
#
# Every "TOKEN=value;" pair whose token starts with S, T or P is
# recognized; a value listing several items separated by ", " or
# ", and " yields one Hash per item.  The value is matched up to the
# nearest ';' so that several pairs on one line are kept apart.
def set_RC(data)
  data.scan(/([STP]\w+)=(.+?);/).map { |token, value|
    value.split(/, and |, /).map { |text|
      {'Token' => token, 'Text' => text}
    }
  }.flatten
end
|
598
|
+
private :set_RC
|
599
|
+
|
600
|
+
# Parses the text collected from RP lines into an Array of position
# descriptions.
#
# The trailing period is dropped, the text is split on ", AND " / ", "
# (case-insensitively), and each item is trimmed. Runs of blanks inside an
# item are collapsed to a single blank; the previous gsub(' ', ' ')
# substituted a space with itself and therefore never did that.
def set_RP(data)
  text = data.strip.sub(/\.$/, '')
  text.split(/, AND |, /i).map do |item|
    item.strip.squeeze(' ')
  end
end
|
608
|
+
private :set_RP
|
609
|
+
|
610
|
+
# Extracts bibliographic cross-references from the text collected from RX
# lines.
#
# Returns a Hash with the keys 'MEDLINE', 'PubMed' and 'DOI'; a key maps to
# the identifier found in "KEY=identifier;" or to nil when absent.
def set_RX(data)
  xrefs = {}
  %w[MEDLINE PubMed DOI].each do |db|
    xrefs[db] = data[/#{db}=(.+?);/, 1]
  end
  xrefs
end
|
623
|
+
private :set_RX
|
624
|
+
|
625
|
+
# Cleans the text collected from RA lines by removing the terminating
# semicolon together with any blanks that follow it.
def set_RA(data)
  data.sub(/; *$/, '')
end
|
628
|
+
private :set_RA
|
629
|
+
|
630
|
+
# Cleans the text collected from RT lines: drops the terminating semicolon
# (plus following blanks) and then the double quotes at the start and end.
def set_RT(data)
  without_terminator = data.sub(/; *$/, '')
  without_terminator.gsub(/(^"|"$)/, '')
end
|
634
|
+
private :set_RT
|
635
|
+
|
636
|
+
# Normalizes the text collected from RL lines by trimming surrounding
# whitespace.
def set_RL(data)
  data.strip
end
|
639
|
+
private :set_RL
|
640
|
+
|
641
|
+
# Splits the text collected from RG lines on "; " and returns the pieces
# as an Array.
def set_RG(data)
  groups = data.split('; ')
  groups
end
|
644
|
+
private :set_RG
|
645
|
+
|
646
|
+
|
647
|
+
|
648
|
+
# returns Bio::Reference object from Bio::EMBLDB::Common#ref.
|
649
|
+
# * Bio::EMBLDB::Common#ref -> Bio::References
|
650
|
+
# Converts the Hashes returned by #ref into Reference objects wrapped in a
# References container; the result is memoized in @data['references'].
#
# Mapping performed per reference:
# * 'RA' -> 'authors' (split on ", ")
# * 'RT' -> 'title'
# * 'RL' -> 'journal', 'volume', 'issue', 'pages', 'year' when the text
#   matches "<journal> <vol> (<issue>), <from>-<to> (<year>)"; otherwise
#   the whole text becomes 'journal'
# * 'RX' -> one entry per cross-reference, keyed by the lower-cased tag
def references
  return @data['references'] if @data['references']

  list = self.ref.map do |entry|
    attrs = Hash.new('')
    entry.each do |tag, value|
      case tag
      when 'RA' then attrs['authors'] = value.split(/, /)
      when 'RT' then attrs['title'] = value
      when 'RL'
        location = /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/.match(value)
        if location
          attrs['journal'] = location[1]
          attrs['volume']  = location[2]
          attrs['issue']   = location[3]
          attrs['pages']   = location[4]
          attrs['year']    = location[5]
        else
          attrs['journal'] = value
        end
      when 'RX' # PUBMED, MEDLINE, DOI
        value.each { |db, id| attrs[db.downcase] = id }
      end
    end
    Reference.new(attrs)
  end

  @data['references'] = References.new(list)
end
|
682
|
+
|
683
|
+
|
684
|
+
|
685
|
+
|
686
|
+
|
687
|
+
|
688
|
+
# === The HI line
|
689
|
+
# Bio::UniProtKB#hi #=> hash
|
690
|
+
# Parses the HI line into an Array of Hashes (memoized in @data['HI']).
#
# The fetched text is split on ". "; each piece is expected to look like
# "Category: kw1; kw2; last." and becomes
#   {'Category' => str, 'Keywords' => [leading keywords], 'Keyword' => last}
# with the trailing period removed from the last keyword.
def hi
  return @data['HI'] if @data['HI']

  # The cache slot is assigned before parsing, so it is populated
  # incrementally while the pieces are processed.
  entries = @data['HI'] = []
  fetch('HI').split(/\. /).each do |piece|
    category, keyword_text = piece.split(': ')
    keywords = keyword_text.split('; ')
    last_keyword = keywords.pop
    last_keyword.sub!(/\.$/, '')
    entries << {'Category' => category,
                'Keywords' => keywords,
                'Keyword'  => last_keyword}
  end
  entries
end
|
704
|
+
|
705
|
+
|
706
|
+
# Topic names that may appear in CC lines ("CC -!- TOPIC: ...").
@@cc_topics = [
  'PHARMACEUTICAL', 'BIOTECHNOLOGY', 'TOXIC DOSE', 'ALLERGEN',
  'RNA EDITING', 'POLYMORPHISM', 'BIOPHYSICOCHEMICAL PROPERTIES',
  'MASS SPECTROMETRY', 'WEB RESOURCE', 'ENZYME REGULATION', 'DISEASE',
  'INTERACTION', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'CAUTION',
  'ALTERNATIVE PRODUCTS', 'DOMAIN', 'PTM', 'MISCELLANEOUS',
  'TISSUE SPECIFICITY', 'COFACTOR', 'PATHWAY', 'SUBUNIT',
  'CATALYTIC ACTIVITY', 'SUBCELLULAR LOCATION', 'FUNCTION',
  'SIMILARITY'
]
|
733
|
+
# returns contents in the CC lines.
|
734
|
+
# * Bio::UniProtKB#cc -> Hash
|
735
|
+
#
|
736
|
+
# returns an object of contents in the TOPIC.
|
737
|
+
# * Bio::UniProtKB#cc(TOPIC) -> Array w/in Hash, Hash
|
738
|
+
#
|
739
|
+
# returns contents of the "ALTERNATIVE PRODUCTS".
|
740
|
+
# * Bio::UniProtKB#cc('ALTERNATIVE PRODUCTS') -> Hash
|
741
|
+
# {'Event' => str,
|
742
|
+
# 'Named isoforms' => int,
|
743
|
+
# 'Comment' => str,
|
744
|
+
# 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
|
745
|
+
#
|
746
|
+
# CC -!- ALTERNATIVE PRODUCTS:
|
747
|
+
# CC Event=Alternative splicing; Named isoforms=15;
|
748
|
+
# ...
|
749
|
+
# CC placentae isoforms. All tissues differentially splice exon 13;
|
750
|
+
# CC Name=A; Synonyms=no del;
|
751
|
+
# CC IsoId=P15529-1; Sequence=Displayed;
|
752
|
+
#
|
753
|
+
# returns contents of the "DATABASE".
|
754
|
+
# * Bio::UniProtKB#cc('DATABASE') -> Array
|
755
|
+
# [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
|
756
|
+
#
|
757
|
+
# CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
|
758
|
+
#
|
759
|
+
# returns contents of the "MASS SPECTROMETRY".
|
760
|
+
# * Bio::UniProtKB#cc('MASS SPECTROMETRY') -> Array
|
761
|
+
# [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
|
762
|
+
#
|
763
|
+
# CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
|
764
|
+
#
|
765
|
+
# === CC lines (>=0, optional)
|
766
|
+
# CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
|
767
|
+
# CC IN LIVER, KIDNEY, LUNG AND BRAIN.
|
768
|
+
#
|
769
|
+
# CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
|
770
|
+
# CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
|
771
|
+
#
|
772
|
+
# See also http://www.expasy.org/sprot/userman.html#CC_line
|
773
|
+
#
|
774
|
+
# Returns the contents of the CC (comment) lines.
#
# Without an argument the whole topic table is returned: a Hash mapping
# each topic name to an Array with one String per "-!- TOPIC: text" block.
# The table is built once and memoized in @data['CC'].
#
# With +topic+ the entry for that topic is returned. Several topics are
# post-processed:
# * joined into one String: 'DEVELOPMENTAL STAGE', 'DISEASE',
#   'ENZYME REGULATION', 'FUNCTION', 'INDUCTION'
# * parsed by a cc_* helper: 'ALTERNATIVE PRODUCTS',
#   'BIOPHYSICOCHEMICAL PROPERTIES', 'CAUTION', 'INTERACTION',
#   'MASS SPECTROMETRY', 'PATHWAY', 'RNA EDITING',
#   'SUBCELLULAR LOCATION', 'WEB RESOURCE'
# * 'DATABASE' is parsed inline into
#   [{'NAME' => str, 'NOTE' => str, 'WWW' => str, 'FTP' => str}, ...]
# Every other topic returns the raw Array (or nil when absent).
#
# When the entry has no CC lines an empty Hash is returned regardless of
# +topic+, and nothing is cached.
def cc(topic = nil)
  unless @data['CC']
    parsed = Hash.new
    border = '-' * (77 - 4 + 1)
    marker = /-!- /

    # 12KD_MYCSM has no CC lines.
    return parsed if get('CC').size == 0

    text = fetch('CC')

    # Removing the copyright statement (edits the fetched String in place).
    text.sub!(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return parsed if text == ''

    begin
      text = text.split(/#{border}/)[0]
      text = text.sub(marker, '')
      text.split(marker).each do |chunk|
        chunk = chunk.strip

        unless /(^[A-Z ]+[A-Z]): (.+)/ =~ chunk
          raise ["Error: [#{entry_id}]: CC Lines", '"', chunk, '"',
                 '', get('CC'),''].join("\n")
        end

        key = $1
        # Re-join words hyphenated across a line break, except before AND.
        body = $2.gsub(/- (?!AND)/,'-').strip
        (parsed[key] ||= []) << body
      end
    rescue NameError
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    rescue NoMethodError
      # NOTE(review): NoMethodError is a subclass of NameError, so the
      # clause above already handles it and this one is never selected.
    end

    @data['CC'] = parsed
  end

  table = @data['CC']

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(table[topic])
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(table[topic])
  when 'CATALITIC ACTIVITY'
    cc_catalytic_activity(table[topic])
  when 'CAUTION'
    cc_caution(table[topic])
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    table[topic].join('')
  when 'INTERACTION'
    cc_interaction(table[topic])
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(table[topic])
  when 'PATHWAY'
    cc_pathway(table[topic])
  when 'RNA EDITING'
    cc_rna_editing(table[topic])
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(table[topic])
  when 'WEB RESOURCE'
    cc_web_resource(table[topic])
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    records = table['DATABASE']
    return records unless records

    records.map do |record|
      db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop the final character (the terminating period) before splitting.
      record.sub(/.$/,'').split(/;/).each do |field|
        case field
        when /NAME=(.+)/   then db['NAME'] = $1
        when /NOTE=(.+)/   then db['NOTE'] = $1
        when /WWW="(.+)"/  then db['WWW'] = $1
        when /FTP="(.+)"/  then db['FTP'] = $1
        end
      end
      db
    end
  when nil
    table
  else
    # Topics without special handling are returned as stored.
    table[topic]
  end
end
|
911
|
+
|
912
|
+
|
913
|
+
# Parses the "ALTERNATIVE PRODUCTS" comment blocks.
#
# +data+ is an Array of Strings, which are concatenated before parsing.
# Returns a Hash:
#   'Event'          => Array of events (split on ", "), or "" if absent
#   'Named isoforms' => String, or "" if absent
#   'Comment'        => String, or "" if absent
#   'Variants'       => one Hash per "Name=...Sequence=...;" group, built by
#                       cc_alternative_products_variants
def cc_alternative_products(data)
  text = data.join('')
  return text unless text

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  result = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
            'Variants' => []}

  event = /Event=(.+?);/.match(text)
  result['Event'] = event[1].sub(/;/,'').split(/, /) if event

  isoforms = /Named isoforms=(\S+?);/.match(text)
  result['Named isoforms'] = isoforms[1] if isoforms

  comment = /Comment=(.+?);/m.match(text)
  result['Comment'] = comment[1] if comment

  result['Variants'] = text.scan(/Name=.+?Sequence=.+?;/).map do |variant|
    cc_alternative_products_variants(variant)
  end

  return result
end
|
935
|
+
private :cc_alternative_products
|
936
|
+
|
937
|
+
# Parses one isoform description of an "ALTERNATIVE PRODUCTS" comment, e.g.
# "Name=A; Synonyms=no del; IsoId=P15529-1; Sequence=Displayed;".
#
# Returns a Hash with 'Name' (String) and 'Synonyms', 'IsoId', 'Sequence'
# (Arrays, split on ", "). Any other "key=value" pair found in the text is
# stored under its own key as a String.
def cc_alternative_products_variants(data)
  list_valued = ['Sequence', 'Synonyms', 'IsoId']
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}

  data.split(/; /).each do |pair|
    key, value = pair.split(/=/)
    value = value.sub(/;/,'').split(/, /) if list_valued.include?(key)
    variant[key] = value
  end

  variant
end
|
948
|
+
private :cc_alternative_products_variants
|
949
|
+
|
950
|
+
|
951
|
+
# Parses the first "BIOPHYSICOCHEMICAL PROPERTIES" comment block.
#
# Only data[0] is examined. Returns a Hash:
#   'Absorption'             => {'Abs(max)' => str, 'Note' => str}
#   'Kinetic parameters'     => {'KM' => str, 'Vmax' => str, 'Note' => str}
#   'pH dependence'          => str
#   'Redox potential'        => str
#   'Temperature dependence' => str
# Sub-keys are present only when found; the three plain entries default
# to "".
def cc_biophysiochemical_properties(data)
  text = data[0]

  absorption = {}
  absorption['Abs(max)'] = $1 if text =~ /Absorption: Abs\(max\)=(.+?);/
  absorption['Note'] = $1 if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/

  kinetics = {}
  if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    kinetics['KM'], kinetics['Vmax'] = $1, $2
  end
  kinetics['Note'] = $1 if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/

  properties = {'Absorption' => absorption,
                'Kinetic parameters' => kinetics,
                'pH dependence' => "",
                'Redox potential' => "",
                'Temperature dependence' => ""}

  # The remaining sections are all of the form "<label>: <text>;".
  ['pH dependence', 'Redox potential', 'Temperature dependence'].each do |label|
    properties[label] = $1 if text =~ /#{label}: (.+?);/
  end

  properties
end
|
983
|
+
private :cc_biophysiochemical_properties
|
984
|
+
|
985
|
+
|
986
|
+
# Concatenates all "CAUTION" comment blocks into a single String, with no
# separator between them.
def cc_caution(data)
  joined = data.join('')
  joined
end
|
989
|
+
private :cc_caution
|
990
|
+
|
991
|
+
|
992
|
+
# returns the contents of a line of the CC INTERACTION section.
|
993
|
+
#
|
994
|
+
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
995
|
+
# Parses the "INTERACTION" comment blocks.
#
# The blocks are concatenated and every
# "<partner>; NbExp=<n>; IntAct=<id>, <id>;" record becomes a Hash:
#   'SP_Ac'               => text before the colon in <partner>
#   'identifier'          => first word after the colon
#   'NbExp'               => String
#   'IntAct'              => Array of ids (split on ", ")
#   'optional_identifier' => trailing word of <partner> after a blank, or nil
# A partner containing "Self" (and no colon) uses this entry's entry_id for
# both 'SP_Ac' and 'identifier'.
def cc_interaction(data)
  text = data.join('')
  records = text.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)

  records.map do |record|
    partner, nb_exp, int_act = record.map { |field| field.strip }

    accession = identifier = optional = nil
    if partner =~ /^(.+):(.+)/
      accession  = $1
      identifier = $2.split(' ')[0]
    elsif partner =~ /Self/
      accession  = self.entry_id
      identifier = self.entry_id
    end
    optional = $1 if partner =~ /^.+:.+ (.+)/

    {'SP_Ac' => accession,
     'identifier' => identifier,
     'NbExp' => nb_exp,
     'IntAct' => int_act.split(', '),
     'optional_identifier' => optional}
  end
end
|
1020
|
+
private :cc_interaction
|
1021
|
+
|
1022
|
+
|
1023
|
+
# Parses the "MASS SPECTROMETRY" comment blocks.
#
#   MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# Returns +data+ unchanged when it is nil/false; otherwise one Hash per
# block with the keys 'MW', 'MW_ERR', 'METHOD', 'RANGE' and 'NOTE' (String
# values, nil when the field is absent).
def cc_mass_spectrometry(data)
  return data unless data

  # Tried in this order; the first pattern that matches a field wins.
  field_patterns = [['MW',     /MW=(.+)/],
                    ['MW_ERR', /MW_ERR=(.+)/],
                    ['METHOD', /METHOD=(.+)/],
                    ['RANGE',  /RANGE=(\d+-\d+)/], # RANGE class ?
                    ['NOTE',   /NOTE=(.+)/]]

  data.map do |record|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    # Drop the final character (the terminating period) before splitting.
    record.sub(/.$/,'').split(/;/).each do |field|
      field_patterns.each do |name, pattern|
        found = pattern.match(field)
        next unless found
        mass[name] = found[1]
        break
      end
    end
    mass
  end
end
|
1047
|
+
private :cc_mass_spectrometry
|
1048
|
+
|
1049
|
+
|
1050
|
+
# Parses the "PATHWAY" comment blocks.
#
# Every block has its trailing period removed and is split on "; ",
# " and " or ": "; only the parsed form of the first block is returned
# (nil when +data+ is empty).
def cc_pathway(data)
  without_period = data.map { |block| block.sub(/\.$/, '') }
  steps = without_period.map { |block| block.split(/; | and |: /) }
  steps.first
end
|
1055
|
+
private :cc_pathway
|
1056
|
+
|
1057
|
+
|
1058
|
+
# Parses the "RNA EDITING" comment blocks (concatenated before parsing).
#
# Returns {'Modified_positions' => Array of Strings, 'Note' => String};
# 'Note' is "" when no "Note=" field is present.
#
# Raises ArgumentError when no "Modified_positions=" field is found.
def cc_rna_editing(data)
  text = data.join('')

  unless text =~ /Modified_positions=(.+?)(\.|;)/
    raise ArgumentError, "Invarid CC RNA Editing lines (#{self.entry_id}):#{$!}\n#{get('CC')}"
  end
  positions = $1.sub(/\.$/, '').split(', ')

  note = (text =~ /Note=(.+)/) ? $1 : ""

  {'Modified_positions' => positions, 'Note' => note}
end
|
1071
|
+
private :cc_rna_editing
|
1072
|
+
|
1073
|
+
|
1074
|
+
# Parses the "SUBCELLULAR LOCATION" comment blocks.
#
# Each block is split into sentences on ". ", each sentence into parts on
# "; ", and a trailing period is removed from every part. Only the parsed
# form of the first block is returned (nil when +data+ is empty).
def cc_subcellular_location(data)
  parsed_blocks = data.map do |block|
    block.split('. ').map do |sentence|
      sentence.split('; ').map { |part| part.sub(/\.$/, '') }
    end
  end
  parsed_blocks.first
end
|
1083
|
+
private :cc_subcellular_location
|
1084
|
+
|
1085
|
+
|
1086
|
+
#--
|
1087
|
+
# Since UniProtKB release 12.2 of 11-Sep-2007:
|
1088
|
+
# CC -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress]. # Old format:
|
1089
|
+
# CC -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].
|
1090
|
+
#++
|
1091
|
+
|
1092
|
+
# Parses the "WEB RESOURCE" comment blocks.
#
# Returns one {'Name' => str, 'Note' => str, 'URL' => str} Hash per block
# (nil for absent fields). Both the mixed-case keys (Name=, Note=) and the
# upper-case keys (NAME=, NOTE=) are accepted and stored under the
# mixed-case key; the URL must be enclosed in double quotes.
def cc_web_resource(data)
  data.map do |block|
    resource = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    block.split(';').each do |field|
      if field =~ /(Name|Note)\=(.+)/
        resource[$1] = $2.strip
      elsif field =~ /(NAME|NOTE)\=(.+)/
        resource[$1.downcase.capitalize] = $2.strip
      elsif field =~ /URL\=\"(.+)\"/
        resource['URL'] = $1.strip
      end
    end
    resource
  end
end
|
1112
|
+
private :cc_web_resource
|
1113
|
+
|
1114
|
+
# returns databases cross-references in the DR lines.
|
1115
|
+
# * Bio::UniProtKB#dr -> Hash w/in Array
|
1116
|
+
#
|
1117
|
+
# === DR Line; database cross-reference (>=0)
|
1118
|
+
# DR database_identifier; primary_identifier; secondary_identifier.
|
1119
|
+
# one cross-reference per line
|
1120
|
+
# Database identifiers that may appear in the first field of a DR line.
#
# 'MGD' and 'MIM' are separate entries: without the comma between them Ruby
# concatenates the two adjacent string literals into the single bogus
# identifier 'MGDMIM'.
@@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
  'ECOGENE',
  'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
  'MAIZE-2DPAGE','MENDEL','MGD','MIM','PDB','PFAM','PIR','PRINTS',
  'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
  'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
|
1126
|
+
|
1127
|
+
# Backup Bio::EMBLDB#dr as embl_dr
|
1128
|
+
alias :embl_dr :dr
|
1129
|
+
|
1130
|
+
# Bio::UniProtKB#dr
|
1131
|
+
# Returns the database cross-references from the DR lines.
#
# Without +key+ the result of embl_dr (the aliased EMBL-level #dr) is
# returned as is. With +key+ the rows stored under that key are converted
# to Hashes whose keys 'Accession', 'Version', ' ' and 'Molecular Type'
# hold row elements 0 through 3; an unknown key yields an empty Array.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |row|
    {'Accession' => row[0],
     'Version' => row[1],
     ' ' => row[2],
     'Molecular Type' => row[3]}
  end
end
|
1143
|
+
|
1144
|
+
|
1145
|
+
# Bio::EMBLDB::Common#kw - Array
|
1146
|
+
# #keywords -> Array
|
1147
|
+
#
|
1148
|
+
# KW Line; keyword (>=1)
|
1149
|
+
# KW [Keyword;]+
|
1150
|
+
|
1151
|
+
|
1152
|
+
# returns contents in the feature table.
|
1153
|
+
#
|
1154
|
+
# == Examples
|
1155
|
+
#
|
1156
|
+
# sp = Bio::UniProtKB.new(entry)
|
1157
|
+
# ft = sp.ft
|
1158
|
+
# ft.class #=> Hash
|
1159
|
+
# ft.keys.each do |feature_key|
|
1160
|
+
# ft[feature_key].each do |feature|
|
1161
|
+
# feature['From'] #=> '1'
|
1162
|
+
# feature['To'] #=> '21'
|
1163
|
+
# feature['Description'] #=> ''
|
1164
|
+
# feature['FTId'] #=> ''
|
1165
|
+
# feature['diff'] #=> []
|
1166
|
+
# feature['original'] #=> [feature_key, '1', '21', '', '']
|
1167
|
+
# end
|
1168
|
+
# end
|
1169
|
+
#
|
1170
|
+
# * Bio::UniProtKB#ft -> Hash
|
1171
|
+
# {FEATURE_KEY => [{'From' => int, 'To' => int,
|
1172
|
+
# 'Description' => aStr, 'FTId' => aStr,
|
1173
|
+
# 'diff' => [original_residues, changed_residues],
|
1174
|
+
# 'original' => aAry }],...}
|
1175
|
+
#
|
1176
|
+
# returns an Array of the information about the feature_name in the feature table.
|
1177
|
+
# * Bio::UniProtKB#ft(feature_name) -> Array of Hash
|
1178
|
+
# [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
|
1179
|
+
#
|
1180
|
+
# == FT Line; feature table data (>=0, optional)
|
1181
|
+
#
|
1182
|
+
# Col Data item
|
1183
|
+
# ----- -----------------
|
1184
|
+
# 1- 2 FT
|
1185
|
+
# 6-13 Feature name
|
1186
|
+
# 15-20 `FROM' endpoint
|
1187
|
+
# 22-27 `TO' endpoint
|
1188
|
+
# 35-75 Description (>=0 per key)
|
1189
|
+
# ----- -----------------
|
1190
|
+
#
|
1191
|
+
# Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical characters
|
1192
|
+
# including '<', '>' or '?'. (c.f. '<1', '?42')
|
1193
|
+
#
|
1194
|
+
# See also http://www.expasy.org/sprot/userman.html#FT_line
|
1195
|
+
#
|
1196
|
+
# Returns the contents of the feature table (FT lines).
#
# Without an argument a Hash is returned (memoized in @data['FT']):
#   {FEATURE_KEY => [{'From' => int, 'To' => int, 'Description' => str,
#                     'FTId' => str, 'diff' => ary, 'original' => ary}, ...]}
# With +feature_key+ only the Array stored under that key is returned.
#
# For 'VARSPLIC', 'VARIANT', 'VAR_SEQ' and 'CONFLICT' features 'diff' is
# [original_residues, changed_residues]:
# * a description of the form "XXX -> YYY" yields ['XXX', 'YYY'] with blanks
#   removed;
# * a description containing "Missing" yields the affected subsequence and
#   '' (previously nil, because the result was read from a misspelled
#   local variable);
# * any other description yields [nil, nil].
# All other features keep 'diff' => [].
#
# A feature starts on a line with "FT" in columns 1-2 and the feature name
# beginning in column 6; any other line continues the previous feature.
# The header test now requires the name at column 6, agreeing with the
# column slices below; it previously demanded the name at column 4.
#
# Raises a RuntimeError ("Invalid FT Lines...") when parsing fails for any
# reason.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  table = []
  begin
    get('FT').split("\n").each do |line|
      if line =~ /^FT {3}\w/
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip,   # Feature Name
                  feature[14..19].strip,   # From
                  feature[21..26].strip,   # To
                  feature[34..74].strip ]  # Description
      else
        table.last << line.chomp.sub!(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map { |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid || '')
    }

    hash = {}
    table.each do |feature|
      name = feature[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To' => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (hash[name] ||= []) << entry

      case name
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Capture both groups before gsub replaces the last match data.
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /,'').strip
          changed_res = changed_res.gsub(/ /,'').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  @data['FT'] = hash
end
|
1261
|
+
|
1262
|
+
|
1263
|
+
|
1264
|
+
# returns a Hash of the contents of the SQ lines.
|
1265
|
+
# * Bio::UniProtKB#sq -> hsh
|
1266
|
+
#
|
1267
|
+
# returns a value of a key given in the SQ lines.
|
1268
|
+
# * Bio::UniProtKB#sq(key) -> int or str
|
1269
|
+
# * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
|
1270
|
+
# 'CRC64']
|
1271
|
+
#
|
1272
|
+
# === SQ Line; sequence header (1/entry)
|
1273
|
+
# SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
|
1274
|
+
# SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
|
1275
|
+
#
|
1276
|
+
# MW, Dalton unit.
|
1277
|
+
# CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
|
1278
|
+
# Returns the contents of the SQ (sequence header) line.
#
# Without +key+ the memoized Hash
#   {'aalen' => int, 'MW' => int, 'CRC64' => str}
# is returned. With +key+ a single value is returned: keys matching
# /mw/, /molecular/ or /weight/ select 'MW'; keys matching /len/, /length/
# or /AA/ select 'aalen'; any other key is looked up in the Hash directly.
#
# Raises a RuntimeError when the SQ line does not have the expected form.
def sq(key = nil)
  unless @data['SQ']
    unless fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
    @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
  end

  header = @data['SQ']
  return header unless key

  if [/mw/, /molecular/, /weight/].any? { |pattern| pattern === key }
    @data['SQ']['MW']
  elsif [/len/, /length/, /AA/].any? { |pattern| pattern === key }
    @data['SQ']['aalen']
  else
    @data['SQ'][key]
  end
end
|
1300
|
+
|
1301
|
+
|
1302
|
+
# returns a Bio::Sequence::AA of the amino acid sequence.
|
1303
|
+
# * Bio::UniProtKB#seq -> Bio::Sequence::AA
|
1304
|
+
#
|
1305
|
+
# blank Line; sequence data (>=1)
|
1306
|
+
# Returns the amino acid sequence as a Sequence::AA object.
#
# The sequence block is fetched under the empty tag, blanks and digit runs
# are removed, and the result is memoized in @data[''].
def seq
  @data[''] ||= Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
  return @data['']
end
|
1312
|
+
alias aaseq seq
|
1313
|
+
|
1314
|
+
end # class UniProtKB
|
1315
|
+
|
1316
|
+
end # module Bio
|
1317
|
+
|
1318
|
+
|
1319
|
+
|
1320
|
+
=begin
|
1321
|
+
|
1322
|
+
= Bio::UniProtKB < Bio::DB
|
1323
|
+
|
1324
|
+
Class for a entry in the SWISS-PROT/TrEMBL database.
|
1325
|
+
|
1326
|
+
* ((<URL:http://www.ebi.ac.uk/swissprot/>))
|
1327
|
+
* ((<URL:http://www.ebi.ac.uk/trembl/>))
|
1328
|
+
* ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
|
1329
|
+
|
1330
|
+
|
1331
|
+
--- Bio::UniProtKB.new(a_sp_entry)
|
1332
|
+
|
1333
|
+
=== ID line (Identification)
|
1334
|
+
|
1335
|
+
--- Bio::UniProtKB#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
|
1336
|
+
'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
|
1337
|
+
--- Bio::UniProtKB#id_line(key) -> str
|
1338
|
+
|
1339
|
+
key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
|
1340
|
+
|
1341
|
+
--- Bio::UniProtKB#entry_id -> str
|
1342
|
+
--- Bio::UniProtKB#molecule -> str
|
1343
|
+
--- Bio::UniProtKB#sequence_length -> int
|
1344
|
+
|
1345
|
+
|
1346
|
+
=== AC lines (Accession number)
|
1347
|
+
|
1348
|
+
--- Bio::UniProtKB#ac -> ary
|
1349
|
+
--- Bio::UniProtKB#accessions -> ary
|
1350
|
+
--- Bio::UniProtKB#accession -> accessions.first
|
1351
|
+
|
1352
|
+
|
1353
|
+
=== GN line (Gene name(s))
|
1354
|
+
|
1355
|
+
--- Bio::UniProtKB#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
|
1356
|
+
--- Bio::UniProtKB#gene_name -> str
|
1357
|
+
--- Bio::UniProtKB#gene_names -> [str] or [str]
|
1358
|
+
|
1359
|
+
|
1360
|
+
=== DT lines (Date)
|
1361
|
+
|
1362
|
+
--- Bio::UniProtKB#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
|
1363
|
+
--- Bio::UniProtKB#dt(key) -> str
|
1364
|
+
|
1365
|
+
key := (created|annotation|sequence)
|
1366
|
+
|
1367
|
+
|
1368
|
+
=== DE lines (Description)
|
1369
|
+
|
1370
|
+
--- Bio::UniProtKB#de -> str
|
1371
|
+
#definition -> str
|
1372
|
+
|
1373
|
+
--- Bio::UniProtKB#protein_name
|
1374
|
+
|
1375
|
+
Returns the proposed official name of the protein
|
1376
|
+
|
1377
|
+
|
1378
|
+
--- Bio::UniProtKB#synonyms
|
1379
|
+
|
1380
|
+
Returns an array of synonyms (unofficial names)
|
1381
|
+
|
1382
|
+
=== KW lines (Keyword)
|
1383
|
+
|
1384
|
+
--- Bio::UniProtKB#kw -> ary
|
1385
|
+
|
1386
|
+
=== OS lines (Organism species)
|
1387
|
+
|
1388
|
+
--- Bio::UniProtKB#os -> [{'name' => str, 'os' => str}, ...]
|
1389
|
+
|
1390
|
+
=== OC lines (organism classification)
|
1391
|
+
|
1392
|
+
--- Bio::UniProtKB#oc -> ary
|
1393
|
+
|
1394
|
+
=== OG line (Organella)
|
1395
|
+
|
1396
|
+
--- Bio::UniProtKB#og -> ary
|
1397
|
+
|
1398
|
+
=== OX line (Organism taxonomy cross-reference)
|
1399
|
+
|
1400
|
+
--- Bio::UniProtKB#ox -> {'NCBI_TaxID' => [], ...}
|
1401
|
+
|
1402
|
+
=== RN RC RP RX RA RT RL RG lines (Reference)
|
1403
|
+
|
1404
|
+
--- Bio::UniProtKB#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RC' => str, 'RG' => str},...]
|
1405
|
+
|
1406
|
+
=== DR lines (Database cross-reference)
|
1407
|
+
|
1408
|
+
--- Bio::UniProtKB#dr -> {'EMBL' => ary, ...}
|
1409
|
+
|
1410
|
+
=== FT lines (Feature table data)
|
1411
|
+
|
1412
|
+
--- Bio::UniProtKB#ft -> hsh
|
1413
|
+
|
1414
|
+
=== SQ lines (Sequence header and data)
|
1415
|
+
|
1416
|
+
--- Bio::UniProtKB#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
|
1417
|
+
--- Bio::UniProtKB#sq(key) -> int or str
|
1418
|
+
|
1419
|
+
key := (aalen|MW|CRC64)
|
1420
|
+
|
1421
|
+
--- Bio::UniProtKB#seq -> Bio::Sequence::AA
|
1422
|
+
#aaseq -> Bio::Sequence::AA
|
1423
|
+
|
1424
|
+
=end
|
1425
|
+
|
1426
|
+
# Content Occurrence in an entry
|
1427
|
+
# ---- --------------------------- --------------------------------
|
1428
|
+
# ID - identification (begins each entry; 1 per entry)
|
1429
|
+
# AC - accession number(s) (>=1 per entry)
|
1430
|
+
# DT - date (3 per entry)
|
1431
|
+
# DE - description (>=1 per entry)
|
1432
|
+
# GN - gene name(s) (>=0 per entry; optional)
|
1433
|
+
# OS - organism species (>=1 per entry)
|
1434
|
+
# OG - organelle (0 or 1 per entry; optional)
|
1435
|
+
# OC - organism classification (>=1 per entry)
|
1436
|
+
# OX - organism taxonomy x-ref (>=1 per entry)
|
1437
|
+
# OH - Organism Host
|
1438
|
+
# RN - reference number (>=1 per entry)
|
1439
|
+
# RP - reference positions (>=1 per entry)
|
1440
|
+
# RC - reference comment(s) (>=0 per entry; optional)
|
1441
|
+
# RX - reference cross-reference(s) (>=0 per entry; optional)
|
1442
|
+
# RA - reference author(s) (>=1 per entry)
|
1443
|
+
# RT - reference title (>=0 per entry; optional)
|
1444
|
+
# RL - reference location (>=1 per entry)
|
1445
|
+
# RG - reference group(s)
|
1446
|
+
# CC - comments or notes (>=0 per entry; optional)
|
1447
|
+
# DR - database cross-references (>=0 per entry; optional)
|
1448
|
+
# KW - keywords (>=1 per entry)
|
1449
|
+
# FT - feature table data (>=0 per entry; optional)
|
1450
|
+
# SQ - sequence header (1 per entry)
|
1451
|
+
# - (blanks) The sequence data (>=1 per entry)
|
1452
|
+
# // - termination line (ends each entry; 1 per entry)
|
1453
|
+
# ---- --------------------------- --------------------------------
|
1454
|
+
|
1455
|
+
|