bio 1.4.3.0001 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +39 -33
- data/BSDL +22 -0
- data/COPYING +2 -2
- data/COPYING.ja +36 -36
- data/ChangeLog +2404 -1025
- data/KNOWN_ISSUES.rdoc +15 -55
- data/README.rdoc +17 -23
- data/RELEASE_NOTES.rdoc +246 -183
- data/Rakefile +3 -2
- data/bin/br_biofetch.rb +29 -5
- data/bioruby.gemspec +15 -32
- data/bioruby.gemspec.erb +10 -20
- data/doc/ChangeLog-1.4.3 +1478 -0
- data/doc/RELEASE_NOTES-1.4.3.rdoc +204 -0
- data/doc/Tutorial.rd +0 -6
- data/doc/Tutorial.rd.html +7 -12
- data/doc/Tutorial.rd.ja +960 -1064
- data/doc/Tutorial.rd.ja.html +977 -1067
- data/gemfiles/Gemfile.travis-jruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-jruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-rbx +13 -0
- data/gemfiles/Gemfile.travis-ruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-ruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-ruby2.2 +9 -0
- data/lib/bio.rb +10 -43
- data/lib/bio/alignment.rb +8 -14
- data/lib/bio/appl/blast.rb +1 -2
- data/lib/bio/appl/blast/format0.rb +18 -7
- data/lib/bio/appl/blast/remote.rb +0 -9
- data/lib/bio/appl/blast/report.rb +1 -1
- data/lib/bio/appl/clustalw/report.rb +3 -1
- data/lib/bio/appl/genscan/report.rb +1 -2
- data/lib/bio/appl/iprscan/report.rb +1 -2
- data/lib/bio/appl/meme/mast.rb +4 -4
- data/lib/bio/appl/meme/mast/report.rb +1 -1
- data/lib/bio/appl/paml/codeml.rb +2 -2
- data/lib/bio/appl/paml/codeml/report.rb +1 -0
- data/lib/bio/appl/paml/common.rb +1 -1
- data/lib/bio/appl/sosui/report.rb +1 -2
- data/lib/bio/command.rb +62 -2
- data/lib/bio/data/aa.rb +13 -31
- data/lib/bio/data/codontable.rb +1 -2
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +1 -0
- data/lib/bio/db/biosql/sequence.rb +1 -1
- data/lib/bio/db/embl/common.rb +1 -1
- data/lib/bio/db/embl/embl.rb +5 -4
- data/lib/bio/db/embl/format_embl.rb +3 -3
- data/lib/bio/db/embl/sptr.rb +9 -1444
- data/lib/bio/db/embl/swissprot.rb +12 -29
- data/lib/bio/db/embl/trembl.rb +13 -30
- data/lib/bio/db/embl/uniprot.rb +12 -29
- data/lib/bio/db/embl/uniprotkb.rb +1455 -0
- data/lib/bio/db/fasta.rb +17 -0
- data/lib/bio/db/fasta/defline.rb +1 -3
- data/lib/bio/db/fastq.rb +1 -1
- data/lib/bio/db/genbank/ddbj.rb +9 -5
- data/lib/bio/db/genbank/refseq.rb +11 -3
- data/lib/bio/db/gff.rb +3 -4
- data/lib/bio/db/go.rb +5 -6
- data/lib/bio/db/kegg/module.rb +4 -5
- data/lib/bio/db/kegg/pathway.rb +4 -5
- data/lib/bio/db/kegg/reaction.rb +1 -1
- data/lib/bio/db/nexus.rb +3 -2
- data/lib/bio/db/pdb/pdb.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +82 -59
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +1 -2
- data/lib/bio/db/sanger_chromatogram/chromatogram.rb +1 -2
- data/lib/bio/db/transfac.rb +1 -1
- data/lib/bio/io/das.rb +40 -41
- data/lib/bio/io/fastacmd.rb +0 -16
- data/lib/bio/io/fetch.rb +111 -55
- data/lib/bio/io/flatfile/buffer.rb +4 -5
- data/lib/bio/io/hinv.rb +2 -3
- data/lib/bio/io/ncbirest.rb +43 -6
- data/lib/bio/io/pubmed.rb +76 -81
- data/lib/bio/io/togows.rb +33 -10
- data/lib/bio/map.rb +1 -1
- data/lib/bio/pathway.rb +1 -1
- data/lib/bio/sequence/compat.rb +1 -1
- data/lib/bio/sequence/na.rb +63 -12
- data/lib/bio/shell.rb +0 -2
- data/lib/bio/shell/core.rb +5 -6
- data/lib/bio/shell/interface.rb +3 -4
- data/lib/bio/shell/irb.rb +1 -2
- data/lib/bio/shell/plugin/entry.rb +2 -3
- data/lib/bio/shell/plugin/seq.rb +7 -6
- data/lib/bio/shell/setup.rb +1 -2
- data/lib/bio/tree.rb +2 -2
- data/lib/bio/util/contingency_table.rb +0 -2
- data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +2 -2
- data/lib/bio/util/sirna.rb +76 -16
- data/lib/bio/version.rb +8 -9
- data/sample/benchmark_clustalw_report.rb +47 -0
- data/sample/biofetch.rb +248 -151
- data/setup.rb +6 -7
- data/test/data/clustalw/example1-seqnos.aln +58 -0
- data/test/network/bio/appl/blast/test_remote.rb +1 -15
- data/test/network/bio/appl/test_blast.rb +0 -12
- data/test/network/bio/io/test_pubmed.rb +49 -0
- data/test/network/bio/io/test_togows.rb +0 -1
- data/test/network/bio/test_command.rb +65 -2
- data/test/unit/bio/appl/bl2seq/test_report.rb +0 -1
- data/test/unit/bio/appl/blast/test_report.rb +110 -48
- data/test/unit/bio/appl/clustalw/test_report.rb +67 -51
- data/test/unit/bio/appl/sim4/test_report.rb +46 -17
- data/test/unit/bio/appl/test_blast.rb +2 -2
- data/test/unit/bio/db/embl/test_embl.rb +0 -1
- data/test/unit/bio/db/embl/test_embl_rel89.rb +0 -1
- data/test/unit/bio/db/embl/{test_sptr.rb → test_uniprotkb.rb} +111 -115
- data/test/unit/bio/db/embl/{test_uniprot_new_part.rb → test_uniprotkb_new_part.rb} +11 -11
- data/test/unit/bio/db/genbank/test_genbank.rb +10 -4
- data/test/unit/bio/db/pdb/test_pdb.rb +14 -8
- data/test/unit/bio/db/test_fasta.rb +41 -1
- data/test/unit/bio/db/test_fastq.rb +14 -4
- data/test/unit/bio/db/test_gff.rb +2 -2
- data/test/unit/bio/db/test_phyloxml.rb +30 -30
- data/test/unit/bio/db/test_phyloxml_writer.rb +2 -2
- data/test/unit/bio/io/flatfile/test_autodetection.rb +1 -2
- data/test/unit/bio/io/flatfile/test_buffer.rb +7 -1
- data/test/unit/bio/io/flatfile/test_splitter.rb +1 -1
- data/test/unit/bio/io/test_togows.rb +3 -2
- data/test/unit/bio/sequence/test_dblink.rb +1 -1
- data/test/unit/bio/sequence/test_na.rb +3 -1
- data/test/unit/bio/test_alignment.rb +1 -2
- data/test/unit/bio/test_command.rb +5 -4
- data/test/unit/bio/test_db.rb +4 -2
- data/test/unit/bio/test_pathway.rb +25 -10
- data/test/unit/bio/util/test_sirna.rb +22 -22
- metadata +656 -1430
- data/doc/KEGG_API.rd +0 -1843
- data/doc/KEGG_API.rd.ja +0 -1834
- data/extconf.rb +0 -2
- data/lib/bio/appl/blast/ddbj.rb +0 -131
- data/lib/bio/db/kegg/taxonomy.rb +0 -280
- data/lib/bio/io/dbget.rb +0 -194
- data/lib/bio/io/ddbjrest.rb +0 -344
- data/lib/bio/io/ddbjxml.rb +0 -458
- data/lib/bio/io/ebisoap.rb +0 -158
- data/lib/bio/io/ensembl.rb +0 -229
- data/lib/bio/io/higet.rb +0 -73
- data/lib/bio/io/keggapi.rb +0 -363
- data/lib/bio/io/ncbisoap.rb +0 -156
- data/lib/bio/io/soapwsdl.rb +0 -119
- data/lib/bio/shell/plugin/keggapi.rb +0 -181
- data/lib/bio/shell/plugin/soap.rb +0 -87
- data/sample/dbget +0 -37
- data/sample/demo_ddbjxml.rb +0 -212
- data/sample/demo_kegg_taxonomy.rb +0 -92
- data/sample/demo_keggapi.rb +0 -502
- data/sample/psortplot_html.rb +0 -214
- data/test/network/bio/io/test_ddbjrest.rb +0 -47
- data/test/network/bio/io/test_ensembl.rb +0 -230
- data/test/network/bio/io/test_soapwsdl.rb +0 -53
- data/test/unit/bio/io/test_ddbjxml.rb +0 -81
- data/test/unit/bio/io/test_ensembl.rb +0 -111
- data/test/unit/bio/io/test_soapwsdl.rb +0 -33
@@ -5,7 +5,6 @@
|
|
5
5
|
# Mitsuteru C. Nakao <n@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id:$
|
9
8
|
#
|
10
9
|
# == Example
|
11
10
|
#
|
@@ -60,7 +59,7 @@ module Bio
|
|
60
59
|
if /NUMBER OF TM HELIX = (\d+)/ =~ line
|
61
60
|
@tms = $1
|
62
61
|
elsif /TM (\d+) +(\d+)- *(\d+) (\w+) +(\w+)/ =~ line
|
63
|
-
tmh = $1.to_i
|
62
|
+
#tmh = $1.to_i
|
64
63
|
range = Range.new($2.to_i, $3.to_i)
|
65
64
|
grade = $4
|
66
65
|
seq = $5
|
data/lib/bio/command.rb
CHANGED
@@ -6,7 +6,6 @@
|
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id:$
|
10
9
|
#
|
11
10
|
|
12
11
|
require 'open3'
|
@@ -856,7 +855,7 @@ module Command
|
|
856
855
|
end.join('&')
|
857
856
|
end
|
858
857
|
when String
|
859
|
-
|
858
|
+
raise TypeError, 'Bio::Command.make_cgi_params no longer accepts a single String as a form'
|
860
859
|
end
|
861
860
|
return data
|
862
861
|
end
|
@@ -882,6 +881,67 @@ module Command
|
|
882
881
|
return result
|
883
882
|
end
|
884
883
|
|
884
|
+
# Same as:
|
885
|
+
# http = Net::HTTP.new(...); http.post(path, data, header)
|
886
|
+
# and
|
887
|
+
# it uses proxy if an environment variable (same as OpenURI.open_uri)
|
888
|
+
# is set.
|
889
|
+
# In addition, +header+ can be set.
|
890
|
+
# (Default Content-Type is application/octet-stream.
|
891
|
+
# Content-Length is automatically set by default.)
|
892
|
+
# +uri+ must be a URI object, +params+ must be a hash, and
|
893
|
+
# +header+ must be a hash.
|
894
|
+
#
|
895
|
+
# ---
|
896
|
+
# *Arguments*:
|
897
|
+
# * (required) _http_: Net::HTTP object or compatible object
|
898
|
+
# * (required) _path_: String
|
899
|
+
# * (required) _data_: String containing data
|
900
|
+
# * (optional) _header_: Hash containing header strings
|
901
|
+
# *Returns*:: (same as Net::HTTP::post)
|
902
|
+
def http_post(http, path, data, header = {})
|
903
|
+
hash = {
|
904
|
+
'Content-Type' => 'application/octet-stream',
|
905
|
+
'Content-Length' => data.length.to_s
|
906
|
+
}
|
907
|
+
hash.update(header)
|
908
|
+
|
909
|
+
http.post(path, data, hash)
|
910
|
+
end
|
911
|
+
|
912
|
+
# Same as:
|
913
|
+
# Net::HTTP.post(uri, params)
|
914
|
+
# and
|
915
|
+
# it uses proxy if an environment variable (same as OpenURI.open_uri)
|
916
|
+
# is set.
|
917
|
+
# In addition, +header+ can be set.
|
918
|
+
# (Default Content-Type is application/octet-stream.
|
919
|
+
# Content-Length is automatically set by default.)
|
920
|
+
# +uri+ must be a URI object, +data+ must be a String, and
|
921
|
+
# +header+ must be a hash.
|
922
|
+
#
|
923
|
+
# ---
|
924
|
+
# *Arguments*:
|
925
|
+
# * (required) _uri_: URI object or String
|
926
|
+
# * (optional) _data_: String containing data
|
927
|
+
# * (optional) _header_: Hash containing header strings
|
928
|
+
# *Returns*:: (same as Net::HTTP::post)
|
929
|
+
def post(uri, data, header = {})
|
930
|
+
unless uri.is_a?(URI)
|
931
|
+
uri = URI.parse(uri)
|
932
|
+
end
|
933
|
+
|
934
|
+
hash = {
|
935
|
+
'Content-Type' => 'application/octet-stream',
|
936
|
+
'Content-Length' => data.length.to_s
|
937
|
+
}
|
938
|
+
hash.update(header)
|
939
|
+
|
940
|
+
start_http(uri.host, uri.port) do |http|
|
941
|
+
http.post(uri.path, data, hash)
|
942
|
+
end
|
943
|
+
end
|
944
|
+
|
885
945
|
end # module Command
|
886
946
|
end # module Bio
|
887
947
|
|
data/lib/bio/data/aa.rb
CHANGED
@@ -108,23 +108,21 @@ class AminoAcid
|
|
108
108
|
}
|
109
109
|
|
110
110
|
def weight(x = nil)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
111
|
+
return WEIGHT unless x
|
112
|
+
|
113
|
+
if x.length > 1
|
114
|
+
total = 0.0
|
115
|
+
x.each_byte do |byte|
|
116
|
+
aa = byte.chr.upcase
|
117
|
+
if WEIGHT[aa]
|
118
|
+
total += WEIGHT[aa]
|
119
|
+
else
|
120
|
+
raise "Error: invalid amino acid '#{aa}'"
|
121
121
|
end
|
122
|
-
total -= NucleicAcid.weight[:water] * (x.length - 1)
|
123
|
-
else
|
124
|
-
WEIGHT[x]
|
125
122
|
end
|
123
|
+
total -= NucleicAcid.weight[:water] * (x.length - 1)
|
126
124
|
else
|
127
|
-
WEIGHT
|
125
|
+
WEIGHT[x]
|
128
126
|
end
|
129
127
|
end
|
130
128
|
|
@@ -237,11 +235,7 @@ class AminoAcid
|
|
237
235
|
|
238
236
|
|
239
237
|
def reverse
|
240
|
-
|
241
|
-
NAMES.each do |k, v|
|
242
|
-
hash[v] = k
|
243
|
-
end
|
244
|
-
hash
|
238
|
+
@reverse ||= NAMES.invert
|
245
239
|
end
|
246
240
|
|
247
241
|
end
|
@@ -254,18 +248,6 @@ class AminoAcid
|
|
254
248
|
extend Data
|
255
249
|
|
256
250
|
|
257
|
-
private
|
258
|
-
|
259
|
-
|
260
|
-
# override when used as an instance method to improve performance
|
261
|
-
alias orig_reverse reverse
|
262
|
-
def reverse
|
263
|
-
unless @reverse
|
264
|
-
@reverse = orig_reverse
|
265
|
-
end
|
266
|
-
@reverse
|
267
|
-
end
|
268
|
-
|
269
251
|
end
|
270
252
|
|
271
253
|
end # module Bio
|
data/lib/bio/data/codontable.rb
CHANGED
@@ -5,7 +5,6 @@
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id:$
|
9
8
|
#
|
10
9
|
# == Data source
|
11
10
|
#
|
@@ -127,7 +126,7 @@ class CodonTable
|
|
127
126
|
# table.revtrans("A") # => ["gcg", "gct", "gca", "gcc"]
|
128
127
|
#
|
129
128
|
def revtrans(aa)
|
130
|
-
unless @reverse
|
129
|
+
unless (defined? @reverse) && @reverse
|
131
130
|
@reverse = {}
|
132
131
|
@table.each do |k, v|
|
133
132
|
@reverse[v] ||= []
|
@@ -371,7 +371,7 @@ module Bio
|
|
371
371
|
#probably would be better to d a class refrence to collect these informations
|
372
372
|
@entry.bioentry_references.collect do |bio_ref|
|
373
373
|
hash = Hash.new
|
374
|
-
hash['authors'] = bio_ref.reference.authors.gsub(/\.\s/, "\.\s\|").split(/\|/)
|
374
|
+
hash['authors'] = bio_ref.reference.authors.gsub(/\.\s/, "\.\s\|").split(/\|/) if (bio_ref.reference and bio_ref.reference.authors)
|
375
375
|
|
376
376
|
hash['sequence_position'] = "#{bio_ref.start_pos}-#{bio_ref.end_pos}" if (bio_ref.start_pos and bio_ref.end_pos)
|
377
377
|
hash['title'] = bio_ref.reference.title
|
data/lib/bio/db/embl/common.rb
CHANGED
@@ -149,7 +149,7 @@ module Common
|
|
149
149
|
unless @data['OS']
|
150
150
|
os = Array.new
|
151
151
|
fetch('OS').split(/, and|, /).each do |tmp|
|
152
|
-
if tmp =~ /([A-Z][a-z]* *[\w
|
152
|
+
if tmp =~ /([A-Z][a-z]* *[\w \:\'\+\-]+\w)/
|
153
153
|
org = $1
|
154
154
|
tmp =~ /(\(.+\))/
|
155
155
|
os.push({'name' => $1, 'os' => org})
|
data/lib/bio/db/embl/embl.rb
CHANGED
@@ -267,9 +267,10 @@ class EMBL < EMBLDB
|
|
267
267
|
unless @data['OS']
|
268
268
|
os = Array.new
|
269
269
|
tmp = fetch('OS')
|
270
|
-
if /([A-Z][a-z]* *[\w
|
270
|
+
if /([A-Z][a-z]* *[\w \:\'\+\-]+\w) *\(([\w ]+)\)\s*\z/ =~ tmp
|
271
271
|
org = $1
|
272
|
-
|
272
|
+
name = $2
|
273
|
+
os.push({'name' => name, 'os' => org})
|
273
274
|
else
|
274
275
|
os.push({'name' => nil, 'os' => tmp})
|
275
276
|
end
|
@@ -340,7 +341,7 @@ class EMBL < EMBLDB
|
|
340
341
|
@orig['FT'].each_line do |line|
|
341
342
|
next if line =~ /^FEATURES/
|
342
343
|
|
343
|
-
head = line[0,20].strip # feature key (source, CDS, ...)
|
344
|
+
#head = line[0,20].strip # feature key (source, CDS, ...)
|
344
345
|
body = line[20,60].chomp # feature value (position, /qualifier=)
|
345
346
|
if line =~ /^FT {3}(\S+)/
|
346
347
|
ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
|
@@ -491,7 +492,7 @@ class EMBL < EMBLDB
|
|
491
492
|
def parse_release_version(str)
|
492
493
|
return [ nil, nil ] unless str
|
493
494
|
a = str.split(/[\(\,\)]/)
|
494
|
-
|
495
|
+
a.shift #date string e.g. "14-OCT-2006"
|
495
496
|
rel = nil
|
496
497
|
ver = nil
|
497
498
|
a.each do |x|
|
@@ -126,9 +126,9 @@ module Bio::Sequence::Format::NucFormatter
|
|
126
126
|
def mol_type_embl
|
127
127
|
if mt = molecule_type then
|
128
128
|
mt
|
129
|
-
elsif
|
130
|
-
|
131
|
-
|
129
|
+
elsif fe = (features or []).find { |f| f.feature == 'source' } and
|
130
|
+
qu = fe.qualifiers.find { |q| q.qualifier == 'mol_type' } then
|
131
|
+
qu.value
|
132
132
|
else
|
133
133
|
'NA'
|
134
134
|
end
|
data/lib/bio/db/embl/sptr.rb
CHANGED
@@ -1,1455 +1,20 @@
|
|
1
1
|
#
|
2
|
-
# = bio/db/embl/sptr.rb -
|
2
|
+
# = bio/db/embl/sptr.rb - Bio::SPTR is an alias of Bio::UniProtKB
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C)
|
4
|
+
# Copyright:: Copyright (C) 2013 BioRuby Project
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id:$
|
8
|
-
#
|
9
|
-
# == Description
|
10
|
-
#
|
11
|
-
# Shared methods for UniProtKB/SwissProt and TrEMBL classes.
|
12
|
-
#
|
13
|
-
# See the SWISS-PROT document file SPECLIST.TXT or UniProtKB/SwissProt
|
14
|
-
# user manual.
|
15
|
-
#
|
16
|
-
# == Examples
|
17
|
-
#
|
18
|
-
# str = File.read("p53_human.swiss")
|
19
|
-
# obj = Bio::SPTR.new(str)
|
20
|
-
# obj.entry_id #=> "P53_HUMAN"
|
21
|
-
#
|
22
|
-
# == References
|
23
|
-
#
|
24
|
-
# * Swiss-Prot Protein knowledgebase. TrEMBL Computer-annotated supplement
|
25
|
-
# to Swiss-Prot
|
26
|
-
# http://au.expasy.org/sprot/
|
27
|
-
#
|
28
|
-
# * UniProt
|
29
|
-
# http://uniprot.org/
|
30
|
-
#
|
31
|
-
# * The UniProtKB/SwissProt/TrEMBL User Manual
|
32
|
-
# http://www.expasy.org/sprot/userman.html
|
33
|
-
#
|
34
|
-
|
35
7
|
|
36
|
-
|
37
|
-
require 'bio/db/embl/common'
|
8
|
+
warn "Bio::SPTR is changed to an alias of Bio::UniProtKB. Please use Bio::UniProtKB. Bio::SPTR may be deprecated in the future." if $VERBOSE
|
38
9
|
|
39
10
|
module Bio
|
40
11
|
|
41
|
-
|
42
|
-
class SPTR < EMBLDB
|
43
|
-
include Bio::EMBLDB::Common
|
44
|
-
|
45
|
-
@@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
|
46
|
-
@@data_class = ["STANDARD", "PRELIMINARY"]
|
47
|
-
|
48
|
-
# returns a Hash of the ID line.
|
49
|
-
#
|
50
|
-
# returns a content (Int or String) of the ID line by a given key.
|
51
|
-
# Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MODECULE_TYPE', 'SEQUENCE_LENGTH']
|
52
|
-
#
|
53
|
-
# === ID Line (since UniProtKB release 9.0 of 31-Oct-2006)
|
54
|
-
# ID P53_HUMAN Reviewed; 393 AA.
|
55
|
-
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
|
56
|
-
#
|
57
|
-
# === Examples
|
58
|
-
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed",
|
59
|
-
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}
|
60
|
-
#
|
61
|
-
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
62
|
-
#
|
63
|
-
#
|
64
|
-
# === ID Line (older style)
|
65
|
-
# ID P53_HUMAN STANDARD; PRT; 393 AA.
|
66
|
-
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
|
67
|
-
#
|
68
|
-
# === Examples
|
69
|
-
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD",
|
70
|
-
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}
|
71
|
-
#
|
72
|
-
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
73
|
-
#
|
74
|
-
def id_line(key = nil)
|
75
|
-
return id_line[key] if key
|
76
|
-
return @data['ID'] if @data['ID']
|
77
|
-
|
78
|
-
part = @orig['ID'].split(/ +/)
|
79
|
-
if part[4].to_s.chomp == 'AA.' then
|
80
|
-
# after UniProtKB release 9.0 of 31-Oct-2006
|
81
|
-
# (http://www.uniprot.org/docs/sp_news.htm)
|
82
|
-
molecule_type = nil
|
83
|
-
sequence_length = part[3].to_i
|
84
|
-
else
|
85
|
-
molecule_type = part[3].sub(/;/,'')
|
86
|
-
sequence_length = part[4].to_i
|
87
|
-
end
|
88
|
-
@data['ID'] = {
|
89
|
-
'ENTRY_NAME' => part[1],
|
90
|
-
'DATA_CLASS' => part[2].sub(/;/,''),
|
91
|
-
'MOLECULE_TYPE' => molecule_type,
|
92
|
-
'SEQUENCE_LENGTH' => sequence_length
|
93
|
-
}
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
# returns a ENTRY_NAME in the ID line.
|
98
|
-
#
|
99
|
-
def entry_id
|
100
|
-
id_line('ENTRY_NAME')
|
101
|
-
end
|
102
|
-
alias entry_name entry_id
|
103
|
-
alias entry entry_id
|
104
|
-
|
105
|
-
|
106
|
-
# returns a MOLECULE_TYPE in the ID line.
|
107
|
-
#
|
108
|
-
# A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
|
109
|
-
def molecule
|
110
|
-
id_line('MOLECULE_TYPE')
|
111
|
-
end
|
112
|
-
alias molecule_type molecule
|
113
|
-
|
114
|
-
|
115
|
-
# returns a SEQUENCE_LENGTH in the ID line.
|
116
|
-
#
|
117
|
-
# A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGHT').
|
118
|
-
def sequence_length
|
119
|
-
id_line('SEQUENCE_LENGTH')
|
120
|
-
end
|
121
|
-
alias aalen sequence_length
|
122
|
-
|
123
|
-
|
124
|
-
# Bio::EMBLDB::Common#ac -> ary
|
125
|
-
# #accessions -> ary
|
126
|
-
# #accession -> String (accessions.first)
|
127
|
-
@@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
# returns a Hash of information in the DT lines.
|
132
|
-
# hash keys:
|
133
|
-
# ['created', 'sequence', 'annotation']
|
134
|
-
#--
|
135
|
-
# also Symbols acceptable (ASAP):
|
136
|
-
# [:created, :sequence, :annotation]
|
137
|
-
#++
|
138
|
-
#
|
139
|
-
# Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is
|
140
|
-
# changed, and the word "annotation" is no longer used in DT lines.
|
141
|
-
# Despite the change, the word "annotation" is still used for keeping
|
142
|
-
# compatibility.
|
143
|
-
#
|
144
|
-
# returns a String of information in the DT lines by a given key.
|
145
|
-
#
|
146
|
-
# === DT Line; date (3/entry)
|
147
|
-
# DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.)
|
148
|
-
# DT DD-MMM-YYY (sequence version NN)
|
149
|
-
# DT DD-MMM-YYY (entry version NN)
|
150
|
-
#
|
151
|
-
# The format have been changed in UniProtKB release 7.0 of 07-Feb-2006.
|
152
|
-
# Below is the older format.
|
153
|
-
#
|
154
|
-
# === Old format of DT Line; date (3/entry)
|
155
|
-
# DT DD-MMM-YYY (rel. NN, Created)
|
156
|
-
# DT DD-MMM-YYY (rel. NN, Last sequence update)
|
157
|
-
# DT DD-MMM-YYY (rel. NN, Last annotation update)
|
158
|
-
def dt(key = nil)
|
159
|
-
return dt[key] if key
|
160
|
-
return @data['DT'] if @data['DT']
|
161
|
-
|
162
|
-
part = self.get('DT').split(/\n/)
|
163
|
-
@data['DT'] = {
|
164
|
-
'created' => part[0].sub(/\w{2} /,'').strip,
|
165
|
-
'sequence' => part[1].sub(/\w{2} /,'').strip,
|
166
|
-
'annotation' => part[2].sub(/\w{2} /,'').strip
|
167
|
-
}
|
168
|
-
end
|
169
|
-
|
170
|
-
|
171
|
-
# (private) parses DE line (description lines)
|
172
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
173
|
-
#
|
174
|
-
# Return array containing array.
|
175
|
-
#
|
176
|
-
# http://www.uniprot.org/docs/sp_news.htm
|
177
|
-
def parse_DE_line_rel14(str)
|
178
|
-
# Retruns if it is not the new format since Rel.14
|
179
|
-
return nil unless /^DE (RecName|AltName|SubName)\: / =~ str
|
180
|
-
ret = []
|
181
|
-
cur = nil
|
182
|
-
str.each_line do |line|
|
183
|
-
case line
|
184
|
-
when /^DE (Includes|Contains)\: *$/
|
185
|
-
cur = [ $1 ]
|
186
|
-
ret.push cur
|
187
|
-
cur = nil
|
188
|
-
#subcat_and_desc = nil
|
189
|
-
next
|
190
|
-
when /^DE *(RecName|AltName|SubName)\: +(.*)/
|
191
|
-
category = $1
|
192
|
-
subcat_and_desc = $2
|
193
|
-
cur = [ category ]
|
194
|
-
ret.push cur
|
195
|
-
when /^DE *(Flags)\: +(.*)/
|
196
|
-
category = $1
|
197
|
-
desc = $2
|
198
|
-
flags = desc.strip.split(/\s*\;\s*/) || []
|
199
|
-
cur = [ category, flags ]
|
200
|
-
ret.push cur
|
201
|
-
cur = nil
|
202
|
-
#subcat_and_desc = nil
|
203
|
-
next
|
204
|
-
when /^DE *(.*)/
|
205
|
-
subcat_and_desc = $1
|
206
|
-
else
|
207
|
-
warn "Warning: skipped DE line in unknown format: #{line.inspect}"
|
208
|
-
#subcat_and_desc = nil
|
209
|
-
next
|
210
|
-
end
|
211
|
-
case subcat_and_desc
|
212
|
-
when nil
|
213
|
-
# does nothing
|
214
|
-
when /\A([^\=]+)\=(.*)/
|
215
|
-
subcat = $1
|
216
|
-
desc = $2
|
217
|
-
desc.sub!(/\;\s*\z/, '')
|
218
|
-
unless cur
|
219
|
-
warn "Warning: unknown category in DE line: #{line.inspect}"
|
220
|
-
cur = [ '' ]
|
221
|
-
ret.push cur
|
222
|
-
end
|
223
|
-
cur.push [ subcat, desc ]
|
224
|
-
else
|
225
|
-
warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
|
226
|
-
end
|
227
|
-
end
|
228
|
-
ret
|
229
|
-
end
|
230
|
-
private :parse_DE_line_rel14
|
231
|
-
|
232
|
-
# returns the proposed official name of the protein.
|
233
|
-
# Returns a String.
|
234
|
-
#
|
235
|
-
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
|
236
|
-
# been changed. The method returns the full name which is taken from
|
237
|
-
# "RecName: Full=" or "SubName: Full=" line normally in the beginning of
|
238
|
-
# the DE lines.
|
239
|
-
# Unlike parser for old format, no special treatments for fragment or
|
240
|
-
# precursor.
|
241
|
-
#
|
242
|
-
# For old format, the method parses the DE lines and returns the protein
|
243
|
-
# name as a String.
|
244
|
-
#
|
245
|
-
# === DE Line; description (>=1)
|
246
|
-
# "DE #{OFFICIAL_NAME} (#{SYNONYM})"
|
247
|
-
# "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]."
|
248
|
-
# OFFICIAL_NAME 1/entry
|
249
|
-
# SYNONYM >=0
|
250
|
-
# CONTEINS >=0
|
251
|
-
def protein_name
|
252
|
-
@data['DE'] ||= parse_DE_line_rel14(get('DE'))
|
253
|
-
parsed_de_line = @data['DE']
|
254
|
-
if parsed_de_line then
|
255
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
256
|
-
name = nil
|
257
|
-
parsed_de_line.each do |a|
|
258
|
-
case a[0]
|
259
|
-
when 'RecName', 'SubName'
|
260
|
-
if name_pair = a[1..-1].find { |b| b[0] == 'Full' } then
|
261
|
-
name = name_pair[1]
|
262
|
-
break
|
263
|
-
end
|
264
|
-
end
|
265
|
-
end
|
266
|
-
name = name.to_s
|
267
|
-
else
|
268
|
-
# old format (before Rel. 13.x)
|
269
|
-
name = ""
|
270
|
-
if de_line = fetch('DE') then
|
271
|
-
str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
|
272
|
-
name = str[/^[^(]*/].strip
|
273
|
-
name << ' (Fragment)' if str =~ /fragment/i
|
274
|
-
end
|
275
|
-
end
|
276
|
-
return name
|
277
|
-
end
|
278
|
-
|
279
|
-
|
280
|
-
# returns synonyms (unofficial and/or alternative names).
|
281
|
-
# Returns an Array containing String objects.
|
282
|
-
#
|
283
|
-
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
|
284
|
-
# been changed. The method returns the full or short names which are
|
285
|
-
# taken from "RecName: Short=", "RecName: EC=", and AltName lines,
|
286
|
-
# except after "Contains:" or "Includes:".
|
287
|
-
# For keeping compatibility with old format parser, "RecName: EC=N.N.N.N"
|
288
|
-
# is reported as "EC N.N.N.N".
|
289
|
-
# In addition, to prevent confusion, "Allergen=" and "CD_antigen="
|
290
|
-
# prefixes are added for the corresponding fields.
|
291
|
-
#
|
292
|
-
# For old format, the method parses the DE lines and returns synonyms.
|
293
|
-
# synonyms are each placed in () following the official name on the DE line.
|
294
|
-
def synonyms
|
295
|
-
ary = Array.new
|
296
|
-
@data['DE'] ||= parse_DE_line_rel14(get('DE'))
|
297
|
-
parsed_de_line = @data['DE']
|
298
|
-
if parsed_de_line then
|
299
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
300
|
-
parsed_de_line.each do |a|
|
301
|
-
case a[0]
|
302
|
-
when 'Includes', 'Contains'
|
303
|
-
break #the each loop
|
304
|
-
when 'RecName', 'SubName', 'AltName'
|
305
|
-
a[1..-1].each do |b|
|
306
|
-
if name = b[1] and b[1] != self.protein_name then
|
307
|
-
case b[0]
|
308
|
-
when 'EC'
|
309
|
-
name = "EC " + b[1]
|
310
|
-
when 'Allergen', 'CD_antigen'
|
311
|
-
name = b[0] + '=' + b[1]
|
312
|
-
else
|
313
|
-
name = b[1]
|
314
|
-
end
|
315
|
-
ary.push name
|
316
|
-
end
|
317
|
-
end
|
318
|
-
end #case a[0]
|
319
|
-
end #parsed_de_line.each
|
320
|
-
else
|
321
|
-
# old format (before Rel. 13.x)
|
322
|
-
if de_line = fetch('DE') then
|
323
|
-
line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part
|
324
|
-
line.scan(/\([^)]+/) do |synonym|
|
325
|
-
unless synonym =~ /fragment/i then
|
326
|
-
ary << synonym[1..-1].strip # index to remove the leading (
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
|
-
return ary
|
332
|
-
end
|
333
|
-
|
334
|
-
|
335
|
-
# returns gene names in the GN line.
|
336
|
-
#
|
337
|
-
# New UniProt/SwissProt format:
|
338
|
-
# * Bio::SPTR#gn -> [ <gene record>* ]
|
339
|
-
# where <gene record> is:
|
340
|
-
# { :name => '...',
|
341
|
-
# :synonyms => [ 's1', 's2', ... ],
|
342
|
-
# :loci => [ 'l1', 'l2', ... ],
|
343
|
-
# :orfs => [ 'o1', 'o2', ... ]
|
344
|
-
# }
|
345
|
-
#
|
346
|
-
# Old format:
|
347
|
-
# * Bio::SPTR#gn -> Array # AND
|
348
|
-
# * Bio::SPTR#gn[0] -> Array # OR
|
349
|
-
#
|
350
|
-
# === GN Line: Gene name(s) (>=0, optional)
|
351
|
-
def gn
|
352
|
-
unless @data['GN']
|
353
|
-
case fetch('GN')
|
354
|
-
when /Name=/,/ORFNames=/,/OrderedLocusNames=/,/Synonyms=/
|
355
|
-
@data['GN'] = gn_uniprot_parser
|
356
|
-
else
|
357
|
-
@data['GN'] = gn_old_parser
|
358
|
-
end
|
359
|
-
end
|
360
|
-
@data['GN']
|
361
|
-
end
|
362
|
-
|
363
|
-
|
364
|
-
# returns contents in the old style GN line.
|
365
|
-
# === GN Line: Gene name(s) (>=0, optional)
|
366
|
-
# GN HNS OR DRDX OR OSMZ OR BGLY.
|
367
|
-
# GN CECA1 AND CECA2.
|
368
|
-
# GN CECA1 AND (HOGE OR FUGA).
|
369
|
-
#
|
370
|
-
# GN NAME1 [(AND|OR) NAME]+.
|
371
|
-
#
|
372
|
-
# Bio::SPTR#gn -> Array # AND
|
373
|
-
# #gn[0] -> Array # OR
|
374
|
-
# #gene_names -> Array
|
375
|
-
def gn_old_parser
|
376
|
-
names = Array.new
|
377
|
-
if get('GN').size > 0
|
378
|
-
names = fetch('GN').sub(/\.$/,'').split(/ AND /)
|
379
|
-
names.map! { |synonyms|
|
380
|
-
synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
|
381
|
-
e.strip
|
382
|
-
}
|
383
|
-
}
|
384
|
-
end
|
385
|
-
@data['GN'] = names
|
386
|
-
end
|
387
|
-
private :gn_old_parser
|
388
|
-
|
389
|
-
# returns contents in the structured GN line.
|
390
|
-
# The new format of the GN line is:
|
391
|
-
# GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
|
392
|
-
# GN ORFNames=[, ...];
|
393
|
-
#
|
394
|
-
# * Bio::SPTR#gn -> [ <gene record>* ]
|
395
|
-
# where <gene record> is:
|
396
|
-
# { :name => '...',
|
397
|
-
# :synonyms => [ 's1', 's2', ... ],
|
398
|
-
# :loci => [ 'l1', 'l2', ... ],
|
399
|
-
# :orfs => [ 'o1', 'o2', ... ]
|
400
|
-
# }
|
401
|
-
def gn_uniprot_parser
|
402
|
-
@data['GN'] = Array.new
|
403
|
-
gn_line = fetch('GN').strip
|
404
|
-
records = gn_line.split(/\s*and\s*/)
|
405
|
-
records.each do |record|
|
406
|
-
gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
|
407
|
-
record.each_line(';') do |element|
|
408
|
-
case element
|
409
|
-
when /Name=/ then
|
410
|
-
gene_hash[:name] = $'[0..-2]
|
411
|
-
when /Synonyms=/ then
|
412
|
-
gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
|
413
|
-
when /OrderedLocusNames=/ then
|
414
|
-
gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
|
415
|
-
when /ORFNames=/ then
|
416
|
-
gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
|
417
|
-
end
|
418
|
-
end
|
419
|
-
@data['GN'] << gene_hash
|
420
|
-
end
|
421
|
-
return @data['GN']
|
422
|
-
end
|
423
|
-
private :gn_uniprot_parser
|
424
|
-
|
425
|
-
|
426
|
-
# returns a Array of gene names in the GN line.
|
427
|
-
def gene_names
|
428
|
-
gn # set @data['GN'] if it hasn't been already done
|
429
|
-
if @data['GN'].first.class == Hash then
|
430
|
-
@data['GN'].collect { |element| element[:name] }
|
431
|
-
else
|
432
|
-
@data['GN'].first
|
433
|
-
end
|
434
|
-
end
|
435
|
-
|
436
|
-
|
437
|
-
# returns a String of the first gene name in the GN line.
|
438
|
-
def gene_name
|
439
|
-
gene_names.first
|
440
|
-
end
|
441
|
-
|
442
|
-
|
443
|
-
# returns a Array of Hashs or a String of the OS line when a key given.
|
444
|
-
# * Bio::EMBLDB#os -> Array
|
445
|
-
# [{'name' => '(Human)', 'os' => 'Homo sapiens'},
|
446
|
-
# {'name' => '(Rat)', 'os' => 'Rattus norveticus'}]
|
447
|
-
# * Bio::EPTR#os[0] -> Hash
|
448
|
-
# {'name' => "(Human)", 'os' => 'Homo sapiens'}
|
449
|
-
# * Bio::SPTR#os[0]['name'] -> "(Human)"
|
450
|
-
# * Bio::EPTR#os(0) -> "Homo sapiens (Human)"
|
451
|
-
#
|
452
|
-
# === OS Line; organism species (>=1)
|
453
|
-
# OS Genus species (name).
|
454
|
-
# OS Genus species (name0) (name1).
|
455
|
-
# OS Genus species (name0) (name1).
|
456
|
-
# OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
|
457
|
-
# OS Homo sapiens (Human), and Rarrus norveticus (Rat)
|
458
|
-
# OS Hippotis sp. Clark and Watts 825.
|
459
|
-
# OS unknown cyperaceous sp.
|
460
|
-
def os(num = nil)
|
461
|
-
unless @data['OS']
|
462
|
-
os = Array.new
|
463
|
-
fetch('OS').split(/, and|, /).each do |tmp|
|
464
|
-
if tmp =~ /(\w+ *[\w\d \:\'\+\-\.]+[\w\d\.])/
|
465
|
-
org = $1
|
466
|
-
tmp =~ /(\(.+\))/
|
467
|
-
os.push({'name' => $1, 'os' => org})
|
468
|
-
else
|
469
|
-
raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
|
470
|
-
end
|
471
|
-
end
|
472
|
-
@data['OS'] = os
|
473
|
-
end
|
474
|
-
|
475
|
-
if num
|
476
|
-
# EX. "Trifolium repens (white clover)"
|
477
|
-
return "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
|
478
|
-
else
|
479
|
-
return @data['OS']
|
480
|
-
end
|
481
|
-
end
|
482
|
-
|
483
|
-
|
484
|
-
# Bio::EMBLDB::Common#og -> Array
|
485
|
-
# OG Line; organella (0 or 1/entry)
|
486
|
-
# ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
|
487
|
-
# or a plasmid name (e.g. "Plasmid pBR322").
|
488
|
-
|
489
|
-
|
490
|
-
# Bio::EMBLDB::Common#oc -> Array
|
491
|
-
# OC Line; organism classification (>=1)
|
492
|
-
# "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
|
493
|
-
# "OC Theileria."
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
# returns a Hash of oraganism taxonomy cross-references.
|
498
|
-
# * Bio::SPTR#ox -> Hash
|
499
|
-
# {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
|
500
|
-
#
|
501
|
-
# === OX Line; organism taxonomy cross-reference (>=1 per entry)
|
502
|
-
# OX NCBI_TaxID=1234;
|
503
|
-
# OX NCBI_TaxID=1234, 2345, 3456, 4567;
|
504
|
-
# Returns the taxonomy cross-references of the OX line as a Hash that maps
# a database name to an Array of identifiers, e.g.
#   {'NCBI_TaxID' => ['1234', '2345']}
# The result is cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  xrefs = {}
  fetch('OX').sub(/\.$/,'').split(/;/).each do |field|
    db, ids = field.strip.split(/=/)
    xrefs[db] = ids.split(/, */)
  end
  @data['OX'] = xrefs
end
|
516
|
-
|
517
|
-
# === The OH Line;
|
518
|
-
#
|
519
|
-
# OH NCBI_TaxID=TaxID; HostName.
|
520
|
-
# http://br.expasy.org/sprot/userman.html#OH_line
|
521
|
-
# Returns the organism hosts of the OH line as an Array of
#   {'NCBI_TaxID' => String, 'HostName' => String or nil}
# The result is cached in @data['OH'].
#
# Raises ArgumentError when an item carries no "NCBI_TaxID=<digits>;".
def oh
  return @data['OH'] if @data['OH']

  # Items are separated by ". ", but a host name may itself contain ". "
  # (e.g. "Felis sp. domestic"), so only break in front of the next TaxID.
  hosts = fetch('OH').split(/\. (?=NCBI_TaxID=)/).map do |item|
    unless item =~ /NCBI_TaxID=(\d+);/
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end
    taxid = $1
    host_name = item[/NCBI_TaxID=\d+; (.+)/, 1]
    host_name = host_name.sub(/\.$/, '') if host_name
    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  end
  @data['OH'] = hosts
end
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
# Bio::EMBLDB::Common#ref -> Array
|
546
|
-
# R Lines
|
547
|
-
# RN RC RP RX RA RT RL
|
548
|
-
|
549
|
-
# returns contents in the R lines.
|
550
|
-
# * Bio::EMBLDB::Common#ref -> [ <reference information Hash>* ]
|
551
|
-
# where <reference information Hash> is:
|
552
|
-
# {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
|
553
|
-
# 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
|
554
|
-
#
|
555
|
-
# R Lines
|
556
|
-
# * RN RC RP RX RA RT RL RG
|
557
|
-
# Returns the contents of the R lines as an Array with one Hash per
# reference, keyed by 'RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL' and 'RG'.
# The text of each line code is concatenated (one blank appended per line)
# and then handed to the matching set_<code> method.  The result is cached
# in @data['R'].
#
# Any number of blanks may follow the two-letter line code, so column
# padding does not leak into the field values.
#
# Raises RuntimeError for a line that does not start with a known R code.
def ref
  return @data['R'] if @data['R']

  @data['R'] = get('R').split(/\nRN +/).map do |chunk|
    # split consumed the "RN" of every reference but the first one.
    chunk = 'RN ' + chunk unless chunk =~ /^RN /
    fields = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
              'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}

    chunk.split("\n").each do |line|
      unless line =~ /^(R[NPXARLCTG]) +(.+)/
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      fields[$1] += $2 + ' '
    end

    %w(RN RC RP RX RA RT RL RG).each do |code|
      fields[code] = send("set_#{code}", fields[code])
    end
    fields
  end
end
|
587
|
-
|
588
|
-
# Normalises the accumulated RN text by removing surrounding whitespace.
def set_RN(data)
  data.strip
end
|
591
|
-
|
592
|
-
# Parses the accumulated RC text ("TOKEN=text[, text][, and text]; ...")
# into a flat Array of {'Token' => String, 'Text' => String}, one element
# per text item.  Only tokens starting with S, T or P are recognised.
#
# The value is matched non-greedily so that several "TOKEN=...;" pairs in
# the same reference stay separate entries.
def set_RC(data)
  data.scan(/([STP]\w+)=(.+?);/).map { |token, texts|
    texts.split(/, and |, /).map { |text|
      {'Token' => token, 'Text' => text}
    }
  }.flatten
end
private :set_RC
|
600
|
-
|
601
|
-
# Splits the accumulated RP text into an Array of position descriptions.
# The trailing period is dropped, items are separated on ", AND " or ", "
# (case-insensitive), and every item is stripped.
#
# Runs of blanks inside an item are collapsed to a single blank; the
# previous gsub(' ', ' ') replaced a blank with a blank and had no effect.
def set_RP(data)
  data.strip.sub(/\.$/, '').split(/, AND |, /i).map { |position|
    position.strip.squeeze(' ')
  }
end
private :set_RP
|
610
|
-
|
611
|
-
# Extracts the cross-references from the accumulated RX text.  Returns a
# Hash with the keys 'MEDLINE', 'PubMed' and 'DOI'; a key whose
# "<name>=<value>;" item is absent maps to nil.
def set_RX(data)
  xrefs = {}
  %w(MEDLINE PubMed DOI).each do |db|
    xrefs[db] = (data =~ /#{db}=(.+?);/) ? $1 : nil
  end
  xrefs
end
private :set_RX
|
625
|
-
|
626
|
-
# Removes the terminating ";" (and any blanks after it) from the
# accumulated RA text.
def set_RA(data)
  data.sub(/; *$/, '')
end
private :set_RA
|
630
|
-
|
631
|
-
# Cleans the accumulated RT text: drops the terminating ";" together with
# trailing blanks, then removes a double quote at the start or end of a line.
def set_RT(data)
  data.sub(/; *$/, '').gsub(/(^"|"$)/, '')
end
private :set_RT
|
636
|
-
|
637
|
-
# Normalises the accumulated RL text by removing surrounding whitespace.
def set_RL(data)
  data.strip
end
private :set_RL
|
641
|
-
|
642
|
-
# Splits the accumulated RG text on "; " and returns the Array of items.
def set_RG(data)
  data.split('; ')
end
private :set_RG
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
# returns Bio::Reference object from Bio::EMBLDB::Common#ref.
|
650
|
-
# * Bio::EMBLDB::Common#ref -> Bio::References
|
651
|
-
# Converts the Hashes returned by #ref into Reference objects and wraps
# them in a References object, cached in @data['references'].
#
# For each reference the attribute Hash (default value '') receives:
# * 'authors' - RA split on ", "
# * 'title'   - RT
# * 'journal', 'volume', 'issue', 'pages', 'year' - taken from RL when it
#   has the form "<journal> <vol> (<issue>), <from>-<to> (<year>)";
#   otherwise only 'journal' is set, to the whole RL text
# * one entry per RX cross-reference, keyed by the lower-cased name
def references
  return @data['references'] if @data['references']

  list = self.ref.map do |fields|
    attrs = Hash.new('')
    fields.each do |code, value|
      if code == 'RA'
        attrs['authors'] = value.split(/, /)
      elsif code == 'RT'
        attrs['title'] = value
      elsif code == 'RL'
        location = /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/.match(value)
        if location
          attrs['journal'], attrs['volume'], attrs['issue'],
            attrs['pages'], attrs['year'] = location.captures
        else
          attrs['journal'] = value
        end
      elsif code == 'RX' # PUBMED, MEDLINE, DOI
        value.each { |db, xref| attrs[db.downcase] = xref }
      end
    end
    Reference.new(attrs)
  end
  @data['references'] = References.new(list)
end
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
# === The HI line
|
690
|
-
# Bio::SPTR#hi #=> hash
|
691
|
-
# Returns the contents of the HI line as an Array of
#   {'Category' => String, 'Keywords' => Array, 'Keyword' => String}
# Each ". "-separated item has the form "Category: kw1; kw2; ...; kwN":
# the last keyword (without its trailing period) goes to 'Keyword', the
# preceding ones to 'Keywords'.
#
# The result is cached in @data['HI'] only after the whole line has been
# parsed, so a malformed item no longer leaves a truncated list behind
# that later calls would return silently.
def hi
  return @data['HI'] if @data['HI']

  entries = fetch('HI').split(/\. /).map do |hlist|
    category, keywords = hlist.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    {'Category' => category,
     'Keywords' => keywords,
     'Keyword' => keyword.sub(/\.$/, '')}
  end
  @data['HI'] = entries
end
|
705
|
-
|
706
|
-
|
707
|
-
# List of CC topic names.  Not referenced by the methods visible in this
# part of the file.
@@cc_topics = [
  'PHARMACEUTICAL', 'BIOTECHNOLOGY', 'TOXIC DOSE', 'ALLERGEN',
  'RNA EDITING', 'POLYMORPHISM', 'BIOPHYSICOCHEMICAL PROPERTIES',
  'MASS SPECTROMETRY', 'WEB RESOURCE', 'ENZYME REGULATION', 'DISEASE',
  'INTERACTION', 'DEVELOPMENTAL STAGE', 'INDUCTION', 'CAUTION',
  'ALTERNATIVE PRODUCTS', 'DOMAIN', 'PTM', 'MISCELLANEOUS',
  'TISSUE SPECIFICITY', 'COFACTOR', 'PATHWAY', 'SUBUNIT',
  'CATALYTIC ACTIVITY', 'SUBCELLULAR LOCATION', 'FUNCTION', 'SIMILARITY'
]
|
734
|
-
# returns contents in the CC lines.
|
735
|
-
# * Bio::SPTR#cc -> Hash
|
736
|
-
#
|
737
|
-
# returns an object of contents in the TOPIC.
|
738
|
-
# * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
|
739
|
-
#
|
740
|
-
# returns contents of the "ALTERNATIVE PRODUCTS".
|
741
|
-
# * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
|
742
|
-
# {'Event' => str,
|
743
|
-
# 'Named isoforms' => int,
|
744
|
-
# 'Comment' => str,
|
745
|
-
# 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
|
746
|
-
#
|
747
|
-
# CC -!- ALTERNATIVE PRODUCTS:
|
748
|
-
# CC Event=Alternative splicing; Named isoforms=15;
|
749
|
-
# ...
|
750
|
-
# CC placentae isoforms. All tissues differentially splice exon 13;
|
751
|
-
# CC Name=A; Synonyms=no del;
|
752
|
-
# CC IsoId=P15529-1; Sequence=Displayed;
|
753
|
-
#
|
754
|
-
# returns contents of the "DATABASE".
|
755
|
-
# * Bio::SPTR#cc('DATABASE') -> Array
|
756
|
-
# [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
|
757
|
-
#
|
758
|
-
# CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
|
759
|
-
#
|
760
|
-
# returns contents of the "MASS SPECTROMETRY".
|
761
|
-
# * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
|
762
|
-
# [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
|
763
|
-
#
|
764
|
-
# CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
|
765
|
-
#
|
766
|
-
# === CC lines (>=0, optional)
|
767
|
-
# CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
|
768
|
-
# CC IN LIVER, KIDNEY, LUNG AND BRAIN.
|
769
|
-
#
|
770
|
-
# CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
|
771
|
-
# CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
|
772
|
-
#
|
773
|
-
# See also http://www.expasy.org/sprot/userman.html#CC_line
|
774
|
-
#
|
775
|
-
# Returns the contents of the CC lines.
#
# * cc        -> Hash mapping each topic to an Array of comment bodies
# * cc(topic) -> topic specific value:
#   - 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION', 'FUNCTION',
#     'INDUCTION': the bodies joined into one String
#   - 'ALTERNATIVE PRODUCTS', 'BIOPHYSICOCHEMICAL PROPERTIES', 'CAUTION',
#     'INTERACTION', 'MASS SPECTROMETRY', 'PATHWAY', 'RNA EDITING',
#     'SUBCELLULAR LOCATION', 'WEB RESOURCE': result of the matching
#     cc_* helper
#   - 'DATABASE': Array of {'NAME', 'NOTE', 'WWW', 'FTP'} Hashes
#   - any other topic: the raw Array of bodies
#
# A topic that does not occur in the entry yields nil for every topic;
# previously the joined and helper-parsed topics raised NoMethodError.
# When the entry has no CC lines at all (or only the copyright block) an
# empty Hash is returned whatever the topic is, and nothing is cached.
#
# The parsed Hash is cached in @data['CC'].
# Raises RuntimeError when a comment block does not look like "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    cc = {}
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return cc if get('CC').size == 0

    # Removing the copyright statement (without touching the fetched string).
    cc_raw = fetch('CC').sub(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return cc if cc_raw == ''

    begin
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          body = $2
          # Re-join words hyphenated at a line break, except before "AND".
          body.gsub!(/- (?!AND)/,'-')
          body.strip!
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'),''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a NameError, so this clause covers both; e.g. the
      # text in front of the border being nil ends up here.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  return @data['CC'] if topic.nil?

  entries = @data['CC'][topic]
  # Topic not present in this entry.
  return entries unless entries

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(entries)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(entries)
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the topic is misspelt, so 'CATALYTIC ACTIVITY' falls
    # through to the raw Array below.  cc_catalytic_activity is not visible
    # in this part of the file -- confirm it exists before correcting the
    # spelling.
    cc_catalytic_activity(entries)
  when 'CAUTION'
    cc_caution(entries)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    entries.join('')
  when 'INTERACTION'
    cc_interaction(entries)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(entries)
  when 'PATHWAY'
    cc_pathway(entries)
  when 'RNA EDITING'
    cc_rna_editing(entries)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(entries)
  when 'WEB RESOURCE'
    cc_web_resource(entries)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries.map do |entry|
      db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop the final character (the closing period) before splitting.
      entry.sub(/.$/,'').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          db['NAME'] = $1
        when /NOTE=(.+)/
          db['NOTE'] = $1
        when /WWW="(.+)"/
          db['WWW'] = $1
        when /FTP="(.+)"/
          db['FTP'] = $1
        end
      end
      db
    end
  else
    entries
  end
end
|
911
|
-
|
912
|
-
|
913
|
-
# Parses the bodies of the "ALTERNATIVE PRODUCTS" topic into
#   {'Event' => Array, 'Named isoforms' => String, 'Comment' => String,
#    'Variants' => [Hash, ...]}
# 'Event' stays "" when the text has no "Event=...;" item.  Every
# "Name=...Sequence=...;" span is turned into one variant Hash by
# cc_alternative_products_variants.
#
# Returns nil when +data+ is nil.  The guard used to sit after data.join,
# where it could never trigger.
def cc_alternative_products(data)
  return data unless data

  ap = data.join('')

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  result = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
            'Variants' => []}
  if /Event=(.+?);/ =~ ap
    result['Event'] = $1.sub(/;/,'').split(/, /)
  end
  if /Named isoforms=(\S+?);/ =~ ap
    result['Named isoforms'] = $1
  end
  if /Comment=(.+?);/m =~ ap
    result['Comment'] = $1
  end
  ap.scan(/Name=.+?Sequence=.+?;/).each do |variant|
    result['Variants'] << cc_alternative_products_variants(variant)
  end
  result
end
private :cc_alternative_products
|
936
|
-
|
937
|
-
# Turns one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" span into a
# Hash.  The values of 'Sequence', 'Synonyms' and 'IsoId' are split on ", "
# into Arrays (after removing one ";"); every other key keeps its text.
def cc_alternative_products_variants(data)
  list_keys = ['Sequence', 'Synonyms', 'IsoId']
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
  data.split(/; /).each do |pair|
    key, value = pair.split(/=/)
    value = value.sub(/;/,'').split(/, /) if list_keys.include?(key)
    variant[key] = value
  end
  variant
end
private :cc_alternative_products_variants
|
949
|
-
|
950
|
-
|
951
|
-
# Parses the first body of the "BIOPHYSICOCHEMICAL PROPERTIES" topic into
#   {'Absorption' => {'Abs(max)', 'Note'},
#    'Kinetic parameters' => {'KM', 'Vmax', 'Note'},
#    'pH dependence' => String, 'Redox potential' => String,
#    'Temperature dependence' => String}
# Sub-keys are only present when the text contains them.
#
# A Note is only accepted when it directly follows "Abs(max)=...;" or
# "Vmax=...;" respectively.  The former greedy patterns attributed the
# last "Note=" of the whole text to Absorption as well.
def cc_biophysiochemical_properties(data)
  data = data[0]

  hash = {'Absorption' => {},
          'Kinetic parameters' => {},
          'pH dependence' => "",
          'Redox potential' => "",
          'Temperature dependence' => ""}
  if data =~ /Absorption: Abs\(max\)=(.+?);/
    hash['Absorption']['Abs(max)'] = $1
  end
  if data =~ /Absorption: Abs\(max\)=[^;]+; Note=(.+?);/
    hash['Absorption']['Note'] = $1
  end
  if data =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    hash['Kinetic parameters']['KM'] = $1
    hash['Kinetic parameters']['Vmax'] = $2
  end
  if data =~ /Kinetic parameters: KM=.+?; Vmax=[^;]+; Note=(.+?);/
    hash['Kinetic parameters']['Note'] = $1
  end
  if data =~ /pH dependence: (.+?);/
    hash['pH dependence'] = $1
  end
  if data =~ /Redox potential: (.+?);/
    hash['Redox potential'] = $1
  end
  if data =~ /Temperature dependence: (.+?);/
    hash['Temperature dependence'] = $1
  end
  hash
end
private :cc_biophysiochemical_properties
|
984
|
-
|
985
|
-
|
986
|
-
# Concatenates the bodies of the "CAUTION" topic into one String.
def cc_caution(data)
  data.join('')
end
private :cc_caution
|
990
|
-
|
991
|
-
|
992
|
-
# returns the contents of a line of the CC INTERACTION section.
|
993
|
-
#
|
994
|
-
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
995
|
-
# Parses the bodies of the "INTERACTION" topic.  Every
# "<partner>; NbExp=<n>; IntAct=<id>, <id>;" record becomes
#   {'SP_Ac', 'identifier', 'NbExp', 'IntAct' => Array, 'optional_identifier'}
# A partner written "AC:ID [extra]" supplies accession, identifier and the
# optional identifier; a partner containing "Self" uses this entry's
# entry_id for both accession and identifier.
def cc_interaction(data)
  record = /(.+?); NbExp=(.+?); IntAct=(.+?);/
  data.join('').scan(record).map do |fields|
    partner, nb_exp, intact = fields.map { |field| field.strip }

    spac = nil
    spid = nil
    optid = nil
    if partner =~ /^(.+):(.+)/
      spac = $1
      spid = $2.split(' ')[0]
    elsif partner =~ /Self/
      spac = self.entry_id
      spid = self.entry_id
    end
    optid = $1 if partner =~ /^.+:.+ (.+)/

    {'SP_Ac' => spac,
     'identifier' => spid,
     'NbExp' => nb_exp,
     'IntAct' => intact.split(', '),
     'optional_identifier' => optid}
  end
end
private :cc_interaction
|
1021
|
-
|
1022
|
-
|
1023
|
-
# Parses the bodies of the "MASS SPECTROMETRY" topic, one Hash per body:
#   {'MW', 'MW_ERR', 'METHOD', 'RANGE', 'NOTE'}  (nil when absent)
# Returns +data+ unchanged when it is nil.
#
# MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
def cc_mass_spectrometry(data)
  return data unless data

  # Tried in this order; the first pattern matching a field wins.
  patterns = [['MW', /MW=(.+)/],
              ['MW_ERR', /MW_ERR=(.+)/],
              ['METHOD', /METHOD=(.+)/],
              ['RANGE', /RANGE=(\d+-\d+)/], # RANGE class ?
              ['NOTE', /NOTE=(.+)/]]

  data.map do |body|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    # The final character (the closing period) is dropped before splitting.
    body.sub(/.$/,'').split(/;/).each do |field|
      patterns.each do |key, pattern|
        if pattern =~ field
          mass[key] = $1
          break
        end
      end
    end
    mass
  end
end
private :cc_mass_spectrometry
|
1048
|
-
|
1049
|
-
|
1050
|
-
# Splits every body of the "PATHWAY" topic (minus its trailing period) on
# "; ", " and " or ": " and returns the pieces of the first body only.
# Returns nil for an empty Array.
def cc_pathway(data)
  pieces = data.map { |body| body.sub(/\.$/, '').split(/; | and |: /) }
  pieces[0]
end
private :cc_pathway
|
1056
|
-
|
1057
|
-
|
1058
|
-
# Parses the joined bodies of the "RNA EDITING" topic into
#   {'Modified_positions' => Array of String, 'Note' => String}
# 'Note' is "" when the text has no "Note=" part.
#
# Raises ArgumentError when no "Modified_positions=" item is found.
def cc_rna_editing(data)
  text = data.join('')
  unless text =~ /Modified_positions=(.+?)(\.|;)/
    raise ArgumentError, "Invarid CC RNA Editing lines (#{self.entry_id}):#{$!}\n#{get('CC')}"
  end
  positions = $1.sub(/\.$/, '').split(', ')
  note = (text =~ /Note=(.+)/) ? $1 : ""
  {'Modified_positions' => positions, 'Note' => note}
end
private :cc_rna_editing
|
1072
|
-
|
1073
|
-
|
1074
|
-
# Breaks every body of the "SUBCELLULAR LOCATION" topic into sentences
# (split on ". ") and each sentence into terms (split on "; ", trailing
# period removed).  Returns the nested Array for the first body only, or
# nil for an empty Array.
def cc_subcellular_location(data)
  parsed = data.map do |body|
    body.split('. ').map do |sentence|
      sentence.split('; ').map { |term| term.sub(/\.$/, '') }
    end
  end
  parsed[0]
end
private :cc_subcellular_location
|
1084
|
-
|
1085
|
-
|
1086
|
-
#--
|
1087
|
-
# Since UniProtKB release 12.2 of 11-Sep-2007:
|
1088
|
-
# CC -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress]. # Old format:
|
1089
|
-
# CC -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].
|
1090
|
-
#++
|
1091
|
-
|
1092
|
-
# Parses the bodies of the "WEB RESOURCE" topic, one Hash per body:
#   {'Name' => String or nil, 'Note' => String or nil, 'URL' => String or nil}
# Both spellings of the field names are accepted ("Name"/"Note" and the
# upper-case "NAME"/"NOTE"); the URL must be enclosed in double quotes.
def cc_web_resource(data)
  data.map do |body|
    resource = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    body.split(';').each do |field|
      if field =~ /(Name|Note)\=(.+)/
        resource[$1] = $2.strip
      elsif field =~ /(NAME|NOTE)\=(.+)/
        resource[$1.downcase.capitalize] = $2.strip
      elsif field =~ /URL\=\"(.+)\"/
        resource['URL'] = $1.strip
      end
    end
    resource
  end
end
private :cc_web_resource
|
1113
|
-
|
1114
|
-
# returns databases cross-references in the DR lines.
|
1115
|
-
# * Bio::SPTR#dr -> Hash w/in Array
|
1116
|
-
#
|
1117
|
-
# === DR Line; database cross-reference (>=0)
|
1118
|
-
# DR database_identifier; primary_identifier; secondary_identifier.
|
1119
|
-
# one cross-reference per line
|
1120
|
-
# List of database identifiers used in DR lines.  Not referenced by the
# methods visible in this part of the file.
# 'MGD' and 'MIM' are separate elements; the missing comma between the two
# literals used to fuse them into the single String "MGDMIM".
@@dr_database_identifier = [
  'EMBL', 'CARBBANK', 'DICTYDB', 'ECO2DBASE', 'ECOGENE',
  'FLYBASE', 'GCRDB', 'HIV', 'HSC-2DPAGE', 'HSSP', 'INTERPRO', 'MAIZEDB',
  'MAIZE-2DPAGE', 'MENDEL', 'MGD', 'MIM', 'PDB', 'PFAM', 'PIR', 'PRINTS',
  'PROSITE', 'REBASE', 'AARHUS/GHENT-2DPAGE', 'SGD', 'STYGENE', 'SUBTILIST',
  'SWISS-2DPAGE', 'TIGR', 'TRANSFAC', 'TUBERCULIST', 'WORMPEP', 'YEPD', 'ZFIN'
]
|
1126
|
-
|
1127
|
-
# Backup Bio::EMBLDB#dr as embl_dr
|
1128
|
-
alias :embl_dr :dr
|
1129
|
-
|
1130
|
-
# Bio::SPTR#dr
|
1131
|
-
# Returns the database cross-references of the DR lines.
#
# * dr      -> whatever embl_dr returns
# * dr(key) -> Array of Hashes built from embl_dr[key]; the first four
#   elements of each item are stored under 'Accession', 'Version', ' '
#   and 'Molecular Type'.  An unknown key gives an empty Array.
def dr(key = nil)
  return embl_dr unless key

  (embl_dr[key] or []).map do |xref|
    {'Accession' => xref[0],
     'Version' => xref[1],
     ' ' => xref[2],
     'Molecular Type' => xref[3]}
  end
end
|
1143
|
-
|
1144
|
-
|
1145
|
-
# Bio::EMBLDB::Common#kw - Array
|
1146
|
-
# #keywords -> Array
|
1147
|
-
#
|
1148
|
-
# KW Line; keyword (>=1)
|
1149
|
-
# KW [Keyword;]+
|
1150
|
-
|
1151
|
-
|
1152
|
-
# returns contents in the feature table.
|
1153
|
-
#
|
1154
|
-
# == Examples
|
1155
|
-
#
|
1156
|
-
# sp = Bio::SPTR.new(entry)
|
1157
|
-
# ft = sp.ft
|
1158
|
-
# ft.class #=> Hash
|
1159
|
-
# ft.keys.each do |feature_key|
|
1160
|
-
# ft[feature_key].each do |feature|
|
1161
|
-
# feature['From'] #=> '1'
|
1162
|
-
# feature['To'] #=> '21'
|
1163
|
-
# feature['Description'] #=> ''
|
1164
|
-
# feature['FTId'] #=> ''
|
1165
|
-
# feature['diff'] #=> []
|
1166
|
-
# feature['original'] #=> [feature_key, '1', '21', '', '']
|
1167
|
-
# end
|
1168
|
-
# end
|
1169
|
-
#
|
1170
|
-
# * Bio::SPTR#ft -> Hash
|
1171
|
-
# {FEATURE_KEY => [{'From' => int, 'To' => int,
|
1172
|
-
# 'Description' => aStr, 'FTId' => aStr,
|
1173
|
-
# 'diff' => [original_residues, changed_residues],
|
1174
|
-
# 'original' => aAry }],...}
|
1175
|
-
#
|
1176
|
-
# returns an Array of the information about the feature_name in the feature table.
|
1177
|
-
# * Bio::SPTR#ft(feature_name) -> Array of Hash
|
1178
|
-
# [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
|
1179
|
-
#
|
1180
|
-
# == FT Line; feature table data (>=0, optional)
|
1181
|
-
#
|
1182
|
-
# Col Data item
|
1183
|
-
# ----- -----------------
|
1184
|
-
# 1- 2 FT
|
1185
|
-
# 6-13 Feature name
|
1186
|
-
# 15-20 `FROM' endpoint
|
1187
|
-
# 22-27 `TO' endpoint
|
1188
|
-
# 35-75 Description (>=0 per key)
|
1189
|
-
# ----- -----------------
|
1190
|
-
#
|
1191
|
-
# Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical characters
|
1192
|
-
# including '<', '>' or '?'. (c.f. '<1', '?42')
|
1193
|
-
#
|
1194
|
-
# See also http://www.expasy.org/sprot/userman.html#FT_line
|
1195
|
-
#
|
1196
|
-
# Returns the contents of the feature table.
#
# * ft              -> Hash mapping a feature name to an Array of
#     {'From' => Integer, 'To' => Integer, 'Description' => String,
#      'FTId' => String, 'diff' => Array, 'original' => Array}
# * ft(feature_key) -> the Array stored for that feature name (or nil)
#
# For VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features 'diff' holds
# [original_residues, changed_residues]: taken from an "X -> Y" description,
# or [affected subsequence, ''] for a "Missing" description.  Other
# features keep 'diff' => [].
#
# The Hash is cached in @data['FT'].  Any error while parsing is re-raised
# as a RuntimeError that carries the raw FT lines.
#
# Changes to the previous implementation:
# * the changed residues were stored in a misspelt variable, so "Missing"
#   features reported nil instead of '';
# * continuation lines are appended with sub rather than sub!, which
#   returns nil when nothing was replaced;
# * a feature line is recognised by a name starting in column 6, as in the
#   column table documented for this method.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  begin
    rows = []
    get('FT').split("\n").each do |line|
      if line =~ /^FT {3}\w/
        cols = line.chomp.ljust(74)
        rows << [cols[ 5..12].strip, # Feature Name
                 cols[14..19].strip, # From
                 cols[21..26].strip, # To
                 cols[34..74].strip] # Description
      else
        # Continuation of the description (or the /FTId= qualifier).
        rows.last << line.chomp.sub(/^FT +/, '')
      end
    end

    # Joining Description lines
    rows = rows.map do |row|
      ftid = row.pop if row.last =~ /FTId=/
      if row.size > 4
        row = [row[0], row[1], row[2], row[3, row.size - 3].join(" ")]
      end
      row << (ftid || '')
    end

    table = {}
    rows.each do |row|
      name = row[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => row[1].sub(/\D/, '').to_i,
        'To' => row[2].sub(/\D/, '').to_i,
        'Description' => row[3],
        'FTId' => row[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => row
      }
      (table[name] ||= []) << entry

      next unless ['VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'].include?(name)

      original_res = nil
      changed_res = nil
      case entry['Description']
      when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
        # Copy both captures first: gsub below overwrites the match data.
        original_res, changed_res = $1, $2
        original_res = original_res.gsub(/ /,'').strip
        changed_res = changed_res.gsub(/ /,'').strip
      when /Missing/i
        original_res = seq.subseq(entry['From'], entry['To'])
        changed_res = ''
      end
      entry['diff'] = [original_res, changed_res]
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  @data['FT'] = table
end
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
# returns a Hash of the contents of the SQ lines.
|
1265
|
-
# * Bio::SPTRL#sq -> hsh
|
1266
|
-
#
|
1267
|
-
# returns a value of a key given in the SQ lines.
|
1268
|
-
# * Bio::SPTRL#sq(key) -> int or str
|
1269
|
-
# * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
|
1270
|
-
# 'CRC64']
|
1271
|
-
#
|
1272
|
-
# === SQ Line; sequence header (1/entry)
|
1273
|
-
# SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
|
1274
|
-
# SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
|
1275
|
-
#
|
1276
|
-
# MW, Dalton unit.
|
1277
|
-
# CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
|
1278
|
-
# Returns the sequence header parsed from the SQ line.
#
# * sq      -> {'aalen' => Integer, 'MW' => Integer, 'CRC64' => String}
# * sq(key) -> one value: a key matching /mw/, /molecular/ or /weight/
#   selects 'MW'; a key matching /len/, /length/ or /AA/ selects 'aalen';
#   any other key is looked up as it is.
#
# The Hash is cached in @data['SQ'].
# Raises RuntimeError when the SQ line does not have the expected form.
def sq(key = nil)
  unless @data['SQ']
    unless fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
    @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
  end
  return @data['SQ'] unless key

  field = case key
          when /mw/, /molecular/, /weight/ then 'MW'
          when /len/, /length/, /AA/ then 'aalen'
          else key
          end
  @data['SQ'][field]
end
|
1300
|
-
|
1301
|
-
|
1302
|
-
# returns a Bio::Sequence::AA of the amino acid sequence.
|
1303
|
-
# * Bio::SPTR#seq -> Bio::Sequence::AA
|
1304
|
-
#
|
1305
|
-
# blank Line; sequence data (>=1)
|
1306
|
-
# Returns the amino acid sequence as a Sequence::AA, built from the
# sequence data lines with all blanks and digit runs removed.  The object
# is cached in @data[''].  #aaseq is an alias.
def seq
  @data[''] ||= Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
end
alias aaseq seq
|
1313
|
-
|
1314
|
-
end # class SPTR
|
1315
|
-
|
1316
|
-
end # module Bio
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
=begin
|
1321
|
-
|
1322
|
-
= Bio::SPTR < Bio::DB
|
1323
|
-
|
1324
|
-
Class for a entry in the SWISS-PROT/TrEMBL database.
|
1325
|
-
|
1326
|
-
* ((<URL:http://www.ebi.ac.uk/swissprot/>))
|
1327
|
-
* ((<URL:http://www.ebi.ac.uk/trembl/>))
|
1328
|
-
* ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
|
1329
|
-
|
1330
|
-
|
1331
|
-
--- Bio::SPTR.new(a_sp_entry)
|
1332
|
-
|
1333
|
-
=== ID line (Identification)
|
1334
|
-
|
1335
|
-
--- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
|
1336
|
-
'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
|
1337
|
-
--- Bio::SPTR#id_line(key) -> str
|
1338
|
-
|
1339
|
-
key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
|
1340
|
-
|
1341
|
-
--- Bio::SPTR#entry_id -> str
|
1342
|
-
--- Bio::SPTR#molecule -> str
|
1343
|
-
--- Bio::SPTR#sequence_length -> int
|
1344
|
-
|
1345
|
-
|
1346
|
-
=== AC lines (Accession number)
|
1347
|
-
|
1348
|
-
--- Bio::SPTR#ac -> ary
|
1349
|
-
--- Bio::SPTR#accessions -> ary
|
1350
|
-
--- Bio::SPTR#accession -> accessions.first
|
1351
|
-
|
1352
|
-
|
1353
|
-
=== GN line (Gene name(s))
|
1354
|
-
|
1355
|
-
--- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
|
1356
|
-
--- Bio::SPTR#gene_name -> str
|
1357
|
-
--- Bio::SPTR#gene_names -> [str] or [str]
|
1358
|
-
|
1359
|
-
|
1360
|
-
=== DT lines (Date)
|
1361
|
-
|
1362
|
-
--- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
|
1363
|
-
--- Bio::SPTR#dt(key) -> str
|
1364
|
-
|
1365
|
-
key := (created|annotation|sequence)
|
1366
|
-
|
1367
|
-
|
1368
|
-
=== DE lines (Description)
|
1369
|
-
|
1370
|
-
--- Bio::SPTR#de -> str
|
1371
|
-
#definition -> str
|
1372
|
-
|
1373
|
-
--- Bio::SPTR#protein_name
|
1374
|
-
|
1375
|
-
Returns the proposed official name of the protein
|
1376
|
-
|
1377
|
-
|
1378
|
-
--- Bio::SPTR#synonyms
|
1379
|
-
|
1380
|
-
Returns an array of synonyms (unofficial names)
|
1381
|
-
|
1382
|
-
=== KW lines (Keyword)
|
1383
|
-
|
1384
|
-
--- Bio::SPTR#kw -> ary
|
1385
|
-
|
1386
|
-
=== OS lines (Organism species)
|
1387
|
-
|
1388
|
-
--- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
|
1389
|
-
|
1390
|
-
=== OC lines (organism classification)
|
1391
|
-
|
1392
|
-
--- Bio::SPTR#oc -> ary
|
1393
|
-
|
1394
|
-
=== OG line (Organella)
|
1395
|
-
|
1396
|
-
--- Bio::SPTR#og -> ary
|
1397
|
-
|
1398
|
-
=== OX line (Organism taxonomy cross-reference)
|
1399
|
-
|
1400
|
-
--- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
|
1401
|
-
|
1402
|
-
=== RN RC RP RX RA RT RL RG lines (Reference)
|
1403
|
-
|
1404
|
-
--- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RG' => str},...]
|
1405
|
-
|
1406
|
-
=== DR lines (Database cross-reference)
|
1407
|
-
|
1408
|
-
--- Bio::SPTR#dr -> {'EMBL' => ary, ...}
|
1409
|
-
|
1410
|
-
=== FT lines (Feature table data)
|
1411
|
-
|
1412
|
-
--- Bio::SPTR#ft -> hsh
|
1413
|
-
|
1414
|
-
=== SQ lines (Sequence header and data)
|
1415
|
-
|
1416
|
-
--- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
|
1417
|
-
--- Bio::SPTR#sq(key) -> int or str
|
1418
|
-
|
1419
|
-
key := (aalen|MW|CRC64)
|
1420
|
-
|
1421
|
-
--- Bio::SPTR#seq -> Bio::Sequence::AA
|
1422
|
-
#aaseq -> Bio::Sequece::AA
|
1423
|
-
|
1424
|
-
=end
|
12
|
+
require "bio/db/embl/uniprotkb" unless const_defined?(:UniProtKB)
|
1425
13
|
|
1426
|
-
#
|
1427
|
-
#
|
1428
|
-
#
|
1429
|
-
|
1430
|
-
# DT - date (3 per entry)
|
1431
|
-
# DE - description (>=1 per entry)
|
1432
|
-
# GN - gene name(s) (>=0 per entry; optional)
|
1433
|
-
# OS - organism species (>=1 per entry)
|
1434
|
-
# OG - organelle (0 or 1 per entry; optional)
|
1435
|
-
# OC - organism classification (>=1 per entry)
|
1436
|
-
# OX - organism taxonomy x-ref (>=1 per entry)
|
1437
|
-
# OH - Organism Host
|
1438
|
-
# RN - reference number (>=1 per entry)
|
1439
|
-
# RP - reference positions (>=1 per entry)
|
1440
|
-
# RC - reference comment(s) (>=0 per entry; optional)
|
1441
|
-
# RX - reference cross-reference(s) (>=0 per entry; optional)
|
1442
|
-
# RA - reference author(s) (>=1 per entry)
|
1443
|
-
# RT - reference title (>=0 per entry; optional)
|
1444
|
-
# RL - reference location (>=1 per entry)
|
1445
|
-
# RG - reference group(s)
|
1446
|
-
# CC - comments or notes (>=0 per entry; optional)
|
1447
|
-
# DR - database cross-references (>=0 per entry; optional)
|
1448
|
-
# KW - keywords (>=1 per entry)
|
1449
|
-
# FT - feature table data (>=0 per entry; optional)
|
1450
|
-
# SQ - sequence header (1 per entry)
|
1451
|
-
# - (blanks) The sequence data (>=1 per entry)
|
1452
|
-
# // - termination line (ends each entry; 1 per entry)
|
1453
|
-
# ---- --------------------------- --------------------------------
|
14
|
+
# Bio::SPTR is changed to an alias of Bio::UniProtKB.
|
15
|
+
# Please use Bio::UniProtKB.
|
16
|
+
# Bio::SPTR may be deprecated in the future.
|
17
|
+
SPTR = UniProtKB
|
1454
18
|
|
19
|
+
end #module Bio
|
1455
20
|
|