bio 1.4.3.0001 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +39 -33
- data/BSDL +22 -0
- data/COPYING +2 -2
- data/COPYING.ja +36 -36
- data/ChangeLog +2404 -1025
- data/KNOWN_ISSUES.rdoc +15 -55
- data/README.rdoc +17 -23
- data/RELEASE_NOTES.rdoc +246 -183
- data/Rakefile +3 -2
- data/bin/br_biofetch.rb +29 -5
- data/bioruby.gemspec +15 -32
- data/bioruby.gemspec.erb +10 -20
- data/doc/ChangeLog-1.4.3 +1478 -0
- data/doc/RELEASE_NOTES-1.4.3.rdoc +204 -0
- data/doc/Tutorial.rd +0 -6
- data/doc/Tutorial.rd.html +7 -12
- data/doc/Tutorial.rd.ja +960 -1064
- data/doc/Tutorial.rd.ja.html +977 -1067
- data/gemfiles/Gemfile.travis-jruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-jruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-rbx +13 -0
- data/gemfiles/Gemfile.travis-ruby1.8 +2 -1
- data/gemfiles/Gemfile.travis-ruby1.9 +2 -4
- data/gemfiles/Gemfile.travis-ruby2.2 +9 -0
- data/lib/bio.rb +10 -43
- data/lib/bio/alignment.rb +8 -14
- data/lib/bio/appl/blast.rb +1 -2
- data/lib/bio/appl/blast/format0.rb +18 -7
- data/lib/bio/appl/blast/remote.rb +0 -9
- data/lib/bio/appl/blast/report.rb +1 -1
- data/lib/bio/appl/clustalw/report.rb +3 -1
- data/lib/bio/appl/genscan/report.rb +1 -2
- data/lib/bio/appl/iprscan/report.rb +1 -2
- data/lib/bio/appl/meme/mast.rb +4 -4
- data/lib/bio/appl/meme/mast/report.rb +1 -1
- data/lib/bio/appl/paml/codeml.rb +2 -2
- data/lib/bio/appl/paml/codeml/report.rb +1 -0
- data/lib/bio/appl/paml/common.rb +1 -1
- data/lib/bio/appl/sosui/report.rb +1 -2
- data/lib/bio/command.rb +62 -2
- data/lib/bio/data/aa.rb +13 -31
- data/lib/bio/data/codontable.rb +1 -2
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +1 -0
- data/lib/bio/db/biosql/sequence.rb +1 -1
- data/lib/bio/db/embl/common.rb +1 -1
- data/lib/bio/db/embl/embl.rb +5 -4
- data/lib/bio/db/embl/format_embl.rb +3 -3
- data/lib/bio/db/embl/sptr.rb +9 -1444
- data/lib/bio/db/embl/swissprot.rb +12 -29
- data/lib/bio/db/embl/trembl.rb +13 -30
- data/lib/bio/db/embl/uniprot.rb +12 -29
- data/lib/bio/db/embl/uniprotkb.rb +1455 -0
- data/lib/bio/db/fasta.rb +17 -0
- data/lib/bio/db/fasta/defline.rb +1 -3
- data/lib/bio/db/fastq.rb +1 -1
- data/lib/bio/db/genbank/ddbj.rb +9 -5
- data/lib/bio/db/genbank/refseq.rb +11 -3
- data/lib/bio/db/gff.rb +3 -4
- data/lib/bio/db/go.rb +5 -6
- data/lib/bio/db/kegg/module.rb +4 -5
- data/lib/bio/db/kegg/pathway.rb +4 -5
- data/lib/bio/db/kegg/reaction.rb +1 -1
- data/lib/bio/db/nexus.rb +3 -2
- data/lib/bio/db/pdb/pdb.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +82 -59
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +2 -2
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +1 -2
- data/lib/bio/db/sanger_chromatogram/chromatogram.rb +1 -2
- data/lib/bio/db/transfac.rb +1 -1
- data/lib/bio/io/das.rb +40 -41
- data/lib/bio/io/fastacmd.rb +0 -16
- data/lib/bio/io/fetch.rb +111 -55
- data/lib/bio/io/flatfile/buffer.rb +4 -5
- data/lib/bio/io/hinv.rb +2 -3
- data/lib/bio/io/ncbirest.rb +43 -6
- data/lib/bio/io/pubmed.rb +76 -81
- data/lib/bio/io/togows.rb +33 -10
- data/lib/bio/map.rb +1 -1
- data/lib/bio/pathway.rb +1 -1
- data/lib/bio/sequence/compat.rb +1 -1
- data/lib/bio/sequence/na.rb +63 -12
- data/lib/bio/shell.rb +0 -2
- data/lib/bio/shell/core.rb +5 -6
- data/lib/bio/shell/interface.rb +3 -4
- data/lib/bio/shell/irb.rb +1 -2
- data/lib/bio/shell/plugin/entry.rb +2 -3
- data/lib/bio/shell/plugin/seq.rb +7 -6
- data/lib/bio/shell/setup.rb +1 -2
- data/lib/bio/tree.rb +2 -2
- data/lib/bio/util/contingency_table.rb +0 -2
- data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +2 -2
- data/lib/bio/util/sirna.rb +76 -16
- data/lib/bio/version.rb +8 -9
- data/sample/benchmark_clustalw_report.rb +47 -0
- data/sample/biofetch.rb +248 -151
- data/setup.rb +6 -7
- data/test/data/clustalw/example1-seqnos.aln +58 -0
- data/test/network/bio/appl/blast/test_remote.rb +1 -15
- data/test/network/bio/appl/test_blast.rb +0 -12
- data/test/network/bio/io/test_pubmed.rb +49 -0
- data/test/network/bio/io/test_togows.rb +0 -1
- data/test/network/bio/test_command.rb +65 -2
- data/test/unit/bio/appl/bl2seq/test_report.rb +0 -1
- data/test/unit/bio/appl/blast/test_report.rb +110 -48
- data/test/unit/bio/appl/clustalw/test_report.rb +67 -51
- data/test/unit/bio/appl/sim4/test_report.rb +46 -17
- data/test/unit/bio/appl/test_blast.rb +2 -2
- data/test/unit/bio/db/embl/test_embl.rb +0 -1
- data/test/unit/bio/db/embl/test_embl_rel89.rb +0 -1
- data/test/unit/bio/db/embl/{test_sptr.rb → test_uniprotkb.rb} +111 -115
- data/test/unit/bio/db/embl/{test_uniprot_new_part.rb → test_uniprotkb_new_part.rb} +11 -11
- data/test/unit/bio/db/genbank/test_genbank.rb +10 -4
- data/test/unit/bio/db/pdb/test_pdb.rb +14 -8
- data/test/unit/bio/db/test_fasta.rb +41 -1
- data/test/unit/bio/db/test_fastq.rb +14 -4
- data/test/unit/bio/db/test_gff.rb +2 -2
- data/test/unit/bio/db/test_phyloxml.rb +30 -30
- data/test/unit/bio/db/test_phyloxml_writer.rb +2 -2
- data/test/unit/bio/io/flatfile/test_autodetection.rb +1 -2
- data/test/unit/bio/io/flatfile/test_buffer.rb +7 -1
- data/test/unit/bio/io/flatfile/test_splitter.rb +1 -1
- data/test/unit/bio/io/test_togows.rb +3 -2
- data/test/unit/bio/sequence/test_dblink.rb +1 -1
- data/test/unit/bio/sequence/test_na.rb +3 -1
- data/test/unit/bio/test_alignment.rb +1 -2
- data/test/unit/bio/test_command.rb +5 -4
- data/test/unit/bio/test_db.rb +4 -2
- data/test/unit/bio/test_pathway.rb +25 -10
- data/test/unit/bio/util/test_sirna.rb +22 -22
- metadata +656 -1430
- data/doc/KEGG_API.rd +0 -1843
- data/doc/KEGG_API.rd.ja +0 -1834
- data/extconf.rb +0 -2
- data/lib/bio/appl/blast/ddbj.rb +0 -131
- data/lib/bio/db/kegg/taxonomy.rb +0 -280
- data/lib/bio/io/dbget.rb +0 -194
- data/lib/bio/io/ddbjrest.rb +0 -344
- data/lib/bio/io/ddbjxml.rb +0 -458
- data/lib/bio/io/ebisoap.rb +0 -158
- data/lib/bio/io/ensembl.rb +0 -229
- data/lib/bio/io/higet.rb +0 -73
- data/lib/bio/io/keggapi.rb +0 -363
- data/lib/bio/io/ncbisoap.rb +0 -156
- data/lib/bio/io/soapwsdl.rb +0 -119
- data/lib/bio/shell/plugin/keggapi.rb +0 -181
- data/lib/bio/shell/plugin/soap.rb +0 -87
- data/sample/dbget +0 -37
- data/sample/demo_ddbjxml.rb +0 -212
- data/sample/demo_kegg_taxonomy.rb +0 -92
- data/sample/demo_keggapi.rb +0 -502
- data/sample/psortplot_html.rb +0 -214
- data/test/network/bio/io/test_ddbjrest.rb +0 -47
- data/test/network/bio/io/test_ensembl.rb +0 -230
- data/test/network/bio/io/test_soapwsdl.rb +0 -53
- data/test/unit/bio/io/test_ddbjxml.rb +0 -81
- data/test/unit/bio/io/test_ensembl.rb +0 -111
- data/test/unit/bio/io/test_soapwsdl.rb +0 -33
@@ -5,7 +5,6 @@
|
|
5
5
|
# Mitsuteru C. Nakao <n@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id:$
|
9
8
|
#
|
10
9
|
# == Example
|
11
10
|
#
|
@@ -60,7 +59,7 @@ module Bio
|
|
60
59
|
if /NUMBER OF TM HELIX = (\d+)/ =~ line
|
61
60
|
@tms = $1
|
62
61
|
elsif /TM (\d+) +(\d+)- *(\d+) (\w+) +(\w+)/ =~ line
|
63
|
-
tmh = $1.to_i
|
62
|
+
#tmh = $1.to_i
|
64
63
|
range = Range.new($2.to_i, $3.to_i)
|
65
64
|
grade = $4
|
66
65
|
seq = $5
|
data/lib/bio/command.rb
CHANGED
@@ -6,7 +6,6 @@
|
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id:$
|
10
9
|
#
|
11
10
|
|
12
11
|
require 'open3'
|
@@ -856,7 +855,7 @@ module Command
|
|
856
855
|
end.join('&')
|
857
856
|
end
|
858
857
|
when String
|
859
|
-
|
858
|
+
raise TypeError, 'Bio::Command.make_cgi_params no longer accepts a single String as a form'
|
860
859
|
end
|
861
860
|
return data
|
862
861
|
end
|
@@ -882,6 +881,67 @@ module Command
|
|
882
881
|
return result
|
883
882
|
end
|
884
883
|
|
884
|
+
# Same as:
|
885
|
+
# http = Net::HTTP.new(...); http.post(path, data, header)
|
886
|
+
# and
|
887
|
+
# it uses proxy if an environment variable (same as OpenURI.open_uri)
|
888
|
+
# is set.
|
889
|
+
# In addition, +header+ can be set.
|
890
|
+
# (Default Content-Type is application/octet-stream.
|
891
|
+
# Content-Length is automatically set by default.)
|
892
|
+
# +http+ must be a Net::HTTP object (or compatible), +path+ and +data+ must be Strings, and
|
893
|
+
# +header+ must be a hash.
|
894
|
+
#
|
895
|
+
# ---
|
896
|
+
# *Arguments*:
|
897
|
+
# * (required) _http_: Net::HTTP object or compatible object
|
898
|
+
# * (required) _path_: String
|
899
|
+
# * (required) _data_: String containing data
|
900
|
+
# * (optional) _header_: Hash containing header strings
|
901
|
+
# *Returns*:: (same as Net::HTTP::post)
|
902
|
+
def http_post(http, path, data, header = {})
|
903
|
+
hash = {
|
904
|
+
'Content-Type' => 'application/octet-stream',
|
905
|
+
'Content-Length' => data.length.to_s
|
906
|
+
}
|
907
|
+
hash.update(header)
|
908
|
+
|
909
|
+
http.post(path, data, hash)
|
910
|
+
end
|
911
|
+
|
912
|
+
# Same as:
|
913
|
+
# Net::HTTP.post(uri, params)
|
914
|
+
# and
|
915
|
+
# it uses proxy if an environment variable (same as OpenURI.open_uri)
|
916
|
+
# is set.
|
917
|
+
# In addition, +header+ can be set.
|
918
|
+
# (Default Content-Type is application/octet-stream.
|
919
|
+
# Content-Length is automatically set by default.)
|
920
|
+
# +uri+ must be a URI object or a String, +data+ must be a String, and
|
921
|
+
# +header+ must be a hash.
|
922
|
+
#
|
923
|
+
# ---
|
924
|
+
# *Arguments*:
|
925
|
+
# * (required) _uri_: URI object or String
|
926
|
+
# * (required) _data_: String containing data
|
927
|
+
# * (optional) _header_: Hash containing header strings
|
928
|
+
# *Returns*:: (same as Net::HTTP::post)
|
929
|
+
def post(uri, data, header = {})
|
930
|
+
unless uri.is_a?(URI)
|
931
|
+
uri = URI.parse(uri)
|
932
|
+
end
|
933
|
+
|
934
|
+
hash = {
|
935
|
+
'Content-Type' => 'application/octet-stream',
|
936
|
+
'Content-Length' => data.length.to_s
|
937
|
+
}
|
938
|
+
hash.update(header)
|
939
|
+
|
940
|
+
start_http(uri.host, uri.port) do |http|
|
941
|
+
http.post(uri.path, data, hash)
|
942
|
+
end
|
943
|
+
end
|
944
|
+
|
885
945
|
end # module Command
|
886
946
|
end # module Bio
|
887
947
|
|
data/lib/bio/data/aa.rb
CHANGED
@@ -108,23 +108,21 @@ class AminoAcid
|
|
108
108
|
}
|
109
109
|
|
110
110
|
def weight(x = nil)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
111
|
+
return WEIGHT unless x
|
112
|
+
|
113
|
+
if x.length > 1
|
114
|
+
total = 0.0
|
115
|
+
x.each_byte do |byte|
|
116
|
+
aa = byte.chr.upcase
|
117
|
+
if WEIGHT[aa]
|
118
|
+
total += WEIGHT[aa]
|
119
|
+
else
|
120
|
+
raise "Error: invalid amino acid '#{aa}'"
|
121
121
|
end
|
122
|
-
total -= NucleicAcid.weight[:water] * (x.length - 1)
|
123
|
-
else
|
124
|
-
WEIGHT[x]
|
125
122
|
end
|
123
|
+
total -= NucleicAcid.weight[:water] * (x.length - 1)
|
126
124
|
else
|
127
|
-
WEIGHT
|
125
|
+
WEIGHT[x]
|
128
126
|
end
|
129
127
|
end
|
130
128
|
|
@@ -237,11 +235,7 @@ class AminoAcid
|
|
237
235
|
|
238
236
|
|
239
237
|
def reverse
|
240
|
-
|
241
|
-
NAMES.each do |k, v|
|
242
|
-
hash[v] = k
|
243
|
-
end
|
244
|
-
hash
|
238
|
+
@reverse ||= NAMES.invert
|
245
239
|
end
|
246
240
|
|
247
241
|
end
|
@@ -254,18 +248,6 @@ class AminoAcid
|
|
254
248
|
extend Data
|
255
249
|
|
256
250
|
|
257
|
-
private
|
258
|
-
|
259
|
-
|
260
|
-
# override when used as an instance method to improve performance
|
261
|
-
alias orig_reverse reverse
|
262
|
-
def reverse
|
263
|
-
unless @reverse
|
264
|
-
@reverse = orig_reverse
|
265
|
-
end
|
266
|
-
@reverse
|
267
|
-
end
|
268
|
-
|
269
251
|
end
|
270
252
|
|
271
253
|
end # module Bio
|
data/lib/bio/data/codontable.rb
CHANGED
@@ -5,7 +5,6 @@
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id:$
|
9
8
|
#
|
10
9
|
# == Data source
|
11
10
|
#
|
@@ -127,7 +126,7 @@ class CodonTable
|
|
127
126
|
# table.revtrans("A") # => ["gcg", "gct", "gca", "gcc"]
|
128
127
|
#
|
129
128
|
def revtrans(aa)
|
130
|
-
unless @reverse
|
129
|
+
unless (defined? @reverse) && @reverse
|
131
130
|
@reverse = {}
|
132
131
|
@table.each do |k, v|
|
133
132
|
@reverse[v] ||= []
|
@@ -371,7 +371,7 @@ module Bio
|
|
371
371
|
#probably would be better to add a reference class to collect this information
|
372
372
|
@entry.bioentry_references.collect do |bio_ref|
|
373
373
|
hash = Hash.new
|
374
|
-
hash['authors'] = bio_ref.reference.authors.gsub(/\.\s/, "\.\s\|").split(/\|/)
|
374
|
+
hash['authors'] = bio_ref.reference.authors.gsub(/\.\s/, "\.\s\|").split(/\|/) if (bio_ref.reference and bio_ref.reference.authors)
|
375
375
|
|
376
376
|
hash['sequence_position'] = "#{bio_ref.start_pos}-#{bio_ref.end_pos}" if (bio_ref.start_pos and bio_ref.end_pos)
|
377
377
|
hash['title'] = bio_ref.reference.title
|
data/lib/bio/db/embl/common.rb
CHANGED
@@ -149,7 +149,7 @@ module Common
|
|
149
149
|
unless @data['OS']
|
150
150
|
os = Array.new
|
151
151
|
fetch('OS').split(/, and|, /).each do |tmp|
|
152
|
-
if tmp =~ /([A-Z][a-z]* *[\w
|
152
|
+
if tmp =~ /([A-Z][a-z]* *[\w \:\'\+\-]+\w)/
|
153
153
|
org = $1
|
154
154
|
tmp =~ /(\(.+\))/
|
155
155
|
os.push({'name' => $1, 'os' => org})
|
data/lib/bio/db/embl/embl.rb
CHANGED
@@ -267,9 +267,10 @@ class EMBL < EMBLDB
|
|
267
267
|
unless @data['OS']
|
268
268
|
os = Array.new
|
269
269
|
tmp = fetch('OS')
|
270
|
-
if /([A-Z][a-z]* *[\w
|
270
|
+
if /([A-Z][a-z]* *[\w \:\'\+\-]+\w) *\(([\w ]+)\)\s*\z/ =~ tmp
|
271
271
|
org = $1
|
272
|
-
|
272
|
+
name = $2
|
273
|
+
os.push({'name' => name, 'os' => org})
|
273
274
|
else
|
274
275
|
os.push({'name' => nil, 'os' => tmp})
|
275
276
|
end
|
@@ -340,7 +341,7 @@ class EMBL < EMBLDB
|
|
340
341
|
@orig['FT'].each_line do |line|
|
341
342
|
next if line =~ /^FEATURES/
|
342
343
|
|
343
|
-
head = line[0,20].strip # feature key (source, CDS, ...)
|
344
|
+
#head = line[0,20].strip # feature key (source, CDS, ...)
|
344
345
|
body = line[20,60].chomp # feature value (position, /qualifier=)
|
345
346
|
if line =~ /^FT {3}(\S+)/
|
346
347
|
ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
|
@@ -491,7 +492,7 @@ class EMBL < EMBLDB
|
|
491
492
|
def parse_release_version(str)
|
492
493
|
return [ nil, nil ] unless str
|
493
494
|
a = str.split(/[\(\,\)]/)
|
494
|
-
|
495
|
+
a.shift #date string e.g. "14-OCT-2006"
|
495
496
|
rel = nil
|
496
497
|
ver = nil
|
497
498
|
a.each do |x|
|
@@ -126,9 +126,9 @@ module Bio::Sequence::Format::NucFormatter
|
|
126
126
|
def mol_type_embl
|
127
127
|
if mt = molecule_type then
|
128
128
|
mt
|
129
|
-
elsif
|
130
|
-
|
131
|
-
|
129
|
+
elsif fe = (features or []).find { |f| f.feature == 'source' } and
|
130
|
+
qu = fe.qualifiers.find { |q| q.qualifier == 'mol_type' } then
|
131
|
+
qu.value
|
132
132
|
else
|
133
133
|
'NA'
|
134
134
|
end
|
data/lib/bio/db/embl/sptr.rb
CHANGED
@@ -1,1455 +1,20 @@
|
|
1
1
|
#
|
2
|
-
# = bio/db/embl/sptr.rb -
|
2
|
+
# = bio/db/embl/sptr.rb - Bio::SPTR is an alias of Bio::UniProtKB
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C)
|
4
|
+
# Copyright:: Copyright (C) 2013 BioRuby Project
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id:$
|
8
|
-
#
|
9
|
-
# == Description
|
10
|
-
#
|
11
|
-
# Shared methods for UniProtKB/SwissProt and TrEMBL classes.
|
12
|
-
#
|
13
|
-
# See the SWISS-PROT document file SPECLIST.TXT or UniProtKB/SwissProt
|
14
|
-
# user manual.
|
15
|
-
#
|
16
|
-
# == Examples
|
17
|
-
#
|
18
|
-
# str = File.read("p53_human.swiss")
|
19
|
-
# obj = Bio::SPTR.new(str)
|
20
|
-
# obj.entry_id #=> "P53_HUMAN"
|
21
|
-
#
|
22
|
-
# == References
|
23
|
-
#
|
24
|
-
# * Swiss-Prot Protein knowledgebase. TrEMBL Computer-annotated supplement
|
25
|
-
# to Swiss-Prot
|
26
|
-
# http://au.expasy.org/sprot/
|
27
|
-
#
|
28
|
-
# * UniProt
|
29
|
-
# http://uniprot.org/
|
30
|
-
#
|
31
|
-
# * The UniProtKB/SwissProt/TrEMBL User Manual
|
32
|
-
# http://www.expasy.org/sprot/userman.html
|
33
|
-
#
|
34
|
-
|
35
7
|
|
36
|
-
|
37
|
-
require 'bio/db/embl/common'
|
8
|
+
warn "Bio::SPTR is changed to an alias of Bio::UniProtKB. Please use Bio::UniProtKB. Bio::SPTR may be deprecated in the future." if $VERBOSE
|
38
9
|
|
39
10
|
module Bio
|
40
11
|
|
41
|
-
|
42
|
-
class SPTR < EMBLDB
|
43
|
-
include Bio::EMBLDB::Common
|
44
|
-
|
45
|
-
@@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
|
46
|
-
@@data_class = ["STANDARD", "PRELIMINARY"]
|
47
|
-
|
48
|
-
# returns a Hash of the ID line.
|
49
|
-
#
|
50
|
-
# returns a content (Int or String) of the ID line by a given key.
|
51
|
-
# Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
|
52
|
-
#
|
53
|
-
# === ID Line (since UniProtKB release 9.0 of 31-Oct-2006)
|
54
|
-
# ID P53_HUMAN Reviewed; 393 AA.
|
55
|
-
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
|
56
|
-
#
|
57
|
-
# === Examples
|
58
|
-
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed",
|
59
|
-
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}
|
60
|
-
#
|
61
|
-
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
62
|
-
#
|
63
|
-
#
|
64
|
-
# === ID Line (older style)
|
65
|
-
# ID P53_HUMAN STANDARD; PRT; 393 AA.
|
66
|
-
# #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
|
67
|
-
#
|
68
|
-
# === Examples
|
69
|
-
# obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD",
|
70
|
-
# "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}
|
71
|
-
#
|
72
|
-
# obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
|
73
|
-
#
|
74
|
-
def id_line(key = nil)
|
75
|
-
return id_line[key] if key
|
76
|
-
return @data['ID'] if @data['ID']
|
77
|
-
|
78
|
-
part = @orig['ID'].split(/ +/)
|
79
|
-
if part[4].to_s.chomp == 'AA.' then
|
80
|
-
# after UniProtKB release 9.0 of 31-Oct-2006
|
81
|
-
# (http://www.uniprot.org/docs/sp_news.htm)
|
82
|
-
molecule_type = nil
|
83
|
-
sequence_length = part[3].to_i
|
84
|
-
else
|
85
|
-
molecule_type = part[3].sub(/;/,'')
|
86
|
-
sequence_length = part[4].to_i
|
87
|
-
end
|
88
|
-
@data['ID'] = {
|
89
|
-
'ENTRY_NAME' => part[1],
|
90
|
-
'DATA_CLASS' => part[2].sub(/;/,''),
|
91
|
-
'MOLECULE_TYPE' => molecule_type,
|
92
|
-
'SEQUENCE_LENGTH' => sequence_length
|
93
|
-
}
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
# returns a ENTRY_NAME in the ID line.
|
98
|
-
#
|
99
|
-
def entry_id
|
100
|
-
id_line('ENTRY_NAME')
|
101
|
-
end
|
102
|
-
alias entry_name entry_id
|
103
|
-
alias entry entry_id
|
104
|
-
|
105
|
-
|
106
|
-
# returns a MOLECULE_TYPE in the ID line.
|
107
|
-
#
|
108
|
-
# A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
|
109
|
-
def molecule
|
110
|
-
id_line('MOLECULE_TYPE')
|
111
|
-
end
|
112
|
-
alias molecule_type molecule
|
113
|
-
|
114
|
-
|
115
|
-
# returns a SEQUENCE_LENGTH in the ID line.
|
116
|
-
#
|
117
|
-
# A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
|
118
|
-
def sequence_length
|
119
|
-
id_line('SEQUENCE_LENGTH')
|
120
|
-
end
|
121
|
-
alias aalen sequence_length
|
122
|
-
|
123
|
-
|
124
|
-
# Bio::EMBLDB::Common#ac -> ary
|
125
|
-
# #accessions -> ary
|
126
|
-
# #accession -> String (accessions.first)
|
127
|
-
@@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
# returns a Hash of information in the DT lines.
|
132
|
-
# hash keys:
|
133
|
-
# ['created', 'sequence', 'annotation']
|
134
|
-
#--
|
135
|
-
# also Symbols acceptable (ASAP):
|
136
|
-
# [:created, :sequence, :annotation]
|
137
|
-
#++
|
138
|
-
#
|
139
|
-
# Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is
|
140
|
-
# changed, and the word "annotation" is no longer used in DT lines.
|
141
|
-
# Despite the change, the word "annotation" is still used for keeping
|
142
|
-
# compatibility.
|
143
|
-
#
|
144
|
-
# returns a String of information in the DT lines by a given key.
|
145
|
-
#
|
146
|
-
# === DT Line; date (3/entry)
|
147
|
-
# DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.)
|
148
|
-
# DT DD-MMM-YYY (sequence version NN)
|
149
|
-
# DT DD-MMM-YYY (entry version NN)
|
150
|
-
#
|
151
|
-
# The format have been changed in UniProtKB release 7.0 of 07-Feb-2006.
|
152
|
-
# Below is the older format.
|
153
|
-
#
|
154
|
-
# === Old format of DT Line; date (3/entry)
|
155
|
-
# DT DD-MMM-YYY (rel. NN, Created)
|
156
|
-
# DT DD-MMM-YYY (rel. NN, Last sequence update)
|
157
|
-
# DT DD-MMM-YYY (rel. NN, Last annotation update)
|
158
|
-
def dt(key = nil)
|
159
|
-
return dt[key] if key
|
160
|
-
return @data['DT'] if @data['DT']
|
161
|
-
|
162
|
-
part = self.get('DT').split(/\n/)
|
163
|
-
@data['DT'] = {
|
164
|
-
'created' => part[0].sub(/\w{2} /,'').strip,
|
165
|
-
'sequence' => part[1].sub(/\w{2} /,'').strip,
|
166
|
-
'annotation' => part[2].sub(/\w{2} /,'').strip
|
167
|
-
}
|
168
|
-
end
|
169
|
-
|
170
|
-
|
171
|
-
# (private) parses DE line (description lines)
|
172
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
173
|
-
#
|
174
|
-
# Return array containing array.
|
175
|
-
#
|
176
|
-
# http://www.uniprot.org/docs/sp_news.htm
|
177
|
-
def parse_DE_line_rel14(str)
|
178
|
-
# Returns nil if it is not the new format since Rel.14
|
179
|
-
return nil unless /^DE (RecName|AltName|SubName)\: / =~ str
|
180
|
-
ret = []
|
181
|
-
cur = nil
|
182
|
-
str.each_line do |line|
|
183
|
-
case line
|
184
|
-
when /^DE (Includes|Contains)\: *$/
|
185
|
-
cur = [ $1 ]
|
186
|
-
ret.push cur
|
187
|
-
cur = nil
|
188
|
-
#subcat_and_desc = nil
|
189
|
-
next
|
190
|
-
when /^DE *(RecName|AltName|SubName)\: +(.*)/
|
191
|
-
category = $1
|
192
|
-
subcat_and_desc = $2
|
193
|
-
cur = [ category ]
|
194
|
-
ret.push cur
|
195
|
-
when /^DE *(Flags)\: +(.*)/
|
196
|
-
category = $1
|
197
|
-
desc = $2
|
198
|
-
flags = desc.strip.split(/\s*\;\s*/) || []
|
199
|
-
cur = [ category, flags ]
|
200
|
-
ret.push cur
|
201
|
-
cur = nil
|
202
|
-
#subcat_and_desc = nil
|
203
|
-
next
|
204
|
-
when /^DE *(.*)/
|
205
|
-
subcat_and_desc = $1
|
206
|
-
else
|
207
|
-
warn "Warning: skipped DE line in unknown format: #{line.inspect}"
|
208
|
-
#subcat_and_desc = nil
|
209
|
-
next
|
210
|
-
end
|
211
|
-
case subcat_and_desc
|
212
|
-
when nil
|
213
|
-
# does nothing
|
214
|
-
when /\A([^\=]+)\=(.*)/
|
215
|
-
subcat = $1
|
216
|
-
desc = $2
|
217
|
-
desc.sub!(/\;\s*\z/, '')
|
218
|
-
unless cur
|
219
|
-
warn "Warning: unknown category in DE line: #{line.inspect}"
|
220
|
-
cur = [ '' ]
|
221
|
-
ret.push cur
|
222
|
-
end
|
223
|
-
cur.push [ subcat, desc ]
|
224
|
-
else
|
225
|
-
warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
|
226
|
-
end
|
227
|
-
end
|
228
|
-
ret
|
229
|
-
end
|
230
|
-
private :parse_DE_line_rel14
|
231
|
-
|
232
|
-
# returns the proposed official name of the protein.
|
233
|
-
# Returns a String.
|
234
|
-
#
|
235
|
-
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
|
236
|
-
# been changed. The method returns the full name which is taken from
|
237
|
-
# "RecName: Full=" or "SubName: Full=" line normally in the beginning of
|
238
|
-
# the DE lines.
|
239
|
-
# Unlike parser for old format, no special treatments for fragment or
|
240
|
-
# precursor.
|
241
|
-
#
|
242
|
-
# For old format, the method parses the DE lines and returns the protein
|
243
|
-
# name as a String.
|
244
|
-
#
|
245
|
-
# === DE Line; description (>=1)
|
246
|
-
# "DE #{OFFICIAL_NAME} (#{SYNONYM})"
|
247
|
-
# "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]."
|
248
|
-
# OFFICIAL_NAME 1/entry
|
249
|
-
# SYNONYM >=0
|
250
|
-
# CONTEINS >=0
|
251
|
-
def protein_name
|
252
|
-
@data['DE'] ||= parse_DE_line_rel14(get('DE'))
|
253
|
-
parsed_de_line = @data['DE']
|
254
|
-
if parsed_de_line then
|
255
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
256
|
-
name = nil
|
257
|
-
parsed_de_line.each do |a|
|
258
|
-
case a[0]
|
259
|
-
when 'RecName', 'SubName'
|
260
|
-
if name_pair = a[1..-1].find { |b| b[0] == 'Full' } then
|
261
|
-
name = name_pair[1]
|
262
|
-
break
|
263
|
-
end
|
264
|
-
end
|
265
|
-
end
|
266
|
-
name = name.to_s
|
267
|
-
else
|
268
|
-
# old format (before Rel. 13.x)
|
269
|
-
name = ""
|
270
|
-
if de_line = fetch('DE') then
|
271
|
-
str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
|
272
|
-
name = str[/^[^(]*/].strip
|
273
|
-
name << ' (Fragment)' if str =~ /fragment/i
|
274
|
-
end
|
275
|
-
end
|
276
|
-
return name
|
277
|
-
end
|
278
|
-
|
279
|
-
|
280
|
-
# returns synonyms (unofficial and/or alternative names).
|
281
|
-
# Returns an Array containing String objects.
|
282
|
-
#
|
283
|
-
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
|
284
|
-
# been changed. The method returns the full or short names which are
|
285
|
-
# taken from "RecName: Short=", "RecName: EC=", and AltName lines,
|
286
|
-
# except after "Contains:" or "Includes:".
|
287
|
-
# For keeping compatibility with old format parser, "RecName: EC=N.N.N.N"
|
288
|
-
# is reported as "EC N.N.N.N".
|
289
|
-
# In addition, to prevent confusion, "Allergen=" and "CD_antigen="
|
290
|
-
# prefixes are added for the corresponding fields.
|
291
|
-
#
|
292
|
-
# For old format, the method parses the DE lines and returns synonyms.
|
293
|
-
# synonyms are each placed in () following the official name on the DE line.
|
294
|
-
def synonyms
|
295
|
-
ary = Array.new
|
296
|
-
@data['DE'] ||= parse_DE_line_rel14(get('DE'))
|
297
|
-
parsed_de_line = @data['DE']
|
298
|
-
if parsed_de_line then
|
299
|
-
# since UniProtKB release 14.0 of 22-Jul-2008
|
300
|
-
parsed_de_line.each do |a|
|
301
|
-
case a[0]
|
302
|
-
when 'Includes', 'Contains'
|
303
|
-
break #the each loop
|
304
|
-
when 'RecName', 'SubName', 'AltName'
|
305
|
-
a[1..-1].each do |b|
|
306
|
-
if name = b[1] and b[1] != self.protein_name then
|
307
|
-
case b[0]
|
308
|
-
when 'EC'
|
309
|
-
name = "EC " + b[1]
|
310
|
-
when 'Allergen', 'CD_antigen'
|
311
|
-
name = b[0] + '=' + b[1]
|
312
|
-
else
|
313
|
-
name = b[1]
|
314
|
-
end
|
315
|
-
ary.push name
|
316
|
-
end
|
317
|
-
end
|
318
|
-
end #case a[0]
|
319
|
-
end #parsed_de_line.each
|
320
|
-
else
|
321
|
-
# old format (before Rel. 13.x)
|
322
|
-
if de_line = fetch('DE') then
|
323
|
-
line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part
|
324
|
-
line.scan(/\([^)]+/) do |synonym|
|
325
|
-
unless synonym =~ /fragment/i then
|
326
|
-
ary << synonym[1..-1].strip # index to remove the leading (
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
|
-
return ary
|
332
|
-
end
|
333
|
-
|
334
|
-
|
335
|
-
# returns gene names in the GN line.
|
336
|
-
#
|
337
|
-
# New UniProt/SwissProt format:
|
338
|
-
# * Bio::SPTR#gn -> [ <gene record>* ]
|
339
|
-
# where <gene record> is:
|
340
|
-
# { :name => '...',
|
341
|
-
# :synonyms => [ 's1', 's2', ... ],
|
342
|
-
# :loci => [ 'l1', 'l2', ... ],
|
343
|
-
# :orfs => [ 'o1', 'o2', ... ]
|
344
|
-
# }
|
345
|
-
#
|
346
|
-
# Old format:
|
347
|
-
# * Bio::SPTR#gn -> Array # AND
|
348
|
-
# * Bio::SPTR#gn[0] -> Array # OR
|
349
|
-
#
|
350
|
-
# === GN Line: Gene name(s) (>=0, optional)
|
351
|
-
def gn
|
352
|
-
unless @data['GN']
|
353
|
-
case fetch('GN')
|
354
|
-
when /Name=/,/ORFNames=/,/OrderedLocusNames=/,/Synonyms=/
|
355
|
-
@data['GN'] = gn_uniprot_parser
|
356
|
-
else
|
357
|
-
@data['GN'] = gn_old_parser
|
358
|
-
end
|
359
|
-
end
|
360
|
-
@data['GN']
|
361
|
-
end
|
362
|
-
|
363
|
-
|
364
|
-
# returns contents in the old style GN line.
|
365
|
-
# === GN Line: Gene name(s) (>=0, optional)
|
366
|
-
# GN HNS OR DRDX OR OSMZ OR BGLY.
|
367
|
-
# GN CECA1 AND CECA2.
|
368
|
-
# GN CECA1 AND (HOGE OR FUGA).
|
369
|
-
#
|
370
|
-
# GN NAME1 [(AND|OR) NAME]+.
|
371
|
-
#
|
372
|
-
# Bio::SPTR#gn -> Array # AND
|
373
|
-
# #gn[0] -> Array # OR
|
374
|
-
# #gene_names -> Array
|
375
|
-
def gn_old_parser
|
376
|
-
names = Array.new
|
377
|
-
if get('GN').size > 0
|
378
|
-
names = fetch('GN').sub(/\.$/,'').split(/ AND /)
|
379
|
-
names.map! { |synonyms|
|
380
|
-
synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
|
381
|
-
e.strip
|
382
|
-
}
|
383
|
-
}
|
384
|
-
end
|
385
|
-
@data['GN'] = names
|
386
|
-
end
|
387
|
-
private :gn_old_parser
|
388
|
-
|
389
|
-
# returns contents in the structured GN line.
|
390
|
-
# The new format of the GN line is:
|
391
|
-
# GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
|
392
|
-
# GN ORFNames=[, ...];
|
393
|
-
#
|
394
|
-
# * Bio::SPTR#gn -> [ <gene record>* ]
|
395
|
-
# where <gene record> is:
|
396
|
-
# { :name => '...',
|
397
|
-
# :synonyms => [ 's1', 's2', ... ],
|
398
|
-
# :loci => [ 'l1', 'l2', ... ],
|
399
|
-
# :orfs => [ 'o1', 'o2', ... ]
|
400
|
-
# }
|
401
|
-
def gn_uniprot_parser
|
402
|
-
@data['GN'] = Array.new
|
403
|
-
gn_line = fetch('GN').strip
|
404
|
-
records = gn_line.split(/\s*and\s*/)
|
405
|
-
records.each do |record|
|
406
|
-
gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
|
407
|
-
record.each_line(';') do |element|
|
408
|
-
case element
|
409
|
-
when /Name=/ then
|
410
|
-
gene_hash[:name] = $'[0..-2]
|
411
|
-
when /Synonyms=/ then
|
412
|
-
gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
|
413
|
-
when /OrderedLocusNames=/ then
|
414
|
-
gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
|
415
|
-
when /ORFNames=/ then
|
416
|
-
gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
|
417
|
-
end
|
418
|
-
end
|
419
|
-
@data['GN'] << gene_hash
|
420
|
-
end
|
421
|
-
return @data['GN']
|
422
|
-
end
|
423
|
-
private :gn_uniprot_parser
|
424
|
-
|
425
|
-
|
426
|
-
# returns a Array of gene names in the GN line.
|
427
|
-
def gene_names
|
428
|
-
gn # set @data['GN'] if it hasn't been already done
|
429
|
-
if @data['GN'].first.class == Hash then
|
430
|
-
@data['GN'].collect { |element| element[:name] }
|
431
|
-
else
|
432
|
-
@data['GN'].first
|
433
|
-
end
|
434
|
-
end
|
435
|
-
|
436
|
-
|
437
|
-
# returns a String of the first gene name in the GN line.
|
438
|
-
def gene_name
|
439
|
-
gene_names.first
|
440
|
-
end
|
441
|
-
|
442
|
-
|
443
|
-
# returns a Array of Hashs or a String of the OS line when a key given.
|
444
|
-
# * Bio::EMBLDB#os -> Array
|
445
|
-
# [{'name' => '(Human)', 'os' => 'Homo sapiens'},
|
446
|
-
# {'name' => '(Rat)', 'os' => 'Rattus norveticus'}]
|
447
|
-
# * Bio::SPTR#os[0] -> Hash
|
448
|
-
# {'name' => "(Human)", 'os' => 'Homo sapiens'}
|
449
|
-
# * Bio::SPTR#os[0]['name'] -> "(Human)"
|
450
|
-
# * Bio::SPTR#os(0) -> "Homo sapiens (Human)"
|
451
|
-
#
|
452
|
-
# === OS Line; organism species (>=1)
|
453
|
-
# OS Genus species (name).
|
454
|
-
# OS Genus species (name0) (name1).
|
455
|
-
# OS Genus species (name0) (name1).
|
456
|
-
# OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
|
457
|
-
# OS Homo sapiens (Human), and Rarrus norveticus (Rat)
|
458
|
-
# OS Hippotis sp. Clark and Watts 825.
|
459
|
-
# OS unknown cyperaceous sp.
|
460
|
-
def os(num = nil)
|
461
|
-
unless @data['OS']
|
462
|
-
os = Array.new
|
463
|
-
fetch('OS').split(/, and|, /).each do |tmp|
|
464
|
-
if tmp =~ /(\w+ *[\w\d \:\'\+\-\.]+[\w\d\.])/
|
465
|
-
org = $1
|
466
|
-
tmp =~ /(\(.+\))/
|
467
|
-
os.push({'name' => $1, 'os' => org})
|
468
|
-
else
|
469
|
-
raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
|
470
|
-
end
|
471
|
-
end
|
472
|
-
@data['OS'] = os
|
473
|
-
end
|
474
|
-
|
475
|
-
if num
|
476
|
-
# EX. "Trifolium repens (white clover)"
|
477
|
-
return "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
|
478
|
-
else
|
479
|
-
return @data['OS']
|
480
|
-
end
|
481
|
-
end
|
482
|
-
|
483
|
-
|
484
|
-
# Bio::EMBLDB::Common#og -> Array
|
485
|
-
# OG Line; organella (0 or 1/entry)
|
486
|
-
# ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
|
487
|
-
# or a plasmid name (e.g. "Plasmid pBR322").
|
488
|
-
|
489
|
-
|
490
|
-
# Bio::EMBLDB::Common#oc -> Array
|
491
|
-
# OC Line; organism classification (>=1)
|
492
|
-
# "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
|
493
|
-
# "OC Theileria."
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
# returns a Hash of organism taxonomy cross-references.
|
498
|
-
# * Bio::SPTR#ox -> Hash
|
499
|
-
# {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
|
500
|
-
#
|
501
|
-
# === OX Line; organism taxonomy cross-reference (>=1 per entry)
|
502
|
-
# OX NCBI_TaxID=1234;
|
503
|
-
# OX NCBI_TaxID=1234, 2345, 3456, 4567;
|
504
|
-
def ox
|
505
|
-
unless @data['OX']
|
506
|
-
tmp = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
|
507
|
-
hsh = Hash.new
|
508
|
-
tmp.each do |e|
|
509
|
-
db,refs = e.split(/=/)
|
510
|
-
hsh[db] = refs.split(/, */)
|
511
|
-
end
|
512
|
-
@data['OX'] = hsh
|
513
|
-
end
|
514
|
-
return @data['OX']
|
515
|
-
end
|
516
|
-
|
517
|
-
# === The OH Line;
|
518
|
-
#
|
519
|
-
# OH NCBI_TaxID=TaxID; HostName.
|
520
|
-
# http://br.expasy.org/sprot/userman.html#OH_line
|
521
|
-
def oh
|
522
|
-
unless @data['OH']
|
523
|
-
@data['OH'] = fetch('OH').split("\. ").map {|x|
|
524
|
-
if x =~ /NCBI_TaxID=(\d+);/
|
525
|
-
taxid = $1
|
526
|
-
else
|
527
|
-
raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
|
528
|
-
$!, "\n", get('OH'), "\n"].join
|
529
|
-
|
530
|
-
end
|
531
|
-
if x =~ /NCBI_TaxID=\d+; (.+)/
|
532
|
-
host_name = $1
|
533
|
-
host_name.sub!(/\.$/, '')
|
534
|
-
else
|
535
|
-
host_name = nil
|
536
|
-
end
|
537
|
-
{'NCBI_TaxID' => taxid, 'HostName' => host_name}
|
538
|
-
}
|
539
|
-
end
|
540
|
-
@data['OH']
|
541
|
-
end
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
# Bio::EMBLDB::Common#ref -> Array
|
546
|
-
# R Lines
|
547
|
-
# RN RC RP RX RA RT RL
|
548
|
-
|
549
|
-
# returns contents in the R lines.
|
550
|
-
# * Bio::EMBLDB::Common#ref -> [ <refernece information Hash>* ]
|
551
|
-
# where <reference information Hash> is:
|
552
|
-
# {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
|
553
|
-
# 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
|
554
|
-
#
|
555
|
-
# R Lines
|
556
|
-
# * RN RC RP RX RA RT RL RG
|
557
|
-
def ref
|
558
|
-
unless @data['R']
|
559
|
-
@data['R'] = [get('R').split(/\nRN /)].flatten.map { |str|
|
560
|
-
hash = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
|
561
|
-
'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
|
562
|
-
str = 'RN ' + str unless /^RN / =~ str
|
563
|
-
|
564
|
-
str.split("\n").each do |line|
|
565
|
-
if /^(R[NPXARLCTG]) (.+)/ =~ line
|
566
|
-
hash[$1] += $2 + ' '
|
567
|
-
else
|
568
|
-
raise "Invalid format in R lines, \n[#{line}]\n"
|
569
|
-
end
|
570
|
-
end
|
571
|
-
|
572
|
-
hash['RN'] = set_RN(hash['RN'])
|
573
|
-
hash['RC'] = set_RC(hash['RC'])
|
574
|
-
hash['RP'] = set_RP(hash['RP'])
|
575
|
-
hash['RX'] = set_RX(hash['RX'])
|
576
|
-
hash['RA'] = set_RA(hash['RA'])
|
577
|
-
hash['RT'] = set_RT(hash['RT'])
|
578
|
-
hash['RL'] = set_RL(hash['RL'])
|
579
|
-
hash['RG'] = set_RG(hash['RG'])
|
580
|
-
|
581
|
-
hash
|
582
|
-
}
|
583
|
-
|
584
|
-
end
|
585
|
-
@data['R']
|
586
|
-
end
|
587
|
-
|
588
|
-
def set_RN(data)
|
589
|
-
data.strip
|
590
|
-
end
|
591
|
-
|
592
|
-
def set_RC(data)
|
593
|
-
data.scan(/([STP]\w+)=(.+);/).map { |comment|
|
594
|
-
[comment[1].split(/, and |, /)].flatten.map { |text|
|
595
|
-
{'Token' => comment[0], 'Text' => text}
|
596
|
-
}
|
597
|
-
}.flatten
|
598
|
-
end
|
599
|
-
private :set_RC
|
600
|
-
|
601
|
-
def set_RP(data)
|
602
|
-
data = data.strip
|
603
|
-
data = data.sub(/\.$/, '')
|
604
|
-
data.split(/, AND |, /i).map {|x|
|
605
|
-
x = x.strip
|
606
|
-
x = x.gsub(' ', ' ')
|
607
|
-
}
|
608
|
-
end
|
609
|
-
private :set_RP
|
610
|
-
|
611
|
-
def set_RX(data)
|
612
|
-
rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
|
613
|
-
if data =~ /MEDLINE=(.+?);/
|
614
|
-
rx['MEDLINE'] = $1
|
615
|
-
end
|
616
|
-
if data =~ /PubMed=(.+?);/
|
617
|
-
rx['PubMed'] = $1
|
618
|
-
end
|
619
|
-
if data =~ /DOI=(.+?);/
|
620
|
-
rx['DOI'] = $1
|
621
|
-
end
|
622
|
-
rx
|
623
|
-
end
|
624
|
-
private :set_RX
|
625
|
-
|
626
|
-
def set_RA(data)
|
627
|
-
data = data.sub(/; *$/, '')
|
628
|
-
end
|
629
|
-
private :set_RA
|
630
|
-
|
631
|
-
def set_RT(data)
|
632
|
-
data = data.sub(/; *$/, '')
|
633
|
-
data = data.gsub(/(^"|"$)/, '')
|
634
|
-
end
|
635
|
-
private :set_RT
|
636
|
-
|
637
|
-
def set_RL(data)
|
638
|
-
data = data.strip
|
639
|
-
end
|
640
|
-
private :set_RL
|
641
|
-
|
642
|
-
def set_RG(data)
|
643
|
-
data = data.split('; ')
|
644
|
-
end
|
645
|
-
private :set_RG
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
# returns Bio::Reference object from Bio::EMBLDB::Common#ref.
|
650
|
-
# * Bio::EMBLDB::Common#ref -> Bio::References
|
651
|
-
def references
|
652
|
-
unless @data['references']
|
653
|
-
ary = self.ref.map {|ent|
|
654
|
-
hash = Hash.new('')
|
655
|
-
ent.each {|key, value|
|
656
|
-
case key
|
657
|
-
when 'RA'
|
658
|
-
hash['authors'] = value.split(/, /)
|
659
|
-
when 'RT'
|
660
|
-
hash['title'] = value
|
661
|
-
when 'RL'
|
662
|
-
if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
|
663
|
-
hash['journal'] = $1
|
664
|
-
hash['volume'] = $2
|
665
|
-
hash['issue'] = $3
|
666
|
-
hash['pages'] = $4
|
667
|
-
hash['year'] = $5
|
668
|
-
else
|
669
|
-
hash['journal'] = value
|
670
|
-
end
|
671
|
-
when 'RX' # PUBMED, MEDLINE, DOI
|
672
|
-
value.each do |tag, xref|
|
673
|
-
hash[ tag.downcase ] = xref
|
674
|
-
end
|
675
|
-
end
|
676
|
-
}
|
677
|
-
Reference.new(hash)
|
678
|
-
}
|
679
|
-
@data['references'] = References.new(ary)
|
680
|
-
end
|
681
|
-
@data['references']
|
682
|
-
end
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
# === The HI line
|
690
|
-
# Bio::SPTR#hi #=> hash
|
691
|
-
def hi
|
692
|
-
unless @data['HI']
|
693
|
-
@data['HI'] = []
|
694
|
-
fetch('HI').split(/\. /).each do |hlist|
|
695
|
-
hash = {'Category' => '', 'Keywords' => [], 'Keyword' => ''}
|
696
|
-
hash['Category'], hash['Keywords'] = hlist.split(': ')
|
697
|
-
hash['Keywords'] = hash['Keywords'].split('; ')
|
698
|
-
hash['Keyword'] = hash['Keywords'].pop
|
699
|
-
hash['Keyword'].sub!(/\.$/, '')
|
700
|
-
@data['HI'] << hash
|
701
|
-
end
|
702
|
-
end
|
703
|
-
@data['HI']
|
704
|
-
end
|
705
|
-
|
706
|
-
|
707
|
-
@@cc_topics = ['PHARMACEUTICAL',
|
708
|
-
'BIOTECHNOLOGY',
|
709
|
-
'TOXIC DOSE',
|
710
|
-
'ALLERGEN',
|
711
|
-
'RNA EDITING',
|
712
|
-
'POLYMORPHISM',
|
713
|
-
'BIOPHYSICOCHEMICAL PROPERTIES',
|
714
|
-
'MASS SPECTROMETRY',
|
715
|
-
'WEB RESOURCE',
|
716
|
-
'ENZYME REGULATION',
|
717
|
-
'DISEASE',
|
718
|
-
'INTERACTION',
|
719
|
-
'DEVELOPMENTAL STAGE',
|
720
|
-
'INDUCTION',
|
721
|
-
'CAUTION',
|
722
|
-
'ALTERNATIVE PRODUCTS',
|
723
|
-
'DOMAIN',
|
724
|
-
'PTM',
|
725
|
-
'MISCELLANEOUS',
|
726
|
-
'TISSUE SPECIFICITY',
|
727
|
-
'COFACTOR',
|
728
|
-
'PATHWAY',
|
729
|
-
'SUBUNIT',
|
730
|
-
'CATALYTIC ACTIVITY',
|
731
|
-
'SUBCELLULAR LOCATION',
|
732
|
-
'FUNCTION',
|
733
|
-
'SIMILARITY']
|
734
|
-
# returns contents in the CC lines.
|
735
|
-
# * Bio::SPTR#cc -> Hash
|
736
|
-
#
|
737
|
-
# returns an object of contents in the TOPIC.
|
738
|
-
# * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
|
739
|
-
#
|
740
|
-
# returns contents of the "ALTERNATIVE PRODUCTS".
|
741
|
-
# * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
|
742
|
-
# {'Event' => str,
|
743
|
-
# 'Named isoforms' => int,
|
744
|
-
# 'Comment' => str,
|
745
|
-
# 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
|
746
|
-
#
|
747
|
-
# CC -!- ALTERNATIVE PRODUCTS:
|
748
|
-
# CC Event=Alternative splicing; Named isoforms=15;
|
749
|
-
# ...
|
750
|
-
# CC placentae isoforms. All tissues differentially splice exon 13;
|
751
|
-
# CC Name=A; Synonyms=no del;
|
752
|
-
# CC IsoId=P15529-1; Sequence=Displayed;
|
753
|
-
#
|
754
|
-
# returns contents of the "DATABASE".
|
755
|
-
# * Bio::SPTR#cc('DATABASE') -> Array
|
756
|
-
# [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
|
757
|
-
#
|
758
|
-
# CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
|
759
|
-
#
|
760
|
-
# returns contents of the "MASS SPECTROMETRY".
|
761
|
-
# * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
|
762
|
-
# [{'MW"=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
|
763
|
-
#
|
764
|
-
# CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
|
765
|
-
#
|
766
|
-
# === CC lines (>=0, optional)
|
767
|
-
# CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
|
768
|
-
# CC IN LIVER, KIDNEY, LUNG AND BRAIN.
|
769
|
-
#
|
770
|
-
# CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
|
771
|
-
# CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
|
772
|
-
#
|
773
|
-
# See also http://www.expasy.org/sprot/userman.html#CC_line
|
774
|
-
#
|
775
|
-
def cc(topic = nil)
|
776
|
-
unless @data['CC']
|
777
|
-
cc = Hash.new
|
778
|
-
comment_border= '-' * (77 - 4 + 1)
|
779
|
-
dlm = /-!- /
|
780
|
-
|
781
|
-
# 12KD_MYCSM has no CC lines.
|
782
|
-
return cc if get('CC').size == 0
|
783
|
-
|
784
|
-
cc_raw = fetch('CC')
|
785
|
-
|
786
|
-
# Removing the copyright statement.
|
787
|
-
cc_raw.sub!(/ *---.+---/m, '')
|
788
|
-
|
789
|
-
# Not any CC Lines without the copyright statement.
|
790
|
-
return cc if cc_raw == ''
|
791
|
-
|
792
|
-
begin
|
793
|
-
cc_raw, copyright = cc_raw.split(/#{comment_border}/)[0]
|
794
|
-
cc_raw = cc_raw.sub(dlm,'')
|
795
|
-
cc_raw.split(dlm).each do |tmp|
|
796
|
-
tmp = tmp.strip
|
797
|
-
|
798
|
-
if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
|
799
|
-
key = $1
|
800
|
-
body = $2
|
801
|
-
body.gsub!(/- (?!AND)/,'-')
|
802
|
-
body.strip!
|
803
|
-
unless cc[key]
|
804
|
-
cc[key] = [body]
|
805
|
-
else
|
806
|
-
cc[key].push(body)
|
807
|
-
end
|
808
|
-
else
|
809
|
-
raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
|
810
|
-
'', get('CC'),''].join("\n")
|
811
|
-
end
|
812
|
-
end
|
813
|
-
rescue NameError
|
814
|
-
if fetch('CC') == ''
|
815
|
-
return {}
|
816
|
-
else
|
817
|
-
raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
|
818
|
-
"\n'#{self.get('CC')}'\n", "(#{$!})"].join
|
819
|
-
end
|
820
|
-
rescue NoMethodError
|
821
|
-
end
|
822
|
-
|
823
|
-
@data['CC'] = cc
|
824
|
-
end
|
825
|
-
|
826
|
-
|
827
|
-
case topic
|
828
|
-
when 'ALLERGEN'
|
829
|
-
return @data['CC'][topic]
|
830
|
-
when 'ALTERNATIVE PRODUCTS'
|
831
|
-
return cc_alternative_products(@data['CC'][topic])
|
832
|
-
when 'BIOPHYSICOCHEMICAL PROPERTIES'
|
833
|
-
return cc_biophysiochemical_properties(@data['CC'][topic])
|
834
|
-
when 'BIOTECHNOLOGY'
|
835
|
-
return @data['CC'][topic]
|
836
|
-
when 'CATALITIC ACTIVITY'
|
837
|
-
return cc_catalytic_activity(@data['CC'][topic])
|
838
|
-
when 'CAUTION'
|
839
|
-
return cc_caution(@data['CC'][topic])
|
840
|
-
when 'COFACTOR'
|
841
|
-
return @data['CC'][topic]
|
842
|
-
when 'DEVELOPMENTAL STAGE'
|
843
|
-
return @data['CC'][topic].join('')
|
844
|
-
when 'DISEASE'
|
845
|
-
return @data['CC'][topic].join('')
|
846
|
-
when 'DOMAIN'
|
847
|
-
return @data['CC'][topic]
|
848
|
-
when 'ENZYME REGULATION'
|
849
|
-
return @data['CC'][topic].join('')
|
850
|
-
when 'FUNCTION'
|
851
|
-
return @data['CC'][topic].join('')
|
852
|
-
when 'INDUCTION'
|
853
|
-
return @data['CC'][topic].join('')
|
854
|
-
when 'INTERACTION'
|
855
|
-
return cc_interaction(@data['CC'][topic])
|
856
|
-
when 'MASS SPECTROMETRY'
|
857
|
-
return cc_mass_spectrometry(@data['CC'][topic])
|
858
|
-
when 'MISCELLANEOUS'
|
859
|
-
return @data['CC'][topic]
|
860
|
-
when 'PATHWAY'
|
861
|
-
return cc_pathway(@data['CC'][topic])
|
862
|
-
when 'PHARMACEUTICAL'
|
863
|
-
return @data['CC'][topic]
|
864
|
-
when 'POLYMORPHISM'
|
865
|
-
return @data['CC'][topic]
|
866
|
-
when 'PTM'
|
867
|
-
return @data['CC'][topic]
|
868
|
-
when 'RNA EDITING'
|
869
|
-
return cc_rna_editing(@data['CC'][topic])
|
870
|
-
when 'SIMILARITY'
|
871
|
-
return @data['CC'][topic]
|
872
|
-
when 'SUBCELLULAR LOCATION'
|
873
|
-
return cc_subcellular_location(@data['CC'][topic])
|
874
|
-
when 'SUBUNIT'
|
875
|
-
return @data['CC'][topic]
|
876
|
-
when 'TISSUE SPECIFICITY'
|
877
|
-
return @data['CC'][topic]
|
878
|
-
when 'TOXIC DOSE'
|
879
|
-
return @data['CC'][topic]
|
880
|
-
when 'WEB RESOURCE'
|
881
|
-
return cc_web_resource(@data['CC'][topic])
|
882
|
-
when 'DATABASE'
|
883
|
-
# DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
|
884
|
-
tmp = Array.new
|
885
|
-
db = @data['CC']['DATABASE']
|
886
|
-
return db unless db
|
887
|
-
|
888
|
-
db.each do |e|
|
889
|
-
db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
|
890
|
-
e.sub(/.$/,'').split(/;/).each do |line|
|
891
|
-
case line
|
892
|
-
when /NAME=(.+)/
|
893
|
-
db['NAME'] = $1
|
894
|
-
when /NOTE=(.+)/
|
895
|
-
db['NOTE'] = $1
|
896
|
-
when /WWW="(.+)"/
|
897
|
-
db['WWW'] = $1
|
898
|
-
when /FTP="(.+)"/
|
899
|
-
db['FTP'] = $1
|
900
|
-
end
|
901
|
-
end
|
902
|
-
tmp.push(db)
|
903
|
-
end
|
904
|
-
return tmp
|
905
|
-
when nil
|
906
|
-
return @data['CC']
|
907
|
-
else
|
908
|
-
return @data['CC'][topic]
|
909
|
-
end
|
910
|
-
end
|
911
|
-
|
912
|
-
|
913
|
-
def cc_alternative_products(data)
|
914
|
-
ap = data.join('')
|
915
|
-
return ap unless ap
|
916
|
-
|
917
|
-
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
|
918
|
-
tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
|
919
|
-
'Variants' => []}
|
920
|
-
if /Event=(.+?);/ =~ ap
|
921
|
-
tmp['Event'] = $1
|
922
|
-
tmp['Event'] = tmp['Event'].sub(/;/,'').split(/, /)
|
923
|
-
end
|
924
|
-
if /Named isoforms=(\S+?);/ =~ ap
|
925
|
-
tmp['Named isoforms'] = $1
|
926
|
-
end
|
927
|
-
if /Comment=(.+?);/m =~ ap
|
928
|
-
tmp['Comment'] = $1
|
929
|
-
end
|
930
|
-
ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
|
931
|
-
tmp['Variants'] << cc_alternative_products_variants(ent)
|
932
|
-
end
|
933
|
-
return tmp
|
934
|
-
end
|
935
|
-
private :cc_alternative_products
|
936
|
-
|
937
|
-
def cc_alternative_products_variants(data)
|
938
|
-
variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
|
939
|
-
data.split(/; /).map {|x| x.split(/=/) }.each do |e|
|
940
|
-
case e[0]
|
941
|
-
when 'Sequence', 'Synonyms', 'IsoId'
|
942
|
-
e[1] = e[1].sub(/;/,'').split(/, /)
|
943
|
-
end
|
944
|
-
variant[e[0]] = e[1]
|
945
|
-
end
|
946
|
-
variant
|
947
|
-
end
|
948
|
-
private :cc_alternative_products_variants
|
949
|
-
|
950
|
-
|
951
|
-
def cc_biophysiochemical_properties(data)
|
952
|
-
data = data[0]
|
953
|
-
|
954
|
-
hash = {'Absorption' => {},
|
955
|
-
'Kinetic parameters' => {},
|
956
|
-
'pH dependence' => "",
|
957
|
-
'Redox potential' => "",
|
958
|
-
'Temperature dependence' => ""}
|
959
|
-
if data =~ /Absorption: Abs\(max\)=(.+?);/
|
960
|
-
hash['Absorption']['Abs(max)'] = $1
|
961
|
-
end
|
962
|
-
if data =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/
|
963
|
-
hash['Absorption']['Note'] = $1
|
964
|
-
end
|
965
|
-
if data =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
|
966
|
-
hash['Kinetic parameters']['KM'] = $1
|
967
|
-
hash['Kinetic parameters']['Vmax'] = $2
|
968
|
-
end
|
969
|
-
if data =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/
|
970
|
-
hash['Kinetic parameters']['Note'] = $1
|
971
|
-
end
|
972
|
-
if data =~ /pH dependence: (.+?);/
|
973
|
-
hash['pH dependence'] = $1
|
974
|
-
end
|
975
|
-
if data =~ /Redox potential: (.+?);/
|
976
|
-
hash['Redox potential'] = $1
|
977
|
-
end
|
978
|
-
if data =~ /Temperature dependence: (.+?);/
|
979
|
-
hash['Temperature dependence'] = $1
|
980
|
-
end
|
981
|
-
hash
|
982
|
-
end
|
983
|
-
private :cc_biophysiochemical_properties
|
984
|
-
|
985
|
-
|
986
|
-
def cc_caution(data)
|
987
|
-
data.join('')
|
988
|
-
end
|
989
|
-
private :cc_caution
|
990
|
-
|
991
|
-
|
992
|
-
# returns conteins in a line of the CC INTERACTION section.
|
993
|
-
#
|
994
|
-
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
995
|
-
def cc_interaction(data)
|
996
|
-
str = data.join('')
|
997
|
-
it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
|
998
|
-
it.map {|ent|
|
999
|
-
ent.map! {|x| x.strip }
|
1000
|
-
if ent[0] =~ /^(.+):(.+)/
|
1001
|
-
spac = $1
|
1002
|
-
spid = $2.split(' ')[0]
|
1003
|
-
optid = nil
|
1004
|
-
elsif ent[0] =~ /Self/
|
1005
|
-
spac = self.entry_id
|
1006
|
-
spid = self.entry_id
|
1007
|
-
optid = nil
|
1008
|
-
end
|
1009
|
-
if ent[0] =~ /^.+:.+ (.+)/
|
1010
|
-
optid = $1
|
1011
|
-
end
|
1012
|
-
|
1013
|
-
{'SP_Ac' => spac,
|
1014
|
-
'identifier' => spid,
|
1015
|
-
'NbExp' => ent[1],
|
1016
|
-
'IntAct' => ent[2].split(', '),
|
1017
|
-
'optional_identifier' => optid}
|
1018
|
-
}
|
1019
|
-
end
|
1020
|
-
private :cc_interaction
|
1021
|
-
|
1022
|
-
|
1023
|
-
def cc_mass_spectrometry(data)
|
1024
|
-
# MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
|
1025
|
-
return data unless data
|
1026
|
-
|
1027
|
-
data.map { |m|
|
1028
|
-
mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
|
1029
|
-
'NOTE' => nil}
|
1030
|
-
m.sub(/.$/,'').split(/;/).each do |line|
|
1031
|
-
case line
|
1032
|
-
when /MW=(.+)/
|
1033
|
-
mass['MW'] = $1
|
1034
|
-
when /MW_ERR=(.+)/
|
1035
|
-
mass['MW_ERR'] = $1
|
1036
|
-
when /METHOD=(.+)/
|
1037
|
-
mass['METHOD'] = $1
|
1038
|
-
when /RANGE=(\d+-\d+)/
|
1039
|
-
mass['RANGE'] = $1 # RANGE class ?
|
1040
|
-
when /NOTE=(.+)/
|
1041
|
-
mass['NOTE'] = $1
|
1042
|
-
end
|
1043
|
-
end
|
1044
|
-
mass
|
1045
|
-
}
|
1046
|
-
end
|
1047
|
-
private :cc_mass_spectrometry
|
1048
|
-
|
1049
|
-
|
1050
|
-
def cc_pathway(data)
|
1051
|
-
data.map {|x| x.sub(/\.$/, '') }.map {|x|
|
1052
|
-
x.split(/; | and |: /)
|
1053
|
-
}[0]
|
1054
|
-
end
|
1055
|
-
private :cc_pathway
|
1056
|
-
|
1057
|
-
|
1058
|
-
def cc_rna_editing(data)
|
1059
|
-
data = data.join('')
|
1060
|
-
entry = {'Modified_positions' => [], 'Note' => ""}
|
1061
|
-
if data =~ /Modified_positions=(.+?)(\.|;)/
|
1062
|
-
entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
|
1063
|
-
else
|
1064
|
-
raise ArgumentError, "Invarid CC RNA Editing lines (#{self.entry_id}):#{$!}\n#{get('CC')}"
|
1065
|
-
end
|
1066
|
-
if data =~ /Note=(.+)/
|
1067
|
-
entry['Note'] = $1
|
1068
|
-
end
|
1069
|
-
entry
|
1070
|
-
end
|
1071
|
-
private :cc_rna_editing
|
1072
|
-
|
1073
|
-
|
1074
|
-
def cc_subcellular_location(data)
|
1075
|
-
data.map {|x|
|
1076
|
-
x.split('. ').map {|y|
|
1077
|
-
y.split('; ').map {|z|
|
1078
|
-
z.sub(/\.$/, '')
|
1079
|
-
}
|
1080
|
-
}
|
1081
|
-
}[0]
|
1082
|
-
end
|
1083
|
-
private :cc_subcellular_location
|
1084
|
-
|
1085
|
-
|
1086
|
-
#--
|
1087
|
-
# Since UniProtKB release 12.2 of 11-Sep-2007:
|
1088
|
-
# CC -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress]. # Old format:
|
1089
|
-
# CC -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].
|
1090
|
-
#++
|
1091
|
-
|
1092
|
-
def cc_web_resource(data)
|
1093
|
-
data.map {|x|
|
1094
|
-
entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
|
1095
|
-
x.split(';').each do |y|
|
1096
|
-
case y
|
1097
|
-
when /(Name|Note)\=(.+)/
|
1098
|
-
key = $1
|
1099
|
-
val = $2.strip
|
1100
|
-
entry[key] = val
|
1101
|
-
when /(NAME|NOTE)\=(.+)/
|
1102
|
-
key = $1.downcase.capitalize
|
1103
|
-
val = $2.strip
|
1104
|
-
entry[key] = val
|
1105
|
-
when /URL\=\"(.+)\"/
|
1106
|
-
entry['URL'] = $1.strip
|
1107
|
-
end
|
1108
|
-
end
|
1109
|
-
entry
|
1110
|
-
}
|
1111
|
-
end
|
1112
|
-
private :cc_web_resource
|
1113
|
-
|
1114
|
-
# returns databases cross-references in the DR lines.
|
1115
|
-
# * Bio::SPTR#dr -> Hash w/in Array
|
1116
|
-
#
|
1117
|
-
# === DR Line; defabases cross-reference (>=0)
|
1118
|
-
# DR database_identifier; primary_identifier; secondary_identifier.
|
1119
|
-
# a cross_ref pre one line
|
1120
|
-
@@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
|
1121
|
-
'ECOGENE',
|
1122
|
-
'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
|
1123
|
-
'MAIZE-2DPAGE','MENDEL','MGD''MIM','PDB','PFAM','PIR','PRINTS',
|
1124
|
-
'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
|
1125
|
-
'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
|
1126
|
-
|
1127
|
-
# Backup Bio::EMBLDB#dr as embl_dr
|
1128
|
-
alias :embl_dr :dr
|
1129
|
-
|
1130
|
-
# Bio::SPTR#dr
|
1131
|
-
def dr(key = nil)
|
1132
|
-
unless key
|
1133
|
-
embl_dr
|
1134
|
-
else
|
1135
|
-
(embl_dr[key] or []).map {|x|
|
1136
|
-
{'Accession' => x[0],
|
1137
|
-
'Version' => x[1],
|
1138
|
-
' ' => x[2],
|
1139
|
-
'Molecular Type' => x[3]}
|
1140
|
-
}
|
1141
|
-
end
|
1142
|
-
end
|
1143
|
-
|
1144
|
-
|
1145
|
-
# Bio::EMBLDB::Common#kw - Array
|
1146
|
-
# #keywords -> Array
|
1147
|
-
#
|
1148
|
-
# KW Line; keyword (>=1)
|
1149
|
-
# KW [Keyword;]+
|
1150
|
-
|
1151
|
-
|
1152
|
-
# returns contents in the feature table.
|
1153
|
-
#
|
1154
|
-
# == Examples
|
1155
|
-
#
|
1156
|
-
# sp = Bio::SPTR.new(entry)
|
1157
|
-
# ft = sp.ft
|
1158
|
-
# ft.class #=> Hash
|
1159
|
-
# ft.keys.each do |feature_key|
|
1160
|
-
# ft[feature_key].each do |feature|
|
1161
|
-
# feature['From'] #=> '1'
|
1162
|
-
# feature['To'] #=> '21'
|
1163
|
-
# feature['Description'] #=> ''
|
1164
|
-
# feature['FTId'] #=> ''
|
1165
|
-
# feature['diff'] #=> []
|
1166
|
-
# feature['original'] #=> [feature_key, '1', '21', '', '']
|
1167
|
-
# end
|
1168
|
-
# end
|
1169
|
-
#
|
1170
|
-
# * Bio::SPTR#ft -> Hash
|
1171
|
-
# {FEATURE_KEY => [{'From' => int, 'To' => int,
|
1172
|
-
# 'Description' => aStr, 'FTId' => aStr,
|
1173
|
-
# 'diff' => [original_residues, changed_residues],
|
1174
|
-
# 'original' => aAry }],...}
|
1175
|
-
#
|
1176
|
-
# returns an Array of the information about the feature_name in the feature table.
|
1177
|
-
# * Bio::SPTR#ft(feature_name) -> Array of Hash
|
1178
|
-
# [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
|
1179
|
-
#
|
1180
|
-
# == FT Line; feature table data (>=0, optional)
|
1181
|
-
#
|
1182
|
-
# Col Data item
|
1183
|
-
# ----- -----------------
|
1184
|
-
# 1- 2 FT
|
1185
|
-
# 6-13 Feature name
|
1186
|
-
# 15-20 `FROM' endpoint
|
1187
|
-
# 22-27 `TO' endpoint
|
1188
|
-
# 35-75 Description (>=0 per key)
|
1189
|
-
# ----- -----------------
|
1190
|
-
#
|
1191
|
-
# Note: 'FROM' and 'TO' endopoints are allowed to use non-numerial charactors
|
1192
|
-
# including '<', '>' or '?'. (c.f. '<1', '?42')
|
1193
|
-
#
|
1194
|
-
# See also http://www.expasy.org/sprot/userman.html#FT_line
|
1195
|
-
#
|
1196
|
-
def ft(feature_key = nil)
|
1197
|
-
return ft[feature_key] if feature_key
|
1198
|
-
return @data['FT'] if @data['FT']
|
1199
|
-
|
1200
|
-
table = []
|
1201
|
-
begin
|
1202
|
-
get('FT').split("\n").each do |line|
|
1203
|
-
if line =~ /^FT \w/
|
1204
|
-
feature = line.chomp.ljust(74)
|
1205
|
-
table << [feature[ 5..12].strip, # Feature Name
|
1206
|
-
feature[14..19].strip, # From
|
1207
|
-
feature[21..26].strip, # To
|
1208
|
-
feature[34..74].strip ] # Description
|
1209
|
-
else
|
1210
|
-
table.last << line.chomp.sub!(/^FT +/, '')
|
1211
|
-
end
|
1212
|
-
end
|
1213
|
-
|
1214
|
-
# Joining Description lines
|
1215
|
-
table = table.map { |feature|
|
1216
|
-
ftid = feature.pop if feature.last =~ /FTId=/
|
1217
|
-
if feature.size > 4
|
1218
|
-
feature = [feature[0],
|
1219
|
-
feature[1],
|
1220
|
-
feature[2],
|
1221
|
-
feature[3, feature.size - 3].join(" ")]
|
1222
|
-
end
|
1223
|
-
feature << if ftid then ftid else '' end
|
1224
|
-
}
|
1225
|
-
|
1226
|
-
hash = {}
|
1227
|
-
table.each do |feature|
|
1228
|
-
hash[feature[0]] = [] unless hash[feature[0]]
|
1229
|
-
hash[feature[0]] << {
|
1230
|
-
# Removing '<', '>' or '?' in FROM/TO endopoint.
|
1231
|
-
'From' => feature[1].sub(/\D/, '').to_i,
|
1232
|
-
'To' => feature[2].sub(/\D/, '').to_i,
|
1233
|
-
'Description' => feature[3],
|
1234
|
-
'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
|
1235
|
-
'diff' => [],
|
1236
|
-
'original' => feature
|
1237
|
-
}
|
1238
|
-
|
1239
|
-
case feature[0]
|
1240
|
-
when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
|
1241
|
-
case hash[feature[0]].last['Description']
|
1242
|
-
when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
|
1243
|
-
original_res = $1
|
1244
|
-
changed_res = $2
|
1245
|
-
original_res = original_res.gsub(/ /,'').strip
|
1246
|
-
chenged_res = changed_res.gsub(/ /,'').strip
|
1247
|
-
when /Missing/i
|
1248
|
-
original_res = seq.subseq(hash[feature[0]].last['From'],
|
1249
|
-
hash[feature[0]].last['To'])
|
1250
|
-
changed_res = ''
|
1251
|
-
end
|
1252
|
-
hash[feature[0]].last['diff'] = [original_res, chenged_res]
|
1253
|
-
end
|
1254
|
-
end
|
1255
|
-
rescue
|
1256
|
-
raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
|
1257
|
-
end
|
1258
|
-
|
1259
|
-
@data['FT'] = hash
|
1260
|
-
end
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
# returns a Hash of conteins in the SQ lines.
|
1265
|
-
# * Bio::SPTRL#sq -> hsh
|
1266
|
-
#
|
1267
|
-
# returns a value of a key given in the SQ lines.
|
1268
|
-
# * Bio::SPTRL#sq(key) -> int or str
|
1269
|
-
# * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
|
1270
|
-
# 'CRC64']
|
1271
|
-
#
|
1272
|
-
# === SQ Line; sequence header (1/entry)
|
1273
|
-
# SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
|
1274
|
-
# SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
|
1275
|
-
#
|
1276
|
-
# MW, Dalton unit.
|
1277
|
-
# CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
|
1278
|
-
def sq(key = nil)
|
1279
|
-
unless @data['SQ']
|
1280
|
-
if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
|
1281
|
-
@data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
|
1282
|
-
else
|
1283
|
-
raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
|
1284
|
-
end
|
1285
|
-
end
|
1286
|
-
|
1287
|
-
if key
|
1288
|
-
case key
|
1289
|
-
when /mw/, /molecular/, /weight/
|
1290
|
-
@data['SQ']['MW']
|
1291
|
-
when /len/, /length/, /AA/
|
1292
|
-
@data['SQ']['aalen']
|
1293
|
-
else
|
1294
|
-
@data['SQ'][key]
|
1295
|
-
end
|
1296
|
-
else
|
1297
|
-
@data['SQ']
|
1298
|
-
end
|
1299
|
-
end
|
1300
|
-
|
1301
|
-
|
1302
|
-
# returns a Bio::Sequence::AA of the amino acid sequence.
|
1303
|
-
# * Bio::SPTR#seq -> Bio::Sequence::AA
|
1304
|
-
#
|
1305
|
-
# blank Line; sequence data (>=1)
|
1306
|
-
def seq
|
1307
|
-
unless @data['']
|
1308
|
-
@data[''] = Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
|
1309
|
-
end
|
1310
|
-
return @data['']
|
1311
|
-
end
|
1312
|
-
alias aaseq seq
|
1313
|
-
|
1314
|
-
end # class SPTR
|
1315
|
-
|
1316
|
-
end # module Bio
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
=begin
|
1321
|
-
|
1322
|
-
= Bio::SPTR < Bio::DB
|
1323
|
-
|
1324
|
-
Class for a entry in the SWISS-PROT/TrEMBL database.
|
1325
|
-
|
1326
|
-
* ((<URL:http://www.ebi.ac.uk/swissprot/>))
|
1327
|
-
* ((<URL:http://www.ebi.ac.uk/trembl/>))
|
1328
|
-
* ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
|
1329
|
-
|
1330
|
-
|
1331
|
-
--- Bio::SPTR.new(a_sp_entry)
|
1332
|
-
|
1333
|
-
=== ID line (Identification)
|
1334
|
-
|
1335
|
-
--- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
|
1336
|
-
'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
|
1337
|
-
--- Bio::SPTR#id_line(key) -> str
|
1338
|
-
|
1339
|
-
key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
|
1340
|
-
|
1341
|
-
--- Bio::SPTR#entry_id -> str
|
1342
|
-
--- Bio::SPTR#molecule -> str
|
1343
|
-
--- Bio::SPTR#sequence_length -> int
|
1344
|
-
|
1345
|
-
|
1346
|
-
=== AC lines (Accession number)
|
1347
|
-
|
1348
|
-
--- Bio::SPTR#ac -> ary
|
1349
|
-
--- Bio::SPTR#accessions -> ary
|
1350
|
-
--- Bio::SPTR#accession -> accessions.first
|
1351
|
-
|
1352
|
-
|
1353
|
-
=== GN line (Gene name(s))
|
1354
|
-
|
1355
|
-
--- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
|
1356
|
-
--- Bio::SPTR#gene_name -> str
|
1357
|
-
--- Bio::SPTR#gene_names -> [str] or [str]
|
1358
|
-
|
1359
|
-
|
1360
|
-
=== DT lines (Date)
|
1361
|
-
|
1362
|
-
--- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
|
1363
|
-
--- Bio::SPTR#dt(key) -> str
|
1364
|
-
|
1365
|
-
key := (created|annotation|sequence)
|
1366
|
-
|
1367
|
-
|
1368
|
-
=== DE lines (Description)
|
1369
|
-
|
1370
|
-
--- Bio::SPTR#de -> str
|
1371
|
-
#definition -> str
|
1372
|
-
|
1373
|
-
--- Bio::SPTR#protein_name
|
1374
|
-
|
1375
|
-
Returns the proposed official name of the protein
|
1376
|
-
|
1377
|
-
|
1378
|
-
--- Bio::SPTR#synonyms
|
1379
|
-
|
1380
|
-
Returns an array of synonyms (unofficial names)
|
1381
|
-
|
1382
|
-
=== KW lines (Keyword)
|
1383
|
-
|
1384
|
-
--- Bio::SPTR#kw -> ary
|
1385
|
-
|
1386
|
-
=== OS lines (Organism species)
|
1387
|
-
|
1388
|
-
--- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
|
1389
|
-
|
1390
|
-
=== OC lines (organism classification)
|
1391
|
-
|
1392
|
-
--- Bio::SPTR#oc -> ary
|
1393
|
-
|
1394
|
-
=== OG line (Organella)
|
1395
|
-
|
1396
|
-
--- Bio::SPTR#og -> ary
|
1397
|
-
|
1398
|
-
=== OX line (Organism taxonomy cross-reference)
|
1399
|
-
|
1400
|
-
--- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
|
1401
|
-
|
1402
|
-
=== RN RC RP RX RA RT RL RG lines (Reference)
|
1403
|
-
|
1404
|
-
--- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, ''RT' => str, 'RL' => str, 'RA' => str, 'RC' => str, 'RG' => str},...]
|
1405
|
-
|
1406
|
-
=== DR lines (Database cross-reference)
|
1407
|
-
|
1408
|
-
--- Bio::SPTR#dr -> {'EMBL' => ary, ...}
|
1409
|
-
|
1410
|
-
=== FT lines (Feature table data)
|
1411
|
-
|
1412
|
-
--- Bio::SPTR#ft -> hsh
|
1413
|
-
|
1414
|
-
=== SQ lines (Sequence header and data)
|
1415
|
-
|
1416
|
-
--- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
|
1417
|
-
--- Bio::SPTR#sq(key) -> int or str
|
1418
|
-
|
1419
|
-
key := (aalen|MW|CRC64)
|
1420
|
-
|
1421
|
-
--- Bio::EMBL#seq -> Bio::Sequece::AA
|
1422
|
-
#aaseq -> Bio::Sequece::AA
|
1423
|
-
|
1424
|
-
=end
|
12
|
+
require "bio/db/embl/uniprotkb" unless const_defined?(:UniProtKB)
|
1425
13
|
|
1426
|
-
#
|
1427
|
-
#
|
1428
|
-
#
|
1429
|
-
|
1430
|
-
# DT - date (3 per entry)
|
1431
|
-
# DE - description (>=1 per entry)
|
1432
|
-
# GN - gene name(s) (>=0 per entry; optional)
|
1433
|
-
# OS - organism species (>=1 per entry)
|
1434
|
-
# OG - organelle (0 or 1 per entry; optional)
|
1435
|
-
# OC - organism classification (>=1 per entry)
|
1436
|
-
# OX - organism taxonomy x-ref (>=1 per entry)
|
1437
|
-
# OH - Organism Host
|
1438
|
-
# RN - reference number (>=1 per entry)
|
1439
|
-
# RP - reference positions (>=1 per entry)
|
1440
|
-
# RC - reference comment(s) (>=0 per entry; optional)
|
1441
|
-
# RX - reference cross-reference(s) (>=0 per entry; optional)
|
1442
|
-
# RA - reference author(s) (>=1 per entry)
|
1443
|
-
# RT - reference title (>=0 per entry; optional)
|
1444
|
-
# RL - reference location (>=1 per entry)
|
1445
|
-
# RG - reference group(s)
|
1446
|
-
# CC - comments or notes (>=0 per entry; optional)
|
1447
|
-
# DR - database cross-references (>=0 per entry; optional)
|
1448
|
-
# KW - keywords (>=1 per entry)
|
1449
|
-
# FT - feature table data (>=0 per entry; optional)
|
1450
|
-
# SQ - sequence header (1 per entry)
|
1451
|
-
# - (blanks) The sequence data (>=1 per entry)
|
1452
|
-
# // - termination line (ends each entry; 1 per entry)
|
1453
|
-
# ---- --------------------------- --------------------------------
|
14
|
+
# Bio::SPTR is changed to an alias of Bio::UniProtKB.
|
15
|
+
# Please use Bio::UniProtKB.
|
16
|
+
# Bio::SPTR may be deprecated in the future.
|
17
|
+
SPTR = UniProtKB
|
1454
18
|
|
19
|
+
end #module Bio
|
1455
20
|
|