bio 1.4.3.0001 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +39 -33
  3. data/BSDL +22 -0
  4. data/COPYING +2 -2
  5. data/COPYING.ja +36 -36
  6. data/ChangeLog +2404 -1025
  7. data/KNOWN_ISSUES.rdoc +15 -55
  8. data/README.rdoc +17 -23
  9. data/RELEASE_NOTES.rdoc +246 -183
  10. data/Rakefile +3 -2
  11. data/bin/br_biofetch.rb +29 -5
  12. data/bioruby.gemspec +15 -32
  13. data/bioruby.gemspec.erb +10 -20
  14. data/doc/ChangeLog-1.4.3 +1478 -0
  15. data/doc/RELEASE_NOTES-1.4.3.rdoc +204 -0
  16. data/doc/Tutorial.rd +0 -6
  17. data/doc/Tutorial.rd.html +7 -12
  18. data/doc/Tutorial.rd.ja +960 -1064
  19. data/doc/Tutorial.rd.ja.html +977 -1067
  20. data/gemfiles/Gemfile.travis-jruby1.8 +2 -1
  21. data/gemfiles/Gemfile.travis-jruby1.9 +2 -4
  22. data/gemfiles/Gemfile.travis-rbx +13 -0
  23. data/gemfiles/Gemfile.travis-ruby1.8 +2 -1
  24. data/gemfiles/Gemfile.travis-ruby1.9 +2 -4
  25. data/gemfiles/Gemfile.travis-ruby2.2 +9 -0
  26. data/lib/bio.rb +10 -43
  27. data/lib/bio/alignment.rb +8 -14
  28. data/lib/bio/appl/blast.rb +1 -2
  29. data/lib/bio/appl/blast/format0.rb +18 -7
  30. data/lib/bio/appl/blast/remote.rb +0 -9
  31. data/lib/bio/appl/blast/report.rb +1 -1
  32. data/lib/bio/appl/clustalw/report.rb +3 -1
  33. data/lib/bio/appl/genscan/report.rb +1 -2
  34. data/lib/bio/appl/iprscan/report.rb +1 -2
  35. data/lib/bio/appl/meme/mast.rb +4 -4
  36. data/lib/bio/appl/meme/mast/report.rb +1 -1
  37. data/lib/bio/appl/paml/codeml.rb +2 -2
  38. data/lib/bio/appl/paml/codeml/report.rb +1 -0
  39. data/lib/bio/appl/paml/common.rb +1 -1
  40. data/lib/bio/appl/sosui/report.rb +1 -2
  41. data/lib/bio/command.rb +62 -2
  42. data/lib/bio/data/aa.rb +13 -31
  43. data/lib/bio/data/codontable.rb +1 -2
  44. data/lib/bio/db/biosql/biosql_to_biosequence.rb +1 -0
  45. data/lib/bio/db/biosql/sequence.rb +1 -1
  46. data/lib/bio/db/embl/common.rb +1 -1
  47. data/lib/bio/db/embl/embl.rb +5 -4
  48. data/lib/bio/db/embl/format_embl.rb +3 -3
  49. data/lib/bio/db/embl/sptr.rb +9 -1444
  50. data/lib/bio/db/embl/swissprot.rb +12 -29
  51. data/lib/bio/db/embl/trembl.rb +13 -30
  52. data/lib/bio/db/embl/uniprot.rb +12 -29
  53. data/lib/bio/db/embl/uniprotkb.rb +1455 -0
  54. data/lib/bio/db/fasta.rb +17 -0
  55. data/lib/bio/db/fasta/defline.rb +1 -3
  56. data/lib/bio/db/fastq.rb +1 -1
  57. data/lib/bio/db/genbank/ddbj.rb +9 -5
  58. data/lib/bio/db/genbank/refseq.rb +11 -3
  59. data/lib/bio/db/gff.rb +3 -4
  60. data/lib/bio/db/go.rb +5 -6
  61. data/lib/bio/db/kegg/module.rb +4 -5
  62. data/lib/bio/db/kegg/pathway.rb +4 -5
  63. data/lib/bio/db/kegg/reaction.rb +1 -1
  64. data/lib/bio/db/nexus.rb +3 -2
  65. data/lib/bio/db/pdb/pdb.rb +2 -2
  66. data/lib/bio/db/phyloxml/phyloxml_elements.rb +82 -59
  67. data/lib/bio/db/phyloxml/phyloxml_parser.rb +2 -2
  68. data/lib/bio/db/phyloxml/phyloxml_writer.rb +1 -2
  69. data/lib/bio/db/sanger_chromatogram/chromatogram.rb +1 -2
  70. data/lib/bio/db/transfac.rb +1 -1
  71. data/lib/bio/io/das.rb +40 -41
  72. data/lib/bio/io/fastacmd.rb +0 -16
  73. data/lib/bio/io/fetch.rb +111 -55
  74. data/lib/bio/io/flatfile/buffer.rb +4 -5
  75. data/lib/bio/io/hinv.rb +2 -3
  76. data/lib/bio/io/ncbirest.rb +43 -6
  77. data/lib/bio/io/pubmed.rb +76 -81
  78. data/lib/bio/io/togows.rb +33 -10
  79. data/lib/bio/map.rb +1 -1
  80. data/lib/bio/pathway.rb +1 -1
  81. data/lib/bio/sequence/compat.rb +1 -1
  82. data/lib/bio/sequence/na.rb +63 -12
  83. data/lib/bio/shell.rb +0 -2
  84. data/lib/bio/shell/core.rb +5 -6
  85. data/lib/bio/shell/interface.rb +3 -4
  86. data/lib/bio/shell/irb.rb +1 -2
  87. data/lib/bio/shell/plugin/entry.rb +2 -3
  88. data/lib/bio/shell/plugin/seq.rb +7 -6
  89. data/lib/bio/shell/setup.rb +1 -2
  90. data/lib/bio/tree.rb +2 -2
  91. data/lib/bio/util/contingency_table.rb +0 -2
  92. data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +2 -2
  93. data/lib/bio/util/sirna.rb +76 -16
  94. data/lib/bio/version.rb +8 -9
  95. data/sample/benchmark_clustalw_report.rb +47 -0
  96. data/sample/biofetch.rb +248 -151
  97. data/setup.rb +6 -7
  98. data/test/data/clustalw/example1-seqnos.aln +58 -0
  99. data/test/network/bio/appl/blast/test_remote.rb +1 -15
  100. data/test/network/bio/appl/test_blast.rb +0 -12
  101. data/test/network/bio/io/test_pubmed.rb +49 -0
  102. data/test/network/bio/io/test_togows.rb +0 -1
  103. data/test/network/bio/test_command.rb +65 -2
  104. data/test/unit/bio/appl/bl2seq/test_report.rb +0 -1
  105. data/test/unit/bio/appl/blast/test_report.rb +110 -48
  106. data/test/unit/bio/appl/clustalw/test_report.rb +67 -51
  107. data/test/unit/bio/appl/sim4/test_report.rb +46 -17
  108. data/test/unit/bio/appl/test_blast.rb +2 -2
  109. data/test/unit/bio/db/embl/test_embl.rb +0 -1
  110. data/test/unit/bio/db/embl/test_embl_rel89.rb +0 -1
  111. data/test/unit/bio/db/embl/{test_sptr.rb → test_uniprotkb.rb} +111 -115
  112. data/test/unit/bio/db/embl/{test_uniprot_new_part.rb → test_uniprotkb_new_part.rb} +11 -11
  113. data/test/unit/bio/db/genbank/test_genbank.rb +10 -4
  114. data/test/unit/bio/db/pdb/test_pdb.rb +14 -8
  115. data/test/unit/bio/db/test_fasta.rb +41 -1
  116. data/test/unit/bio/db/test_fastq.rb +14 -4
  117. data/test/unit/bio/db/test_gff.rb +2 -2
  118. data/test/unit/bio/db/test_phyloxml.rb +30 -30
  119. data/test/unit/bio/db/test_phyloxml_writer.rb +2 -2
  120. data/test/unit/bio/io/flatfile/test_autodetection.rb +1 -2
  121. data/test/unit/bio/io/flatfile/test_buffer.rb +7 -1
  122. data/test/unit/bio/io/flatfile/test_splitter.rb +1 -1
  123. data/test/unit/bio/io/test_togows.rb +3 -2
  124. data/test/unit/bio/sequence/test_dblink.rb +1 -1
  125. data/test/unit/bio/sequence/test_na.rb +3 -1
  126. data/test/unit/bio/test_alignment.rb +1 -2
  127. data/test/unit/bio/test_command.rb +5 -4
  128. data/test/unit/bio/test_db.rb +4 -2
  129. data/test/unit/bio/test_pathway.rb +25 -10
  130. data/test/unit/bio/util/test_sirna.rb +22 -22
  131. metadata +656 -1430
  132. data/doc/KEGG_API.rd +0 -1843
  133. data/doc/KEGG_API.rd.ja +0 -1834
  134. data/extconf.rb +0 -2
  135. data/lib/bio/appl/blast/ddbj.rb +0 -131
  136. data/lib/bio/db/kegg/taxonomy.rb +0 -280
  137. data/lib/bio/io/dbget.rb +0 -194
  138. data/lib/bio/io/ddbjrest.rb +0 -344
  139. data/lib/bio/io/ddbjxml.rb +0 -458
  140. data/lib/bio/io/ebisoap.rb +0 -158
  141. data/lib/bio/io/ensembl.rb +0 -229
  142. data/lib/bio/io/higet.rb +0 -73
  143. data/lib/bio/io/keggapi.rb +0 -363
  144. data/lib/bio/io/ncbisoap.rb +0 -156
  145. data/lib/bio/io/soapwsdl.rb +0 -119
  146. data/lib/bio/shell/plugin/keggapi.rb +0 -181
  147. data/lib/bio/shell/plugin/soap.rb +0 -87
  148. data/sample/dbget +0 -37
  149. data/sample/demo_ddbjxml.rb +0 -212
  150. data/sample/demo_kegg_taxonomy.rb +0 -92
  151. data/sample/demo_keggapi.rb +0 -502
  152. data/sample/psortplot_html.rb +0 -214
  153. data/test/network/bio/io/test_ddbjrest.rb +0 -47
  154. data/test/network/bio/io/test_ensembl.rb +0 -230
  155. data/test/network/bio/io/test_soapwsdl.rb +0 -53
  156. data/test/unit/bio/io/test_ddbjxml.rb +0 -81
  157. data/test/unit/bio/io/test_ensembl.rb +0 -111
  158. data/test/unit/bio/io/test_soapwsdl.rb +0 -33
@@ -1,41 +1,24 @@
1
1
  #
2
- # = bio/db/embl/swissprot.rb - SwissProt database class
2
+ # = bio/db/embl/swissprot.rb - (deprecated) SwissProt database class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2013 BioRuby Project
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: swissprot.rb,v 1.7 2007/04/05 23:35:40 trevor Exp $
8
- #
9
7
 
10
- require 'bio/db/embl/sptr'
8
+ warn "Bio::SwissProt is deprecated. Use Bio::UniProtKB."
11
9
 
12
10
  module Bio
13
11
 
14
- # == Description
15
- #
16
- # Parser class for SwissProt database entry. See also Bio::SPTR class.
17
- # This class holds name space for SwissProt specific methods.
18
- #
19
- # SwissProt (before UniProtKB/SwissProt) specific methods are defined in
20
- # this class. Shared methods for UniProtKB/SwissProt and TrEMBL classes
21
- # are defined in Bio::SPTR class.
22
- #
23
- # == Examples
24
- #
25
- # str = File.read("p53_human.swiss")
26
- # obj = Bio::SwissProt.new(str)
27
- # obj.entry_id #=> "P53_HUMAN"
28
- #
29
- # == Referencees
30
- #
31
- # * Swiss-Prot Protein knowledgebase
32
- # http://au.expasy.org/sprot/
33
- #
34
- # * Swiss-Prot Protein Knowledgebase User Manual
35
- # http://au.expasy.org/sprot/userman.html
36
- #
12
+ require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
13
+
14
+ # Bio::SwissProt is deprecated. Use Bio::UniProtKB.
37
15
  class SwissProt < SPTR
38
- # Nothing to do (SwissProt format is abstracted in SPTR)
16
+
17
+ # Bio::SwissProt is deprecated. Use Bio::UniProtKB.
18
+ def initialize(str)
19
+ warn "Bio::SwissProt is deprecated. Use Bio::UniProtKB."
20
+ super(str)
21
+ end
39
22
  end
40
23
 
41
24
  end
@@ -1,41 +1,24 @@
1
1
  #
2
- # = bio/db/embl/trembl.rb - TrEMBL database class
2
+ # = bio/db/embl/trembl.rb - (deprecated) TrEMBL database class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2013 BioRuby Project
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: trembl.rb,v 1.7 2007/04/05 23:35:40 trevor Exp $
8
- #
9
7
 
10
- require 'bio/db/embl/sptr'
8
+ warn "Bio::TrEMBL is deprecated. Use Bio::UniProtKB."
11
9
 
12
10
  module Bio
13
11
 
14
- # == Description
15
- #
16
- # Parser class for TrEMBL database entry. See also Bio::SPTR class.
17
- # This class holds name space for TrEMBL specific methods.
18
- #
19
- # UniProtKB/SwissProt specific methods are defined in this class.
20
- # Shared methods for UniProtKB/SwissProt and TrEMBL classes are
21
- # defined in Bio::SPTR class.
22
- #
23
- # == Examples
24
- #
25
- # str = File.read("Q2UNG2_ASPOR.trembl")
26
- # obj = Bio::TrEMBL.new(str)
27
- # obj.entry_id #=> "Q2UNG2_ASPOR"
28
- #
29
- # == Referencees
30
- #
31
- # * TrEMBL Computer-annotated supplement to Swiss-Prot
32
- # http://au.expasy.org/sprot/
33
- #
34
- # * TrEMBL Computer-annotated supplement to Swiss-Prot User Manual
35
- # http://au.expasy.org/sprot/userman.html
36
- #
37
- class TrEMBL < SPTR
38
- # Nothing to do (TrEMBL format is abstracted in SPTR)
12
+ require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
13
+
14
+ # Bio::TrEMBL is deprecated. Use Bio::UniProtKB.
15
+ class TrEMBL < UniProtKB
16
+
17
+ # Bio::TrEMBL is deprecated. Use Bio::UniProtKB.
18
+ def initialize(str)
19
+ warn "Bio::TrEMBL is deprecated. Use Bio::UniProtKB."
20
+ super(str)
21
+ end
39
22
  end
40
23
 
41
24
  end
@@ -1,42 +1,25 @@
1
1
  #
2
2
  # = bio/db/embl/uniprot.rb - UniProt database class
3
3
  #
4
- # Copyright:: Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2013 BioRuby Project
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: uniprot.rb,v 1.5 2007/04/05 23:35:40 trevor Exp $
8
7
  #
9
8
 
10
- require 'bio/db/embl/sptr'
9
+ warn "Bio::UniProt is an alias of Bio::UniProtKB. Please use Bio::UniProtKB. Bio::UniProt may be deprecated in the future." if $VERBOSE
11
10
 
12
11
  module Bio
13
12
 
14
- # == Description
15
- #
16
- # Parser class for SwissProt database entry.# See also Bio::SPTR class.
17
- # This class holds name space for UniProtKB/SwissProt specific methods.
18
- #
19
- # UniProtKB/SwissProt specific methods are defined in this class.
20
- # Shared methods for UniProtKB/SwissProt and TrEMBL classes are
21
- # defined in Bio::SPTR class.
22
- #
23
- # == Examples
24
- #
25
- # str = File.read("p53_human.swiss")
26
- # obj = Bio::UniProt.new(str)
27
- # obj.entry_id #=> "P53_HUMAN"
28
- #
29
- # == Referencees
30
- #
31
- # * UniProt
32
- # http://uniprot.org/
33
- #
34
- # * The UniProtKB/SwissProt/TrEMBL User Manual
35
- # http://www.expasy.org/sprot/userman.html
36
- #
37
- class UniProt < SPTR
38
- # Nothing to do (UniProt format is abstracted in SPTR)
39
- end
13
+ require 'bio/db/embl/uniprotkb' unless const_defined?(:UniProtKB)
14
+
15
+ # Bio::UniProt is changed to an alias of Bio::UniProtKB.
16
+ # Please use Bio::UniProtKB.
17
+ # Bio::UniProt may be deprecated in the future.
18
+ #
19
+ # Note that Bio::SPTR has been renamed to Bio::UniProtKB and
20
+ # is also an alias of Bio::UniProtKB.
21
+ #
22
+ UniProt = UniProtKB
40
23
 
41
24
  end
42
25
 
@@ -0,0 +1,1455 @@
1
+ #
2
+ # = bio/db/embl/uniprotkb.rb - UniProtKB data parser class
3
+ #
4
+ # Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
5
+ # License:: The Ruby License
6
+ #
7
+ #
8
+ # == Description
9
+ #
10
+ # See Bio::UniProtKB documents.
11
+ #
12
+
13
+ require 'bio/db'
14
+ require 'bio/db/embl/common'
15
+
16
+ module Bio
17
+
18
+ # == Description
19
+ #
20
+ # Parser class for UniProtKB/SwissProt and TrEMBL database entry.
21
+ #
22
+ # See the UniProtKB document files and manuals.
23
+ #
24
+ # == Examples
25
+ #
26
+ # str = File.read("p53_human.swiss")
27
+ # obj = Bio::UniProtKB.new(str)
28
+ # obj.entry_id #=> "P53_HUMAN"
29
+ #
30
+ # == References
31
+ #
32
+ # * The UniProt Knowledgebase (UniProtKB)
33
+ # http://www.uniprot.org/help/uniprotkb
34
+ #
35
+ # * The Universal Protein Resource (UniProt)
36
+ # http://uniprot.org/
37
+ #
38
+ # * The UniProtKB/SwissProt/TrEMBL User Manual
39
+ # http://www.uniprot.org/docs/userman.html
40
+ #
41
+ class UniProtKB < EMBLDB
42
+ include Bio::EMBLDB::Common
43
+
44
+ @@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
45
+ @@data_class = ["STANDARD", "PRELIMINARY"]
46
+
47
+ # returns a Hash of the ID line.
48
+ #
49
+ # returns a content (Int or String) of the ID line by a given key.
50
+ # Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
51
+ #
52
+ # === ID Line (since UniProtKB release 9.0 of 31-Oct-2006)
53
+ # ID P53_HUMAN Reviewed; 393 AA.
54
+ # #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
55
+ #
56
+ # === Examples
57
+ # obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed",
58
+ # "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}
59
+ #
60
+ # obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
61
+ #
62
+ #
63
+ # === ID Line (older style)
64
+ # ID P53_HUMAN STANDARD; PRT; 393 AA.
65
+ # #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
66
+ #
67
+ # === Examples
68
+ # obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD",
69
+ # "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}
70
+ #
71
+ # obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
72
+ #
73
+ def id_line(key = nil)
74
+ return id_line[key] if key
75
+ return @data['ID'] if @data['ID']
76
+
77
+ part = @orig['ID'].split(/ +/)
78
+ if part[4].to_s.chomp == 'AA.' then
79
+ # after UniProtKB release 9.0 of 31-Oct-2006
80
+ # (http://www.uniprot.org/docs/sp_news.htm)
81
+ molecule_type = nil
82
+ sequence_length = part[3].to_i
83
+ else
84
+ molecule_type = part[3].sub(/;/,'')
85
+ sequence_length = part[4].to_i
86
+ end
87
+ @data['ID'] = {
88
+ 'ENTRY_NAME' => part[1],
89
+ 'DATA_CLASS' => part[2].sub(/;/,''),
90
+ 'MOLECULE_TYPE' => molecule_type,
91
+ 'SEQUENCE_LENGTH' => sequence_length
92
+ }
93
+ end
94
+
95
+
96
+ # returns a ENTRY_NAME in the ID line.
97
+ #
98
+ def entry_id
99
+ id_line('ENTRY_NAME')
100
+ end
101
+ alias entry_name entry_id
102
+ alias entry entry_id
103
+
104
+
105
+ # returns a MOLECULE_TYPE in the ID line.
106
+ #
107
+ # A short-cut for Bio::UniProtKB#id_line('MOLECULE_TYPE').
108
+ def molecule
109
+ id_line('MOLECULE_TYPE')
110
+ end
111
+ alias molecule_type molecule
112
+
113
+
114
+ # returns a SEQUENCE_LENGTH in the ID line.
115
+ #
116
+ # A short-cut for Bio::UniProtKB#id_line('SEQUENCE_LENGTH').
117
+ def sequence_length
118
+ id_line('SEQUENCE_LENGTH')
119
+ end
120
+ alias aalen sequence_length
121
+
122
+
123
+ # Bio::EMBLDB::Common#ac -> ary
124
+ # #accessions -> ary
125
+ # #accession -> String (accessions.first)
126
+ @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
127
+
128
+
129
+
130
+ # returns a Hash of information in the DT lines.
131
+ # hash keys:
132
+ # ['created', 'sequence', 'annotation']
133
+ #--
134
+ # also Symbols acceptable (ASAP):
135
+ # [:created, :sequence, :annotation]
136
+ #++
137
+ #
138
+ # Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is
139
+ # changed, and the word "annotation" is no longer used in DT lines.
140
+ # Despite the change, the word "annotation" is still used for keeping
141
+ # compatibility.
142
+ #
143
+ # returns a String of information in the DT lines by a given key.
144
+ #
145
+ # === DT Line; date (3/entry)
146
+ # DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.)
147
+ # DT DD-MMM-YYYY (sequence version NN)
148
+ # DT DD-MMM-YYYY (entry version NN)
149
+ #
150
+ # The format has been changed in UniProtKB release 7.0 of 07-Feb-2006.
151
+ # Below is the older format.
152
+ #
153
+ # === Old format of DT Line; date (3/entry)
154
+ # DT DD-MMM-YYYY (rel. NN, Created)
155
+ # DT DD-MMM-YYYY (rel. NN, Last sequence update)
156
+ # DT DD-MMM-YYYY (rel. NN, Last annotation update)
157
+ def dt(key = nil)
158
+ return dt[key] if key
159
+ return @data['DT'] if @data['DT']
160
+
161
+ part = self.get('DT').split(/\n/)
162
+ @data['DT'] = {
163
+ 'created' => part[0].sub(/\w{2} /,'').strip,
164
+ 'sequence' => part[1].sub(/\w{2} /,'').strip,
165
+ 'annotation' => part[2].sub(/\w{2} /,'').strip
166
+ }
167
+ end
168
+
169
+
170
+ # (private) parses DE line (description lines)
171
+ # since UniProtKB release 14.0 of 22-Jul-2008
172
+ #
173
+ # Return array containing array.
174
+ #
175
+ # http://www.uniprot.org/docs/sp_news.htm
176
+ def parse_DE_line_rel14(str)
177
+ # Returns nil if it is not the new format since Rel.14
178
+ return nil unless /^DE (RecName|AltName|SubName)\: / =~ str
179
+ ret = []
180
+ cur = nil
181
+ str.each_line do |line|
182
+ case line
183
+ when /^DE (Includes|Contains)\: *$/
184
+ cur = [ $1 ]
185
+ ret.push cur
186
+ cur = nil
187
+ #subcat_and_desc = nil
188
+ next
189
+ when /^DE *(RecName|AltName|SubName)\: +(.*)/
190
+ category = $1
191
+ subcat_and_desc = $2
192
+ cur = [ category ]
193
+ ret.push cur
194
+ when /^DE *(Flags)\: +(.*)/
195
+ category = $1
196
+ desc = $2
197
+ flags = desc.strip.split(/\s*\;\s*/) || []
198
+ cur = [ category, flags ]
199
+ ret.push cur
200
+ cur = nil
201
+ #subcat_and_desc = nil
202
+ next
203
+ when /^DE *(.*)/
204
+ subcat_and_desc = $1
205
+ else
206
+ warn "Warning: skipped DE line in unknown format: #{line.inspect}"
207
+ #subcat_and_desc = nil
208
+ next
209
+ end
210
+ case subcat_and_desc
211
+ when nil
212
+ # does nothing
213
+ when /\A([^\=]+)\=(.*)/
214
+ subcat = $1
215
+ desc = $2
216
+ desc.sub!(/\;\s*\z/, '')
217
+ unless cur
218
+ warn "Warning: unknown category in DE line: #{line.inspect}"
219
+ cur = [ '' ]
220
+ ret.push cur
221
+ end
222
+ cur.push [ subcat, desc ]
223
+ else
224
+ warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
225
+ end
226
+ end
227
+ ret
228
+ end
229
+ private :parse_DE_line_rel14
230
+
231
+ # returns the proposed official name of the protein.
232
+ # Returns a String.
233
+ #
234
+ # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has
235
+ # been changed. The method returns the full name which is taken from
236
+ # "RecName: Full=" or "SubName: Full=" line normally in the beginning of
237
+ # the DE lines.
238
+ # Unlike parser for old format, no special treatments for fragment or
239
+ # precursor.
240
+ #
241
+ # For old format, the method parses the DE lines and returns the protein
242
+ # name as a String.
243
+ #
244
+ # === DE Line; description (>=1)
245
+ # "DE #{OFFICIAL_NAME} (#{SYNONYM})"
246
+ # "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
247
+ # OFFICIAL_NAME 1/entry
248
+ # SYNONYM >=0
249
+ # CONTAINS >=0
250
+ def protein_name
251
+ @data['DE'] ||= parse_DE_line_rel14(get('DE'))
252
+ parsed_de_line = @data['DE']
253
+ if parsed_de_line then
254
+ # since UniProtKB release 14.0 of 22-Jul-2008
255
+ name = nil
256
+ parsed_de_line.each do |a|
257
+ case a[0]
258
+ when 'RecName', 'SubName'
259
+ if name_pair = a[1..-1].find { |b| b[0] == 'Full' } then
260
+ name = name_pair[1]
261
+ break
262
+ end
263
+ end
264
+ end
265
+ name = name.to_s
266
+ else
267
+ # old format (before Rel. 13.x)
268
+ name = ""
269
+ if de_line = fetch('DE') then
270
+ str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
271
+ name = str[/^[^(]*/].strip
272
+ name << ' (Fragment)' if str =~ /fragment/i
273
+ end
274
+ end
275
+ return name
276
+ end
277
+
278
+
279
+ # returns synonyms (unofficial and/or alternative names).
280
+ # Returns an Array containing String objects.
281
+ #
282
+ # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has
283
+ # been changed. The method returns the full or short names which are
284
+ # taken from "RecName: Short=", "RecName: EC=", and AltName lines,
285
+ # except after "Contains:" or "Includes:".
286
+ # For keeping compatibility with old format parser, "RecName: EC=N.N.N.N"
287
+ # is reported as "EC N.N.N.N".
288
+ # In addition, to prevent confusion, "Allergen=" and "CD_antigen="
289
+ # prefixes are added for the corresponding fields.
290
+ #
291
+ # For old format, the method parses the DE lines and returns synonyms.
292
+ # synonyms are each placed in () following the official name on the DE line.
293
+ def synonyms
294
+ ary = Array.new
295
+ @data['DE'] ||= parse_DE_line_rel14(get('DE'))
296
+ parsed_de_line = @data['DE']
297
+ if parsed_de_line then
298
+ # since UniProtKB release 14.0 of 22-Jul-2008
299
+ parsed_de_line.each do |a|
300
+ case a[0]
301
+ when 'Includes', 'Contains'
302
+ break #the each loop
303
+ when 'RecName', 'SubName', 'AltName'
304
+ a[1..-1].each do |b|
305
+ if name = b[1] and b[1] != self.protein_name then
306
+ case b[0]
307
+ when 'EC'
308
+ name = "EC " + b[1]
309
+ when 'Allergen', 'CD_antigen'
310
+ name = b[0] + '=' + b[1]
311
+ else
312
+ name = b[1]
313
+ end
314
+ ary.push name
315
+ end
316
+ end
317
+ end #case a[0]
318
+ end #parsed_de_line.each
319
+ else
320
+ # old format (before Rel. 13.x)
321
+ if de_line = fetch('DE') then
322
+ line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part
323
+ line.scan(/\([^)]+/) do |synonym|
324
+ unless synonym =~ /fragment/i then
325
+ ary << synonym[1..-1].strip # index to remove the leading (
326
+ end
327
+ end
328
+ end
329
+ end
330
+ return ary
331
+ end
332
+
333
+
334
+ # returns gene names in the GN line.
335
+ #
336
+ # New UniProt/SwissProt format:
337
+ # * Bio::UniProtKB#gn -> [ <gene record>* ]
338
+ # where <gene record> is:
339
+ # { :name => '...',
340
+ # :synonyms => [ 's1', 's2', ... ],
341
+ # :loci => [ 'l1', 'l2', ... ],
342
+ # :orfs => [ 'o1', 'o2', ... ]
343
+ # }
344
+ #
345
+ # Old format:
346
+ # * Bio::UniProtKB#gn -> Array # AND
347
+ # * Bio::UniProtKB#gn[0] -> Array # OR
348
+ #
349
+ # === GN Line: Gene name(s) (>=0, optional)
350
+ def gn
351
+ unless @data['GN']
352
+ case fetch('GN')
353
+ when /Name=/,/ORFNames=/,/OrderedLocusNames=/,/Synonyms=/
354
+ @data['GN'] = gn_uniprot_parser
355
+ else
356
+ @data['GN'] = gn_old_parser
357
+ end
358
+ end
359
+ @data['GN']
360
+ end
361
+
362
+
363
+ # returns contents in the old style GN line.
364
+ # === GN Line: Gene name(s) (>=0, optional)
365
+ # GN HNS OR DRDX OR OSMZ OR BGLY.
366
+ # GN CECA1 AND CECA2.
367
+ # GN CECA1 AND (HOGE OR FUGA).
368
+ #
369
+ # GN NAME1 [(AND|OR) NAME]+.
370
+ #
371
+ # Bio::UniProtKB#gn -> Array # AND
372
+ # #gn[0] -> Array # OR
373
+ # #gene_names -> Array
374
+ def gn_old_parser
375
+ names = Array.new
376
+ if get('GN').size > 0
377
+ names = fetch('GN').sub(/\.$/,'').split(/ AND /)
378
+ names.map! { |synonyms|
379
+ synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
380
+ e.strip
381
+ }
382
+ }
383
+ end
384
+ @data['GN'] = names
385
+ end
386
+ private :gn_old_parser
387
+
388
+ # returns contents in the structured GN line.
389
+ # The new format of the GN line is:
390
+ # GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
391
+ # GN ORFNames=[, ...];
392
+ #
393
+ # * Bio::UniProtKB#gn -> [ <gene record>* ]
394
+ # where <gene record> is:
395
+ # { :name => '...',
396
+ # :synonyms => [ 's1', 's2', ... ],
397
+ # :loci => [ 'l1', 'l2', ... ],
398
+ # :orfs => [ 'o1', 'o2', ... ]
399
+ # }
400
+ def gn_uniprot_parser
401
+ @data['GN'] = Array.new
402
+ gn_line = fetch('GN').strip
403
+ records = gn_line.split(/\s*and\s*/)
404
+ records.each do |record|
405
+ gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
406
+ record.each_line(';') do |element|
407
+ case element
408
+ when /Name=/ then
409
+ gene_hash[:name] = $'[0..-2]
410
+ when /Synonyms=/ then
411
+ gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
412
+ when /OrderedLocusNames=/ then
413
+ gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
414
+ when /ORFNames=/ then
415
+ gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
416
+ end
417
+ end
418
+ @data['GN'] << gene_hash
419
+ end
420
+ return @data['GN']
421
+ end
422
+ private :gn_uniprot_parser
423
+
424
+
425
+ # returns an Array of gene names in the GN line.
426
+ def gene_names
427
+ gn # set @data['GN'] if it hasn't been already done
428
+ if @data['GN'].first.class == Hash then
429
+ @data['GN'].collect { |element| element[:name] }
430
+ else
431
+ @data['GN'].first
432
+ end
433
+ end
434
+
435
+
436
+ # returns a String of the first gene name in the GN line.
437
+ def gene_name
438
+ (x = self.gene_names) ? x.first : nil
439
+ end
440
+
441
+
442
+ # returns an Array of Hashes, or a String of the OS line when an index is given.
443
+ # * Bio::UniProtKB#os -> Array
444
+ # [{'name' => '(Human)', 'os' => 'Homo sapiens'},
445
+ # {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
446
+ # * Bio::UniProtKB#os[0] -> Hash
447
+ # {'name' => "(Human)", 'os' => 'Homo sapiens'}
448
+ # * Bio::UniProtKB#os[0]['name'] -> "(Human)"
449
+ # * Bio::UniProtKB#os(0) -> "Homo sapiens (Human)"
450
+ #
451
+ # === OS Line; organism species (>=1)
452
+ # OS Genus species (name).
453
+ # OS Genus species (name0) (name1).
454
+ # OS Genus species (name0) (name1).
455
+ # OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
456
+ # OS Homo sapiens (Human), and Rattus norvegicus (Rat)
457
+ # OS Hippotis sp. Clark and Watts 825.
458
+ # OS unknown cyperaceous sp.
459
+ def os(num = nil)
460
+ unless @data['OS']
461
+ os = Array.new
462
+ fetch('OS').split(/, and|, /).each do |tmp|
463
+ if tmp =~ /(\w+ *[\w \:\'\+\-\.]+[\w\.])/
464
+ org = $1
465
+ tmp =~ /(\(.+\))/
466
+ os.push({'name' => $1, 'os' => org})
467
+ else
468
+ raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
469
+ end
470
+ end
471
+ @data['OS'] = os
472
+ end
473
+
474
+ if num
475
+ # EX. "Trifolium repens (white clover)"
476
+ return "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
477
+ else
478
+ return @data['OS']
479
+ end
480
+ end
481
+
482
+
483
+ # Bio::EMBLDB::Common#og -> Array
484
+ # OG Line; organella (0 or 1/entry)
485
+ # ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
486
+ # or a plasmid name (e.g. "Plasmid pBR322").
487
+
488
+
489
+ # Bio::EMBLDB::Common#oc -> Array
490
+ # OC Line; organism classification (>=1)
491
+ # "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
492
+ # "OC Theileria."
493
+
494
+
495
+
496
+ # returns a Hash of organism taxonomy cross-references.
497
+ # * Bio::UniProtKB#ox -> Hash
498
+ # {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
499
+ #
500
+ # === OX Line; organism taxonomy cross-reference (>=1 per entry)
501
+ # OX NCBI_TaxID=1234;
502
+ # OX NCBI_TaxID=1234, 2345, 3456, 4567;
503
+ def ox
504
+ unless @data['OX']
505
+ tmp = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
506
+ hsh = Hash.new
507
+ tmp.each do |e|
508
+ db,refs = e.split(/=/)
509
+ hsh[db] = refs.split(/, */)
510
+ end
511
+ @data['OX'] = hsh
512
+ end
513
+ return @data['OX']
514
+ end
515
+
516
+ # === The OH Line;
517
+ #
518
+ # OH NCBI_TaxID=TaxID; HostName.
519
+ # http://br.expasy.org/sprot/userman.html#OH_line
520
+ def oh
521
+ unless @data['OH']
522
+ @data['OH'] = fetch('OH').split("\. ").map {|x|
523
+ if x =~ /NCBI_TaxID=(\d+);/
524
+ taxid = $1
525
+ else
526
+ raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
527
+ $!, "\n", get('OH'), "\n"].join
528
+
529
+ end
530
+ if x =~ /NCBI_TaxID=\d+; (.+)/
531
+ host_name = $1
532
+ host_name.sub!(/\.$/, '')
533
+ else
534
+ host_name = nil
535
+ end
536
+ {'NCBI_TaxID' => taxid, 'HostName' => host_name}
537
+ }
538
+ end
539
+ @data['OH']
540
+ end
541
+
542
+
543
+
544
+ # Bio::EMBLDB::Common#ref -> Array
545
+ # R Lines
546
+ # RN RC RP RX RA RT RL
547
+
548
+ # returns contents in the R lines.
549
+ # * Bio::EMBLDB::Common#ref -> [ <reference information Hash>* ]
550
+ # where <reference information Hash> is:
551
+ # {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
552
+ # 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
553
+ #
554
+ # R Lines
555
+ # * RN RC RP RX RA RT RL RG
556
+ def ref
557
+ unless @data['R']
558
+ @data['R'] = [get('R').split(/\nRN /)].flatten.map { |str|
559
+ hash = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
560
+ 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
561
+ str = 'RN ' + str unless /^RN / =~ str
562
+
563
+ str.split("\n").each do |line|
564
+ if /^(R[NPXARLCTG]) (.+)/ =~ line
565
+ hash[$1] += $2 + ' '
566
+ else
567
+ raise "Invalid format in R lines, \n[#{line}]\n"
568
+ end
569
+ end
570
+
571
+ hash['RN'] = set_RN(hash['RN'])
572
+ hash['RC'] = set_RC(hash['RC'])
573
+ hash['RP'] = set_RP(hash['RP'])
574
+ hash['RX'] = set_RX(hash['RX'])
575
+ hash['RA'] = set_RA(hash['RA'])
576
+ hash['RT'] = set_RT(hash['RT'])
577
+ hash['RL'] = set_RL(hash['RL'])
578
+ hash['RG'] = set_RG(hash['RG'])
579
+
580
+ hash
581
+ }
582
+
583
+ end
584
+ @data['R']
585
+ end
586
+
587
+ def set_RN(data)
588
+ data.strip
589
+ end
590
+
591
+ def set_RC(data)
592
+ data.scan(/([STP]\w+)=(.+);/).map { |comment|
593
+ [comment[1].split(/, and |, /)].flatten.map { |text|
594
+ {'Token' => comment[0], 'Text' => text}
595
+ }
596
+ }.flatten
597
+ end
598
+ private :set_RC
599
+
600
+ def set_RP(data)
601
+ data = data.strip
602
+ data = data.sub(/\.$/, '')
603
+ data.split(/, AND |, /i).map {|x|
604
+ x = x.strip
605
+ x = x.gsub(' ', ' ')
606
+ }
607
+ end
608
+ private :set_RP
609
+
610
+ def set_RX(data)
611
+ rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
612
+ if data =~ /MEDLINE=(.+?);/
613
+ rx['MEDLINE'] = $1
614
+ end
615
+ if data =~ /PubMed=(.+?);/
616
+ rx['PubMed'] = $1
617
+ end
618
+ if data =~ /DOI=(.+?);/
619
+ rx['DOI'] = $1
620
+ end
621
+ rx
622
+ end
623
+ private :set_RX
624
+
625
+ def set_RA(data)
626
+ data = data.sub(/; *$/, '')
627
+ end
628
+ private :set_RA
629
+
630
+ def set_RT(data)
631
+ data = data.sub(/; *$/, '')
632
+ data = data.gsub(/(^"|"$)/, '')
633
+ end
634
+ private :set_RT
635
+
636
+ def set_RL(data)
637
+ data = data.strip
638
+ end
639
+ private :set_RL
640
+
641
+ def set_RG(data)
642
+ data = data.split('; ')
643
+ end
644
+ private :set_RG
645
+
646
+
647
+
648
+ # returns a Bio::References object built from Bio::UniProtKB#ref.
649
+ # * Bio::UniProtKB#references -> Bio::References
650
+ def references
651
+ unless @data['references']
652
+ ary = self.ref.map {|ent|
653
+ hash = Hash.new('')
654
+ ent.each {|key, value|
655
+ case key
656
+ when 'RA'
657
+ hash['authors'] = value.split(/, /)
658
+ when 'RT'
659
+ hash['title'] = value
660
+ when 'RL'
661
+ if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
662
+ hash['journal'] = $1
663
+ hash['volume'] = $2
664
+ hash['issue'] = $3
665
+ hash['pages'] = $4
666
+ hash['year'] = $5
667
+ else
668
+ hash['journal'] = value
669
+ end
670
+ when 'RX' # PUBMED, MEDLINE, DOI
671
+ value.each do |tag, xref|
672
+ hash[ tag.downcase ] = xref
673
+ end
674
+ end
675
+ }
676
+ Reference.new(hash)
677
+ }
678
+ @data['references'] = References.new(ary)
679
+ end
680
+ @data['references']
681
+ end
682
+
683
+
684
+
685
+
686
+
687
+
688
# === The HI line
# Bio::UniProtKB#hi #=> Array of Hash
#
# Each ". "-separated item of the HI text becomes
#   {'Category' => str, 'Keywords' => [str, ...], 'Keyword' => str}
# where 'Keyword' is the last "; "-separated term (trailing period
# removed) and 'Keywords' holds the terms before it. The result is
# cached in @data['HI'].
def hi
  return @data['HI'] if @data['HI']

  parsed = @data['HI'] = []
  fetch('HI').split(/\. /).each do |item|
    category, keywords = item.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    parsed << {'Category' => category,
               'Keywords' => keywords,
               'Keyword'  => keyword}
  end
  @data['HI']
end
704
+
705
+
706
# Topic names that may introduce a comment block in the CC lines.
@@cc_topics = [
  'PHARMACEUTICAL', 'BIOTECHNOLOGY', 'TOXIC DOSE', 'ALLERGEN',
  'RNA EDITING', 'POLYMORPHISM', 'BIOPHYSICOCHEMICAL PROPERTIES',
  'MASS SPECTROMETRY', 'WEB RESOURCE', 'ENZYME REGULATION',
  'DISEASE', 'INTERACTION', 'DEVELOPMENTAL STAGE', 'INDUCTION',
  'CAUTION', 'ALTERNATIVE PRODUCTS', 'DOMAIN', 'PTM',
  'MISCELLANEOUS', 'TISSUE SPECIFICITY', 'COFACTOR', 'PATHWAY',
  'SUBUNIT', 'CATALYTIC ACTIVITY', 'SUBCELLULAR LOCATION',
  'FUNCTION', 'SIMILARITY'
]
733
# returns contents in the CC lines.
# * Bio::UniProtKB#cc -> Hash
#   {TOPIC => [comment body, ...], ...}
#
# returns an object of contents in the TOPIC, or nil when the entry
# has no comment block for that topic.
# * Bio::UniProtKB#cc(TOPIC) -> Array w/in Hash, Hash
#
# returns contents of the "ALTERNATIVE PRODUCTS".
# * Bio::UniProtKB#cc('ALTERNATIVE PRODUCTS') -> Hash
#    {'Event' => str,
#     'Named isoforms' => str,
#     'Comment' => str,
#     'Variants'=>[{'Name' => str, 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}]}
#
#    CC   -!- ALTERNATIVE PRODUCTS:
#    CC       Event=Alternative splicing; Named isoforms=15;
#    ...
#    CC         placentae isoforms. All tissues differentially splice exon 13;
#    CC       Name=A; Synonyms=no del;
#    CC         IsoId=P15529-1; Sequence=Displayed;
#
# returns contents of the "DATABASE".
# * Bio::UniProtKB#cc('DATABASE') -> Array
#    [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
#
#    CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
#
# returns contents of the "MASS SPECTROMETRY".
# * Bio::UniProtKB#cc('MASS SPECTROMETRY') -> Array
#    [{'MW'=>str,'MW_ERR'=>str, 'METHOD'=>str,'RANGE'=>str,'NOTE'=>str}, ...]
#
#    CC   -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# === CC lines (>=0, optional)
#   CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
#   CC       IN LIVER, KIDNEY, LUNG AND BRAIN.
#
#   CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
#   CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
#
# See also http://www.expasy.org/sprot/userman.html#CC_line
#
def cc(topic = nil)
  unless @data['CC']
    cc = Hash.new
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return cc if get('CC').size == 0

    # Removing the copyright statement. A copy is edited so that the
    # string handed out by fetch is left untouched.
    cc_raw = fetch('CC').sub(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return cc if cc_raw == ''

    begin
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          body = $2.gsub(/- (?!AND)/, '-').strip
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so both end up here.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  return @data['CC'] if topic.nil?

  content = @data['CC'][topic]
  # A topic that does not occur in the entry yields nil for every
  # topic, instead of failing inside a topic-specific parser.
  return nil if content.nil?

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(content)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(content)
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the topic is spelled 'CATALYTIC ACTIVITY' in
    # @@cc_topics, so this branch only fires for the misspelled name;
    # confirm that cc_catalytic_activity exists before correcting it.
    cc_catalytic_activity(content)
  when 'CAUTION'
    cc_caution(content)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    content.join('')
  when 'INTERACTION'
    cc_interaction(content)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(content)
  when 'PATHWAY'
    cc_pathway(content)
  when 'RNA EDITING'
    cc_rna_editing(content)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(content)
  when 'WEB RESOURCE'
    cc_web_resource(content)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    content.map do |e|
      db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # the final character (the closing period) is dropped first
      e.sub(/.$/, '').split(/;/).each do |line|
        case line
        when /NAME=(.+)/
          db['NAME'] = $1
        when /NOTE=(.+)/
          db['NOTE'] = $1
        when /WWW="(.+)"/
          db['WWW'] = $1
        when /FTP="(.+)"/
          db['FTP'] = $1
        end
      end
      db
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS,
    # PHARMACEUTICAL, POLYMORPHISM, PTM, SIMILARITY, SUBUNIT,
    # TISSUE SPECIFICITY, TOXIC DOSE and any other topic: raw bodies.
    content
  end
end
911
+
912
+
913
# Parses the bodies of the "ALTERNATIVE PRODUCTS" comment block.
#
# data is the Array of comment bodies for the topic; nil is passed
# through unchanged. Returns
#   {'Event' => [str, ...], 'Named isoforms' => str, 'Comment' => str,
#    'Variants' => [Hash, ...]}
# where 'Event' and 'Named isoforms' stay "" when the corresponding
# field is not found.
def cc_alternative_products(data)
  # the guard has to come before join, which would fail on nil
  return data unless data
  ap = data.join('')

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
         'Variants' => []}
  if /Event=(.+?);/ =~ ap
    tmp['Event'] = $1.sub(/;/, '').split(/, /)
  end
  if /Named isoforms=(\S+?);/ =~ ap
    tmp['Named isoforms'] = $1
  end
  if /Comment=(.+?);/m =~ ap
    tmp['Comment'] = $1
  end
  ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
    tmp['Variants'] << cc_alternative_products_variants(ent)
  end
  return tmp
end
private :cc_alternative_products
936
+
937
# Parses one "Name=...; ...; Sequence=...;" isoform record into a Hash.
# 'Synonyms', 'IsoId' and 'Sequence' values are split on ", " into
# Arrays; every other field is stored as the plain string.
def cc_alternative_products_variants(data)
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
  data.split(/; /).each do |field|
    name, value = field.split(/=/)
    if ['Sequence', 'Synonyms', 'IsoId'].include?(name)
      value = value.sub(/;/, '').split(/, /)
    end
    variant[name] = value
  end
  variant
end
private :cc_alternative_products_variants
949
+
950
+
951
# Parses the first body of the "BIOPHYSICOCHEMICAL PROPERTIES" block.
# Returns a Hash with the keys 'Absorption' and 'Kinetic parameters'
# (Hashes, empty when nothing matched) and 'pH dependence',
# 'Redox potential' and 'Temperature dependence' (Strings, "" when
# nothing matched).
def cc_biophysiochemical_properties(data)
  text = data[0]

  absorption = {}
  absorption['Abs(max)'] = $1 if text =~ /Absorption: Abs\(max\)=(.+?);/
  absorption['Note'] = $1 if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/

  kinetics = {}
  if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    kinetics['KM'] = $1
    kinetics['Vmax'] = $2
  end
  kinetics['Note'] = $1 if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/

  ph    = (text =~ /pH dependence: (.+?);/) ? $1 : ""
  redox = (text =~ /Redox potential: (.+?);/) ? $1 : ""
  temp  = (text =~ /Temperature dependence: (.+?);/) ? $1 : ""

  {'Absorption' => absorption,
   'Kinetic parameters' => kinetics,
   'pH dependence' => ph,
   'Redox potential' => redox,
   'Temperature dependence' => temp}
end
private :cc_biophysiochemical_properties
984
+
985
+
986
# Concatenates all bodies of the "CAUTION" comment block into one
# String.
def cc_caution(data)
  joined = data.join('')
  joined
end
private :cc_caution
990
+
991
+
992
# returns contents in a line of the CC INTERACTION section.
#
#   CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
#
# Each "partner; NbExp=n; IntAct=ids;" record becomes
#   {'SP_Ac' => str, 'identifier' => str, 'NbExp' => str,
#    'IntAct' => [str, ...], 'optional_identifier' => str or nil}
# A "Self" partner uses the entry's own ID for both 'SP_Ac' and
# 'identifier'.
def cc_interaction(data)
  joined = data.join('')
  joined.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/).map do |fields|
    partner, nbexp, intact = fields.map { |f| f.strip }

    spac = spid = optid = nil
    if partner =~ /^(.+):(.+)/
      spac = $1
      spid = $2.split(' ')[0]
    elsif partner =~ /Self/
      spac = self.entry_id
      spid = self.entry_id
    end
    optid = $1 if partner =~ /^.+:.+ (.+)/

    {'SP_Ac' => spac,
     'identifier' => spid,
     'NbExp' => nbexp,
     'IntAct' => intact.split(', '),
     'optional_identifier' => optid}
  end
end
private :cc_interaction
1021
+
1022
+
1023
# Parses the bodies of the "MASS SPECTROMETRY" comment block.
#
#   MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# Returns one Hash per body with the keys 'MW', 'MW_ERR', 'METHOD',
# 'RANGE' and 'NOTE' (nil when the field is absent). nil input is
# passed through unchanged.
def cc_mass_spectrometry(data)
  return data unless data

  data.map do |record|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    # the final character (the closing period) is dropped first
    record.sub(/.$/, '').split(/;/).each do |field|
      if field =~ /MW=(.+)/
        mass['MW'] = $1
      elsif field =~ /MW_ERR=(.+)/
        mass['MW_ERR'] = $1
      elsif field =~ /METHOD=(.+)/
        mass['METHOD'] = $1
      elsif field =~ /RANGE=(\d+-\d+)/
        mass['RANGE'] = $1 # RANGE class ?
      elsif field =~ /NOTE=(.+)/
        mass['NOTE'] = $1
      end
    end
    mass
  end
end
private :cc_mass_spectrometry
1048
+
1049
+
1050
# Splits the first body of the "PATHWAY" comment block into its parts
# (separators: "; ", " and ", ": "), after removing the trailing
# period. Returns nil when data is empty.
def cc_pathway(data)
  parts = data.collect do |body|
    body.sub(/\.$/, '').split(/; | and |: /)
  end
  parts.first
end
private :cc_pathway
1056
+
1057
+
1058
# Parses the bodies of the "RNA EDITING" comment block.
#
# Returns {'Modified_positions' => [str, ...], 'Note' => str}; 'Note'
# stays "" when no "Note=" field is present.
#
# Raises ArgumentError when no "Modified_positions=" field is found.
def cc_rna_editing(data)
  data = data.join('')
  entry = {'Modified_positions' => [], 'Note' => ""}
  if data =~ /Modified_positions=(.+?)(\.|;)/
    entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
  else
    # No exception is being handled at this point, so $! is not part
    # of the message.
    raise ArgumentError, "Invalid CC RNA Editing lines (#{entry_id}):\n#{get('CC')}"
  end
  if data =~ /Note=(.+)/
    entry['Note'] = $1
  end
  entry
end
private :cc_rna_editing
1072
+
1073
+
1074
# Parses the first body of the "SUBCELLULAR LOCATION" comment block
# into an Array of Arrays: one inner Array per ". "-separated
# statement, holding its "; "-separated terms without trailing periods.
# Returns nil when data is empty.
def cc_subcellular_location(data)
  parsed = data.map do |body|
    body.split('. ').map do |statement|
      statement.split('; ').map { |term| term.sub(/\.$/, '') }
    end
  end
  parsed.first
end
private :cc_subcellular_location
1084
+
1085
+
1086
#--
# Since UniProtKB release 12.2 of 11-Sep-2007:
# CC   -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress].
# Old format:
# CC   -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].
#++

# Parses the bodies of the "WEB RESOURCE" comment block. Returns one
# {'Name' => str, 'Note' => str, 'URL' => str} Hash per body; fields
# that are absent stay nil. Both the current (Name/Note) and the old
# (NAME/NOTE) spellings are accepted.
def cc_web_resource(data)
  data.map do |body|
    entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    body.split(';').each do |field|
      if field =~ /(Name|Note)\=(.+)/
        key = $1
        val = $2.strip
        entry[key] = val
      elsif field =~ /(NAME|NOTE)\=(.+)/
        key = $1.downcase.capitalize
        val = $2.strip
        entry[key] = val
      elsif field =~ /URL\=\"(.+)\"/
        entry['URL'] = $1.strip
      end
    end
    entry
  end
end
private :cc_web_resource
1113
+
1114
# returns database cross-references in the DR lines.
# * Bio::UniProtKB#dr -> Hash w/in Array
#
# === DR Line; database cross-reference (>=0)
#   DR   database_identifier; primary_identifier; secondary_identifier.
# One cross-reference per line.
#
# 'MGD' and 'MIM' are separate identifiers: without the comma between
# them Ruby joins the adjacent literals into the single string 'MGDMIM'.
@@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
  'ECOGENE',
  'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
  'MAIZE-2DPAGE','MENDEL','MGD','MIM','PDB','PFAM','PIR','PRINTS',
  'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
  'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
1126
+
1127
# Backup Bio::EMBLDB#dr as embl_dr
alias :embl_dr :dr

# Bio::UniProtKB#dr
#
# Without a key, returns whatever the aliased embl_dr returns. With a
# database key, returns one Hash per cross-reference of that database
# (an empty Array when there is none); each Hash maps the labels below
# to the first four fields of the cross-reference.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |x|
    {'Accession' => x[0],
     'Version' => x[1],
     ' ' => x[2],
     'Molecular Type' => x[3]}
  end
end
1143
+
1144
+
1145
+ # Bio::EMBLDB::Common#kw - Array
1146
+ # #keywords -> Array
1147
+ #
1148
+ # KW Line; keyword (>=1)
1149
+ # KW [Keyword;]+
1150
+
1151
+
1152
# returns contents in the feature table.
#
# == Examples
#
#  sp = Bio::UniProtKB.new(entry)
#  ft = sp.ft
#  ft.class #=> Hash
#  ft.keys.each do |feature_key|
#    ft[feature_key].each do |feature|
#      feature['From'] #=> 1
#      feature['To']   #=> 21
#      feature['Description'] #=> ''
#      feature['FTId'] #=> ''
#      feature['diff'] #=> []
#      feature['original'] #=> [feature_key, '1', '21', '', '']
#    end
#  end
#
# * Bio::UniProtKB#ft -> Hash
#    {FEATURE_KEY => [{'From' => int, 'To' => int,
#                      'Description' => aStr, 'FTId' => aStr,
#                      'diff' => [original_residues, changed_residues],
#                      'original' => aAry }],...}
#
# returns an Array of the information about the feature_name in the
# feature table (nil when the entry has no such feature).
# * Bio::UniProtKB#ft(feature_name) -> Array of Hash
#
# 'diff' is filled for VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features:
# "X -> Y" descriptions give ['X', 'Y'] and "Missing" descriptions give
# [removed_residues, ''].
#
# Raises RuntimeError ("Invalid FT Lines...") when the lines cannot be
# parsed.
#
# == FT Line; feature table data (>=0, optional)
#
#   Col     Data item
#   -----   -----------------
#    1- 2   FT
#    6-13   Feature name
#   15-20   `FROM' endpoint
#   22-27   `TO' endpoint
#   35-75   Description (>=0 per key)
#   -----   -----------------
#
# Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical
# characters including '<', '>' or '?'. (c.f. '<1', '?42')
#
# See also http://www.expasy.org/sprot/userman.html#FT_line
#
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  table = []
  begin
    get('FT').split("\n").each do |line|
      # A key line has exactly three blanks between "FT" and the
      # feature name (columns 3-5); anything else is a continuation.
      if line =~ /^FT {3}\w/
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip, # Feature Name
                  feature[14..19].strip, # From
                  feature[21..26].strip, # To
                  feature[34..74].strip ] # Description
      else
        table.last << line.chomp.sub!(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map { |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid ? ftid : '')
    }

    hash = {}
    table.each do |feature|
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To' => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (hash[feature[0]] ||= []) << entry

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # both captures are saved before gsub resets the match data
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{get('FT')}'\n"
  end

  @data['FT'] = hash
end
1261
+
1262
+
1263
+
1264
# returns a Hash of contents in the SQ lines.
# * Bio::UniProtKB#sq -> hsh
#
# returns a value of a key given in the SQ lines.
# * Bio::UniProtKB#sq(key) -> int or str
# * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
#          'CRC64']
#
# === SQ Line; sequence header (1/entry)
#    SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
#    SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;
#
# MW, Dalton unit.
# CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
#
# Raises RuntimeError when the SQ line does not have this form.
def sq(key = nil)
  unless @data['SQ']
    if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
    else
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
  end

  header = @data['SQ']
  return header unless key

  if [/mw/, /molecular/, /weight/].any? { |pattern| pattern === key }
    header['MW']
  elsif [/len/, /length/, /AA/].any? { |pattern| pattern === key }
    header['aalen']
  else
    header[key]
  end
end
1300
+
1301
+
1302
# returns a Bio::Sequence::AA of the amino acid sequence.
# * Bio::UniProtKB#seq -> Bio::Sequence::AA
#
# blank Line; sequence data (>=1)
#
# Blanks and digits are removed from the sequence text before the
# object is built; the object is cached in @data[''].
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
  @data['']
end
alias aaseq seq
1313
+
1314
+ end # class UniProtKB
1315
+
1316
+ end # module Bio
1317
+
1318
+
1319
+
1320
+ =begin
1321
+
1322
+ = Bio::UniProtKB < Bio::DB
1323
+
1324
+ Class for an entry in the SWISS-PROT/TrEMBL database.
1325
+
1326
+ * ((<URL:http://www.ebi.ac.uk/swissprot/>))
1327
+ * ((<URL:http://www.ebi.ac.uk/trembl/>))
1328
+ * ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
1329
+
1330
+
1331
+ --- Bio::UniProtKB.new(a_sp_entry)
1332
+
1333
+ === ID line (Identification)
1334
+
1335
+ --- Bio::UniProtKB#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
1336
+ 'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
1337
+ --- Bio::UniProtKB#id_line(key) -> str
1338
+
1339
+ key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
1340
+
1341
+ --- Bio::UniProtKB#entry_id -> str
1342
+ --- Bio::UniProtKB#molecule -> str
1343
+ --- Bio::UniProtKB#sequence_length -> int
1344
+
1345
+
1346
+ === AC lines (Accession number)
1347
+
1348
+ --- Bio::UniProtKB#ac -> ary
1349
+ --- Bio::UniProtKB#accessions -> ary
1350
+ --- Bio::UniProtKB#accession -> accessions.first
1351
+
1352
+
1353
+ === GN line (Gene name(s))
1354
+
1355
+ --- Bio::UniProtKB#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
1356
+ --- Bio::UniProtKB#gene_name -> str
1357
+ --- Bio::UniProtKB#gene_names -> [str] or [str]
1358
+
1359
+
1360
+ === DT lines (Date)
1361
+
1362
+ --- Bio::UniProtKB#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
1363
+ --- Bio::UniProtKB#dt(key) -> str
1364
+
1365
+ key := (created|annotation|sequence)
1366
+
1367
+
1368
+ === DE lines (Description)
1369
+
1370
+ --- Bio::UniProtKB#de -> str
1371
+ #definition -> str
1372
+
1373
+ --- Bio::UniProtKB#protein_name
1374
+
1375
+ Returns the proposed official name of the protein
1376
+
1377
+
1378
+ --- Bio::UniProtKB#synonyms
1379
+
1380
+ Returns an array of synonyms (unofficial names)
1381
+
1382
+ === KW lines (Keyword)
1383
+
1384
+ --- Bio::UniProtKB#kw -> ary
1385
+
1386
+ === OS lines (Organism species)
1387
+
1388
+ --- Bio::UniProtKB#os -> [{'name' => str, 'os' => str}, ...]
1389
+
1390
+ === OC lines (organism classification)
1391
+
1392
+ --- Bio::UniProtKB#oc -> ary
1393
+
1394
+ === OG line (Organella)
1395
+
1396
+ --- Bio::UniProtKB#og -> ary
1397
+
1398
+ === OX line (Organism taxonomy cross-reference)
1399
+
1400
+ --- Bio::UniProtKB#ox -> {'NCBI_TaxID' => [], ...}
1401
+
1402
+ === RN RC RP RX RA RT RL RG lines (Reference)
1403
+
1404
+ --- Bio::UniProtKB#ref -> [{'RN' => str, 'RC' => [hsh], 'RP' => [str], 'RX' => hsh, 'RA' => str, 'RT' => str, 'RL' => str, 'RG' => [str]},...]
1405
+
1406
+ === DR lines (Database cross-reference)
1407
+
1408
+ --- Bio::UniProtKB#dr -> {'EMBL' => ary, ...}
1409
+
1410
+ === FT lines (Feature table data)
1411
+
1412
+ --- Bio::UniProtKB#ft -> hsh
1413
+
1414
+ === SQ lines (Sequence header and data)
1415
+
1416
+ --- Bio::UniProtKB#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
1417
+ --- Bio::UniProtKB#sq(key) -> int or str
1418
+
1419
+ key := (aalen|MW|CRC64)
1420
+
1421
+ --- Bio::UniProtKB#seq -> Bio::Sequence::AA
1422
+ #aaseq -> Bio::Sequence::AA
1423
+
1424
+ =end
1425
+
1426
+ # Content Occurrence in an entry
1427
+ # ---- --------------------------- --------------------------------
1428
+ # ID - identification (begins each entry; 1 per entry)
1429
+ # AC - accession number(s) (>=1 per entry)
1430
+ # DT - date (3 per entry)
1431
+ # DE - description (>=1 per entry)
1432
+ # GN - gene name(s) (>=0 per entry; optional)
1433
+ # OS - organism species (>=1 per entry)
1434
+ # OG - organelle (0 or 1 per entry; optional)
1435
+ # OC - organism classification (>=1 per entry)
1436
+ # OX - organism taxonomy x-ref (>=1 per entry)
1437
+ # OH - Organism Host
1438
+ # RN - reference number (>=1 per entry)
1439
+ # RP - reference positions (>=1 per entry)
1440
+ # RC - reference comment(s) (>=0 per entry; optional)
1441
+ # RX - reference cross-reference(s) (>=0 per entry; optional)
1442
+ # RA - reference author(s) (>=1 per entry)
1443
+ # RT - reference title (>=0 per entry; optional)
1444
+ # RL - reference location (>=1 per entry)
1445
+ # RG - reference group(s)
1446
+ # CC - comments or notes (>=0 per entry; optional)
1447
+ # DR - database cross-references (>=0 per entry; optional)
1448
+ # KW - keywords (>=1 per entry)
1449
+ # FT - feature table data (>=0 per entry; optional)
1450
+ # SQ - sequence header (1 per entry)
1451
+ # - (blanks) The sequence data (>=1 per entry)
1452
+ # // - termination line (ends each entry; 1 per entry)
1453
+ # ---- --------------------------- --------------------------------
1454
+
1455
+