bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,446 @@
1
+ #
2
+ # = bio/db/embl/embl.rb - EMBL database class
3
+ #
4
+ #
5
+ # Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: embl.rb,v 1.25 2005/11/02 07:30:14 nakao Exp $
9
+ #
10
+ # == EMBL database entry
11
+ #
12
+ #
13
+ #
14
+ # == Example
15
+ #
16
+ # emb = Bio::EMBL.new($<.read)
17
+ # emb.entry_id
18
+ # emb.each_cds do |cds|
19
+ # cds
20
+ # end
21
+ # emb.seq
22
+ #
23
+ #--
24
+ #
25
+ # This library is free software; you can redistribute it and/or
26
+ # modify it under the terms of the GNU Lesser General Public
27
+ # License as published by the Free Software Foundation; either
28
+ # version 2 of the License, or (at your option) any later version.
29
+ #
30
+ # This library is distributed in the hope that it will be useful,
31
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
32
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
33
+ # Lesser General Public License for more details.
34
+ #
35
+ # You should have received a copy of the GNU Lesser General Public
36
+ # License along with this library; if not, write to the Free Software
37
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38
+ #
39
+ #++
40
+ #
41
+
42
+ require 'bio/db'
43
+ require 'bio/db/embl/common'
44
+
45
+ module Bio
46
+ class EMBL < EMBLDB
47
+ include Bio::EMBLDB::Common
48
+
49
# Returns the parsed contents of the ID line.
#
# * Bio::EMBL#id_line -> <ID Hash>
#   where <ID Hash> is:
#     {'ENTRY_NAME' => String, 'DATA_CLASS' => String,
#      'MOLECULE_TYPE' => String, 'DIVISION' => String,
#      'SEQUENCE_LENGTH' => Int}
# * Bio::EMBL#id_line(key) -> value stored under +key+ (nil for an unknown key)
#
# The Hash is built on the first call and cached in @data['ID'].
#
# ID Line
#   "ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."
#
# DATA_CLASS = ['standard']
#
# MOLECULE_TYPE: DNA RNA XXX
#
# Code ( DIVISION )
#   EST (ESTs)
#   PHG (Bacteriophage)
#   FUN (Fungi)
#   GSS (Genome survey)
#   HTC (High Throughput cDNAs)
#   HTG (HTGs)
#   HUM (Human)
#   INV (Invertebrates)
#   ORG (Organelles)
#   MAM (Other Mammals)
#   VRT (Other Vertebrates)
#   PLN (Plants)
#   PRO (Prokaryotes)
#   ROD (Rodents)
#   SYN (Synthetic)
#   STS (STSs)
#   UNC (Unclassified)
#   VRL (Viruses)
#
def id_line(key=nil)
  @data['ID'] ||= begin
    # The ID text is cut on "; " into four fields; the first field carries
    # both the entry name and the data class.
    fields = fetch('ID').split(/; +/)
    entry_name, data_class = fields[0].split(/ +/)
    {
      'ENTRY_NAME'      => entry_name,
      'DATA_CLASS'      => data_class,
      'MOLECULE_TYPE'   => fields[1],
      'DIVISION'        => fields[2],
      # "1859 BP." -> 1859
      'SEQUENCE_LENGTH' => fields[3].strip.split(' ').first.to_i
    }
  end

  key ? @data['ID'][key] : @data['ID']
end
100
+
101
# Returns the ENTRY_NAME field of the ID line.
# * Bio::EMBL#entry -> String
#
# Also available as #entry_name and #entry_id.
def entry
  id_line('ENTRY_NAME')
end
alias entry_name entry
alias entry_id entry
108
+
109
# Returns the MOLECULE_TYPE field of the ID line.
# * Bio::EMBL#molecule -> String
#
# Also available as #molecule_type.
def molecule
  id_line('MOLECULE_TYPE')
end
alias molecule_type molecule
115
+
116
# Returns the DIVISION field of the ID line (see the division codes
# listed with #id_line).
# * Bio::EMBL#division -> String
def division
  id_line('DIVISION')
end
121
+
122
# Returns the SEQUENCE_LENGTH field of the ID line.
# * Bio::EMBL#sequence_length -> Int  (#id_line stores it via to_i)
#
# Also available as #seqlen.
def sequence_length
  id_line('SEQUENCE_LENGTH')
end
alias seqlen sequence_length
128
+
129
+
130
+ # AC Line
131
+ # "AC A12345; B23456;"
132
+
133
+
134
# Returns the contents of the sequence version (SV) line with the first
# ';' removed.
# * Bio::EMBL#sv      -> "Accession.Version" as a String
# * Bio::EMBL#version -> the part after the '.' as an Int
#
# SV Line; sequence version (1/entry)
#   SV Accession.Version
def sv
  raw = field_fetch('SV')
  raw.sub(/;/,'')
end
143
# Returns the version number from the SV line, i.e. the second
# '.'-separated part of #sv converted with to_i (0 when there is none).
def version
  parts = sv.split(".")
  parts[1].to_i
end
146
+
147
+
148
# Returns the contents of the date (DT) lines.
# * Bio::EMBL#dt      -> <DT Hash>
#   where <DT Hash> is:
#     {'created' => String, 'updated' => String}
# * Bio::EMBL#dt(key) -> String
#   keys: 'created' and 'updated'
#
# The first DT line is stored under 'created', the second under 'updated';
# the two-letter line code is removed and the rest is stripped.
# The Hash is cached in @data['DT'].
#
# DT Line; date (2/entry)
def dt(key=nil)
  @data['DT'] ||= begin
    rows = self.get('DT').split(/\n/)
    {
      'created' => rows[0].sub(/\w{2} /,'').strip,
      'updated' => rows[1].sub(/\w{2} /,'').strip
    }
  end

  key ? @data['DT'][key] : @data['DT']
end
170
+
171
+
172
+
173
+ ##
174
+ # DE Line; description (>=1)
175
+ #
176
+
177
+
178
+ ##
179
+ # KW Line; keyword (>=1)
180
+ # KW [Keyword;]+
181
+ #
182
+ # Bio::EMBLDB#kw -> Array
183
+ # #keywords -> Array
184
+
185
+
186
+ ##
187
+ # OS Line; organism species (>=1)
188
+ # OS Genus species (name)
189
+ # "OS Trifolium repens (white clover)"
190
+ #
191
+ # Bio::EMBLDB#os -> Array
192
+
193
+
194
+ ##
195
+ # OC Line; organism classification (>=1)
196
+ #
197
+ # Bio::EMBLDB#oc -> Array
198
+
199
+
200
+ ##
201
+ # OG Line; organella (0 or 1/entry)
202
+ # ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
203
+ # or a plasmid name (e.g. "Plasmid pBR322").
204
+ #
205
+ # Bio::EMBLDB#og -> String
206
+
207
+
208
+ ##
209
+ # R Lines
210
+ # RN RC RP RX RA RT RL
211
+ #
212
+ # Bio::EMBLDB#ref
213
+
214
+
215
+ ##
216
+ # DR Line; database cross-reference (>=0)
217
+ # "DR database_identifier; primary_identifier; secondary_identifier."
218
+ #
219
+ # Bio::EMBLDB#dr
220
+
221
+
222
# Returns the feature table header: whatever fetch('FH') yields for the
# feature header (FH) lines.
#
# FH Line; feature table header (0 or 2)
def fh
  fetch('FH')
end
228
+
229
# Returns the contents of the feature table (FT) lines.
# * Bio::EMBL#ft    -> Bio::Features
# * Bio::EMBL#ft {} -> {|Bio::Feature| }
#
# The table is parsed on the first call and cached in @data['FT'].
# With a block, each cached feature is yielded in turn.
#
# same as features method in bio/db/genbank.rb
#
# FT Line; feature table data (>=0)
def ft
  unless @data['FT']
    @data['FT'] = Array.new   # placeholder until parsing has finished
    entries = Array.new       # [ [feature, position, /q="data", ...], ... ]
    quoting = false           # true while inside an unterminated "..." value

    @orig['FT'].each_line do |ft_line|
      next if ft_line =~ /^FEATURES/

      # feature value (position, /qualifier=): 60 characters from offset 20
      value = ft_line[20,60].chomp

      if ft_line =~ /^FT {3}(\S+)/
        # a feature key (source, CDS, ...) opens a new entry
        entries.push([ $1, value ])
      elsif value =~ /^ \// and not quoting
        # a new qualifier: /q="data..., /q=data, /q
        entries.last.push(value)
        quoting = true if value =~ /=" / and value !~ /"$/
      else
        # continuation text: ...data..., ...data..."
        entries.last.last << value
        quoting = false if value =~ /"$/
      end
    end

    parsed = entries.map { |entry| parse_qualifiers(entry) }
    @data['FT'] = Features.new(parsed)
  end

  return @data['FT'] unless block_given?

  @data['FT'].each do |feature|
    yield feature
  end
end
alias features ft
279
+
280
# Yields every feature of the FT lines whose feature key is 'CDS'.
def each_cds
  ft.each do |candidate|
    yield candidate if candidate.feature == 'CDS'
  end
end
288
+
289
# Yields every feature of the FT lines whose feature key is 'gene'.
def each_gene
  ft.each do |candidate|
    yield candidate if candidate.feature == 'gene'
  end
end
297
+
298
+
299
# Returns the comment text: whatever get('CC') yields for the
# comments (CC) lines.
#
# CC Line; comments or notes (>=0)
def cc
  get('CC')
end
305
+
306
+
307
+ ##
308
+ # XX Line; spacer line (many)
309
+ # def nxx
310
+ # end
311
+
312
+
313
# Returns the base counts given in the sequence header (SQ) line.
# * Bio::EMBL#sq -> <SQ Hash>
#   where <SQ Hash> is:
#     {'ntlen' => Int, 'other' => Int,
#      'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
# * Bio::EMBL#sq(base) -> <base content in Int>
# * Bio::EMBL#sq[base] -> <base content in Int>
#
# +base+ is downcased before the lookup, so 'A' and 'a' are equivalent.
# When the SQ text does not match the expected layout every count is 0.
# The Hash is cached in @data['SQ'].
#
# SQ Line; sequence header (1/entry)
#   SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
def sq(base = nil)
  unless @data['SQ']
    hit = /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/.match(fetch('SQ'))
    # capture groups 1..6; a missing match yields zeros, like nil.to_i
    ntlen, a, c, g, t, other = (1..6).map { |i| hit ? hit[i].to_i : 0 }
    @data['SQ'] = {'ntlen' => ntlen, 'other' => other,
                   'a' => a, 'c' => c, 'g' => g, 't' => t}
  end

  base ? @data['SQ'][base.downcase] : @data['SQ']
end
339
+
340
+
341
# Returns the nucleotide sequence of this entry.
# * Bio::EMBL#seq -> Bio::Sequence::NA
#
# The text fetched under the empty tag ('') is used as the sequence;
# blanks and runs of digits are removed before the object is built.
#
# @orig[''] as sequence
# bb Line; (blanks) sequence data (>=1)
#
# Also available as #naseq and #ntseq.
def seq
  letters = fetch('').gsub(/ /,'').gsub(/\d+/,'')
  Sequence::NA.new(letters)
end
alias naseq seq
alias ntseq seq
351
+
352
+ # // Line; termination line (end; 1/entry)
353
+
354
+
355
+ ### private methods
356
+
357
+ private
358
+
359
##
# Builds a Feature from one raw feature-table entry.
#
# +ary+ is [ feature_key, position, qualifier_text, ... ]; it is consumed
# (shift is called twice).  Whitespace is removed from the position.
# Each remaining element of the form /name, /name=value or /name="value"
# becomes a Feature::Qualifier:
# * an empty value is stored as +true+
# * 'translation' is stored as a Sequence::AA with whitespace removed
# * 'codon_start' is stored as an Int
# Elements that do not match are ignored.
#
# same as Bio::GenBank#parse_qualifiers(feature)
def parse_qualifiers(ary)
  feature = Feature.new
  feature.feature  = ary.shift
  feature.position = ary.shift.gsub(/\s/, '')

  ary.each do |text|
    next unless text =~ %r{/([^=]+)=?"?([^"]*)"?}
    qualifier = $1
    value     = $2
    value = true if value.empty?

    if qualifier == 'translation'
      value = Sequence::AA.new(value.gsub(/\s/, ''))
    elsif qualifier == 'codon_start'
      value = value.to_i
    end

    feature.append(Feature::Qualifier.new(qualifier, value))
  end

  feature
end
388
+
389
+ end
390
+
391
+ end
392
+
393
+
394
# Demo driver, run only when this file is executed directly: reads entries
# from ARGF one at a time (separated by Bio::EMBL::RS), parses each with
# Bio::EMBL and prints the result of every accessor.
if __FILE__ == $0
  while ent = $<.gets(Bio::EMBL::RS)
    puts "\n ==> e = Bio::EMBL.new(ent) "
    e = Bio::EMBL.new(ent)

    puts "\n ==> e.entry_id "
    p e.entry_id
    puts "\n ==> e.id_line "
    p e.id_line
    # 'MOLECULE_TYPE' is the key id_line actually stores; the former
    # 'molecule' key does not exist and always printed nil.
    puts "\n ==> e.id_line('MOLECULE_TYPE') "
    p e.id_line('MOLECULE_TYPE')
    puts "\n ==> e.molecule "
    p e.molecule
    puts "\n ==> e.ac "
    p e.ac
    puts "\n ==> e.sv "
    p e.sv
    puts "\n ==> e.dt "
    p e.dt
    puts "\n ==> e.dt('created') "
    p e.dt('created')
    puts "\n ==> e.de "
    p e.de
    puts "\n ==> e.kw "
    p e.kw
    puts "\n ==> e.os "
    p e.os
    puts "\n ==> e.oc "
    p e.oc
    puts "\n ==> e.og "
    p e.og
    puts "\n ==> e.ref "
    p e.ref
    puts "\n ==> e.dr "
    p e.dr
    puts "\n ==> e.ft "
    p e.ft
    puts "\n ==> e.each_cds {|c| p c}"
    p e.each_cds {|c| p c }
    puts "\n ==> e.sq "
    p e.sq
    puts "\n ==> e.sq('a') "
    p e.sq('a')
    puts "\n ==> e.gc"
    p e.gc
    puts "\n ==> e.seq "
    p e.seq
  end

end
444
+
445
+
446
+
@@ -0,0 +1,954 @@
1
+ #
2
+ # = bio/db/embl/sptr.rb - UniProt/SwissProt and TrEMBL database class
3
+ #
4
+ # Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: sptr.rb,v 1.29 2005/11/02 07:30:14 nakao Exp $
8
+ #
9
+ # == UniProtKB/SwissProt and TrEMBL
10
+ #
11
+ # See the SWISS-PROT document file SPECLIST.TXT.
12
+ #
13
+ # == Example
14
+ #
15
+ #--
16
+ #
17
+ # This library is free software; you can redistribute it and/or
18
+ # modify it under the terms of the GNU Lesser General Public
19
+ # License as published by the Free Software Foundation; either
20
+ # version 2 of the License, or (at your option) any later version.
21
+ #
22
+ # This library is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25
+ # Lesser General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU Lesser General Public
28
+ # License along with this library; if not, write to the Free Software
29
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30
+ #
31
+ #++
32
+ #
33
+
34
+ require 'bio/db'
35
+ require 'bio/db/embl/common'
36
+
37
+ module Bio
38
+
39
+ # Parser class for UniProtKB/SwissProt and TrEMBL database entry
40
+ class SPTR < EMBLDB
41
+ include Bio::EMBLDB::Common
42
+
43
+ @@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
44
+ @@data_class = ["STANDARD", "PRELIMINARY"]
45
+
46
+
47
# Returns a Hash of the ID line, or the single value stored under +key+
# (Int or String) when a key is given.
# Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
#
# The raw @orig['ID'] text is split on runs of blanks; words 1..4 supply
# the four values, with the first ';' removed from DATA_CLASS and
# MOLECULE_TYPE and SEQUENCE_LENGTH converted with to_i.
# The Hash is cached in @data['ID'].
#
# ID Line
#   "ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
#
# ENTRY_NAME := "#{X}_#{Y}"
#   X =~ /[A-Z0-9]{1,5}/ # The protein name.
#   Y =~ /[A-Z0-9]{1,5}/ # The biological source of the protein.
# MOLECULE_TYPE := 'PRT' =~ /\w{3}/
# SEQUENCE_LENGTH =~ /\d+ AA/
def id_line(key = nil)
  @data['ID'] ||= begin
    words = @orig['ID'].split(/ +/)   # words[0] is the "ID" line code
    {
      'ENTRY_NAME'      => words[1],
      'DATA_CLASS'      => words[2].sub(/;/,''),
      'MOLECULE_TYPE'   => words[3].sub(/;/,''),
      'SEQUENCE_LENGTH' => words[4].to_i
    }
  end

  key ? @data['ID'][key] : @data['ID']
end
76
+
77
+
78
+
79
# Returns the ENTRY_NAME of the ID line.
#
# A short-cut for Bio::SPTR#id_line('ENTRY_NAME').
# Also available as #entry_name and #entry.
def entry_id
  id_line('ENTRY_NAME')
end
alias entry_name entry_id
alias entry entry_id
87
+
88
+
89
# Returns the MOLECULE_TYPE of the ID line.
#
# A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
# Also available as #molecule_type.
def molecule
  id_line('MOLECULE_TYPE')
end
alias molecule_type molecule
96
+
97
+
98
# Returns the SEQUENCE_LENGTH of the ID line (an Int).
#
# A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
# Also available as #aalen.
def sequence_length
  id_line('SEQUENCE_LENGTH')
end
alias aalen sequence_length
105
+
106
+
107
+ # Bio::EMBLDB::Common#ac -> ary
108
+ # #accessions -> ary
109
+ # #accession -> String (accessions.first)
110
+ @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
111
+
112
+
113
+
114
# Returns a Hash of the information in the DT lines, or the String stored
# under +key+ when a key is given.
# hash keys:
#   ['created', 'sequence', 'annotation']
#
# The first, second and third DT lines are stored under 'created',
# 'sequence' and 'annotation' respectively; the two-letter line code is
# removed and the rest is stripped.  The Hash is cached in @data['DT'].
#
# DT Line; date (3/entry)
#   DT DD-MMM-YYY (rel. NN, Created)
#   DT DD-MMM-YYY (rel. NN, Last sequence update)
#   DT DD-MMM-YYY (rel. NN, Last annotation update)
def dt(key = nil)
  @data['DT'] ||= begin
    rows = self.get('DT').split(/\n/)
    {
      'created'    => rows[0].sub(/\w{2} /,'').strip,
      'sequence'   => rows[1].sub(/\w{2} /,'').strip,
      'annotation' => rows[2].sub(/\w{2} /,'').strip
    }
  end

  key ? @data['DT'][key] : @data['DT']
end
142
+
143
+
144
# Returns the proposed official name of the protein.
#
# The name is the DE text up to the first '(' (and never beyond the first
# '[', which starts the "contains" part), stripped.  ' (Fragment)' is
# appended when the text before the '[' mentions "fragment" in any case.
# Returns "" when fetch('DE') yields nothing.
#
# DE Line; description (>=1)
#   "DE #{OFFICIAL_NAME} (#{SYNONYM})"
#   "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
#   OFFICIAL_NAME 1/entry
#   SYNONYM       >=0
#   CONTAINS      >=0
def protein_name
  de_line = fetch('DE')
  return "" unless de_line

  head = de_line[/^[^\[]*/]       # everything preceding the first [
  name = head[/^[^(]*/].strip     # everything preceding the first (
  name << ' (Fragment)' if head =~ /fragment/i
  name
end
161
+
162
+
163
# Returns an Array of synonyms (unofficial names).
#
# Synonyms are each placed in () following the official name on the DE
# line.  Text between '[' and ']' (the "contains" part) is dropped first,
# and any parenthesised text mentioning "fragment" is not a synonym.
# Returns an empty Array when fetch('DE') yields nothing.
def synonyms
  de_line = fetch('DE')
  return Array.new unless de_line

  body = de_line.sub(/\[.*\]/,'')
  # each hit is "(" plus the text up to, not including, the next ")"
  body.scan(/\([^)]+/).reject { |hit|
    hit =~ /fragment/i
  }.map { |hit|
    hit[1..-1].strip   # drop the leading (
  }
end
178
+
179
+
180
# Returns the gene names in the GN line.
#
# The result is cached in @data['GN']; on the first call the GN text is
# handed to one of two parsers depending on whether it contains "Name=".
#
# New UniProt/SwissProt format:
# * Bio::SPTR#gn -> [ <gene record>* ]
#   where <gene record> is:
#     { :name     => '...',
#       :synonyms => [ 's1', 's2', ... ],
#       :loci     => [ 'l1', 'l2', ... ],
#       :orfs     => [ 'o1', 'o2', ... ]
#     }
#
# Old format:
# * Bio::SPTR#gn    -> Array # AND
# * Bio::SPTR#gn[0] -> Array # OR
#
# GN Line: Gene name(s) (>=0, optional)
def gn
  cached = @data['GN']
  return cached if cached

  if /Name=/ === fetch('GN')
    gn_uniprot_parser
  else
    gn_old_parser
  end
end
206
+
207
# Parses the old style GN line, stores the result in @data['GN'] and
# returns it.
#
# GN Line: Gene name(s) (>=0, optional)
#   GN HNS OR DRDX OR OSMZ OR BGLY.
#   GN CECA1 AND CECA2.
#   GN CECA1 AND (HOGE OR FUGA).
#
#   GN NAME1 [(AND|OR) NAME]+.
#
# The text is cut on " AND " into groups and each group, with parentheses
# removed, on " OR " into names.  Without any GN line the result is [].
#
# Bio::SPTR#gn -> Array        # AND
#           #gn[0] -> Array    # OR
#           #gene_names -> Array
def gn_old_parser
  groups = Array.new
  if get('GN').size > 0
    groups = fetch('GN').sub(/\.$/,'').split(/ AND /).map do |group|
      group.gsub(/\(|\)/,'').split(/ OR /).map { |name| name.strip }
    end
  end
  @data['GN'] = groups
end
private :gn_old_parser
231
+
232
# Parses the structured GN line, stores the result in @data['GN'] and
# returns it.
#
# The new format of the GN line is:
#   GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
#   GN ORFNames=[, ...];
# with the records of several genes separated by the word "and".
#
# * Bio::SPTR#gn -> [ <gene record>* ]
#   where <gene record> is:
#     { :name     => '...',
#       :synonyms => [ 's1', 's2', ... ],
#       :loci     => [ 'l1', 'l2', ... ],
#       :orfs     => [ 'o1', 'o2', ... ]
#     }
def gn_uniprot_parser
  @data['GN'] = Array.new
  gn_line = fetch('GN').strip
  # Split only on the standalone word "and"; a bare /and/ would also cut
  # gene names that merely contain those letters (e.g. "hand2").
  records = gn_line.split(/\s*\band\b\s*/)
  records.each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    # each_line(';') yields "Key=value;" chunks (String#each no longer
    # exists in Ruby >= 1.9).
    record.each_line(';') do |element|
      case element
      when /Name=/ then
        gene_hash[:name] = $'.strip.chomp(';')
      when /Synonyms=/ then
        gene_hash[:synonyms] = $'.strip.chomp(';').split(/\s*,\s*/)
      when /OrderedLocusNames=/ then
        gene_hash[:loci] = $'.strip.chomp(';').split(/\s*,\s*/)
      when /ORFNames=/ then
        gene_hash[:orfs] = $'.strip.chomp(';').split(/\s*,\s*/)
      end
    end
    @data['GN'] << gene_hash
  end
  return @data['GN']
end
private :gn_uniprot_parser
267
+
268
+
269
# Returns an Array of gene names from the GN line.
#
# * structured (UniProt) format: the :name of every gene record
# * old format: the first AND group, i.e. the Array of OR alternatives
# * no gene names at all: an empty Array
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  if records.first.class == Hash then
    records.collect { |element| element[:name] }
  else
    # records.first is nil when nothing was parsed; still return an Array
    records.first || []
  end
end
278
+
279
+
280
# Returns the first gene name in the GN line as a String, or nil when
# #gene_names has nothing to offer.
def gene_name
  names = gene_names
  # gene_names may yield nil for an entry without gene names
  names ? names.first : nil
end
284
+
285
+
286
# Returns an Array of Hashes for the OS line, or a String for one
# organism when an index is given.
# * Bio::SPTR#os -> Array
#     [{'name' => '(Human)', 'os' => 'Homo sapiens'},
#      {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
# * Bio::SPTR#os[0] -> Hash
#     {'name' => "(Human)", 'os' => 'Homo sapiens'}
# * Bio::SPTR#os[0]['name'] -> "(Human)"
# * Bio::SPTR#os(0) -> "Homo sapiens (Human)"
#
# The OS text is cut on ", and" / ", " into organisms.  'os' is the
# species text, 'name' the parenthesised part (nil when there is none).
# Raises a RuntimeError when an organism does not look like a species
# name.  The Array is cached in @data['OS'].
#
# OS Line; organism species (>=1)
#   OS Genus species (name).
#   OS Genus species (name0) (name1).
#   OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
#   OS Homo sapiens (Human), and Rattus norvegicus (Rat)
def os(num = nil)
  unless @data['OS']
    organisms = fetch('OS').split(/, and|, /).map do |part|
      unless part =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
        raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
      end
      species = $1
      {'name' => part[/(\(.+\))/, 1], 'os' => species}
    end
    @data['OS'] = organisms
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
end
323
+
324
+
325
+ # Bio::EMBLDB::Common#og -> Array
326
+ # OG Line; organella (0 or 1/entry)
327
+ # ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
328
+ # or a plasmid name (e.g. "Plasmid pBR322").
329
+
330
+
331
+ # Bio::EMBLDB::Common#oc -> Array
332
+ # OC Line; organism classification (>=1)
333
+ # "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
334
+ # "OC Theileria."
335
+
336
+
337
+
338
# Returns a Hash of organism taxonomy cross-references.
# * Bio::SPTR#ox -> Hash
#     {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
#
# Each ';'-separated item of the OX text is split at '=' into a database
# name and a ', '-separated list of identifiers (kept as Strings).
# The Hash is cached in @data['OX'].
#
# OX Line; organism taxonomy cross-reference (>=1 per entry)
#   OX NCBI_TaxID=1234;
#   OX NCBI_TaxID=1234, 2345, 3456, 4567;
def ox
  @data['OX'] ||= begin
    items = fetch('OX').sub(/\.$/,'').split(/;/)
    items.inject(Hash.new) do |table, item|
      db, refs = item.strip.split(/=/)
      table[db] = refs.split(/, */)
      table
    end
  end
end
357
+
358
+
359
+ # Bio::EMBLDB::Common#ref -> Array
360
+ # R Lines
361
+ # RN RC RP RX RA RT RL
362
+
363
+
364
+ @@cc_topics = ['ALTERNATIVE PRODUCTS','CATALYTIC ACTIVITY','CAUTION',
365
+ 'COFACTOR','DATABASE','DEVELOPMENTAL STAGE','DISEASE','DOMAIN',
366
+ 'ENZYME REGULATION','FUNCTION','INDUCTION','MASS SPECTROMETRY',
367
+ 'MISCELLANEOUS','PATHWAY','PHARMACEUTICAL','POLYMORPHISM','PTM',
368
+ 'SIMILARITY','SUBCELLULAR LOCATION','SUBUNIT','TISSUE SPECIFICITY']
369
+ # returns contents in the CC lines.
370
+ # * Bio::SPTR#cc -> Hash
371
+
372
+ # * Bio::SPTR#cc(Int) -> String
373
+ # returns an Array of contents in the TOPIC string.
374
+ # * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
375
+ #
376
+ # returns contents of the "ALTERNATIVE PRODUCTS".
377
+ # * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
378
+ # {'Event' => str,
379
+ # 'Named isoforms' => int,
380
+ # 'Comment' => str,
381
+ # 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
382
+ #
383
+ # CC -!- ALTERNATIVE PRODUCTS:
384
+ # CC Event=Alternative splicing; Named isoforms=15;
385
+ # ...
386
+ # CC placentae isoforms. All tissues differentially splice exon 13;
387
+ # CC Name=A; Synonyms=no del;
388
+ # CC IsoId=P15529-1; Sequence=Displayed;
389
+ #
390
+ # returns contents of the "DATABASE".
391
+ # * Bio::SPTR#cc('DATABASE') -> Array
392
+ # [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
393
+ #
394
+ # CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
395
+ #
396
+ # returns contents of the "MASS SPECTROMETRY".
397
+ # * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
398
+ # [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
399
+ #
400
+ # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
401
+ #
402
+ # CC lines (>=0, optional)
403
+ # CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
404
+ # CC IN LIVER, KIDNEY, LUNG AND BRAIN.
405
+ #
406
+ # CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
407
+ # CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
408
def cc(tag = nil)
  # Returns the contents of the CC (comment) lines.
  #
  #   cc                          -> Hash: topic => Array of comment strings
  #   cc('ALTERNATIVE PRODUCTS')  -> Hash with the keys 'Event',
  #                                  'Named isoforms', 'Comment', 'Variants'
  #   cc('DATABASE')              -> Array of Hash with the keys 'NAME',
  #                                  'NOTE', 'WWW', 'FTP' (nil if absent)
  #   cc('MASS SPECTROMETRY')     -> Array of Hash with the keys 'MW',
  #                                  'MW_ERR', 'METHOD', 'RANGE' (nil if absent)
  #   cc('INTERACTION')           -> result of cc_interaction_parse
  #   cc(other_topic)             -> Array of comment strings, or nil
  #
  # The historical misspelling 'MASS SPECTOROMETRY' is still accepted as a
  # tag and is treated as 'MASS SPECTROMETRY'.
  #
  # An entry without CC lines yields an empty Hash whatever the tag.
  # Raises RuntimeError when a comment block does not start with "TOPIC: ".
  unless @data['CC']
    topics = Hash.new
    # Everything after a run of 74 dashes is dropped
    # (presumably the trailing notice block -- TODO confirm).
    footer = '-' * (77 - 4 + 1)
    dlm = /-!- /

    return topics if get('CC').size == 0 # 12KD_MYCSM has no CC lines.

    begin
      fetch('CC').split(/#{footer}/)[0].sub(dlm,'').split(dlm).each do |tmp|
        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          # Re-join words hyphenated across a line break, except "- AND".
          body = $2.gsub(/- (?!AND)/,'-')
          (topics[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '',
                 tmp, '', '', fetch('CC'),''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so it is handled here too
      # (e.g. when the split above yields nil).
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = topics
  end

  case tag
  when 'ALTERNATIVE PRODUCTS'
    # Array#join rather than Array#to_s: since Ruby 1.9 to_s returns the
    # inspect form (brackets, quotes, escapes) instead of the joined text.
    ap = Array(@data['CC']['ALTERNATIVE PRODUCTS']).join

    # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
    products = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants' => []}
    products['Event']          = $1 if /Event=(.+?);/ =~ ap
    products['Named isoforms'] = $1 if /Named isoforms=(\S+?);/ =~ ap
    products['Comment']        = $1 if /Comment=(.+?);/m =~ ap
    ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
      products['Variants'] << cc_ap_variants_parse(ent)
    end
    return products

  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries = @data['CC']['DATABASE']
    return entries unless entries

    parsed = entries.map do |e|
      fields = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # sub(/.$/, '') removes the final character (the closing ".").
      e.sub(/.$/,'').split(/;/).each do |item|
        case item
        when /NAME=(.+)/
          fields['NAME'] = $1
        when /NOTE=(.+)/
          fields['NOTE'] = $1
        when /WWW="(.+)"/
          fields['WWW'] = $1
        when /FTP="(.+)"/
          fields['FTP'] = $1
        end
      end
      fields
    end
    return parsed

  when 'MASS SPECTROMETRY', 'MASS SPECTOROMETRY'
    # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
    measurements = @data['CC']['MASS SPECTROMETRY']
    return measurements unless measurements

    parsed = measurements.map do |m|
      mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
      m.sub(/.$/,'').split(/;/).each do |item|
        case item
        when /MW=(.+)/
          mass['MW'] = $1.to_f
        when /MW_ERR=(.+)/
          mass['MW_ERR'] = $1.to_f
        when /METHOD="?(.+?)"?$/
          # The documented format carries no quotes; they are optional here.
          mass['METHOD'] = $1.to_s
        when /RANGE="?(\d+-\d+)"?/
          mass['RANGE'] = $1 # RANGE class ?
        end
      end
      mass
    end
    return parsed

  when 'INTERACTION'
    return cc_interaction_parse(Array(@data['CC']['INTERACTION']).join)

  when nil
    return @data['CC']

  else
    return @data['CC'][tag]
  end
end
525
+
526
+
527
+
528
def cc_ap_variants_parse(ent)
  # Turns one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" record of
  # the ALTERNATIVE PRODUCTS comment into a Hash of field name => value.
  # The 'Sequence' value is returned as an Array split on ", ".
  variant = {}
  ent.split(/; /).each do |field|
    name, value = field.split(/=/)
    if name == 'Sequence'
      value = value.sub(/;/,'').split(/, /)
    end
    variant[name] = value
  end
  return variant
end
private :cc_ap_variants_parse
540
+
541
+
542
+ # returns the contents of a line of the CC INTERACTION section.
543
+ #
544
+ # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
545
def cc_interaction_parse(str)
  # Extracts every "partner; NbExp=n; IntAct=acc, acc;" record from str and
  # returns an Array of Hashes with the keys :partner_id, :nbexp (String)
  # and :intact_acc (Array of accession strings).
  record = /(.+?); NbExp=(.+?); IntAct=(.+?);/
  str.scan(record).collect do |partner, nbexp, accessions|
    {
      :partner_id => partner.strip,
      :nbexp      => nbexp.strip,
      :intact_acc => accessions.split(', ')
    }
  end
end
private :cc_interaction_parse
554
+
555
+ # returns databases cross-references in the DR lines.
556
+ # * Bio::EMBLDB#dr -> Hash w/in Array
557
+ #
558
+ # DR Line; database cross-reference (>=0)
558
+ # one cross-reference per line
560
+ # DR database_identifier; primary_identifier; secondary_identifier.
561
# Database identifiers that may appear as the first field of a DR line.
# 'MGD' and 'MIM' are separate entries: without the comma between them the
# adjacent string literals were concatenated into the single name 'MGDMIM'.
@@dr_database_identifier = [
  'EMBL', 'CARBBANK', 'DICTYDB', 'ECO2DBASE',
  'ECOGENE',
  'FLYBASE', 'GCRDB', 'HIV', 'HSC-2DPAGE', 'HSSP', 'INTERPRO', 'MAIZEDB',
  'MAIZE-2DPAGE', 'MENDEL', 'MGD', 'MIM', 'PDB', 'PFAM', 'PIR', 'PRINTS',
  'PROSITE', 'REBASE', 'AARHUS/GHENT-2DPAGE', 'SGD', 'STYGENE', 'SUBTILIST',
  'SWISS-2DPAGE', 'TIGR', 'TRANSFAC', 'TUBERCULIST', 'WORMPEP', 'YEPD', 'ZFIN'
]
567
+
568
+ # Bio::EMBLDB::Common#kw - Array
569
+ # #keywords -> Array
570
+ #
571
+ # KW Line; keyword (>=1)
572
+ # KW [Keyword;]+
573
+
574
+
575
+ # returns the contents of the feature table.
576
+ # * Bio::SPTR#ft -> Hash
577
+ # {'feature_name' => [{'From' => int, 'To' => int,
577
+ # 'Description' => str, 'diff' => ary, 'FTId' => str}],...}
579
+ #
580
+ # returns an Array of the information about the feature_name in the feature table.
581
+ # * Bio::SPTR#ft(feature_name) -> Array of Hash
582
+ # [{'From' => int, 'To' => int, 'Description' => str, 'diff' => ary, 'FTId' => str},...]
583
+ #
584
+ # FT Line; feature table data (>=0, optional)
585
+ #
586
+ # Col Data item
587
+ # ----- -----------------
588
+ # 1- 2 FT
589
+ # 6-13 Feature name
590
+ # 15-20 `FROM' endpoint
591
+ # 22-27 `TO' endpoint
592
+ # 35-75 Description (>=0 per key)
593
+ # ----- -----------------
594
+ def ft(feature_name = nil) # Parses the FT lines once (cached in @data['FT']); returns the whole table, or only the entries stored under feature_name when it is given.
594
+ unless @data['FT'] # parse only on the first call
595
+ table = Hash.new() # feature name => Array of feature Hashes
596
+ last_feature = nil # name of the feature started by the most recent non-continuation line
597
+
598
+ begin
599
+ get('FT').split(/\n/).each {|line| # one physical FT line at a time
600
+
601
+ feature = line[5..12].strip # feature name field (columns 6-13)
602
+
603
+ if feature == '' and line[34..74] # continuation line: no feature name, but description text present
604
+ tmp = ' ' + line[34..74].strip
605
+ table[last_feature].last['Description'] << tmp # append to the description of the feature being continued
606
+
607
+ next unless /\.$/ =~ line # only a continuation line ending with '.' falls through to the diff handling below
608
+ else # first line of a feature
609
+ from = line[14..19].strip # `FROM' endpoint (columns 15-20)
610
+ to = line[21..26].strip # `TO' endpoint (columns 22-27)
611
+ desc = line[34..74].strip if line[34..74] # description (columns 35-75); nil when the line is too short
612
+
613
+ table[feature] = [] unless table[feature]
614
+ table[feature] << {
615
+ 'From' => from.to_i, # endpoints are stored as integers
616
+ 'To' => to.to_i,
617
+ 'Description' => desc,
618
+ 'diff' => [], # replaced by [original, swap] further down for variant-like features
619
+ 'FTId' => nil }
620
+ last_feature = feature
621
+ next # NOTE(review): a feature's first line never reaches the diff handling below, so one-line VARSPLIC/VARIANT/CONFLICT entries keep 'diff' == [] and 'FTId' == nil -- confirm this is intended
622
+ end
623
+
624
+ case last_feature
625
+ when 'VARSPLIC', 'VARIANT', 'CONFLICT' # features whose description is parsed for a sequence change
626
+ if /FTId=(.+?)\./ =~ line # version 41 >
627
+ ftid = $1
628
+ table[last_feature].last['FTId'] = ftid
629
+ table[last_feature].last['Description'].sub!(/ \/FTId=#{ftid}./,'')
630
+ end
631
+
632
+ case table[last_feature].last['Description']
633
+ when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/ # "ORIGINAL -> REPLACEMENT" (also matches "- >")
634
+ original = $1
635
+ swap = $2
636
+ original = original.gsub(/ /,'').strip # drop the spaces inside the captured text
637
+ swap = swap.gsub(/ /,'').strip
638
+ when /Missing/i # deletion: the original residues are read from the sequence, the replacement is empty
639
+ original = seq.subseq(table[last_feature].last['From'],
640
+ table[last_feature].last['To'])
641
+ swap = ''
642
+ else
643
+ raise line # unrecognised description; re-raised with context by the rescue below
644
+ end
645
+ table[last_feature].last['diff'] = [original, swap]
646
+ end
647
+ }
648
+
649
+ rescue # any StandardError raised while parsing is re-raised as a RuntimeError carrying the entry id and the raw FT text
650
+ raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n" +
651
+ "'#{self.get('FT')}'\n"
652
+ end
653
+
654
+ table.each_key do |k| # second pass: tidy up the assembled descriptions in place
655
+ table[k].each do |e|
656
+ if / -> / =~ e['Description'] # remove the spaces inside both sides of the first "X -> Y"
657
+ pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
658
+ e['Description'].sub!(pattern) {
659
+ a = $1
660
+ b = $2
661
+ a.gsub(/ /,'') + " -> " + b.gsub(/ /,'')
662
+ }
663
+ end
664
+ if /- [\w\d]/ =~ e['Description'] # re-join text split as "xxx- yyy"
665
+ e['Description'].gsub!(/([\w\d]- [\w\d]+)/) {
666
+ a = $1
667
+ if /- AND/ =~ a # "- AND" is kept as written
668
+ a
669
+ else
670
+ a.sub(/ /,'')
671
+ end
672
+ }
673
+ end
674
+ end
675
+ end
676
+ @data['FT'] = table # cache the parsed table
677
+ end
678
+
679
+ if feature_name
680
+ @data['FT'][feature_name] # nil when the entry has no such feature
681
+ else
682
+ @data['FT']
683
+ end
684
+ end
686
+
687
+
688
+ # returns a Hash of the contents of the SQ line.
688
+ # * Bio::SPTR#sq -> hsh
690
+ #
691
+ # returns a value of a key given in the SQ lines.
692
+ # * Bio::SPTR#sq(key) -> int or str
693
+ # * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length', 'CRC64']
694
+ #
695
+ # SQ Line; sequence header (1/entry)
696
+ # SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
697
+ # SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
698
+ #
699
+ # MW, Dalton unit.
700
+ # CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
701
def sq(key = nil)
  # Parses the SQ header line once and caches it in @data['SQ'] as
  #   {'aalen' => Integer, 'MW' => Integer, 'CRC64' => String}
  # Without key the whole Hash is returned. A key matching mw / molecular /
  # weight selects 'MW', one matching len / length / AA selects 'aalen',
  # and any other key is looked up in the Hash as given.
  # Raises RuntimeError when the SQ line does not have the expected layout.
  @data['SQ'] ||=
    if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
    else
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end

  header = @data['SQ']
  return header unless key

  if [/mw/, /molecular/, /weight/].any? { |re| re === key }
    header['MW']
  elsif [/len/, /length/, /AA/].any? { |re| re === key }
    header['aalen']
  else
    header[key]
  end
end
723
+
724
+
725
+ # returns a Bio::Sequence::AA of the amino acid sequence.
726
+ # * Bio::SPTR#seq -> Bio::Sequence::AA
727
+ #
728
+ # blank Line; sequence data (>=1)
729
def seq
  # Builds the amino acid sequence from the untagged ('') lines with
  # blanks and digits removed, wraps it in Sequence::AA and caches it
  # in @data['']. Also available as #aaseq.
  @data[''] ||= Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
  return @data['']
end
alias aaseq seq
736
+
737
+ end # class SPTR
738
+
739
+ end # module Bio
740
+
741
+
742
if __FILE__ == $0
  # Self-test driver: reads Swiss-Prot/TrEMBL entries and prints the result
  # of every accessor for each of them.
  #
  # Usage: ruby __FILE__ uniprot_sprot.dat
  # Usage: ruby __FILE__ uniprot_sprot.dat | egrep '^RuntimeError'

  begin
    require 'pp'
    # Route p through pp so the results printed by cmd are pretty-printed.
    # (The alias used to be written the other way round, which replaced pp
    # by p and so had no effect on the output.)
    alias p pp
  rescue LoadError
  end

  # Prints the expression +cmd+, the raw lines for +tag+ (when given) and
  # the value of evaluating +cmd+. A RuntimeError raised by the expression
  # is reported on a line starting with "RuntimeError" instead of aborting.
  # +cmd+ is always one of the literal strings below, never external input.
  def cmd(cmd, tag = nil, ent = $ent)
    puts " ==> #{cmd} "
    puts Bio::SPTR.new(ent).get(tag) if tag
    begin
      p eval(cmd)
    rescue RuntimeError
      puts "RuntimeError(#{Bio::SPTR.new(ent).entry_id}): #{$!} "
    end
    puts
  end


  while $ent = $<.gets(Bio::SPTR::RS)

    cmd "Bio::SPTR.new($ent).entry_id"

    cmd "Bio::SPTR.new($ent).id_line", 'ID'
    cmd "Bio::SPTR.new($ent).entry"
    cmd "Bio::SPTR.new($ent).entry_name"
    cmd "Bio::SPTR.new($ent).molecule"
    cmd "Bio::SPTR.new($ent).sequence_length"

    cmd "Bio::SPTR.new($ent).ac", 'AC'
    cmd "Bio::SPTR.new($ent).accession"


    cmd "Bio::SPTR.new($ent).gn", 'GN'
    cmd "Bio::SPTR.new($ent).gene_name"
    cmd "Bio::SPTR.new($ent).gene_names"

    cmd "Bio::SPTR.new($ent).dt", "DT"
    ['created','annotation','sequence'].each do |key|
      cmd "Bio::SPTR.new($ent).dt('#{key}')"
    end

    cmd "Bio::SPTR.new($ent).de", 'DE'
    cmd "Bio::SPTR.new($ent).definition"
    cmd "Bio::SPTR.new($ent).protein_name"
    cmd "Bio::SPTR.new($ent).synonyms"

    cmd "Bio::SPTR.new($ent).kw", 'KW'

    cmd "Bio::SPTR.new($ent).os", 'OS'

    cmd "Bio::SPTR.new($ent).oc", 'OC'

    cmd "Bio::SPTR.new($ent).og", 'OG'

    cmd "Bio::SPTR.new($ent).ox", 'OX'

    cmd "Bio::SPTR.new($ent).ref", 'R'

    cmd "Bio::SPTR.new($ent).cc", 'CC'
    cmd "Bio::SPTR.new($ent).cc('ALTERNATIVE PRODUCTS')"
    cmd "Bio::SPTR.new($ent).cc('DATABASE')"
    cmd "Bio::SPTR.new($ent).cc('MASS SPECTROMETRY')"

    cmd "Bio::SPTR.new($ent).dr", 'DR'

    cmd "Bio::SPTR.new($ent).ft", 'FT'
    cmd "Bio::SPTR.new($ent).ft['DOMAIN']"

    cmd "Bio::SPTR.new($ent).sq", "SQ"
    cmd "Bio::SPTR.new($ent).seq"
  end

end
819
+
820
+
821
+ =begin
822
+
823
+ = Bio::SPTR < Bio::DB
824
+
825
+ Class for an entry in the SWISS-PROT/TrEMBL database.
826
+
827
+ * ((<URL:http://www.ebi.ac.uk/swissprot/>))
828
+ * ((<URL:http://www.ebi.ac.uk/trembl/>))
829
+ * ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
830
+
831
+
832
+ --- Bio::SPTR.new(a_sp_entry)
833
+
834
+ === ID line (Identification)
835
+
836
+ --- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
837
+ 'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
838
+ --- Bio::SPTR#id_line(key) -> str
839
+
840
+ key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
841
+
842
+ --- Bio::SPTR#entry_id -> str
843
+ --- Bio::SPTR#molecule -> str
844
+ --- Bio::SPTR#sequence_length -> int
845
+
846
+
847
+ === AC lines (Accession number)
848
+
849
+ --- Bio::SPTR#ac -> ary
850
+ --- Bio::SPTR#accessions -> ary
851
+ --- Bio::SPTR#accession -> accessions.first
852
+
853
+
854
+ === GN line (Gene name(s))
855
+
856
+ --- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
857
+ --- Bio::SPTR#gene_name -> str
858
+ --- Bio::SPTR#gene_names -> [str] or [str]
859
+
860
+
861
+ === DT lines (Date)
862
+
863
+ --- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
864
+ --- Bio::SPTR#dt(key) -> str
865
+
866
+ key := (created|annotation|sequence)
867
+
868
+
869
+ === DE lines (Description)
870
+
871
+ --- Bio::SPTR#de -> str
872
+ #definition -> str
873
+
874
+ --- Bio::SPTR#protein_name
875
+
876
+ Returns the proposed official name of the protein
877
+
878
+
879
+ --- Bio::SPTR#synonyms
880
+
881
+ Returns an array of synonyms (unofficial names)
882
+
883
+ === KW lines (Keyword)
884
+
885
+ --- Bio::SPTR#kw -> ary
886
+
887
+ === OS lines (Organism species)
888
+
889
+ --- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
890
+
891
+ === OC lines (organism classification)
892
+
893
+ --- Bio::SPTR#oc -> ary
894
+
895
+ === OG line (Organelle)
896
+
897
+ --- Bio::SPTR#og -> ary
898
+
899
+ === OX line (Organism taxonomy cross-reference)
900
+
901
+ --- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
902
+
903
+ === RN RC RP RX RA RT RL RG lines (Reference)
904
+
905
+ --- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RG' => str},...]
906
+
907
+ === DR lines (Database cross-reference)
908
+
909
+ --- Bio::SPTR#dr -> {'EMBL' => ary, ...}
910
+
911
+ === FT lines (Feature table data)
912
+
913
+ --- Bio::SPTR#ft -> hsh
914
+
915
+ === SQ lines (Sequence header and data)
916
+
917
+ --- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
918
+ --- Bio::SPTR#sq(key) -> int or str
919
+
920
+ key := (aalen|MW|CRC64)
921
+
922
+ --- Bio::SPTR#seq -> Bio::Sequence::AA
922
+ #aaseq -> Bio::Sequence::AA
924
+
925
+ =end
926
+
927
+ # Content Occurrence in an entry
928
+ # ---- --------------------------- --------------------------------
929
+ # ID - identification (begins each entry; 1 per entry)
930
+ # AC - accession number(s) (>=1 per entry)
931
+ # DT - date (3 per entry)
932
+ # DE - description (>=1 per entry)
933
+ # GN - gene name(s) (>=0 per entry; optional)
934
+ # OS - organism species (>=1 per entry)
935
+ # OG - organelle (0 or 1 per entry; optional)
936
+ # OC - organism classification (>=1 per entry)
937
+ # OX - organism taxonomy x-ref (>=1 per entry)
938
+ # RN - reference number (>=1 per entry)
939
+ # RP - reference positions (>=1 per entry)
940
+ # RC - reference comment(s) (>=0 per entry; optional)
941
+ # RX - reference cross-reference(s) (>=0 per entry; optional)
942
+ # RA - reference author(s) (>=1 per entry)
943
+ # RT - reference title (>=0 per entry; optional)
944
+ # RL - reference location (>=1 per entry)
945
+ # CC - comments or notes (>=0 per entry; optional)
946
+ # DR - database cross-references (>=0 per entry; optional)
947
+ # KW - keywords (>=1 per entry)
948
+ # FT - feature table data (>=0 per entry; optional)
949
+ # SQ - sequence header (1 per entry)
950
+ # - (blanks) The sequence data (>=1 per entry)
951
+ # // - termination line (ends each entry; 1 per entry)
952
+ # ---- --------------------------- --------------------------------
953
+
954
+