bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,446 @@
1
+ #
2
+ # = bio/db/embl/embl.rb - EMBL database class
3
+ #
4
+ #
5
+ # Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: embl.rb,v 1.25 2005/11/02 07:30:14 nakao Exp $
9
+ #
10
+ # == EMBL database entry
11
+ #
12
+ #
13
+ #
14
+ # == Example
15
+ #
16
+ # emb = Bio::EMBL.new($<.read)
17
+ # emb.entry_id
18
+ # emb.each_cds do |cds|
19
+ # cds
20
+ # end
21
+ # emb.seq
22
+ #
23
+ #--
24
+ #
25
+ # This library is free software; you can redistribute it and/or
26
+ # modify it under the terms of the GNU Lesser General Public
27
+ # License as published by the Free Software Foundation; either
28
+ # version 2 of the License, or (at your option) any later version.
29
+ #
30
+ # This library is distributed in the hope that it will be useful,
31
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
32
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
33
+ # Lesser General Public License for more details.
34
+ #
35
+ # You should have received a copy of the GNU Lesser General Public
36
+ # License along with this library; if not, write to the Free Software
37
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38
+ #
39
+ #++
40
+ #
41
+
42
+ require 'bio/db'
43
+ require 'bio/db/embl/common'
44
+
45
+ module Bio
46
+ class EMBL < EMBLDB
47
+ include Bio::EMBLDB::Common
48
+
49
+ # returns contents in the ID line.
50
+ # * Bio::EMBL#id_line -> <ID Hash>
51
+ # where <ID Hash> is:
52
+ # {'ENTRY_NAME' => String, 'MOLECULE_TYPE' => String, 'DIVISION' => String,
53
+ # 'SEQUENCE_LENGTH' => Int}
54
+ #
55
+ # ID Line
56
+ # "ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."
57
+ #
58
+ # DATA_CLASS = ['standard']
59
+ #
60
+ # MOLECULE_TYPE: DNA RNA XXX
61
+ #
62
+ # Code ( DIVISION )
63
+ # EST (ESTs)
64
+ # PHG (Bacteriophage)
65
+ # FUN (Fungi)
66
+ # GSS (Genome survey)
67
+ # HTC (High Throughput cDNAs)
68
+ # HTG (HTGs)
69
+ # HUM (Human)
70
+ # INV (Invertebrates)
71
+ # ORG (Organelles)
72
+ # MAM (Other Mammals)
73
+ # VRT (Other Vertebrates)
74
+ # PLN (Plants)
75
+ # PRO (Prokaryotes)
76
+ # ROD (Rodents)
77
+ # SYN (Synthetic)
78
+ # STS (STSs)
79
+ # UNC (Unclassified)
80
+ # VRL (Viruses)
81
+ #
82
# Parses the ID line once and memoizes the result in @data['ID'].
#
# With no argument the whole Hash is returned; with +key+ only that entry.
# Keys stored: 'ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'DIVISION'
# (Strings) and 'SEQUENCE_LENGTH' (Integer).
#
# NOTE(review): fetch('ID') is defined outside this file; it is presumably
# the ID line content with the leading tag removed -- confirm.
def id_line(key=nil)
  unless @data['ID']
    fields = fetch('ID').split(/; +/)
    entry_name, data_class = fields[0].split(/ +/)

    @data['ID'] = {
      'ENTRY_NAME'      => entry_name,
      'DATA_CLASS'      => data_class,
      'MOLECULE_TYPE'   => fields[1],
      'DIVISION'        => fields[2],
      # "166 BP." -> 166
      'SEQUENCE_LENGTH' => fields[3].strip.split(' ').first.to_i
    }
  end

  key ? @data['ID'][key] : @data['ID']
end
100
+
101
+ # returns ENTRY_NAME in the ID line.
102
+ # * Bio::EMBL#entry -> String
103
# ENTRY_NAME field of the ID line (String).
# Also reachable as #entry_name and #entry_id.
def entry
  id_line 'ENTRY_NAME'
end
alias entry_name entry
alias entry_id entry
108
+
109
+ # returns MOLECULE_TYPE in the ID line.
110
+ # * Bio::EMBL#molecule -> String
111
# MOLECULE_TYPE field of the ID line (String).
# Also reachable as #molecule_type.
def molecule
  id_line 'MOLECULE_TYPE'
end
alias molecule_type molecule
115
+
116
+ # returns DIVISION in the ID line.
117
+ # * Bio::EMBL#division -> String
118
# DIVISION field of the ID line (String), e.g. "VRT".
def division
  id_line 'DIVISION'
end
121
+
122
+ # returns SEQUENCE_LENGTH in the ID line.
123
+ # * Bio::EMBL#sequence_length -> Int
124
# SEQUENCE_LENGTH field of the ID line (Integer).
# Also reachable as #seqlen.
def sequence_length
  id_line 'SEQUENCE_LENGTH'
end
alias seqlen sequence_length
128
+
129
+
130
+ # AC Line
131
+ # "AC A12345; B23456;"
132
+
133
+
134
+ # returns the version information in the sequence version (SV) line.
135
+ # * Bio::EMBL#sv -> Accession.Version in String
136
+ # * Bio::EMBL#version -> version number in Int
137
+ #
138
+ # SV Line; sequence version (1/entry)
139
+ # SV Accession.Version
140
# Content of the SV line ("Accession.Version") with the first ';' removed.
#
# NOTE(review): field_fetch is defined outside this file -- presumably it
# returns the tag-stripped line content; confirm.
def sv
  field_fetch('SV').sub(';', '')
end
143
# Version part of the SV value as an Integer: the text between the first
# and second '.' of #sv. Yields 0 when #sv contains no '.'.
def version
  _accession, number = sv.split('.')
  number.to_i
end
146
+
147
+
148
+ # returns contents in the date (DT) line.
149
+ # * Bio::EMBL#dt -> <DT Hash>
150
+ # where <DT Hash> is:
151
+ # {}
152
+ # * Bio::EMBL#dt(key) -> String
153
+ # keys: 'created' and 'updated'
154
+ #
155
+ # DT Line; date (2/entry)
156
# Parses the DT (date) lines once and memoizes them in @data['DT'].
#
# Returns a Hash {'created' => String, 'updated' => String} built from the
# first and second DT line, or a single value when +key+ is given.
#
# A missing DT line yields nil for its key instead of raising
# NoMethodError, so entries with fewer than two DT lines can still be read.
#
# NOTE(review): get('DT') is defined outside this file; the code assumes it
# returns the raw, newline-separated lines including the leading "DT" tag.
def dt(key=nil)
  unless @data['DT']
    # Drop the two-letter line tag and surrounding blanks from each line.
    created, updated = get('DT').split(/\n/).map do |line|
      line.sub(/\w{2} /,'').strip
    end
    @data['DT'] = { 'created' => created, 'updated' => updated }
  end

  key ? @data['DT'][key] : @data['DT']
end
170
+
171
+
172
+
173
+ ##
174
+ # DE Line; description (>=1)
175
+ #
176
+
177
+
178
+ ##
179
+ # KW Line; keyword (>=1)
180
+ # KW [Keyword;]+
181
+ #
182
+ # Bio::EMBLDB#kw -> Array
183
+ # #keywords -> Array
184
+
185
+
186
+ ##
187
+ # OS Line; organism species (>=1)
188
+ # OS Genus species (name)
189
+ # "OS Trifolium repens (white clover)"
190
+ #
191
+ # Bio::EMBLDB#os -> Array
192
+
193
+
194
+ ##
195
+ # OC Line; organism classification (>=1)
196
+ #
197
+ # Bio::EMBLDB#oc -> Array
198
+
199
+
200
+ ##
201
+ # OG Line; organella (0 or 1/entry)
202
+ # ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
203
+ # or a plasmid name (e.g. "Plasmid pBR322").
204
+ #
205
+ # Bio::EMBLDB#og -> String
206
+
207
+
208
+ ##
209
+ # R Lines
210
+ # RN RC RP RX RA RT RL
211
+ #
212
+ # Bio::EMBLDB#ref
213
+
214
+
215
+ ##
216
+ # DR Line; database cross-reference (>=0)
217
+ # "DR database_identifier; primary_identifier; secondary_identifier."
218
+ #
219
+ # Bio::EMBLDB#dr
220
+
221
+
222
+ # returns feature table header (String) in the feature header (FH) line.
223
+ #
224
+ # FH Line; feature table header (0 or 2)
225
# Feature table header: whatever fetch returns for the 'FH' tag.
def fh
  fetch 'FH'
end
228
+
229
+ # returns contents in the feature table (FT) lines.
230
+ # * Bio::EMBL#ft -> Bio::Features
231
+ # * Bio::EMBL#ft {} -> {|Bio::Feature| }
232
+ #
233
+ # same as features method in bio/db/genbank.rb
234
+ #
235
+ # FT Line; feature table data (>=0)
236
# Parses the feature table (FT lines) into a Features object, memoized in
# @data['FT']. Also reachable as #features.
#
# Without a block the Features object is returned; with a block every
# feature is yielded in turn.
#
# The cache is only written after parsing succeeded. (Previously an empty
# Array was stored up front, so a parse error left [] cached and later
# calls silently returned it.)
#
# Raises NoMethodError when an FT line is shorter than 20 characters or a
# qualifier/continuation line appears before any feature key line.
def ft
  unless @data['FT']
    entries  = []       # one [key, position, qualifier, ...] Array per feature
    in_quote = false    # true while inside an unterminated "..." value

    @orig['FT'].each_line do |line|
      next if line =~ /^FEATURES/

      # Columns from offset 20 on hold the position / qualifier text.
      body = line[20,60].chomp
      if line =~ /^FT {3}(\S+)/
        # New feature: key at offset 5, position in body.
        entries.push([ $1, body ])
      elsif body =~ /^ \// and not in_quote
        # New qualifier: /q="data..., /q=data, /q
        entries.last.push(body)
        in_quote = true if body =~ /=" / and body !~ /"$/
      else
        # Continuation of the previous position or qualifier value.
        entries.last.last << body
        in_quote = false if body =~ /"$/
      end
    end

    parsed = entries.map { |subary| parse_qualifiers(subary) }
    @data['FT'] = Features.new(parsed)
  end

  if block_given?
    @data['FT'].each { |feature| yield feature }
  else
    @data['FT']
  end
end
alias features ft
279
+
280
+ # iterates on CDS features in the FT lines.
281
# Yields every feature returned by #ft whose feature key is 'CDS'.
def each_cds
  ft.each do |candidate|
    yield candidate if candidate.feature == 'CDS'
  end
end
288
+
289
+ # iterates on gene features in the FT lines.
290
# Yields every feature returned by #ft whose feature key is 'gene'.
def each_gene
  ft.each do |candidate|
    yield candidate if candidate.feature == 'gene'
  end
end
297
+
298
+
299
+ # returns comment text in the comments (CC) line.
300
+ #
301
+ # CC Line; comments or notes (>=0)
302
# Comment text: whatever get returns for the 'CC' tag.
def cc
  get 'CC'
end
305
+
306
+
307
+ ##
308
+ # XX Line; spacer line (many)
309
+ # def nxx
310
+ # end
311
+
312
+
313
+ # returns sequence header information in the sequence header (SQ) line.
314
+ # * Bio::EMBL#sq -> <SQ Hash>
315
+ # where <SQ Hash> is:
316
+ # {'ntlen' => Int, 'other' => Int,
317
+ # 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
318
+ # * Bio::EMBL#sq(base) -> <base content in Int>
319
+ # * Bio::EMBL#sq[base] -> <base content in Int>
320
+ #
321
+ # SQ Line; sequence header (1/entry)
322
+ # SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
323
# Parses the SQ (sequence header) line once and memoizes the counts in
# @data['SQ'] as
#   {'ntlen' => Int, 'other' => Int, 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
#
# With +base+ given, returns the entry stored under base.downcase;
# otherwise returns the whole Hash. When the line does not match the
# expected layout every count is 0.
def sq(base = nil)
  unless @data['SQ']
    fetch('SQ') =~ \
      /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/
    # Captures 1..6 in line order; nil.to_i covers a failed match.
    ntlen, a, c, g, t, other = (1..6).map { |i| Regexp.last_match(i).to_i }
    @data['SQ'] = {'ntlen' => ntlen, 'other' => other,
                   'a' => a, 'c' => c, 'g' => g, 't' => t}
  end

  return @data['SQ'] unless base
  @data['SQ'][base.downcase]
end
339
+
340
+
341
+ # returns the nucleotide sequence in this entry.
342
+ # * Bio::EMBL#seq -> Bio::Sequence::NA
343
+ #
344
+ # @orig[''] as sequence
345
+ # bb Line; (blanks) sequence data (>=1)
346
# Builds a Sequence::NA from the sequence data lines (tag ''), after
# removing every blank and every run of digits (the position counters).
# Also reachable as #naseq and #ntseq.
def seq
  raw = fetch('')
  Sequence::NA.new(raw.delete(' ').gsub(/\d+/, ''))
end
alias naseq seq
alias ntseq seq
351
+
352
+ # // Line; termination line (end; 1/entry)
353
+
354
+
355
+ ### private methods
356
+
357
+ private
358
+
359
+ ##
360
+ # same as Bio::GenBank#parse_qualifiers(feature)
361
# Converts one raw feature Array into a Feature object.
#
# +ary+ is [feature_key, position, qualifier_text, ...]; it is consumed
# (shifted) while parsing. Whitespace is stripped from the position.
# Each qualifier text of the form /name, /name=value or /name="value"
# becomes a Feature::Qualifier:
# * an empty value is stored as +true+ (flag qualifier),
# * 'translation' is stored as Sequence::AA without whitespace,
# * 'codon_start' is stored as an Integer.
# Texts that do not look like a qualifier are ignored.
def parse_qualifiers(ary)
  feature = Feature.new
  feature.feature  = ary.shift
  feature.position = ary.shift.gsub(/\s/, '')

  ary.each do |text|
    next unless text =~ %r{/([^=]+)=?"?([^"]*)"?}
    qualifier = $1
    value     = $2
    value = true if value.empty?

    value =
      case qualifier
      when 'translation' then Sequence::AA.new(value.gsub(/\s/, ''))
      when 'codon_start' then value.to_i
      else value
      end

    feature.append(Feature::Qualifier.new(qualifier, value))
  end

  feature
end
388
+
389
+ end
390
+
391
+ end
392
+
393
+
394
# Self-test driver: when this file is run directly, reads EMBL entries
# from ARGF (split on Bio::EMBL::RS), and for each entry prints a label
# followed by the inspected result of every public accessor.
#
# The id_line demo uses the key 'MOLECULE_TYPE'; the previous key
# 'molecule' is never stored by #id_line and always printed nil.
if __FILE__ == $0
  # [label, extractor] pairs, printed in this order for every entry.
  demos = [
    ["e.entry_id ",                 lambda { |e| e.entry_id }],
    ["e.id_line ",                  lambda { |e| e.id_line }],
    ["e.id_line('MOLECULE_TYPE') ", lambda { |e| e.id_line('MOLECULE_TYPE') }],
    ["e.molecule ",                 lambda { |e| e.molecule }],
    ["e.ac ",                       lambda { |e| e.ac }],
    ["e.sv ",                       lambda { |e| e.sv }],
    ["e.dt ",                       lambda { |e| e.dt }],
    ["e.dt('created') ",            lambda { |e| e.dt('created') }],
    ["e.de ",                       lambda { |e| e.de }],
    ["e.kw ",                       lambda { |e| e.kw }],
    ["e.os ",                       lambda { |e| e.os }],
    ["e.oc ",                       lambda { |e| e.oc }],
    ["e.og ",                       lambda { |e| e.og }],
    ["e.ref ",                      lambda { |e| e.ref }],
    ["e.dr ",                       lambda { |e| e.dr }],
    ["e.ft ",                       lambda { |e| e.ft }],
    # prints each CDS, then the return value of each_cds itself
    ["e.each_cds {|c| p c}",        lambda { |e| e.each_cds {|c| p c } }],
    ["e.sq ",                       lambda { |e| e.sq }],
    ["e.sq('a') ",                  lambda { |e| e.sq('a') }],
    ["e.gc",                        lambda { |e| e.gc }],
    ["e.seq ",                      lambda { |e| e.seq }]
  ]

  while ent = $<.gets(Bio::EMBL::RS)
    puts "\n ==> e = Bio::EMBL.new(ent) "
    e = Bio::EMBL.new(ent)

    demos.each do |label, extractor|
      puts "\n ==> #{label}"
      p extractor.call(e)
    end
  end
end
444
+
445
+
446
+
@@ -0,0 +1,954 @@
1
+ #
2
+ # = bio/db/embl/sptr.rb - UniProt/SwissProt and TrEMBL database class
3
+ #
4
+ # Copyright:: Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: sptr.rb,v 1.29 2005/11/02 07:30:14 nakao Exp $
8
+ #
9
+ # == UniProtKB/SwissProt and TrEMBL
10
+ #
11
+ # See the SWISS-PROT document file SPECLIST.TXT.
12
+ #
13
+ # == Example
14
+ #
15
+ #--
16
+ #
17
+ # This library is free software; you can redistribute it and/or
18
+ # modify it under the terms of the GNU Lesser General Public
19
+ # License as published by the Free Software Foundation; either
20
+ # version 2 of the License, or (at your option) any later version.
21
+ #
22
+ # This library is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25
+ # Lesser General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU Lesser General Public
28
+ # License along with this library; if not, write to the Free Software
29
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30
+ #
31
+ #++
32
+ #
33
+
34
+ require 'bio/db'
35
+ require 'bio/db/embl/common'
36
+
37
+ module Bio
38
+
39
+ # Parser class for UniProtKB/SwissProt and TrEMBL database entry
40
+ class SPTR < EMBLDB
41
+ include Bio::EMBLDB::Common
42
+
43
+ @@entry_regrexp = /[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/
44
+ @@data_class = ["STANDARD", "PRELIMINARY"]
45
+
46
+
47
+ # returns a Hash of the ID line.
48
+ # returns a content (Int or String) of the ID line by a given key.
49
+ # Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
50
+ #
51
+ # ID Line
52
+ # "ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
53
+ #
54
+ # ENTRY_NAME := "#{X}_#{Y}"
55
+ # X =~ /[A-Z0-9]{1,5}/ # The protein name.
56
+ # Y =~ /[A-Z0-9]{1,5}/ # The biological source of the protein.
57
+ # MOLECULE_TYPE := 'PRT' =~ /\w{3}/
58
+ # SEQUENCE_LENGTH =~ /\d+ AA/
59
# Parses the raw ID line (@orig['ID']) once and memoizes the result in
# @data['ID'].
#
# With no argument the whole Hash is returned; with +key+ only that entry.
# Keys stored: 'ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE' (Strings, first
# ';' removed from the latter two) and 'SEQUENCE_LENGTH' (Integer).
def id_line(key = nil)
  unless @data['ID']
    # The raw line still carries the "ID" tag as its first token.
    _tag, name, data_class, molecule_type, length = @orig['ID'].split(/ +/)
    @data['ID'] = {
      'ENTRY_NAME'      => name,
      'DATA_CLASS'      => data_class.sub(/;/,''),
      'MOLECULE_TYPE'   => molecule_type.sub(/;/,''),
      'SEQUENCE_LENGTH' => length.to_i
    }
  end

  return @data['ID'] unless key   # Hash
  @data['ID'][key]                # String/Int
end
76
+
77
+
78
+
79
+ # returns a ENTRY_NAME in the ID line.
80
+ #
81
+ # A short-cut for Bio::SPTR#id_line('ENTRY_NAME').
82
# ENTRY_NAME field of the ID line (String).
# Also reachable as #entry_name and #entry.
def entry_id
  id_line 'ENTRY_NAME'
end
alias entry_name entry_id
alias entry entry_id
87
+
88
+
89
+ # returns a MOLECULE_TYPE in the ID line.
90
+ #
91
+ # A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
92
# MOLECULE_TYPE field of the ID line (String).
# Also reachable as #molecule_type.
def molecule
  id_line 'MOLECULE_TYPE'
end
alias molecule_type molecule
96
+
97
+
98
+ # returns a SEQUENCE_LENGTH in the ID line.
99
+ #
100
+ # A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
101
# SEQUENCE_LENGTH field of the ID line (Integer).
# Also reachable as #aalen.
def sequence_length
  id_line 'SEQUENCE_LENGTH'
end
alias aalen sequence_length
105
+
106
+
107
+ # Bio::EMBLDB::Common#ac -> ary
108
+ # #accessions -> ary
109
+ # #accession -> String (accessions.first)
110
+ @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
111
+
112
+
113
+
114
+ # returns a Hash of information in the DT lines.
115
+ # hash keys:
116
+ # ['created', 'sequence', 'annotation']
117
+ # also Symbols acceptable (ASAP):
118
+ # [:created, :sequence, :annotation]
119
+ #
120
+ # returns a String of information in the DT lines by a given key..
121
+ #
122
+ # DT Line; date (3/entry)
123
+ # DT DD-MMM-YYYY (rel. NN, Created)
124
+ # DT DD-MMM-YYYY (rel. NN, Last sequence update)
125
+ # DT DD-MMM-YYYY (rel. NN, Last annotation update)
126
# Parses the DT (date) lines once and memoizes them in @data['DT'].
#
# Returns a Hash with the keys 'created', 'sequence' and 'annotation'
# built from the first, second and third DT line, or a single value when
# +key+ is given.
#
# A missing DT line yields nil for its key instead of raising
# NoMethodError, so entries with fewer than three DT lines can still be
# read.
#
# NOTE(review): get('DT') is defined outside this file; the code assumes it
# returns the raw, newline-separated lines including the leading "DT" tag.
def dt(key = nil)
  unless @data['DT']
    # Drop the two-letter line tag and surrounding blanks from each line.
    created, sequence, annotation = get('DT').split(/\n/).map do |line|
      line.sub(/\w{2} /,'').strip
    end
    @data['DT'] = {
      'created'    => created,
      'sequence'   => sequence,
      'annotation' => annotation
    }
  end

  key ? @data['DT'][key] : @data['DT']
end
142
+
143
+
144
+ # returns the proposed official name of the protein.
145
+ #
146
+ # DE Line; description (>=1)
147
+ # "DE #{OFFICIAL_NAME} (#{SYNONYM})"
148
+ # "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
149
+ # OFFICIAL_NAME 1/entry
150
+ # SYNONYM >=0
151
+ # CONTAINS >=0
152
# Official protein name taken from the DE line: the text before the first
# '(' (and before any '[' part), stripped. ' (Fragment)' is appended when
# the part before '[' mentions "fragment" (case-insensitive).
# Returns "" when fetch('DE') yields nil/false.
def protein_name
  de_line = fetch('DE')
  return "" unless de_line

  before_bracket = de_line[/^[^\[]*/]  # everything preceding the first [ (the "contains" part)
  official = before_bracket[/^[^(]*/].strip
  official << ' (Fragment)' if before_bracket =~ /fragment/i
  official
end
161
+
162
+
163
+ # returns an array of synonyms (unofficial names).
164
+ #
165
+ # synonyms are each placed in () following the official name on the DE line.
166
# Unofficial names from the DE line: the contents of every (...) group,
# after removing the [...] "contains" part and skipping groups that
# mention "fragment" (case-insensitive).
# Returns [] when fetch('DE') yields nil/false.
def synonyms
  de_line = fetch('DE')
  return [] unless de_line

  without_contains = de_line.sub(/\[.*\]/,'')
  without_contains.scan(/\([^)]+/).
    reject { |group| group =~ /fragment/i }.
    map    { |group| group[1..-1].strip }   # drop the leading (
end
178
+
179
+
180
+ # returns gene names in the GN line.
181
+ #
182
+ # New UniProt/SwissProt format:
183
+ # * Bio::SPTR#gn -> [ <gene record>* ]
184
+ # where <gene record> is:
185
+ # { :name => '...',
186
+ # :synonyms => [ 's1', 's2', ... ],
187
+ # :loci => [ 'l1', 'l2', ... ],
188
+ # :orfs => [ 'o1', 'o2', ... ]
189
+ # }
190
+ #
191
+ # Old format:
192
+ # * Bio::SPTR#gn -> Array # AND
193
+ # * Bio::SPTR#gn[0] -> Array # OR
194
+ #
195
+ # GN Line: Gene name(s) (>=0, optional)
196
# Gene names from the GN line, memoized in @data['GN'] by the parsers.
#
# A GN line containing "Name=" is handled by gn_uniprot_parser (Array of
# Hashes); anything else by gn_old_parser (Array of Arrays).
def gn
  cached = @data['GN']
  return cached if cached

  if /Name=/ === fetch('GN')
    gn_uniprot_parser
  else
    gn_old_parser
  end
end
206
+
207
+ # returns contents in the old style GN line.
208
+ # GN Line: Gene name(s) (>=0, optional)
209
+ # GN HNS OR DRDX OR OSMZ OR BGLY.
210
+ # GN CECA1 AND CECA2.
211
+ # GN CECA1 AND (HOGE OR FUGA).
212
+ #
213
+ # GN NAME1 [(AND|OR) NAME]+.
214
+ #
215
+ # Bio::SPTR#gn -> Array # AND
216
+ # #gn[0] -> Array # OR
217
+ # #gene_names -> Array
218
# Parses an old-style GN line ("A AND (B OR C).") into an Array of Arrays:
# the outer level holds the AND-separated groups, each inner Array the
# OR-separated names of one group. Parentheses and the trailing '.' are
# dropped. The result ([] when get('GN') is empty) is stored in
# @data['GN'] and returned.
def gn_old_parser
  groups = []
  if get('GN').size > 0
    groups = fetch('GN').sub(/\.$/,'').split(/ AND /).map do |group|
      group.gsub(/\(|\)/,'').split(/ OR /).map { |name| name.strip }
    end
  end
  @data['GN'] = groups
end
private :gn_old_parser
231
+
232
+ # returns contents in the structured GN line.
233
+ # The new format of the GN line is:
234
+ # GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
235
+ # GN ORFNames=[, ...];
236
+ #
237
+ # * Bio::SPTR#gn -> [ <gene record>* ]
238
+ # where <gene record> is:
239
+ # { :name => '...',
240
+ # :synonyms => [ 's1', 's2', ... ],
241
+ # :loci => [ 'l1', 'l2', ... ],
242
+ # :orfs => [ 'o1', 'o2', ... ]
243
+ # }
244
# Parses a structured (UniProt style) GN line:
#   Name=<name>; Synonyms=<s1>[, <s2>...]; OrderedLocusNames=<l1>[, ...];
#   ORFNames=<o1>[, ...];
# Several genes are separated by the word "and".
#
# Returns (and stores in @data['GN']) an Array with one Hash per gene:
#   { :name => String, :synonyms => [String], :loci => [String], :orfs => [String] }
#
# Fixes over the previous implementation:
# * records are split on a whitespace-delimited "and" only, so names that
#   merely contain the letters (e.g. "Hand2") are no longer cut apart;
# * elements are split with String#split instead of String#each(';'),
#   which no longer exists since Ruby 1.9;
# * values are taken from an explicit key=value split rather than by
#   chopping the last character, so a missing trailing ';' loses nothing.
def gn_uniprot_parser
  genes = []
  fetch('GN').strip.split(/\s+and\s+/).each do |record|
    gene = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    record.split(';').each do |element|
      key, value = element.strip.split('=', 2)
      next unless value   # not a key=value element

      case key
      when 'Name'              then gene[:name]     = value
      when 'Synonyms'          then gene[:synonyms] = value.split(/\s*,\s*/)
      when 'OrderedLocusNames' then gene[:loci]     = value.split(/\s*,\s*/)
      when 'ORFNames'          then gene[:orfs]     = value.split(/\s*,\s*/)
      end
    end
    genes << gene
  end
  @data['GN'] = genes
end
private :gn_uniprot_parser
267
+
268
+
269
+ # returns a Array of gene names in the GN line.
270
# Array of gene names from the GN line.
#
# * new (UniProt) format: the :name of every gene record;
# * old format: the names of the first AND group;
# * no gene names at all: [] (previously nil, which contradicted the
#   documented Array return and made #gene_name raise).
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  names = @data['GN']
  return [] if names.nil? || names.empty?

  if names.first.class == Hash
    names.map { |record| record[:name] }
  else
    names.first
  end
end
278
+
279
+
280
+ # returns a String of the first gene name in the GN line.
281
# First gene name from the GN line (String), or nil when the entry has no
# gene names. Array() guards against #gene_names yielding nil, which used
# to raise NoMethodError here.
def gene_name
  Array(gene_names).first
end
284
+
285
+
286
+ # returns a Array of Hashs or a String of the OS line when a key given.
287
+ # * Bio::EMBLDB#os -> Array
288
+ # [{'name' => '(Human)', 'os' => 'Homo sapiens'},
289
+ # {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
290
+ # * Bio::SPTR#os[0] -> Hash
291
+ # {'name' => "(Human)", 'os' => 'Homo sapiens'}
292
+ # * Bio::SPTR#os[0]['name'] -> "(Human)"
293
+ # * Bio::SPTR#os(0) -> "Homo sapiens (Human)"
294
+ #
295
+ # OS Line; organism species (>=1)
296
+ # OS Genus species (name).
297
+ # OS Genus species (name0) (name1).
298
+ # OS Genus species (name0) (name1).
299
+ # OS Genus species (name0), G s0 (name0), and G s (name0) (name1).
300
+ # OS Homo sapiens (Human), and Rattus norvegicus (Rat)
301
# Parses the OS (organism species) line once and memoizes it in
# @data['OS'] as an Array of Hashes, one per organism:
#   {'name' => "(Human)", 'os' => "Homo sapiens"}
# 'name' is the parenthesised part (nil when absent).
#
# Without +num+ the Array is returned; with +num+ the string
# "<os> <name>" of the organism at that index.
#
# Raises RuntimeError when a part of the line has no recognisable
# organism name.
def os(num = nil)
  unless @data['OS']
    organisms = fetch('OS').split(/, and|, /).map do |part|
      unless part =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
        raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
      end
      species = $1
      {'name' => part[/(\(.+\))/, 1], 'os' => species}
    end
    @data['OS'] = organisms
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  record = @data['OS'][num]
  "#{record['os']} #{record['name']}"
end
323
+
324
+
325
+ # Bio::EMBLDB::Common#og -> Array
326
+ # OG Line; organella (0 or 1/entry)
327
+ # ["MITOCHONDRION", "CHLOROPLAST", "Cyanelle", "Plasmid"]
328
+ # or a plasmid name (e.g. "Plasmid pBR322").
329
+
330
+
331
+ # Bio::EMBLDB::Common#oc -> Array
332
+ # OC Line; organism classification (>=1)
333
+ # "OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;"
334
+ # "OC Theileria."
335
+
336
+
337
+
338
+ # returns a Hash of organism taxonomy cross-references.
339
+ # * Bio::SPTR#ox -> Hash
340
+ # {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
341
+ #
342
+ # OX Line; organism taxonomy cross-reference (>=1 per entry)
343
+ # OX NCBI_TaxID=1234;
344
+ # OX NCBI_TaxID=1234, 2345, 3456, 4567;
345
# Parses the OX (organism taxonomy cross-reference) line once and
# memoizes it in @data['OX'] as
#   {'NCBI_TaxID' => ['1234', '2345', ...], ...}
# i.e. database name => Array of identifier Strings.
def ox
  unless @data['OX']
    xrefs = {}
    fetch('OX').sub(/\.$/,'').split(/;/).each do |item|
      db, refs = item.strip.split(/=/)
      xrefs[db] = refs.split(/, */)
    end
    @data['OX'] = xrefs
  end
  @data['OX']
end
357
+
358
+
359
+ # Bio::EMBLDB::Common#ref -> Array
360
+ # R Lines
361
+ # RN RC RP RX RA RT RL
362
+
363
+
364
+ @@cc_topics = ['ALTERNATIVE PRODUCTS','CATALYTIC ACTIVITY','CAUTION',
365
+ 'COFACTOR','DATABASE','DEVELOPMENTAL STAGE','DISEASE','DOMAIN',
366
+ 'ENZYME REGULATION','FUNCTION','INDUCTION','MASS SPECTROMETRY',
367
+ 'MISCELLANEOUS','PATHWAY','PHARMACEUTICAL','POLYMORPHISM','PTM',
368
+ 'SIMILARITY','SUBCELLULAR LOCATION','SUBUNIT','TISSUE SPECIFICITY']
369
+ # returns contents in the CC lines.
370
+ # * Bio::SPTR#cc -> Hash
371
+
372
+ # * Bio::SPTR#cc(Int) -> String
373
+ # returns an Array of contents in the TOPIC string.
374
+ # * Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
375
+ #
376
+ # returns contents of the "ALTERNATIVE PRODUCTS".
377
+ # * Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
378
+ # {'Event' => str,
379
+ # 'Named isoforms' => int,
380
+ # 'Comment' => str,
381
+ # 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
382
+ #
383
+ # CC -!- ALTERNATIVE PRODUCTS:
384
+ # CC Event=Alternative splicing; Named isoforms=15;
385
+ # ...
386
+ # CC placentae isoforms. All tissues differentially splice exon 13;
387
+ # CC Name=A; Synonyms=no del;
388
+ # CC IsoId=P15529-1; Sequence=Displayed;
389
+ #
390
+ # returns contents of the "DATABASE".
391
+ # * Bio::SPTR#cc('DATABASE') -> Array
392
+ # [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
393
+ #
394
+ # CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
395
+ #
396
+ # returns contents of the "MASS SPECTROMETRY".
397
+ # * Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
398
+ # [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
399
+ #
400
+ # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
401
+ #
402
+ # CC lines (>=0, optional)
403
+ # CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
404
+ # CC IN LIVER, KIDNEY, LUNG AND BRAIN.
405
+ #
406
+ # CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
407
+ # CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
408
def cc(tag = nil)
  # The CC lines are split into topics on first call; the resulting
  # {'TOPIC' => [body, ...]} Hash is memoized in @data['CC'].
  unless @data['CC']
    cc = Hash.new
    # Only the text before a run of 74 dashes is parsed
    # (presumably the rule that opens the copyright notice -- TODO confirm).
    cmt = '-' * (77 - 4 + 1)
    dlm = /-!- /

    return cc if get('CC').size == 0 # 12KD_MYCSM has no CC lines.

    begin
      fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          # Re-join words that were hyphenated across a line break,
          # but keep "- AND" untouched.
          body = $2.gsub(/- (?!AND)/,'-')
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '',
                 tmp, '', '', fetch('CC'),''].join("\n")
        end
      end
    rescue NameError
      # NameError also covers its subclass NoMethodError, e.g. nil.sub when
      # the split above yields no first element.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  topics = @data['CC']

  case tag
  when 'ALTERNATIVE PRODUCTS'
    # Array#join (not #to_s) so the text is the raw comment body on every
    # Ruby version; Array#to_s returns inspect output since Ruby 1.9.
    return cc_alternative_products_parse(Array(topics['ALTERNATIVE PRODUCTS']).join)

  when 'DATABASE'
    bodies = topics['DATABASE']
    return bodies unless bodies
    return bodies.map {|body| cc_database_parse(body) }

  when 'MASS SPECTROMETRY'
    bodies = topics['MASS SPECTROMETRY']
    return bodies unless bodies
    return bodies.map {|body| cc_mass_spectrometry_parse(body) }

  when 'INTERACTION'
    return cc_interaction_parse(Array(topics['INTERACTION']).join)

  when nil
    return topics

  else
    return topics[tag]
  end
end

# Parses the text of the ALTERNATIVE PRODUCTS topic into
# {'Event', 'Named isoforms', 'Comment', 'Variants'}.
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
def cc_alternative_products_parse(ap)
  tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants' => []}

  tmp['Event']          = $1 if /Event=(.+?);/ =~ ap
  tmp['Named isoforms'] = $1 if /Named isoforms=(\S+?);/ =~ ap
  tmp['Comment']        = $1 if /Comment=(.+?);/m =~ ap

  ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
    tmp['Variants'] << cc_ap_variants_parse(ent)
  end
  return tmp
end
private :cc_alternative_products_parse

# Parses one DATABASE comment body.
# DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
def cc_database_parse(body)
  db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
  # Remove surrounding blanks and the terminating period only, so that a
  # body not ending in '.' does not lose its last character.
  body.strip.sub(/\.\z/,'').split(/;/).each do |field|
    case field
    when /NAME=(.+)/
      db['NAME'] = $1
    when /NOTE=(.+)/
      db['NOTE'] = $1
    when /WWW="(.+)"/
      db['WWW'] = $1
    when /FTP="(.+)"/
      db['FTP'] = $1
    end
  end
  return db
end
private :cc_database_parse

# Parses one MASS SPECTROMETRY comment body.
# MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
def cc_mass_spectrometry_parse(body)
  mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
  body.strip.sub(/\.\z/,'').split(/;/).each do |field|
    case field
    when /MW=(.+)/
      mass['MW'] = $1.to_f
    when /MW_ERR=(.+)/
      mass['MW_ERR'] = $1.to_f
    when /METHOD="?(.+?)"?\s*\z/    # value may or may not be quoted
      mass['METHOD'] = $1.to_s
    when /RANGE="?(\d+-\d+)"?/      # value may or may not be quoted
      mass['RANGE'] = $1 # RANGE class ?
    end
  end
  return mass
end
private :cc_mass_spectrometry_parse
525
+
526
+
527
+
528
def cc_ap_variants_parse(ent)
  # ent is one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" record,
  # as matched by the scan in #cc('ALTERNATIVE PRODUCTS').
  hsh = {}
  ent.split(/; /).each do |field|
    # Split on the first '=' only, so a value that itself contains '='
    # is kept whole.
    key, value = field.split(/=/, 2)
    if key == 'Sequence' && value
      # Sequence is returned as an Array of its ', '-separated items,
      # without the terminating ';'.
      value = value.sub(/;/,'').split(/, /)
    end
    hsh[key] = value
  end
  return hsh
end
private :cc_ap_variants_parse
540
+
541
+
542
+ # returns the contents of a line of the CC INTERACTION section.
543
+ #
544
+ # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
545
def cc_interaction_parse(str)
  # Each "partner; NbExp=n; IntAct=acc1, acc2;" record becomes one Hash.
  records = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
  records.map do |partner, nbexp, accessions|
    { :partner_id => partner.strip,
      :nbexp      => nbexp.strip,
      :intact_acc => accessions.split(', ') }
  end
end
private :cc_interaction_parse
554
+
555
+ # returns databases cross-references in the DR lines.
556
+ # * Bio::EMBLDB#dr -> Hash w/in Array
557
+ #
558
+ # DR Line; database cross-reference (>=0)
558
+ # one cross-reference per line
560
+ # DR database_identifier; primary_identifier; secondary_identifier.
561
# Database identifiers (first field of a DR line).
# 'MGD' and 'MIM' are separate entries; without the comma between them Ruby
# concatenates the adjacent literals into the single string 'MGDMIM'.
@@dr_database_identifier = ['EMBL','CARBBANK','DICTYDB','ECO2DBASE',
  'ECOGENE',
  'FLYBASE','GCRDB','HIV','HSC-2DPAGE','HSSP','INTERPRO','MAIZEDB',
  'MAIZE-2DPAGE','MENDEL','MGD','MIM','PDB','PFAM','PIR','PRINTS',
  'PROSITE','REBASE','AARHUS/GHENT-2DPAGE','SGD','STYGENE','SUBTILIST',
  'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
567
+
568
+ # Bio::EMBLDB::Common#kw - Array
569
+ # #keywords -> Array
570
+ #
571
+ # KW Line; keyword (>=1)
572
+ # KW [Keyword;]+
573
+
574
+
575
+ # returns the contents of the feature table.
576
+ # * Bio::SPTR#ft -> Hash
577
+ # {'feature_name' => [{'From' => str, 'To' => str,
578
+ # 'Description' => str, 'FTId' => str}],...}
579
+ #
580
+ # returns an Array of the information about the feature_name in the feature table.
581
+ # * Bio::SPTR#ft(feature_name) -> Array of Hash
582
+ # [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
583
+ #
584
+ # FT Line; feature table data (>=0, optional)
585
+ #
586
+ # Col Data item
587
+ # ----- -----------------
588
+ # 1- 2 FT
589
+ # 6-13 Feature name
590
+ # 15-20 `FROM' endpoint
591
+ # 22-27 `TO' endpoint
592
+ # 35-75 Description (>=0 per key)
593
+ # ----- -----------------
594
def ft(feature_name = nil)
  # The feature table is built on first call and memoized in @data['FT'].
  unless @data['FT']
    table = Hash.new()
    last_feature = nil

    begin
      get('FT').split(/\n/).each {|line|

        feature = line[5..12].strip

        if feature == '' and line[34..74]
          # Continuation line: its text extends the description of the most
          # recently started feature.  That feature may have had no
          # description of its own (nil), so start one instead of appending
          # to nil.
          current = table[last_feature].last
          text = line[34..74].strip
          if current['Description']
            current['Description'] << ' ' + text
          else
            current['Description'] = text
          end

          # Only a continuation line ending in '.' falls through to the
          # FTId / diff handling below.
          next unless /\.$/ =~ line
        else
          from = line[14..19].strip
          to   = line[21..26].strip
          desc = line[34..74].strip if line[34..74]

          table[feature] = [] unless table[feature]
          table[feature] << {
            'From' => from.to_i,
            'To' => to.to_i,
            'Description' => desc,
            'diff' => [],
            'FTId' => nil }
          last_feature = feature
          # The first line of a feature always ends the iteration here.
          next
        end

        case last_feature
        when 'VARSPLIC', 'VARIANT', 'CONFLICT'
          if /FTId=(.+?)\./ =~ line   # version 41 >
            ftid = $1
            table[last_feature].last['FTId'] = ftid
            # Remove the "/FTId=xxx." text from the description; the id is
            # escaped so it is matched literally.
            table[last_feature].last['Description'].sub!(/ ?\/FTId=#{Regexp.escape(ftid)}\./,'')
          end

          case table[last_feature].last['Description']
          when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
            original = $1
            swap = $2
            original = original.gsub(/ /,'').strip
            swap = swap.gsub(/ /,'').strip
          when /Missing/i
            original = seq.subseq(table[last_feature].last['From'],
                                  table[last_feature].last['To'])
            swap = ''
          else
            raise line
          end
          table[last_feature].last['diff'] = [original, swap]
        end
      }

    rescue
      raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n" +
            "'#{self.get('FT')}'\n"
    end

    # Tidy the descriptions: remove blanks inside "XXX -> YYY" sequences and
    # re-join words hyphenated across a line break (except before "AND").
    table.each_key do |k|
      table[k].each do |e|
        if / -> / =~ e['Description']
          pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
          e['Description'].sub!(pattern) {
            a = $1
            b = $2
            a.gsub(/ /,'') + " -> " + b.gsub(/ /,'')
          }
        end
        if /- [\w\d]/ =~ e['Description']
          e['Description'].gsub!(/([\w\d]- [\w\d]+)/) {
            a = $1
            if /- AND/ =~ a
              a
            else
              a.sub(/ /,'')
            end
          }
        end
      end
    end
    @data['FT'] = table
  end

  if feature_name
    @data['FT'][feature_name]
  else
    @data['FT']
  end
end
686
+
687
+
688
+ # returns a Hash of the contents of the SQ lines.
689
+ # * Bio::SPTR#sq -> hsh
690
+ #
691
+ # returns a value of a key given in the SQ lines.
692
+ # * Bio::SPTR#sq(key) -> int or str
693
+ # * Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length', 'CRC64']
694
+ #
695
+ # SQ Line; sequence header (1/entry)
696
+ # SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64;
697
+ # SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
698
+ #
699
+ # MW, Dalton unit.
700
+ # CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
701
def sq(key = nil)
  # Parse the SQ header once; the Hash is memoized in @data['SQ'].
  unless @data['SQ']
    matched = /(\d+) AA\; (\d+) MW; (.+) CRC64;/.match(fetch('SQ'))
    raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless matched
    @data['SQ'] = { 'aalen' => matched[1].to_i,
                    'MW'    => matched[2].to_i,
                    'CRC64' => matched[3] }
  end

  header = @data['SQ']
  return header unless key

  # Loose key names are mapped onto the stored keys; anything else is
  # looked up as given.
  case key
  when /mw/, /molecular/, /weight/ then header['MW']
  when /len/, /length/, /AA/       then header['aalen']
  else                                  header[key]
  end
end
723
+
724
+
725
+ # returns a Bio::Sequence::AA of the amino acid sequence.
726
+ # * Bio::SPTR#seq -> Bio::Sequence::AA
727
+ #
728
+ # blank Line; sequence data (>=1)
729
def seq
  # Blanks and position numbers are stripped from the sequence lines; the
  # resulting Sequence::AA is memoized under the empty-string key.
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
end
alias aaseq seq
736
+
737
+ end # class SPTR
738
+
739
+ end # module Bio
740
+
741
+
742
if __FILE__ == $0
  # Usage: ruby __FILE__ uniprot_sprot.dat
  # Usage: ruby __FILE__ uniprot_sprot.dat | egrep '^RuntimeError'

  begin
    require 'pp'
    # NOTE(review): this makes pp behave like p; pp itself is not called
    # anywhere in this script -- confirm the intent.
    alias pp p
  rescue LoadError
  end

  # Prints +cmd+, optionally the raw lines of +tag+ from the entry +ent+,
  # then the result of evaluating +cmd+.  A RuntimeError raised by the
  # evaluation is reported on one line starting with "RuntimeError".
  def cmd(cmd, tag = nil, ent = $ent)
    puts " ==> #{cmd} "
    puts Bio::SPTR.new(ent).get(tag) if tag
    begin
      p eval(cmd)
    rescue RuntimeError
      puts "RuntimeError(#{Bio::SPTR.new(ent).entry_id}): #{$!} "
    end
    puts
  end


  # $ent is global because the command strings evaluated by #cmd refer to it.
  while $ent = $<.gets(Bio::SPTR::RS)

    cmd "Bio::SPTR.new($ent).entry_id"

    cmd "Bio::SPTR.new($ent).id_line", 'ID'
    cmd "Bio::SPTR.new($ent).entry"
    cmd "Bio::SPTR.new($ent).entry_name"
    cmd "Bio::SPTR.new($ent).molecule"
    cmd "Bio::SPTR.new($ent).sequence_length"

    cmd "Bio::SPTR.new($ent).ac", 'AC'
    cmd "Bio::SPTR.new($ent).accession"


    cmd "Bio::SPTR.new($ent).gn", 'GN'
    cmd "Bio::SPTR.new($ent).gene_name"
    cmd "Bio::SPTR.new($ent).gene_names"

    cmd "Bio::SPTR.new($ent).dt", "DT"
    ['created','annotation','sequence'].each do |key|
      cmd "Bio::SPTR.new($ent).dt('#{key}')"
    end

    cmd "Bio::SPTR.new($ent).de", 'DE'
    cmd "Bio::SPTR.new($ent).definition"
    cmd "Bio::SPTR.new($ent).protein_name"
    cmd "Bio::SPTR.new($ent).synonyms"

    cmd "Bio::SPTR.new($ent).kw", 'KW'

    cmd "Bio::SPTR.new($ent).os", 'OS'

    cmd "Bio::SPTR.new($ent).oc", 'OC'

    cmd "Bio::SPTR.new($ent).og", 'OG'

    cmd "Bio::SPTR.new($ent).ox", 'OX'

    cmd "Bio::SPTR.new($ent).ref", 'R'

    cmd "Bio::SPTR.new($ent).cc", 'CC'
    cmd "Bio::SPTR.new($ent).cc('ALTERNATIVE PRODUCTS')"
    cmd "Bio::SPTR.new($ent).cc('DATABASE')"
    cmd "Bio::SPTR.new($ent).cc('MASS SPECTROMETRY')"

    cmd "Bio::SPTR.new($ent).dr", 'DR'

    cmd "Bio::SPTR.new($ent).ft", 'FT'
    cmd "Bio::SPTR.new($ent).ft['DOMAIN']"

    cmd "Bio::SPTR.new($ent).sq", "SQ"
    cmd "Bio::SPTR.new($ent).seq"
  end

end
819
+
820
+
821
+ =begin
822
+
823
+ = Bio::SPTR < Bio::DB
824
+
825
+ Class for an entry in the SWISS-PROT/TrEMBL database.
826
+
827
+ * ((<URL:http://www.ebi.ac.uk/swissprot/>))
828
+ * ((<URL:http://www.ebi.ac.uk/trembl/>))
829
+ * ((<URL:http://www.ebi.ac.uk/sprot/userman.html>))
830
+
831
+
832
+ --- Bio::SPTR.new(a_sp_entry)
833
+
834
+ === ID line (Identification)
835
+
836
+ --- Bio::SPTR#id_line -> {'ENTRY_NAME' => str, 'DATA_CLASS' => str,
837
+ 'MOLECULE_TYPE' => str, 'SEQUENCE_LENGTH' => int }
838
+ --- Bio::SPTR#id_line(key) -> str
839
+
840
+ key = (ENTRY_NAME|MOLECULE_TYPE|DATA_CLASS|SEQUENCE_LENGTH)
841
+
842
+ --- Bio::SPTR#entry_id -> str
843
+ --- Bio::SPTR#molecule -> str
844
+ --- Bio::SPTR#sequence_length -> int
845
+
846
+
847
+ === AC lines (Accession number)
848
+
849
+ --- Bio::SPTR#ac -> ary
850
+ --- Bio::SPTR#accessions -> ary
851
+ --- Bio::SPTR#accession -> accessions.first
852
+
853
+
854
+ === GN line (Gene name(s))
855
+
856
+ --- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
857
+ --- Bio::SPTR#gene_name -> str
858
+ --- Bio::SPTR#gene_names -> [str] or [str]
859
+
860
+
861
+ === DT lines (Date)
862
+
863
+ --- Bio::SPTR#dt -> {'created' => str, 'sequence' => str, 'annotation' => str}
864
+ --- Bio::SPTR#dt(key) -> str
865
+
866
+ key := (created|annotation|sequence)
867
+
868
+
869
+ === DE lines (Description)
870
+
871
+ --- Bio::SPTR#de -> str
872
+ #definition -> str
873
+
874
+ --- Bio::SPTR#protein_name
875
+
876
+ Returns the proposed official name of the protein
877
+
878
+
879
+ --- Bio::SPTR#synonyms
880
+
881
+ Returns an array of synonyms (unofficial names)
882
+
883
+ === KW lines (Keyword)
884
+
885
+ --- Bio::SPTR#kw -> ary
886
+
887
+ === OS lines (Organism species)
888
+
889
+ --- Bio::SPTR#os -> [{'name' => str, 'os' => str}, ...]
890
+
891
+ === OC lines (organism classification)
892
+
893
+ --- Bio::SPTR#oc -> ary
894
+
895
+ === OG line (Organelle)
896
+
897
+ --- Bio::SPTR#og -> ary
898
+
899
+ === OX line (Organism taxonomy cross-reference)
900
+
901
+ --- Bio::SPTR#ox -> {'NCBI_TaxID' => [], ...}
902
+
903
+ === RN RC RP RX RA RT RL RG lines (Reference)
904
+
905
+ --- Bio::SPTR#ref -> [{'RN' => int, 'RP' => str, 'RC' => str, 'RX' => str, 'RT' => str, 'RL' => str, 'RA' => str, 'RG' => str},...]
906
+
907
+ === DR lines (Database cross-reference)
908
+
909
+ --- Bio::SPTR#dr -> {'EMBL' => ary, ...}
910
+
911
+ === FT lines (Feature table data)
912
+
913
+ --- Bio::SPTR#ft -> hsh
914
+
915
+ === SQ lines (Sequence header and data)
916
+
917
+ --- Bio::SPTR#sq -> {'CRC64' => str, 'MW' => int, 'aalen' => int}
918
+ --- Bio::SPTR#sq(key) -> int or str
919
+
920
+ key := (aalen|MW|CRC64)
921
+
922
+ --- Bio::SPTR#seq -> Bio::Sequence::AA
923
+ #aaseq -> Bio::Sequence::AA
924
+
925
+ =end
926
+
927
+ # Content Occurrence in an entry
928
+ # ---- --------------------------- --------------------------------
929
+ # ID - identification (begins each entry; 1 per entry)
930
+ # AC - accession number(s) (>=1 per entry)
931
+ # DT - date (3 per entry)
932
+ # DE - description (>=1 per entry)
933
+ # GN - gene name(s) (>=0 per entry; optional)
934
+ # OS - organism species (>=1 per entry)
935
+ # OG - organelle (0 or 1 per entry; optional)
936
+ # OC - organism classification (>=1 per entry)
937
+ # OX - organism taxonomy x-ref (>=1 per entry)
938
+ # RN - reference number (>=1 per entry)
939
+ # RP - reference positions (>=1 per entry)
940
+ # RC - reference comment(s) (>=0 per entry; optional)
941
+ # RX - reference cross-reference(s) (>=0 per entry; optional)
942
+ # RA - reference author(s) (>=1 per entry)
943
+ # RT - reference title (>=0 per entry; optional)
944
+ # RL - reference location (>=1 per entry)
945
+ # CC - comments or notes (>=0 per entry; optional)
946
+ # DR - database cross-references (>=0 per entry; optional)
947
+ # KW - keywords (>=1 per entry)
948
+ # FT - feature table data (>=0 per entry; optional)
949
+ # SQ - sequence header (1 per entry)
950
+ # - (blanks) The sequence data (>=1 per entry)
951
+ # // - termination line (ends each entry; 1 per entry)
952
+ # ---- --------------------------- --------------------------------
953
+
954
+