bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,226 @@
1
+ #
2
+ # = bio/data/na.rb - Nucleic Acids
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2005
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: na.rb,v 0.19 2005/12/10 18:14:22 k Exp $
9
+ #
10
+ # == Synopsis
11
+ #
12
+ # Bio::NucleicAcid class contains data related to nucleic acids.
13
+ #
14
+ # == Usage
15
+ #
16
+ # Examples:
17
+ #
18
+ # require 'bio'
19
+ #
20
+ # puts "### na = Bio::NucleicAcid.new"
21
+ # na = Bio::NucleicAcid.new
22
+ #
23
+ # puts "# na.to_re('yrwskmbdhvnatgc')"
24
+ # p na.to_re('yrwskmbdhvnatgc')
25
+ #
26
+ # puts "# Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')"
27
+ # p Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')
28
+ #
29
+ # puts "# na.weight('A')"
30
+ # p na.weight('A')
31
+ #
32
+ # puts "# Bio::NucleicAcid.weight('A')"
33
+ # p Bio::NucleicAcid.weight('A')
34
+ #
35
+ # puts "# na.weight('atgc')"
36
+ # p na.weight('atgc')
37
+ #
38
+ # puts "# Bio::NucleicAcid.weight('atgc')"
39
+ # p Bio::NucleicAcid.weight('atgc')
40
+ #
41
+ #--
42
+ #
43
+ # This library is free software; you can redistribute it and/or
44
+ # modify it under the terms of the GNU Lesser General Public
45
+ # License as published by the Free Software Foundation; either
46
+ # version 2 of the License, or (at your option) any later version.
47
+ #
48
+ # This library is distributed in the hope that it will be useful,
49
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
50
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
51
+ # Lesser General Public License for more details.
52
+ #
53
+ # You should have received a copy of the GNU Lesser General Public
54
+ # License along with this library; if not, write to the Free Software
55
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
56
+ #
57
+ #++
58
+ #
59
+
60
module Bio

  # Holds reference data about nucleic acids (IUPAC codes and molecular
  # weights) together with helper methods working on that data.  The
  # helpers are defined in the Data module, which is both included and
  # extended, so each one is callable on an instance and on the class.
  class NucleicAcid

    module Data

      # IUPAC code
      # * Faisst and Meyer (Nucleic Acids Res. 20:3-26, 1992)
      # * http://www.ncbi.nlm.nih.gov/collab/FT/
      #
      # Lower case keys map a base code to a regular expression fragment;
      # upper case keys map a base code to a descriptive name.

      NAMES = {

        'y' => '[tc]',
        'r' => '[ag]',
        'w' => '[at]',
        's' => '[gc]',
        'k' => '[tg]',
        'm' => '[ac]',

        'b' => '[tgc]',
        'd' => '[atg]',
        'h' => '[atc]',
        'v' => '[agc]',

        'n' => '[atgc]',

        'a' => 'a',
        't' => 't',
        'g' => 'g',
        'c' => 'c',
        'u' => 'u',

        'A' => 'Adenine',
        'T' => 'Thymine',
        'G' => 'Guanine',
        'C' => 'Cytosine',
        'U' => 'Uracil',

        'Y' => 'pYrimidine',
        'R' => 'puRine',
        'W' => 'Weak',
        'S' => 'Strong',
        'K' => 'Keto',
        'M' => 'aroMatic',

        'B' => 'not A',
        'D' => 'not C',
        'H' => 'not G',
        'V' => 'not T',
      }

      WEIGHT = {

        # Calculated by BioPerl's Bio::Tools::SeqStats.pm :-)

        'a' => 135.15,
        't' => 126.13,
        'g' => 151.15,
        'c' => 111.12,
        'u' => 112.10,

        :adenine => 135.15,
        :thymine => 126.13,
        :guanine => 151.15,
        :cytosine => 111.12,
        :uracil => 112.10,

        :deoxyribose_phosphate => 196.11,
        :ribose_phosphate => 212.11,

        :hydrogen => 1.00794,
        :water => 18.015,

      }

      # Returns weight information.
      #
      # * Without an argument, returns the whole WEIGHT table.
      # * With an argument of length 0 or 1, returns the WEIGHT entry of the
      #   down-cased base (nil when the base is not in the table).
      # * With a longer sequence, returns the summed weight as a Float.  The
      #   ribose phosphate weight is used when +rna+ is true, the
      #   deoxyribose phosphate weight otherwise.
      #
      # Raises a RuntimeError when the sequence contains a character that
      # has no entry in WEIGHT.
      def weight(x = nil, rna = nil)
        return WEIGHT unless x
        return WEIGHT[x.to_s.downcase] unless x.length > 1

        phosphate = rna ? WEIGHT[:ribose_phosphate] : WEIGHT[:deoxyribose_phosphate]
        hydrogen  = WEIGHT[:hydrogen]
        water     = WEIGHT[:water]

        total = 0.0
        x.each_byte do |byte|
          base = byte.chr.downcase
          unless WEIGHT[base]
            raise "Error: invalid nucleic acid '#{base}'"
          end
          # each residue contributes its base plus the sugar phosphate,
          # minus two hydrogens
          total += WEIGHT[base] + phosphate - hydrogen * 2
        end
        # one water is subtracted per bond between adjacent residues
        total - water * (x.length - 1)
      end

      # Returns the NAMES entry stored under the key +x+ exactly as given
      # (no case conversion is applied).
      def [](x)
        NAMES[x]
      end

      # backward compatibility
      #
      # Returns the whole NAMES table.
      def names
        NAMES
      end
      alias na names

      # Returns the descriptive name stored under the upper-cased code +x+,
      # or nil when there is no such entry.
      def name(x)
        NAMES[x.to_s.upcase]
      end

      # Converts a sequence that may contain ambiguity codes into a Regexp.
      #
      # The sequence is down-cased into a new String first, so the argument
      # is never modified and upper case input is handled like lower case
      # input.  Every character other than a, t, g, c and u is replaced by
      # its NAMES character class, or by '.' when it has no entry.  When
      # +rna+ is true, every 't' in the resulting pattern becomes 'u'.
      def to_re(seq, rna = false)
        pattern = seq.to_s.downcase.gsub(/[^atgcu]/) { |base|
          NAMES[base] || '.'
        }
        pattern = pattern.tr("t", "u") if rna
        Regexp.new(pattern)
      end

    end


    # as instance methods
    include Data

    # as class methods
    extend Data

  end

end # module Bio
201
+
202
+
203
if __FILE__ == $0

  # Small self-demo, executed only when this file is run directly.
  puts "### na = Bio::NucleicAcid.new"
  na = Bio::NucleicAcid.new

  # Each entry pairs the label to print with the call it describes.  The
  # call is wrapped in a lambda so that it is evaluated only after its
  # label has been printed.
  demos = [
    ["# na.to_re('yrwskmbdhvnatgc')",
      lambda { na.to_re('yrwskmbdhvnatgc') }],
    ["# Bio::NucleicAcid.to_re('yrwskmbdhvnatgc')",
      lambda { Bio::NucleicAcid.to_re('yrwskmbdhvnatgc') }],
    ["# na.weight('A')",
      lambda { na.weight('A') }],
    ["# Bio::NucleicAcid.weight('A')",
      lambda { Bio::NucleicAcid.weight('A') }],
    ["# na.weight('atgc')",
      lambda { na.weight('atgc') }],
    ["# Bio::NucleicAcid.weight('atgc')",
      lambda { Bio::NucleicAcid.weight('atgc') }],
  ]

  demos.each do |label, demo|
    puts label
    p demo.call
  end

end
data/lib/bio/db.rb ADDED
@@ -0,0 +1,340 @@
1
+ #
2
+ # = bio/db.rb - common API for database parsers
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2002, 2005
5
+ # KATAYAMA Toshiaki <k@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: db.rb,v 0.31 2005/12/07 11:23:51 k Exp $
9
+ #
10
+ # == On-demand parsing and cache
11
+ #
12
+ # The flatfile parsers (subclasses of Bio::DB) split the original entry
13
+ # into a Hash and store the hash in the @orig instance variable. Detailed
14
+ # parsing is delayed until a method is called that requires further
15
+ # parsing of the content of the @orig hash. Fully parsed data is cached
16
+ # separately in another hash, @data.
17
+ #
18
+ # == Guidelines for developers creating a new database class
19
+ #
20
+ # --- Bio::DB.new(entry)
21
+ #
22
+ # The 'new' method should accept the entire entry in one String and
23
+ # return the parsed database object.
24
+ #
25
+ # --- Bio::DB#entry_id
26
+ #
27
+ # Database classes should implement the following methods if appropriate:
28
+ #
29
+ # * entry_id
30
+ # * definition
31
+ #
32
+ # Every sub class should define the following constants if appropriate:
33
+ #
34
+ # * DELIMITER (RS)
35
+ # * entry separator of the flatfile of the database.
36
+ # * RS (= record separator) is an alias for the DELIMITER in short.
37
+ #
38
+ # * TAGSIZE
39
+ # * length of the tag field in the FORTRAN-like format.
40
+ #
41
+ # |<- tag ->||<- data ---->|
42
+ # ENTRY_ID A12345
43
+ # DEFINITION Hoge gene of the Pokemonia pikachuae
44
+ #
45
+ # === Template of the sub class
46
+ #
47
+ # module Bio
48
+ # class Hoge < DB
49
+ #
50
+ # DELIMITER = RS = "\n//\n"
51
+ # TAGSIZE = 12 # You can omit this line if not needed
52
+ #
53
+ # def initialize(entry)
54
+ # end
55
+ #
56
+ # def entry_id
57
+ # end
58
+ #
59
+ # end # class Hoge
60
+ # end # module Bio
61
+ #
62
+ # === Recommended method names for sub classes
63
+ #
64
+ # In general, a method name should be in the singular form when it returns
65
+ # a single Object (including the case when the Object is a String), and in
66
+ # the plural form when it returns an Array of such Objects. Which form of
67
+ # the method name can be used depends on the database class.
68
+ #
69
+ # For example, GenBank has several REFERENCE fields in one entry, so define
70
+ # Bio::GenBank#references and this method should return an Array of the
71
+ # Reference objects. On the other hand, MEDLINE has one REFERENCE information
72
+ # per one entry, so define Bio::MEDLINE#reference method and this should
73
+ # return a Reference object.
74
+ #
75
+ # The method names used in the sub classes should be taken from the following
76
+ # list if appropriate:
77
+ #
78
+ # --- entry_id #=> String
79
+ #
80
+ # The entry identifier.
81
+ #
82
+ # --- definition #=> String
83
+ #
84
+ # The description of the entry.
85
+ #
86
+ # --- reference #=> Bio::Reference
87
+ # --- references #=> Array of Bio::Reference
88
+ #
89
+ # The reference field(s) of the entry.
90
+ #
91
+ # --- dblink #=> String
92
+ # --- dblinks #=> Array of String
93
+ #
94
+ # The link(s) to the other database entry.
95
+ #
96
+ # --- naseq #=> Bio::Sequence::NA
97
+ #
98
+ # The DNA/RNA sequence of the entry.
99
+ #
100
+ # --- nalen #=> Integer
101
+ #
102
+ # The length of the DNA/RNA sequence of the entry.
103
+ #
104
+ # --- aaseq #=> Bio::Sequence::AA
105
+ #
106
+ # The amino acid sequence of the entry.
107
+ #
108
+ # --- aalen #=> Integer
109
+ #
110
+ # The length of the amino acid sequence of the entry.
111
+ #
112
+ # --- seq #=> Bio::Sequence::NA or Bio::Sequence::AA
113
+ #
114
+ # Returns an appropriate sequence object.
115
+ #
116
+ # --- position #=> String
117
+ #
118
+ # The position of the sequence in the entry or in the genome (depends on
119
+ # the database).
120
+ #
121
+ # --- locations #=> Bio::Locations
122
+ #
123
+ # Returns Bio::Locations.new(position).
124
+ #
125
+ # --- division #=> String
126
+ #
127
+ # The sub division name of the database.
128
+ #
129
+ # * Example:
130
+ # * EST, VRL etc. for GenBank
131
+ # * PATTERN, RULE etc. for PROSITE
132
+ #
133
+ # --- date #=> String
134
+ #
135
+ # The date of the entry.
136
+ # Should we use Date (by ParseDate) instead of String?
137
+ #
138
+ # --- gene #=> String
139
+ # --- genes #=> Array of String
140
+ #
141
+ # The name(s) of the gene.
142
+ #
143
+ # --- organism #=> String
144
+ #
145
+ # The name of the organism.
146
+ #
147
+ #--
148
+ #
149
+ # This library is free software; you can redistribute it and/or
150
+ # modify it under the terms of the GNU Lesser General Public
151
+ # License as published by the Free Software Foundation; either
152
+ # version 2 of the License, or (at your option) any later version.
153
+ #
154
+ # This library is distributed in the hope that it will be useful,
155
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
156
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
157
+ # Lesser General Public License for more details.
158
+ #
159
+ # You should have received a copy of the GNU Lesser General Public
160
+ # License along with this library; if not, write to the Free Software
161
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
162
+ #
163
+ #++
164
+ #
165
+
166
+ require 'bio/sequence'
167
+ require 'bio/reference'
168
+ require 'bio/feature'
169
+
170
module Bio

  # Base class of the database entry parsers.  The methods below read the
  # @orig hash (tag => raw field text), the @data hash (cache of parsed
  # fields) and @tagsize (width of the tag column); DB itself does not set
  # them, the NCBIDB and EMBLDB subclasses do so in their initialize.
  class DB

    # Opens +filename+ through Bio::FlatFile.open, passing this class as
    # the database class along with the given mode and block.
    def self.open(filename, *mode, &block)
      Bio::FlatFile.open(self, filename, *mode, &block)
    end

    # Returns an entry identifier as a String.  This method must be
    # implemented in every database class by overriding this method.
    def entry_id
      raise NotImplementedError
    end

    # Returns a list of the top level tags of the entry as an Array of String.
    def tags
      @orig.keys
    end

    # Returns true or false - whether the entry contains the field of the
    # given tag name.
    def exists?(tag)
      @orig.include?(tag)
    end

    # Returns an intact field of the tag as a String.
    def get(tag)
      @orig[tag]
    end

    # Similar to the get method, however, fetch returns the content of the
    # field without its tag and with any extra white spaces stripped.
    # When +skip+ is given, that many leading lines of the field are
    # skipped before the content is extracted.
    def fetch(tag, skip = 0)
      field = @orig[tag].split(/\n/, skip + 1).last.to_s
      # drop up to @tagsize leading characters (the tag column) of each line
      truncate(field.gsub(/^.{0,#{@tagsize}}/,''))
    end


    private

    # Returns a String in which successive white spaces are replaced by one
    # space and the result is stripped.  Returns "" when +str+ is nil.
    def truncate(str)
      if str
        str.gsub(/\s+/, ' ').strip
      else
        ""
      end
    end

    # Returns a tag name of the field as a String ("" when +str+ is nil).
    def tag_get(str)
      if str
        str[0,@tagsize].strip
      else
        ""
      end
    end

    # Returns a String of the field without a tag name ("" when +str+ is
    # nil).  The first @tagsize characters are removed from +str+ in place
    # and the trimmed +str+ itself is returned.
    def tag_cut(str)
      if str
        str[0,@tagsize] = ''
        # An assignment expression evaluates to its right-hand side (''),
        # so the trimmed string has to be returned explicitly.
        str
      else
        ""
      end
    end

    # Returns the content of the field as a String like the fetch method.
    # Furthermore, field_fetch stores the result in the @data hash.
    def field_fetch(tag, skip = 0)
      unless @data[tag]
        @data[tag] = fetch(tag, skip)
      end
      return @data[tag]
    end

    # Returns an Array containing each line of the field without a tag.
    # lines_fetch also stores the result in the @data hash.
    def lines_fetch(tag)
      unless @data[tag]
        @data[tag] = get(tag).split(/\n/).map{ |l| tag_cut(l) }
      end
      @data[tag]
    end

  end # class DB


  # Stores a NCBI style (GenBank, KEGG etc.) entry.
  class NCBIDB < DB

    autoload :Common, 'bio/db/genbank/common'

    # The entire entry is passed as a String.  The length of the tag field is
    # passed as an Integer.  Parses the entry roughly by the entry2hash method
    # and returns a database object.
    def initialize(entry, tagsize)
      @tagsize = tagsize
      @orig = entry2hash(entry.strip)   # Hash of the original entry
      @data = {}                        # Hash of the parsed entry
    end

    private

    # Splits an entry into an Array of Strings at the level of top tags.
    # A new element starts at every line beginning with a letter or a '/'.
    def toptag2array(str)
      sep = "\001"
      str.gsub(/\n([A-Za-z\/])/, "\n#{sep}\\1").split(sep)
    end

    # Splits a field into an Array of Strings at the level of sub tags.
    # A new element starts at every line that begins with 1 to @tagsize-1
    # white space characters followed by a non white space character.
    def subtag2array(str)
      sep = "\001"
      str.gsub(/\n(\s{1,#{@tagsize-1}}\S)/, "\n#{sep}\\1").split(sep)
    end

    # Returns the contents of the entry as a Hash with the top level tags as
    # its keys.  Fields sharing the same tag are concatenated; the Hash
    # returns '' for a tag that does not exist.
    def entry2hash(entry)
      hash = Hash.new('')

      fields = toptag2array(entry)

      fields.each do |field|
        tag = tag_get(field)
        hash[tag] += field
      end
      return hash
    end

  end # class NCBIDB


  # Class for KEGG databases.  Inherits a NCBIDB class.
  class KEGGDB < NCBIDB
  end


  # Stores an EMBL style (EMBL, TrEMBL, Swiss-Prot etc.) entry.
  class EMBLDB < DB

    autoload :Common, 'bio/db/embl/common'

    # The entire entry is passed as a String.  The length of the tag field is
    # passed as an Integer.  Parses the entry roughly by the entry2hash method
    # and returns a database object.
    def initialize(entry, tagsize)
      @tagsize = tagsize
      @orig = entry2hash(entry.strip)   # Hash of the original entry
      @data = {}                        # Hash of the parsed entry
    end

    private

    # Returns the contents of the entry as a Hash.  Lines are grouped by
    # their tag: 'XX' lines are skipped and every tag made of 'R' plus at
    # least one more character is stored under the single key 'R'.  The
    # Hash returns '' for a tag that does not exist.
    def entry2hash(entry)
      hash = Hash.new('')
      entry.each_line do |line|
        tag = tag_get(line)
        next if tag == 'XX'
        tag = 'R' if tag =~ /^R./       # Reference lines
        hash[tag] += line
      end
      return hash
    end

  end # class EMBLDB

end # module Bio
340
+