bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,417 @@
1
+ require 'bio/reference'
2
+ module Bio
3
+
4
+ #
5
+ # bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
6
+ #
7
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: rebase.rb,v 1.2 2005/12/13 15:02:41 trevor Exp $
11
+ #
12
+ #
13
+ #--
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28
+ #
29
+ #++
30
+ #
31
+ #
32
+
33
+ =begin rdoc
34
+ bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
35
+
36
+ == Synopsis
37
+
38
+ Bio::REBASE provides utilities for interacting with REBASE data in EMBOSS
39
+ format. REBASE is the Restriction Enzyme Database, more information
40
+ can be found here:
41
+ * http://rebase.neb.com
42
+
43
+ EMBOSS formatted files located at:
44
+ * http://rebase.neb.com/rebase/rebase.f37.html
45
+
46
+ These files are the same as the "emboss_?.???" files located at:
47
+ * ftp://ftp.neb.com/pub/rebase/
48
+
49
+ To easily get started with the data you can simply type this command at your shell prompt:
50
+ wget ftp://ftp.neb.com/pub/rebase/emboss*
51
+
52
+
53
+ == Usage
54
+
55
+ require 'bio/db/rebase'
56
+ require 'pp'
57
+
58
+ enz = File.read('emboss_e')
59
+ ref = File.read('emboss_r')
60
+ sup = File.read('emboss_s')
61
+
62
+ # When creating a new instance of Bio::REBASE
63
+ # the contents of the enzyme file must be passed.
64
+ # The references and suppliers file contents
65
+ # may also be passed.
66
+ rebase = Bio::REBASE.new( enz )
67
+ rebase = Bio::REBASE.new( enz, ref )
68
+ rebase = Bio::REBASE.new( enz, ref, sup )
69
+
70
+ # The 'read' class method allows you to read in files
71
+ # that are REBASE EMBOSS formatted
72
+ rebase = Bio::REBASE.read( 'emboss_e' )
73
+ rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
74
+ rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
75
+
76
+ # The data loaded may be saved in YAML format
77
+ rebase.save_yaml( 'enz.yaml' )
78
+ rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
79
+ rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
80
+
81
+ # YAML formatted files can also be read with the
82
+ # class method 'load_yaml'
83
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
84
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
85
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
86
+
87
+ pp rebase.enzymes[0..4] # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
88
+ pp rebase['AarI'].pattern # "CACCTGC"
89
+ pp rebase['AarI'].blunt? # false
90
+ pp rebase['AarI'].organism # "Arthrobacter aurescens SS2-322"
91
+ pp rebase['AarI'].source # "A. Janulaitis"
92
+ pp rebase['AarI'].primary_strand_cut1 # 11
93
+ pp rebase['AarI'].primary_strand_cut2 # 0
94
+ pp rebase['AarI'].complementary_strand_cut1 # 15
95
+ pp rebase['AarI'].complementary_strand_cut2 # 0
96
+ pp rebase['AarI'].suppliers # ["F"]
97
+ pp rebase['AarI'].supplier_names # ["Fermentas International Inc."]
98
+
99
+ pp rebase['AarI'].isoschizomers # Currently none stored in the references file
100
+ pp rebase['AarI'].methylation # ""
101
+
102
+ pp rebase['EcoRII'].methylation # "2(5)"
103
+ pp rebase['EcoRII'].suppliers # ["F", "J", "M", "O", "S"]
104
+ pp rebase['EcoRII'].supplier_names # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
105
+ # "Roche Applied Science", "Toyobo Biochemicals",
106
+ # "Sigma Chemical Corporation"]
107
+
108
+ # Number of enzymes in the database
109
+ pp rebase.size # 673
110
+ pp rebase.enzymes.size # 673
111
+
112
+ rebase.each do |name, info|
113
+ pp "#{name}: #{info.methylation}" unless info.methylation.empty?
114
+ end
115
+
116
+
117
+ == Author
118
+ Trevor Wennblom <trevor@corevx.com>
119
+
120
+
121
+ == Copyright
122
+ Copyright (C) 2005 Trevor Wennblom
123
+ Licensed under the same terms as BioRuby.
124
+
125
+ =end
126
+
127
class REBASE
  autoload(:YAML, 'yaml')

  # A Hash whose entries can also be reached through same-named methods.
  # * Allows hash[:key]= to be accessed like hash.key=
  # * Allows hash[:key] to be accessed like hash.key
  #
  # The accessor is defined on the receiver's class the first time it is
  # used, so later calls do not go through method_missing again.
  class DynamicMethod_Hash < Hash
    # Defines and invokes a reader (+name+) or writer (+name=+) for the key.
    # The argument count is validated *before* the accessor is defined so
    # that the first call behaves exactly like every later call.
    #
    # Raises ArgumentError when a reader receives arguments or a writer does
    # not receive exactly one.
    def method_missing(method_id, *args)
      name = method_id.to_s

      if name[-1, 1] == '='
        unless args.size == 1
          raise ArgumentError, "wrong number of arguments (#{args.size} for 1)"
        end
        key = name[0..-2].to_sym
        self.class.class_eval do
          define_method(method_id) { |value| self[key] = value }
        end
        __send__(method_id, args[0])
      else
        unless args.empty?
          raise ArgumentError, "wrong number of arguments (#{args.size} for 0)"
        end
        self.class.class_eval do
          define_method(method_id) { self[method_id] }
        end
        __send__(method_id)
      end
    end

    # Only advertise readers for keys that are actually stored; answering
    # true for every name would mislead duck-typing probes.
    def respond_to_missing?(method_id, include_private = false)
      key?(method_id) || super
    end
  end

  # One enzyme record.  Besides the dynamic accessors it can translate its
  # supplier codes into supplier names.
  class EnzymeEntry < DynamicMethod_Hash
    # Class-wide fallback table (supplier code => supplier name).  Kept for
    # backward compatibility; entries created by REBASE carry their own table.
    @@supplier_data = {}
    def self.supplier_data=(d); @@supplier_data = d; end

    # Attach the supplier table of the owning REBASE instance so that several
    # REBASE objects do not overwrite each other's supplier names.
    # Returns self.
    def bind_supplier_data(table)
      @supplier_table = table
      self
    end

    # Returns the supplier names for the codes stored under :suppliers, in
    # the same order.  Unknown codes yield nil.
    def supplier_names
      table = @supplier_table || @@supplier_data || {}
      (self[:suppliers] || []).map { |code| table[code] }
    end
  end

  # Yields each [name, EnzymeEntry] pair.
  def each
    @data.each { |v| yield v }
  end

  # Make the instantiated class act like a Hash on @data
  # Does the equivalent and more of this:
  #   def []( key ); @data[ key ]; end
  #   def size; @data.size; end
  #
  # Only methods that Hash itself provides are forwarded (with all arguments
  # and the block); anything else raises the usual NoMethodError.
  def method_missing(method_id, *args, &block)
    return super unless Hash.method_defined?(method_id)
    @data.__send__(method_id, *args, &block)
  end

  # Keeps respond_to? consistent with the delegation in method_missing.
  def respond_to_missing?(method_id, include_private = false)
    Hash.method_defined?(method_id) || super
  end

  # All your REBASE are belong to us.
  #
  # * enzyme_lines    - contents of the EMBOSS enzyme file (required)
  # * reference_lines - contents of the references file (optional)
  # * supplier_lines  - contents of the suppliers file (optional)
  # * yaml            - when true the three arguments are taken as already
  #                     parsed data (as produced by save_yaml) instead of
  #                     raw file contents
  #
  # Raw contents may be given as an Array of lines or as a single String.
  def initialize( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false )
    if yaml
      @enzyme_data    = enzyme_lines || {}
      @reference_data = reference_lines
      # A missing supplier file must still leave a usable (empty) table.
      @supplier_data  = supplier_lines || {}
    else
      @enzyme_data    = parse_enzymes(enzyme_lines)
      @reference_data = parse_references(reference_lines)
      @supplier_data  = parse_suppliers(supplier_lines)
    end

    EnzymeEntry.supplier_data = @supplier_data
    setup_enzyme_data
  end

  # List the enzymes available
  def enzymes
    @data.keys.sort
  end

  # Save the current data
  #   rebase.save_yaml( 'enz.yaml' )
  #   rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
  #   rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
  def save_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
    File.open(f_enzyme, 'w') { |f| f.puts YAML.dump(@enzyme_data) }
    File.open(f_reference, 'w') { |f| f.puts YAML.dump(@reference_data) } if f_reference
    File.open(f_supplier, 'w') { |f| f.puts YAML.dump(@supplier_data) } if f_supplier
  end

  # Read REBASE EMBOSS-formatted files
  #   rebase = Bio::REBASE.read( 'emboss_e' )
  #   rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
  #   rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
  def self.read( f_enzyme, f_reference=nil, f_supplier=nil )
    e = IO.readlines(f_enzyme)
    r = f_reference ? IO.readlines(f_reference) : nil
    s = f_supplier ? IO.readlines(f_supplier) : nil
    self.new(e,r,s)
  end

  # Read YAML formatted files
  #   rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
  #   rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
  #   rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
  #
  # NOTE(review): the files are deserialised with YAML.load_file; only use
  # this on files you trust (e.g. ones written by save_yaml).
  def self.load_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
    e = YAML.load_file(f_enzyme)
    r = f_reference ? YAML.load_file(f_reference) : nil
    s = f_supplier ? YAML.load_file(f_supplier) : nil
    self.new(e,r,s,true)
  end

  #########
  protected
  #########

  # Builds @data (enzyme name => EnzymeEntry) from the parsed enzyme data and
  # then merges in the reference data when there is any.
  def setup_enzyme_data
    @data = {}

    @enzyme_data.each do |name, hash|
      entry = EnzymeEntry.new
      entry.bind_supplier_data(@supplier_data)

      entry[:pattern] = hash[:pattern]
      # stored under :blunt? because "entry.blunt?=" is a syntax error
      entry[:blunt?] = (hash[:blunt].to_i == 1)
      entry[:primary_strand_cut1]       = hash[:c1].to_i
      entry[:complementary_strand_cut1] = hash[:c2].to_i
      entry[:primary_strand_cut2]       = hash[:c3].to_i
      entry[:complementary_strand_cut2] = hash[:c4].to_i

      # Set up keys just in case there's no reference data supplied
      [:organism, :isoschizomers,
       :methylation, :source].each { |k| entry[k] = '' }
      entry[:suppliers]  = []
      entry[:references] = []

      @data[name] = entry
    end

    setup_enzyme_and_reference_association
  end

  # Copies organism, isoschizomers, methylation, source, suppliers and
  # references from the reference data into the matching enzyme entries.
  # Reference entries naming an enzyme that is not in the enzyme data are
  # skipped.
  def setup_enzyme_and_reference_association
    return unless @reference_data
    @reference_data.each do |name, hash|
      entry = @data[name]
      next unless entry

      [:organism, :isoschizomers,
       :methylation, :source].each { |k| entry[k] = hash[k] }
      # every character of the suppliers field is one supplier code
      entry[:suppliers]  = hash[:suppliers].to_s.split('')
      entry[:references] = (hash[:references] || []).map { |raw| raw_to_reference(raw) }
    end
  end

  # Yields every non-comment line of +lines+ without its line terminator.
  # +lines+ may be nil (nothing is yielded), an Array of lines, or anything
  # that responds to each_line (e.g. a String).  The caller's strings are
  # never modified.
  def each_data_line( lines )
    return if lines.nil?
    iterator = lines.respond_to?(:each_line) ? :each_line : :each
    lines.__send__(iterator) do |raw|
      next if raw[0, 1] == '#'
      yield raw.chomp
    end
  end

  # data is a hash indexed by the :name of each entry which is also a hash
  # * data[enzyme_name] has the following keys:
  #   :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4
  #   :c1 => First 5' cut
  #   :c2 => First 3' cut
  #   :c3 => Second 5' cut
  #   :c4 => Second 3' cut
  def parse_enzymes( lines )
    data = {}
    each_data_line(lines) do |line|
      a = line.split(' ')
      next if a.empty?   # blank line, nothing to record

      data[ a[0] ] = {
        :name    => a[0],
        :pattern => a[1],
        :len     => a[2],
        :ncuts   => a[3],
        :blunt   => a[4],
        :c1      => a[5],
        :c2      => a[6],
        :c3      => a[7],
        :c4      => a[8]
      }
    end
    data
  end

  # data is a hash indexed by the :name of each entry which is also a hash
  # * data[enzyme_name] has the following keys:
  #   :organism, :isoschizomers, :references, :source, :methylation, :suppliers, :name, :number_of_references
  #
  # An entry is seven header lines followed by its reference lines.  It is
  # stored as soon as the declared number of references has been read, or at
  # the "//" end-of-entry marker, whichever comes first; the marker is what
  # closes entries that declare no references at all.
  def parse_references( lines )
    data   = {}
    header = []
    refs   = []

    store = lambda do
      if header.size == 7
        data[ header[0] ] = {
          :name                 => header[0],
          :organism             => header[1],
          :isoschizomers        => header[2],
          :methylation          => header[3],
          :source               => header[4],
          :suppliers            => header[5],
          :number_of_references => header[6],
          :references           => refs
        }
      end
      # an incomplete header is dropped so the next entry starts clean
      header = []
      refs   = []
    end

    each_data_line(lines) do |line|
      if line[0, 2] == '//'        # End of entry marker
        store.call
      elsif header.size < 7
        header << line
      else
        refs << line
        store.call if refs.size == header[6].to_i
      end
    end
    store.call   # last entry when the file does not end with "//"

    data
  end

  # data is a hash indexed by the supplier code
  #   data[supplier_code]
  #   returns the suppliers name
  def parse_suppliers( lines )
    data = {}
    each_data_line(lines) do |line|
      data[$1] = $2 if line =~ %r{(.+?)\s(.+)}
    end
    data
  end

  # Takes a string in one of the three formats listed below and returns a
  # Bio::Reference object
  # * Possible input styles:
  #   a = 'Inagaki, K., Hikita, T., Yanagidani, S., Nomura, Y., Kishimoto, N., Tano, T., Tanaka, H., (1993) Biosci. Biotechnol. Biochem., vol. 57, pp. 1716-1721.'
  #   b = 'Nekrasiene, D., Lapcinskaja, S., Kiuduliene, L., Vitkute, J., Janulaitis, A., Unpublished observations.'
  #   c = "Grigaite, R., Maneliene, Z., Janulaitis, A., (2002) Nucleic Acids Res., vol. 30."
  def raw_to_reference( line )
    parts = line.split(', ')

    title   = ''
    pages   = ''
    volume  = ''
    year    = ''
    journal = ''

    if parts[-1] == 'Unpublished observations.'
      title = parts.pop.chop          # drop the trailing full stop
    else
      tail = parts.pop.to_s.chop      # "pp. N-M" or "vol. N" without the full stop
      if tail =~ %r{pp\.\s}
        pages  = tail.gsub('pp. ', '')
        volume = parts.pop.to_s
      else
        volume = tail
      end
      volume = volume.gsub('vol. ', '')

      # "(1993) Biosci. Biotechnol. Biochem." => year and journal;
      # both stay nil when the field has another shape
      found   = %r{\((\d+)\)\s(.+)}.match(parts.pop.to_s)
      year    = found ? found[1] : nil
      journal = found ? found[2] : nil
    end

    # What is left alternates "Lastname", "Initials"; join them pairwise.
    authors = []
    last_name = nil
    parts.each do |e|
      if last_name
        authors << "#{last_name}, #{e}"
        last_name = nil
      else
        last_name = e
      end
    end

    ref = {
      'title'   => title,
      'pages'   => pages,
      'volume'  => volume,
      'year'    => year,
      'journal' => journal,
      'authors' => authors,
    }

    Bio::Reference.new(ref)
  end

end # REBASE
417
+ end # Bio
@@ -0,0 +1,387 @@
1
+ #
2
+ # bio/db/transfac.rb - TRANSFAC database class
3
+ #
4
+ # Copyright (C) 2001 KAWASHIMA Shuichi <s@bioruby.org>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: transfac.rb,v 1.10 2005/11/28 04:57:33 k Exp $
21
+ #
22
+
23
+ require "bio/db"
24
+ require "matrix"
25
+
26
+ module Bio
27
+
28
# TRANSFAC flat-file entry.  Fields are addressed by their two-letter line
# tag; each accessor below returns the content of one tag through the
# field_fetch / fetch helpers inherited from EMBLDB.
class TRANSFAC < EMBLDB

  DELIMITER = RS = "\n//\n"
  TAGSIZE = 4

  # entry:: text of one TRANSFAC entry
  def initialize(entry)
    super(entry, TAGSIZE)
  end

  # AC  Accession number                   (1 per entry)
  #
  #  AC  T00001   in the case of FACTOR
  #  AC  M00001   in the case of MATRIX
  #  AC  R00001   in the case of SITE
  #  AC  G000001  in the case of GENE
  #  AC  C00001   in the case of CLASS
  #  AC  00001    in the case of CELL
  #
  def ac
    unless @data['AC']
      @data['AC'] = fetch('AC')
    end
    @data['AC']
  end
  alias entry_id ac

  # DT  Date                               (1 per entry)
  #
  #  DT  DD.MM.YYYY (created); ewi.
  #  DT  DD.MM.YYYY (updated); mpr.
  #
  def dt
    field_fetch('DT')
  end
  alias date dt

  # CC field (aliased as +comment+)
  def cc
    field_fetch('CC')
  end
  alias comment cc

  # OS field (aliased as +org_species+)
  def os
    field_fetch('OS')
  end
  alias org_species os

  # OC field (aliased as +org_class+)
  def oc
    field_fetch('OC')
  end
  alias org_class oc

  # RN field (aliased as +ref_no+)
  def rn
    field_fetch('RN')
  end
  alias ref_no rn

  # RA field (aliased as +ref_authors+)
  def ra
    field_fetch('RA')
  end
  alias ref_authors ra

  # RT field (aliased as +ref_title+)
  def rt
    field_fetch('RT')
  end
  alias ref_title rt

  # RL field (aliased as +ref_data+)
  def rl
    field_fetch('RL')
  end
  alias ref_data rl


  class MATRIX < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # NA  Name of the binding factor
    def na
      field_fetch('NA')
    end

    # DE  Short factor description
    def de
      field_fetch('DE')
    end

    # BF  List of linked factor entries
    def bf
      # tags are upper case, as in SITE#bf and CLASS#bf
      field_fetch('BF')
    end

    # Returns the matrix rows as a Matrix of integers.
    #
    # Rows are the fields whose tag is numeric; they are ordered by the
    # numeric value of the tag and, when several tags share a value, the
    # first one seen wins.  The last two characters of each row are removed
    # before the remainder is split on whitespace and converted with to_i.
    # NOTE(review): presumably those two characters are the representative
    # nucleotide and the blank before it - confirm against the data format.
    def ma
      rows_by_position = {}
      @orig.each do |tag, _content|
        if tag =~ /^0*(\d+)/
          position = $1.to_i
          rows_by_position[position] = fetch(tag) unless rows_by_position[position]
        end
      end

      rows = rows_by_position.keys.sort.map do |position|
        # work on a copy so the string handed out by fetch is left intact
        row = rows_by_position[position].dup
        2.times { row.slice!(-1, 1) }
        row.split(/\s+/).map { |count| count.to_i }
      end

      Matrix[*rows]
    end

    # BA  Statistical basis
    def ba
      field_fetch('BA')
    end

  end


  class SITE < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # TY field
    def ty
      field_fetch('TY')
    end

    # DE field
    def de
      field_fetch('DE')
    end

    # RE field
    def re
      field_fetch('RE')
    end

    # SQ field
    def sq
      field_fetch('SQ')
    end

    # EL field
    def el
      field_fetch('EL')
    end

    # SF field
    def sf
      field_fetch('SF')
    end

    # ST field
    def st
      field_fetch('ST')
    end

    # S1 field
    def s1
      field_fetch('S1')
    end

    # BF field
    def bf
      field_fetch('BF')
    end

    # SO field
    def so
      field_fetch('SO')
    end

    # MM field
    def mm
      field_fetch('MM')
    end

    # DR  Cross-references to other databases  (>=0 per entry)
    def dr
      field_fetch('DR')
    end

  end


  class FACTOR < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # FA  Factor name
    def fa
      field_fetch('FA')
    end

    # SY  Synonyms
    def sy
      field_fetch('SY')
    end

    # DR  Cross-references to other databases  (>=0 per entry)
    def dr
      field_fetch('DR')
    end

    # HO  Homologs (suggested)
    def ho
      field_fetch('HO')
    end

    # CL  Classification (class accession no.; class identifier; decimal
    # CL  classification number.)
    def cl
      field_fetch('CL')
    end

    # SZ  Size (length (number of amino acids); calculated molecular mass
    # SZ  in kDa; experimental molecular mass (or range) in kDa
    # SZ  (experimental method) [Ref]
    def sz
      field_fetch('SZ')
    end

    # SQ  Sequence
    def sq
      field_fetch('SQ')
    end

    # SC  Sequence comment, i. e. source of the protein sequence
    def sc
      field_fetch('SC')
    end

    # FT  Feature table (1st position  last position  feature)
    def ft
      field_fetch('FT')
    end

    # SF  Structural features
    def sf
      field_fetch('SF')
    end

    # CP  Cell specificity (positive)
    def cp
      field_fetch('CP')
    end

    # CN  Cell specificity (negative)
    def cn
      field_fetch('CN')
    end

    # FF  Functional features
    def ff
      field_fetch('FF')
    end

    # IN  Interacting factors (factor accession no.; factor name;
    # IN  biological species.)
    #
    # "in" is a Ruby keyword, so this reader can only be called with an
    # explicit receiver (factor.in).
    def in
      field_fetch('IN')
    end

    # MX  Matrix (matrix accession no.; matrix identifier)
    def mx
      field_fetch('MX')
    end

    # BS  Bound sites (site accession no.; site ID; quality: N; biological
    # BS  species)
    def bs
      field_fetch('BS')
    end

  end


  class CELL < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # CD  Cell description
    def cd
      field_fetch('CD')
    end

  end


  class CLASS < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # CL  Class
    def cl
      field_fetch('CL')
    end

    # SD  Structure description
    def sd
      field_fetch('SD')
    end

    # BF  Factors belonging to this class
    def bf
      field_fetch('BF')
    end

    # DR  PROSITE accession numbers
    def dr
      field_fetch('DR')
    end

  end


  class GENE < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # SD  Short description/name of the gene
    def sd
      field_fetch('SD')
    end

    # DE
    def de
      field_fetch('DE')
    end

    # BC  Bucher promoter
    def bc
      field_fetch('BC')
    end

    # BS  TRANSFAC SITE positions and accession numbers
    def bs
      field_fetch('BS')
    end

    # CO  COMPEL accession number
    def co
      field_fetch('CO')
    end

    # TR  TRRD accession number
    def tr
      field_fetch('TR')
    end

  end

end # class TRANSFAC
+
386
+ end # module Bio
387
+