bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,417 @@
1
+ require 'bio/reference'
2
+ module Bio
3
+
4
+ #
5
+ # bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
6
+ #
7
+ # Copyright:: Copyright (C) 2005 Trevor Wennblom <trevor@corevx.com>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: rebase.rb,v 1.2 2005/12/13 15:02:41 trevor Exp $
11
+ #
12
+ #
13
+ #--
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28
+ #
29
+ #++
30
+ #
31
+ #
32
+
33
+ =begin rdoc
34
+ bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
35
+
36
+ == Synopsis
37
+
38
+ Bio::REBASE provides utilities for interacting with REBASE data in EMBOSS
39
+ format. REBASE is the Restriction Enzyme Database, more information
40
+ can be found here:
41
+ * http://rebase.neb.com
42
+
43
+ EMBOSS formatted files located at:
44
+ * http://rebase.neb.com/rebase/rebase.f37.html
45
+
46
+ These files are the same as the "emboss_?.???" files located at:
47
+ * ftp://ftp.neb.com/pub/rebase/
48
+
49
+ To easily get started with the data you can simply type this command at your shell prompt:
50
+ wget ftp://ftp.neb.com/pub/rebase/emboss*
51
+
52
+
53
+ == Usage
54
+
55
+ require 'bio/db/rebase'
56
+ require 'pp'
57
+
58
+ enz = File.read('emboss_e')
59
+ ref = File.read('emboss_r')
60
+ sup = File.read('emboss_s')
61
+
62
+ # When creating a new instance of Bio::REBASE
63
+ # the contents of the enzyme file must be passed.
64
+ # The references and suppliers file contents
65
+ # may also be passed.
66
+ rebase = Bio::REBASE.new( enz )
67
+ rebase = Bio::REBASE.new( enz, ref )
68
+ rebase = Bio::REBASE.new( enz, ref, sup )
69
+
70
+ # The 'read' class method allows you to read in files
71
+ # that are REBASE EMBOSS formatted
72
+ rebase = Bio::REBASE.read( 'emboss_e' )
73
+ rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
74
+ rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
75
+
76
+ # The data loaded may be saved in YAML format
77
+ rebase.save_yaml( 'enz.yaml' )
78
+ rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
79
+ rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
80
+
81
+ # YAML formatted files can also be read with the
82
+ # class method 'load_yaml'
83
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
84
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
85
+ rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
86
+
87
+ pp rebase.enzymes[0..4] # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
88
+ pp rebase['AarI'].pattern # "CACCTGC"
89
+ pp rebase['AarI'].blunt? # false
90
+ pp rebase['AarI'].organism # "Arthrobacter aurescens SS2-322"
91
+ pp rebase['AarI'].source # "A. Janulaitis"
92
+ pp rebase['AarI'].primary_strand_cut1 # 11
93
+ pp rebase['AarI'].primary_strand_cut2 # 0
94
+ pp rebase['AarI'].complementary_strand_cut1 # 15
95
+ pp rebase['AarI'].complementary_strand_cut2 # 0
96
+ pp rebase['AarI'].suppliers # ["F"]
97
+ pp rebase['AarI'].supplier_names # ["Fermentas International Inc."]
98
+
99
+ pp rebase['AarI'].isoschizomers # Currently none stored in the references file
100
+ pp rebase['AarI'].methylation # ""
101
+
102
+ pp rebase['EcoRII'].methylation # "2(5)"
103
+ pp rebase['EcoRII'].suppliers # ["F", "J", "M", "O", "S"]
104
+ pp rebase['EcoRII'].supplier_names # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
105
+ # "Roche Applied Science", "Toyobo Biochemicals",
106
+ # "Sigma Chemical Corporation"]
107
+
108
+ # Number of enzymes in the database
109
+ pp rebase.size # 673
110
+ pp rebase.enzymes.size # 673
111
+
112
+ rebase.each do |name, info|
113
+ pp "#{name}: #{info.methylation}" unless info.methylation.empty?
114
+ end
115
+
116
+
117
+ == Author
118
+ Trevor Wennblom <trevor@corevx.com>
119
+
120
+
121
+ == Copyright
122
+ Copyright (C) 2005 Trevor Wennblom
123
+ Licensed under the same terms as BioRuby.
124
+
125
+ =end
126
+
127
class REBASE
  autoload(:YAML, 'yaml')

  # A Hash whose symbol keys can also be read and written as methods.
  class DynamicMethod_Hash < Hash
    # Define a writer or reader the first time it is called:
    # * Allows hash[:key]= to be accessed like hash.key=
    # * Allows hash[:key] to be accessed like hash.key
    # The accessor is defined on the receiver's class, so subsequent calls
    # are dispatched directly instead of coming back through here.
    def method_missing(method_id, *args)
      name = method_id.to_s
      if name[-1, 1] == '='
        key = name[0..-2].to_sym
        self.class.class_eval do
          define_method(method_id) { |value| self[key] = value }
        end
        __send__(method_id, args[0])
      else
        self.class.class_eval do
          define_method(method_id) { self[method_id] }
        end
        __send__(method_id)
      end
    end
  end

  # One enzyme record. Fields are stored as hash keys and exposed as
  # methods through DynamicMethod_Hash.
  class EnzymeEntry < DynamicMethod_Hash
    # Class-wide fallback table mapping supplier code => supplier name.
    @@supplier_data = {}
    def self.supplier_data=(d); @@supplier_data = d || {}; end

    # Supplier table belonging to the REBASE instance that built this entry.
    # When set it takes precedence over the class-wide table, so several
    # REBASE instances loaded from different supplier files do not clobber
    # each other's names.
    attr_writer :supplier_table

    # Returns the supplier names for this entry's supplier codes, in the
    # same order as #suppliers. Unknown codes map to nil.
    def supplier_names
      table = (defined?(@supplier_table) && @supplier_table) || @@supplier_data
      suppliers.map { |code| table[code] }
    end
  end

  # Yields each [name, entry] pair of the enzyme table.
  def each
    @data.each { |pair| yield pair }
  end

  # Make the instantiated class act like a Hash on @data
  # Does the equivalent and more of this:
  # def []( key ); @data[ key ]; end
  # def size; @data.size; end
  #
  # Only public Hash methods are delegated; anything else is passed to
  # super and raises NoMethodError as usual. The generated delegator takes
  # any number of arguments plus an optional block, so it keeps working on
  # the second and later calls (e.g. rebase.size called twice).
  def method_missing(method_id, *args, &block)
    return super unless Hash.public_method_defined?(method_id)
    self.class.class_eval do
      define_method(method_id) { |*a, &b| @data.__send__(method_id, *a, &b) }
    end
    __send__(method_id, *args, &block)
  end

  # Builds the enzyme table.
  #
  # enzyme_lines, reference_lines, supplier_lines::
  #   EMBOSS-formatted REBASE data, either as an Array of lines or as one
  #   String holding the whole file. reference_lines and supplier_lines
  #   may be nil.
  # yaml::
  #   when true the three arguments are taken to be already-parsed hashes
  #   (as written by #save_yaml) and are used as-is; nil becomes {}.
  def initialize( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false )
    if yaml
      # Missing tables default to empty hashes, matching what the parsers
      # return for nil input.
      @enzyme_data    = enzyme_lines    || {}
      @reference_data = reference_lines || {}
      @supplier_data  = supplier_lines  || {}
    else
      @enzyme_data    = parse_enzymes(enzyme_lines)
      @reference_data = parse_references(reference_lines)
      @supplier_data  = parse_suppliers(supplier_lines)
    end

    EnzymeEntry.supplier_data = @supplier_data
    setup_enzyme_data
  end

  # List the enzymes available
  def enzymes
    @data.keys.sort
  end

  # Save the current data
  # rebase.save_yaml( 'enz.yaml' )
  # rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
  # rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
  def save_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
    File.open(f_enzyme, 'w') { |f| f.puts YAML.dump(@enzyme_data) }
    File.open(f_reference, 'w') { |f| f.puts YAML.dump(@reference_data) } if f_reference
    File.open(f_supplier, 'w') { |f| f.puts YAML.dump(@supplier_data) } if f_supplier
  end

  # Read REBASE EMBOSS-formatted files
  # rebase = Bio::REBASE.read( 'emboss_e' )
  # rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
  # rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
  def self.read( f_enzyme, f_reference=nil, f_supplier=nil )
    e = IO.readlines(f_enzyme)
    r = f_reference ? IO.readlines(f_reference) : nil
    s = f_supplier ? IO.readlines(f_supplier) : nil
    self.new(e,r,s)
  end

  # Read YAML formatted files
  # rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
  # rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
  # rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
  #
  # NOTE(review): YAML.load_file is applied to caller-supplied paths; only
  # load files you trust. The dumped tables use Symbol keys, and newer Psych
  # releases appear to reject Symbols by default -- confirm against the
  # Psych version in use.
  def self.load_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
    e = YAML.load_file(f_enzyme)
    r = f_reference ? YAML.load_file(f_reference) : nil
    s = f_supplier ? YAML.load_file(f_supplier) : nil
    self.new(e,r,s,true)
  end

  #########
  protected
  #########

  # Accepts either a whole file as one String or an Array of lines and
  # returns something that responds to #each yielding one line at a time.
  def to_lines( input )
    input.is_a?(String) ? input.lines.to_a : input
  end

  # Builds @data (name => EnzymeEntry) from @enzyme_data, then merges in
  # the reference data when there is any.
  def setup_enzyme_data
    @data = {}

    @enzyme_data.each do |name, hash|
      d = EnzymeEntry.new
      d.supplier_table = @supplier_data
      @data[name] = d

      d.pattern = hash[:pattern]
      # d.blunt?= is a syntax error
      d[:blunt?] = (hash[:blunt].to_i == 1)
      d.primary_strand_cut1 = hash[:c1].to_i
      d.complementary_strand_cut1 = hash[:c2].to_i
      d.primary_strand_cut2 = hash[:c3].to_i
      d.complementary_strand_cut2 = hash[:c4].to_i

      # Set up keys just in case there's no reference data supplied
      [:organism, :isoschizomers,
       :methylation, :source].each { |k| d[k] = '' }
      d.suppliers = []
      d.references = []
    end

    setup_enzyme_and_reference_association
  end

  # Copies organism / isoschizomers / methylation / source, the supplier
  # codes and the parsed literature references from @reference_data onto
  # the matching enzyme entries.
  def setup_enzyme_and_reference_association
    return unless @reference_data
    @reference_data.each do |name, hash|
      d = @data[name]
      # The reference file may mention enzymes missing from the enzyme file.
      next unless d

      [:organism, :isoschizomers,
       :methylation, :source].each { |k| d[k] = hash[k] }
      # Each character of the suppliers field is one supplier code.
      d.suppliers = hash[:suppliers].to_s.split('')
      d.references = (hash[:references] || []).map { |raw| raw_to_reference(raw) }
    end
  end

  # data is a hash indexed by the :name of each entry which is also a hash
  # * data[enzyme_name] has the following keys:
  # :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4
  # :c1 => First 5' cut
  # :c2 => First 3' cut
  # :c3 => Second 5' cut
  # :c4 => Second 3' cut
  #
  # Lines starting with '#' and blank lines are ignored. The input is not
  # modified.
  def parse_enzymes( lines )
    data = {}
    return data if lines == nil
    to_lines(lines).each do |line|
      next if line[0, 1] == '#'

      # split(' ') splits on runs of whitespace and drops the line ending
      a = line.split(' ')
      next if a.empty?

      data[ a[0] ] = {
        :name => a[0],
        :pattern => a[1],
        :len => a[2],
        :ncuts => a[3],
        :blunt => a[4],
        :c1 => a[5],
        :c2 => a[6],
        :c3 => a[7],
        :c4 => a[8]
      }
    end
    data
  end

  # data is a hash indexed by the :name of each entry which is also a hash
  # * data[enzyme_name] has the following keys:
  # :organism, :isoschizomers, :references, :source, :methylation, :suppliers, :name, :number_of_references
  #
  # Each entry is seven single-line fields (the seventh being the number of
  # references) followed by that many reference lines. Lines starting with
  # '#' or '//' are skipped; blank lines are significant because a field
  # may be empty. :references is always an Array (empty when the count is
  # 0). The input is not modified.
  def parse_references( lines )
    data = {}
    return data if lines == nil
    index = 1
    h = {}
    references_left = 0

    to_lines(lines).each do |raw|
      next if raw[0, 1] == '#'   # Comment
      next if raw[0, 2] == '//'  # End of entry marker
      line = raw.chomp

      if index <= 7
        h[index] = line
        index += 1
        next unless index == 8
        # Field 7 has just been read: it says how many reference lines follow.
        references_left = line.to_i
        h[8] = []
        next if references_left > 0
        # Zero references: the entry is already complete, store it below.
      else
        h[8] << line
        references_left -= 1
        next if references_left > 0
      end

      data[ h[1] ] = {
        :name => h[1],
        :organism => h[2],
        :isoschizomers => h[3],
        :methylation => h[4],
        :source => h[5],
        :suppliers => h[6],
        :number_of_references => h[7],
        :references => h[8]
      }
      index = 1
      h = {}
    end
    data
  end

  # data is a hash indexed by the supplier code
  # data[supplier_code]
  # returns the suppliers name
  #
  # Lines starting with '#' and lines without a "code name" pair are
  # ignored.
  def parse_suppliers( lines )
    data = {}
    return data if lines == nil
    to_lines(lines).each do |raw|
      next if raw[0, 1] == '#'
      m = %r{(.+?)\s(.+)}.match(raw.chomp)
      data[ m[1] ] = m[2] if m
    end
    data
  end

  # Takes a string in one of the three formats listed below and returns a
  # Bio::Reference object
  # * Possible input styles:
  # a = 'Inagaki, K., Hikita, T., Yanagidani, S., Nomura, Y., Kishimoto, N., Tano, T., Tanaka, H., (1993) Biosci. Biotechnol. Biochem., vol. 57, pp. 1716-1721.'
  # b = 'Nekrasiene, D., Lapcinskaja, S., Kiuduliene, L., Vitkute, J., Janulaitis, A., Unpublished observations.'
  # c = "Grigaite, R., Maneliene, Z., Janulaitis, A., (2002) Nucleic Acids Res., vol. 30."
  def raw_to_reference( line )
    a = line.split(', ')

    if a[-1] == 'Unpublished observations.'
      title = a.pop.chop
      pages = volume = year = journal = ''
    else
      title = ''

      # The last element is either "pp. ..." or "vol. ..."; chop drops the
      # trailing period.
      tail = a.pop.to_s.chop
      if tail =~ %r{pp\.\s}
        pages = tail.gsub('pp. ', '')
        volume = a.pop.to_s
      else
        pages = ''
        volume = tail
      end
      volume = volume.gsub('vol. ', '')

      # "(1993) Journal name"; both values are nil when the pattern is absent
      m = %r{\((\d+)\)\s(.+)}.match(a.pop.to_s)
      year = m ? m[1] : nil
      journal = m ? m[2] : nil
    end

    # What is left alternates surname, initials; an unpaired trailing
    # element is dropped.
    authors = []
    a.each_slice(2) do |surname, initials|
      authors << "#{surname}, #{initials}" if initials
    end

    ref = {
      'title' => title,
      'pages' => pages,
      'volume' => volume,
      'year' => year,
      'journal' => journal,
      'authors' => authors,
    }

    Bio::Reference.new(ref)
  end

end # REBASE
417
+ end # Bio
@@ -0,0 +1,387 @@
1
+ #
2
+ # bio/db/transfac.rb - TRANSFAC database class
3
+ #
4
+ # Copyright (C) 2001 KAWASHIMA Shuichi <s@bioruby.org>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: transfac.rb,v 1.10 2005/11/28 04:57:33 k Exp $
21
+ #
22
+
23
+ require "bio/db"
24
+ require "matrix"
25
+
26
+ module Bio
27
+
28
# TRANSFAC flat-file entry. Each accessor below returns the content of one
# tagged field of the entry through the field_fetch/fetch helpers inherited
# from EMBLDB.
class TRANSFAC < EMBLDB

  DELIMITER = RS = "\n//\n"
  TAGSIZE = 4

  # entry is handed to EMBLDB together with the TAGSIZE used by this format.
  def initialize(entry)
    super(entry, TAGSIZE)
  end

  # AC Accession number (1 per entry)
  #
  # AC T00001 in the case of FACTOR
  # AC M00001 in the case of MATRIX
  # AC R00001 in the case of SITE
  # AC G000001 in the case of GENE
  # AC C00001 in the case of CLASS
  # AC 00001 in the case of CELL
  #
  # The fetched value is memoized in @data['AC'].
  def ac
    @data['AC'] ||= fetch('AC')
  end
  alias entry_id ac

  # DT Date (1 per entry)
  #
  # DT DD.MM.YYYY (created); ewi.
  # DT DD.MM.YYYY (updated); mpr.
  #
  def dt
    field_fetch('DT')
  end
  alias date dt

  # Returns the 'CC' field (aliased as comment).
  def cc
    field_fetch('CC')
  end
  alias comment cc

  # Returns the 'OS' field (aliased as org_species).
  def os
    field_fetch('OS')
  end
  alias org_species os

  # Returns the 'OC' field (aliased as org_class).
  def oc
    field_fetch('OC')
  end
  alias org_class oc

  # Returns the 'RN' field (aliased as ref_no).
  def rn
    field_fetch('RN')
  end
  alias ref_no rn

  # Returns the 'RA' field (aliased as ref_authors).
  def ra
    field_fetch('RA')
  end
  alias ref_authors ra

  # Returns the 'RT' field (aliased as ref_title).
  def rt
    field_fetch('RT')
  end
  alias ref_title rt

  # Returns the 'RL' field (aliased as ref_data).
  def rl
    field_fetch('RL')
  end
  alias ref_data rl


  class MATRIX < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # NA Name of the binding factor
    def na
      field_fetch('NA')
    end

    # DE Short factor description
    def de
      field_fetch('DE')
    end

    # BF List of linked factor entries
    # (the tag is upper case, as in SITE#bf and CLASS#bf)
    def bf
      field_fetch('BF')
    end

    # Builds a Matrix from the fields whose tag starts with digits.
    #
    # For each distinct numeric tag (leading zeros ignored) the first
    # matching field is fetched. Rows are ordered by that number; the last
    # two characters of each fetched line are removed before the rest is
    # split on whitespace and converted to integers.
    def ma
      rows_by_position = {}
      @orig.each_key do |tag|
        next unless tag =~ /^0*(\d+)/
        position = $1.to_i
        rows_by_position[position] = fetch(tag) unless rows_by_position[position]
      end

      rows = rows_by_position.keys.sort.map do |position|
        row = rows_by_position[position]
        # Drop the final character and then the character before it.
        # NOTE(review): presumably a trailing letter and its separator --
        # confirm against the TRANSFAC matrix format.
        row.slice!(-1, 1)
        row.slice!(-1, 1)
        row.split(/\s+/).map { |count| count.to_i }
      end

      Matrix[*rows]
    end

    # BA Statistical basis
    def ba
      field_fetch('BA')
    end

  end


  class SITE < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # Returns the 'TY' field.
    def ty
      field_fetch('TY')
    end

    # Returns the 'DE' field.
    def de
      field_fetch('DE')
    end

    # Returns the 'RE' field.
    def re
      field_fetch('RE')
    end

    # Returns the 'SQ' field.
    def sq
      field_fetch('SQ')
    end

    # Returns the 'EL' field.
    def el
      field_fetch('EL')
    end

    # Returns the 'SF' field.
    def sf
      field_fetch('SF')
    end

    # Returns the 'ST' field.
    def st
      field_fetch('ST')
    end

    # Returns the 'S1' field.
    def s1
      field_fetch('S1')
    end

    # Returns the 'BF' field.
    def bf
      field_fetch('BF')
    end

    # Returns the 'SO' field.
    def so
      field_fetch('SO')
    end

    # Returns the 'MM' field.
    def mm
      field_fetch('MM')
    end

    # DR Cross-references to other databases (>=0 per entry)
    def dr
      field_fetch('DR')
    end

  end


  class FACTOR < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # FA Factor name
    def fa
      field_fetch('FA')
    end

    # SY Synonyms
    def sy
      field_fetch('SY')
    end

    # DR Cross-references to other databases (>=0 per entry)
    def dr
      field_fetch('DR')
    end

    # HO Homologs (suggested)
    def ho
      field_fetch('HO')
    end

    # CL Classification (class accession no.; class identifier; decimal
    # CL classification number.)
    def cl
      field_fetch('CL')
    end

    # SZ Size (length (number of amino acids); calculated molecular mass
    # SZ in kDa; experimental molecular mass (or range) in kDa
    # SZ (experimental method) [Ref]
    def sz
      field_fetch('SZ')
    end

    # SQ Sequence
    def sq
      field_fetch('SQ')
    end

    # SC Sequence comment, i. e. source of the protein sequence
    def sc
      field_fetch('SC')
    end

    # FT Feature table (1st position last position feature)
    def ft
      field_fetch('FT')
    end

    # SF Structural features
    def sf
      field_fetch('SF')
    end

    # CP Cell specificity (positive)
    def cp
      field_fetch('CP')
    end

    # CN Cell specificity (negative)
    def cn
      field_fetch('CN')
    end

    # FF Functional features
    def ff
      field_fetch('FF')
    end

    # IN Interacting factors (factor accession no.; factor name;
    # IN biological species.)
    def in
      field_fetch('IN')
    end

    # MX Matrix (matrix accession no.; matrix identifier)
    def mx
      field_fetch('MX')
    end

    # BS Bound sites (site accession no.; site ID; quality: N; biological
    # BS species)
    def bs
      field_fetch('BS')
    end

  end


  class CELL < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # CD Cell description
    def cd
      field_fetch('CD')
    end

  end


  class CLASS < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # CL Class
    def cl
      field_fetch('CL')
    end

    # SD Structure description
    def sd
      field_fetch('SD')
    end

    # BF Factors belonging to this class
    def bf
      field_fetch('BF')
    end

    # DR PROSITE accession numbers
    def dr
      field_fetch('DR')
    end

  end


  class GENE < TRANSFAC

    def initialize(entry)
      super(entry)
    end

    # SD Short description/name of the gene
    def sd
      field_fetch('SD')
    end

    # DE
    def de
      field_fetch('DE')
    end

    # BC Bucher promoter
    def bc
      field_fetch('BC')
    end

    # BS TRANSFAC SITE positions and accession numbers
    def bs
      field_fetch('BS')
    end

    # CO COMPEL accession number
    def co
      field_fetch('CO')
    end

    # TR TRRD accession number
    def tr
      field_fetch('TR')
    end

  end

end # class TRANSFAC
385
+
386
+ end # module Bio
387
+