bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,778 @@
1
+ #
2
+ # bio/io/flatfile/indexer.rb - OBDA flatfile indexer
3
+ #
4
+ # Copyright (C) 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: indexer.rb,v 1.21 2005/09/26 13:00:08 k Exp $
21
+ #
22
+
23
+ require 'bio/io/flatfile/index'
24
+
25
+ module Bio
26
+ class FlatFileIndex
27
+
28
+ module Indexer
29
+
30
# A named index namespace: couples the namespace label with the
# callable used to extract that namespace's key(s) from an entry.
class NameSpace
  attr_reader :name, :proc

  # name::   label of the namespace (used as the lookup key in NameSpaces)
  # method:: object stored unchanged and exposed through #proc
  def initialize(name, method)
    @name, @proc = name, method
  end
end # class NameSpace
37
+
38
# Hash of namespace objects keyed by each object's #name.
class NameSpaces < Hash
  # Every argument is registered under its own #name.
  def initialize(*arg)
    super()
    arg.each { |ns| self[ns.name] = ns }
  end

  # Names of all registered namespaces.
  def names
    keys
  end

  # Registers x under x.name and returns x.
  def add(x)
    self[x.name] = x
  end
  alias << add

  # Iterating a NameSpaces yields the namespace objects, not key/value pairs.
  alias each each_value
end
57
+
58
+ module Parser
59
# Factory: builds the parser object matching +format+.
#
# format:: format name or class; only its #to_s value is inspected
# arg::    forwarded unchanged to the selected parser's constructor
#
# Raises RuntimeError ('unknown or unsupported format') when the
# format is not one of the recognised names.
def self.new(format, *arg)
  key = format.to_s

  if %w[embl Bio::EMBL].include?(key)
    EMBLParser.new(*arg)
  elsif %w[swiss Bio::SPTR Bio::TrEMBL Bio::SwissProt].include?(key)
    SPTRParser.new(*arg)
  elsif %w[genbank Bio::GenBank Bio::RefSeq Bio::DDBJ].include?(key)
    GenBankParser.new(*arg)
  elsif key == 'Bio::GenPept'
    GenPeptParser.new(*arg)
  elsif %w[fasta Bio::FastaFormat].include?(key)
    FastaFormatParser.new(*arg)
  elsif key == 'Bio::FANTOM::MaXML::Sequence'
    MaXMLSequenceParser.new(*arg)
  elsif key == 'Bio::FANTOM::MaXML::Cluster'
    MaXMLClusterParser.new(*arg)
  elsif key == 'Bio::Blast::Default::Report'
    BlastDefaultParser.new(Bio::Blast::Default::Report, *arg)
  elsif key == 'Bio::Blast::Default::Report_TBlast'
    BlastDefaultParser.new(Bio::Blast::Default::Report_TBlast, *arg)
  elsif key == 'Bio::Blast::WU::Report'
    BlastDefaultParser.new(Bio::Blast::WU::Report, *arg)
  elsif key == 'Bio::Blast::WU::Report_TBlast'
    BlastDefaultParser.new(Bio::Blast::WU::Report_TBlast, *arg)
  else
    raise 'unknown or unsupported format'
  end
end
87
+
88
# Base class of the format-specific indexer parsers.
#
# A subclass supplies a NAMESTYLE table (NameSpaces) describing the
# namespaces it can extract, and sets #format / #dbclass through the
# protected writers.  The indexing driver then calls #open_flatfile,
# #each, #parse_primary, #parse_secondary and #close_flatfile.
class TemplateParser
  NAMESTYLE = NameSpaces.new

  def initialize
    @namestyle = self.class::NAMESTYLE
    @secondary = NameSpaces.new
    @errorlog = []
    # When true, #each re-raises an error after logging it.
    @fatal = false
  end
  attr_reader :primary, :secondary, :format, :dbclass
  attr_reader :errorlog

  # Selects the primary namespace.
  #
  # name:: a NameSpace object, or a key of this parser's NAMESTYLE table
  #
  # Returns the selected NameSpace; raises RuntimeError when the name
  # is not known.
  def set_primary_namespace(name)
    DEBUG.print "set_primary_namespace: #{name.inspect}\n"
    if name.is_a?(NameSpace) then
      @primary = name
    else
      @primary = @namestyle[name]
    end
    raise 'unknown primary namespace' unless @primary
    @primary
  end

  # Registers secondary namespaces.
  #
  # names:: NameSpace objects and/or keys of this parser's NAMESTYLE table
  #
  # NameSpace objects are registered directly (they used to be dropped
  # silently); other values are looked up in NAMESTYLE and raise
  # RuntimeError when unknown.  Always returns true.
  def add_secondary_namespaces(*names)
    DEBUG.print "add_secondary_namespaces: #{names.inspect}\n"
    names.each do |x|
      ns = x.is_a?(NameSpace) ? x : @namestyle[x]
      raise 'unknown secondary namespace' unless ns
      @secondary << ns
    end
    true
  end

  # administration of a single flatfile
  #
  # fileid:: identifier remembered for the file (readable via #fileid)
  # file::   path handed to Bio::FlatFile.open
  def open_flatfile(fileid, file)
    @fileid = fileid
    @flatfilename = file
    DEBUG.print "fileid=#{fileid} file=#{@flatfilename.inspect}\n"
    @flatfile = Bio::FlatFile.open(@dbclass, file, 'rb')
    @flatfile.raw = nil
    @entry = nil
  end
  attr_reader :fileid

  # Iterates over the entries of the opened flatfile, yielding the
  # position at which each entry started and the length of its raw text.
  # The current entry is kept in @entry for #parse_primary and
  # #parse_secondary.
  #
  # RuntimeError and NameError raised by the caller's block are logged
  # to #errorlog and swallowed, unless @fatal is set, in which case the
  # error is re-raised.
  def each
    pos = @flatfile.pos
    @flatfile.each do |x|
      @entry = x
      len = @flatfile.entry_raw.length
      begin
        yield pos, len
      rescue RuntimeError, NameError => evar
        DEBUG.print "Caught error: #{evar.inspect}\n"
        DEBUG.print "in #{@flatfilename.inspect} position #{pos}\n"
        DEBUG.print "===begin===\n"
        DEBUG.print @flatfile.entry_raw.to_s.chomp
        DEBUG.print "\n===end===\n"
        @errorlog << [ evar, @flatfilename, pos ]
        if @fatal then
          DEBUG.print "Fatal error occurred, stop creating index...\n"
          raise evar
        else
          DEBUG.print "This entry shall be incorrectly indexed.\n"
        end
      end #rescue
      pos = @flatfile.pos
    end
  end

  # Returns the primary key of the current entry.
  # Raises RuntimeError unless the key is a non-empty String.
  def parse_primary
    r = self.primary.proc.call(@entry)
    unless r.is_a?(String) and r.length > 0
      #@fatal = true
      raise 'primary id must be a non-void string (skipped this entry)'
    end
    r
  end

  # Yields (namespace name, key) for every non-empty secondary key of
  # the current entry.
  def parse_secondary
    self.secondary.each do |x|
      p = x.proc.call(@entry)
      # Several namespace procs return a single String.  String#each
      # exists only on Ruby 1.8, so wrap anything that cannot be
      # iterated instead of failing with NoMethodError.
      p = [ p ] unless p.respond_to?(:each)
      p.each do |y|
        yield x.name, y if y.length > 0
      end
    end
  end

  def close_flatfile
    DEBUG.print "close flatfile #{@flatfilename.inspect}\n"
    @flatfile.close
  end

  protected
  attr_writer :format, :dbclass
end #class TemplateParser
183
+
184
# Parser used to index GenBank-style flatfiles.
class GenBankParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('VERSION',   Proc.new { |entry| entry.acc_version }),
    NameSpace.new('LOCUS',     Proc.new { |entry| entry.entry_id }),
    NameSpace.new('ACCESSION', Proc.new { |entry| entry.accessions }),
    NameSpace.new('GI',
                  Proc.new { |entry| entry.gi.to_s.gsub(/\AGI\:/, '') })
  )
  PRIMARY = 'VERSION'

  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to every NAMESTYLE
  #             namespace other than the primary one)
  def initialize(pri_name = nil, sec_names = nil)
    super()
    self.format = 'genbank'
    self.dbclass = Bio::GenBank
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= @namestyle.names.reject { |n| n == primary.name }
    add_secondary_namespaces(*sec_names)
  end

  # Opens the file, then positions the stream at the start of the first
  # line matching /^LOCUS / (or at the last read position if none).
  def open_flatfile(fileid, file)
    super
    @flatfile.pos = 0
    pos = nil
    loop do
      pos = @flatfile.pos
      line = @flatfile.gets
      break if !line || line =~ /^LOCUS /
    end
    @flatfile.pos = pos
  end
end # class GenBankParser
217
+
218
# Same as GenBankParser except that entries are read as Bio::GenPept.
class GenPeptParser < GenBankParser
  def initialize(*arg)
    super
    self.dbclass = Bio::GenPept
  end
end # class GenPeptParser
224
+
225
# Parser used to index EMBL-style flatfiles.
class EMBLParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('ID', Proc.new { |entry| entry.entry_id }),
    NameSpace.new('AC', Proc.new { |entry| entry.accessions }),
    NameSpace.new('SV', Proc.new { |entry| entry.sv }),
    NameSpace.new('DR', Proc.new { |entry|
      # gather every DR value, flatten, keep those longer than one
      refs = []
      entry.dr.each_value { |v| refs << v }
      refs.flatten!
      refs.find_all { |ref| ref.length > 1 }
    })
  )
  PRIMARY = 'ID'
  SECONDARY = [ 'AC', 'SV' ]

  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to the SECONDARY list of
  #             the receiver's own class)
  def initialize(pri_name = nil, sec_names = nil)
    super()
    self.format = 'embl'
    self.dbclass = Bio::EMBL
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= self.class::SECONDARY
    add_secondary_namespaces(*sec_names)
  end
end # class EMBLParser
250
+
251
# EMBLParser variant for the 'swiss' format: entries are read as
# Bio::SPTR and only 'AC' is a default secondary namespace.
class SPTRParser < EMBLParser
  SECONDARY = [ 'AC' ]

  def initialize(*arg)
    super
    self.format = 'swiss'
    self.dbclass = Bio::SPTR
  end
end # class SPTRParser
259
+
260
# Parser used to index FASTA files.  The default primary namespace,
# 'UNIQUE', has no extractor proc; its keys are generated from the
# file name and a per-file counter.
class FastaFormatParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('UNIQUE', nil),
    NameSpace.new('entry_id',  Proc.new { |entry| entry.entry_id }),
    NameSpace.new('accession', Proc.new { |entry| entry.accessions }),
    NameSpace.new('id_string',
                  Proc.new { |entry| entry.identifiers.id_strings }),
    NameSpace.new('word',
                  Proc.new { |entry| entry.identifiers.words })
  )
  PRIMARY = 'UNIQUE'
  SECONDARY = [ 'entry_id', 'accession', 'id_string', 'word' ]

  # Builds "<file name>:<counter>" and advances the counter.
  def unique_primary_key
    key = "#{@flatfilename}:#{@count}"
    @count += 1
    key
  end
  private :unique_primary_key

  # Returns the primary key of the current entry: the extractor's
  # result when the primary namespace has a proc (it must be a
  # non-empty String), a generated unique key otherwise.
  def parse_primary
    extractor = self.primary.proc
    return unique_primary_key unless extractor

    key = extractor.call(@entry)
    if !key.is_a?(String) || key.length <= 0
      #@fatal = true
      raise 'primary id must be a non-void string (skipped this entry)'
    end
    key
  end

  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to the SECONDARY list of
  #             the receiver's own class)
  def initialize(pri_name = nil, sec_names = nil)
    super()
    self.format = 'fasta'
    self.dbclass = Bio::FastaFormat
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= self.class::SECONDARY
    add_secondary_namespaces(*sec_names)
  end

  # Opens the file, resets the unique-key counter, then positions the
  # stream at the start of the first line beginning with '>'.
  def open_flatfile(fileid, file)
    super
    @count = 1
    @flatfilename_base = File.basename(@flatfilename)
    @flatfile.pos = 0
    pos = nil
    loop do
      pos = @flatfile.pos
      line = @flatfile.gets
      break if !line || line =~ /^\>/
    end
    @flatfile.pos = pos
  end
end # class FastaFormatParser
317
+
318
# Parser used to index Bio::FANTOM::MaXML::Sequence entries.
class MaXMLSequenceParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('id',    Proc.new { |entry| entry.entry_id }),
    NameSpace.new('altid', Proc.new { |entry| entry.id_strings }),
    NameSpace.new('gene_ontology', Proc.new { |entry|
      go = entry.annotations.get_all_by_qualifier('gene_ontology')
      go.collect { |ann| ann.anntext }
    }),
    NameSpace.new('datasrc', Proc.new { |entry|
      # each data source is recorded both in full and with the part
      # before the first '|' removed; result is sorted and de-duplicated
      sources = []
      entry.annotations.each do |ann|
        ann.datasrc.each do |src|
          sources << src.split('|', 2)[-1]
          sources << src
        end
      end
      sources.sort.uniq
    })
  )
  PRIMARY = 'id'
  SECONDARY = [ 'altid', 'gene_ontology', 'datasrc' ]

  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to the SECONDARY list of
  #             the receiver's own class)
  def initialize(pri_name = nil, sec_names = nil)
    super()
    self.format = 'raw'
    self.dbclass = Bio::FANTOM::MaXML::Sequence
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= self.class::SECONDARY
    add_secondary_namespaces(*sec_names)
  end
end # class MaXMLSequenceParser
353
+
354
# Parser used to index Bio::FANTOM::MaXML::Cluster entries.  The
# 'datasrc' and 'gene_ontology' keys of a cluster are the merged,
# sorted and de-duplicated keys that MaXMLSequenceParser extracts from
# each of the cluster's sequences.
class MaXMLClusterParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('id',    Proc.new { |entry| entry.entry_id }),
    NameSpace.new('altid', Proc.new { |entry| entry.sequences.id_strings }),
    NameSpace.new('datasrc', Proc.new { |entry|
      per_seq = entry.sequences.collect do |seq|
        MaXMLSequenceParser::NAMESTYLE['datasrc'].proc.call(seq)
      end
      per_seq.flatten.sort.uniq
    }),
    NameSpace.new('gene_ontology', Proc.new { |entry|
      per_seq = entry.sequences.collect do |seq|
        MaXMLSequenceParser::NAMESTYLE['gene_ontology'].proc.call(seq)
      end
      per_seq.flatten.sort.uniq
    })
  )
  PRIMARY = 'id'
  SECONDARY = [ 'altid', 'gene_ontology', 'datasrc' ]

  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to the SECONDARY list of
  #             the receiver's own class)
  def initialize(pri_name = nil, sec_names = nil)
    super()
    self.format = 'raw'
    self.dbclass = Bio::FANTOM::MaXML::Cluster
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= self.class::SECONDARY
    add_secondary_namespaces(*sec_names)
  end
end # class MaXMLClusterParser
390
+
391
# Parser used to index BLAST default-format reports.  The report class
# to parse with is supplied by the caller.
class BlastDefaultParser < TemplateParser
  NAMESTYLE = NameSpaces.new(
    NameSpace.new('QUERY', Proc.new { |report| report.query_def }),
    NameSpace.new('query_id', Proc.new { |report|
      # identifiers found in the query definition line, plus its first
      # whitespace-delimited word
      qdef = report.query_def.to_s
      ids = Bio::FastaDefline.new(qdef).id_strings
      ids << report.query_def.to_s.split(/\s+/, 2)[0]
      ids
    }),
    NameSpace.new('hit', Proc.new { |report|
      # for every hit: identifiers of its definition, the definition
      # itself, and the definition's first word
      per_hit = report.hits.collect do |hit|
        ids = Bio::FastaDefline.new(hit.definition.to_s).id_strings
        ids << hit.definition
        ids << hit.definition.to_s.split(/\s+/, 2)[0]
        ids
      end
      per_hit.flatten!
      per_hit
    })
  )
  PRIMARY = 'QUERY'
  SECONDARY = [ 'query_id', 'hit' ]

  # klass::     report class used to parse entries
  # pri_name::  primary namespace (defaults to PRIMARY)
  # sec_names:: secondary namespaces (defaults to every NAMESTYLE
  #             namespace other than the primary one)
  def initialize(klass, pri_name = nil, sec_names = nil)
    super()
    self.format = 'raw'
    self.dbclass = klass
    set_primary_namespace(pri_name || PRIMARY)
    sec_names ||= @namestyle.names.reject { |n| n == primary.name }
    add_secondary_namespaces(*sec_names)
  end

  # Opens the file, lets the flatfile autodetect its class (falling
  # back to #dbclass when detection yields nothing), then positions the
  # stream at the start of the first line matching /^T?BLAST/.
  def open_flatfile(fileid, file)
    super
    @flatfile.rewind
    @flatfile.dbclass = nil
    @flatfile.autodetect
    @flatfile.dbclass = self.dbclass unless @flatfile.dbclass
    @flatfile.rewind
    pos = nil
    loop do
      pos = @flatfile.pos
      line = @flatfile.gets
      break if !line || line =~ /^T?BLAST/
    end
    @flatfile.pos = pos
  end
end # class BlastDefaultParser
439
+
440
+ end #module Parser
441
+
442
# Creates a new Berkeley DB databank called +name+ from +files+.
#
# name::    databank name passed to DataBank.new
# parser::  parser supplying format, primary and secondary namespaces
# options:: not used here; forwarded to addindex_bdb
# files::   flatfiles to index
#
# Returns true.  Raises RuntimeError when the BDB constant is not
# defined.
def self.makeindexBDB(name, parser, options, *files)
  # options are not used in this method
  raise RuntimeError, "Berkeley DB support not found" unless defined?(BDB)

  DEBUG.print "makeing BDB DataBank...\n"
  bank = DataBank.new(name, MAGIC_BDB)
  bank.format = parser.format
  bank.fileids.add(*files)
  bank.fileids.recalc

  bank.primary = parser.primary.name
  bank.secondary = parser.secondary.names

  DEBUG.print "writing config.dat, config, fileids ...\n"
  bank.write('wb', BDBdefault::flag_write)

  DEBUG.print "reading files...\n"

  every_file = 0...files.size
  addindex_bdb(bank, BDBdefault::flag_write, every_file, parser, options)
  bank.close
  true
end #def
466
+
467
# Indexes the files selected by +need_update+ into a BDB databank.
#
# db::          databank whose primary/secondary index files are written
# flag::        open flag assigned to every index file
# need_update:: enumerable of file ids (indexes into db.fileids)
# parser::      parser used to read the flatfiles
# options::     unused
#
# Returns true.
def self.addindex_bdb(db, flag, need_update, parser, options)
  DEBUG.print "reading files...\n"

  primary = db.primary
  primary.file.close
  primary.file.flag = flag

  db.secondary.each_files { |sec|
    sec.file.close
    sec.file.flag = flag
    sec.file.open
    sec.file.close
  }

  need_update.each { |fileid|
    parser.open_flatfile(fileid, db.fileids[fileid].filename)
    parser.each { |pos, len|
      key = parser.parse_primary
      #primary.file.add_exclusive(key, [ fileid, pos, len ])
      primary.file.add_overwrite(key, [ fileid, pos, len ])
      parser.parse_secondary { |ns_name, sec_key|
        db.secondary[ns_name].file.add_nr(sec_key, key)
      }
    }
    parser.close_flatfile
  }
  true
end #def
498
+
499
# Creates a new flat databank called +name+ from +files+.
#
# name::    databank name passed to DataBank.new
# parser::  parser supplying format, primary and secondary namespaces
# options:: forwarded to addindex_flat
# files::   flatfiles to index
#
# Returns true.
def self.makeindexFlat(name, parser, options, *files)
  DEBUG.print "makeing flat/1 DataBank using temporary files...\n"

  bank = DataBank.new(name, nil)
  bank.format = parser.format
  bank.fileids.add(*files)
  bank.primary = parser.primary.name
  bank.secondary = parser.secondary.names
  bank.fileids.recalc
  DEBUG.print "writing DabaBank...\n"
  bank.write('wb')

  every_file = 0...files.size
  addindex_flat(bank, :new, every_file, parser, options)
  bank.close
  true
end #def
515
+
516
# Indexes the files selected by +need_update+ into a flat databank.
#
# Keys are first collected as tab-separated lines in temporary files
# (one for the primary namespace, one per secondary namespace); each
# temporary file is then handed to the matching index file's
# import_tsv_files together with the sort procedure chosen by
# chose_sort_proc.
#
# db::          databank whose index files receive the keys
# mode::        :new or :add; forwarded to chose_sort_proc and
#               import_tsv_files
# need_update:: enumerable of file ids (indexes into db.fileids)
# parser::      parser used to read the flatfiles
# options::     hash; 'sort_program' selects the sort implementation
#
# Returns false when there is nothing to update, true otherwise.
# The temporary files are removed even when indexing raises.
def self.addindex_flat(db, mode, need_update, parser, options)
  require 'tempfile'
  prog = options['sort_program']

  return false if need_update.to_a.size == 0

  DEBUG.print "prepare temporary files...\n"
  tempbase = "bioflat#{rand(10000)}-"
  pfile = nil
  sfiles = {}
  begin
    pfile = Tempfile.open(tempbase + 'primary-')
    DEBUG.print "open temporary file #{pfile.path.inspect}\n"
    parser.secondary.names.each do |x|
      sfiles[x] = Tempfile.open(tempbase + 'secondary-')
      DEBUG.print "open temporary file #{sfiles[x].path.inspect}\n"
    end

    DEBUG.print "reading files...\n"
    need_update.each do |fileid|
      filename = db.fileids[fileid].filename
      parser.open_flatfile(fileid, filename)
      parser.each do |pos, len|
        p = parser.parse_primary
        pfile << "#{p}\t#{fileid}\t#{pos}\t#{len}\n"
        #DEBUG.print "#{p} #{fileid} #{pos} #{len}\n"
        parser.parse_secondary do |sn, sp|
          sfiles[sn] << "#{sp}\t#{p}\n"
          #DEBUG.print "#{sp} #{p}\n"
        end
      end
      parser.close_flatfile
    end

    sort_proc = chose_sort_proc(prog, mode)
    # close without unlinking so the data is flushed and the path
    # stays readable by the importer
    pfile.close(false)
    DEBUG.print "sorting primary (#{parser.primary.name})...\n"
    db.primary.file.import_tsv_files(true, mode, sort_proc, pfile.path)

    parser.secondary.names.each do |x|
      DEBUG.print "sorting secondary (#{x})...\n"
      sfiles[x].close(false)
      db.secondary[x].file.import_tsv_files(false, mode, sort_proc,
                                            sfiles[x].path)
    end
  ensure
    # close and unlink every temporary file, including on error paths
    ([ pfile ] + sfiles.values).compact.each { |t| t.close(true) }
  end
  true
end #def
564
+
565
# External sort program tried when no sort program is specified.
DEFAULT_SORT = '/usr/bin/sort'

# Selects the sort procedure used when importing keys into a flat index.
#
# prog:: 'builtin', 'hs' or 'lm' (case-insensitive) selects the
#        internal sort routine; nil or '' selects DEFAULT_SORT when it
#        is executable and the internal routine otherwise; any other
#        value is taken as the path of an external sort program
# mode:: :new selects a plain external sort; any other value selects
#        an external merge sort (only relevant for external programs)
#
# Returns whatever the corresponding Flat_1::FlatMappingFile factory
# method returns.
def self.chose_sort_proc(prog, mode = :new)
  case prog
  when /^builtin$/i, /^hs$/i, /^lm$/i
    DEBUG.print "sort: internal sort routine\n"
    # This branch used to call the undefined local `mapfile`, which
    # raised NameError; use the same factory as the fallback below.
    sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
  when nil, ''
    if FileTest.executable?(DEFAULT_SORT)
      DEBUG.print "sort: #{DEFAULT_SORT}\n"
      if mode == :new then
        sort_proc = Flat_1::FlatMappingFile::external_sort_proc(DEFAULT_SORT)
      else
        sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(DEFAULT_SORT)
      end
    else
      DEBUG.print "sort: internal sort routine\n"
      sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
    end
  else
    DEBUG.print "sort: #{prog}\n"
    if mode == :new then
      sort_proc = Flat_1::FlatMappingFile::external_sort_proc(prog)
    else
      sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(prog)
    end
  end
  sort_proc
end
593
+
594
# Updates the existing databank +name+.
#
# name::    name of the databank to open
# parser::  parser to use, or nil to derive one from the databank's
#           recorded format and the detected classes of the files
# options:: hash (nil is treated as empty); a true 'renew' value
#           rebuilds the whole index from the still-existing old files
#           plus +files+; remaining keys are forwarded to the indexing
#           methods
# files::   flatfiles to add
#
# Returns true.  Raises RuntimeError on format mismatch, on an
# undeterminable or unsupported format, and on an unsupported index
# type.
def self.update_index(name, parser, options, *files)
  options ||= {}
  db = DataBank.open(name)

  if parser then
    raise 'file format mismatch' if db.format != parser.format
  else

    # Detection failures leave the corresponding class as nil.
    dbclass_orig = nil
    begin
      dbclass_orig =
        Bio::FlatFile.autodetect_file(db.fileids[0].filename)
    rescue TypeError, Errno::ENOENT
    end
    dbclass_new = nil
    begin
      dbclass_new =
        Bio::FlatFile.autodetect_file(files[0])
    rescue TypeError, Errno::ENOENT
    end

    case db.format
    when 'swiss', 'embl'
      parser = Parser.new(db.format)
      if dbclass_new and dbclass_new != parser.dbclass
        raise 'file format mismatch'
      end
    when 'genbank'
      # `a = b or c` parses as `(a = b) or c`, so the class of the new
      # file was never used as a fallback; `||` gives the intended
      # "old file's class, else new file's class".
      dbclass = dbclass_orig || dbclass_new
      if dbclass == Bio::GenBank or dbclass == Bio::GenPept
        parser = Parser.new(dbclass)
      elsif !dbclass then
        raise 'cannnot determine format. please specify manually.'
      else
        raise 'file format mismatch'
      end
      if dbclass_new and dbclass_new != parser.dbclass
        raise 'file format mismatch'
      end
    else
      raise 'unsupported format'
    end
  end

  parser.set_primary_namespace(db.primary.name)
  parser.add_secondary_namespaces(*db.secondary.names)

  if options['renew'] then
    newfiles = db.fileids.filenames.find_all do |x|
      FileTest.exist?(x)
    end
    newfiles.concat(files)
    # drop duplicates, keeping the first occurrence of each file name
    newfiles3 = newfiles.uniq
    t = db.index_type
    db.close
    case t
    when MAGIC_BDB
      Indexer::makeindexBDB(name, parser, options, *newfiles3)
    when MAGIC_FLAT
      Indexer::makeindexFlat(name, parser, options, *newfiles3)
    else
      raise 'Unsupported index type'
    end
    return true
  end

  # Already-registered files that fail their check are re-indexed;
  # files already registered are not added a second time.
  need_update = []
  newfiles = files.dup
  db.fileids.cache_all
  db.fileids.each_with_index do |f, i|
    need_update << i unless f.check
    newfiles.delete(f.filename)
  end

  b = db.fileids.size
  begin
    db.fileids.recalc
  rescue Errno::ENOENT => evar
    # a registered file has disappeared: rebuild the index instead
    DEBUG.print "Error: #{evar}\n"
    DEBUG.print "assumed --renew option\n"
    db.close
    options = options.dup
    options['renew'] = true
    update_index(name, parser, options, *files)
    return true
  end
  # add new files
  db.fileids.add(*newfiles)
  db.fileids.recalc

  # newly added files receive the ids following the existing ones
  need_update.concat((b...(b + newfiles.size)).to_a)

  DEBUG.print "writing DabaBank...\n"
  db.write('wb', BDBdefault::flag_append)

  case db.index_type
  when MAGIC_BDB
    addindex_bdb(db, BDBdefault::flag_append,
                 need_update, parser, options)
  when MAGIC_FLAT
    addindex_flat(db, :add, need_update, parser, options)
  else
    raise 'Unsupported index type'
  end

  db.close
  true
end #def
704
+ end #module Indexer
705
+
706
+ ##############################################################
707
# Maps a format name to the class used to parse entries of that format.
#
# format_string:: format name, matched case-insensitively as a substring
#                 ('genbank', 'genpept', 'embl', 'sptr' or 'swiss',
#                 'fasta'); the first matching pattern wins
#
# Returns the matching database class.
# Raises RuntimeError for any other value.
def self.formatstring2class(format_string)
  # The original tested `format` (i.e. Kernel#format called without
  # arguments) instead of the parameter, so every call raised
  # ArgumentError.
  case format_string
  when /genbank/i
    Bio::GenBank
  when /genpept/i
    Bio::GenPept
  when /embl/i
    Bio::EMBL
  when /sptr/i, /swiss/i
    # 'swiss' is the format name SPTRParser records in a databank
    Bio::SPTR
  when /fasta/i
    Bio::FastaFormat
  else
    raise "Unsupported format : #{format_string}"
  end
end
723
+
724
# Creates index files (a databank) for the given flatfiles.
#
# is_bdb::  true builds a Berkeley DB index, false a flat index
# dbname::  name of the databank to create
# format::  format name, or nil/false to autodetect from the first file
# options:: hash or nil; reads 'primary_namespace',
#           'secondary_namespaces' and
#           'additional_secondary_namespaces', and is forwarded to the
#           index builder
# files::   flatfiles to index
#
# Raises RuntimeError ("Cannot determine format") when autodetection
# yields nothing.
def self.makeindex(is_bdb, dbname, format, options, *files)
  dbclass =
    if format
      formatstring2class(format)
    else
      detected = Bio::FlatFile.autodetect_file(files[0])
      raise "Cannot determine format" unless detected
      DEBUG.print "file format is #{detected}\n"
      detected
    end

  options ||= {}
  parser = Indexer::Parser.new(dbclass,
                               options['primary_namespace'],
                               options['secondary_namespaces'])

  #if /(EMBL|SPTR)/ =~ dbclass.to_s then
  #a = [ 'DR' ]
  #parser.add_secondary_namespaces(*a)
  #end
  extra = options['additional_secondary_namespaces']
  parser.add_secondary_namespaces(*extra) if extra

  builder = is_bdb ? :makeindexBDB : :makeindexFlat
  Indexer.send(builder, dbname, parser, options, *files)
end #def makeindex
753
+
754
# Adds entries to an existing databank.
#
# dbname::  name of the databank to update
# format::  format name, or nil/false to let Indexer.update_index
#           derive the parser from the databank itself
# options:: forwarded to Indexer.update_index
# files::   flatfiles to add
#
# Returns the result of Indexer.update_index.
def self.update_index(dbname, format, options, *files)
  if format then
    # The original passed the undefined local `dbclass` here, raising
    # NameError whenever a format was given; resolve the class from
    # the format name the same way makeindex does.
    parser = Indexer::Parser.new(formatstring2class(format))
  else
    parser = nil
  end
  Indexer::update_index(dbname, parser, options, *files)
end #def update_index
762
+
763
+ end #class FlatFileIndex
764
+ end #module Bio
765
+
766
+ =begin
767
+
768
+ = Bio::FlatFileIndex
769
+
770
+ --- Bio::FlatFileIndex.makeindex(is_bdb, dbname, format, options, *files)
771
+
772
+ Create index files (called a databank) of given files.
773
+
774
+ --- Bio::FlatFileIndex.update_index(dbname, format, options, *files)
775
+
776
+ Add entries to databank.
777
+
778
+ =end