bio 0.7.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. data/bin/bioruby +71 -27
  2. data/bin/br_biofetch.rb +5 -17
  3. data/bin/br_bioflat.rb +14 -26
  4. data/bin/br_biogetseq.rb +6 -18
  5. data/bin/br_pmfetch.rb +6 -16
  6. data/doc/Changes-0.7.rd +35 -0
  7. data/doc/KEGG_API.rd +287 -172
  8. data/doc/KEGG_API.rd.ja +273 -160
  9. data/doc/Tutorial.rd +18 -9
  10. data/doc/Tutorial.rd.ja +656 -138
  11. data/lib/bio.rb +6 -24
  12. data/lib/bio/alignment.rb +5 -5
  13. data/lib/bio/appl/blast.rb +132 -98
  14. data/lib/bio/appl/blast/format0.rb +9 -19
  15. data/lib/bio/appl/blast/wublast.rb +5 -18
  16. data/lib/bio/appl/emboss.rb +40 -47
  17. data/lib/bio/appl/hmmer.rb +116 -82
  18. data/lib/bio/appl/hmmer/report.rb +509 -364
  19. data/lib/bio/appl/spidey/report.rb +7 -18
  20. data/lib/bio/data/na.rb +3 -21
  21. data/lib/bio/db.rb +3 -21
  22. data/lib/bio/db/aaindex.rb +147 -52
  23. data/lib/bio/db/embl/common.rb +27 -6
  24. data/lib/bio/db/embl/embl.rb +18 -10
  25. data/lib/bio/db/embl/sptr.rb +87 -67
  26. data/lib/bio/db/embl/swissprot.rb +32 -3
  27. data/lib/bio/db/embl/trembl.rb +32 -3
  28. data/lib/bio/db/embl/uniprot.rb +32 -3
  29. data/lib/bio/db/fasta.rb +327 -289
  30. data/lib/bio/db/medline.rb +25 -4
  31. data/lib/bio/db/nbrf.rb +12 -20
  32. data/lib/bio/db/pdb.rb +4 -1
  33. data/lib/bio/db/pdb/chemicalcomponent.rb +240 -0
  34. data/lib/bio/db/pdb/pdb.rb +13 -8
  35. data/lib/bio/db/rebase.rb +93 -97
  36. data/lib/bio/feature.rb +2 -31
  37. data/lib/bio/io/ddbjxml.rb +167 -139
  38. data/lib/bio/io/fastacmd.rb +89 -56
  39. data/lib/bio/io/flatfile.rb +994 -278
  40. data/lib/bio/io/flatfile/index.rb +257 -194
  41. data/lib/bio/io/flatfile/indexer.rb +37 -29
  42. data/lib/bio/reference.rb +147 -64
  43. data/lib/bio/sequence.rb +57 -417
  44. data/lib/bio/sequence/aa.rb +64 -0
  45. data/lib/bio/sequence/common.rb +175 -0
  46. data/lib/bio/sequence/compat.rb +68 -0
  47. data/lib/bio/sequence/format.rb +134 -0
  48. data/lib/bio/sequence/generic.rb +24 -0
  49. data/lib/bio/sequence/na.rb +189 -0
  50. data/lib/bio/shell.rb +9 -23
  51. data/lib/bio/shell/core.rb +130 -125
  52. data/lib/bio/shell/demo.rb +143 -0
  53. data/lib/bio/shell/{session.rb → interface.rb} +42 -40
  54. data/lib/bio/shell/object.rb +52 -0
  55. data/lib/bio/shell/plugin/codon.rb +4 -22
  56. data/lib/bio/shell/plugin/emboss.rb +23 -0
  57. data/lib/bio/shell/plugin/entry.rb +34 -25
  58. data/lib/bio/shell/plugin/flatfile.rb +5 -23
  59. data/lib/bio/shell/plugin/keggapi.rb +11 -24
  60. data/lib/bio/shell/plugin/midi.rb +5 -23
  61. data/lib/bio/shell/plugin/obda.rb +4 -22
  62. data/lib/bio/shell/plugin/seq.rb +6 -24
  63. data/lib/bio/shell/rails/Rakefile +10 -0
  64. data/lib/bio/shell/rails/app/controllers/application.rb +4 -0
  65. data/lib/bio/shell/rails/app/controllers/shell_controller.rb +94 -0
  66. data/lib/bio/shell/rails/app/helpers/application_helper.rb +3 -0
  67. data/lib/bio/shell/rails/app/models/shell_connection.rb +30 -0
  68. data/lib/bio/shell/rails/app/views/layouts/shell.rhtml +37 -0
  69. data/lib/bio/shell/rails/app/views/shell/history.rhtml +5 -0
  70. data/lib/bio/shell/rails/app/views/shell/index.rhtml +2 -0
  71. data/lib/bio/shell/rails/app/views/shell/show.rhtml +13 -0
  72. data/lib/bio/shell/rails/config/boot.rb +19 -0
  73. data/lib/bio/shell/rails/config/database.yml +85 -0
  74. data/lib/bio/shell/rails/config/environment.rb +53 -0
  75. data/lib/bio/shell/rails/config/environments/development.rb +19 -0
  76. data/lib/bio/shell/rails/config/environments/production.rb +19 -0
  77. data/lib/bio/shell/rails/config/environments/test.rb +19 -0
  78. data/lib/bio/shell/rails/config/routes.rb +19 -0
  79. data/lib/bio/shell/rails/doc/README_FOR_APP +2 -0
  80. data/lib/bio/shell/rails/public/404.html +8 -0
  81. data/lib/bio/shell/rails/public/500.html +8 -0
  82. data/lib/bio/shell/rails/public/dispatch.cgi +10 -0
  83. data/lib/bio/shell/rails/public/dispatch.fcgi +24 -0
  84. data/lib/bio/shell/rails/public/dispatch.rb +10 -0
  85. data/lib/bio/shell/rails/public/favicon.ico +0 -0
  86. data/lib/bio/shell/rails/public/images/icon.png +0 -0
  87. data/lib/bio/shell/rails/public/images/rails.png +0 -0
  88. data/lib/bio/shell/rails/public/index.html +277 -0
  89. data/lib/bio/shell/rails/public/javascripts/controls.js +750 -0
  90. data/lib/bio/shell/rails/public/javascripts/dragdrop.js +584 -0
  91. data/lib/bio/shell/rails/public/javascripts/effects.js +854 -0
  92. data/lib/bio/shell/rails/public/javascripts/prototype.js +1785 -0
  93. data/lib/bio/shell/rails/public/robots.txt +1 -0
  94. data/lib/bio/shell/rails/public/stylesheets/main.css +187 -0
  95. data/lib/bio/shell/rails/script/about +3 -0
  96. data/lib/bio/shell/rails/script/breakpointer +3 -0
  97. data/lib/bio/shell/rails/script/console +3 -0
  98. data/lib/bio/shell/rails/script/destroy +3 -0
  99. data/lib/bio/shell/rails/script/generate +3 -0
  100. data/lib/bio/shell/rails/script/performance/benchmarker +3 -0
  101. data/lib/bio/shell/rails/script/performance/profiler +3 -0
  102. data/lib/bio/shell/rails/script/plugin +3 -0
  103. data/lib/bio/shell/rails/script/process/reaper +3 -0
  104. data/lib/bio/shell/rails/script/process/spawner +3 -0
  105. data/lib/bio/shell/rails/script/process/spinner +3 -0
  106. data/lib/bio/shell/rails/script/runner +3 -0
  107. data/lib/bio/shell/rails/script/server +42 -0
  108. data/lib/bio/shell/rails/test/test_helper.rb +28 -0
  109. data/lib/bio/shell/web.rb +90 -0
  110. data/lib/bio/util/contingency_table.rb +231 -225
  111. data/sample/any2fasta.rb +59 -0
  112. data/test/data/HMMER/hmmpfam.out +64 -0
  113. data/test/data/HMMER/hmmsearch.out +88 -0
  114. data/test/data/aaindex/DAYM780301 +30 -0
  115. data/test/data/aaindex/PRAM900102 +20 -0
  116. data/test/data/bl2seq/cd8a_cd8b_blastp.bl2seq +53 -0
  117. data/test/data/bl2seq/cd8a_p53_e-5blastp.bl2seq +37 -0
  118. data/test/data/blast/{eco:b0002.faa → b0002.faa} +0 -0
  119. data/test/data/blast/{eco:b0002.faa.m0 → b0002.faa.m0} +2 -2
  120. data/test/data/blast/{eco:b0002.faa.m7 → b0002.faa.m7} +1 -1
  121. data/test/data/blast/{eco:b0002.faa.m8 → b0002.faa.m8} +0 -0
  122. data/test/unit/bio/appl/bl2seq/test_report.rb +134 -0
  123. data/test/unit/bio/appl/blast/test_report.rb +15 -12
  124. data/test/unit/bio/appl/blast/test_xmlparser.rb +4 -4
  125. data/test/unit/bio/appl/hmmer/test_report.rb +355 -0
  126. data/test/unit/bio/appl/test_blast.rb +5 -5
  127. data/test/unit/bio/data/test_na.rb +9 -18
  128. data/test/unit/bio/db/pdb/test_pdb.rb +169 -0
  129. data/test/unit/bio/db/test_aaindex.rb +197 -0
  130. data/test/unit/bio/io/test_fastacmd.rb +55 -0
  131. data/test/unit/bio/sequence/test_aa.rb +102 -0
  132. data/test/unit/bio/sequence/test_common.rb +178 -0
  133. data/test/unit/bio/sequence/test_compat.rb +82 -0
  134. data/test/unit/bio/sequence/test_na.rb +242 -0
  135. data/test/unit/bio/shell/plugin/test_seq.rb +29 -19
  136. data/test/unit/bio/test_alignment.rb +15 -7
  137. data/test/unit/bio/test_reference.rb +198 -0
  138. data/test/unit/bio/test_sequence.rb +4 -49
  139. data/test/unit/bio/test_shell.rb +2 -2
  140. metadata +118 -15
  141. data/lib/bio/io/brdb.rb +0 -103
  142. data/lib/bioruby.rb +0 -34
@@ -1,8 +1,47 @@
1
1
  #
2
- # bio/io/fastacmd.rb - NCBI fastacmd wrapper class
2
+ # = bio/io/fastacmd.rb - NCBI fastacmd wrapper class
3
3
  #
4
- # Copyright (C) 2005 Shuji SHIGENOBU <shige@nibb.ac.jp>
5
- # Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2005, 2006
5
+ # Shuji SHIGENOBU <shige@nibb.ac.jp>,
6
+ # Toshiaki Katayama <k@bioruby.org>,
7
+ # Mitsuteru C. Nakao <n@bioruby.org>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: fastacmd.rb,v 1.10 2006/01/28 08:12:21 nakao Exp $
11
+ #
12
+ # == Description
13
+ #
14
+ # Retrieves FASTA formatted sequences from a blast database using
15
+ # NCBI fastacmd command.
16
+ #
17
+ # This class requires 'fastacmd' command and a blast database
18
+ # (formatted using the '-o' option of 'formatdb').
19
+ #
20
+ # == Examples
21
+ #
22
+ # database = ARGV.shift || "/db/myblastdb"
23
+ # entry_id = ARGV.shift || "sp:128U_DROME"
24
+ # ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
25
+ #
26
+ # fastacmd = Bio::Blast::Fastacmd.new(database)
27
+ #
28
+ # entry = fastacmd.get_by_id(entry_id)
29
+ # fastacmd.fetch(entry_id)
30
+ # fastacmd.fetch(ent_list)
31
+ #
32
+ # fastacmd.fetch(ent_list).each do |fasta|
33
+ # puts fasta
34
+ # end
35
+ #
36
+ # == References
37
+ #
38
+ # * NCBI tool
39
+ # ftp://ftp.ncbi.nih.gov/blast/executables/LATEST/ncbi.tar.gz
40
+ #
41
+ # * fastacmd.html
42
+ # http://biowulf.nih.gov/apps/blast/doc/fastacmd.html
43
+ #
44
+ #--
6
45
  #
7
46
  # This library is free software; you can redistribute it and/or
8
47
  # modify it under the terms of the GNU Lesser General Public
@@ -18,7 +57,7 @@
18
57
  # License along with this library; if not, write to the Free Software
19
58
  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
59
  #
21
- # $Id: fastacmd.rb,v 1.8 2005/09/26 13:00:08 k Exp $
60
+ #++
22
61
  #
23
62
 
24
63
  require 'bio/db/fasta'
@@ -28,23 +67,52 @@ require 'bio/command'
28
67
  module Bio
29
68
  class Blast
30
69
 
70
+ # NCBI fastacmd wrapper class
71
+ #
31
72
  class Fastacmd
32
73
 
33
74
  include Enumerable
34
75
  include Bio::Command::Tools
35
76
 
36
- def initialize(db)
37
- @database = db
77
+ # Database file path.
78
+ attr_accessor :database
79
+
80
+ # fastacmd command file path.
81
+ attr_accessor :fastacmd
82
+
83
+ #
84
+ attr_accessor :errorlog
85
+
86
+ # Initialize a fastacmd object.
87
+ #
88
+ # fastacmd = Bio::Blast::Fastacmd.new("/db/myblastdb")
89
+ def initialize(blast_database_file_path)
90
+ @database = blast_database_file_path
38
91
  @fastacmd = 'fastacmd'
39
92
  end
40
- attr_accessor :database, :fastacmd, :errorlog
41
93
 
42
- # get an entry_id and returns a Bio::FastaFormat object
94
+
95
+ # Gets an entry_id and returns a Bio::FastaFormat object.
96
+ #
97
+ # entry_id = "sp:128U_DROME"
98
+ # entry = fastacmd.get_by_id(entry_id)
43
99
  def get_by_id(entry_id)
44
100
  fetch(entry_id).shift
45
101
  end
46
102
 
47
- # get one or more entry_id and returns an Array of Bio::FastaFormat objects
103
+ # Gets one or more entry_ids and returns an Array of Bio::FastaFormat objects.
104
+ #
105
+ # Fastacmd#fetch(entry_id) returns an Array of a Bio::FastaFormat
106
+ # object even when the result is a single entry.
107
+ #
108
+ # p fastacmd.fetch(entry_id)
109
+ #
110
+ # Fastacmd#fetch method also accepts a list of entry_id and returns
111
+ # an Array of Bio::FastaFormat objects.
112
+ #
113
+ # ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
114
+ # p fastacmd.fetch(ent_list)
115
+ #
48
116
  def fetch(list)
49
117
  if list.respond_to?(:join)
50
118
  entry_id = list.join(",")
@@ -59,13 +127,20 @@ class Fastacmd
59
127
  end
60
128
  end
61
129
 
130
+ # Iterates each entry.
131
+ #
132
+ # You can also iterate on all sequences in the database!
133
+ # fastacmd.each do |fasta|
134
+ # p [ fasta.definition[0..30], fasta.seq.size ]
135
+ # end
136
+ #
62
137
  def each_entry
63
138
  cmd = [ @fastacmd, '-d', @database, '-D', 'T' ]
64
139
  call_command_local(cmd) do |inn, out|
65
140
  inn.close_write
66
141
  Bio::FlatFile.open(Bio::FastaFormat, out) do |f|
67
- f.each_entry do |e|
68
- yield e
142
+ f.each_entry do |entry|
143
+ yield entry
69
144
  end
70
145
  end
71
146
  end
@@ -73,51 +148,9 @@ class Fastacmd
73
148
  end
74
149
  alias each each_entry
75
150
 
76
- end
151
+ end # class Fastacmd
77
152
 
78
- end
79
- end
80
-
81
-
82
- if __FILE__ == $0
83
-
84
- database = ARGV.shift || "/db/myblastdb"
85
- entry_id = ARGV.shift || "sp:128U_DROME"
86
- ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
87
-
88
- fastacmd = Bio::Blast::Fastacmd.new(database)
89
-
90
- ### Retrieve one sequence
91
- entry = fastacmd.get_by_id(entry_id)
92
-
93
- # Fastacmd#get_by_id(entry_id) returns a Bio::FastaFormat object.
94
- p entry
95
-
96
- # Bio::FastaFormat becomes a fasta format string when printed by puts.
97
- puts entry
98
-
99
- # Fastacmd#fetch(entry_id) returns an Array of a Bio::FastaFormat
100
- # object even when the result is a single entry.
101
- p fastacmd.fetch(entry_id)
102
-
103
- ### Retrieve more sequences
104
-
105
- # Fastacmd#fetch method also accepts a list of entry_id and returns
106
- # an Array of Bio::FastaFormat objects.
107
- p fastacmd.fetch(ent_list)
108
-
109
- # So, you can iterate on the results.
110
- fastacmd.fetch(ent_list).each do |fasta|
111
- puts fasta
112
- end
113
-
114
-
115
- ### Iterates on all entries
116
-
117
- # You can also iterate on all sequences in the database!
118
- fastacmd.each do |fasta|
119
- p [ fasta.definition[0..30], fasta.seq.size ]
120
- end
153
+ end # class Blast
154
+ end # module Bio
121
155
 
122
- end
123
156
 
@@ -1,32 +1,19 @@
1
1
  #
2
2
  # = bio/io/flatfile.rb - flatfile access wrapper class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
- # License:: LGPL
4
+ # Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
6
5
  #
7
- #--
8
- # This library is free software; you can redistribute it and/or
9
- # modify it under the terms of the GNU Lesser General Public
10
- # License as published by the Free Software Foundation; either
11
- # version 2 of the License, or (at your option) any later version.
6
+ # License:: Ruby's
12
7
  #
13
- # This library is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
- # Lesser General Public License for more details.
8
+ # $Id: flatfile.rb,v 1.46 2006/02/22 10:01:27 ngoto Exp $
17
9
  #
18
- # You should have received a copy of the GNU Lesser General Public
19
- # License along with this library; if not, write to the Free Software
20
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
- #++
22
- #
23
- # $Id: flatfile.rb,v 1.41 2005/11/01 15:34:45 ngoto Exp $
24
10
  #
25
11
  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
26
12
  # It acts like a IO object.
27
13
  # It can automatically detect data format, and users do not need to tell
28
14
  # the class what the data is.
29
15
  #
16
+ require 'tsort'
30
17
 
31
18
  module Bio
32
19
 
@@ -38,60 +25,407 @@ module Bio
38
25
 
39
26
  include Enumerable
40
27
 
28
+ # Wrapper for an IO (or IO-like) object.
29
+ # It can input with a buffer.
30
+ class BufferedInputStream
31
+ # Creates a new input stream wrapper
32
+ def initialize(io, path)
33
+ @io = io
34
+ @path = path
35
+ # initialize prefetch buffer
36
+ @buffer = ''
37
+ @path = path
38
+ end
39
+
40
+ # Creates a new input stream wrapper from the given IO object.
41
+ def self.for_io(io)
42
+ begin
43
+ path = io.path
44
+ rescue NameError
45
+ path = nil
46
+ end
47
+ self.new(io, path)
48
+ end
49
+
50
+ # Creates a new input stream wrapper to open file _filename_
51
+ # by using File.open.
52
+ # *arg is passed to File.open.
53
+ #
54
+ # Like File.open, a block can be accepted.
55
+ def self.open_file(filename, *arg)
56
+ if block_given? then
57
+ File.open(filename, *arg) do |fobj|
58
+ yield self.new(fobj, filename)
59
+ end
60
+ else
61
+ fobj = File.open(filename, *arg)
62
+ self.new(fobj, filename)
63
+ end
64
+ end
65
+
66
+ # Creates a new input stream wrapper from URI specified as _uri_.
67
+ # by using OpenURI.open_uri or URI#open.
68
+ # _uri_ must be a String or URI object.
69
+ # *arg is passed to OpenURI.open_uri or URI#open.
70
+ #
71
+ # Like OpenURI.open_uri, it can accept a block.
72
+ def self.open_uri(uri, *arg)
73
+ if uri.kind_of?(URI)
74
+ if block_given?
75
+ uri.open(*arg) do |fobj|
76
+ yield self.new(fobj, uri.to_s)
77
+ end
78
+ else
79
+ fobj = uri.open(*arg)
80
+ self.new(fobj, uri.to_s)
81
+ end
82
+ else
83
+ if block_given?
84
+ OpenURI.open_uri(uri, *arg) do |fobj|
85
+ yield self.new(fobj, uri)
86
+ end
87
+ else
88
+ fobj = OpenURI.open_uri(uri, *arg)
89
+ self.new(fobj, uri)
90
+ end
91
+ end
92
+ end
93
+
94
+ # Pathname, filename or URI to open the object.
95
+ # Like File#path, returned value isn't normalized.
96
+ attr_reader :path
97
+
98
+ # Converts to IO object if possible
99
+ def to_io
100
+ @io.to_io
101
+ end
102
+
103
+ # Closes the IO object if possible
104
+ def close
105
+ @io.close
106
+ end
107
+
108
+ # Rewinds the IO object if possible
109
+ # Internal buffer in this wrapper is cleared.
110
+ def rewind
111
+ r = @io.rewind
112
+ @buffer = ''
113
+ r
114
+ end
115
+
116
+ # Returns current file position
117
+ def pos
118
+ @io.pos - @buffer.size
119
+ end
120
+
121
+ # Sets current file position if possible
122
+ # Internal buffer in this wrapper is cleared.
123
+ def pos=(p)
124
+ r = (@io.pos = p)
125
+ @buffer = ''
126
+ r
127
+ end
128
+
129
+ # Returns true if end-of-file. Otherwise, returns false.
130
+ #
131
+ # Note that it returns false if internal buffer in this wrapper
132
+ # is not empty.
133
+ def eof?
134
+ if @buffer.size > 0
135
+ false
136
+ else
137
+ @io.eof?
138
+ end
139
+ end
140
+
141
+ # Same as IO#gets.
142
+ def gets(io_rs = $/)
143
+ if @buffer.size > 0
144
+ if io_rs == nil then
145
+ r = @buffer + @io.gets(nil).to_s
146
+ @buffer = ''
147
+ else
148
+ if io_rs == '' then
149
+ sp_rs = /\n\n/n
150
+ sp_rs_orig = "\n\n"
151
+ else
152
+ sp_rs = Regexp.new(Regexp.escape(io_rs, 'n'), 0, 'n')
153
+ sp_rs_orig = io_rs
154
+ end
155
+ a = @buffer.split(sp_rs, 2)
156
+ if a.size > 1 then
157
+ r = a[0] + sp_rs_orig
158
+ @buffer = a[1]
159
+ else
160
+ @buffer << @io.gets(io_rs).to_s
161
+ a = @buffer.split(sp_rs, 2)
162
+ if a.size > 1 then
163
+ r = a[0] + sp_rs_orig
164
+ @buffer = a[1].to_s
165
+ else
166
+ r = @buffer
167
+ @buffer = ''
168
+ end
169
+ end
170
+ end
171
+ r
172
+ else
173
+ @io.gets(io_rs)
174
+ end
175
+ end
176
+
177
+ # Pushes back given str to the internal buffer.
178
+ # Returns nil.
179
+ # str must be read previously with the wrapper object.
180
+ #
181
+ # Note that in current implementation, the str can be everything,
182
+ # but please don't depend on it.
183
+ #
184
+ def ungets(str)
185
+ @buffer = str + @buffer
186
+ nil
187
+ end
188
+
189
+ # Same as IO#getc.
190
+ def getc
191
+ if @buffer.size > 0 then
192
+ r = @buffer[0]
193
+ @buffer = @buffer[1..-1]
194
+ else
195
+ r = @io.getc
196
+ end
197
+ r
198
+ end
199
+
200
+ # Pushes back one character into the internal buffer.
201
+ # Unlike IO#ungetc, it can be called more than one time.
202
+ def ungetc(c)
203
+ @buffer = sprintf("%c", c) + @buffer
204
+ nil
205
+ end
206
+
207
+ # Gets current prefetch buffer
208
+ def prefetch_buffer
209
+ @buffer
210
+ end
211
+
212
+ # It does @io.gets, and adds the returned string
213
+ # to the internal buffer, and returns the string.
214
+ def prefetch_gets(*arg)
215
+ r = @io.gets(*arg)
216
+ @buffer << r if r
217
+ r
218
+ end
219
+
220
+ # It does @io.readpartial, and adds the returned string
221
+ # to the internal buffer, and returns the string.
222
+ def prefetch_readpartial(*arg)
223
+ r = @io.readpartial(*arg)
224
+ @buffer << r if r
225
+ r
226
+ end
227
+
228
+ # Skips space characters in the stream.
229
+ # returns nil.
230
+ def skip_spaces
231
+ ws = { ?\s => true, ?\n => true, ?\r => true, ?\t => true }
232
+ while r = self.getc
233
+ unless ws[r] then
234
+ self.ungetc(r)
235
+ break
236
+ end
237
+ end
238
+ nil
239
+ end
240
+ end #class BufferedInputStream
241
+
242
+ # Splitter is a namespace for classes that get entries from a buffered input stream.
243
+ module Splitter
244
+ # This is a template of splitter.
245
+ class Template
246
+ # Creates a new splitter.
247
+ def initialize(klass, bstream)
248
+ @stream = bstream
249
+ raise NotImplementedError
250
+ end
251
+
252
+ # skips leader of the entry.
253
+ def skip_leader
254
+ raise NotImplementedError
255
+ end
256
+
257
+ # Gets entry as a string
258
+ def get_entry
259
+ raise NotImplementedError
260
+ end
261
+
262
+ # the last entry read from the stream
263
+ attr_reader :entry
264
+
265
+ # start position of the entry
266
+ attr_reader :entry_start_pos
267
+
268
+ # (end position of the entry) + 1
269
+ attr_reader :entry_ended_pos
270
+ end
271
+
272
+ # Default splitter.
273
+ # It sees following constants in the given class.
274
+ # DELIMITER:: (String) delimiter indicates the end of a entry.
275
+ # FLATFILE_HEADER:: (String) start of a entry, located on head of a line.
276
+ # DELIMITER_OVERRUN:: (Integer) excess read size included in DELIMITER.
277
+ #
278
+ class Default < Template
279
+ # Creates a new splitter.
280
+ # klass:: database class
281
+ # bstream:: input stream. It must be a BufferedInputStream object.
282
+ def initialize(klass, bstream)
283
+ @stream = bstream
284
+ @delimiter = klass::DELIMITER rescue nil
285
+ @header = klass::FLATFILE_HEADER rescue nil
286
+ # for specific classes' benefit
287
+ unless header
288
+ if klass == Bio::GenBank or klass == Bio::GenPept
289
+ @header = 'LOCUS '
290
+ end
291
+ end
292
+ @delimiter_overrun = klass::DELIMITER_OVERRUN rescue nil
293
+ end
294
+
295
+ # (String) delimiter indicates the end of a entry.
296
+ attr_accessor :delimiter
297
+
298
+ # (String) start of a entry, located on head of a line.
299
+ attr_accessor :header
300
+
301
+ # (Integer) excess read data size included in delimiter.
302
+ attr_accessor :delimiter_overrun
303
+
304
+ # Skips leader of the entry.
305
+ #
306
+ # If @header is not nil, it reads till the contents of @header
307
+ # comes at the head of a line.
308
+ # If correct FLATFILE_HEADER is found, returns true.
309
+ # Otherwise, returns nil.
310
+ def skip_leader
311
+ if @header then
312
+ data = ''
313
+ while s = @stream.gets(@header)
314
+ data << s
315
+ if data.split(/[\r\n]+/)[-1] == @header then
316
+ @stream.ungets(@header)
317
+ return true
318
+ end
319
+ end
320
+ # @header was not found. For safety,
321
+ # pushes back data with removing white spaces in the head.
322
+ data.sub(/\A\s+/, '')
323
+ @stream.ungets(data)
324
+ return nil
325
+ else
326
+ @stream.skip_spaces
327
+ return nil
328
+ end
329
+ end
330
+
331
+ # gets a entry
332
+ def get_entry
333
+ p0 = @stream.pos
334
+ e = @stream.gets(@delimiter)
335
+ if e and @delimiter_overrun then
336
+ if e[-@delimiter.size, @delimiter.size ] == @delimiter then
337
+ overrun = e[-@delimiter_overrun, @delimiter_overrun]
338
+ e[-@delimiter_overrun, @delimiter_overrun] = ''
339
+ @stream.ungets(overrun)
340
+ end
341
+ end
342
+ p1 = @stream.pos
343
+ @entry_start_pos = p0
344
+ @entry = e
345
+ @entry_ended_pos = p1
346
+ @entry
347
+ end
348
+ end #class Default
349
+ end #module Splitter
350
+
351
+ #
352
+ # Bio::FlatFile.open(file, *arg)
353
+ # Bio::FlatFile.open(dbclass, file, *arg)
354
+ #
41
355
  # Creates a new Bio::FlatFile object to read a file or a stream
42
- # which contains +dbclass+ data.
356
+ # which contains _dbclass_ data.
43
357
  #
44
- # +dbclass+ should be a class (or module) or nil.
358
+ # _dbclass_ should be a class (or module) or nil.
45
359
  # e.g. Bio::GenBank, Bio::FastaFormat.
46
360
  #
47
- # If +file+ is a filename (which doesn't have gets method),
48
- # the method opens a local file named +file+
49
- # with 'File.open(filename, mode, perm)'.
361
+ # If _file_ is a filename (which doesn't have gets method),
362
+ # the method opens a local file named _file_
363
+ # with <code>File.open(filename, *arg)</code>.
50
364
  #
51
- # When nil is given to dbclass, trying to determine database class
52
- # (file format) automatically. If fails to determine, dbclass is
53
- # set to nil and FlatFile#next_entry works same as IO#gets when
54
- # raw = true. It is recommended to set dbclass using
55
- # FlatFile#dbclass= method if fails to determine automatically.
365
+ # When _dbclass_ is omitted or nil is given to _dbclass_,
366
+ # the method tries to determine database class
367
+ # (file format) automatically.
368
+ # When it fails to determine, dbclass is set to nil
369
+ # and FlatFile#next_entry would fail.
370
+ # You can still set dbclass using FlatFile#dbclass= method.
56
371
  #
57
372
  # * Example 1
58
373
  # Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
59
374
  # * Example 2
60
375
  # Bio::FlatFile.open(nil, "embl/est_hum17.dat")
61
376
  # * Example 3
377
+ # Bio::FlatFile.open("genbank/gbest40.seq")
378
+ #
379
+ # * Example 4
62
380
  # Bio::FlatFile.open(Bio::GenBank, $stdin)
63
381
  #
64
- # If it is called with block, the block will be executed with
65
- # a newly opened Bio::FlatFile instance object. If filename
66
- # is given, the file is automatically closed when leaving the block.
382
+ # If it is called with a block, the block will be executed with
383
+ # a new Bio::FlatFile object. If filename is given,
384
+ # the file is automatically closed when leaving the block.
67
385
  #
68
- # * Example 4
386
+ # * Example 5
69
387
  # Bio::FlatFile.open(nil, 'test4.fst') do |ff|
70
388
  # ff.each { |e| print e.definition, "\n" }
71
389
  # end
72
390
  #
73
- def self.open(dbclass, file, *arg)
74
- # 3rd and 4th arg: mode, perm (passed to File.open)
75
- openmode = []
76
- while x = arg[0] and !x.is_a?(Hash)
77
- openmode << arg.shift
78
- end
79
- # rest of arg: passed to FlatFile.new
80
- # create a flatfile object
391
+ # * Example 6
392
+ # Bio::FlatFile.open('test4.fst') do |ff|
393
+ # ff.each { |e| print e.definition, "\n" }
394
+ # end
395
+ #
396
+ # Compatibility Note:
397
+ # <em>*arg</em> is completely passed to the <code>File.open</code>
398
+ # and you cannot specify ":raw => true" or ":raw => false".
399
+ #
400
+ def self.open(*arg, &block)
401
+ # FlatFile.open(dbclass, file, mode, perm)
402
+ # FlatFile.open(file, mode, perm)
403
+ if arg.size <= 0
404
+ raise ArgumentError, 'wrong number of arguments (0 for 1)'
405
+ end
406
+ x = arg.shift
407
+ if x.is_a?(Module) then
408
+ # FlatFile.open(dbclass, filename_or_io, ...)
409
+ dbclass = x
410
+ elsif x.nil? then
411
+ # FlatFile.open(nil, filename_or_io, ...)
412
+ dbclass = nil
413
+ else
414
+ # FlatFile.open(filename, ...)
415
+ dbclass = nil
416
+ arg.unshift(x)
417
+ end
418
+ if arg.size <= 0
419
+ raise ArgumentError, 'wrong number of arguments (1 for 2)'
420
+ end
421
+ file = arg.shift
422
+ # check if file is filename or IO object
81
423
  unless file.respond_to?(:gets)
82
424
  # 'file' is a filename
83
- if block_given? then
84
- File.open(file, *openmode) do |fobj|
85
- ff = self.new(dbclass, fobj, *arg)
86
- yield ff
87
- end
88
- else
89
- fobj = File.open(file, *openmode)
90
- self.new(dbclass, fobj, *arg)
91
- end
425
+ self.open_file(file, *arg, &block)
92
426
  else
93
427
  # 'file' is a IO object
94
- ff = self.new(dbclass, file, *arg)
428
+ ff = self.new(dbclass, file)
95
429
  block_given? ? (yield ff) : ff
96
430
  end
97
431
  end
@@ -110,7 +444,8 @@ module Bio
110
444
  end
111
445
 
112
446
  # Same as FlatFile.auto(filename_or_stream, *arg).to_a
113
- # (It might be OBSOLETED in the future.)
447
+ #
448
+ # (This method might be OBSOLETED in the future.)
114
449
  def self.to_a(*arg)
115
450
  self.auto(*arg) do |ff|
116
451
  raise 'cannot determine file format' unless ff.dbclass
@@ -118,6 +453,46 @@ module Bio
118
453
  end
119
454
  end
120
455
 
456
+ # Same as FlatFile.auto(filename, *arg),
457
+ # except that it only accepts a filename and doesn't accept an IO object.
458
+ # File format is automatically determined.
459
+ #
460
+ # It can accept a block.
461
+ # If a block is given, it returns the block's return value.
462
+ # Otherwise, it returns a new FlatFile object.
463
+ #
464
+ def self.open_file(filename, *arg)
465
+ if block_given? then
466
+ BufferedInputStream.open_file(filename, *arg) do |stream|
467
+ yield self.new(nil, stream)
468
+ end
469
+ else
470
+ stream = BufferedInputStream.open_file(filename, *arg)
471
+ self.new(nil, stream)
472
+ end
473
+ end
474
+
475
+ # Opens URI specified as _uri_.
476
+ # _uri_ must be a String or URI object.
477
+ # *arg is passed to OpenURI.open_uri or URI#open.
478
+ #
479
+ # Like FlatFile#open, it can accept a block.
480
+ #
481
+ # Note that you MUST explicitly require 'open-uri'.
482
+ # Because open-uri.rb modifies existing class,
483
+ # it isn't required by default.
484
+ #
485
+ def self.open_uri(uri, *arg)
486
+ if block_given? then
487
+ BufferedInputStream.open_uri(uri, *arg) do |stream|
488
+ yield self.new(nil, stream)
489
+ end
490
+ else
491
+ stream = BufferedInputStream.open_uri(uri, *arg)
492
+ self.new(nil, stream)
493
+ end
494
+ end
495
+
121
496
  # Same as FlatFile.open, except that 'stream' should be a opened
122
497
  # stream object (IO, File, ..., who have the 'gets' method).
123
498
  #
@@ -126,62 +501,101 @@ module Bio
126
501
  # * Example 2
127
502
  # Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
128
503
  #
129
- # +options+ should be a hash (or nil). It will be OBSOLETED!!
130
- # Available options are below:
131
- # [<tt>:raw</tt>] if true, "raw mode" (same as #raw=true).
132
- # default: false (not "raw mode").
504
+ # Compatibility Note:
505
+ # Now, you cannot specify ":raw => true" or ":raw => false".
506
+ # Below styles are DEPRECATED.
133
507
  #
134
- # * Example 3
135
- # Bio::FlatFile.new(nil, $stdin, :raw=>true)
508
+ # * Example 3 (deprecated)
509
+ # # Bio::FlatFile.new(nil, $stdin, :raw=>true) # => ERROR
510
+ # # Please rewrite as below.
511
+ # ff = Bio::FlatFile.new(nil, $stdin)
512
+ # ff.raw = true
136
513
  # * Example 3 in old style (deprecated)
137
- # Bio::FlatFile.new(nil, $stdin, true)
514
+ # # Bio::FlatFile.new(nil, $stdin, true) # => ERROR
515
+ # # Please rewrite as below.
516
+ # ff = Bio::FlatFile.new(nil, $stdin)
517
+ # ff.raw = true
138
518
  #
139
- def initialize(dbclass, stream, options = nil)
519
+ def initialize(dbclass, stream)
140
520
  # 2nd arg: IO object
141
- @io = stream
142
- # 3rd arg: options (nil or a Hash)
143
- self.raw = false
144
- if options.is_a?(Hash) then
145
- self.raw = options[:raw] if options.has_key?(:raw)
521
+ if @stream.kind_of?(BufferedInputStream)
522
+ @stream = stream
146
523
  else
147
- self.raw = options
524
+ @stream = BufferedInputStream.for_io(stream)
148
525
  end
149
- # initialize prefetch buffer
150
- @prefetch = ''
526
+ # default is not raw mode
527
+ self.raw = false
151
528
  # 1st arg: database class (or file format autodetection)
152
529
  if dbclass then
153
- self.dbclass = dbclass
530
+ self.dbclass = dbclass
154
531
  else
155
- autodetect
532
+ autodetect
156
533
  end
534
+ #
535
+ @skip_leader_mode = :firsttime
536
+ @firsttime_flag = true
537
+ end
538
+
539
+ # The mode how to skip leader of the data.
540
+ # :firsttime :: (DEFAULT) only head of file (= first time to read)
541
+ # :everytime :: everytime to read entry
542
+ # nil :: never skip
543
+ attr_accessor :skip_leader_mode
544
+
545
+ # (DEPRECATED) IO object in the flatfile object.
546
+ #
547
+ # Compatibility Note: Bio::FlatFile#io is deprecated.
548
+ # Please use Bio::FlatFile#to_io instead.
549
+ def io
550
+ warn "Bio::FlatFile#io is deprecated."
551
+ @stream.to_io
157
552
  end
158
553
 
159
554
  # IO object in the flatfile object.
160
- attr_reader :io
555
+ #
556
+ # Compatibility Note: Bio::FlatFile#io is deprecated.
557
+ def to_io
558
+ @stream.to_io
559
+ end
560
+
561
+ # Pathname, filename or URI (or nil).
562
+ def path
563
+ @stream.path
564
+ end
161
565
 
162
566
  # Get next entry.
163
567
  def next_entry
164
- @entry_raw = gets(@rs)
165
- return nil unless @entry_raw
568
+ if @skip_leader_mode and
569
+ ((@firsttime_flag and @skip_leader_mode == :firsttime) or
570
+ @skip_leader_mode == :everytime)
571
+ @splitter.skip_leader
572
+ end
573
+ r = @splitter.get_entry
574
+ @firsttime_flag = false
575
+ return nil unless r
166
576
  if raw then
167
- @entry_raw
577
+ r
168
578
  else
169
- e = @dbclass.new(@entry_raw)
170
- begin
171
- s = e.entry_overrun
172
- rescue NameError
173
- s = nil
174
- end
175
- if s then
176
- @entry_raw[-(s.length), s.length] = ''
177
- ungets(s)
178
- end
179
- e
579
+ @entry = @dbclass.new(r)
580
+ @entry
180
581
  end
181
582
  end
583
+ attr_reader :entry
182
584
 
183
585
  # Returns the last raw entry as a string.
184
- attr_reader :entry_raw
586
+ def entry_raw
587
+ @splitter.entry
588
+ end
589
+
590
+ # start position of the last entry
591
+ def entry_start_pos
592
+ @splitter.entry_start_pos
593
+ end
594
+
595
+ # (end position of the last entry) + 1
596
+ def entry_ended_pos
597
+ @splitter.entry_ended_pos
598
+ end
185
599
 
186
600
  # Iterates over each entry in the flatfile.
187
601
  #
@@ -193,23 +607,23 @@ module Bio
193
607
  # end
194
608
  def each_entry
195
609
  while e = self.next_entry
196
- yield e
610
+ yield e
197
611
  end
198
612
  end
199
- alias each each_entry
613
+ alias :each :each_entry
200
614
 
201
615
  # Resets file pointer to the start of the flatfile.
202
616
  # (similar to IO#rewind)
203
617
  def rewind
204
- r = @io.rewind
205
- @prefetch = ''
618
+ r = @stream.rewind
619
+ @firsttime_flag = true
206
620
  r
207
621
  end
208
622
 
209
623
  # Closes input stream.
210
624
  # (similar to IO#close)
211
625
  def close
212
- @io.close
626
+ @stream.close
213
627
  end
214
628
 
215
629
  # Returns current position of input stream.
@@ -217,9 +631,9 @@ module Bio
217
631
  # the result is not guaranteed.
218
632
  # It is similar to IO#pos.
219
633
  # Note that it will not be equal to io.pos,
220
- # because FlatFile#autodetect may pre-read some lines.
634
+ # because FlatFile has its own internal buffer.
221
635
  def pos
222
- @io.pos - @prefetch.size
636
+ @stream.pos
223
637
  end
224
638
 
225
639
  # (Not recommended to use it.)
@@ -228,86 +642,17 @@ module Bio
228
642
  # the result is not guaranteed.
229
643
  # It is similar to IO#pos=.
230
644
  # Note that it will not be equal to io.pos=,
231
- # because FlatFile#autodetect may pre-read some lines.
645
+ # because FlatFile has its own internal buffer.
232
646
  def pos=(p)
233
- r = (@io.pos = p)
234
- @prefetch = ''
235
- r
647
+ @stream.pos=(p)
236
648
  end
237
649
 
238
650
  # Returns true if input stream is end-of-file.
239
651
  # Otherwise, returns false.
240
652
  # (Similar to IO#eof?, but may not be equal to io.eof?,
241
- # because FlatFile#autodetect may pre-read some lines.)
653
+ # because FlatFile has its own internal buffer.)
242
654
  def eof?
243
- if @prefetch.size > 0
244
- false
245
- else
246
- @io.eof?
247
- end
248
- end
249
-
250
- # Similar to IO#gets.
251
- # Internal use only. Users should not call it directly.
252
- def gets(io_rs = $/)
253
- if @prefetch.size > 0
254
- if io_rs == nil then
255
- r = @prefetch + @io.gets(nil).to_s
256
- @prefetch = ''
257
- else
258
- if io_rs == '' then
259
- sp_rs = /\n\n/n
260
- sp_rs_orig = "\n\n"
261
- else
262
- sp_rs = Regexp.new(Regexp.escape(io_rs, 'n'), 0, 'n')
263
- sp_rs_orig = io_rs
264
- end
265
- a = @prefetch.split(sp_rs, 2)
266
- if a.size > 1 then
267
- r = a[0] + sp_rs_orig
268
- @prefetch = a[1]
269
- else
270
- @prefetch << @io.gets(io_rs).to_s
271
- a = @prefetch.split(sp_rs, 2)
272
- if a.size > 1 then
273
- r = a[0] + sp_rs_orig
274
- @prefetch = a[1].to_s
275
- else
276
- r = @prefetch
277
- @prefetch = ''
278
- end
279
- end
280
- end
281
- r
282
- else
283
- @io.gets(io_rs)
284
- end
285
- end
286
-
287
- # Unread read data.
288
- # Internal use only. Users must not call it.
289
- def ungets(str)
290
- @prefetch = str + @prefetch
291
- nil
292
- end
293
-
294
- # Similar to IO#getc.
295
- # Internal use only. Users should not call it directly.
296
- def getc
297
- if @prefetch.size > 0 then
298
- r = @prefetch[0]
299
- @prefetch = @prefetch[1..-1]
300
- else
301
- r = @io.getc
302
- end
303
- r
304
- end
305
-
306
- # Similar to IO#ungetc.
307
- # Internal use only. Users should not call it.
308
- def ungetc(c)
309
- @prefetch = sprintf("%c", c) + @prefetch
310
- nil
655
+ @stream.eof?
311
656
  end
312
657
 
313
658
  # If true is given, the next_entry method returns
@@ -319,14 +664,24 @@ module Bio
319
664
  # If true, raw mode.
320
665
  attr_reader :raw
321
666
 
667
+ # Similar to IO#gets.
668
+ # Internal use only. Users should not call it directly.
669
+ def gets(*arg)
670
+ @stream.gets(*arg)
671
+ end
672
+
322
673
  # Sets database class. Please use only if autodetect fails.
323
- def dbclass=(k)
324
- if k then
325
- @dbclass = k
326
- @rs = @dbclass::DELIMITER
674
+ def dbclass=(klass)
675
+ if klass then
676
+ @dbclass = klass
677
+ begin
678
+ @splitter = @dbclass.flatfile_splitter(@dbclass, @stream)
679
+ rescue NameError, NoMethodError
680
+ @splitter = Splitter::Default.new(klass, @stream)
681
+ end
327
682
  else
328
- @dbclass = nil
329
- @rs = $/
683
+ @dbclass = nil
684
+ @splitter = nil
330
685
  end
331
686
  end
332
687
 
@@ -340,157 +695,518 @@ module Bio
340
695
  #
341
696
  # The method can be called anytime if you want (but not recommended).
342
697
  # This might be useful if input file is a mixture of multiple format data.
343
- def autodetect(lines = 31)
344
- r = nil
345
- 1.upto(lines) do |x|
346
- if line = @io.gets then
347
- @prefetch << line
348
- if line and line.strip.size > 0 then
349
- r = self.class.autodetect(@prefetch)
350
- if r then
351
- self.dbclass = r
352
- return r
353
- end
354
- end
355
- end
698
+ def autodetect(lines = 31, ad = AutoDetect.default)
699
+ if r = ad.autodetect_flatfile(self, lines)
700
+ self.dbclass = r
701
+ else
702
+ self.dbclass = nil unless self.dbclass
356
703
  end
357
- self.dbclass = nil unless dbclass
358
704
  r
359
705
  end
360
706
 
361
707
  # Detects database class (== file format) of given file.
362
708
  # If fails to determine, returns nil.
363
709
  def self.autodetect_file(filename)
364
- ff = self.open(nil, filename)
365
- r = ff.dbclass
366
- ff.close
367
- r
710
+ self.open_file(filename).dbclass
368
711
  end
369
712
 
370
713
  # Detects database class (== file format) of given input stream.
371
714
  # If fails to determine, returns nil.
372
715
  # Caution: the method reads some data from the input stream,
373
716
  # and the data will be lost.
717
+ def self.autodetect_io(io)
718
+ self.new(nil, io).dbclass
719
+ end
720
+
721
+ # This is OBSOLETE. Please use autodetect_io(io) instead.
374
722
  def self.autodetect_stream(io)
375
- ff = self.new(nil, io)
376
- r = ff.dbclass
377
- r
723
+ $stderr.print "Bio::FlatFile.autodetect_stream will be deprecated." if $VERBOSE
724
+ self.autodetect_io(io)
378
725
  end
379
726
 
380
727
  # Detects database class (== file format) of given string.
381
728
  # If fails to determine, returns false or nil.
382
729
  def self.autodetect(text)
383
- require 'bio'
384
- case text
385
- when /^LOCUS .+ bp .*[a-z]*[DR]?NA/
386
- Bio::GenBank
387
- when /^LOCUS .+ aa .+/
388
- Bio::GenPept
389
- when /^UI \- [0-9]+$/
390
- Bio::MEDLINE
391
-
392
- when /^ID .+\; .*(DNA|RNA|XXX)\;/
393
- Bio::EMBL
394
- when /^ID .+\; *PRT\;/
395
- Bio::SPTR
396
- when /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/
397
- Bio::PROSITE
398
- when /^AC [-A-Za-z0-9_\.]+$/
399
- Bio::TRANSFAC
400
-
401
- when /^H [-A-Z0-9_\.]+$/
402
- if text =~ /^M [rc]/ then
403
- Bio::AAindex2
404
- elsif text =~ /^I A\/L/ then
405
- Bio::AAindex1
406
- else
407
- false #fail to determine
730
+ AutoDetect.default.autodetect(text)
731
+ end
732
+
733
+
734
+ # AutoDetect automatically determines database class of given data.
735
+ class AutoDetect
736
+
737
+ include TSort
738
+
739
+ # Template of a single rule of autodetection
740
+ class RuleTemplate
741
+ # Creates a new element.
742
+ def self.[](*arg)
743
+ self.new(*arg)
744
+ end
745
+
746
+ # Creates a new element.
747
+ def initialize
748
+ a = Array.new
749
+ def a.inspect
750
+ "[#{self.collect { |e| e.name.inspect }.join(' ')}]"
751
+ end
752
+ @higher_priority_elements = a.clone
753
+ @lower_priority_elements = a.clone
754
+ @name = nil
408
755
  end
409
756
 
410
- when /^CODE [0-9]+$/
411
- Bio::LITDB
412
- when /^Entry [A-Z0-9]+/
413
- Bio::KEGG::BRITE
414
-
415
- when /^ENTRY .+ KO\s*$/
416
- Bio::KEGG::KO
417
- when /^ENTRY .+ Glycan\s*$/
418
- Bio::KEGG::GLYCAN
419
- when /^ENTRY .+ (CDS|gene|.*RNA) /
420
- Bio::KEGG::GENES
421
- when /^ENTRY EC [0-9\.]+$/
422
- Bio::KEGG::ENZYME
423
- when /^ENTRY C[A-Za-z0-9\._]+$/
424
- Bio::KEGG::COMPOUND
425
- when /^ENTRY R[A-Za-z0-9\._]+$/
426
- Bio::KEGG::REACTION
427
- when /^ENTRY [a-z]+$/
428
- Bio::KEGG::GENOME
429
-
430
- when /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/
431
- if $1 == 'clusters'
432
- Bio::FANTOM::MaXML::Cluster
433
- elsif $1 == 'sequences'
434
- Bio::FANTOM::MaXML::Sequence
435
- else
436
- nil #unknown
757
+ # self is prior to the _elem_.
758
+ def is_prior_to(elem)
759
+ return nil if self == elem
760
+ elem.higher_priority_elements << self
761
+ self.lower_priority_elements << elem
762
+ true
437
763
  end
438
764
 
439
- when /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/
440
- Bio::PDB
765
+ # higher priority elements
766
+ attr_reader :higher_priority_elements
767
+ # lower priority elements
768
+ attr_reader :lower_priority_elements
769
+
770
+ # database classes
771
+ attr_reader :dbclasses
441
772
 
442
- when /^CLUSTAL .*\(.*\).*sequence +alignment/
443
- Bio::ClustalW::Report
773
+ # unique name of the element
774
+ attr_accessor :name
775
+
776
+ # If given text (and/or meta information) is known, returns
777
+ # the database class.
778
+ # Otherwise, returns nil or false.
779
+ #
780
+ # _text_ will be a String.
781
+ # _meta_ will be a Hash.
782
+ # _meta_ may contain following keys.
783
+ # :path => pathname, filename or uri.
784
+ def guess(text, meta)
785
+ nil
786
+ end
787
+ end #class RuleTemplate
788
+
789
+ # RuleDebug is a class for debugging autodetect classes/methods
790
+ class RuleDebug < RuleTemplate
791
+ # Creates a new instance.
792
+ def initialize(name)
793
+ super()
794
+ @name = name
795
+ end
444
796
 
445
- when /\<\!DOCTYPE BlastOutput PUBLIC /
446
- Bio::Blast::Report
797
+ # prints information to the $stderr.
798
+ def guess(text, meta)
799
+ $stderr.puts @name
800
+ $stderr.puts text.inspect
801
+ $stderr.puts meta.inspect
802
+ nil
803
+ end
804
+ end #class RuleDebug
447
805
 
448
- when /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
449
- Bio::Blast::WU::Report
450
- when /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
451
- Bio::Blast::WU::Report_TBlast
806
+ # Special element that is always top or bottom priority.
807
+ class RuleSpecial < RuleTemplate
808
+ def initialize(name)
809
+ #super()
810
+ @name = name
811
+ end
812
+ # modification of @name is inhibited.
813
+ def name=(x)
814
+ raise 'cannot modify name'
815
+ end
452
816
 
453
- when /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
454
- Bio::Blast::Default::Report
455
- when /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
456
- Bio::Blast::Default::Report_TBlast
817
+ # always returns an empty array
818
+ def higher_priority_elements
819
+ []
820
+ end
821
+ # always returns an empty array
822
+ def lower_priority_elements
823
+ []
824
+ end
825
+ end #class RuleSpecial
457
826
 
458
- when /^psLayout version \d+\s*$/
459
- Bio::Blat::Report
460
- when /^\-\-SPIDEY version .+\-\-$/
461
- Bio::Spidey::Report
827
+ # Special element that is always top priority.
828
+ TopRule = RuleSpecial.new('top')
829
+ # Special element that is always bottom priority.
830
+ BottomRule = RuleSpecial.new('bottom')
462
831
 
463
- when /^HMMER +\d+\./
464
- Bio::HMMER::Report
832
+ # An autodetection rule that uses a regular expression
833
+ class RuleRegexp < RuleTemplate
834
+ # Creates a new instance.
835
+ def initialize(dbclass, re)
836
+ super()
837
+ @re = re
838
+ @dbclass = dbclass
839
+ @dbclasses = [ dbclass ]
840
+ @name = dbclass.to_s
841
+ end
465
842
 
466
- when /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/
467
- Bio::Sim4::Report
843
+ # If given text matches the regexp, returns the database class.
844
+ # Otherwise, returns nil or false.
845
+ # _meta_ is ignored.
846
+ def guess(text, meta)
847
+ @re =~ text ? @dbclass : nil
848
+ end
849
+ end #class RuleRegexp
468
850
 
469
- when /^>.+$/
470
- if text =~ /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ then
471
- Bio::NBRF
472
- elsif text =~ /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ then
473
- Bio::FastaFormat
474
- elsif text =~ /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ then
475
- Bio::FastaNumericFormat
851
+ # An autodetection rule that uses one or more regular expressions.
852
+ class RuleRegexp2 < RuleTemplate
853
+ # Creates a new instance.
854
+ def initialize(dbclass, *regexps)
855
+ super()
856
+ @regexps = regexps
857
+ @dbclass = dbclass
858
+ @dbclasses = [ dbclass ]
859
+ if name
860
+ @name = name
861
+ else
862
+ @name = @dbclass.to_s
863
+ end
864
+ end
865
+
866
+ # If given text matches any of the regexps, returns the database class.
867
+ # Otherwise, returns nil or false.
868
+ # _meta_ is ignored.
869
+ def guess(text, meta)
870
+ @regexps.each do |re|
871
+ return @dbclass if re =~ text
872
+ end
873
+ nil
874
+ end
875
+ end #class RuleRegexp2
876
+
877
+ # An autodetection rule that passes data to the proc object.
878
+ class RuleProc < RuleTemplate
879
+ # Creates a new instance.
880
+ def initialize(*dbclasses, &proc)
881
+ super()
882
+ @proc = proc
883
+ @dbclasses = dbclasses
884
+ @name = dbclasses.collect { |x| x.to_s }.join('|')
885
+ end
886
+
887
+ # If given text (and/or meta information) is known, returns
888
+ # the database class.
889
+ # Otherwise, returns nil or false.
890
+ #
891
+ # Refer to RuleTemplate#guess for _meta_ (it is not passed to the proc).
892
+ def guess(text, meta)
893
+ @proc.call(text)
894
+ end
895
+ end #class RuleProc
896
+
897
+ # Creates a new AutoDetect object
898
+ def initialize
899
+ # stores autodetection rules.
900
+ @rules = Hash.new
901
+ # stores elements (cache)
902
+ @elements = nil
903
+ self.add(TopRule)
904
+ self.add(BottomRule)
905
+ end
906
+
907
+ # Adds a new element.
908
+ # Returns _elem_.
909
+ def add(elem)
910
+ raise 'element name conflicts' if @rules[elem.name]
911
+ @elements = nil
912
+ @rules[elem.name] = elem
913
+ elem
914
+ end
915
+
916
+ # (required by TSort.)
917
+ # For all elements, yields each element.
918
+ def tsort_each_node(&x)
919
+ @rules.each_value(&x)
920
+ end
921
+
922
+ # (required by TSort.)
923
+ # For a given element, yields each child
924
+ # (= lower priority elements) of the element.
925
+ def tsort_each_child(elem)
926
+ if elem == TopRule then
927
+ @rules.each_value do |e|
928
+ yield e unless e == TopRule or
929
+ e.lower_priority_elements.index(TopRule)
930
+ end
931
+ elsif elem == BottomRule then
932
+ @rules.each_value do |e|
933
+ yield e if e.higher_priority_elements.index(BottomRule)
934
+ end
476
935
  else
477
- false #fail to determine
936
+ elem.lower_priority_elements.each do |e|
937
+ yield e if e != BottomRule
938
+ end
939
+ unless elem.higher_priority_elements.index(BottomRule)
940
+ yield BottomRule
941
+ end
478
942
  end
943
+ end
479
944
 
480
- else
481
- nil #not found
945
+ # Returns current elements as an array
946
+ # whose order fulfills all elements' priorities.
947
+ def elements
948
+ unless @elements
949
+ ary = tsort
950
+ ary.reverse!
951
+ @elements = ary
952
+ end
953
+ @elements
954
+ end
955
+
956
+ # rebuilds the object and clears internal cache.
957
+ def rehash
958
+ @rules.rehash
959
+ @elements = nil
960
+ end
961
+
962
+ # visualizes the object (mainly for debug)
963
+ def inspect
964
+ "<#{self.class.to_s} " +
965
+ self.elements.collect { |e| e.name.inspect }.join(' ') +
966
+ ">"
967
+ end
968
+
969
+ # Iterates over each element.
970
+ def each_rule(&x) #:yields: elem
971
+ elements.each(&x)
482
972
  end
483
- end
484
973
 
974
+ # Autodetect from the text.
975
+ # Returns a database class if succeeded.
976
+ # Returns nil if failed.
977
+ def autodetect(text, meta = {})
978
+ r = nil
979
+ elements.each do |e|
980
+ #$stderr.puts e.name
981
+ r = e.guess(text, meta)
982
+ break if r
983
+ end
984
+ r
985
+ end
986
+
987
+ # autodetect from the FlatFile object.
988
+ # Returns a database class if succeeded.
989
+ # Returns nil if failed.
990
+ def autodetect_flatfile(ff, lines = 31)
991
+ meta = {}
992
+ stream = ff.instance_eval { @stream }
993
+ begin
994
+ path = stream.path
995
+ rescue NameError
996
+ end
997
+ if path then
998
+ meta[:path] = path
999
+ # call autodetect once with meta and without any read action
1000
+ if r = self.autodetect(stream.prefetch_buffer, meta)
1001
+ return r
1002
+ end
1003
+ end
1004
+ # reading stream
1005
+ 1.upto(lines) do |x|
1006
+ break unless line = stream.prefetch_gets
1007
+ if line.strip.size > 0 then
1008
+ if r = self.autodetect(stream.prefetch_buffer, meta)
1009
+ return r
1010
+ end
1011
+ end
1012
+ end
1013
+ return nil
1014
+ end
1015
+
1016
+ # default autodetect object for class method
1017
+ @default = nil
1018
+
1019
+ # returns the default autodetect object
1020
+ def self.default
1021
+ unless @default then
1022
+ @default = self.make_default
1023
+ end
1024
+ @default
1025
+ end
1026
+
1027
+ # sets the default autodetect object.
1028
+ def self.default=(ad)
1029
+ @default = ad
1030
+ end
1031
+
1032
+ # make a new autodetect object
1033
+ def self.[](*arg)
1034
+ a = self.new
1035
+ arg.each { |e| a.add(e) }
1036
+ a
1037
+ end
1038
+
1039
+ # makes the initial default autodetect object
1040
+ def self.make_default
1041
+ a = self[
1042
+ genbank = RuleRegexp[ Bio::GenBank,
1043
+ /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
1044
+ genpept = RuleRegexp[ Bio::GenPept,
1045
+ /^LOCUS .+ aa .+/ ],
1046
+ medline = RuleRegexp[ Bio::MEDLINE,
1047
+ /^UI \- [0-9]+$/ ],
1048
+ embl = RuleRegexp[ Bio::EMBL,
1049
+ /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
1050
+ sptr = RuleRegexp[ Bio::SPTR,
1051
+ /^ID .+\; *PRT\;/ ],
1052
+ prosite = RuleRegexp[ Bio::PROSITE,
1053
+ /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
1054
+ transfac = RuleRegexp[ Bio::TRANSFAC,
1055
+ /^AC [-A-Za-z0-9_\.]+$/ ],
1056
+
1057
+ aaindex = RuleProc.new(Bio::AAindex1, Bio::AAindex2) do |text|
1058
+ if /^H [-A-Z0-9_\.]+$/ =~ text then
1059
+ if text =~ /^M [rc]/ then
1060
+ Bio::AAindex2
1061
+ elsif text =~ /^I A\/L/ then
1062
+ Bio::AAindex1
1063
+ else
1064
+ false #fail to determine
1065
+ end
1066
+ else
1067
+ nil
1068
+ end
1069
+ end,
1070
+
1071
+ litdb = RuleRegexp[ Bio::LITDB,
1072
+ /^CODE [0-9]+$/ ],
1073
+ brite = RuleRegexp[ Bio::KEGG::BRITE,
1074
+ /^Entry [A-Z0-9]+/ ],
1075
+ ko = RuleRegexp[ Bio::KEGG::KO,
1076
+ /^ENTRY .+ KO\s*/ ],
1077
+ glycan = RuleRegexp[ Bio::KEGG::GLYCAN,
1078
+ /^ENTRY .+ Glycan\s*/ ],
1079
+ enzyme = RuleRegexp2[ Bio::KEGG::ENZYME,
1080
+ /^ENTRY EC [0-9\.]+$/,
1081
+ /^ENTRY .+ Enzyme\s*/
1082
+ ],
1083
+ compound = RuleRegexp2[ Bio::KEGG::COMPOUND,
1084
+ /^ENTRY C[A-Za-z0-9\._]+$/,
1085
+ /^ENTRY .+ Compound\s*/
1086
+ ],
1087
+ reaction = RuleRegexp2[ Bio::KEGG::REACTION,
1088
+ /^ENTRY R[A-Za-z0-9\._]+$/,
1089
+ /^ENTRY .+ Reaction\s*/
1090
+ ],
1091
+ genes = RuleRegexp[ Bio::KEGG::GENES,
1092
+ /^ENTRY .+ (CDS|gene|.*RNA) / ],
1093
+ genome = RuleRegexp[ Bio::KEGG::GENOME,
1094
+ /^ENTRY [a-z]+$/ ],
1095
+
1096
+ fantom = RuleProc.new(Bio::FANTOM::MaXML::Cluster,
1097
+ Bio::FANTOM::MaXML::Sequence) do |text|
1098
+ if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
1099
+ case $1
1100
+ when 'clusters'
1101
+ Bio::FANTOM::MaXML::Cluster
1102
+ when 'sequences'
1103
+ Bio::FANTOM::MaXML::Sequence
1104
+ else
1105
+ nil #unknown
1106
+ end
1107
+ else
1108
+ nil
1109
+ end
1110
+ end,
1111
+
1112
+ pdb = RuleRegexp[ Bio::PDB,
1113
+ /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
1114
+ het = RuleRegexp[ Bio::PDB::ChemicalComponent,
1115
+ /^RESIDUE +.+ +\d+\s*$/ ],
1116
+
1117
+ clustal = RuleRegexp[ Bio::ClustalW::Report,
1118
+ /^CLUSTAL .*\(.*\).*sequence +alignment/ ],
1119
+
1120
+ blastxml = RuleRegexp[ Bio::Blast::Report,
1121
+ /\<\!DOCTYPE BlastOutput PUBLIC / ],
1122
+ wublast = RuleRegexp[ Bio::Blast::WU::Report,
1123
+ /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
1124
+ wutblast = RuleRegexp[ Bio::Blast::WU::Report_TBlast,
1125
+ /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
1126
+ blast = RuleRegexp[ Bio::Blast::Default::Report,
1127
+ /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
1128
+ tblast = RuleRegexp[ Bio::Blast::Default::Report_TBlast,
1129
+ /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
1130
+
1131
+ blat = RuleRegexp[ Bio::Blat::Report,
1132
+ /^psLayout version \d+\s*$/ ],
1133
+ spidey = RuleRegexp[ Bio::Spidey::Report,
1134
+ /^\-\-SPIDEY version .+\-\-$/ ],
1135
+ hmmer = RuleRegexp[ Bio::HMMER::Report,
1136
+ /^HMMER +\d+\./ ],
1137
+ sim4 = RuleRegexp[ Bio::Sim4::Report,
1138
+ /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
1139
+
1140
+ fastaformat = RuleProc.new(Bio::FastaFormat,
1141
+ Bio::NBRF,
1142
+ Bio::FastaNumericFormat) do |text|
1143
+ if /^>.+$/ =~ text
1144
+ case text
1145
+ when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
1146
+ Bio::NBRF
1147
+ when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
1148
+ Bio::FastaFormat
1149
+ when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
1150
+ Bio::FastaNumericFormat
1151
+ else
1152
+ false
1153
+ end
1154
+ else
1155
+ nil
1156
+ end
1157
+ end
1158
+ ]
1159
+
1160
+ # dependencies
1161
+ # NCBI
1162
+ genbank.is_prior_to genpept
1163
+ # EMBL/UniProt
1164
+ embl.is_prior_to sptr
1165
+ sptr.is_prior_to prosite
1166
+ prosite.is_prior_to transfac
1167
+ # KEGG
1168
+ #aaindex.is_prior_to litdb
1169
+ #litdb.is_prior_to brite
1170
+ brite.is_prior_to ko
1171
+ ko.is_prior_to glycan
1172
+ glycan.is_prior_to enzyme
1173
+ enzyme.is_prior_to compound
1174
+ compound.is_prior_to reaction
1175
+ reaction.is_prior_to genes
1176
+ genes.is_prior_to genome
1177
+ # PDB
1178
+ pdb.is_prior_to het
1179
+ # BLAST
1180
+ wublast.is_prior_to wutblast
1181
+ wutblast.is_prior_to blast
1182
+ blast.is_prior_to tblast
1183
+ # FastaFormat
1184
+ BottomRule.is_prior_to(fastaformat)
1185
+
1186
+ # for debug
1187
+ #debug_first = RuleDebug.new('debug_first')
1188
+ #a.add(debug_first)
1189
+ #debug_first.is_prior_to(TopRule)
1190
+
1191
+ ## for debug
1192
+ #debug_last = RuleDebug.new('debug_last')
1193
+ #a.add(debug_last)
1194
+ #BottomRule.is_prior_to(debug_last)
1195
+ #fastaformat.is_prior_to(debug_last)
1196
+
1197
+ a.rehash
1198
+ return a
1199
+ end
1200
+
1201
+ end #class AutoDetect
1202
+
485
1203
  end #class FlatFile
486
1204
 
487
1205
  end #module Bio
488
1206
 
489
-
490
1207
  if __FILE__ == $0
491
1208
  if ARGV.size == 2
492
1209
  require 'bio'
493
1210
  p Bio::FlatFile.open(eval(ARGV.shift), ARGV.shift).next_entry
494
1211
  end
495
1212
  end
496
-