bio 0.7.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. data/bin/bioruby +71 -27
  2. data/bin/br_biofetch.rb +5 -17
  3. data/bin/br_bioflat.rb +14 -26
  4. data/bin/br_biogetseq.rb +6 -18
  5. data/bin/br_pmfetch.rb +6 -16
  6. data/doc/Changes-0.7.rd +35 -0
  7. data/doc/KEGG_API.rd +287 -172
  8. data/doc/KEGG_API.rd.ja +273 -160
  9. data/doc/Tutorial.rd +18 -9
  10. data/doc/Tutorial.rd.ja +656 -138
  11. data/lib/bio.rb +6 -24
  12. data/lib/bio/alignment.rb +5 -5
  13. data/lib/bio/appl/blast.rb +132 -98
  14. data/lib/bio/appl/blast/format0.rb +9 -19
  15. data/lib/bio/appl/blast/wublast.rb +5 -18
  16. data/lib/bio/appl/emboss.rb +40 -47
  17. data/lib/bio/appl/hmmer.rb +116 -82
  18. data/lib/bio/appl/hmmer/report.rb +509 -364
  19. data/lib/bio/appl/spidey/report.rb +7 -18
  20. data/lib/bio/data/na.rb +3 -21
  21. data/lib/bio/db.rb +3 -21
  22. data/lib/bio/db/aaindex.rb +147 -52
  23. data/lib/bio/db/embl/common.rb +27 -6
  24. data/lib/bio/db/embl/embl.rb +18 -10
  25. data/lib/bio/db/embl/sptr.rb +87 -67
  26. data/lib/bio/db/embl/swissprot.rb +32 -3
  27. data/lib/bio/db/embl/trembl.rb +32 -3
  28. data/lib/bio/db/embl/uniprot.rb +32 -3
  29. data/lib/bio/db/fasta.rb +327 -289
  30. data/lib/bio/db/medline.rb +25 -4
  31. data/lib/bio/db/nbrf.rb +12 -20
  32. data/lib/bio/db/pdb.rb +4 -1
  33. data/lib/bio/db/pdb/chemicalcomponent.rb +240 -0
  34. data/lib/bio/db/pdb/pdb.rb +13 -8
  35. data/lib/bio/db/rebase.rb +93 -97
  36. data/lib/bio/feature.rb +2 -31
  37. data/lib/bio/io/ddbjxml.rb +167 -139
  38. data/lib/bio/io/fastacmd.rb +89 -56
  39. data/lib/bio/io/flatfile.rb +994 -278
  40. data/lib/bio/io/flatfile/index.rb +257 -194
  41. data/lib/bio/io/flatfile/indexer.rb +37 -29
  42. data/lib/bio/reference.rb +147 -64
  43. data/lib/bio/sequence.rb +57 -417
  44. data/lib/bio/sequence/aa.rb +64 -0
  45. data/lib/bio/sequence/common.rb +175 -0
  46. data/lib/bio/sequence/compat.rb +68 -0
  47. data/lib/bio/sequence/format.rb +134 -0
  48. data/lib/bio/sequence/generic.rb +24 -0
  49. data/lib/bio/sequence/na.rb +189 -0
  50. data/lib/bio/shell.rb +9 -23
  51. data/lib/bio/shell/core.rb +130 -125
  52. data/lib/bio/shell/demo.rb +143 -0
  53. data/lib/bio/shell/{session.rb → interface.rb} +42 -40
  54. data/lib/bio/shell/object.rb +52 -0
  55. data/lib/bio/shell/plugin/codon.rb +4 -22
  56. data/lib/bio/shell/plugin/emboss.rb +23 -0
  57. data/lib/bio/shell/plugin/entry.rb +34 -25
  58. data/lib/bio/shell/plugin/flatfile.rb +5 -23
  59. data/lib/bio/shell/plugin/keggapi.rb +11 -24
  60. data/lib/bio/shell/plugin/midi.rb +5 -23
  61. data/lib/bio/shell/plugin/obda.rb +4 -22
  62. data/lib/bio/shell/plugin/seq.rb +6 -24
  63. data/lib/bio/shell/rails/Rakefile +10 -0
  64. data/lib/bio/shell/rails/app/controllers/application.rb +4 -0
  65. data/lib/bio/shell/rails/app/controllers/shell_controller.rb +94 -0
  66. data/lib/bio/shell/rails/app/helpers/application_helper.rb +3 -0
  67. data/lib/bio/shell/rails/app/models/shell_connection.rb +30 -0
  68. data/lib/bio/shell/rails/app/views/layouts/shell.rhtml +37 -0
  69. data/lib/bio/shell/rails/app/views/shell/history.rhtml +5 -0
  70. data/lib/bio/shell/rails/app/views/shell/index.rhtml +2 -0
  71. data/lib/bio/shell/rails/app/views/shell/show.rhtml +13 -0
  72. data/lib/bio/shell/rails/config/boot.rb +19 -0
  73. data/lib/bio/shell/rails/config/database.yml +85 -0
  74. data/lib/bio/shell/rails/config/environment.rb +53 -0
  75. data/lib/bio/shell/rails/config/environments/development.rb +19 -0
  76. data/lib/bio/shell/rails/config/environments/production.rb +19 -0
  77. data/lib/bio/shell/rails/config/environments/test.rb +19 -0
  78. data/lib/bio/shell/rails/config/routes.rb +19 -0
  79. data/lib/bio/shell/rails/doc/README_FOR_APP +2 -0
  80. data/lib/bio/shell/rails/public/404.html +8 -0
  81. data/lib/bio/shell/rails/public/500.html +8 -0
  82. data/lib/bio/shell/rails/public/dispatch.cgi +10 -0
  83. data/lib/bio/shell/rails/public/dispatch.fcgi +24 -0
  84. data/lib/bio/shell/rails/public/dispatch.rb +10 -0
  85. data/lib/bio/shell/rails/public/favicon.ico +0 -0
  86. data/lib/bio/shell/rails/public/images/icon.png +0 -0
  87. data/lib/bio/shell/rails/public/images/rails.png +0 -0
  88. data/lib/bio/shell/rails/public/index.html +277 -0
  89. data/lib/bio/shell/rails/public/javascripts/controls.js +750 -0
  90. data/lib/bio/shell/rails/public/javascripts/dragdrop.js +584 -0
  91. data/lib/bio/shell/rails/public/javascripts/effects.js +854 -0
  92. data/lib/bio/shell/rails/public/javascripts/prototype.js +1785 -0
  93. data/lib/bio/shell/rails/public/robots.txt +1 -0
  94. data/lib/bio/shell/rails/public/stylesheets/main.css +187 -0
  95. data/lib/bio/shell/rails/script/about +3 -0
  96. data/lib/bio/shell/rails/script/breakpointer +3 -0
  97. data/lib/bio/shell/rails/script/console +3 -0
  98. data/lib/bio/shell/rails/script/destroy +3 -0
  99. data/lib/bio/shell/rails/script/generate +3 -0
  100. data/lib/bio/shell/rails/script/performance/benchmarker +3 -0
  101. data/lib/bio/shell/rails/script/performance/profiler +3 -0
  102. data/lib/bio/shell/rails/script/plugin +3 -0
  103. data/lib/bio/shell/rails/script/process/reaper +3 -0
  104. data/lib/bio/shell/rails/script/process/spawner +3 -0
  105. data/lib/bio/shell/rails/script/process/spinner +3 -0
  106. data/lib/bio/shell/rails/script/runner +3 -0
  107. data/lib/bio/shell/rails/script/server +42 -0
  108. data/lib/bio/shell/rails/test/test_helper.rb +28 -0
  109. data/lib/bio/shell/web.rb +90 -0
  110. data/lib/bio/util/contingency_table.rb +231 -225
  111. data/sample/any2fasta.rb +59 -0
  112. data/test/data/HMMER/hmmpfam.out +64 -0
  113. data/test/data/HMMER/hmmsearch.out +88 -0
  114. data/test/data/aaindex/DAYM780301 +30 -0
  115. data/test/data/aaindex/PRAM900102 +20 -0
  116. data/test/data/bl2seq/cd8a_cd8b_blastp.bl2seq +53 -0
  117. data/test/data/bl2seq/cd8a_p53_e-5blastp.bl2seq +37 -0
  118. data/test/data/blast/{eco:b0002.faa → b0002.faa} +0 -0
  119. data/test/data/blast/{eco:b0002.faa.m0 → b0002.faa.m0} +2 -2
  120. data/test/data/blast/{eco:b0002.faa.m7 → b0002.faa.m7} +1 -1
  121. data/test/data/blast/{eco:b0002.faa.m8 → b0002.faa.m8} +0 -0
  122. data/test/unit/bio/appl/bl2seq/test_report.rb +134 -0
  123. data/test/unit/bio/appl/blast/test_report.rb +15 -12
  124. data/test/unit/bio/appl/blast/test_xmlparser.rb +4 -4
  125. data/test/unit/bio/appl/hmmer/test_report.rb +355 -0
  126. data/test/unit/bio/appl/test_blast.rb +5 -5
  127. data/test/unit/bio/data/test_na.rb +9 -18
  128. data/test/unit/bio/db/pdb/test_pdb.rb +169 -0
  129. data/test/unit/bio/db/test_aaindex.rb +197 -0
  130. data/test/unit/bio/io/test_fastacmd.rb +55 -0
  131. data/test/unit/bio/sequence/test_aa.rb +102 -0
  132. data/test/unit/bio/sequence/test_common.rb +178 -0
  133. data/test/unit/bio/sequence/test_compat.rb +82 -0
  134. data/test/unit/bio/sequence/test_na.rb +242 -0
  135. data/test/unit/bio/shell/plugin/test_seq.rb +29 -19
  136. data/test/unit/bio/test_alignment.rb +15 -7
  137. data/test/unit/bio/test_reference.rb +198 -0
  138. data/test/unit/bio/test_sequence.rb +4 -49
  139. data/test/unit/bio/test_shell.rb +2 -2
  140. metadata +118 -15
  141. data/lib/bio/io/brdb.rb +0 -103
  142. data/lib/bioruby.rb +0 -34
@@ -1,8 +1,47 @@
1
1
  #
2
- # bio/io/fastacmd.rb - NCBI fastacmd wrapper class
2
+ # = bio/io/fastacmd.rb - NCBI fastacmd wrapper class
3
3
  #
4
- # Copyright (C) 2005 Shuji SHIGENOBU <shige@nibb.ac.jp>
5
- # Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2005, 2006
5
+ # Shuji SHIGENOBU <shige@nibb.ac.jp>,
6
+ # Toshiaki Katayama <k@bioruby.org>,
7
+ # Mitsuteru C. Nakao <n@bioruby.org>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: fastacmd.rb,v 1.10 2006/01/28 08:12:21 nakao Exp $
11
+ #
12
+ # == Description
13
+ #
14
+ # Retrieves FASTA formatted sequences from a blast database using
15
+ # NCBI fastacmd command.
16
+ #
17
+ # This class requires 'fastacmd' command and a blast database
18
+ # (formatted using the '-o' option of 'formatdb').
19
+ #
20
+ # == Examples
21
+ #
22
+ # database = ARGV.shift || "/db/myblastdb"
23
+ # entry_id = ARGV.shift || "sp:128U_DROME"
24
+ # ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
25
+ #
26
+ # fastacmd = Bio::Blast::Fastacmd.new(database)
27
+ #
28
+ # entry = fastacmd.get_by_id(entry_id)
29
+ # fastacmd.fetch(entry_id)
30
+ # fastacmd.fetch(ent_list)
31
+ #
32
+ # fastacmd.fetch(ent_list).each do |fasta|
33
+ # puts fasta
34
+ # end
35
+ #
36
+ # == References
37
+ #
38
+ # * NCBI tool
39
+ # ftp://ftp.ncbi.nih.gov/blast/executables/LATEST/ncbi.tar.gz
40
+ #
41
+ # * fastacmd.html
42
+ # http://biowulf.nih.gov/apps/blast/doc/fastacmd.html
43
+ #
44
+ #--
6
45
  #
7
46
  # This library is free software; you can redistribute it and/or
8
47
  # modify it under the terms of the GNU Lesser General Public
@@ -18,7 +57,7 @@
18
57
  # License along with this library; if not, write to the Free Software
19
58
  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
59
  #
21
- # $Id: fastacmd.rb,v 1.8 2005/09/26 13:00:08 k Exp $
60
+ #++
22
61
  #
23
62
 
24
63
  require 'bio/db/fasta'
@@ -28,23 +67,52 @@ require 'bio/command'
28
67
  module Bio
29
68
  class Blast
30
69
 
70
+ # NCBI fastacmd wrapper class
71
+ #
31
72
  class Fastacmd
32
73
 
33
74
  include Enumerable
34
75
  include Bio::Command::Tools
35
76
 
36
- def initialize(db)
37
- @database = db
77
+ # Database file path.
78
+ attr_accessor :database
79
+
80
+ # fastacmd command file path.
81
+ attr_accessor :fastacmd
82
+
83
+ #
84
+ attr_accessor :errorlog
85
+
86
+ # Initialize a fastacmd object.
87
+ #
88
+ # fastacmd = Bio::Blast::Fastacmd.new("/db/myblastdb")
89
+ def initialize(blast_database_file_path)
90
+ @database = blast_database_file_path
38
91
  @fastacmd = 'fastacmd'
39
92
  end
40
- attr_accessor :database, :fastacmd, :errorlog
41
93
 
42
- # get an entry_id and returns a Bio::FastaFormat object
94
+
95
+ # get an entry_id and returns a Bio::FastaFormat object.
96
+ #
97
+ # entry_id = "sp:128U_DROME"
98
+ # entry = fastacmd.get_by_id(entry_id)
43
99
  def get_by_id(entry_id)
44
100
  fetch(entry_id).shift
45
101
  end
46
102
 
47
- # get one or more entry_id and returns an Array of Bio::FastaFormat objects
103
+ # get one or more entry_id and returns an Array of Bio::FastaFormat objects.
104
+ #
105
+ # Fastacmd#fetch(entry_id) returns an Array of a Bio::FastaFormat
106
+ # object even when the result is a single entry.
107
+ #
108
+ # p fastacmd.fetch(entry_id)
109
+ #
110
+ # Fastacmd#fetch method also accepts a list of entry_id and returns
111
+ # an Array of Bio::FastaFormat objects.
112
+ #
113
+ # ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
114
+ # p fastacmd.fetch(ent_list)
115
+ #
48
116
  def fetch(list)
49
117
  if list.respond_to?(:join)
50
118
  entry_id = list.join(",")
@@ -59,13 +127,20 @@ class Fastacmd
59
127
  end
60
128
  end
61
129
 
130
+ # Iterates each entry.
131
+ #
132
+ # You can also iterate on all sequences in the database!
133
+ # fastacmd.each do |fasta|
134
+ # p [ fasta.definition[0..30], fasta.seq.size ]
135
+ # end
136
+ #
62
137
  def each_entry
63
138
  cmd = [ @fastacmd, '-d', @database, '-D', 'T' ]
64
139
  call_command_local(cmd) do |inn, out|
65
140
  inn.close_write
66
141
  Bio::FlatFile.open(Bio::FastaFormat, out) do |f|
67
- f.each_entry do |e|
68
- yield e
142
+ f.each_entry do |entry|
143
+ yield entry
69
144
  end
70
145
  end
71
146
  end
@@ -73,51 +148,9 @@ class Fastacmd
73
148
  end
74
149
  alias each each_entry
75
150
 
76
- end
151
+ end # class Fastacmd
77
152
 
78
- end
79
- end
80
-
81
-
82
- if __FILE__ == $0
83
-
84
- database = ARGV.shift || "/db/myblastdb"
85
- entry_id = ARGV.shift || "sp:128U_DROME"
86
- ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]
87
-
88
- fastacmd = Bio::Blast::Fastacmd.new(database)
89
-
90
- ### Retrieve one sequence
91
- entry = fastacmd.get_by_id(entry_id)
92
-
93
- # Fastacmd#get_by_id(entry_id) returns a Bio::FastaFormat object.
94
- p entry
95
-
96
- # Bio::FastaFormat becomes a fasta format string when printed by puts.
97
- puts entry
98
-
99
- # Fastacmd#fetch(entry_id) returns an Array of a Bio::FastaFormat
100
- # object even when the result is a single entry.
101
- p fastacmd.fetch(entry_id)
102
-
103
- ### Retrieve more sequences
104
-
105
- # Fastacmd#fetch method also accepts a list of entry_id and returns
106
- # an Array of Bio::FastaFormat objects.
107
- p fastacmd.fetch(ent_list)
108
-
109
- # So, you can iterate on the results.
110
- fastacmd.fetch(ent_list).each do |fasta|
111
- puts fasta
112
- end
113
-
114
-
115
- ### Iterates on all entries
116
-
117
- # You can also iterate on all sequences in the database!
118
- fastacmd.each do |fasta|
119
- p [ fasta.definition[0..30], fasta.seq.size ]
120
- end
153
+ end # class Blast
154
+ end # module Bio
121
155
 
122
- end
123
156
 
@@ -1,32 +1,19 @@
1
1
  #
2
2
  # = bio/io/flatfile.rb - flatfile access wrapper class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
- # License:: LGPL
4
+ # Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
6
5
  #
7
- #--
8
- # This library is free software; you can redistribute it and/or
9
- # modify it under the terms of the GNU Lesser General Public
10
- # License as published by the Free Software Foundation; either
11
- # version 2 of the License, or (at your option) any later version.
6
+ # License:: Ruby's
12
7
  #
13
- # This library is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
- # Lesser General Public License for more details.
8
+ # $Id: flatfile.rb,v 1.46 2006/02/22 10:01:27 ngoto Exp $
17
9
  #
18
- # You should have received a copy of the GNU Lesser General Public
19
- # License along with this library; if not, write to the Free Software
20
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
- #++
22
- #
23
- # $Id: flatfile.rb,v 1.41 2005/11/01 15:34:45 ngoto Exp $
24
10
  #
25
11
  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
26
12
  # It acts like a IO object.
27
13
  # It can automatically detect data format, and users do not need to tell
28
14
  # the class what the data is.
29
15
  #
16
+ require 'tsort'
30
17
 
31
18
  module Bio
32
19
 
@@ -38,60 +25,407 @@ module Bio
38
25
 
39
26
  include Enumerable
40
27
 
28
+ # Wrapper for a IO (or IO-like) object.
29
+ # It can input with a buffer.
30
+ class BufferedInputStream
31
+ # Creates a new input stream wrapper
32
+ def initialize(io, path)
33
+ @io = io
34
+ @path = path
35
+ # initialize prefetch buffer
36
+ @buffer = ''
37
+ @path = path
38
+ end
39
+
40
+ # Creates a new input stream wrapper from the given IO object.
41
+ def self.for_io(io)
42
+ begin
43
+ path = io.path
44
+ rescue NameError
45
+ path = nil
46
+ end
47
+ self.new(io, path)
48
+ end
49
+
50
+ # Creates a new input stream wrapper to open file _filename_
51
+ # by using File.open.
52
+ # *arg is passed to File.open.
53
+ #
54
+ # Like File.open, a block can be accepted.
55
+ def self.open_file(filename, *arg)
56
+ if block_given? then
57
+ File.open(filename, *arg) do |fobj|
58
+ yield self.new(fobj, filename)
59
+ end
60
+ else
61
+ fobj = File.open(filename, *arg)
62
+ self.new(fobj, filename)
63
+ end
64
+ end
65
+
66
+ # Creates a new input stream wrapper from URI specified as _uri_.
67
+ # by using OpenURI.open_uri or URI#open.
68
+ # _uri_ must be a String or URI object.
69
+ # *arg is passed to OpenURI.open_uri or URI#open.
70
+ #
71
+ # Like OpenURI.open_uri, it can accept a block.
72
+ def self.open_uri(uri, *arg)
73
+ if uri.kind_of?(URI)
74
+ if block_given?
75
+ uri.open(*arg) do |fobj|
76
+ yield self.new(fobj, uri.to_s)
77
+ end
78
+ else
79
+ fobj = uri.open(*arg)
80
+ self.new(fobj, uri.to_s)
81
+ end
82
+ else
83
+ if block_given?
84
+ OpenURI.open_uri(uri, *arg) do |fobj|
85
+ yield self.new(fobj, uri)
86
+ end
87
+ else
88
+ fobj = OpenURI.open_uri(uri, *arg)
89
+ self.new(fobj, uri)
90
+ end
91
+ end
92
+ end
93
+
94
+ # Pathname, filename or URI to open the object.
95
+ # Like File#path, returned value isn't normalized.
96
+ attr_reader :path
97
+
98
+ # Converts to IO object if possible
99
+ def to_io
100
+ @io.to_io
101
+ end
102
+
103
+ # Closes the IO object if possible
104
+ def close
105
+ @io.close
106
+ end
107
+
108
+ # Rewinds the IO object if possible
109
+ # Internal buffer in this wrapper is cleared.
110
+ def rewind
111
+ r = @io.rewind
112
+ @buffer = ''
113
+ r
114
+ end
115
+
116
+ # Returns current file position
117
+ def pos
118
+ @io.pos - @buffer.size
119
+ end
120
+
121
+ # Sets current file position if possible
122
+ # Internal buffer in this wrapper is cleared.
123
+ def pos=(p)
124
+ r = (@io.pos = p)
125
+ @buffer = ''
126
+ r
127
+ end
128
+
129
+ # Returns true if end-of-file. Otherwise, returns false.
130
+ #
131
+ # Note that it returns false if the internal buffer in this wrapper
132
+ # is not empty.
133
+ def eof?
134
+ if @buffer.size > 0
135
+ false
136
+ else
137
+ @io.eof?
138
+ end
139
+ end
140
+
141
+ # Same as IO#gets.
142
+ def gets(io_rs = $/)
143
+ if @buffer.size > 0
144
+ if io_rs == nil then
145
+ r = @buffer + @io.gets(nil).to_s
146
+ @buffer = ''
147
+ else
148
+ if io_rs == '' then
149
+ sp_rs = /\n\n/n
150
+ sp_rs_orig = "\n\n"
151
+ else
152
+ sp_rs = Regexp.new(Regexp.escape(io_rs, 'n'), 0, 'n')
153
+ sp_rs_orig = io_rs
154
+ end
155
+ a = @buffer.split(sp_rs, 2)
156
+ if a.size > 1 then
157
+ r = a[0] + sp_rs_orig
158
+ @buffer = a[1]
159
+ else
160
+ @buffer << @io.gets(io_rs).to_s
161
+ a = @buffer.split(sp_rs, 2)
162
+ if a.size > 1 then
163
+ r = a[0] + sp_rs_orig
164
+ @buffer = a[1].to_s
165
+ else
166
+ r = @buffer
167
+ @buffer = ''
168
+ end
169
+ end
170
+ end
171
+ r
172
+ else
173
+ @io.gets(io_rs)
174
+ end
175
+ end
176
+
177
+ # Pushes back given str to the internal buffer.
178
+ # Returns nil.
179
+ # str must be read previously with the wrapper object.
180
+ #
181
+ # Note that in current implementation, the str can be everything,
182
+ # but please don't depend on it.
183
+ #
184
+ def ungets(str)
185
+ @buffer = str + @buffer
186
+ nil
187
+ end
188
+
189
+ # Same as IO#getc.
190
+ def getc
191
+ if @buffer.size > 0 then
192
+ r = @buffer[0]
193
+ @buffer = @buffer[1..-1]
194
+ else
195
+ r = @io.getc
196
+ end
197
+ r
198
+ end
199
+
200
+ # Pushes back one character into the internal buffer.
201
+ # Unlike IO#ungetc, it can be called more than one time.
202
+ def ungetc(c)
203
+ @buffer = sprintf("%c", c) + @buffer
204
+ nil
205
+ end
206
+
207
+ # Gets current prefetch buffer
208
+ def prefetch_buffer
209
+ @buffer
210
+ end
211
+
212
+ # It does @io.gets, and adds the returned string
213
+ # to the internal buffer, and returns the string.
214
+ def prefetch_gets(*arg)
215
+ r = @io.gets(*arg)
216
+ @buffer << r if r
217
+ r
218
+ end
219
+
220
+ # It does @io.readpartial, and adds the returned string
221
+ # to the internal buffer, and returns the string.
222
+ def prefetch_readpartial(*arg)
223
+ r = @io.readpartial(*arg)
224
+ @buffer << r if r
225
+ r
226
+ end
227
+
228
+ # Skips space characters in the stream.
229
+ # returns nil.
230
+ def skip_spaces
231
+ ws = { ?\s => true, ?\n => true, ?\r => true, ?\t => true }
232
+ while r = self.getc
233
+ unless ws[r] then
234
+ self.ungetc(r)
235
+ break
236
+ end
237
+ end
238
+ nil
239
+ end
240
+ end #class BufferedInputStream
241
+
242
+ # Splitter is a class to get entries from a buffered input stream.
243
+ module Splitter
244
+ # This is a template of splitter.
245
+ class Template
246
+ # Creates a new splitter.
247
+ def initialize(klass, bstream)
248
+ @stream = bstream
249
+ raise NotImplementedError
250
+ end
251
+
252
+ # skips leader of the entry.
253
+ def skip_leader
254
+ raise NotImplementedError
255
+ end
256
+
257
+ # Gets entry as a string
258
+ def get_entry
259
+ raise NotImplementedError
260
+ end
261
+
262
+ # the last entry read from the stream
263
+ attr_reader :entry
264
+
265
+ # start position of the entry
266
+ attr_reader :entry_start_pos
267
+
268
+ # (end position of the entry) + 1
269
+ attr_reader :entry_ended_pos
270
+ end
271
+
272
+ # Default splitter.
273
+ # It sees following constants in the given class.
274
+ # DELIMITER:: (String) delimiter indicates the end of a entry.
275
+ # FLATFILE_HEADER:: (String) start of a entry, located on head of a line.
276
+ # DELIMITER_OVERRUN:: (Integer) excess read size included in DELIMITER.
277
+ #
278
+ class Default < Template
279
+ # Creates a new splitter.
280
+ # klass:: database class
281
+ # bstream:: input stream. It must be a BufferedInputStream object.
282
+ def initialize(klass, bstream)
283
+ @stream = bstream
284
+ @delimiter = klass::DELIMITER rescue nil
285
+ @header = klass::FLATFILE_HEADER rescue nil
286
+ # for specific classes' benefit
287
+ unless header
288
+ if klass == Bio::GenBank or klass == Bio::GenPept
289
+ @header = 'LOCUS '
290
+ end
291
+ end
292
+ @delimiter_overrun = klass::DELIMITER_OVERRUN rescue nil
293
+ end
294
+
295
+ # (String) delimiter indicates the end of a entry.
296
+ attr_accessor :delimiter
297
+
298
+ # (String) start of a entry, located on head of a line.
299
+ attr_accessor :header
300
+
301
+ # (Integer) excess read data size included in delimiter.
302
+ attr_accessor :delimiter_overrun
303
+
304
+ # Skips leader of the entry.
305
+ #
306
+ # If @header is not nil, it reads till the contents of @header
307
+ # comes at the head of a line.
308
+ # If correct FLATFILE_HEADER is found, returns true.
309
+ # Otherwise, returns nil.
310
+ def skip_leader
311
+ if @header then
312
+ data = ''
313
+ while s = @stream.gets(@header)
314
+ data << s
315
+ if data.split(/[\r\n]+/)[-1] == @header then
316
+ @stream.ungets(@header)
317
+ return true
318
+ end
319
+ end
320
+ # @header was not found. For safety,
321
+ # pushes back data with removing white spaces in the head.
322
+ data.sub(/\A\s+/, '')
323
+ @stream.ungets(data)
324
+ return nil
325
+ else
326
+ @stream.skip_spaces
327
+ return nil
328
+ end
329
+ end
330
+
331
+ # gets a entry
332
+ def get_entry
333
+ p0 = @stream.pos
334
+ e = @stream.gets(@delimiter)
335
+ if e and @delimiter_overrun then
336
+ if e[-@delimiter.size, @delimiter.size ] == @delimiter then
337
+ overrun = e[-@delimiter_overrun, @delimiter_overrun]
338
+ e[-@delimiter_overrun, @delimiter_overrun] = ''
339
+ @stream.ungets(overrun)
340
+ end
341
+ end
342
+ p1 = @stream.pos
343
+ @entry_start_pos = p0
344
+ @entry = e
345
+ @entry_ended_pos = p1
346
+ @entry
347
+ end
348
+ end #class Defalult
349
+ end #module Splitter
350
+
351
+ #
352
+ # Bio::FlatFile.open(file, *arg)
353
+ # Bio::FlatFile.open(dbclass, file, *arg)
354
+ #
41
355
  # Creates a new Bio::FlatFile object to read a file or a stream
42
- # which contains +dbclass+ data.
356
+ # which contains _dbclass_ data.
43
357
  #
44
- # +dbclass+ should be a class (or module) or nil.
358
+ # _dbclass_ should be a class (or module) or nil.
45
359
  # e.g. Bio::GenBank, Bio::FastaFormat.
46
360
  #
47
- # If +file+ is a filename (which doesn't have gets method),
48
- # the method opens a local file named +file+
49
- # with 'File.open(filename, mode, perm)'.
361
+ # If _file_ is a filename (which doesn't have gets method),
362
+ # the method opens a local file named _file_
363
+ # with <code>File.open(filename, *arg)</code>.
50
364
  #
51
- # When nil is given to dbclass, trying to determine database class
52
- # (file format) automatically. If fails to determine, dbclass is
53
- # set to nil and FlatFile#next_entry works same as IO#gets when
54
- # raw = true. It is recommended to set dbclass using
55
- # FlatFile#dbclass= method if fails to determine automatically.
365
+ # When _dbclass_ is omitted or nil is given to _dbclass_,
366
+ # the method tries to determine database class
367
+ # (file format) automatically.
368
+ # When it fails to determine, dbclass is set to nil
369
+ # and FlatFile#next_entry would fail.
370
+ # You can still set dbclass using FlatFile#dbclass= method.
56
371
  #
57
372
  # * Example 1
58
373
  # Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
59
374
  # * Example 2
60
375
  # Bio::FlatFile.open(nil, "embl/est_hum17.dat")
61
376
  # * Example 3
377
+ # Bio::FlatFile.open("genbank/gbest40.seq")
378
+ #
379
+ # * Example 4
62
380
  # Bio::FlatFile.open(Bio::GenBank, $stdin)
63
381
  #
64
- # If it is called with block, the block will be executed with
65
- # a newly opened Bio::FlatFile instance object. If filename
66
- # is given, the file is automatically closed when leaving the block.
382
+ # If it is called with a block, the block will be executed with
383
+ # a new Bio::FlatFile object. If filename is given,
384
+ # the file is automatically closed when leaving the block.
67
385
  #
68
- # * Example 4
386
+ # * Example 5
69
387
  # Bio::FlatFile.open(nil, 'test4.fst') do |ff|
70
388
  # ff.each { |e| print e.definition, "\n" }
71
389
  # end
72
390
  #
73
- def self.open(dbclass, file, *arg)
74
- # 3rd and 4th arg: mode, perm (passed to File.open)
75
- openmode = []
76
- while x = arg[0] and !x.is_a?(Hash)
77
- openmode << arg.shift
78
- end
79
- # rest of arg: passed to FlatFile.new
80
- # create a flatfile object
391
+ # * Example 6
392
+ # Bio::FlatFile.open('test4.fst') do |ff|
393
+ # ff.each { |e| print e.definition, "\n" }
394
+ # end
395
+ #
396
+ # Compatibility Note:
397
+ # <em>*arg</em> is completely passed to the <code>File.open</code>
398
+ # and you cannot specify ":raw => true" or ":raw => false".
399
+ #
400
+ def self.open(*arg, &block)
401
+ # FlatFile.open(dbclass, file, mode, perm)
402
+ # FlatFile.open(file, mode, perm)
403
+ if arg.size <= 0
404
+ raise ArgumentError, 'wrong number of arguments (0 for 1)'
405
+ end
406
+ x = arg.shift
407
+ if x.is_a?(Module) then
408
+ # FlatFile.open(dbclass, filename_or_io, ...)
409
+ dbclass = x
410
+ elsif x.nil? then
411
+ # FlatFile.open(nil, filename_or_io, ...)
412
+ dbclass = nil
413
+ else
414
+ # FlatFile.open(filename, ...)
415
+ dbclass = nil
416
+ arg.unshift(x)
417
+ end
418
+ if arg.size <= 0
419
+ raise ArgumentError, 'wrong number of arguments (1 for 2)'
420
+ end
421
+ file = arg.shift
422
+ # check if file is filename or IO object
81
423
  unless file.respond_to?(:gets)
82
424
  # 'file' is a filename
83
- if block_given? then
84
- File.open(file, *openmode) do |fobj|
85
- ff = self.new(dbclass, fobj, *arg)
86
- yield ff
87
- end
88
- else
89
- fobj = File.open(file, *openmode)
90
- self.new(dbclass, fobj, *arg)
91
- end
425
+ self.open_file(file, *arg, &block)
92
426
  else
93
427
  # 'file' is a IO object
94
- ff = self.new(dbclass, file, *arg)
428
+ ff = self.new(dbclass, file)
95
429
  block_given? ? (yield ff) : ff
96
430
  end
97
431
  end
@@ -110,7 +444,8 @@ module Bio
110
444
  end
111
445
 
112
446
  # Same as FlatFile.auto(filename_or_stream, *arg).to_a
113
- # (It might be OBSOLETED in the future.)
447
+ #
448
+ # (This method might be OBSOLETED in the future.)
114
449
  def self.to_a(*arg)
115
450
  self.auto(*arg) do |ff|
116
451
  raise 'cannot determine file format' unless ff.dbclass
@@ -118,6 +453,46 @@ module Bio
118
453
  end
119
454
  end
120
455
 
456
+ # Same as FlatFile.auto(filename, *arg),
457
+ # except that it only accepts a filename and doesn't accept an IO object.
458
+ # File format is automatically determined.
459
+ #
460
+ # It can accept a block.
461
+ # If a block is given, it returns the block's return value.
462
+ # Otherwise, it returns a new FlatFile object.
463
+ #
464
+ def self.open_file(filename, *arg)
465
+ if block_given? then
466
+ BufferedInputStream.open_file(filename, *arg) do |stream|
467
+ yield self.new(nil, stream)
468
+ end
469
+ else
470
+ stream = BufferedInputStream.open_file(filename, *arg)
471
+ self.new(nil, stream)
472
+ end
473
+ end
474
+
475
+ # Opens URI specified as _uri_.
476
+ # _uri_ must be a String or URI object.
477
+ # *arg is passed to OpenURI.open_uri or URI#open.
478
+ #
479
+ # Like FlatFile.open, it can accept a block.
480
+ #
481
+ # Note that you MUST explicitly require 'open-uri'.
482
+ # Because open-uri.rb modifies existing class,
483
+ # it isn't required by default.
484
+ #
485
+ def self.open_uri(uri, *arg)
486
+ if block_given? then
487
+ BufferedInputStream.open_uri(uri, *arg) do |stream|
488
+ yield self.new(nil, stream)
489
+ end
490
+ else
491
+ stream = BufferedInputStream.open_uri(uri, *arg)
492
+ self.new(nil, stream)
493
+ end
494
+ end
495
+
121
496
  # Same as FlatFile.open, except that 'stream' should be a opened
122
497
  # stream object (IO, File, ..., who have the 'gets' method).
123
498
  #
@@ -126,62 +501,101 @@ module Bio
126
501
  # * Example 2
127
502
  # Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
128
503
  #
129
- # +options+ should be a hash (or nil). It will be OBSOLETED!!
130
- # Available options are below:
131
- # [<tt>:raw</tt>] if true, "raw mode" (same as #raw=true).
132
- # default: false (not "raw mode").
504
+ # Compatibility Note:
505
+ # Now, you cannot specify ":raw => true" or ":raw => false".
506
+ # Below styles are DEPRECATED.
133
507
  #
134
- # * Example 3
135
- # Bio::FlatFile.new(nil, $stdin, :raw=>true)
508
+ # * Example 3 (deprecated)
509
+ # # Bio::FlatFile.new(nil, $stdin, :raw=>true) # => ERROR
510
+ # # Please rewrite as below.
511
+ # ff = Bio::FlatFile.new(nil, $stdin)
512
+ # ff.raw = true
136
513
  # * Example 3 in old style (deprecated)
137
- # Bio::FlatFile.new(nil, $stdin, true)
514
+ # # Bio::FlatFile.new(nil, $stdin, true) # => ERROR
515
+ # # Please rewrite as below.
516
+ # ff = Bio::FlatFile.new(nil, $stdin)
517
+ # ff.raw = true
138
518
  #
139
- def initialize(dbclass, stream, options = nil)
519
+ def initialize(dbclass, stream)
140
520
  # 2nd arg: IO object
141
- @io = stream
142
- # 3rd arg: options (nil or a Hash)
143
- self.raw = false
144
- if options.is_a?(Hash) then
145
- self.raw = options[:raw] if options.has_key?(:raw)
521
+ if @stream.kind_of?(BufferedInputStream)
522
+ @stream = stream
146
523
  else
147
- self.raw = options
524
+ @stream = BufferedInputStream.for_io(stream)
148
525
  end
149
- # initialize prefetch buffer
150
- @prefetch = ''
526
+ # default is not raw mode
527
+ self.raw = false
151
528
  # 1st arg: database class (or file format autodetection)
152
529
  if dbclass then
153
- self.dbclass = dbclass
530
+ self.dbclass = dbclass
154
531
  else
155
- autodetect
532
+ autodetect
156
533
  end
534
+ #
535
+ @skip_leader_mode = :firsttime
536
+ @firsttime_flag = true
537
+ end
538
+
539
+ # The mode how to skip leader of the data.
540
+ # :firsttime :: (DEFAULT) only head of file (= first time to read)
541
+ # :everytime :: everytime to read entry
542
+ # nil :: never skip
543
+ attr_accessor :skip_leader_mode
544
+
545
+ # (DEPRECATED) IO object in the flatfile object.
546
+ #
547
+ # Compatibility Note: Bio::FlatFile#io is deprecated.
548
+ # Please use Bio::FlatFile#to_io instead.
549
+ def io
550
+ warn "Bio::FlatFile#io is deprecated."
551
+ @stream.to_io
157
552
  end
158
553
 
159
554
  # IO object in the flatfile object.
160
- attr_reader :io
555
+ #
556
+ # Compatibility Note: Bio::FlatFile#io is deprecated.
557
+ def to_io
558
+ @stream.to_io
559
+ end
560
+
561
+ # Pathname, filename or URI (or nil).
562
+ def path
563
+ @stream.path
564
+ end
161
565
 
162
566
  # Get next entry.
163
567
  def next_entry
164
- @entry_raw = gets(@rs)
165
- return nil unless @entry_raw
568
+ if @skip_leader_mode and
569
+ ((@firsttime_flag and @skip_leader_mode == :firsttime) or
570
+ @skip_leader_mode == :everytime)
571
+ @splitter.skip_leader
572
+ end
573
+ r = @splitter.get_entry
574
+ @firsttime_flag = false
575
+ return nil unless r
166
576
  if raw then
167
- @entry_raw
577
+ r
168
578
  else
169
- e = @dbclass.new(@entry_raw)
170
- begin
171
- s = e.entry_overrun
172
- rescue NameError
173
- s = nil
174
- end
175
- if s then
176
- @entry_raw[-(s.length), s.length] = ''
177
- ungets(s)
178
- end
179
- e
579
+ @entry = @dbclass.new(r)
580
+ @entry
180
581
  end
181
582
  end
583
+ attr_reader :entry
182
584
 
183
585
  # Returns the last raw entry as a string.
184
- attr_reader :entry_raw
586
+ def entry_raw
587
+ @splitter.entry
588
+ end
589
+
590
+ # start position of the last entry
591
+ def entry_start_pos
592
+ @splitter.entry_start_pos
593
+ end
594
+
595
+ # (end position of the last entry) + 1
596
+ def entry_ended_pos
597
+ @splitter.entry_ended_pos
598
+ end
185
599
 
186
600
  # Iterates over each entry in the flatfile.
187
601
  #
@@ -193,23 +607,23 @@ module Bio
193
607
  # end
194
608
  def each_entry
195
609
  while e = self.next_entry
196
- yield e
610
+ yield e
197
611
  end
198
612
  end
199
- alias each each_entry
613
+ alias :each :each_entry
200
614
 
201
615
  # Resets file pointer to the start of the flatfile.
202
616
  # (similar to IO#rewind)
203
617
  def rewind
204
- r = @io.rewind
205
- @prefetch = ''
618
+ r = @stream.rewind
619
+ @firsttime_flag = true
206
620
  r
207
621
  end
208
622
 
209
623
  # Closes input stream.
210
624
  # (similar to IO#close)
211
625
  def close
212
- @io.close
626
+ @stream.close
213
627
  end
214
628
 
215
629
  # Returns current position of input stream.
@@ -217,9 +631,9 @@ module Bio
217
631
  # the result is not guaranteed.
218
632
  # It is similar to IO#pos.
219
633
  # Note that it will not be equal to io.pos,
220
- # because FlatFile#autodetect may pre-read some lines.
634
+ # because FlatFile has its own internal buffer.
221
635
  def pos
222
- @io.pos - @prefetch.size
636
+ @stream.pos
223
637
  end
224
638
 
225
639
  # (Not recommended to use it.)
@@ -228,86 +642,17 @@ module Bio
228
642
  # the result is not guaranteed.
229
643
  # It is similar to IO#pos=.
230
644
  # Note that it will not be equal to io.pos=,
231
- # because FlatFile#autodetect may pre-read some lines.
645
+ # because FlatFile has its own internal buffer.
232
646
  def pos=(p)
233
- r = (@io.pos = p)
234
- @prefetch = ''
235
- r
647
+ @stream.pos=(p)
236
648
  end
237
649
 
238
650
  # Returns true if input stream is end-of-file.
239
651
  # Otherwise, returns false.
240
652
  # (Similar to IO#eof?, but may not be equal to io.eof?,
241
- # because FlatFile#autodetect may pre-read some lines.)
653
+ # because FlatFile has its own internal buffer.)
242
654
  def eof?
243
- if @prefetch.size > 0
244
- false
245
- else
246
- @io.eof?
247
- end
248
- end
249
-
250
- # Similar to IO#gets.
251
- # Internal use only. Users should not call it directly.
252
- def gets(io_rs = $/)
253
- if @prefetch.size > 0
254
- if io_rs == nil then
255
- r = @prefetch + @io.gets(nil).to_s
256
- @prefetch = ''
257
- else
258
- if io_rs == '' then
259
- sp_rs = /\n\n/n
260
- sp_rs_orig = "\n\n"
261
- else
262
- sp_rs = Regexp.new(Regexp.escape(io_rs, 'n'), 0, 'n')
263
- sp_rs_orig = io_rs
264
- end
265
- a = @prefetch.split(sp_rs, 2)
266
- if a.size > 1 then
267
- r = a[0] + sp_rs_orig
268
- @prefetch = a[1]
269
- else
270
- @prefetch << @io.gets(io_rs).to_s
271
- a = @prefetch.split(sp_rs, 2)
272
- if a.size > 1 then
273
- r = a[0] + sp_rs_orig
274
- @prefetch = a[1].to_s
275
- else
276
- r = @prefetch
277
- @prefetch = ''
278
- end
279
- end
280
- end
281
- r
282
- else
283
- @io.gets(io_rs)
284
- end
285
- end
286
-
287
- # Unread read data.
288
- # Internal use only. Users must not call it.
289
- def ungets(str)
290
- @prefetch = str + @prefetch
291
- nil
292
- end
293
-
294
- # Similar to IO#getc.
295
- # Internal use only. Users should not call it directly.
296
- def getc
297
- if @prefetch.size > 0 then
298
- r = @prefetch[0]
299
- @prefetch = @prefetch[1..-1]
300
- else
301
- r = @io.getc
302
- end
303
- r
304
- end
305
-
306
- # Similar to IO#ungetc.
307
- # Internal use only. Users should not call it.
308
- def ungetc(c)
309
- @prefetch = sprintf("%c", c) + @prefetch
310
- nil
655
+ @stream.eof?
311
656
  end
312
657
 
313
658
  # If true is given, the next_entry method returns
@@ -319,14 +664,24 @@ module Bio
319
664
  # If true, raw mode.
320
665
  attr_reader :raw
321
666
 
667
+ # Similar to IO#gets.
668
+ # Internal use only. Users should not call it directly.
669
+ def gets(*arg)
670
+ @stream.gets(*arg)
671
+ end
672
+
322
673
  # Sets database class. Plese use only if autodetect fails.
323
- def dbclass=(k)
324
- if k then
325
- @dbclass = k
326
- @rs = @dbclass::DELIMITER
674
+ def dbclass=(klass)
675
+ if klass then
676
+ @dbclass = klass
677
+ begin
678
+ @splitter = @dbclass.flatfile_splitter(@dbclass, @stream)
679
+ rescue NameError, NoMethodError
680
+ @splitter = Splitter::Default.new(klass, @stream)
681
+ end
327
682
  else
328
- @dbclass = nil
329
- @rs = $/
683
+ @dbclass = nil
684
+ @splitter = nil
330
685
  end
331
686
  end
332
687
 
@@ -340,157 +695,518 @@ module Bio
340
695
  #
341
696
  # The method can be called anytime if you want (but not recommended).
342
697
  # This might be useful if input file is a mixture of muitiple format data.
343
- def autodetect(lines = 31)
344
- r = nil
345
- 1.upto(lines) do |x|
346
- if line = @io.gets then
347
- @prefetch << line
348
- if line and line.strip.size > 0 then
349
- r = self.class.autodetect(@prefetch)
350
- if r then
351
- self.dbclass = r
352
- return r
353
- end
354
- end
355
- end
698
+ def autodetect(lines = 31, ad = AutoDetect.default)
699
+ if r = ad.autodetect_flatfile(self, lines)
700
+ self.dbclass = r
701
+ else
702
+ self.dbclass = nil unless self.dbclass
356
703
  end
357
- self.dbclass = nil unless dbclass
358
704
  r
359
705
  end
360
706
 
361
707
  # Detects database class (== file format) of given file.
362
708
  # If fails to determine, returns nil.
363
709
  def self.autodetect_file(filename)
364
- ff = self.open(nil, filename)
365
- r = ff.dbclass
366
- ff.close
367
- r
710
+ self.open_file(filename).dbclass
368
711
  end
369
712
 
370
713
  # Detects database class (== file format) of given input stream.
371
714
  # If fails to determine, returns nil.
372
715
  # Caution: the method reads some data from the input stream,
373
716
  # and the data will be lost.
717
+ def self.autodetect_io(io)
718
+ self.new(nil, io).dbclass
719
+ end
720
+
721
+ # This is OBSOLETED. Please use autodetect_io(io) instead.
374
722
  def self.autodetect_stream(io)
375
- ff = self.new(nil, io)
376
- r = ff.dbclass
377
- r
723
+ $stderr.print "Bio::FlatFile.autodetect_stream will be deprecated." if $VERBOSE
724
+ self.autodetect_io(io)
378
725
  end
379
726
 
380
727
  # Detects database class (== file format) of given string.
381
728
  # If fails to determine, returns false or nil.
382
729
  def self.autodetect(text)
383
- require 'bio'
384
- case text
385
- when /^LOCUS .+ bp .*[a-z]*[DR]?NA/
386
- Bio::GenBank
387
- when /^LOCUS .+ aa .+/
388
- Bio::GenPept
389
- when /^UI \- [0-9]+$/
390
- Bio::MEDLINE
391
-
392
- when /^ID .+\; .*(DNA|RNA|XXX)\;/
393
- Bio::EMBL
394
- when /^ID .+\; *PRT\;/
395
- Bio::SPTR
396
- when /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/
397
- Bio::PROSITE
398
- when /^AC [-A-Za-z0-9_\.]+$/
399
- Bio::TRANSFAC
400
-
401
- when /^H [-A-Z0-9_\.]+$/
402
- if text =~ /^M [rc]/ then
403
- Bio::AAindex2
404
- elsif text =~ /^I A\/L/ then
405
- Bio::AAindex1
406
- else
407
- false #fail to determine
730
+ AutoDetect.default.autodetect(text)
731
+ end
732
+
733
+
734
+ # AutoDetect automatically determines database class of given data.
735
+ class AutoDetect
736
+
737
+ include TSort
738
+
739
+ # Template of a single rule of autodetection
740
+ class RuleTemplate
741
+ # Creates a new element.
742
+ def self.[](*arg)
743
+ self.new(*arg)
744
+ end
745
+
746
+ # Creates a new element.
747
+ def initialize
748
+ a = Array.new
749
+ def a.inspect
750
+ "[#{self.collect { |e| e.name.inspect }.join(' ')}]"
751
+ end
752
+ @higher_priority_elements = a.clone
753
+ @lower_priority_elements = a.clone
754
+ @name = nil
408
755
  end
409
756
 
410
- when /^CODE [0-9]+$/
411
- Bio::LITDB
412
- when /^Entry [A-Z0-9]+/
413
- Bio::KEGG::BRITE
414
-
415
- when /^ENTRY .+ KO\s*$/
416
- Bio::KEGG::KO
417
- when /^ENTRY .+ Glycan\s*$/
418
- Bio::KEGG::GLYCAN
419
- when /^ENTRY .+ (CDS|gene|.*RNA) /
420
- Bio::KEGG::GENES
421
- when /^ENTRY EC [0-9\.]+$/
422
- Bio::KEGG::ENZYME
423
- when /^ENTRY C[A-Za-z0-9\._]+$/
424
- Bio::KEGG::COMPOUND
425
- when /^ENTRY R[A-Za-z0-9\._]+$/
426
- Bio::KEGG::REACTION
427
- when /^ENTRY [a-z]+$/
428
- Bio::KEGG::GENOME
429
-
430
- when /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/
431
- if $1 == 'clusters'
432
- Bio::FANTOM::MaXML::Cluster
433
- elsif $1 == 'sequences'
434
- Bio::FANTOM::MaXML::Sequence
435
- else
436
- nil #unknown
757
+ # self is prior to the _elem_.
758
+ def is_prior_to(elem)
759
+ return nil if self == elem
760
+ elem.higher_priority_elements << self
761
+ self.lower_priority_elements << elem
762
+ true
437
763
  end
438
764
 
439
- when /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/
440
- Bio::PDB
765
+ # higher priority elements
766
+ attr_reader :higher_priority_elements
767
+ # lower priority elements
768
+ attr_reader :lower_priority_elements
769
+
770
+ # database classes
771
+ attr_reader :dbclasses
441
772
 
442
- when /^CLUSTAL .*\(.*\).*sequence +alignment/
443
- Bio::ClustalW::Report
773
+ # unique name of the element
774
+ attr_accessor :name
775
+
776
+ # If given text (and/or meta information) is known, returns
777
+ # the database class.
778
+ # Otherwise, returns nil or false.
779
+ #
780
+ # _text_ will be a String.
781
+ # _meta_ will be a Hash.
782
+ # _meta_ may contain following keys.
783
+ # :path => pathname, filename or uri.
784
+ def guess(text, meta)
785
+ nil
786
+ end
787
+ end #class Rule_Template
788
+
789
+ # RuleDebug is a class for debugging autodetect classes/methods
790
+ class RuleDebug < RuleTemplate
791
+ # Creates a new instance.
792
+ def initialize(name)
793
+ super()
794
+ @name = name
795
+ end
444
796
 
445
- when /\<\!DOCTYPE BlastOutput PUBLIC /
446
- Bio::Blast::Report
797
+ # prints information to the $stderr.
798
+ def guess(text, meta)
799
+ $stderr.puts @name
800
+ $stderr.puts text.inspect
801
+ $stderr.puts meta.inspect
802
+ nil
803
+ end
804
+ end #class RuleDebug
447
805
 
448
- when /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
449
- Bio::Blast::WU::Report
450
- when /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
451
- Bio::Blast::WU::Report_TBlast
806
+ # Special element that is always top or bottom priority.
807
+ class RuleSpecial < RuleTemplate
808
+ def initialize(name)
809
+ #super()
810
+ @name = name
811
+ end
812
+ # modification of @name is inhibited.
813
+ def name=(x)
814
+ raise 'cannot modify name'
815
+ end
452
816
 
453
- when /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
454
- Bio::Blast::Default::Report
455
- when /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
456
- Bio::Blast::Default::Report_TBlast
817
+ # always returns void array
818
+ def higher_priority_elements
819
+ []
820
+ end
821
+ # always returns void array
822
+ def lower_priority_elements
823
+ []
824
+ end
825
+ end #class RuleSpecial
457
826
 
458
- when /^psLayout version \d+\s*$/
459
- Bio::Blat::Report
460
- when /^\-\-SPIDEY version .+\-\-$/
461
- Bio::Spidey::Report
827
+ # Special element that is always top priority.
828
+ TopRule = RuleSpecial.new('top')
829
+ # Special element that is always bottom priority.
830
+ BottomRule = RuleSpecial.new('bottom')
462
831
 
463
- when /^HMMER +\d+\./
464
- Bio::HMMER::Report
832
+ # A autodetection rule to use a regular expression
833
+ class RuleRegexp < RuleTemplate
834
+ # Creates a new instance.
835
+ def initialize(dbclass, re)
836
+ super()
837
+ @re = re
838
+ @dbclass = dbclass
839
+ @dbclasses = [ dbclass ]
840
+ @name = dbclass.to_s
841
+ end
465
842
 
466
- when /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/
467
- Bio::Sim4::Report
843
+ # If given text matches the regexp, returns the database class.
844
+ # Otherwise, returns nil or false.
845
+ # _meta_ is ignored.
846
+ def guess(text, meta)
847
+ @re =~ text ? @dbclass : nil
848
+ end
849
+ end #class RuleRegexp
468
850
 
469
- when /^>.+$/
470
- if text =~ /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ then
471
- Bio::NBRF
472
- elsif text =~ /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ then
473
- Bio::FastaFormat
474
- elsif text =~ /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ then
475
- Bio::FastaNumericFormat
851
+ # A autodetection rule to use more than two regular expressions.
852
+ class RuleRegexp2 < RuleTemplate
853
+ # Creates a new instance.
854
+ def initialize(dbclass, *regexps)
855
+ super()
856
+ @regexps = regexps
857
+ @dbclass = dbclass
858
+ @dbclasses = [ dbclass ]
859
+ if name
860
+ @name = name
861
+ else
862
+ @name = @dbclass.to_s
863
+ end
864
+ end
865
+
866
+ # If given text matches the regexp, returns the database class.
867
+ # Otherwise, returns nil or false.
868
+ # _meta_ is ignored.
869
+ def guess(text, meta)
870
+ @regexps.each do |re|
871
+ return @dbclass if re =~ text
872
+ end
873
+ nil
874
+ end
875
+ end #class RuleRegexp
876
+
877
+ # A autodetection rule that passes data to the proc object.
878
+ class RuleProc < RuleTemplate
879
+ # Creates a new instance.
880
+ def initialize(*dbclasses, &proc)
881
+ super()
882
+ @proc = proc
883
+ @dbclasses = dbclasses
884
+ @name = dbclasses.collect { |x| x.to_s }.join('|')
885
+ end
886
+
887
+ # If given text (and/or meta information) is known, returns
888
+ # the database class.
889
+ # Otherwise, returns nil or false.
890
+ #
891
+ # Refer RuleTemplate#guess for _meta_.
892
+ def guess(text, meta)
893
+ @proc.call(text)
894
+ end
895
+ end #class RuleProc
896
+
897
+ # Creates a new Autodetect object
898
+ def initialize
899
+ # stores autodetection rules.
900
+ @rules = Hash.new
901
+ # stores elements (cache)
902
+ @elements = nil
903
+ self.add(TopRule)
904
+ self.add(BottomRule)
905
+ end
906
+
907
+ # Adds a new element.
908
+ # Returns _elem_.
909
+ def add(elem)
910
+ raise 'element name conflicts' if @rules[elem.name]
911
+ @elements = nil
912
+ @rules[elem.name] = elem
913
+ elem
914
+ end
915
+
916
+ # (required by TSort.)
917
+ # For all elements, yields each element.
918
+ def tsort_each_node(&x)
919
+ @rules.each_value(&x)
920
+ end
921
+
922
+ # (required by TSort.)
923
+ # For a given element, yields each child
924
+ # (= lower priority elements) of the element.
925
+ def tsort_each_child(elem)
926
+ if elem == TopRule then
927
+ @rules.each_value do |e|
928
+ yield e unless e == TopRule or
929
+ e.lower_priority_elements.index(TopRule)
930
+ end
931
+ elsif elem == BottomRule then
932
+ @rules.each_value do |e|
933
+ yield e if e.higher_priority_elements.index(BottomRule)
934
+ end
476
935
  else
477
- false #fail to determine
936
+ elem.lower_priority_elements.each do |e|
937
+ yield e if e != BottomRule
938
+ end
939
+ unless elem.higher_priority_elements.index(BottomRule)
940
+ yield BottomRule
941
+ end
478
942
  end
943
+ end
479
944
 
480
- else
481
- nil #not found
945
+ # Returns current elements as an array
946
+ # whose order fulfills all elements' priorities.
947
+ def elements
948
+ unless @elements
949
+ ary = tsort
950
+ ary.reverse!
951
+ @elements = ary
952
+ end
953
+ @elements
954
+ end
955
+
956
+ # rebuilds the object and clears internal cache.
957
+ def rehash
958
+ @rules.rehash
959
+ @elements = nil
960
+ end
961
+
962
+ # visualizes the object (mainly for debug)
963
+ def inspect
964
+ "<#{self.class.to_s} " +
965
+ self.elements.collect { |e| e.name.inspect }.join(' ') +
966
+ ">"
967
+ end
968
+
969
+ # Iterates over each element.
970
+ def each_rule(&x) #:yields: elem
971
+ elements.each(&x)
482
972
  end
483
- end
484
973
 
974
+ # Autodetect from the text.
975
+ # Returns a database class if succeeded.
976
+ # Returns nil if failed.
977
+ def autodetect(text, meta = {})
978
+ r = nil
979
+ elements.each do |e|
980
+ #$stderr.puts e.name
981
+ r = e.guess(text, meta)
982
+ break if r
983
+ end
984
+ r
985
+ end
986
+
987
+ # autodetect from the FlatFile object.
988
+ # Returns a database class if succeeded.
989
+ # Returns nil if failed.
990
+ def autodetect_flatfile(ff, lines = 31)
991
+ meta = {}
992
+ stream = ff.instance_eval { @stream }
993
+ begin
994
+ path = stream.path
995
+ rescue NameError
996
+ end
997
+ if path then
998
+ meta[:path] = path
999
+ # call autodetect onece with meta and without any read action
1000
+ if r = self.autodetect(stream.prefetch_buffer, meta)
1001
+ return r
1002
+ end
1003
+ end
1004
+ # reading stream
1005
+ 1.upto(lines) do |x|
1006
+ break unless line = stream.prefetch_gets
1007
+ if line.strip.size > 0 then
1008
+ if r = self.autodetect(stream.prefetch_buffer, meta)
1009
+ return r
1010
+ end
1011
+ end
1012
+ end
1013
+ return nil
1014
+ end
1015
+
1016
+ # default autodetect object for class method
1017
+ @default = nil
1018
+
1019
+ # returns the default autodetect object
1020
+ def self.default
1021
+ unless @default then
1022
+ @default = self.make_default
1023
+ end
1024
+ @default
1025
+ end
1026
+
1027
+ # sets the default autodetect object.
1028
+ def self.default=(ad)
1029
+ @default = ad
1030
+ end
1031
+
1032
+ # make a new autodetect object
1033
+ def self.[](*arg)
1034
+ a = self.new
1035
+ arg.each { |e| a.add(e) }
1036
+ a
1037
+ end
1038
+
1039
+ # make a default of default autodetect object
1040
+ def self.make_default
1041
+ a = self[
1042
+ genbank = RuleRegexp[ Bio::GenBank,
1043
+ /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
1044
+ genpept = RuleRegexp[ Bio::GenPept,
1045
+ /^LOCUS .+ aa .+/ ],
1046
+ medline = RuleRegexp[ Bio::MEDLINE,
1047
+ /^UI \- [0-9]+$/ ],
1048
+ embl = RuleRegexp[ Bio::EMBL,
1049
+ /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
1050
+ sptr = RuleRegexp[ Bio::SPTR,
1051
+ /^ID .+\; *PRT\;/ ],
1052
+ prosite = RuleRegexp[ Bio::PROSITE,
1053
+ /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
1054
+ transfac = RuleRegexp[ Bio::TRANSFAC,
1055
+ /^AC [-A-Za-z0-9_\.]+$/ ],
1056
+
1057
+ aaindex = RuleProc.new(Bio::AAindex1, Bio::AAindex2) do |text|
1058
+ if /^H [-A-Z0-9_\.]+$/ =~ text then
1059
+ if text =~ /^M [rc]/ then
1060
+ Bio::AAindex2
1061
+ elsif text =~ /^I A\/L/ then
1062
+ Bio::AAindex1
1063
+ else
1064
+ false #fail to determine
1065
+ end
1066
+ else
1067
+ nil
1068
+ end
1069
+ end,
1070
+
1071
+ litdb = RuleRegexp[ Bio::LITDB,
1072
+ /^CODE [0-9]+$/ ],
1073
+ brite = RuleRegexp[ Bio::KEGG::BRITE,
1074
+ /^Entry [A-Z0-9]+/ ],
1075
+ ko = RuleRegexp[ Bio::KEGG::KO,
1076
+ /^ENTRY .+ KO\s*/ ],
1077
+ glycan = RuleRegexp[ Bio::KEGG::GLYCAN,
1078
+ /^ENTRY .+ Glycan\s*/ ],
1079
+ enzyme = RuleRegexp2[ Bio::KEGG::ENZYME,
1080
+ /^ENTRY EC [0-9\.]+$/,
1081
+ /^ENTRY .+ Enzyme\s*/
1082
+ ],
1083
+ compound = RuleRegexp2[ Bio::KEGG::COMPOUND,
1084
+ /^ENTRY C[A-Za-z0-9\._]+$/,
1085
+ /^ENTRY .+ Compound\s*/
1086
+ ],
1087
+ reaction = RuleRegexp2[ Bio::KEGG::REACTION,
1088
+ /^ENTRY R[A-Za-z0-9\._]+$/,
1089
+ /^ENTRY .+ Reaction\s*/
1090
+ ],
1091
+ genes = RuleRegexp[ Bio::KEGG::GENES,
1092
+ /^ENTRY .+ (CDS|gene|.*RNA) / ],
1093
+ genome = RuleRegexp[ Bio::KEGG::GENOME,
1094
+ /^ENTRY [a-z]+$/ ],
1095
+
1096
+ fantom = RuleProc.new(Bio::FANTOM::MaXML::Cluster,
1097
+ Bio::FANTOM::MaXML::Sequence) do |text|
1098
+ if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
1099
+ case $1
1100
+ when 'clusters'
1101
+ Bio::FANTOM::MaXML::Cluster
1102
+ when 'sequences'
1103
+ Bio::FANTOM::MaXML::Sequence
1104
+ else
1105
+ nil #unknown
1106
+ end
1107
+ else
1108
+ nil
1109
+ end
1110
+ end,
1111
+
1112
+ pdb = RuleRegexp[ Bio::PDB,
1113
+ /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
1114
+ het = RuleRegexp[ Bio::PDB::ChemicalComponent,
1115
+ /^RESIDUE +.+ +\d+\s*$/ ],
1116
+
1117
+ clustal = RuleRegexp[ Bio::ClustalW::Report,
1118
+ /^CLUSTAL .*\(.*\).*sequence +alignment/ ],
1119
+
1120
+ blastxml = RuleRegexp[ Bio::Blast::Report,
1121
+ /\<\!DOCTYPE BlastOutput PUBLIC / ],
1122
+ wublast = RuleRegexp[ Bio::Blast::WU::Report,
1123
+ /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
1124
+ wutblast = RuleRegexp[ Bio::Blast::WU::Report_TBlast,
1125
+ /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
1126
+ blast = RuleRegexp[ Bio::Blast::Default::Report,
1127
+ /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
1128
+ tblast = RuleRegexp[ Bio::Blast::Default::Report_TBlast,
1129
+ /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
1130
+
1131
+ blat = RuleRegexp[ Bio::Blat::Report,
1132
+ /^psLayout version \d+\s*$/ ],
1133
+ spidey = RuleRegexp[ Bio::Spidey::Report,
1134
+ /^\-\-SPIDEY version .+\-\-$/ ],
1135
+ hmmer = RuleRegexp[ Bio::HMMER::Report,
1136
+ /^HMMER +\d+\./ ],
1137
+ sim4 = RuleRegexp[ Bio::Sim4::Report,
1138
+ /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
1139
+
1140
+ fastaformat = RuleProc.new(Bio::FastaFormat,
1141
+ Bio::NBRF,
1142
+ Bio::FastaNumericFormat) do |text|
1143
+ if /^>.+$/ =~ text
1144
+ case text
1145
+ when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
1146
+ Bio::NBRF
1147
+ when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
1148
+ Bio::FastaFormat
1149
+ when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
1150
+ Bio::FastaNumericFormat
1151
+ else
1152
+ false
1153
+ end
1154
+ else
1155
+ nil
1156
+ end
1157
+ end
1158
+ ]
1159
+
1160
+ # dependencies
1161
+ # NCBI
1162
+ genbank.is_prior_to genpept
1163
+ # EMBL/UniProt
1164
+ embl.is_prior_to sptr
1165
+ sptr.is_prior_to prosite
1166
+ prosite.is_prior_to transfac
1167
+ # KEGG
1168
+ #aaindex.is_prior_to litdb
1169
+ #litdb.is_prior_to brite
1170
+ brite.is_prior_to ko
1171
+ ko.is_prior_to glycan
1172
+ glycan.is_prior_to enzyme
1173
+ enzyme.is_prior_to compound
1174
+ compound.is_prior_to reaction
1175
+ reaction.is_prior_to genes
1176
+ genes.is_prior_to genome
1177
+ # PDB
1178
+ pdb.is_prior_to het
1179
+ # BLAST
1180
+ wublast.is_prior_to wutblast
1181
+ wutblast.is_prior_to blast
1182
+ blast.is_prior_to tblast
1183
+ # FastaFormat
1184
+ BottomRule.is_prior_to(fastaformat)
1185
+
1186
+ # for debug
1187
+ #debug_first = RuleDebug.new('debug_first')
1188
+ #a.add(debug_first)
1189
+ #debug_first.is_prior_to(TopRule)
1190
+
1191
+ ## for debug
1192
+ #debug_last = RuleDebug.new('debug_last')
1193
+ #a.add(debug_last)
1194
+ #BottomRule.is_prior_to(debug_last)
1195
+ #fastaformat.is_prior_to(debug_last)
1196
+
1197
+ a.rehash
1198
+ return a
1199
+ end
1200
+
1201
+ end #class AutoDetect
1202
+
485
1203
  end #class FlatFile
486
1204
 
487
1205
  end #module Bio
488
1206
 
489
-
490
1207
  if __FILE__ == $0
491
1208
  if ARGV.size == 2
492
1209
  require 'bio'
493
1210
  p Bio::FlatFile.open(eval(ARGV.shift), ARGV.shift).next_entry
494
1211
  end
495
1212
  end
496
-