bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,123 @@
1
+ #
2
+ # bio/io/fastacmd.rb - NCBI fastacmd wrapper class
3
+ #
4
+ # Copyright (C) 2005 Shuji SHIGENOBU <shige@nibb.ac.jp>
5
+ # Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: fastacmd.rb,v 1.8 2005/09/26 13:00:08 k Exp $
22
+ #
23
+
24
+ require 'bio/db/fasta'
25
+ require 'bio/io/flatfile'
26
+ require 'bio/command'
27
+
28
module Bio
  class Blast

    # Wrapper around the NCBI +fastacmd+ command line tool.
    #
    # Entries are fetched from the BLAST database given to the constructor
    # and are returned as Bio::FastaFormat objects.  Because +each+ is
    # provided, all Enumerable methods are available as well.
    class Fastacmd

      include Enumerable
      # presumably supplies call_command_local used below -- confirm
      # against bio/command.rb
      include Bio::Command::Tools

      # database:: database name passed to the -d option
      # fastacmd:: name (or path) of the executable; 'fastacmd' by default
      # errorlog:: not assigned anywhere in this class
      attr_accessor :database, :fastacmd, :errorlog

      # Creates a wrapper for the database +db+.
      def initialize(db)
        @database = db
        @fastacmd = 'fastacmd'
      end

      # Takes a single entry_id and returns the first Bio::FastaFormat
      # object found for it.
      def get_by_id(entry_id)
        fetch(entry_id).shift
      end

      # Takes one entry_id, or a list of them, and returns an Array of
      # Bio::FastaFormat objects.
      #
      # Anything responding to +join+ is treated as a list and turned into
      # the comma separated form passed to the -s option.
      def fetch(list)
        entry_id = list.respond_to?(:join) ? list.join(",") : list

        call_command_local([ @fastacmd, '-d', @database, '-s', entry_id ]) do |inn, out|
          # nothing is sent to the command, so close its input first
          inn.close_write
          Bio::FlatFile.new(Bio::FastaFormat, out).to_a
        end
      end

      # Runs fastacmd with "-D T" and yields every entry of its output
      # as a Bio::FastaFormat object.  Returns self.
      def each_entry
        dump_all = [ @fastacmd, '-d', @database, '-D', 'T' ]
        call_command_local(dump_all) do |inn, out|
          inn.close_write
          Bio::FlatFile.open(Bio::FastaFormat, out) do |flatfile|
            flatfile.each_entry { |entry| yield entry }
          end
        end
        self
      end
      alias each each_entry

    end

  end
end
80
+
81
+
82
if __FILE__ == $0

  # Usage: fastacmd.rb [database [entry_id]]
  # Both arguments fall back to sample values when omitted.
  db_name   = ARGV.shift || "/db/myblastdb"
  single_id = ARGV.shift || "sp:128U_DROME"
  id_list   = ["sp:1433_SPIOL", "sp:1432_MAIZE"]

  client = Bio::Blast::Fastacmd.new(db_name)

  ### Retrieve one sequence

  # Fastacmd#get_by_id(entry_id) returns a Bio::FastaFormat object.
  single = client.get_by_id(single_id)
  p single

  # Printed with puts, the object is shown as a fasta format string.
  puts single

  # Fastacmd#fetch(entry_id) always returns an Array, even when the
  # result is a single entry.
  p client.fetch(single_id)

  ### Retrieve more sequences

  # Fastacmd#fetch also accepts a list of entry_id and returns an Array
  # of Bio::FastaFormat objects ...
  p client.fetch(id_list)

  # ... so the results can be iterated.
  client.fetch(id_list).each { |fasta| puts fasta }

  ### Iterates on all entries

  # Every sequence in the database can be visited too.
  client.each do |fasta|
    p([ fasta.definition[0..30], fasta.seq.size ])
  end

end
123
+
@@ -0,0 +1,114 @@
1
+ #
2
+ # = bio/io/fetch.rb - BioFetch access module
3
+ #
4
+ # Copyright:: Copyright (C) 2002, 2005
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: fetch.rb,v 1.4 2005/12/18 15:58:42 k Exp $
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+
29
+ require 'uri'
30
+ require 'net/http'
31
+
32
module Bio

  # Client for a BioFetch server.
  #
  # Every request is sent as an HTTP POST to the URL given to the
  # constructor, and the body of the HTTP response is returned as is.
  class Fetch

    # Create a new Bio::Fetch server object.
    # Use Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') to connect
    # to EBI BioFetch server.
    def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
      # URI.split yields [scheme, userinfo, host, port, registry, path, ...];
      # only host, port and path are kept.
      _scheme, _user, @host, @port, _registry, @path, = URI.split(url)
    end

    # Set default database to dbname (prepare for get_by_id).
    attr_accessor :database

    # Get raw database entry by id (mainly used by Bio::Registry).
    # The database set through #database= is used.
    def get_by_id(id)
      fetch(@database, id)
    end

    # Fetch a database entry as specified by database (db), entry id (id),
    # 'raw' text or 'html' (style), and format.  When using BioRuby's
    # BioFetch server, value for the format should not be set.
    #
    # Returns the body of the server response.
    def fetch(db, id, style = 'raw', format = nil)
      data = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
      data.push("format=#{format}") if format
      post_query(data.join('&'))
    end

    # Short cut for using BioRuby's BioFetch server.  You can fetch an entry
    # without creating instance of BioFetch server.
    def self.query(*args)
      self.new.fetch(*args)
    end

    # What databases are available?
    def databases
      post_query("info=dbs")
    end

    # What formats does the database X have?
    # Returns nil without contacting the server when no database is given
    # and no default database has been set.
    def formats(database = @database)
      post_query("info=formats;db=#{database}") if database
    end

    # How many entries can be retrieved simultaneously?
    def maxids
      post_query("info=maxids")
    end

    private

    # POSTs +query+ to the server and returns the response body.
    #
    # Net::HTTP#post returns a single Net::HTTPResponse object, so the body
    # is read from it explicitly; unpacking the return value as a
    # [response, body] pair leaves the body nil.
    def post_query(query)
      response = Net::HTTP.new(@host, @port).post(@path, query)
      response.body
    end

  end

end # module Bio
95
+
96
+
97
+
98
if __FILE__ == $0

  # Tests 1 and 2 talk to the EBI dbfetch server.  An explicit port is
  # also accepted, e.g. 'http://www.ebi.ac.uk:80/cgi-bin/dbfetch'.
  ebi = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
  [ ['# test 1', 'raw'], ['# test 2', 'html'] ].each do |label, style|
    puts label
    puts ebi.fetch('embl', 'J00231', style)
  end

  # Tests 3 and 4 use the class level short cut, which connects to the
  # default server of Bio::Fetch.new.
  puts "# test 3"
  puts Bio::Fetch.query('genbank', 'J00231')
  puts "# test 4"
  puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')

end
113
+
114
+
@@ -0,0 +1,496 @@
1
+ #
2
+ # = bio/io/flatfile.rb - flatfile access wrapper class
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
+ # License:: LGPL
6
+ #
7
+ #--
8
+ # This library is free software; you can redistribute it and/or
9
+ # modify it under the terms of the GNU Lesser General Public
10
+ # License as published by the Free Software Foundation; either
11
+ # version 2 of the License, or (at your option) any later version.
12
+ #
13
+ # This library is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # Lesser General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU Lesser General Public
19
+ # License along with this library; if not, write to the Free Software
20
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
+ #++
22
+ #
23
+ # $Id: flatfile.rb,v 1.41 2005/11/01 15:34:45 ngoto Exp $
24
+ #
25
+ # Bio::FlatFile is a helper and wrapper class to read a biological data file.
26
+ # It acts like an IO object.
27
+ # It can automatically detect data format, and users do not need to tell
28
+ # the class what the data is.
29
+ #
30
+
31
module Bio

  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
  # It acts like an IO object.
  # It can automatically detect data format, and users do not need to tell
  # the class what the data is.
  class FlatFile

    include Enumerable

    # Creates a new Bio::FlatFile object to read a file or a stream
    # which contains +dbclass+ data.
    #
    # +dbclass+ should be a class (or module) or nil.
    # e.g. Bio::GenBank, Bio::FastaFormat.
    #
    # If +file+ is a filename (which doesn't have gets method),
    # the method opens a local file named +file+
    # with 'File.open(filename, mode, perm)'.
    #
    # When nil is given to dbclass, trying to determine database class
    # (file format) automatically. If fails to determine, dbclass is
    # set to nil and FlatFile#next_entry works same as IO#gets when
    # raw = true. It is recommended to set dbclass using
    # FlatFile#dbclass= method if fails to determine automatically.
    #
    # * Example 1
    #     Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
    # * Example 2
    #     Bio::FlatFile.open(nil, "embl/est_hum17.dat")
    # * Example 3
    #     Bio::FlatFile.open(Bio::GenBank, $stdin)
    #
    # If it is called with block, the block will be executed with
    # a newly opened Bio::FlatFile instance object. If filename
    # is given, the file is automatically closed when leaving the block.
    # The value of the block is returned.
    #
    # * Example 4
    #     Bio::FlatFile.open(nil, 'test4.fst') do |ff|
    #       ff.each { |e| print e.definition, "\n" }
    #     end
    #
    def self.open(dbclass, file, *arg)
      # 3rd and 4th arg: mode, perm (passed to File.open).
      # Everything up to the first Hash (or nil/false) is taken as such.
      openmode = []
      while x = arg[0] and !x.is_a?(Hash)
        openmode << arg.shift
      end
      # rest of arg: passed to FlatFile.new
      # create a flatfile object
      unless file.respond_to?(:gets)
        # 'file' is a filename
        if block_given? then
          File.open(file, *openmode) do |fobj|
            ff = self.new(dbclass, fobj, *arg)
            yield ff
          end
        else
          fobj = File.open(file, *openmode)
          self.new(dbclass, fobj, *arg)
        end
      else
        # 'file' is a IO object; it is not closed here
        ff = self.new(dbclass, file, *arg)
        block_given? ? (yield ff) : ff
      end
    end

    # Same as Bio::FlatFile.open(nil, filename_or_stream, mode, perm, options).
    #
    # * Example 1
    #     Bio::FlatFile.auto(ARGF)
    # * Example 2
    #     Bio::FlatFile.auto("embl/est_hum17.dat")
    # * Example 3
    #     Bio::FlatFile.auto(IO.popen("gzip -dc nc1101.flat.gz"))
    #
    def self.auto(*arg, &block)
      self.open(nil, *arg, &block)
    end

    # Same as FlatFile.auto(filename_or_stream, *arg).to_a
    # (It might be OBSOLETED in the future.)
    #
    # Raises RuntimeError when the file format cannot be determined.
    def self.to_a(*arg)
      self.auto(*arg) do |ff|
        raise 'cannot determine file format' unless ff.dbclass
        ff.to_a
      end
    end

    # Same as FlatFile.open, except that 'stream' should be a opened
    # stream object (IO, File, ..., who have the 'gets' method).
    #
    # * Example 1
    #     Bio::FlatFile.new(Bio::GenBank, ARGF)
    # * Example 2
    #     Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
    #
    # +options+ should be a hash (or nil). It will be OBSOLETED!!
    # Available options are below:
    # [<tt>:raw</tt>] if true, "raw mode" (same as #raw=true).
    #                 default: false (not "raw mode").
    #
    # * Example 3
    #     Bio::FlatFile.new(nil, $stdin, :raw=>true)
    # * Example 3 in old style (deprecated)
    #     Bio::FlatFile.new(nil, $stdin, true)
    #
    def initialize(dbclass, stream, options = nil)
      # 2nd arg: IO object
      @io = stream
      # 3rd arg: options (nil or a Hash); a non-Hash value is taken as
      # the raw flag itself (old style)
      self.raw = false
      if options.is_a?(Hash) then
        self.raw = options[:raw] if options.has_key?(:raw)
      else
        self.raw = options
      end
      # initialize prefetch buffer
      @prefetch = ''
      # 1st arg: database class (or file format autodetection)
      if dbclass then
        self.dbclass = dbclass
      else
        autodetect
      end
    end

    # IO object in the flatfile object.
    attr_reader :io

    # Get next entry.
    #
    # Returns nil at the end of input.  In raw mode the entry is returned
    # as a String, otherwise as an instance of the database class.
    def next_entry
      @entry_raw = gets(@rs)
      return nil unless @entry_raw
      return @entry_raw if raw

      e = @dbclass.new(@entry_raw)
      # entry_overrun is an optional interface of database classes: text
      # read beyond the end of the entry, which belongs to the next one.
      s = e.respond_to?(:entry_overrun) ? e.entry_overrun : nil
      if s then
        # cut the overrun from the tail of this entry and push it back
        @entry_raw[-(s.length), s.length] = ''
        ungets(s)
      end
      e
    end

    # Returns the last raw entry as a string.
    attr_reader :entry_raw

    # Iterates over each entry in the flatfile.
    #
    # * Example
    #     include Bio
    #     ff = FlatFile.open(GenBank, "genbank/gbhtg14.seq")
    #     ff.each_entry do |x|
    #       puts x.definition
    #     end
    def each_entry
      while e = self.next_entry
        yield e
      end
    end
    alias each each_entry

    # Resets file pointer to the start of the flatfile.
    # (similar to IO#rewind)
    def rewind
      r = @io.rewind
      @prefetch = ''
      r
    end

    # Closes input stream.
    # (similar to IO#close)
    def close
      @io.close
    end

    # Returns current position of input stream.
    # If the input stream is not a normal file,
    # the result is not guaranteed.
    # It is similar to IO#pos.
    # Note that it will not be equal to io.pos,
    # because FlatFile#autodetect may pre-read some lines.
    def pos
      # io.pos counts bytes, so the buffered data must be counted in
      # bytes too (String#size counts characters where bytesize exists).
      buffered = @prefetch.respond_to?(:bytesize) ? @prefetch.bytesize : @prefetch.size
      @io.pos - buffered
    end

    # (Not recommended to use it.)
    # Sets position of input stream.
    # If the input stream is not a normal file,
    # the result is not guaranteed.
    # It is similar to IO#pos=.
    # Note that it will not be equal to io.pos=,
    # because FlatFile#autodetect may pre-read some lines.
    def pos=(p)
      r = (@io.pos = p)
      @prefetch = ''
      r
    end

    # Returns true if input stream is end-of-file.
    # Otherwise, returns false.
    # (Similar to IO#eof?, but may not be equal to io.eof?,
    # because FlatFile#autodetect may pre-read some lines.)
    def eof?
      if @prefetch.size > 0
        false
      else
        @io.eof?
      end
    end

    # Similar to IO#gets.
    # Internal use only. Users should not call it directly.
    #
    # Data in the prefetch buffer is consumed before the stream is read.
    # +io_rs+ is the record separator: nil reads everything, and '' is
    # treated as "\n\n" while searching the buffer.
    def gets(io_rs = $/)
      return @io.gets(io_rs) unless @prefetch.size > 0

      if io_rs == nil then
        r = @prefetch + @io.gets(nil).to_s
        @prefetch = ''
        return r
      end

      # The separator is searched as a plain substring; no regular
      # expression (and thus no escaping) is needed for a literal match.
      separator = (io_rs == '' ? "\n\n" : io_rs)
      r = shift_prefetch(separator)
      unless r
        # no separator buffered yet: append the next record from the stream
        @prefetch << @io.gets(io_rs).to_s
        r = shift_prefetch(separator)
        unless r
          # end of input without separator: hand out what is left
          r = @prefetch
          @prefetch = ''
        end
      end
      r
    end

    # Unread read data.
    # Internal use only. Users must not call it.
    def ungets(str)
      @prefetch = str + @prefetch
      nil
    end

    # Similar to IO#getc.
    # Internal use only. Users should not call it directly.
    def getc
      if @prefetch.size > 0 then
        r = @prefetch[0]
        @prefetch = @prefetch[1..-1]
      else
        r = @io.getc
      end
      r
    end

    # Similar to IO#ungetc.
    # Internal use only. Users should not call it.
    def ungetc(c)
      @prefetch = sprintf("%c", c) + @prefetch
      nil
    end

    # If true is given, the next_entry method returns
    # an entry as a text, whereas if false, returns as a parsed object.
    def raw=(bool)
      @raw = (bool ? true : false)
    end

    # If true, raw mode.
    attr_reader :raw

    # Sets database class. Please use only if autodetect fails.
    #
    # The record separator is taken from the DELIMITER constant of the
    # class; when nil is given, it falls back to $/.
    def dbclass=(k)
      if k then
        @dbclass = k
        @rs = @dbclass::DELIMITER
      else
        @dbclass = nil
        @rs = $/
      end
    end

    # Returns database class which is automatically detected or
    # given in FlatFile#initialize.
    attr_reader :dbclass

    # Performs determination of database class (file format).
    # Pre-reads +lines+ lines for format determination (default 31 lines).
    # If fails, returns nil or false. Otherwise, returns database class.
    #
    # The method can be called anytime if you want (but not recommended).
    # This might be useful if input file is a mixture of multiple format data.
    def autodetect(lines = 31)
      r = nil
      1.upto(lines) do |x|
        line = @io.gets
        next unless line
        # lines read here stay in the prefetch buffer for later reads
        @prefetch << line
        # blank lines cannot change the result, so skip the detection
        next unless line.strip.size > 0
        r = self.class.autodetect(@prefetch)
        if r then
          self.dbclass = r
          return r
        end
      end
      self.dbclass = nil unless dbclass
      r
    end

    # Detects database class (== file format) of given file.
    # If fails to determine, returns nil.
    def self.autodetect_file(filename)
      ff = self.open(nil, filename)
      begin
        ff.dbclass
      ensure
        # close the stream even if reading the result raises
        ff.close
      end
    end

    # Detects database class (== file format) of given input stream.
    # If fails to determine, returns nil.
    # Caution: the method reads some data from the input stream,
    # and the data will be lost.
    def self.autodetect_stream(io)
      self.new(nil, io).dbclass
    end

    # Detects database class (== file format) of given string.
    # If fails to determine, returns false or nil.
    #
    # NOTE(review): the patterns below are kept exactly as found; runs of
    # spaces inside them may need checking against real data files.
    def self.autodetect(text)
      # loaded here, not at the top of the file -- presumably to avoid a
      # circular require with 'bio'; confirm before moving it
      require 'bio'
      case text
      when /^LOCUS .+ bp .*[a-z]*[DR]?NA/
        Bio::GenBank
      when /^LOCUS .+ aa .+/
        Bio::GenPept
      when /^UI \- [0-9]+$/
        Bio::MEDLINE

      when /^ID .+\; .*(DNA|RNA|XXX)\;/
        Bio::EMBL
      when /^ID .+\; *PRT\;/
        Bio::SPTR
      when /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/
        Bio::PROSITE
      when /^AC [-A-Za-z0-9_\.]+$/
        Bio::TRANSFAC

      when /^H [-A-Z0-9_\.]+$/
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end

      when /^CODE [0-9]+$/
        Bio::LITDB
      when /^Entry [A-Z0-9]+/
        Bio::KEGG::BRITE

      when /^ENTRY .+ KO\s*$/
        Bio::KEGG::KO
      when /^ENTRY .+ Glycan\s*$/
        Bio::KEGG::GLYCAN
      when /^ENTRY .+ (CDS|gene|.*RNA) /
        Bio::KEGG::GENES
      when /^ENTRY EC [0-9\.]+$/
        Bio::KEGG::ENZYME
      when /^ENTRY C[A-Za-z0-9\._]+$/
        Bio::KEGG::COMPOUND
      when /^ENTRY R[A-Za-z0-9\._]+$/
        Bio::KEGG::REACTION
      when /^ENTRY [a-z]+$/
        Bio::KEGG::GENOME

      when /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/
        # $1 is the kind of MaXML document captured by the pattern above
        if $1 == 'clusters'
          Bio::FANTOM::MaXML::Cluster
        elsif $1 == 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end

      when /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/
        Bio::PDB

      when /^CLUSTAL .*\(.*\).*sequence +alignment/
        Bio::ClustalW::Report

      when /\<\!DOCTYPE BlastOutput PUBLIC /
        Bio::Blast::Report

      # the WashU patterns must come before the generic BLAST patterns
      when /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
        Bio::Blast::WU::Report
      when /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/
        Bio::Blast::WU::Report_TBlast

      when /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
        Bio::Blast::Default::Report
      when /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/
        Bio::Blast::Default::Report_TBlast

      when /^psLayout version \d+\s*$/
        Bio::Blat::Report
      when /^\-\-SPIDEY version .+\-\-$/
        Bio::Spidey::Report

      when /^HMMER +\d+\./
        Bio::HMMER::Report

      when /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/
        Bio::Sim4::Report

      when /^>.+$/
        if text =~ /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ then
          Bio::NBRF
        elsif text =~ /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ then
          Bio::FastaFormat
        elsif text =~ /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ then
          Bio::FastaNumericFormat
        else
          false #fail to determine
        end

      else
        nil #not found
      end
    end

    # Removes the text up to and including the first +separator+ from the
    # prefetch buffer and returns it.  Returns nil, leaving the buffer
    # untouched, when the buffer does not contain +separator+.
    def shift_prefetch(separator)
      found = @prefetch.index(separator)
      return nil unless found
      cut = found + separator.length
      head = @prefetch[0, cut]
      @prefetch = @prefetch[cut..-1].to_s
      head
    end
    private :shift_prefetch

  end #class FlatFile

end #module Bio
488
+
489
+
490
# Usage: flatfile.rb DBCLASS FILENAME
# Prints the first entry of FILENAME; does nothing unless exactly two
# arguments are given.
if __FILE__ == $0 and ARGV.size == 2
  require 'bio'
  # NOTE(review): the first argument is evaluated as Ruby code to obtain
  # the database class (e.g. "Bio::GenBank" or "nil").  Only pass trusted
  # values on the command line.
  p Bio::FlatFile.open(eval(ARGV.shift), ARGV.shift).next_entry
end
496
+