bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,123 @@
1
+ #
2
+ # bio/io/fastacmd.rb - NCBI fastacmd wrapper class
3
+ #
4
+ # Copyright (C) 2005 Shuji SHIGENOBU <shige@nibb.ac.jp>
5
+ # Copyright (C) 2005 Toshiaki Katayama <k@bioruby.org>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: fastacmd.rb,v 1.8 2005/09/26 13:00:08 k Exp $
22
+ #
23
+
24
+ require 'bio/db/fasta'
25
+ require 'bio/io/flatfile'
26
+ require 'bio/command'
27
+
28
module Bio
class Blast

# Wrapper around the NCBI +fastacmd+ command line program.
#
# An instance is bound to one BLAST database (+database+) and retrieves
# sequences from it as Bio::FastaFormat objects.  The executable name can
# be changed through the +fastacmd+ accessor (default: 'fastacmd').
class Fastacmd

  include Enumerable
  include Bio::Command::Tools

  # database:: name/path of the database handed to fastacmd with '-d'
  # fastacmd:: name of the executable to run
  # errorlog:: plain accessor; it is never assigned inside this class
  attr_accessor :database, :fastacmd, :errorlog

  # Remembers the database to query; the program name defaults to
  # 'fastacmd'.
  def initialize(db)
    @fastacmd = 'fastacmd'
    @database = db
  end

  # Fetches a single entry_id and returns the first element of the
  # Array produced by #fetch (a Bio::FastaFormat object).
  def get_by_id(entry_id)
    fetch(entry_id).shift
  end

  # Fetches one or more entries and returns an Array of Bio::FastaFormat
  # objects.  +list+ may be a single id or anything responding to +join+
  # (e.g. an Array of ids), which is joined with commas for the '-s'
  # option.
  def fetch(list)
    entry_id = list.respond_to?(:join) ? list.join(",") : list

    # call_command_local comes from Bio::Command::Tools (not shown here);
    # presumably it runs the command and yields its input/output pipes.
    call_command_local([ @fastacmd, '-d', @database, '-s', entry_id ]) do |inn, out|
      inn.close_write
      Bio::FlatFile.new(Bio::FastaFormat, out).to_a
    end
  end

  # Runs fastacmd with '-D T' and yields every entry read from its output
  # as a Bio::FastaFormat object.  Returns self.
  def each_entry
    call_command_local([ @fastacmd, '-d', @database, '-D', 'T' ]) do |inn, out|
      inn.close_write
      Bio::FlatFile.open(Bio::FastaFormat, out) do |flatfile|
        flatfile.each_entry { |entry| yield entry }
      end
    end
    self
  end
  alias each each_entry

end

end
end
80
+
81
+
82
if __FILE__ == $0

  # Usage: ruby fastacmd.rb [database [entry_id]]
  database = ARGV.shift || "/db/myblastdb"
  entry_id = ARGV.shift || "sp:128U_DROME"
  ent_list = ["sp:1433_SPIOL", "sp:1432_MAIZE"]

  fastacmd = Bio::Blast::Fastacmd.new(database)

  ### Retrieve one sequence

  # Fastacmd#get_by_id returns a single Bio::FastaFormat object ...
  entry = fastacmd.get_by_id(entry_id)
  p entry

  # ... which prints as a fasta format string when given to puts.
  puts entry

  # Fastacmd#fetch always returns an Array, even for a single entry_id.
  p fastacmd.fetch(entry_id)

  ### Retrieve more sequences

  # Fastacmd#fetch also takes a list of entry ids.
  p fastacmd.fetch(ent_list)

  # The returned Array can be iterated as usual.
  fastacmd.fetch(ent_list).each { |fasta| puts fasta }

  ### Iterates on all entries

  # Fastacmd#each walks over every sequence in the database.
  fastacmd.each { |fasta| p [ fasta.definition[0..30], fasta.seq.size ] }

end
123
+
@@ -0,0 +1,114 @@
1
+ #
2
+ # = bio/io/biofetch.rb - BioFetch access module
3
+ #
4
+ # Copyright:: Copyright (C) 2002, 2005
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: fetch.rb,v 1.4 2005/12/18 15:58:42 k Exp $
9
+ #
10
+ #--
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
+ #
26
+ #++
27
+ #
28
+
29
+ require 'uri'
30
+ require 'net/http'
31
+
32
module Bio

# Client for a BioFetch server: every request is an HTTP POST of a small
# query string to the server URL, and the response body is returned as a
# String.
class Fetch

  # Create a new Bio::Fetch server object.
  # Use Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') to connect
  # to EBI BioFetch server.
  #
  # Only the host, port and path components of +url+ are used.
  def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
    _scheme, _userinfo, @host, port, _registry, @path, = URI.split(url)
    # URI.split yields the port as a String (or nil when absent);
    # Net::HTTP falls back to its default port when given nil.
    @port = port && port.to_i
  end

  # Set default database to dbname (prepare for get_by_id).
  attr_accessor :database

  # Get raw database entry by id (mainly used by Bio::Registry).
  # Uses the default +database+.
  def get_by_id(id)
    fetch(@database, id)
  end

  # Fetch a database entry as specified by database (db), entry id (id),
  # 'raw' text or 'html' (style), and format.  When using BioRuby's
  # BioFetch server, value for the format should not be set.
  #
  # Returns the response body.  Values are placed in the query string as
  # given (no URL escaping is applied).
  def fetch(db, id, style = 'raw', format = nil)
    data = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
    data.push("format=#{format}") if format
    post_query(data.join('&'))
  end

  # Short cut for using BioRuby's BioFetch server.  You can fetch an entry
  # without creating instance of BioFetch server.
  def self.query(*args)
    self.new.fetch(*args)
  end

  # What databases are available?
  def databases
    post_query("info=dbs")
  end

  # What formats does the database X have?
  # Returns nil without contacting the server when no database is given
  # and no default database is set.
  def formats(database = @database)
    post_query("info=formats;db=#{database}") if database
  end

  # How many entries can be retrieved simultaneously?
  def maxids
    post_query("info=maxids")
  end

  private

  # POSTs +query+ to the server and returns the response body.
  #
  # Net::HTTP#post returns a single response object; the old
  # "response, body = http.post(...)" idiom leaves the body nil on
  # current Ruby, so the body is read from the response explicitly.
  def post_query(query)
    Net::HTTP.new(@host, @port).post(@path, query).body
  end

end

end # module Bio
95
+
96
+
97
+
98
if __FILE__ == $0

  # Manual smoke test against public BioFetch servers (needs network).
  # bfserv = Bio::Fetch.new('http://www.ebi.ac.uk:80/cgi-bin/dbfetch')
  bfserv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')

  # tests 1 and 2: same entry from the EBI server, as raw text then html
  %w[raw html].each_with_index do |style, index|
    puts "# test #{index + 1}"
    puts bfserv.fetch('embl', 'J00231', style)
  end

  # tests 3 and 4: the class-level shortcut, which uses the default server
  puts "# test 3"
  puts Bio::Fetch.query('genbank', 'J00231')
  puts "# test 4"
  puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')

end
113
+
114
+
@@ -0,0 +1,496 @@
1
+ #
2
+ # = bio/io/flatfile.rb - flatfile access wrapper class
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2002 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
+ # License:: LGPL
6
+ #
7
+ #--
8
+ # This library is free software; you can redistribute it and/or
9
+ # modify it under the terms of the GNU Lesser General Public
10
+ # License as published by the Free Software Foundation; either
11
+ # version 2 of the License, or (at your option) any later version.
12
+ #
13
+ # This library is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # Lesser General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU Lesser General Public
19
+ # License along with this library; if not, write to the Free Software
20
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
+ #++
22
+ #
23
+ # $Id: flatfile.rb,v 1.41 2005/11/01 15:34:45 ngoto Exp $
24
+ #
25
+ # Bio::FlatFile is a helper and wrapper class to read a biological data file.
26
+ # It acts like a IO object.
27
+ # It can automatically detect data format, and users do not need to tell
28
+ # the class what the data is.
29
+ #
30
+
31
+ module Bio
32
+
33
+ # Bio::FlatFile is a helper and wrapper class to read a biological data file.
34
+ # It acts like a IO object.
35
+ # It can automatically detect data format, and users do not need to tell
36
+ # the class what the data is.
37
+ class FlatFile
38
+
39
+ include Enumerable
40
+
41
# Creates a new Bio::FlatFile object to read a file or a stream
# which contains +dbclass+ data.
#
# +dbclass+ should be a class (or module) or nil.
# e.g. Bio::GenBank, Bio::FastaFormat.
#
# If +file+ is a filename (i.e. it does not respond to +gets+),
# a local file named +file+ is opened with
# 'File.open(filename, mode, perm)'.  Leading extra arguments that are
# not a Hash are taken as mode/perm; the remaining arguments are passed
# to FlatFile.new.
#
# When nil is given as dbclass, the file format is detected
# automatically by FlatFile.new.
#
# * Example 1
#     Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
# * Example 2
#     Bio::FlatFile.open(nil, "embl/est_hum17.dat")
# * Example 3
#     Bio::FlatFile.open(Bio::GenBank, $stdin)
#
# If it is called with a block, the block is executed with the newly
# created Bio::FlatFile object and the block's value is returned.  If a
# filename was given, the file is closed when leaving the block.
#
# * Example 4
#     Bio::FlatFile.open(nil, 'test4.fst') do |ff|
#       ff.each { |e| print e.definition, "\n" }
#     end
#
# Without a block the FlatFile object is returned and the caller is
# responsible for closing it.  If constructing the FlatFile object fails
# in that case, the file opened here is closed before the error is
# re-raised.
def self.open(dbclass, file, *arg)
  # 3rd and 4th arg: mode, perm (passed to File.open)
  openmode = []
  openmode << arg.shift while arg[0] && !arg[0].is_a?(Hash)
  # rest of arg: passed to FlatFile.new

  if file.respond_to?(:gets)
    # 'file' is an IO-like object; it is never closed here
    ff = self.new(dbclass, file, *arg)
    return block_given? ? yield(ff) : ff
  end

  # 'file' is a filename
  if block_given?
    File.open(file, *openmode) do |fobj|
      yield self.new(dbclass, fobj, *arg)
    end
  else
    fobj = File.open(file, *openmode)
    constructed = false
    begin
      ff = self.new(dbclass, fobj, *arg)
      constructed = true
      ff
    ensure
      # do not leak the file handle when the constructor raises
      fobj.close unless constructed
    end
  end
end
98
+
99
# Shortcut for Bio::FlatFile.open(nil, filename_or_stream, mode, perm,
# options): opens the input with automatic file format detection.
# A given block is forwarded to FlatFile.open.
#
# * Example 1
#     Bio::FlatFile.auto(ARGF)
# * Example 2
#     Bio::FlatFile.auto("embl/est_hum17.dat")
# * Example 3
#     Bio::FlatFile.auto(IO.popen("gzip -dc nc1101.flat.gz"))
#
def self.auto(*arg, &block)
  no_dbclass = nil
  self.open(no_dbclass, *arg, &block)
end
111
+
112
# Reads the whole input with automatic format detection and returns all
# entries as an Array (FlatFile.auto(filename_or_stream, *arg) followed
# by to_a).  Raises RuntimeError when the file format cannot be
# determined.
# (It might be OBSOLETED in the future.)
def self.to_a(*arg)
  self.auto(*arg) do |flatfile|
    if flatfile.dbclass
      flatfile.to_a
    else
      raise 'cannot determine file format'
    end
  end
end
120
+
121
# Same as FlatFile.open, except that +stream+ must be an already opened
# stream object (IO, File, ..., anything having the 'gets' method).
#
# * Example 1
#     Bio::FlatFile.new(Bio::GenBank, ARGF)
# * Example 2
#     Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
#
# +options+ should be a hash (or nil).  It will be OBSOLETED!!
# Available options are below:
# [<tt>:raw</tt>] if true, "raw mode" (same as #raw=true).
#                 default: false (not "raw mode").
#
# * Example 3
#     Bio::FlatFile.new(nil, $stdin, :raw=>true)
# * Example 3 in old style (deprecated)
#     Bio::FlatFile.new(nil, $stdin, true)
#
# When +dbclass+ is nil (or false) the format is detected from the
# stream with #autodetect.
def initialize(dbclass, stream, options = nil)
  # 2nd arg: IO object
  @io = stream

  # 3rd arg: options (nil or a Hash); a non-Hash value is the old-style
  # raw flag itself
  self.raw = false
  case options
  when Hash
    self.raw = options[:raw] if options.has_key?(:raw)
  else
    self.raw = options
  end

  # buffer for data read ahead of the caller; must exist before
  # autodetect runs because autodetect appends to it
  @prefetch = ''

  # 1st arg: database class (or file format autodetection)
  if dbclass
    self.dbclass = dbclass
  else
    autodetect
  end
end
158
+
159
+ # IO object in the flatfile object.
160
+ attr_reader :io
161
+
162
# Get next entry.
#
# Reads one record (up to the current record separator) and returns
# * nil at end of input,
# * the raw text when raw mode is on,
# * otherwise a new +dbclass+ object built from the text.
#
# The text read is also kept in +entry_raw+.  If the parsed object
# responds to +entry_overrun+ and reports a non-nil string, that string
# is cut off the end of +entry_raw+ and pushed back to be read again as
# part of the following entry.
def next_entry
  @entry_raw = gets(@rs)
  return nil unless @entry_raw
  return @entry_raw if raw

  entry = @dbclass.new(@entry_raw)
  # entry_overrun is optional for database classes.  Ask the object
  # instead of rescuing NameError, which would also hide genuine
  # NameErrors raised inside a parser's entry_overrun.
  overrun = entry.respond_to?(:entry_overrun) ? entry.entry_overrun : nil
  if overrun
    @entry_raw[-(overrun.length), overrun.length] = ''
    ungets(overrun)
  end
  entry
end
182
+
183
+ # Returns the last raw entry as a string.
184
+ attr_reader :entry_raw
185
+
186
# Yields every remaining entry of the flatfile, one at a time, until
# #next_entry returns nil (or false).  Returns nil.
#
# * Example
#     include Bio
#     ff = FlatFile.open(GenBank, "genbank/gbhtg14.seq")
#     ff.each_entry do |x|
#       puts x.definition
#     end
def each_entry
  entry = self.next_entry
  while entry
    yield entry
    entry = self.next_entry
  end
end
alias each each_entry
200
+
201
# Moves the underlying stream back to its start (similar to IO#rewind)
# and discards any data held in the prefetch buffer.
# Returns whatever the stream's own rewind returns.
def rewind
  @io.rewind.tap { @prefetch = '' }
end
208
+
209
# Closes the underlying input stream by delegating to its close method
# (similar to IO#close).
def close
  stream = @io
  stream.close
end
214
+
215
# Returns current position of input stream.
# If the input stream is not a normal file,
# the result is not guaranteed.
# It is similar to IO#pos.
# Note that it will not be equal to io.pos,
# because FlatFile#autodetect may pre-read some lines.
#
# The stream position is a byte offset, so the amount of pre-read data
# is subtracted in bytes (not characters); otherwise the result is off
# whenever the buffered text contains multibyte characters.
def pos
  @io.pos - @prefetch.bytesize
end
224
+
225
# (Not recommended to use it.)
# Sets the position of the underlying input stream to +p+ (similar to
# IO#pos=) and throws away any pre-read data, since it no longer matches
# the new position.
# If the input stream is not a normal file,
# the result is not guaranteed.
def pos=(p)
  (@io.pos = p).tap { @prefetch = '' }
end
237
+
238
# Returns true if input stream is end-of-file, false otherwise.
# (Similar to IO#eof?, but may not be equal to io.eof?: data that was
# pre-read into the internal buffer still counts as unread input.)
def eof?
  return false unless @prefetch.empty?
  @io.eof?
end
249
+
250
# Similar to IO#gets.
# Internal use only. Users should not call it directly.
#
# Reads up to and including the record separator +io_rs+, consuming
# pre-read data from the internal buffer before the stream itself:
# * nil separator: returns the buffer plus the rest of the stream;
# * '' separator: the buffered part is cut at the first "\n\n";
# * otherwise the buffered part is cut at the first occurrence of
#   +io_rs+.
# If the buffer holds no separator, one more chunk is read from the
# stream with the same separator and appended before cutting; when there
# is still no separator, everything buffered is returned.
#
# The separator is matched as a literal string.  (The former regexp
# based search used Regexp.escape / Regexp.new call forms that raise
# ArgumentError on current Ruby.)
def gets(io_rs = $/)
  # nothing buffered: plain delegation
  return @io.gets(io_rs) if @prefetch.empty?

  if io_rs.nil?
    line = @prefetch + @io.gets(nil).to_s
    @prefetch = ''
    return line
  end

  delimiter = (io_rs == '') ? "\n\n" : io_rs
  head, found, rest = @prefetch.partition(delimiter)
  if found.empty?
    # separator not buffered yet: pull the next chunk and retry once
    @prefetch << @io.gets(io_rs).to_s
    head, found, rest = @prefetch.partition(delimiter)
  end
  # without a separator partition returns [whole, '', ''], so the whole
  # buffer is handed out and the buffer becomes empty
  @prefetch = rest
  head + found
end
286
+
287
# Pushes +str+ back in front of the pre-read buffer so that it is
# returned again by the next read.  Always returns nil.
# Internal use only. Users must not call it.
def ungets(str)
  restored = str + @prefetch
  @prefetch = restored
  nil
end
293
+
294
# Similar to IO#getc: returns the next character, taking it from the
# pre-read buffer when that is not empty and from the stream otherwise.
# Internal use only. Users should not call it directly.
def getc
  return @io.getc if @prefetch.empty?

  first = @prefetch[0]
  @prefetch = @prefetch[1..-1]
  first
end
305
+
306
# Similar to IO#ungetc: puts the character +c+ (formatted with "%c")
# back in front of the pre-read buffer.  Always returns nil.
# Internal use only. Users should not call it.
def ungetc(c)
  char = format("%c", c)
  @prefetch = char + @prefetch
  nil
end
312
+
313
# Switches raw mode on or off.  With a true value the next_entry method
# returns each entry as text; with false or nil it returns a parsed
# object.  The flag is stored as a strict true/false.
def raw=(bool)
  @raw = !!bool
end
318
+
319
+ # If true, raw mode.
320
+ attr_reader :raw
321
+
322
# Sets database class.  Please use only if autodetect fails.
#
# The record separator used for reading entries follows the class: its
# DELIMITER constant when a class is given, or $/ when +k+ is nil or
# false (in which case the database class is reset to nil).
def dbclass=(k)
  @dbclass = k ? k : nil
  @rs = @dbclass ? @dbclass::DELIMITER : $/
end
332
+
333
+ # Returns database class which is automatically detected or
334
+ # given in FlatFile#initialize.
335
+ attr_reader :dbclass
336
+
337
# Performs determination of database class (file format).
# Pre-reads up to +lines+ lines for format determination (default 31
# lines); the lines read are kept in the pre-read buffer.  After each
# non-blank line the buffered text is checked with FlatFile.autodetect.
# On success the detected class becomes the database class and is
# returned.  If it fails, returns nil or false, and the database class
# is reset to nil unless one is already set.
#
# The method can be called anytime if you want (but not recommended).
# This might be useful if input file is a mixture of multiple format data.
def autodetect(lines = 31)
  detected = nil
  1.upto(lines) do
    line = @io.gets
    next unless line            # end of input: nothing more to add
    @prefetch << line
    next if line.strip.empty?   # blank lines cannot change the verdict
    detected = self.class.autodetect(@prefetch)
    next unless detected
    self.dbclass = detected
    return detected
  end
  self.dbclass = nil unless dbclass
  detected
end
360
+
361
# Detects database class (== file format) of given file.
# Opens the file with automatic detection, closes it again, and returns
# the detected class.  If it fails to determine, returns nil.
def self.autodetect_file(filename)
  flatfile = self.open(nil, filename)
  begin
    flatfile.dbclass
  ensure
    flatfile.close
  end
end
369
+
370
# Detects database class (== file format) of given input stream.
# If it fails to determine, returns nil.
# Caution: the method reads some data from the input stream,
# and the data will be lost (the temporary FlatFile object holding the
# pre-read data is discarded).
def self.autodetect_stream(io)
  self.new(nil, io).dbclass
end
379
+
380
# Detects database class (== file format) of given string.
#
# +text+ is tested against the patterns below in order and the class for
# the first matching pattern is returned.  Returns nil when no pattern
# matches, and false when a family of formats is recognized but the
# exact member cannot be told apart (AAindex and FASTA-like data).
#
# NOTE(review): runs of spaces inside these patterns may have been
# collapsed in this copy of the file -- verify against the upstream
# source before relying on them.
def self.autodetect(text)
  # loaded lazily so that the database classes named below are available
  require 'bio'
  case text
  when /^LOCUS .+ bp .*[a-z]*[DR]?NA/ then Bio::GenBank
  when /^LOCUS .+ aa .+/ then Bio::GenPept
  when /^UI \- [0-9]+$/ then Bio::MEDLINE

  when /^ID .+\; .*(DNA|RNA|XXX)\;/ then Bio::EMBL
  when /^ID .+\; *PRT\;/ then Bio::SPTR
  when /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ then Bio::PROSITE
  when /^AC [-A-Za-z0-9_\.]+$/ then Bio::TRANSFAC

  when /^H [-A-Z0-9_\.]+$/
    # AAindex header line; a later line tells which flavour it is
    case text
    when /^M [rc]/ then Bio::AAindex2
    when /^I A\/L/ then Bio::AAindex1
    else false #fail to determine
    end

  when /^CODE [0-9]+$/ then Bio::LITDB
  when /^Entry [A-Z0-9]+/ then Bio::KEGG::BRITE

  when /^ENTRY .+ KO\s*$/ then Bio::KEGG::KO
  when /^ENTRY .+ Glycan\s*$/ then Bio::KEGG::GLYCAN
  when /^ENTRY .+ (CDS|gene|.*RNA) / then Bio::KEGG::GENES
  when /^ENTRY EC [0-9\.]+$/ then Bio::KEGG::ENZYME
  when /^ENTRY C[A-Za-z0-9\._]+$/ then Bio::KEGG::COMPOUND
  when /^ENTRY R[A-Za-z0-9\._]+$/ then Bio::KEGG::REACTION
  when /^ENTRY [a-z]+$/ then Bio::KEGG::GENOME

  when /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/
    # the captured doctype suffix selects the MaXML class
    case Regexp.last_match(1)
    when 'clusters' then Bio::FANTOM::MaXML::Cluster
    when 'sequences' then Bio::FANTOM::MaXML::Sequence
    else nil #unknown
    end

  when /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ then Bio::PDB

  when /^CLUSTAL .*\(.*\).*sequence +alignment/ then Bio::ClustalW::Report

  when /\<\!DOCTYPE BlastOutput PUBLIC / then Bio::Blast::Report

  # the WashU patterns are tried before the plain BLAST ones below
  when /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ then Bio::Blast::WU::Report
  when /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ then Bio::Blast::WU::Report_TBlast

  when /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ then Bio::Blast::Default::Report
  when /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ then Bio::Blast::Default::Report_TBlast

  when /^psLayout version \d+\s*$/ then Bio::Blat::Report
  when /^\-\-SPIDEY version .+\-\-$/ then Bio::Spidey::Report

  when /^HMMER +\d+\./ then Bio::HMMER::Report

  when /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ then Bio::Sim4::Report

  when /^>.+$/
    # '>' header line: NBRF, FASTA or numeric FASTA, checked in that order
    case text
    when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ then Bio::NBRF
    when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ then Bio::FastaFormat
    when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ then Bio::FastaNumericFormat
    else false #fail to determine
    end

  else
    nil #not found
  end
end
484
+
485
+ end #class FlatFile
486
+
487
+ end #module Bio
488
+
489
+
490
# Self-test when run as a script:
#   ruby flatfile.rb <dbclass expression> <filename>
# prints the first entry of the file.  Does nothing unless exactly two
# arguments are given.
if __FILE__ == $0 && ARGV.size == 2
  require 'bio'
  # NOTE(review): the first argument is passed to eval, i.e. arbitrary
  # Ruby from the command line is executed (e.g. "Bio::GenBank" or "nil").
  # Acceptable for a developer self-test only; do not feed it untrusted
  # input.
  dbclass = eval(ARGV.shift)
  filename = ARGV.shift
  p Bio::FlatFile.open(dbclass, filename).next_entry
end
496
+