bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,545 @@
1
+ #
2
+ # = bio/io/flatfile/autodetection.rb - file format auto-detection
3
+ #
4
+ # Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
5
+ #
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ #
11
+ # See documents for Bio::FlatFile::AutoDetect and Bio::FlatFile.
12
+ #
13
+
14
+ require 'tsort'
15
+ require 'bio/io/flatfile'
16
+
17
+ module Bio
18
+
19
+ class FlatFile
20
+
21
+ # AutoDetect automatically determines the database class of the given data.
22
+ class AutoDetect
23
+
24
+ include TSort
25
+
26
# Array subclass that holds autodetection rules.
# Its only purpose is to give a compact #inspect that lists rule names
# instead of dumping every rule object.
class RulesArray < Array
  # Returns a bracketed, space-separated list of the inspected names
  # of the contained rules.
  def inspect
    names = map { |rule| rule.name.inspect }
    "[#{names.join(' ')}]"
  end
end # class RulesArray
34
+
35
# Template (base class) of a single autodetection rule.
#
# A rule carries a unique name, the lists of rules that have higher or
# lower priority than itself, and a #guess method that subclasses
# override to recognize a data format.
class RuleTemplate
  # Shortcut constructor: RuleTemplate[*arg] is RuleTemplate.new(*arg).
  def self.[](*arg)
    new(*arg)
  end

  # Creates a new rule with empty priority lists and no name.
  def initialize
    @higher_priority_elements = RulesArray.new
    @lower_priority_elements = RulesArray.new
    @name = nil
  end

  # Declares that self is prior to _elem_: self is appended to
  # _elem_'s higher priority list and _elem_ to self's lower priority
  # list.
  # Returns true, or nil (and records nothing) when _elem_ == self.
  def is_prior_to(elem)
    return nil if self == elem
    elem.higher_priority_elements << self
    lower_priority_elements << elem
    true
  end

  # rules that have higher priority than this one
  attr_reader :higher_priority_elements
  # rules that have lower priority than this one
  attr_reader :lower_priority_elements

  # database classes
  attr_reader :dbclasses

  # unique name of the rule
  attr_accessor :name

  # If given text (and/or meta information) is known, returns
  # the database class.
  # Otherwise, returns nil or false.
  # This template implementation always returns nil.
  #
  # _text_ will be a String.
  # _meta_ will be a Hash.
  # _meta_ may contain following keys.
  #   :path => pathname, filename or uri.
  def guess(text, meta)
    nil
  end

  private

  # Gets constant from constant name given as a string,
  # e.g. "Bio::GenBank".
  # A single leading "::" (fully-qualified name) is accepted and
  # ignored; without this, the empty first segment would be handed to
  # const_get and raise NameError.
  # Raises NameError when any segment cannot be resolved.
  def str2const(str)
    path = str.sub(/\A::/, '')
    path.split('::').inject(Object) do |scope, segment|
      scope.const_get(segment)
    end
  end

  # Gets database class from given object.
  # If _obj_ is a kind of String, it is regarded as a constant name
  # and resolved; otherwise _obj_ is returned as is.
  def get_dbclass(obj)
    obj.kind_of?(String) ? str2const(obj) : obj
  end
end # class RuleTemplate
98
+
99
# Rule for debugging autodetect classes/methods.
# It never recognizes anything; it only reports what it was asked.
class RuleDebug < RuleTemplate
  # Creates a new debugging rule called _name_.
  def initialize(name)
    super()
    @name = name
  end

  # Prints the rule name and the inspected _text_ and _meta_ to $stderr,
  # one item per puts call. Always returns nil.
  def guess(text, meta)
    [@name, text.inspect, meta.inspect].each do |item|
      $stderr.puts item
    end
    nil
  end
end # class RuleDebug
115
+
116
# Special rule that is always top or bottom priority.
# Its name is fixed and its own priority lists are always empty.
class RuleSpecial < RuleTemplate
  # Creates the special rule called _name_.
  # Note: RuleTemplate#initialize is not called here; only @name is set.
  def initialize(name)
    @name = name
  end

  # Modification of the name is inhibited: always raises RuntimeError.
  def name=(_value)
    raise 'cannot modify name'
  end

  # Always returns a new empty array, so anything appended to it is lost.
  def higher_priority_elements
    []
  end

  # Always returns a new empty array, so anything appended to it is lost.
  def lower_priority_elements
    []
  end
end # class RuleSpecial

# Special rule that is always top priority.
TopRule = RuleSpecial.new('top')
# Special rule that is always bottom priority.
BottomRule = RuleSpecial.new('bottom')
141
+
142
# An autodetection rule that uses a single regular expression.
class RuleRegexp < RuleTemplate
  # Creates a new rule.
  # _dbclass_ is the database class, or a String naming it; a String is
  # resolved to the constant only when the class is first needed.
  # _re_ is the regular expression matched against the text.
  # The rule name is _dbclass_ converted to a String.
  def initialize(dbclass, re)
    super()
    @name = dbclass.to_s
    @re = re
    @dbclass_lazy = dbclass
    @dbclass = nil
  end

  # database class (lazy evaluation, cached once it is truthy)
  def dbclass
    @dbclass ||= get_dbclass(@dbclass_lazy)
  end
  private :dbclass

  # Returns the database class wrapped in a one-element array.
  def dbclasses
    [dbclass]
  end

  # If given text matches the regexp, returns the database class.
  # Otherwise, returns nil.
  # _meta_ is ignored.
  def guess(text, meta)
    return nil unless @re =~ text
    dbclass
  end
end # class RuleRegexp
174
+
175
# An autodetection rule that uses any number of regular expressions.
# If the given string matches one of them, the database class is
# returned.
class RuleRegexp2 < RuleRegexp
  # Creates a new rule.
  # _dbclass_ is handled as in RuleRegexp; the single regexp slot of
  # the parent is set to nil and _regexps_ are kept instead.
  def initialize(dbclass, *regexps)
    super(dbclass, nil)
    @regexps = regexps
  end

  # If given text matches one of the regexps, returns the database class.
  # Otherwise, returns nil. Matching stops at the first hit.
  # _meta_ is ignored.
  def guess(text, meta)
    matched = @regexps.any? { |re| re =~ text }
    matched ? dbclass : nil
  end
end # class RuleRegexp2
195
+
196
# An autodetection rule that passes the text to a proc object.
class RuleProc < RuleTemplate
  # Creates a new rule.
  # _dbclasses_ are the database classes (or Strings naming them) that
  # the block may return; Strings are resolved lazily by #dbclasses.
  # The block receives the text and returns a database class, nil or
  # false.
  # The rule name is the given classes joined with '|'.
  def initialize(*dbclasses, &block)
    super()
    @proc = block
    @dbclasses_lazy = dbclasses
    @dbclasses = nil
    @name = dbclasses.map { |klass| klass.to_s }.join('|')
  end

  # database classes (lazy evaluation, cached after the first call)
  def dbclasses
    @dbclasses ||= @dbclasses_lazy.map { |klass| get_dbclass(klass) }
  end

  # Calls the proc with _text_ and returns whatever it returns
  # (a database class when the text is known, otherwise nil or false).
  # _meta_ is accepted for interface compatibility but is not passed
  # to the proc.
  def guess(text, meta)
    @proc.call(text)
  end
end # class RuleProc
224
+
225
# Creates a new AutoDetect object that initially contains only the two
# special rules, TopRule and BottomRule.
def initialize
  # autodetection rules, keyed by rule name
  @rules = {}
  # cached, priority-ordered list of rules (nil means "not built yet")
  @elements = nil
  [TopRule, BottomRule].each { |special| add(special) }
end
234
+
235
# Adds a new rule and returns it.
# Raises RuntimeError ('element name conflicts') when a rule with the
# same name is already registered.
# The cached ordering is discarded so that it is rebuilt on next use.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]
  @elements = nil
  @rules[key] = elem
  elem
end
243
+
244
# (required by TSort.)
# Passes every registered rule to the given block.
def tsort_each_node(&block)
  @rules.each_value(&block)
end
249
+
250
# (required by TSort.)
# For a given rule, yields each child (= lower priority rule).
#
# * TopRule: every registered rule except TopRule itself and rules
#   that list TopRule among their lower priority elements.
# * BottomRule: only rules that list BottomRule among their higher
#   priority elements.
# * any other rule: its lower priority elements except BottomRule,
#   then BottomRule itself unless the rule lists BottomRule among its
#   higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule
      next if rule.lower_priority_elements.include?(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.include?(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |rule|
      yield rule unless rule == BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.include?(BottomRule)
  end
end
272
+
273
# Returns the current rules as an array whose order fulfills all
# rules' priorities (the reversed result of tsort).
# The array is computed once and cached until the cache is cleared.
def elements
  @elements ||= tsort.reverse
end
283
+
284
# Rebuilds the internal rule hash and clears the cached rule ordering.
# Returns nil.
def rehash
  @rules.rehash
  @elements = nil # forces the ordering to be recomputed on next use
end
289
+
290
# Visualizes the object (mainly for debug): the class name followed by
# the inspected rule names in priority order.
def inspect
  names = elements.map { |rule| rule.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end
296
+
297
# Iterates over each rule in priority order.
def each_rule(&block) #:yields: elem
  elements.each(&block)
end
301
+
302
# Autodetects from the text.
# Rules are consulted in priority order and the first truthy answer
# (a database class) is returned immediately.
# When no rule succeeds, the answer of the last consulted rule
# (nil or false) is returned; nil when there are no rules.
def autodetect(text, meta = {})
  answer = nil
  elements.each do |rule|
    answer = rule.guess(text, meta)
    return answer if answer
  end
  answer
end
314
+
315
# Autodetects from the FlatFile object _ff_.
# Returns a database class if succeeded.
# Returns nil if failed.
#
# The stream is taken from _ff_'s @stream instance variable. When the
# stream reports a path, detection is first tried once on the already
# prefetched data with :path set in the meta information, without
# reading anything. After that, up to _lines_ lines are prefetched one
# at a time, and detection is retried on the whole prefetch buffer
# after every line that is not blank.
def autodetect_flatfile(ff, lines = 31)
  meta = {}
  stream = ff.instance_eval { @stream }
  path = begin
    stream.path
  rescue NameError
    # the stream has no usable #path; continue without it
    nil
  end
  if path
    meta[:path] = path
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  # reading stream
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    next if line.strip.empty?
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  nil
end
343
+
344
# Default autodetect object used by the class-level accessors below.
# It is built on first request.
@default = nil

# Returns the default autodetect object, creating it with make_default
# when none has been set yet.
def self.default
  @default ||= make_default
end

# Sets the default autodetect object.
def self.default=(ad)
  @default = ad
end
359
+
360
# Makes a new autodetect object and adds every given rule to it,
# in the given order. Returns the new object.
def self.[](*arg)
  arg.inject(new) do |detector, rule|
    detector.add(rule)
    detector
  end
end
366
+
367
+ # Builds and returns a new autodetect object preloaded with the built-in format rules and their priorities.
368
+ def self.make_default
369
+ a = self[
370
+ genbank = RuleRegexp[ 'Bio::GenBank',
371
+ /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
372
+ genpept = RuleRegexp[ 'Bio::GenPept',
373
+ /^LOCUS .+ aa .+/ ],
374
+ medline = RuleRegexp[ 'Bio::MEDLINE',
375
+ /^PMID\- [0-9]+$/ ],
376
+ embl = RuleRegexp[ 'Bio::EMBL',
377
+ /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
378
+ sptr = RuleRegexp2[ 'Bio::SPTR',
379
+ /^ID .+\; *PRT\;/,
380
+ /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
381
+ prosite = RuleRegexp[ 'Bio::PROSITE',
382
+ /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
383
+ transfac = RuleRegexp[ 'Bio::TRANSFAC',
384
+ /^AC [-A-Za-z0-9_\.]+$/ ],
385
+
386
+ aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
387
+ if /^H [-A-Z0-9_\.]+$/ =~ text then
388
+ if text =~ /^M [rc]/ then
389
+ Bio::AAindex2
390
+ elsif text =~ /^I A\/L/ then
391
+ Bio::AAindex1
392
+ else
393
+ false #fail to determine
394
+ end
395
+ else
396
+ nil
397
+ end
398
+ end,
399
+
400
+ litdb = RuleRegexp[ 'Bio::LITDB',
401
+ /^CODE [0-9]+$/ ],
402
+ brite = RuleRegexp[ 'Bio::KEGG::BRITE',
403
+ /^Entry [A-Z0-9]+/ ],
404
+ orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
405
+ /^ENTRY .+ KO\s*/ ],
406
+ drug = RuleRegexp[ 'Bio::KEGG::DRUG',
407
+ /^ENTRY .+ Drug\s*/ ],
408
+ glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN',
409
+ /^ENTRY .+ Glycan\s*/ ],
410
+ enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME',
411
+ /^ENTRY EC [0-9\.]+$/,
412
+ /^ENTRY .+ Enzyme\s*/
413
+ ],
414
+ compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
415
+ /^ENTRY C[A-Za-z0-9\._]+$/,
416
+ /^ENTRY .+ Compound\s*/
417
+ ],
418
+ reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
419
+ /^ENTRY R[A-Za-z0-9\._]+$/,
420
+ /^ENTRY .+ Reaction\s*/
421
+ ],
422
+ genes = RuleRegexp[ 'Bio::KEGG::GENES',
423
+ /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ],
424
+ genome = RuleRegexp[ 'Bio::KEGG::GENOME',
425
+ /^ENTRY [a-z]+$/ ],
426
+
427
+ fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
428
+ 'Bio::FANTOM::MaXML::Sequence') do |text|
429
+ if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
430
+ case $1
431
+ when 'clusters'
432
+ Bio::FANTOM::MaXML::Cluster
433
+ when 'sequences'
434
+ Bio::FANTOM::MaXML::Sequence
435
+ else
436
+ nil #unknown
437
+ end
438
+ else
439
+ nil
440
+ end
441
+ end,
442
+
443
+ pdb = RuleRegexp[ 'Bio::PDB',
444
+ /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
445
+ het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
446
+ /^RESIDUE +.+ +\d+\s*$/ ],
447
+
448
+ clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
449
+ /^CLUSTAL .*\(.*\).*sequence +alignment/,
450
+ /^CLUSTAL FORMAT for T-COFFEE/ ],
451
+
452
+ gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
453
+ /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],
454
+
455
+ gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
456
+ /^!!(N|A)A_SEQUENCE .+/ ],
457
+
458
+ blastxml = RuleRegexp[ 'Bio::Blast::Report',
459
+ /\<\!DOCTYPE BlastOutput PUBLIC / ],
460
+ wublast = RuleRegexp[ 'Bio::Blast::WU::Report',
461
+ /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
462
+ wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
463
+ /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
464
+ blast = RuleRegexp[ 'Bio::Blast::Default::Report',
465
+ /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
466
+ tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
467
+ /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
468
+ rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
469
+ /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
470
+
471
+ blat = RuleRegexp[ 'Bio::Blat::Report',
472
+ /^psLayout version \d+/ ],
473
+ spidey = RuleRegexp[ 'Bio::Spidey::Report',
474
+ /^\-\-SPIDEY version .+\-\-$/ ],
475
+ hmmer = RuleRegexp[ 'Bio::HMMER::Report',
476
+ /^HMMER +\d+\./ ],
477
+ sim4 = RuleRegexp[ 'Bio::Sim4::Report',
478
+ /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
479
+
480
+ fastaformat = RuleProc.new('Bio::FastaFormat',
481
+ 'Bio::NBRF',
482
+ 'Bio::FastaNumericFormat') do |text|
483
+ if /^>.+$/ =~ text
484
+ case text
485
+ when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
486
+ Bio::NBRF
487
+ when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
488
+ Bio::FastaFormat
489
+ when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
490
+ Bio::FastaNumericFormat
491
+ else
492
+ false
493
+ end
494
+ else
495
+ nil
496
+ end
497
+ end
498
+ ]
499
+
500
+ # dependencies: x.is_prior_to y records that rule x has higher priority than rule y
501
+ # NCBI
502
+ genbank.is_prior_to genpept
503
+ # EMBL/UniProt
504
+ embl.is_prior_to sptr
505
+ sptr.is_prior_to prosite
506
+ prosite.is_prior_to transfac
507
+ # KEGG
508
+ #aaindex.is_prior_to litdb
509
+ #litdb.is_prior_to brite
510
+ brite.is_prior_to orthology
511
+ orthology.is_prior_to drug
512
+ drug.is_prior_to glycan
513
+ glycan.is_prior_to enzyme
514
+ enzyme.is_prior_to compound
515
+ compound.is_prior_to reaction
516
+ reaction.is_prior_to genes
517
+ genes.is_prior_to genome
518
+ # PDB
519
+ pdb.is_prior_to het
520
+ # BLAST
521
+ wublast.is_prior_to wutblast
522
+ wutblast.is_prior_to blast
523
+ blast.is_prior_to tblast
524
+ # FastaFormat: ranked below BottomRule, so it is tried after every other rule
525
+ BottomRule.is_prior_to(fastaformat)
526
+
527
+ # for debug
528
+ #debug_first = RuleDebug.new('debug_first')
529
+ #a.add(debug_first)
530
+ #debug_first.is_prior_to(TopRule)
531
+
532
+ ## for debug
533
+ #debug_last = RuleDebug.new('debug_last')
534
+ #a.add(debug_last)
535
+ #BottomRule.is_prior_to(debug_last)
536
+ #fastaformat.is_prior_to(debug_last)
537
+
538
+ a.rehash
539
+ return a
540
+ end
541
+
542
+ end #class AutoDetect
543
+ end #class FlatFile
544
+ end #module Bio
545
+