bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,237 @@
1
+ #
2
+ # = bio/io/flatfile/buffer.rb - Input stream buffer for FlatFile
3
+ #
4
+ # Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
5
+ #
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ #
11
+ # See documents for Bio::FlatFile::BufferedInputStream and Bio::FlatFile.
12
+ #
13
+
14
+ require 'bio/io/flatfile'
15
+
16
+ module Bio
17
+
18
+ class FlatFile
19
+
20
# Wrapper for an IO (or IO-like) object.
# It reads through an internal prefetch buffer, so that data which were
# already read can be pushed back (ungets, ungetc) or read ahead
# (prefetch_gets, prefetch_readpartial) without being consumed.
class BufferedInputStream
  # Creates a new input stream wrapper.
  #
  # io::   IO or IO-like object to read from
  # path:: path, file name or URI the io was opened from (stored as given)
  def initialize(io, path)
    @io = io
    @path = path
    # initialize prefetch buffer
    @buffer = ''
  end

  # Creates a new input stream wrapper from the given IO object.
  # The path is taken from io.path; if the object has no such method,
  # the path is nil.
  def self.for_io(io)
    begin
      path = io.path
    rescue NameError
      # NoMethodError is a subclass of NameError
      path = nil
    end
    self.new(io, path)
  end

  # Creates a new input stream wrapper to open file _filename_
  # by using File.open.
  # *arg is passed to File.open.
  #
  # Like File.open, a block can be accepted.
  # With a block, the wrapper is yielded and the block's value is returned;
  # without a block, the new wrapper is returned.
  def self.open_file(filename, *arg)
    if block_given? then
      File.open(filename, *arg) do |fobj|
        yield self.new(fobj, filename)
      end
    else
      fobj = File.open(filename, *arg)
      self.new(fobj, filename)
    end
  end

  # Creates a new input stream wrapper from URI specified as _uri_
  # by using OpenURI.open_uri or URI#open.
  # _uri_ must be a String or URI object.
  # *arg is passed to OpenURI.open_uri or URI#open.
  #
  # Like OpenURI.open_uri, it can accept a block.
  def self.open_uri(uri, *arg)
    if uri.kind_of?(URI)
      # URI object: the path attribute is its string form
      if block_given?
        uri.open(*arg) do |fobj|
          yield self.new(fobj, uri.to_s)
        end
      else
        fobj = uri.open(*arg)
        self.new(fobj, uri.to_s)
      end
    else
      # String: the path attribute is the string as given
      if block_given?
        OpenURI.open_uri(uri, *arg) do |fobj|
          yield self.new(fobj, uri)
        end
      else
        fobj = OpenURI.open_uri(uri, *arg)
        self.new(fobj, uri)
      end
    end
  end

  # Pathname, filename or URI to open the object.
  # Like File#path, returned value isn't normalized.
  attr_reader :path

  # Converts to IO object if possible
  def to_io
    @io.to_io
  end

  # Closes the IO object if possible
  def close
    @io.close
  end

  # Rewinds the IO object if possible
  # Internal buffer in this wrapper is cleared.
  # Returns the value returned by the underlying rewind.
  def rewind
    r = @io.rewind
    @buffer = ''
    r
  end

  # Returns current file position.
  #
  # The position of the underlying IO is a byte offset, so the data
  # still waiting in the internal buffer are subtracted in bytes
  # (not in characters) to keep the result correct for multibyte text.
  def pos
    @io.pos - @buffer.bytesize
  end

  # Sets current file position if possible
  # Internal buffer in this wrapper is cleared.
  def pos=(p)
    r = (@io.pos = p)
    @buffer = ''
    r
  end

  # Returns true if end-of-file. Otherwise, returns false.
  #
  # Note that it returns false if the internal buffer in this wrapper
  # is not empty.
  def eof?
    if @buffer.size > 0
      false
    else
      @io.eof?
    end
  end

  # Same as IO#gets.
  # Data in the internal buffer are consumed first; the underlying IO
  # is read only when the buffer does not contain a whole record.
  #
  # Compatibility note: the behavior of paragraph mode (io_rs = '')
  # may differ from that of IO#gets('').
  def gets(io_rs = $/)
    # nothing buffered: simply delegate
    return @io.gets(io_rs) unless @buffer.size > 0

    if io_rs == nil then
      # slurp mode: buffered data followed by the rest of the stream
      r = @buffer + @io.gets(nil).to_s
      @buffer = ''
      return r
    end

    if io_rs == '' then # io_rs.empty?
      # paragraph mode: two or more consecutive line breaks
      sp_rs = /((?:\r?\n){2,})/n
    else
      sp_rs = io_rs
    end

    r = shift_record_from_buffer(sp_rs, io_rs)
    unless r then
      # separator is not in the buffer yet; read one more record
      # from the IO and try again
      @buffer << @io.gets(io_rs).to_s
      r = shift_record_from_buffer(sp_rs, io_rs)
      unless r then
        # still no separator (end of stream): return everything
        r = @buffer
        @buffer = ''
      end
    end
    r
  end

  # Pushes back given str to the internal buffer.
  # Returns nil.
  # str must be read previously with the wrapper object.
  #
  # Note that in current implementation, the str can be everything,
  # but please don't depend on it.
  #
  def ungets(str)
    @buffer = str + @buffer
    nil
  end

  # Same as IO#getc.
  # A character in the internal buffer is returned first, if any.
  def getc
    if @buffer.size > 0 then
      r = @buffer[0]
      @buffer = @buffer[1..-1]
    else
      r = @io.getc
    end
    r
  end

  # Pushes back one character into the internal buffer.
  # Unlike IO#ungetc, it can be called more than one time.
  # Returns nil.
  def ungetc(c)
    @buffer = sprintf("%c", c) + @buffer
    nil
  end

  # Gets current prefetch buffer
  def prefetch_buffer
    @buffer
  end

  # It does @io.gets, and adds returned string
  # to the internal buffer, and returns the string.
  def prefetch_gets(*arg)
    r = @io.gets(*arg)
    @buffer << r if r
    r
  end

  # It does @io.readpartial, and adds returned string
  # to the internal buffer, and returns the string.
  def prefetch_readpartial(*arg)
    r = @io.readpartial(*arg)
    @buffer << r if r
    r
  end

  # Skips space characters (space, LF, CR and TAB) in the stream.
  # The first non-space character is pushed back.
  # returns nil.
  def skip_spaces
    ws = { ?\s => true, ?\n => true, ?\r => true, ?\t => true }
    while r = self.getc
      unless ws[r] then
        self.ungetc(r)
        break
      end
    end
    nil
  end

  # Cuts the first record off the internal buffer.
  #
  # sp_rs:: separator given to String#split (String or Regexp)
  # io_rs:: record separator requested by the caller of gets
  #
  # Returns the record including its separator and leaves the rest in
  # the buffer. Returns nil, with the buffer untouched, when the
  # separator is not found in the buffer.
  def shift_record_from_buffer(sp_rs, io_rs)
    a = @buffer.split(sp_rs, 2)
    return nil unless a.size > 1
    r = a.shift
    # in paragraph mode the Regexp captures the actual separator,
    # which is the next element of the split result
    r += (io_rs.empty? ? a.shift : io_rs)
    @buffer = a.shift.to_s
    r
  end
  private :shift_record_from_buffer
end #class BufferedInputStream
235
+
236
+ end #class FlatFile
237
+ end #module Bio
@@ -888,13 +888,18 @@ module Bio
888
888
  self
889
889
  end
890
890
 
891
# Returns a Proc which sorts files by using an external sort program.
#
# sort_program:: command and its leading arguments as an Array of
#                Strings. A single String (path to the sort program,
#                the form used when the default was '/usr/bin/sort')
#                is also accepted.
#
# The returned Proc takes (out, in1, *files) and executes
# "<sort_program> -o out in1 files..." with Kernel#system.
# It returns the value returned by Kernel#system.
def self.external_sort_proc(sort_program = [ '/usr/bin/env',
                                             'LC_ALL=C',
                                             '/usr/bin/sort' ])
  Proc.new do |out, in1, *files|
    # [ x ].flatten accepts both an Array and a single String
    cmd = [ sort_program ].flatten + [ '-o', out, in1, *files ]
    system(*cmd)
  end
end
896
899
 
897
- def self.external_merge_sort_proc(sort_program = '/usr/bin/sort')
900
+ def self.external_merge_sort_proc(sort_program = [ '/usr/bin/env',
901
+ 'LC_ALL=C',
902
+ '/usr/bin/sort' ])
898
903
  Proc.new do |out, in1, *files|
899
904
  # (in1 may be sorted)
900
905
  tf_all = []
@@ -902,21 +907,26 @@ module Bio
902
907
  files.each do |fn|
903
908
  tf = Tempfile.open('sort')
904
909
  tf.close(false)
905
- system(sort_program, '-o', tf.path, fn)
910
+ cmd = sort_program + [ '-o', tf.path, fn ]
911
+ system(*cmd)
906
912
  tf_all << tf
907
913
  tfn_all << tf.path
908
914
  end
909
- system(sort_program, '-m', '-o', out, in1, *tfn_all)
915
+ cmd_fin = sort_program + [ '-m', '-o', out, in1, *tfn_all ]
916
+ system(*cmd_fin)
910
917
  tf_all.each do |tf|
911
918
  tf.close(true)
912
919
  end
913
920
  end
914
921
  end
915
922
 
916
# Returns a Proc which merges already sorted files by using an
# external sort program with the -m option.
#
# sort_program:: command and its leading arguments as an Array of
#                Strings. A single String (path to the sort program,
#                the form used when the default was '/usr/bin/sort')
#                is also accepted.
#
# The returned Proc takes (out, in1, *files) and executes
# "<sort_program> -m -o out in1 files..." with Kernel#system.
# It returns the value returned by Kernel#system.
def self.external_merge_proc(sort_program = [ '/usr/bin/env',
                                              'LC_ALL=C',
                                              '/usr/bin/sort' ])
  Proc.new do |out, in1, *files|
    # files (and in1) must be sorted
    # [ x ].flatten accepts both an Array and a single String
    cmd = [ sort_program ].flatten + [ '-m', '-o', out, in1, *files ]
    system(*cmd)
  end
end
922
932
 
@@ -525,6 +525,8 @@ module Bio
525
525
  def self.addindex_flat(db, mode, need_update, parser, options)
526
526
  require 'tempfile'
527
527
  prog = options['sort_program']
528
+ env = options['env_program']
529
+ env_args = options['env_program_arguments']
528
530
 
529
531
  return false if need_update.to_a.size == 0
530
532
 
@@ -555,7 +557,7 @@ module Bio
555
557
  fileid += 1
556
558
  end
557
559
 
558
- sort_proc = chose_sort_proc(prog, mode)
560
+ sort_proc = chose_sort_proc(prog, mode, env, env_args)
559
561
  pfile.close(false)
560
562
  DEBUG.print "sorting primary (#{parser.primary.name})...\n"
561
563
  db.primary.file.import_tsv_files(true, mode, sort_proc, pfile.path)
@@ -571,30 +573,46 @@ module Bio
571
573
  true
572
574
  end #def
573
575
 
576
+ # default sort program
574
577
  DEFAULT_SORT = '/usr/bin/sort'
575
- def self.chose_sort_proc(prog, mode = :new)
578
+
579
+ # default env program (run a program in a modified environment)
580
+ DEFAULT_ENV = '/usr/bin/env'
581
+
582
+ # default arguments for env program
583
+ DEFAULT_ENV_ARGS = [ 'LC_ALL=C' ]
584
+
585
+ def self.chose_sort_proc(prog, mode = :new,
586
+ env = nil, env_args = nil)
576
587
  case prog
577
588
  when /^builtin$/i, /^hs$/i, /^lm$/i
578
589
  DEBUG.print "sort: internal sort routine\n"
579
- sort_proc = mapfile.internal_sort_proc
590
+ sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
580
591
  when nil, ''
581
592
  if FileTest.executable?(DEFAULT_SORT)
582
- DEBUG.print "sort: #{DEFAULT_SORT}\n"
583
- if mode == :new then
584
- sort_proc = Flat_1::FlatMappingFile::external_sort_proc(DEFAULT_SORT)
585
- else
586
- sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(DEFAULT_SORT)
587
- end
593
+ return chose_sort_proc(DEFAULT_SORT, mode, env, env_args)
588
594
  else
589
595
  DEBUG.print "sort: internal sort routine\n"
590
596
  sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
591
597
  end
592
598
  else
593
- DEBUG.print "sort: #{prog}\n"
599
+ env_args ||= DEFAULT_ENV_ARGS
600
+ if env == '' or env == false then # inhibit to use env program
601
+ prefixes = [ prog ]
602
+ elsif env then # uses given env program
603
+ prefixes = [ env ] + env_args + [ prog ]
604
+ else # env == nil; uses default env program if possible
605
+ if FileTest.executable?(DEFAULT_ENV)
606
+ prefixes = [ DEFAULT_ENV ] + env_args + [ prog ]
607
+ else
608
+ prefixes = [ prog ]
609
+ end
610
+ end
611
+ DEBUG.print "sort: #{prefixes.join(' ')}\n"
594
612
  if mode == :new then
595
- sort_proc = Flat_1::FlatMappingFile::external_sort_proc(prog)
613
+ sort_proc = Flat_1::FlatMappingFile::external_sort_proc(prefixes)
596
614
  else
597
- sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(prog)
615
+ sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(prefixes)
598
616
  end
599
617
  end
600
618
  sort_proc
@@ -0,0 +1,297 @@
1
+ #
2
+ # = bio/io/flatfile/splitter.rb - input data splitter for FlatFile
3
+ #
4
+ # Copyright (C) 2001-2008 Naohisa Goto <ng@bioruby.org>
5
+ #
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ #
11
+ # See documents for Bio::FlatFile::Splitter and Bio::FlatFile.
12
+ #
13
+
14
+ require 'bio/io/flatfile'
15
+
16
+ module Bio
17
+
18
+ class FlatFile
19
+
20
# The Bio::FlatFile::Splitter is a namespace for flatfile splitters.
# Each splitter is a class to get entries from a buffered input stream.
#
# It is internally called in Bio::FlatFile.
# Normally, users do not need to use it directly.
module Splitter

  # This is a template of splitter.
  # Subclasses must implement skip_leader and get_entry.
  class Template
    # Creates a new splitter.
    # klass:: database class
    # bstream:: input stream
    def initialize(klass, bstream)
      @dbclass = klass
      @stream = bstream
      @entry_pos_flag = nil
    end

    # skips leader of the entry.
    def skip_leader
      raise NotImplementedError
    end

    # rewind the stream
    def rewind
      @stream.rewind
    end

    # Gets entry as a string. (String)
    def get_entry
      raise NotImplementedError
    end

    # Gets entry as a data class's object.
    # When get_entry returns nil or false, that value is stored
    # and returned as is.
    def get_parsed_entry
      ent = get_entry
      if ent then
        self.parsed_entry = dbclass.new(ent)
      else
        self.parsed_entry = ent
      end
      parsed_entry
    end

    # the last entry string read from the stream (String)
    attr_reader :entry

    # The last parsed entry read from the stream (entry data class).
    # Note that it is valid only after get_parsed_entry is called,
    # and the get_entry may not affect the parsed_entry attribute.
    attr_reader :parsed_entry

    # a flag to write down entry start and end positions
    attr_accessor :entry_pos_flag

    # start position of the entry
    attr_reader :entry_start_pos

    # (end position of the entry) + 1
    attr_reader :entry_ended_pos

    #--
    #private
    #
    ## to prevent warning message "warning: private attribute?",
    ## private attributes are explicitly declared.
    #++

    # entry data class
    attr_reader :dbclass
    private :dbclass

    # input stream
    attr_reader :stream
    private :stream

    # the last entry string read from the stream
    attr_writer :entry
    private :entry=

    # the last entry as a parsed data object
    attr_writer :parsed_entry
    private :parsed_entry=

    # start position of the entry
    attr_writer :entry_start_pos
    private :entry_start_pos=

    # (end position of the entry) + 1
    attr_writer :entry_ended_pos
    private :entry_ended_pos=

    # Does stream.pos if entry_pos_flag is not nil.
    # Otherwise, returns nil.
    def stream_pos
      entry_pos_flag ? stream.pos : nil
    end
    private :stream_pos
  end #class Template

  # Default splitter.
  # It sees following constants in the given class.
  # DELIMITER:: (String) delimiter indicates the end of a entry.
  # FLATFILE_HEADER:: (String) start of a entry, located on head of a line.
  # DELIMITER_OVERRUN:: (Integer) excess read size included in DELIMITER.
  #
  class Default < Template
    # Creates a new splitter.
    # klass:: database class
    # bstream:: input stream. It must be a BufferedInputStream object.
    def initialize(klass, bstream)
      super(klass, bstream)

      # each constant is optional; nil is used when it is not defined
      @delimiter = klass::DELIMITER rescue nil
      @header = klass::FLATFILE_HEADER rescue nil
      # for specific classes' benefit
      unless header
        if (defined?(Bio::GenBank) and klass == Bio::GenBank) or
            (defined?(Bio::GenPept) and klass == Bio::GenPept)
          @header = 'LOCUS '
        end
      end
      @delimiter_overrun = klass::DELIMITER_OVERRUN rescue nil
    end

    # (String) delimiter indicates the end of a entry.
    attr_accessor :delimiter

    # (String) start of a entry, located on head of a line.
    attr_accessor :header

    # (Integer) excess read data size included in delimiter.
    attr_accessor :delimiter_overrun

    # Skips leader of the entry.
    #
    # If @header is not nil, it reads till the contents of @header
    # comes at the head of a line.
    # If correct FLATFILE_HEADER is found, returns true.
    # Otherwise, returns nil.
    #
    # If @header is nil, it only skips white spaces and returns nil.
    def skip_leader
      if @header then
        data = ''
        while s = stream.gets(@header)
          data << s
          # the header must be the whole last line read so far,
          # i.e. it must be located on the head of a line
          if data.split(/[\r\n]+/)[-1] == @header then
            stream.ungets(@header)
            return true
          end
        end
        # @header was not found. For safety,
        # pushes back data with removing white spaces in the head.
        # (sub! is needed: the non-destructive sub would leave the
        # white spaces in the data pushed back.)
        data.sub!(/\A\s+/, '')
        stream.ungets(data)
        return nil
      else
        stream.skip_spaces
        return nil
      end
    end

    # Gets a entry as a string, and records it in the entry attribute.
    # When @delimiter_overrun is set and the data read end with
    # @delimiter, the last @delimiter_overrun characters belong to
    # the next entry and are pushed back to the stream.
    # Returns nil when no more data can be read.
    def get_entry
      p0 = stream_pos()
      e = stream.gets(@delimiter)
      if e and @delimiter_overrun then
        if e[-@delimiter.size, @delimiter.size ] == @delimiter then
          overrun = e[-@delimiter_overrun, @delimiter_overrun]
          e[-@delimiter_overrun, @delimiter_overrun] = ''
          stream.ungets(overrun)
        end
      end
      p1 = stream_pos()
      self.entry_start_pos = p0
      self.entry = e
      self.entry_ended_pos = p1
      return entry
    end
  end #class Default


  # A splitter for line oriented text data.
  #
  # The given class's object must have following methods.
  #   Klass#add_header_line(line)
  #   Klass#add_line(line)
  # where 'line' is a string. They normally returns self.
  # If the line is not suitable to add to the current entry,
  # nil or false should be returned.
  # Then, the line is treated as (for add_header_line) the entry data
  # or (for add_line) the next entry's data.
  #
  class LineOriented < Template
    # Creates a new splitter.
    # klass:: database class
    # bstream:: input stream. It must be a BufferedInputStream object.
    def initialize(klass, bstream)
      super(klass, bstream)
      self.flag_to_fetch_header = true
    end

    # do nothing
    def skip_leader
      nil
    end

    # get an entry and return the entry as a string
    # (nil when no entry can be read)
    def get_entry
      if e = get_parsed_entry then
        entry
      else
        e
      end
    end

    # get an entry and return the entry as a data class object.
    # Returns nil, without changing the attributes, when no line
    # was accepted for the entry.
    def get_parsed_entry
      p0 = stream_pos()
      ent = @dbclass.new()

      lines = []
      line_overrun = nil

      # header lines are fetched only for the first entry
      # (and for the first entry after rewind)
      if flag_to_fetch_header then
        while line = stream.gets("\n")
          unless ent.add_header_line(line) then
            line_overrun = line
            break
          end
          lines.push line
        end
        # the rejected line is entry data; give it back to the stream
        stream.ungets(line_overrun) if line_overrun
        line_overrun = nil
        self.flag_to_fetch_header = false
      end

      while line = stream.gets("\n")
        unless ent.add_line(line) then
          line_overrun = line
          break
        end
        lines.push line
      end
      # the rejected line belongs to the next entry
      stream.ungets(line_overrun) if line_overrun
      p1 = stream_pos()

      return nil if lines.empty?

      self.entry_start_pos = p0
      self.entry = lines.join('')
      self.parsed_entry = ent
      self.entry_ended_pos = p1

      return ent
    end

    # rewinds the stream
    # (header lines will be fetched again by the next read)
    def rewind
      ret = super
      self.flag_to_fetch_header = true
      ret
    end

    #--
    #private methods / attributes
    #++

    # flag to fetch header
    attr_accessor :flag_to_fetch_header
    private :flag_to_fetch_header
    private :flag_to_fetch_header=

  end #class LineOriented

end #module Splitter
293
+
294
+ end #class FlatFile
295
+ end #module Bio
296
+
297
+