bio 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,237 @@
1
+ #
2
+ # = bio/io/flatfile/buffer.rb - Input stream buffer for FlatFile
3
+ #
4
+ # Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
5
+ #
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ #
11
+ # See documents for Bio::FlatFile::BufferedInputStream and Bio::FlatFile.
12
+ #
13
+
14
+ require 'bio/io/flatfile'
15
+
16
+ module Bio
17
+
18
+ class FlatFile
19
+
20
# Wrapper for an IO (or IO-like) object.
# It keeps a prefetch (push-back) buffer in front of the wrapped object:
# data can be read ahead with #prefetch_gets / #prefetch_readpartial, or
# pushed back with #ungets / #ungetc, and is then returned again by the
# ordinary reading methods (#gets, #getc) before the wrapped object is read.
class BufferedInputStream

  # Pattern used to emulate paragraph mode (io_rs = '') on buffered data:
  # a run of two or more line breaks.
  PARAGRAPH_SEPARATOR = /((?:\r?\n){2,})/n

  # Characters skipped by #skip_spaces.
  WHITESPACES = { ?\s => true, ?\n => true, ?\r => true, ?\t => true }.freeze

  # Creates a new input stream wrapper.
  # io::   IO or IO-like object to read from
  # path:: value reported by #path (may be nil)
  def initialize(io, path)
    @io = io
    @path = path
    # prefetch buffer; must be mutable because it is appended to with <<
    @buffer = ''.dup
  end

  # Creates a new input stream wrapper from the given IO object.
  # The path is taken from io.path; it is nil when io has no such method.
  def self.for_io(io)
    path = begin
             io.path
           rescue NameError
             nil
           end
    new(io, path)
  end

  # Creates a new input stream wrapper to open file _filename_
  # by using File.open.
  # *arg is passed to File.open.
  #
  # Like File.open, a block can be accepted.
  def self.open_file(filename, *arg)
    if block_given?
      File.open(filename, *arg) { |fobj| yield new(fobj, filename) }
    else
      new(File.open(filename, *arg), filename)
    end
  end

  # Creates a new input stream wrapper from URI specified as _uri_
  # by using OpenURI.open_uri or URI#open.
  # _uri_ must be a String or URI object.
  # *arg is passed to OpenURI.open_uri or URI#open.
  #
  # Like OpenURI.open_uri, it can accept a block.
  def self.open_uri(uri, *arg)
    if uri.kind_of?(URI)
      if block_given?
        uri.open(*arg) { |fobj| yield new(fobj, uri.to_s) }
      else
        new(uri.open(*arg), uri.to_s)
      end
    elsif block_given?
      OpenURI.open_uri(uri, *arg) { |fobj| yield new(fobj, uri) }
    else
      new(OpenURI.open_uri(uri, *arg), uri)
    end
  end

  # Pathname, filename or URI to open the object.
  # Like File#path, returned value isn't normalized.
  attr_reader :path

  # Converts to IO object if possible
  def to_io
    @io.to_io
  end

  # Closes the IO object if possible
  def close
    @io.close
  end

  # Rewinds the IO object if possible.
  # Internal buffer in this wrapper is cleared.
  def rewind
    result = @io.rewind
    @buffer = ''.dup
    result
  end

  # Returns current file position: the position of the wrapped object
  # minus the number of bytes still waiting in the internal buffer.
  def pos
    # IO#pos is a byte offset, so the buffer is measured in bytes as well;
    # counting characters would be wrong for multibyte data.
    @io.pos - @buffer.bytesize
  end

  # Sets current file position if possible.
  # Internal buffer in this wrapper is cleared.
  def pos=(p)
    result = (@io.pos = p)
    @buffer = ''.dup
    result
  end

  # Returns true if end-of-file. Otherwise, returns false.
  #
  # Note that it returns false if the internal buffer in this wrapper
  # is not empty.
  def eof?
    @buffer.empty? ? @io.eof? : false
  end

  # Same as IO#gets.
  #
  # Data in the internal buffer is consumed first; the wrapped object is
  # read only when the buffer does not contain a complete line.
  #
  # Compatibility note: the behavior of paragraph mode (io_rs = '')
  # may differ from that of IO#gets('').
  def gets(io_rs = $/)
    return @io.gets(io_rs) if @buffer.empty?

    if io_rs.nil?
      # read everything: buffered data followed by the rest of the stream
      line = @buffer + @io.gets(nil).to_s
      @buffer = ''.dup
      return line
    end

    line = take_line_from_buffer(io_rs)
    return line if line

    # no separator in the buffer yet; complete the line from the stream
    @buffer << @io.gets(io_rs).to_s
    line = take_line_from_buffer(io_rs)
    unless line
      # still no separator (end of stream): return whatever is buffered
      line = @buffer
      @buffer = ''.dup
    end
    line
  end

  # Pushes back given str to the internal buffer.
  # Returns nil.
  # str must be read previously with the wrapper object.
  #
  # Note that in current implementation, the str can be everything,
  # but please don't depend on it.
  #
  def ungets(str)
    @buffer = str + @buffer
    nil
  end

  # Same as IO#getc.
  def getc
    return @io.getc if @buffer.empty?

    c = @buffer[0]
    @buffer = @buffer[1..-1]
    c
  end

  # Pushes back one character into the internal buffer.
  # Unlike IO#ungetc, it can be called more than one time.
  def ungetc(c)
    @buffer = sprintf("%c", c) + @buffer
    nil
  end

  # Gets current prefetch buffer
  def prefetch_buffer
    @buffer
  end

  # It does @io.gets, and adds returned string
  # to the internal buffer, and returns the string.
  def prefetch_gets(*arg)
    r = @io.gets(*arg)
    @buffer << r if r
    r
  end

  # It does @io.readpartial, and adds returned string
  # to the internal buffer, and returns the string.
  def prefetch_readpartial(*arg)
    r = @io.readpartial(*arg)
    @buffer << r if r
    r
  end

  # Skips space characters in the stream.
  # returns nil.
  def skip_spaces
    while (c = getc)
      next if WHITESPACES[c]
      # first non-space character: put it back and stop
      ungetc(c)
      break
    end
    nil
  end

  private

  # Removes and returns the first line (including its separator) from the
  # internal buffer. Returns nil, leaving the buffer untouched, when the
  # buffer contains no separator.
  #
  # String#partition is used instead of String#split because split treats
  # a single-space separator specially (awk-style) and drops leading blanks.
  def take_line_from_buffer(io_rs)
    separator = io_rs.empty? ? PARAGRAPH_SEPARATOR : io_rs
    head, matched, tail = @buffer.partition(separator)
    return nil if matched.empty?
    @buffer = tail
    head + matched
  end
end #class BufferedInputStream
235
+
236
+ end #class FlatFile
237
+ end #module Bio
@@ -888,13 +888,18 @@ module Bio
888
888
  self
889
889
  end
890
890
 
891
- def self.external_sort_proc(sort_program = '/usr/bin/sort')
891
+ def self.external_sort_proc(sort_program = [ '/usr/bin/env',
892
+ 'LC_ALL=C',
893
+ '/usr/bin/sort' ])
892
894
  Proc.new do |out, in1, *files|
893
- system(sort_program, '-o', out, in1, *files)
895
+ cmd = sort_program + [ '-o', out, in1, *files ]
896
+ system(*cmd)
894
897
  end
895
898
  end
896
899
 
897
- def self.external_merge_sort_proc(sort_program = '/usr/bin/sort')
900
+ def self.external_merge_sort_proc(sort_program = [ '/usr/bin/env',
901
+ 'LC_ALL=C',
902
+ '/usr/bin/sort' ])
898
903
  Proc.new do |out, in1, *files|
899
904
  # (in1 may be sorted)
900
905
  tf_all = []
@@ -902,21 +907,26 @@ module Bio
902
907
  files.each do |fn|
903
908
  tf = Tempfile.open('sort')
904
909
  tf.close(false)
905
- system(sort_program, '-o', tf.path, fn)
910
+ cmd = sort_program + [ '-o', tf.path, fn ]
911
+ system(*cmd)
906
912
  tf_all << tf
907
913
  tfn_all << tf.path
908
914
  end
909
- system(sort_program, '-m', '-o', out, in1, *tfn_all)
915
+ cmd_fin = sort_program + [ '-m', '-o', out, in1, *tfn_all ]
916
+ system(*cmd_fin)
910
917
  tf_all.each do |tf|
911
918
  tf.close(true)
912
919
  end
913
920
  end
914
921
  end
915
922
 
916
- def self.external_merge_proc(sort_program = '/usr/bin/sort')
923
+ def self.external_merge_proc(sort_program = [ '/usr/bin/env',
924
+ 'LC_ALL=C',
925
+ '/usr/bin/sort' ])
917
926
  Proc.new do |out, in1, *files|
918
927
  # files (and in1) must be sorted
919
- system(sort_program, '-m', '-o', out, in1, *files)
928
+ cmd = sort_program + [ '-m', '-o', out, in1, *files ]
929
+ system(*cmd)
920
930
  end
921
931
  end
922
932
 
@@ -525,6 +525,8 @@ module Bio
525
525
  def self.addindex_flat(db, mode, need_update, parser, options)
526
526
  require 'tempfile'
527
527
  prog = options['sort_program']
528
+ env = options['env_program']
529
+ env_args = options['env_program_arguments']
528
530
 
529
531
  return false if need_update.to_a.size == 0
530
532
 
@@ -555,7 +557,7 @@ module Bio
555
557
  fileid += 1
556
558
  end
557
559
 
558
- sort_proc = chose_sort_proc(prog, mode)
560
+ sort_proc = chose_sort_proc(prog, mode, env, env_args)
559
561
  pfile.close(false)
560
562
  DEBUG.print "sorting primary (#{parser.primary.name})...\n"
561
563
  db.primary.file.import_tsv_files(true, mode, sort_proc, pfile.path)
@@ -571,30 +573,46 @@ module Bio
571
573
  true
572
574
  end #def
573
575
 
576
+ # default sort program
574
577
  DEFAULT_SORT = '/usr/bin/sort'
575
- def self.chose_sort_proc(prog, mode = :new)
578
+
579
+ # default env program (run a program in a modified environment)
580
+ DEFAULT_ENV = '/usr/bin/env'
581
+
582
+ # default arguments for env program
583
+ DEFAULT_ENV_ARGS = [ 'LC_ALL=C' ]
584
+
585
+ def self.chose_sort_proc(prog, mode = :new,
586
+ env = nil, env_args = nil)
576
587
  case prog
577
588
  when /^builtin$/i, /^hs$/i, /^lm$/i
578
589
  DEBUG.print "sort: internal sort routine\n"
579
- sort_proc = mapfile.internal_sort_proc
590
+ sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
580
591
  when nil, ''
581
592
  if FileTest.executable?(DEFAULT_SORT)
582
- DEBUG.print "sort: #{DEFAULT_SORT}\n"
583
- if mode == :new then
584
- sort_proc = Flat_1::FlatMappingFile::external_sort_proc(DEFAULT_SORT)
585
- else
586
- sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(DEFAULT_SORT)
587
- end
593
+ return chose_sort_proc(DEFAULT_SORT, mode, env, env_args)
588
594
  else
589
595
  DEBUG.print "sort: internal sort routine\n"
590
596
  sort_proc = Flat_1::FlatMappingFile::internal_sort_proc
591
597
  end
592
598
  else
593
- DEBUG.print "sort: #{prog}\n"
599
+ env_args ||= DEFAULT_ENV_ARGS
600
+ if env == '' or env == false then # inhibit to use env program
601
+ prefixes = [ prog ]
602
+ elsif env then # uses given env program
603
+ prefixes = [ env ] + env_args + [ prog ]
604
+ else # env == nil; uses default env program if possible
605
+ if FileTest.executable?(DEFAULT_ENV)
606
+ prefixes = [ DEFAULT_ENV ] + env_args + [ prog ]
607
+ else
608
+ prefixes = [ prog ]
609
+ end
610
+ end
611
+ DEBUG.print "sort: #{prefixes.join(' ')}\n"
594
612
  if mode == :new then
595
- sort_proc = Flat_1::FlatMappingFile::external_sort_proc(prog)
613
+ sort_proc = Flat_1::FlatMappingFile::external_sort_proc(prefixes)
596
614
  else
597
- sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(prog)
615
+ sort_proc = Flat_1::FlatMappingFile::external_merge_sort_proc(prefixes)
598
616
  end
599
617
  end
600
618
  sort_proc
@@ -0,0 +1,297 @@
1
+ #
2
+ # = bio/io/flatfile/splitter.rb - input data splitter for FlatFile
3
+ #
4
+ # Copyright (C) 2001-2008 Naohisa Goto <ng@bioruby.org>
5
+ #
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ #
11
+ # See documents for Bio::FlatFile::Splitter and Bio::FlatFile.
12
+ #
13
+
14
+ require 'bio/io/flatfile'
15
+
16
+ module Bio
17
+
18
+ class FlatFile
19
+
20
# The Bio::FlatFile::Splitter is a namespace for flatfile splitters.
# Each splitter is a class to get entries from a buffered input stream.
#
# It is internally called in Bio::FlatFile.
# Normally, users do not need to use it directly.
module Splitter

  # This is a template of splitter.
  # Subclasses must implement #skip_leader and #get_entry.
  class Template
    # Creates a new splitter.
    # klass::   entry data class
    # bstream:: input stream the entries are read from
    def initialize(klass, bstream)
      @dbclass = klass
      @stream = bstream
      @entry_pos_flag = nil
    end

    # skips leader of the entry.
    def skip_leader
      raise NotImplementedError
    end

    # rewind the stream
    def rewind
      @stream.rewind
    end

    # Gets entry as a string. (String)
    def get_entry
      raise NotImplementedError
    end

    # Gets entry as a data class's object.
    # Returns the value of get_entry unchanged (nil or false) when no
    # entry could be read.
    def get_parsed_entry
      ent = get_entry
      self.parsed_entry = ent ? dbclass.new(ent) : ent
      parsed_entry
    end

    # the last entry string read from the stream (String)
    attr_reader :entry

    # The last parsed entry read from the stream (entry data class).
    # Note that it is valid only after get_parsed_entry is called,
    # and the get_entry may not affect the parsed_entry attribute.
    attr_reader :parsed_entry

    # a flag to write down entry start and end positions
    attr_accessor :entry_pos_flag

    # start position of the entry
    attr_reader :entry_start_pos

    # (end position of the entry) + 1
    attr_reader :entry_ended_pos

    #--
    #private
    #
    ## to prevent warning message "warning: private attribute?",
    ## private attributes are explicitly declared.
    #++

    # entry data class
    attr_reader :dbclass
    private :dbclass

    # input stream
    attr_reader :stream
    private :stream

    # the last entry string read from the stream
    attr_writer :entry
    private :entry=

    # the last entry as a parsed data object
    attr_writer :parsed_entry
    private :parsed_entry=

    # start position of the entry
    attr_writer :entry_start_pos
    private :entry_start_pos=

    # (end position of the entry) + 1
    attr_writer :entry_ended_pos
    private :entry_ended_pos=

    # Does stream.pos if entry_pos_flag is not nil.
    # Otherwise, returns nil.
    def stream_pos
      entry_pos_flag ? stream.pos : nil
    end
    private :stream_pos
  end #class Template

  # Default splitter.
  # It sees following constants in the given class.
  # DELIMITER:: (String) delimiter indicates the end of a entry.
  # FLATFILE_HEADER:: (String) start of a entry, located on head of a line.
  # DELIMITER_OVERRUN:: (Integer) excess read size included in DELIMITER.
  #
  class Default < Template
    # Creates a new splitter.
    # klass:: database class
    # bstream:: input stream. It must be a BufferedInputStream object.
    def initialize(klass, bstream)
      super(klass, bstream)

      # Each constant is optional; a missing one leaves the attribute nil.
      @delimiter = klass::DELIMITER rescue nil
      @header = klass::FLATFILE_HEADER rescue nil
      # for specific classes' benefit
      unless header
        if (defined?(Bio::GenBank) and klass == Bio::GenBank) or
            (defined?(Bio::GenPept) and klass == Bio::GenPept)
          @header = 'LOCUS '
        end
      end
      @delimiter_overrun = klass::DELIMITER_OVERRUN rescue nil
    end

    # (String) delimiter indicates the end of a entry.
    attr_accessor :delimiter

    # (String) start of a entry, located on head of a line.
    attr_accessor :header

    # (Integer) excess read data size included in delimiter.
    attr_accessor :delimiter_overrun

    # Skips leader of the entry.
    #
    # If @header is not nil, it reads till the contents of @header
    # comes at the head of a line.
    # If correct FLATFILE_HEADER is found, returns true.
    # Otherwise, returns nil.
    #
    # When @header is nil, only leading white spaces are skipped.
    def skip_leader
      unless @header
        stream.skip_spaces
        return nil
      end

      data = ''.dup
      while (s = stream.gets(@header))
        data << s
        # the header counts only when it starts a line
        if data.split(/[\r\n]+/)[-1] == @header
          stream.ungets(@header)
          return true
        end
      end
      # @header was not found. For safety,
      # pushes back data with removing white spaces in the head.
      # (sub! is required: String#sub would leave data unchanged.)
      data.sub!(/\A\s+/, '')
      stream.ungets(data)
      nil
    end

    # Gets an entry as a string: reads the stream up to and including
    # @delimiter. When @delimiter_overrun is set and the data read ends
    # with the delimiter, the last @delimiter_overrun characters belong to
    # the next entry and are pushed back to the stream.
    # Returns nil at the end of the stream.
    def get_entry
      p0 = stream_pos
      e = stream.gets(@delimiter)
      if e && @delimiter && @delimiter_overrun && e.end_with?(@delimiter)
        overrun = e[-@delimiter_overrun, @delimiter_overrun]
        e[-@delimiter_overrun, @delimiter_overrun] = ''
        stream.ungets(overrun)
      end
      p1 = stream_pos
      self.entry_start_pos = p0
      self.entry = e
      self.entry_ended_pos = p1
      entry
    end
  end #class Default


  # A splitter for line oriented text data.
  #
  # The given class's object must have following methods.
  #   Klass#add_header_line(line)
  #   Klass#add_line(line)
  # where 'line' is a string. They normally returns self.
  # If the line is not suitable to add to the current entry,
  # nil or false should be returned.
  # Then, the line is treated as (for add_header_line) the entry data
  # or (for add_line) the next entry's data.
  #
  class LineOriented < Template
    # Creates a new splitter.
    # klass:: database class
    # bstream:: input stream. It must be a BufferedInputStream object.
    def initialize(klass, bstream)
      super(klass, bstream)
      self.flag_to_fetch_header = true
    end

    # do nothing
    def skip_leader
      nil
    end

    # get an entry and return the entry as a string
    def get_entry
      parsed = get_parsed_entry
      parsed ? entry : parsed
    end

    # get an entry and return the entry as a data class object.
    # Returns nil when no line could be added to a new entry.
    def get_parsed_entry
      p0 = stream_pos
      ent = @dbclass.new
      lines = []

      # header lines are fetched only for the first entry after
      # creation or rewind
      if flag_to_fetch_header
        read_lines_while(lines) { |line| ent.add_header_line(line) }
        self.flag_to_fetch_header = false
      end

      read_lines_while(lines) { |line| ent.add_line(line) }
      p1 = stream_pos

      return nil if lines.empty?

      self.entry_start_pos = p0
      self.entry = lines.join('')
      self.parsed_entry = ent
      self.entry_ended_pos = p1

      ent
    end

    # rewinds the stream
    def rewind
      ret = super
      self.flag_to_fetch_header = true
      ret
    end

    #--
    #private methods / attributes
    #++

    # flag to fetch header
    attr_accessor :flag_to_fetch_header
    private :flag_to_fetch_header
    private :flag_to_fetch_header=

    # Reads lines from the stream and appends them to _lines_ for as long
    # as the given block accepts them. The first rejected line is pushed
    # back to the stream.
    def read_lines_while(lines)
      while (line = stream.gets("\n"))
        unless yield(line)
          stream.ungets(line)
          break
        end
        lines.push line
      end
    end
    private :read_lines_while

  end #class LineOriented

end #module Splitter
293
+
294
+ end #class FlatFile
295
+ end #module Bio
296
+
297
+