bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -6,7 +6,7 @@
6
6
  # Ryan Raaum <ryan@raaum.org>
7
7
  # License:: The Ruby License
8
8
  #
9
- # $Id: common.rb,v 1.6 2007/12/27 17:36:02 ngoto Exp $
9
+ # $Id:$
10
10
  #
11
11
 
12
12
  module Bio
@@ -37,7 +37,7 @@ class Sequence
37
37
  # # Create a random sequence with the composition of a current sequence
38
38
  # puts dna.randomize
39
39
  module Common
40
-
40
+
41
41
  # Return sequence as
42
42
  # String[http://corelib.rubyonrails.org/classes/String.html].
43
43
  # The original sequence is unchanged.
@@ -65,7 +65,7 @@ module Common
65
65
  def seq
66
66
  self.class.new(self)
67
67
  end
68
-
68
+
69
69
  # Normalize the current sequence, removing all whitespace and
70
70
  # transforming all positions to uppercase if the sequence is AA or
71
71
  # transforming all positions to lowercase if the sequence is NA.
@@ -241,53 +241,30 @@ module Common
241
241
  # * (optional) _hash_: Hash object
242
242
  # *Returns*:: new Bio::Sequence::NA/AA object
243
243
  def randomize(hash = nil)
244
- length = self.length
245
244
  if hash
246
- length = 0
247
- count = hash.clone
248
- count.each_value {|x| length += x}
245
+ tmp = ''
246
+ hash.each {|k, v|
247
+ tmp += k * v.to_i
248
+ }
249
249
  else
250
- count = self.composition
250
+ tmp = self
251
251
  end
252
-
253
- seq = ''
254
- tmp = {}
255
- length.times do
256
- count.each do |k, v|
257
- tmp[k] = v * rand
258
- end
259
- max = tmp.max {|a, b| a[1] <=> b[1]}
260
- count[max.first] -= 1
261
-
262
- if block_given?
263
- yield max.first
264
- else
265
- seq += max.first
252
+ seq = self.class.new(tmp)
253
+ # Reference: http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
254
+ seq.length.downto(2) do |n|
255
+ k = rand(n)
256
+ c = seq[n - 1]
257
+ seq[n - 1] = seq[k]
258
+ seq[k] = c
259
+ end
260
+ if block_given? then
261
+ (0...seq.length).each do |i|
262
+ yield seq[i, 1]
266
263
  end
264
+ return self.class.new('')
265
+ else
266
+ return seq
267
267
  end
268
- return self.class.new(seq)
269
- end
270
-
271
- # Generate a new random sequence with the given frequency of bases.
272
- # The sequence length is determined by their cumulative sum.
273
- # (See also Bio::Sequence::Common#randomize which creates a new
274
- # randomized sequence object using the base composition of an existing
275
- # sequence instance).
276
- #
277
- # counts = {'R'=>1,'L'=>2,'E'=>3,'A'=>4}
278
- # puts Bio::Sequence::AA.randomize(counts) #=> "AAEAELALRE" (for example)
279
- #
280
- # You may also feed the output of randomize into a block
281
- #
282
- # actual_counts = {'R'=>0,'L'=>0,'E'=>0,'A'=>0}
283
- # Bio::Sequence::AA.randomize(counts) {|x| actual_counts[x] += 1}
284
- # actual_counts #=> {"A"=>4, "L"=>2, "E"=>3, "R"=>1}
285
- # ---
286
- # *Arguments*:
287
- # * (optional) _hash_: Hash object
288
- # *Returns*:: Bio::Sequence::NA/AA object
289
- def self.randomize(*arg, &block)
290
- self.new('').randomize(*arg, &block)
291
268
  end
292
269
 
293
270
  # Return a new sequence extracted from the original using a GenBank style
@@ -6,7 +6,7 @@
6
6
  # Ryan Raaum <ryan@raaum.org>
7
7
  # License:: The Ruby License
8
8
  #
9
- # $Id: compat.rb,v 1.4 2007/04/05 23:35:41 trevor Exp $
9
+ # $Id:$
10
10
  #
11
11
 
12
12
 
@@ -30,7 +30,7 @@ class Sequence
30
30
  # ---
31
31
  # *Returns*:: String object
32
32
  def to_s
33
- String.new(@seq)
33
+ String.new(self.seq)
34
34
  end
35
35
  alias to_str to_s
36
36
 
@@ -0,0 +1,54 @@
1
+ #
2
+ # = bio/sequence/dblink.rb - sequence ID with database name
3
+ #
4
+ # Copyright:: Copyright (C) 2008
5
+ # Naohisa Goto <ng@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
9
+ #
10
+
11
+ require 'bio/sequence'
12
+
13
+ # Bio::Sequence::DBLink stores IDs with the database name.
14
+ # Its main purpose is to store database cross-reference information
15
+ # for a sequence entry.
16
+ class Bio::Sequence::DBLink
17
+
18
+ # creates a new DBLink object
19
+ def initialize(database, primary_id, *secondary_ids)
20
+ @database = database
21
+ @id = primary_id
22
+ @secondary_ids = secondary_ids
23
+ end
24
+
25
+ # Database name, or namespace identifier (String).
26
+ attr_reader :database
27
+
28
+ # Primary identifier (String)
29
+ attr_reader :id
30
+
31
+ # Secondary identifiers (Array of String)
32
+ attr_reader :secondary_ids
33
+
34
+ #--
35
+ # class methods
36
+ #++
37
+
38
+ # Parses DR line in EMBL entry, and returns a DBLink object.
39
+ def self.parse_embl_DR_line(str)
40
+ str = str.sub(/\.\s*\z/, '')
41
+ str.sub!(/\ADR /, '')
42
+ self.new(*(str.split(/\s*\;\s*/, 3)))
43
+ end
44
+
45
+ # Parses DR line in UniProt entry, and returns a DBLink object.
46
+ def self.parse_uniprot_DR_line(str)
47
+ str = str.sub(/\.\s*\z/, '')
48
+ str.sub!(/\ADR /, '')
49
+ self.new(*(str.split(/\s*\;\s*/)))
50
+ end
51
+
52
+ end #class Bio::Sequence::DBLink
53
+
54
+
@@ -1,24 +1,24 @@
1
1
  #
2
2
  # = bio/sequence/format.rb - various output format of the biological sequence
3
3
  #
4
- # Copyright:: Copyright (C) 2006
4
+ # Copyright:: Copyright (C) 2006-2008
5
5
  # Toshiaki Katayama <k@bioruby.org>,
6
6
  # Naohisa Goto <ng@bioruby.org>,
7
- # Ryan Raaum <ryan@raaum.org>
7
+ # Ryan Raaum <ryan@raaum.org>,
8
+ # Jan Aerts <jan.aerts@bbsrc.ac.uk>
8
9
  # License:: The Ruby License
9
10
  #
10
11
  # = TODO
11
12
  #
12
13
  # porting from N. Goto's feature-output.rb on BioRuby list.
13
14
  #
14
- # $Id: format.rb,v 1.4 2007/04/05 23:35:41 trevor Exp $
15
+ # $Id: format.rb,v 1.4.2.8 2008/06/17 15:50:05 ngoto Exp $
15
16
  #
16
17
 
18
+ require 'erb'
17
19
 
18
20
  module Bio
19
21
 
20
- autoload :Sequence, 'bio/sequence'
21
-
22
22
  class Sequence
23
23
 
24
24
  # = DESCRIPTION
@@ -33,149 +33,326 @@ class Sequence
33
33
  # puts s.output(:embl)
34
34
  module Format
35
35
 
36
- # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
37
- # case, it would be difficult to successfully call this method outside
38
- # its expected context).
39
- #
40
- # Output the FASTA format string of the sequence.
41
- #
42
- # UNFORTUNATLY, the current implementation of Bio::Sequence is incapable of
43
- # using either the header or width arguments. So something needs to be
44
- # changed...
36
+ # Repository of generic (or both nucleotide and protein) sequence
37
+ # formatter classes
38
+ module Formatter
39
+
40
+ # Raw format generator
41
+ autoload :Raw, 'bio/sequence/format_raw'
42
+
43
+ # Fasta format generator
44
+ autoload :Fasta, 'bio/db/fasta/format_fasta'
45
+
46
+ # NCBI-style Fasta format generator
47
+ # (resembles the EMBOSS "ncbi" format)
48
+ autoload :Fasta_ncbi, 'bio/db/fasta/format_fasta'
49
+
50
+ end #module Formatter
51
+
52
+ # Repository of nucleotide sequence formatter classes
53
+ module NucFormatter
54
+
55
+ # GenBank format generator
56
+ # Note that the name is 'Genbank' and NOT 'GenBank'
57
+ autoload :Genbank, 'bio/db/genbank/format_genbank'
58
+
59
+ # EMBL format generator
60
+ # Note that the name is 'Embl' and NOT 'EMBL'
61
+ autoload :Embl, 'bio/db/embl/format_embl'
62
+
63
+ end #module NucFormatter
64
+
65
+ # Repository of protein sequence formatter classes
66
+ module AminoFormatter
67
+ # currently no formats available
68
+ end #module AminoFormatter
69
+
70
+ # Formatter base class.
71
+ # Any formatter class should inherit this class.
72
+ class FormatterBase
73
+
74
+ # Returns a formatted string of the given sequence
75
+ # ---
76
+ # *Arguments*:
77
+ # * (required) _sequence_: Bio::Sequence object
78
+ # * (optional) _options_: a Hash object
79
+ # *Returns*:: String object
80
+ def self.output(sequence, options = {})
81
+ self.new(sequence, options).output
82
+ end
83
+
84
+ # register new Erb template
85
+ def self.erb_template(str)
86
+ erb = ERB.new(str)
87
+ erb.def_method(self, 'output')
88
+ true
89
+ end
90
+ private_class_method :erb_template
91
+
92
+ # generates output data
93
+ # ---
94
+ # *Returns*:: String object
95
+ def output
96
+ raise NotImplementedError, 'should be implemented in subclass'
97
+ end
98
+
99
+ # creates a new formatter object for output
100
+ def initialize(sequence, options = {})
101
+ @sequence = sequence
102
+ @options = options
103
+ end
104
+
105
+ private
106
+
107
+ # any unknown methods are delegated to the sequence object
108
+ def method_missing(sym, *args, &block) #:nodoc:
109
+ begin
110
+ @sequence.__send__(sym, *args, &block)
111
+ rescue NoMethodError => evar
112
+ lineno = __LINE__ - 2
113
+ file = __FILE__
114
+ bt_here = [ "#{file}:#{lineno}:in \`__send__\'",
115
+ "#{file}:#{lineno}:in \`method_missing\'"
116
+ ]
117
+ if bt_here == evar.backtrace[0, 2] then
118
+ bt = evar.backtrace[2..-1]
119
+ evar = evar.class.new("undefined method \`#{sym.to_s}\' for #{self.inspect}")
120
+ evar.set_backtrace(bt)
121
+ end
122
+ raise(evar)
123
+ end
124
+ end
125
+ end #class FormatterBase
126
+
127
+ # Using Bio::Sequence::Format, return a String with the Bio::Sequence
128
+ # object formatted in the given style.
45
129
  #
46
- # Currently, this method is used in Bio::Sequence#output like so,
130
+ # Formats currently implemented are: 'fasta', 'genbank', and 'embl'
47
131
  #
48
132
  # s = Bio::Sequence.new('atgc')
49
133
  # puts s.output(:fasta) #=> "> \natgc\n"
134
+ #
135
+ # The style argument is given as a Ruby
136
+ # Symbol(http://www.ruby-doc.org/core/classes/Symbol.html)
50
137
  # ---
51
- # *Arguments*:
52
- # * (optional) _header_: String (default nil)
53
- # * (optional) _width_: Fixnum (default nil)
138
+ # *Arguments*:
139
+ # * (required) _format_: :fasta, :genbank, *or* :embl
54
140
  # *Returns*:: String object
55
- def format_fasta(header = nil, width = nil)
56
- header ||= "#{@entry_id} #{@definition}"
141
+ def output(format = :fasta, options = {})
142
+ formatter_const = format.to_s.capitalize.intern
143
+
144
+ formatter_class = nil
145
+ get_formatter_repositories.each do |mod|
146
+ begin
147
+ formatter_class = mod.const_get(formatter_const)
148
+ rescue NameError
149
+ end
150
+ break if formatter_class
151
+ end
152
+ unless formatter_class then
153
+ raise "unknown format name #{format.inspect}"
154
+ end
155
+
156
+ formatter_class.output(self, options)
157
+ end
57
158
 
58
- ">#{header}\n" +
59
- if width
60
- @seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
159
+ # Returns a list of available output formats for the sequence
160
+ # ---
161
+ # *Arguments*:
162
+ # *Returns*:: Array of Symbols
163
+ def list_output_formats
164
+ a = get_formatter_repositories.collect { |mod| mod.constants }
165
+ a.flatten!
166
+ a.collect! { |x| x.to_s.downcase.intern }
167
+ a
168
+ end
169
+
170
+ private
171
+
172
+ # returns formatter repository modules
173
+ def get_formatter_repositories
174
+ if self.moltype == Bio::Sequence::NA then
175
+ [ NucFormatter, Formatter ]
176
+ elsif self.moltype == Bio::Sequence::AA then
177
+ [ AminoFormatter, Formatter ]
61
178
  else
62
- @seq.to_s + "\n"
179
+ [ NucFormatter, AminoFormatter, Formatter ]
63
180
  end
64
181
  end
65
182
 
183
+ #---
184
+
66
185
  # Not yet implemented :)
67
186
  # Remove the nodoc command after implementation!
68
187
  # ---
69
188
  # *Returns*:: String object
70
- def format_gff #:nodoc:
71
- raise NotImplementedError
72
- end
189
+ #def format_gff #:nodoc:
190
+ # raise NotImplementedError
191
+ #end
192
+
193
+ #+++
194
+
195
+ # Formatting helper methods for INSD (NCBI, EMBL, DDBJ) feature table
196
+ module INSDFeatureHelper
197
+ private
73
198
 
74
199
  # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
75
200
  # case, it would be difficult to successfully call this method outside
76
201
  # its expected context).
77
202
  #
78
- # Output the Genbank format string of the sequence.
203
+ # Output the Genbank feature format string of the sequence.
79
204
  # Used in Bio::Sequence#output.
80
205
  # ---
81
206
  # *Returns*:: String object
82
- def format_genbank
207
+ def format_features_genbank(features)
83
208
  prefix = ' ' * 5
84
209
  indent = prefix + ' ' * 16
85
210
  fwidth = 79 - indent.length
86
-
87
- format_features(prefix, indent, fwidth)
211
+
212
+ format_features(features, prefix, indent, fwidth)
88
213
  end
89
214
 
90
215
  # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
91
216
  # case, it would be difficult to successfully call this method outside
92
217
  # its expected context).
93
218
  #
94
- # Output the EMBL format string of the sequence.
219
+ # Output the EMBL feature format string of the sequence.
95
220
  # Used in Bio::Sequence#output.
96
221
  # ---
97
222
  # *Returns*:: String object
98
- def format_embl
223
+ def format_features_embl(features)
99
224
  prefix = 'FT '
100
225
  indent = prefix + ' ' * 16
101
226
  fwidth = 80 - indent.length
102
-
103
- format_features(prefix, indent, fwidth)
227
+
228
+ format_features(features, prefix, indent, fwidth)
104
229
  end
105
230
 
231
+ # format INSD features
232
+ def format_features(features, prefix, indent, width)
233
+ result = []
234
+ features.each do |feature|
235
+ result.push format_feature(feature, prefix, indent, width)
236
+ end
237
+ return result.join('')
238
+ end
106
239
 
107
- private
108
-
109
- def format_features(prefix, indent, width)
110
- result = ''
111
- @features.each do |feature|
112
- result << prefix + sprintf("%-16s", feature.feature)
113
-
114
- position = feature.position
115
- #position = feature.locations.to_s
240
+ # format an INSD feature
241
+ def format_feature(feature, prefix, indent, width)
242
+ result = prefix + sprintf("%-16s", feature.feature)
116
243
 
117
- head = ''
118
- wrap(position, width).each_line do |line|
119
- result << head << line
120
- head = indent
121
- end
244
+ position = feature.position
245
+ #position = feature.locations.to_s
122
246
 
123
- result << format_qualifiers(feature.qualifiers, width)
124
- end
247
+ result << wrap_and_split_lines(position, width).join("\n" + indent)
248
+ result << "\n"
249
+ result << format_qualifiers(feature.qualifiers, indent, width)
125
250
  return result
126
251
  end
127
252
 
253
+ # format qualifiers
128
254
  def format_qualifiers(qualifiers, indent, width)
129
- qualifiers.each do |qualifier|
255
+ qualifiers.collect do |qualifier|
130
256
  q = qualifier.qualifier
131
257
  v = qualifier.value.to_s
132
258
 
133
259
  if v == true
134
- lines = wrap('/' + q, width)
260
+ lines = wrap_with_newline('/' + q, width)
135
261
  elsif q == 'translation'
136
- lines = fold('/' + q + '=' + val, width)
262
+ lines = fold("/#{q}=\"#{v}\"", width)
137
263
  else
138
- if v[/\D/]
264
+ if v[/\D/] or q == 'chromosome'
139
265
  #v.delete!("\x00-\x1f\x7f-\xff")
140
266
  v.gsub!(/"/, '""')
141
267
  v = '"' + v + '"'
142
268
  end
143
- lines = wrap('/' + q + '=' + val, width)
269
+ lines = wrap_with_newline('/' + q + '=' + v, width)
144
270
  end
145
271
 
146
- return lines.gsub(/^/, indent)
147
- end
272
+ lines.gsub!(/^/, indent)
273
+ lines
274
+ end.join
148
275
  end
149
276
 
150
277
  def fold(str, width)
151
278
  str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
152
279
  end
153
280
 
154
- def wrap(str, width)
281
+ def fold_and_split_lines(str, width)
282
+ str.scan(Regexp.new(".{1,#{width}}"))
283
+ end
284
+
285
+ def wrap_and_split_lines(str, width)
155
286
  result = []
156
- left = str.dup
157
- while left and left.length > width
158
- line = nil
159
- width.downto(1) do |i|
160
- if left[i..i] == ' ' or /[,;]/ =~ left[(i-1)..(i-1)] then
161
- line = left[0..(i-1)].sub(/ +\z/, '')
162
- left = left[i..-1].sub(/\A +/, '')
163
- break
287
+ lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
288
+ lefts.each do |left|
289
+ left.rstrip!
290
+ while left and left.length > width
291
+ line = nil
292
+ width.downto(1) do |i|
293
+ if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
294
+ line = left[0..(i-1)].sub(/ +\z/, '')
295
+ left = left[i..-1].sub(/\A +/, '')
296
+ break
297
+ end
164
298
  end
299
+ if line.nil? then
300
+ line = left[0..(width-1)]
301
+ left = left[width..-1]
302
+ end
303
+ result << line
304
+ left = nil if left.to_s.empty?
165
305
  end
166
- if line.nil? then
167
- line = left[0..(width-1)]
168
- left = left[width..-1]
169
- end
170
- result << line
306
+ result << left if left
307
+ end
308
+ return result
309
+ end
310
+
311
+ def wrap_with_newline(str, width)
312
+ result = wrap_and_split_lines(str, width)
313
+ result_string = result.join("\n")
314
+ result_string << "\n" unless result_string.empty?
315
+ return result_string
316
+ end
317
+
318
+ def wrap(str, width = 80, prefix = '')
319
+ actual_width = width - prefix.length
320
+ result = wrap_and_split_lines(str, actual_width)
321
+ result_string = result.join("\n#{prefix}")
322
+ result_string = prefix + result_string unless result_string.empty?
323
+ return result_string
324
+ end
325
+
326
+ #--
327
+ # internal use only
328
+ MonthStr = [ nil,
329
+ 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
330
+ 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
331
+ ].collect { |x| x.freeze }.freeze
332
+ #++
333
+
334
+ # formats a date from Date, DateTime, or Time object, or String.
335
+ def format_date(d)
336
+ begin
337
+ yy = d.year
338
+ mm = d.month
339
+ dd = d.day
340
+ rescue NoMethodError, NameError, ArgumentError, TypeError
341
+ return sprintf("%-11s", d)
171
342
  end
172
- result << left if left
173
- return result.join("\n")
343
+ sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
344
+ end
345
+
346
+ # null date
347
+ def null_date
348
+ Date.new(0, 1, 1)
174
349
  end
175
350
 
176
- end # Format
351
+ end #module INSDFeatureHelper
352
+
353
+ end #module Format
177
354
 
178
- end # Sequence
355
+ end #class Sequence
179
356
 
180
- end # Bio
357
+ end #module Bio
181
358