bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,86 @@
1
+ #
2
+ # = bio/db/genbank/genbank_to_biosequence.rb - Bio::GenBank to Bio::Sequence adapter module
3
+ #
4
+ # Copyright:: Copyright (C) 2008
5
+ # Naohisa Goto <ng@bioruby.org>,
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+
11
+ require 'bio/sequence'
12
+ require 'bio/sequence/adapter'
13
+
14
+ # Internal use only. Normal users should not use this module.
15
+ #
16
+ # Bio::GenBank to Bio::Sequence adapter module.
17
+ # It is internally used in Bio::GenBank#to_biosequence.
18
+ #
19
+ module Bio::Sequence::Adapter::GenBank
20
+
21
+ extend Bio::Sequence::Adapter
22
+
23
+ private
24
+
25
+ def_biosequence_adapter :seq
26
+
27
+ def_biosequence_adapter :id_namespace do |orig|
28
+ if /\_/ =~ orig.accession.to_s then
29
+ 'RefSeq'
30
+ else
31
+ 'GenBank'
32
+ end
33
+ end
34
+
35
+ def_biosequence_adapter :entry_id
36
+
37
+ def_biosequence_adapter :primary_accession, :accession
38
+
39
+ def_biosequence_adapter :secondary_accessions do |orig|
40
+ orig.accessions - [ orig.accession ]
41
+ end
42
+
43
+ def_biosequence_adapter :other_seqids do |orig|
44
+ if /GI\:(.+)/ =~ orig.gi.to_s then
45
+ [ Bio::Sequence::DBLink.new('GI', $1) ]
46
+ else
47
+ nil
48
+ end
49
+ end
50
+
51
+ def_biosequence_adapter :molecule_type, :natype
52
+
53
+ def_biosequence_adapter :division
54
+
55
+ def_biosequence_adapter :topology, :circular
56
+
57
+ def_biosequence_adapter :strandedness
58
+
59
+ def_biosequence_adapter :sequence_version, :version
60
+
61
+ #--
62
+ #sequence.date_created = nil #????
63
+ #++
64
+
65
+ def_biosequence_adapter :date_modified
66
+
67
+ def_biosequence_adapter :definition
68
+
69
+ def_biosequence_adapter :keywords
70
+
71
+ def_biosequence_adapter :species, :organism
72
+
73
+ def_biosequence_adapter :classification
74
+
75
+ #--
76
+ #sequence.organelle = nil # yet unsupported
77
+ #++
78
+
79
+ def_biosequence_adapter :comments, :comment
80
+
81
+ def_biosequence_adapter :references
82
+
83
+ def_biosequence_adapter :features
84
+
85
+ end #module Bio::Sequence::Adapter::GenBank
86
+
@@ -4,154 +4,1826 @@
4
4
  # Copyright:: Copyright (C) 2003, 2005
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
7
+ # 2008 Naohisa Goto <ng@bioruby.org>
7
8
  # License:: The Ruby License
8
9
  #
9
- # $Id: gff.rb,v 1.9 2007/05/18 15:23:42 k Exp $
10
+ # $Id:$
10
11
  #
12
+ require 'uri'
13
+ require 'strscan'
14
+ require 'enumerator'
15
+ require 'bio/db/fasta'
11
16
 
12
17
  module Bio
13
- # == DESCRIPTION
14
- # The Bio::GFF and Bio::GFF::Record classes describe data contained in a
15
- # GFF-formatted file. For information on the GFF format, see
16
- # http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
17
- # delimited format, including
18
- # * seqname
19
- # * source
20
- # * feature
21
- # * start
22
- # * end
23
- # * score
24
- # * strand
25
- # * frame
26
- # * attributes (optional)
27
- #
28
- # For example:
29
- # SEQ1 EMBL atg 103 105 . + 0
30
- # SEQ1 EMBL exon 103 172 . + 0
31
- # SEQ1 EMBL splice5 172 173 . + .
32
- # SEQ1 netgene splice5 172 173 0.94 + .
33
- # SEQ1 genie sp5-20 163 182 2.3 + .
34
- # SEQ1 genie sp5-10 168 177 2.1 + .
35
- # SEQ1 grail ATG 17 19 2.1 - 0
36
- #
37
- # The Bio::GFF object is a container for Bio::GFF::Record objects, each
38
- # representing a single line in the GFF file.
39
- class GFF
40
- # Creates a Bio::GFF object by building a collection of Bio::GFF::Record
41
- # objects.
18
+ # == DESCRIPTION
19
+ # The Bio::GFF and Bio::GFF::Record classes describe data contained in a
20
+ # GFF-formatted file. For information on the GFF format, see
21
+ # http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
22
+ # delimited format, including
23
+ # * seqname
24
+ # * source
25
+ # * feature
26
+ # * start
27
+ # * end
28
+ # * score
29
+ # * strand
30
+ # * frame
31
+ # * attributes (optional)
42
32
  #
43
- # Create a Bio::GFF object the hard way
44
- # this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
45
- # this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
46
- # this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
47
- # this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
48
- # this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
49
- # this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
50
- # this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
51
- # p Bio::GFF.new(this_gff)
52
- #
53
- # or create one based on a GFF-formatted file:
54
- # p Bio::GFF.new(File.open('my_data.gff')
55
- # ---
56
- # *Arguments*:
57
- # * _str_: string in GFF format
58
- # *Returns*:: Bio::GFF object
59
- def initialize(str = '')
60
- @records = Array.new
61
- str.each_line do |line|
62
- @records << Record.new(line)
33
+ # For example:
34
+ # SEQ1 EMBL atg 103 105 . + 0
35
+ # SEQ1 EMBL exon 103 172 . + 0
36
+ # SEQ1 EMBL splice5 172 173 . + .
37
+ # SEQ1 netgene splice5 172 173 0.94 + .
38
+ # SEQ1 genie sp5-20 163 182 2.3 + .
39
+ # SEQ1 genie sp5-10 168 177 2.1 + .
40
+ # SEQ1 grail ATG 17 19 2.1 - 0
41
+ #
42
+ # The Bio::GFF object is a container for Bio::GFF::Record objects, each
43
+ # representing a single line in the GFF file.
44
+ class GFF
45
+ # Creates a Bio::GFF object by building a collection of Bio::GFF::Record
46
+ # objects.
47
+ #
48
+ # Create a Bio::GFF object the hard way
49
+ # this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
50
+ # this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
51
+ # this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
52
+ # this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
53
+ # this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
54
+ # this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
55
+ # this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
56
+ # p Bio::GFF.new(this_gff)
57
+ #
58
+ # or create one based on a GFF-formatted file:
59
+ # p Bio::GFF.new(File.open('my_data.gff')
60
+ # ---
61
+ # *Arguments*:
62
+ # * _str_: string in GFF format
63
+ # *Returns*:: Bio::GFF object
64
+ def initialize(str = '')
65
+ @records = Array.new
66
+ str.each_line do |line|
67
+ @records << Record.new(line)
68
+ end
63
69
  end
64
- end
65
70
 
66
- # An array of Bio::GFF::Record objects.
67
- attr_accessor :records
71
+ # An array of Bio::GFF::Record objects.
72
+ attr_accessor :records
68
73
 
69
- # Represents a single line of a GFF-formatted file. See Bio::GFF for more
70
- # information.
71
- class Record
74
+ # Represents a single line of a GFF-formatted file. See Bio::GFF for more
75
+ # information.
76
+ class Record
72
77
 
73
- # Name of the reference sequence
74
- attr_accessor :seqname
78
+ # Name of the reference sequence
79
+ attr_accessor :seqname
75
80
 
76
- # Name of the source of the feature (e.g. program that did prediction)
77
- attr_accessor :source
81
+ # Name of the source of the feature (e.g. program that did prediction)
82
+ attr_accessor :source
78
83
 
79
- # Name of the feature
80
- attr_accessor :feature
84
+ # Name of the feature
85
+ attr_accessor :feature
81
86
 
82
- # Start position of feature on reference sequence
83
- attr_accessor :start
87
+ # Start position of feature on reference sequence
88
+ attr_accessor :start
84
89
 
85
- # End position of feature on reference sequence
86
- attr_accessor :end
90
+ # End position of feature on reference sequence
91
+ attr_accessor :end
87
92
 
88
- # Score of annotation (e.g. e-value for BLAST search)
89
- attr_accessor :score
93
+ # Score of annotation (e.g. e-value for BLAST search)
94
+ attr_accessor :score
90
95
 
91
- # Strand that feature is located on
92
- attr_accessor :strand
96
+ # Strand that feature is located on
97
+ attr_accessor :strand
93
98
 
94
- # For features of type 'exon': indicates where feature begins in the reading frame
95
- attr_accessor :frame
99
+ # For features of type 'exon': indicates where feature begins in the reading frame
100
+ attr_accessor :frame
96
101
 
97
- # List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
98
- attr_accessor :attributes
102
+ # List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
103
+ attr_accessor :attributes
99
104
 
100
- # Comments for the GFF record
101
- attr_accessor :comments
105
+ # Comments for the GFF record
106
+ attr_accessor :comment
102
107
 
103
- # Creates a Bio::GFF::Record object. Is typically not called directly, but
104
- # is called automatically when creating a Bio::GFF object.
105
- # ---
106
- # *Arguments*:
107
- # * _str_: a tab-delimited line in GFF format
108
- def initialize(str)
109
- @comments = str.chomp[/#.*/]
110
- return if /^#/.match(str)
111
- @seqname, @source, @feature, @start, @end, @score, @strand, @frame,
112
- attributes, = str.chomp.split("\t")
113
- @attributes = parse_attributes(attributes) if attributes
114
- end
108
+ # "comments" is deprecated. Instead, use "comment".
109
+ def comments
110
+ #warn "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"." if $VERBOSE
111
+ self.comment
112
+ end
115
113
 
116
- private
114
+ # "comments=" is deprecated. Instead, use "comment=".
115
+ def comments=(str)
116
+ #warn "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"." if $VERBOSE
117
+ self.comment = str
118
+ end
117
119
 
118
- def parse_attributes(attributes)
119
- hash = Hash.new
120
- attributes.split(/[^\\];/).each do |atr|
121
- key, value = atr.split(' ', 2)
122
- hash[key] = value
120
+ # Creates a Bio::GFF::Record object. Is typically not called directly, but
121
+ # is called automatically when creating a Bio::GFF object.
122
+ # ---
123
+ # *Arguments*:
124
+ # * _str_: a tab-delimited line in GFF format
125
+ def initialize(str)
126
+ @comment = str.chomp[/#.*/]
127
+ return if /^#/.match(str)
128
+ @seqname, @source, @feature, @start, @end, @score, @strand, @frame,
129
+ attributes, = str.chomp.split("\t")
130
+ @attributes = parse_attributes(attributes) if attributes
123
131
  end
124
- return hash
125
- end
126
- end
127
132
 
128
- # = DESCRIPTION
129
- # Represents version 2 of GFF specification. Is completely implemented by the
130
- # Bio::GFF class.
131
- class GFF2 < GFF
132
- VERSION = 2
133
- end
133
+ private
134
134
 
135
- # = DESCRIPTION
136
- # Represents version 3 of GFF specification. Is completely implemented by the
137
- # Bio::GFF class. For more information on version GFF3, see
138
- # http://flybase.bio.indiana.edu/annot/gff3.html
139
- class GFF3 < GFF
140
- VERSION = 3
135
+ def parse_attributes(attributes)
136
+ hash = Hash.new
141
137
 
142
- private
138
+ sc = StringScanner.new(attributes)
139
+ attrs = []
140
+ token = ''
141
+ while !sc.eos?
142
+ if sc.scan(/[^\\\;\"]+/) then
143
+ token.concat sc.matched
144
+ elsif sc.scan(/\;/) then
145
+ attrs.push token unless token.empty?
146
+ token = ''
147
+ elsif sc.scan(/\"/) then
148
+ origtext = sc.matched
149
+ while !sc.eos?
150
+ if sc.scan(/[^\\\"]+/) then
151
+ origtext.concat sc.matched
152
+ elsif sc.scan(/\"/) then
153
+ origtext.concat sc.matched
154
+ break
155
+ elsif sc.scan(/\\([\"\\])/) then
156
+ origtext.concat sc.matched
157
+ elsif sc.scan(/\\/) then
158
+ origtext.concat sc.matched
159
+ else
160
+ raise 'Bug: should not reach here'
161
+ end
162
+ end
163
+ token.concat origtext
164
+ elsif sc.scan(/\\\;/) then
165
+ token.concat sc.matched
166
+ elsif sc.scan(/\\/) then
167
+ token.concat sc.matched
168
+ else
169
+ raise 'Bug: should not reach here'
170
+ end #if
171
+ end #while
172
+ attrs.push token unless token.empty?
143
173
 
144
- def parse_attributes(attributes)
145
- hash = Hash.new
146
- attributes.split(/[^\\];/).each do |atr|
147
- key, value = atr.split('=', 2)
148
- hash[key] = value
174
+ attrs.each do |x|
175
+ key, value = x.split(' ', 2)
176
+ key.strip!
177
+ value.strip! if value
178
+ hash[key] = value
179
+ end
180
+ hash
149
181
  end
150
- return hash
151
- end
152
- end
153
182
 
154
- end # class GFF
183
+ end #Class Record
184
+
185
+ # = DESCRIPTION
186
+ # Represents version 2 of GFF specification.
187
+ # Its behavior is somehow different from Bio::GFF,
188
+ # especially for attributes.
189
+ #
190
+ class GFF2 < GFF
191
+ VERSION = 2
192
+
193
+ # string representation of the whole entry.
194
+ def to_s
195
+ ver = @gff_version || VERSION.to_s
196
+ ver = ver.gsub(/[\r\n]+/, ' ')
197
+ ([ "##gff-version #{ver}\n" ] +
198
+ @metadata.collect { |m| m.to_s } +
199
+ @records.collect{ |r| r.to_s }).join('')
200
+ end
201
+
202
+ # Private methods for GFF2 escaping characters.
203
+ # Internal only. Users should not use this module directly.
204
+ module Escape
205
+ # unsafe characters to be escaped
206
+ UNSAFE_GFF2 = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] \x80-\xfd><;=,%^&\|`]/n
207
+
208
+ # GFF2 standard identifier
209
+ IDENTIFIER_GFF2 = /\A[A-Za-z][A-Za-z0-9_]*\z/n
210
+
211
+ # GFF2 numeric value
212
+ NUMERIC_GFF2 = /\A[-+]?([0-9]+|[0-9]*\.[0-9]*)([eE][+-]?[0-9]+)?\z/n
213
+
214
+ # List of 1-letter special backslash code.
215
+ # The letters other than listed here are the same as
216
+ # those of without backslash, except for "x" and digits.
217
+ # (Note that \u (unicode) is not supported.)
218
+ BACKSLASH = {
219
+ 't' => "\t",
220
+ 'n' => "\n",
221
+ 'r' => "\r",
222
+ 'f' => "\f",
223
+ 'b' => "\b",
224
+ 'a' => "\a",
225
+ 'e' => "\e",
226
+ 'v' => "\v",
227
+ # 's' => " ",
228
+ }.freeze
229
+
230
+ # inverted hash of BACKSLASH
231
+ CHAR2BACKSLASH = BACKSLASH.invert.freeze
232
+
233
+ # inverted hash of BACKSLASH, including double quote and backslash
234
+ CHAR2BACKSLASH_EXTENDED =
235
+ CHAR2BACKSLASH.merge({ '"' => '"', "\\" => "\\" }).freeze
236
+
237
+ # prohibited characters in GFF2 columns
238
+ PROHIBITED_GFF2_COLUMNS = /[\t\r\n\x00-\x1f\x7f\xfe\xff]/
239
+
240
+ # prohibited characters in GFF2 attribute tags
241
+ PROHIBITED_GFF2_TAGS = /[\s\"\;\t\r\n\x00-\x1f\x7f\xfe\xff]/
242
+
243
+ private
244
+ # (private) escapes GFF2 free text string
245
+ def escape_gff2_freetext(str)
246
+ '"' + str.gsub(UNSAFE_GFF2) do |x|
247
+ "\\" + (CHAR2BACKSLASH_EXTENDED[x] || char2octal(x))
248
+ end + '"'
249
+ end
250
+
251
+ # (private) "x" => "\\oXXX"
252
+ # "x" must be a letter.
253
+ # If "x" is consisted of two bytes or more, joined with "\\".
254
+ def char2octal(x)
255
+ x.enum_for(:each_byte).collect { |y|
256
+ sprintf("%03o", y) }.join("\\")
257
+ end
258
+
259
+ # (private) escapes GFF2 attribute value string
260
+ def escape_gff2_attribute_value(str)
261
+ freetext?(str) ? escape_gff2_freetext(str) : str
262
+ end
263
+
264
+ # (private) check if the given string is a free text to be quoted
265
+ # by double-qoute.
266
+ def freetext?(str)
267
+ if IDENTIFIER_GFF2 =~ str or
268
+ NUMERIC_GFF2 =~ str then
269
+ false
270
+ else
271
+ true
272
+ end
273
+ end
274
+
275
+ # (private) escapes normal columns in GFF2
276
+ def gff2_column_to_s(str)
277
+ str = str.to_s
278
+ str = str.empty? ? '.' : str
279
+ str = str.gsub(PROHIBITED_GFF2_COLUMNS) do |x|
280
+ "\\" + (CHAR2BACKSLASH[x] || char2octal(x))
281
+ end
282
+ if str[0, 1] == '#' then
283
+ str[0, 1] = "\\043"
284
+ end
285
+ str
286
+ end
287
+
288
+ # (private) escapes GFF2 attribute tag string
289
+ def escape_gff2_attribute_tag(str)
290
+ str = str.to_s
291
+ str = str.empty? ? '.' : str
292
+ str = str.gsub(PROHIBITED_GFF2_TAGS) do |x|
293
+ "\\" + (CHAR2BACKSLASH[x] || char2octal(x))
294
+ end
295
+ if str[0, 1] == '#' then
296
+ str[0, 1] = "\\043"
297
+ end
298
+ str
299
+ end
300
+
301
+ # (private) dummy method, will be redefined in GFF3.
302
+ def unescape(str)
303
+ str
304
+ end
305
+ end #module Escape
306
+
307
+ # Stores GFF2 record.
308
+ class Record < GFF::Record
309
+
310
+ include Escape
311
+
312
+ # Stores GFF2 attribute's value.
313
+ class Value
314
+
315
+ include Escape
316
+
317
+ # Creates a new Value object.
318
+ # Note that the given array _values_ is directly stored in
319
+ # the object.
320
+ #
321
+ # ---
322
+ # *Arguments*:
323
+ # * (optional) _values_: Array containing String objects.
324
+ # *Returns*:: Value object.
325
+ def initialize(values = [])
326
+ @values = values
327
+ end
328
+
329
+ # Returns string representation of this Value object.
330
+ # ---
331
+ # *Returns*:: String
332
+ def to_s
333
+ @values.collect do |str|
334
+ escape_gff2_attribute_value(str)
335
+ end.join(' ')
336
+ end
337
+
338
+ # Returns all values in this object.
339
+ #
340
+ # Note that modification of the returned array would affect
341
+ # original Value object.
342
+ # ---
343
+ # *Returns*:: Array
344
+ def values
345
+ @values
346
+ end
347
+ alias to_a values
348
+
349
+ # Returns true if other == self.
350
+ # Otherwise, returns false.
351
+ def ==(other)
352
+ return false unless other.kind_of?(self.class) or
353
+ self.kind_of?(other.class)
354
+ self.values == other.values rescue super(other)
355
+ end
356
+ end #class Value
357
+
358
+
359
+ # Parses a GFF2-formatted line and returns a new
360
+ # Bio::GFF::GFF2::Record object.
361
+ def self.parse(str)
362
+ self.new.parse(str)
363
+ end
364
+
365
+ # Creates a Bio::GFF::GFF2::Record object.
366
+ # Is typically not called directly, but
367
+ # is called automatically when creating a Bio::GFF::GFF2 object.
368
+ #
369
+ # ---
370
+ # *Arguments*:
371
+ # * _str_: a tab-delimited line in GFF2 format
372
+ # *Arguments*:
373
+ # * _seqname_: seqname (String or nil)
374
+ # * _source_: source (String or nil)
375
+ # * _feature_: feature type (String)
376
+ # * _start_position_: start (Integer)
377
+ # * _end_position_: end (Integer)
378
+ # * _score_: score (Float or nil)
379
+ # * _strand_: strand (String or nil)
380
+ # * _frame_: frame (Integer or nil)
381
+ # * _attributes_: attributes (Array or nil)
382
def initialize(*arg)
  # Two call forms are supported:
  # * one argument: a tab-delimited GFF2 line, handed to #parse;
  # * otherwise: seqname, source, feature, start, end, score, strand,
  #   frame, attributes (missing trailing values become nil).
  if arg.size == 1
    parse(arg[0])
  else
    # score is captured in a local (like start/end/frame) so that the
    # conversion below does not depend on an inherited reader method.
    @seqname, @source, @feature,
      start, endp, score, @strand, frame,
      @attributes = arg
    @start = start ? start.to_i : nil
    @end   = endp  ? endp.to_i  : nil
    @score = score ? score.to_f : nil
    @frame = frame ? frame.to_i : nil
  end
  # attributes is always an Array, never nil
  @attributes ||= []
end
396
+
397
+ # Comment for the GFF record
398
+ attr_accessor :comment
399
+
400
+ # "comments" is deprecated. Instead, use "comment".
401
def comments
  # Deprecated reader: prints a deprecation warning, then returns the
  # same value as #comment.
  owner = self.class.to_s
  warn "#{owner}#comments is deprecated. Instead, use \"comment\"."
  comment
end
405
+
406
+ # "comments=" is deprecated. Instead, use "comment=".
407
def comments=(str)
  # Deprecated writer: prints a deprecation warning, then stores the
  # value through #comment=.
  owner = self.class.to_s
  warn "#{owner}#comments= is deprecated. Instead, use \"comment=\"."
  self.comment = str
end
411
+
412
+ # Parses a GFF2-formatted line and stores data from the string.
413
+ # Note that all existing data is wiped out.
414
def parse(string)
  # Fills this record from one GFF2 line.
  # * A line whose first non-blank character is "#" is a comment-only
  #   line: the text after "#" becomes the comment, all columns are nil.
  # * Otherwise the line is split on tabs into at most 10 fields:
  #   8 fixed columns, the attributes column, and an optional trailing
  #   comment field.
  if /^\s*\#/ =~ string
    @comment = string[/\#(.*)/, 1].chomp
    columns = []
  else
    columns = string.chomp.split("\t", 10)
    # The 10th field is only a comment when it contains "#". When it
    # does not, there is nothing to extract (the regexp lookup returns
    # nil), so the comment is left untouched instead of calling a
    # method on nil.
    trailing = columns[9]
    text = trailing ? trailing[/\#(.*)/, 1] : nil
    @comment = text.chomp if text
  end

  # "." marks an empty column and is stored as nil
  @seqname, @source, @feature,
    start, endp, score, @strand, frame =
    columns[0, 8].collect { |x|
      str = unescape(x)
      str == '.' ? nil : str
    }
  @start = start ? start.to_i : nil
  @end   = endp  ? endp.to_i  : nil
  @score = score ? score.to_f : nil
  @frame = frame ? frame.to_i : nil

  @attributes = parse_attributes(columns[8])
end
436
+
437
+ # Returns true if the entry is empty except for comment.
438
+ # Otherwise, returns false.
439
def comment_only?
  # true when none of the eight fixed columns holds a value and the
  # attribute list is empty; false otherwise.
  columns = [ @seqname, @source, @feature, @start,
              @end, @score, @strand, @frame ]
  return false if columns.any?
  @attributes.empty?
end
454
+
455
+ # Return the record as a GFF2 compatible string
456
def to_s
  # A non-blank comment is flattened to one line (runs of CR/LF become
  # a single space); a missing or blank comment is treated as absent.
  cmnt = false
  if @comment && !@comment.to_s.strip.empty?
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  # comment-only record: just the "#..." line
  return "\##{cmnt}\n" if cmnt && self.comment_only?

  fields = [ @seqname, @source, @feature, @start,
             @end, @score, @strand, @frame ].collect do |column|
    gff2_column_to_s(column)
  end
  fields.push attributes_to_s(@attributes)
  line = fields.join("\t")
  # the comment, when present, is appended as an extra tab-separated field
  line + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
476
+
477
+ # Returns true if self == other. Otherwise, returns false.
478
def ==(other)
  # The inherited comparison wins when it already reports equality.
  inherited = super
  return inherited if inherited
  # Otherwise both objects must be of exactly the same class and agree
  # on all eight columns and on the attributes.
  return false unless self.class == other.class
  same_fields =
    self.seqname == other.seqname &&
    self.source == other.source &&
    self.feature == other.feature &&
    self.start == other.start &&
    self.end == other.end &&
    self.score == other.score &&
    self.strand == other.strand &&
    self.frame == other.frame &&
    self.attributes == other.attributes
  same_fields ? true : false
end
491
+
492
+ # Gets the attribute value for the given tag.
493
+ #
494
+ # Note that if two or more tag-value pairs with the same name found,
495
+ # only the first value is returned.
496
+ # ---
497
+ # *Arguments*:
498
+ # * (required) _tag_: String
499
+ # *Returns*:: String, Bio::GFF::GFF2::Record::Value object, or nil.
500
def get_attribute(tag)
  # Array#assoc finds the first [tag, value] pair whose tag matches;
  # nil is returned when there is none.
  pair = @attributes.assoc(tag)
  return nil unless pair
  pair[1]
end
504
+ alias attribute get_attribute
505
+
506
+ # Gets the attribute values for the given tag.
507
+ # This method always returns an array.
508
+ # ---
509
+ # *Arguments*:
510
+ # * (required) _tag_: String
511
+ # *Returns*:: Array containing String or \
512
+ # Bio::GFF::GFF2::Record::Value objects.
513
def get_attributes(tag)
  # Collects, in order, the value of every pair whose tag matches.
  # Always returns a new array (empty when nothing matches).
  found = []
  @attributes.each do |pair|
    found.push pair[1] if pair[0] == tag
  end
  found
end
520
+
521
+ # Sets value for the given tag.
522
+ # If the tag exists, the value of the tag is replaced with _value_.
523
+ # Note that if two or more tag-value pairs with the same name found,
524
+ # only the first tag-value pair is replaced.
525
+ #
526
+ # If the tag does not exist, the tag-value pair is newly added.
527
+ # ---
528
+ # *Arguments*:
529
+ # * (required) _tag_: String
530
+ # * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
531
+ # *Returns*:: _value_
532
def set_attribute(tag, value)
  # Overwrite the value of the first pair carrying this tag; when no
  # such pair exists, append a new pair (with a copy of the tag string).
  existing = @attributes.detect { |pair| pair[0] == tag }
  if existing
    existing[1] = value
  else
    @attributes.push [ String.new(tag), value ]
  end
  value
end
544
+
545
+ # Replaces values for the given tags with new values.
546
+ # Existing values for the tag are completely wiped out and
547
+ # replaced by new tag-value pairs.
548
+ # If the tag does not exist, the tag-value pairs are newly added.
549
+ #
550
+ # ---
551
+ # *Arguments*:
552
+ # * (required) _tag_: String
553
+ # * (required) _values_: String or Bio::GFF::GFF2::Record::Value objects.
554
+ # *Returns*:: _self_
555
def replace_attributes(tag, *values)
  # Walk the pairs once: matching pairs take the new values in order,
  # matching pairs beyond the number of new values are dropped, and all
  # other pairs are kept unchanged.
  consumed = 0
  survivors = []
  @attributes.each do |pair|
    if pair[0] == tag
      next if consumed >= values.size
      pair[1] = values[consumed]
      consumed += 1
    end
    survivors.push pair
  end
  # keep the same array object, only change its content
  @attributes.replace(survivors)
  # new values that found no existing pair are appended at the end
  values[consumed..-1].each do |value|
    @attributes.push [ String.new(tag), value ]
  end
  self
end
575
+
576
+ # Adds a new tag-value pair.
577
+ # ---
578
+ # *Arguments*:
579
+ # * (required) _tag_: String
580
+ # * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
581
+ # *Returns*:: Array of all tag-value pairs (the attributes array after the new pair is appended)
582
def add_attribute(tag, value)
  # Appends a new pair (with a copy of the tag string) without checking
  # for existing pairs of the same tag. The result of the append, i.e.
  # the attributes array itself, is returned.
  pair = [ String.new(tag), value ]
  @attributes << pair
end
585
+
586
+ # Removes a specific tag-value pair.
587
+ #
588
+ # Note that if two or more tag-value pairs found,
589
+ # only the first tag-value pair is removed.
590
+ #
591
+ # ---
592
+ # *Arguments*:
593
+ # * (required) _tag_: String
594
+ # * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
595
+ # *Returns*:: if removed, _value_. Otherwise, nil.
596
def delete_attribute(tag, value)
  # Locate the first pair equal to [tag, value]; nothing to do (nil)
  # when there is none, otherwise remove it and return its value.
  position = @attributes.index([ tag, value ])
  return nil unless position
  @attributes.delete_at(position)[1]
end
604
+
605
+ # Removes all attributes with the specified tag.
606
+ #
607
+ # ---
608
+ # *Arguments*:
609
+ # * (required) _tag_: String
610
+ # *Returns*:: if removed, self. Otherwise, nil.
611
def delete_attributes(tag)
  # Array#reject! returns nil when it removed nothing, which maps
  # directly onto this method's "nil if nothing removed" result.
  changed = @attributes.reject! { |pair| pair[0] == tag }
  return nil if changed.nil?
  self
end
616
+
617
+ # Sorts attributes order by given tag name's order.
618
+ # If a block is given, the argument _tags_ is ignored, and
619
+ # yields two tag names like Array#sort!.
620
+ #
621
+ # ---
622
+ # *Arguments*:
623
+ # * (required or optional) _tags_: Array containing String objects
624
+ # *Returns*:: _self_
625
def sort_attributes_by_tag!(tags = nil)
  # Remember where every pair currently sits so that pairs comparing
  # equal keep their present relative order (the sort is made stable).
  position = {}
  last = @attributes.size
  @attributes.each_with_index { |pair, idx| position[pair] = idx }
  by_position = lambda do |a, b|
    (position[a] || last) <=> (position[b] || last)
  end

  if block_given?
    # the block compares two tag names, like the block of Array#sort!
    @attributes.sort! do |a, b|
      order = yield a[0], b[0]
      order == 0 ? by_position.call(a, b) : order
    end
  else
    unless tags
      raise ArgumentError, 'wrong number of arguments (0 for 1) or wrong argument value'
    end
    # tags not listed in _tags_ are ranked after all listed ones
    rank = lambda { |pair| tags.index(pair[0]) || tags.size }
    @attributes.sort! do |a, b|
      order = rank.call(a) <=> rank.call(b)
      order == 0 ? by_position.call(a, b) : order
    end
  end
  self
end
652
+
653
+ # Returns hash representation of attributes.
654
+ #
655
+ # Note: If two or more tag-value pairs with same tag names exist,
656
+ # only the first tag-value pair is used for each tag.
657
+ #
658
+ # ---
659
+ # *Returns*:: Hash object
660
def attributes_to_hash
  # A tag's value is stored only while the hash holds no truthy value
  # for that tag yet, so for repeated tags the first such value wins.
  @attributes.inject({}) do |table, (key, val)|
    table[key] = val unless table[key]
    table
  end
end
668
+
669
+ private
670
+
671
+ # (private) Parses attributes.
672
+ # Returns arrays
673
def parse_attributes(str)
  # no attributes column, or the "." placeholder: no attributes
  return [] unless str
  return [] if str == '.'
  # Each tokenized attribute is [tag, token, ...]. A single remaining
  # token is stored as-is; any other count (none, or two and more) is
  # wrapped in a Value object.
  parse_attributes_string(str).collect do |tokens|
    key = tokens.shift
    val = (tokens.size == 1) ? tokens[0] : Value.new(tokens)
    [ key, val ]
  end
end
683
+
684
+ # (private) Parses attributes string.
685
+ # Returns arrays
686
def parse_attributes_string(str)
  # Tokenizer for the GFF2 attributes column.
  # Returns an array with one entry per attribute; every entry is the
  # array of that attribute's tokens (tag first). Attributes are
  # separated by ";", tokens by whitespace, and a double-quoted token
  # may contain whitespace, ";" and backslash escapes.
  scanner = StringScanner.new(str)
  pairs = []     # finished attributes
  current = []   # tokens of the attribute being read
  word = ''      # unquoted token being accumulated
  until scanner.eos?
    if scanner.scan(/[^\\\;\"\s]+/)
      word << scanner.matched
    elsif scanner.scan(/\s+/)
      # whitespace ends the current unquoted token
      current.push word unless word.empty?
      word = ''
    elsif scanner.scan(/\;/)
      # ";" ends the current token and the current attribute
      current.push word unless word.empty?
      word = ''
      pairs.push current
      current = []
    elsif scanner.scan(/\"/)
      # opening quote: read free text up to the closing quote
      current.push word unless word.empty?
      word = ''
      quoted = ''
      until scanner.eos?
        if scanner.scan(/[^\\\"]+/)
          quoted << scanner.matched
        elsif scanner.scan(/\"/)
          break
        elsif scanner.scan(/\\([\"\\])/)
          # escaped quote or backslash
          quoted << scanner[1]
        elsif scanner.scan(/\\x([0-9a-fA-F][0-9a-fA-F])/n)
          # \xHH: character given by two hex digits
          quoted << scanner[1].to_i(16).chr
        elsif scanner.scan(/\\([0-7][0-7][0-7])/n)
          # \OOO: character given by three octal digits
          quoted << scanner[1].to_i(8).chr
        elsif scanner.scan(/\\([^x0-9])/n)
          # other escape: looked up in Escape::BACKSLASH, kept verbatim
          # when the table has no entry for it
          quoted << (Escape::BACKSLASH[scanner[1]] || scanner.matched)
        elsif scanner.scan(/\\/)
          quoted << scanner.matched
        else
          raise 'Bug: should not reach here'
        end
      end
      current.push quoted
    elsif scanner.scan(/\\/)
      # a backslash outside quotes is kept literally
      word << scanner.matched
    else
      raise 'Bug: should not reach here'
    end
  end
  current.push word unless word.empty?
  pairs.push current unless current.empty?
  pairs
end
743
+
744
+ # (private) string representation of attributes
745
def attributes_to_s(attr)
  # Renders every [tag, value] pair as "tag value" and joins the pairs
  # with " ; ".
  rendered = attr.collect do |pair|
    tag, val = pair
    # tags not matching the GFF2 identifier pattern are still written,
    # but reported when running in verbose mode
    unless Escape::IDENTIFIER_GFF2 =~ tag
      warn "Illegal GFF2 attribute tag: #{tag.inspect}" if $VERBOSE
    end
    tag_text = gff2_column_to_s(tag)
    # Value objects escape their own content; anything else is escaped here
    val_text = val.kind_of?(Value) ? val.to_s : escape_gff2_attribute_value(val)
    "#{tag_text} #{val_text}"
  end
  rendered.join(' ; ')
end
760
+ end #class Record
761
+
762
+ # Stores GFF2 meta-data.
763
class MetaData
  # Creates a new MetaData object from a directive name (without the
  # leading "##") and its optional data.
  def initialize(directive, data = nil)
    @directive, @data = directive, data
  end

  # Directive. Usually, one of "feature-ontology", "attribute-ontology",
  # or "source-ontology".
  attr_accessor :directive

  # Data of this entry (the text following the directive), or nil.
  attr_accessor :data

  # Parses a "##directive data" line. The line is split at the first run
  # of whitespace; the leading "##" is removed from the directive.
  def self.parse(line)
    name, rest = line.chomp.split(/\s+/, 2)
    name = name.sub(/\A\#\#/, '') if name
    new(name, rest)
  end

  # String representation: "##directive data\n", or "##directive\n" when
  # there is no data. Line breaks inside either part become spaces.
  def to_s
    head = @directive.to_s.gsub(/[\r\n]+/, ' ')
    body = @data.to_s
    tail = body.empty? ? nil : ' ' + body.gsub(/[\r\n]+/, ' ')
    "\#\##{head}#{tail}\n"
  end

  # Returns true if self == other (same class, directive and data).
  # Otherwise, returns false.
  def ==(other)
    return false unless self.class == other.class
    return false unless self.directive == other.directive
    self.data == other.data ? true : false
  end
end #class MetaData
802
+
803
+ # (private) parses metadata
804
def parse_metadata(directive, line)
  # "##gff-version": only the first version seen is kept.
  # Any other directive is stored as a MetaData object.
  if directive == 'gff-version'
    @gff_version ||= line.split(/\s+/)[1]
  else
    @metadata.push MetaData.parse(line)
  end
  true
end
813
+ private :parse_metadata
814
+
815
+ # Creates a Bio::GFF::GFF2 object by building a collection of
816
+ # Bio::GFF::GFF2::Record (and metadata) objects.
817
+ #
818
+ # ---
819
+ # *Arguments*:
820
+ # * _str_: string in GFF format
821
+ # *Returns*:: Bio::GFF::GFF2 object
822
def initialize(str = nil)
  # start empty, then load the given text (if any) through #parse
  @records, @metadata = [], []
  @gff_version = nil
  parse(str) if str
end
828
+
829
+ # GFF2 version string (String or nil). nil means "2".
830
+ attr_reader :gff_version
831
+
832
+ # Metadata (except "##gff-version").
833
+ # Must be an array of Bio::GFF::GFF2::MetaData objects.
834
+ attr_accessor :metadata
835
+
836
+ # Parses GFF2 entries and appends the parsed data to this object.
837
+ #
838
+ # ---
839
+ # *Arguments*:
840
+ # * _str_: string in GFF format
841
+ # *Returns*:: self
842
def parse(str)
  # Lines starting with "##name" are meta-data directives; every other
  # line becomes a GFF2::Record appended to the records.
  str.each_line do |line|
    directive = line[/^\#\#([^\s]+)/, 1]
    if directive
      parse_metadata(directive, line)
    else
      @records << GFF2::Record.new(line)
    end
  end
  self
end
853
+
854
+ end #class GFF2
855
+
856
+ # = DESCRIPTION
857
+ # Represents version 3 of GFF specification.
858
+ # For more information on version GFF3, see
859
+ # http://song.sourceforge.net/gff3.shtml
860
+ #--
861
+ # obsolete URL:
862
+ # http://flybase.bio.indiana.edu/annot/gff3.html
863
+ #++
864
+ class GFF3 < GFF
865
+ VERSION = 3
866
+
867
+ # Creates a Bio::GFF::GFF3 object by building a collection of
868
+ # Bio::GFF::GFF3::Record (and metadata) objects.
869
+ #
870
+ # ---
871
+ # *Arguments*:
872
+ # * _str_: string in GFF format
873
+ # *Returns*:: Bio::GFF object
874
def initialize(str = nil)
  # start empty and outside of the FASTA section, then load the given
  # text (if any) through #parse
  @gff_version = nil
  @in_fasta = false
  @records, @sequence_regions, @metadata, @sequences = [], [], [], []
  parse(str) if str
end
883
+
884
+ # GFF3 version string (String or nil). nil means "3".
885
+ attr_reader :gff_version
886
+
887
+ # Metadata of "##sequence-region".
888
+ # Must be an array of Bio::GFF::GFF3::SequenceRegion objects.
889
+ attr_accessor :sequence_regions
890
+
891
+ # Metadata (except "##sequence-region", "##gff-version", "###").
892
+ # Must be an array of Bio::GFF::GFF3::MetaData objects.
893
+ attr_accessor :metadata
894
+
895
+ # Sequences bundled within GFF3.
896
+ # Must be an array of Bio::Sequence objects.
897
+ attr_accessor :sequences
898
+
899
+ # Parses GFF3 entries and appends the parsed data to this object.
900
+ #
901
+ # Note that after "##FASTA" line is given,
902
+ # only fasta-formatted text is accepted.
903
+ #
904
+ # ---
905
+ # *Arguments*:
906
+ # * _str_: string in GFF format
907
+ # *Returns*:: self
908
def parse(str)
  # Parses GFF3 text and appends the result to this object.
  # _str_ is either a String or an object responding to :gets (treated
  # as IO-like and read line by line). Returns self.

  # if already after the ##FASTA line, parses fasta format and return
  if @in_fasta
    parse_fasta(str)
    return self
  end

  if str.respond_to?(:gets)
    # str is a IO-like object
    fst = nil
  else
    # str is a String: cut it at the first "##FASTA" directive or bare
    # ">" header, giving the GFF part, the separator and the FASTA part
    gff, sep, fst = str.split(/^(\>|##FASTA.*)/n, 2)
    # a ">" separator is itself the start of the first FASTA header
    fst = sep + fst if sep == '>' && fst
    # splitting an empty string yields no parts at all; treat that as
    # an empty GFF part instead of iterating over nil below
    str = gff || ''
  end

  # parses GFF lines
  str.each_line do |line|
    if /^\#\#([^\s]+)/ =~ line
      parse_metadata($1, line)
      parse_fasta(str) if @in_fasta
    elsif /^\>/ =~ line
      @in_fasta = true
      parse_fasta(str, line)
    else
      @records << GFF3::Record.new(line)
    end
  end

  # parses fasta format when str is a String and fasta data exists
  if fst
    @in_fasta = true
    parse_fasta(fst)
  end

  self
end
946
+
947
+ # parses fasta formatted data
948
def parse_fasta(str, line = nil)
  # Reads _str_ in chunks ending at "\n>" (one FASTA entry per chunk)
  # and appends a sequence object for each entry to @sequences.
  # _line_, when given, is prepended to the first chunk only.
  str.each_line("\n>") do |chunk|
    if line
      chunk = line + chunk
      line = nil
    end
    stripped = chunk.strip
    # skip chunks without content (blank, or only the ">" marker)
    next if stripped.empty? || stripped == '>'
    entry = Bio::FastaFormat.new(chunk)
    sequence = entry.to_seq
    # entry ID: first whitespace-delimited word of the definition line
    first_word = entry.definition.strip.split(/\s/, 2)[0]
    sequence.entry_id = unescape(first_word.to_s)
    @sequences.push sequence
  end
end
960
+ private :parse_fasta
961
+
962
+ # string representation of whole entry.
963
def to_s
  # Output order: version line, meta-data, sequence regions, records,
  # and finally (only when sequences exist) the "##FASTA" section.
  version = @gff_version || VERSION.to_s
  seqs = ''
  if @sequences.size > 0
    fasta = @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }
    seqs = "##FASTA\n" + fasta.join('')
  end

  lines = [ "##gff-version #{escape(version)}\n" ]
  lines.concat(@metadata.collect { |m| m.to_s })
  lines.concat(@sequence_regions.collect { |m| m.to_s })
  lines.concat(@records.collect { |r| r.to_s })
  lines.join('') + seqs
end
977
+
978
+ # Private methods for escaping characters.
979
+ # Internal only. Users should not use this module directly.
980
module Escape
  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  private

  # If str is empty, returns '.'. Otherwise, returns str.
  # The argument is converted with to_s first, so nil also gives '.'.
  def column_to_s(str)
    str = str.to_s
    str.empty? ? '.' : str
  end

  # Return the string corresponding to these characters unescaped:
  # every "%HH" sequence (two hex digits) is replaced by the byte it
  # denotes; all other text, including "+", is left as it is.
  #
  # Implemented locally because URI.unescape is obsolete and no longer
  # available in Ruby 3.0 and later.
  def unescape(string)
    encoding = nil
    if string.respond_to?(:encoding)
      # decoded bytes are tagged with the encoding of the input
      # (UTF-8 for US-ASCII input) so they can be joined with the rest
      encoding = string.encoding
      encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII
    end
    string.gsub(/%[a-fA-F\d]{2}/) do |seq|
      byte = [ seq[1, 2] ].pack('H2')
      encoding ? byte.force_encoding(encoding) : byte
    end
  end

  # Replaces every match of the regexp _unsafe_ in _string_ with its
  # bytes written as uppercase "%HH" sequences.
  #
  # Implemented locally because URI.escape is obsolete and no longer
  # available in Ruby 3.0 and later.
  def percent_encode(string, unsafe)
    string.gsub(unsafe) do |chars|
      chars.unpack('C*').collect { |byte| format('%%%02X', byte) }.join
    end
  end

  # Escape a column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape(string)
    percent_encode(string, UNSAFE)
  end

  # Escape seqid column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape_seqid(string)
    percent_encode(string, UNSAFE_SEQID)
  end

  # Escape attribute according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  # In addition to the normal escape rule, the following characters
  # are escaped: ",=;".
  # Returns the string corresponding to these characters escaped.
  def escape_attribute(string)
    percent_encode(string, UNSAFE_ATTRIBUTE)
  end
end #module Escape
1025
+
1026
+ include Escape
1027
+
1028
+ # Stores meta-data "##sequence-region seqid start end".
1029
class SequenceRegion
  include Escape

  # Creates a new SequenceRegion object.
  # _start_ and _endpos_ are converted with to_i unless they are nil
  # (or false), in which case nil is stored.
  def initialize(seqid, start, endpos)
    @seqid = seqid
    @start = start ? start.to_i : nil
    @end = endpos ? endpos.to_i : nil
  end

  # Parses a "##sequence-region seqid start end" line and returns a new
  # SequenceRegion object. The line is split on whitespace into at most
  # four fields; the first one (the directive itself) is ignored and
  # "%HH" sequences in the others are decoded.
  def self.parse(str)
    _directive, seqid, start, endpos =
      str.chomp.split(/\s+/, 4).collect { |x| percent_decode(x) }
    self.new(seqid, start, endpos)
  end

  # (private) Replaces every "%HH" sequence in _str_ by the byte it
  # denotes. Used instead of URI.unescape, which is obsolete and no
  # longer available in Ruby 3.0 and later.
  def self.percent_decode(str)
    enc = str.respond_to?(:encoding) ? str.encoding : nil
    enc = Encoding::UTF_8 if enc && enc == Encoding::US_ASCII
    str.gsub(/%[a-fA-F\d]{2}/) do |seq|
      byte = [ seq[1, 2] ].pack('H2')
      enc ? byte.force_encoding(enc) : byte
    end
  end
  private_class_method :percent_decode

  # sequence ID
  attr_accessor :seqid

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # string representation: "##sequence-region seqid start end\n",
  # with each field escaped and empty fields written as "."
  def to_s
    i = escape_seqid(column_to_s(@seqid))
    s = escape_seqid(column_to_s(@start))
    e = escape_seqid(column_to_s(@end))
    "##sequence-region #{i} #{s} #{e}\n"
  end

  # Returns true if self == other (same class, seqid, start and end).
  # Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.seqid == self.seqid and
        other.start == self.start and
        other.end == self.end then
      true
    else
      false
    end
  end
end #class SequenceRegion
1075
+
1076
+ # Represents a single line of a GFF3-formatted file.
1077
+ # See Bio::GFF::GFF3 for more information.
1078
+ class Record < GFF2::Record
1079
+
1080
+ include GFF3::Escape
1081
+
1082
+ # shortcut to the ID attribute
1083
def id
  # value of the first "ID" attribute, as returned by get_attribute
  tag = 'ID'
  get_attribute(tag)
end
1086
+
1087
+ # set ID attribute
1088
def id=(str)
  # stores _str_ as the "ID" attribute through set_attribute
  tag = 'ID'
  set_attribute(tag, str)
end
1091
+
1092
+ # aliases for Column 1 (formerly "seqname")
1093
+ alias seqid seqname
1094
+ alias seqid= seqname=
1095
+
1096
+ # aliases for Column 3 (formerly "feature").
1097
+ # In the GFF3 document http://song.sourceforge.net/gff3.shtml,
1098
+ # column3 is called "type", but we used "feature_type"
1099
+ # because "type" is already used by Ruby itself.
1100
+ alias feature_type feature
1101
+ alias feature_type= feature=
1102
+
1103
+ # aliases for Column 8
1104
+ alias phase frame
1105
+ alias phase= frame=
1106
+
1107
+ # Parses a GFF3-formatted line and returns a new
1108
+ # Bio::GFF::GFF3::Record object.
1109
def self.parse(str)
  # Build an empty record, fill it from the line, and hand back the
  # record itself; the result of the instance-level #parse is not
  # returned because it is not the record.
  record = self.new
  record.parse(str)
  record
end
1112
+
1113
+ # Creates a Bio::GFF::GFF3::Record object.
1114
+ # Is typically not called directly, but
1115
+ # is called automatically when creating a Bio::GFF::GFF3 object.
1116
+ #
1117
+ # ---
1118
+ # *Arguments*:
1119
+ # * _str_: a tab-delimited line in GFF3 format
1120
+ # *Arguments*:
1121
+ # * _seqid_: sequence ID (String or nil)
1122
+ # * _source_: source (String or nil)
1123
+ # * _feature_type_: type of feature (String)
1124
+ # * _start_position_: start (Integer)
1125
+ # * _end_position_: end (Integer)
1126
+ # * _score_: score (Float or nil)
1127
+ # * _strand_: strand (String or nil)
1128
+ # * _phase_: phase (Integer or nil)
1129
+ # * _attributes_: attributes (Array or nil)
1130
def initialize(*arg)
  # no GFF3-specific setup: all arguments go unchanged to the
  # superclass constructor
  super
end
1133
+
1134
+ # Parses a GFF3-formatted line and stores data from the string.
1135
+ # Note that all existing data is wiped out.
1136
def parse(string)
  # no GFF3-specific parsing: the line goes unchanged to the
  # superclass implementation
  super(string)
end
1139
+
1140
+ # Return the record as a GFF3 compatible string
1141
def to_s
  # A non-blank comment is flattened to one line (runs of CR/LF become
  # a single space); a missing or blank comment is treated as absent.
  cmnt = false
  if @comment && !@comment.to_s.strip.empty?
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  # comment-only record: just the "#..." line
  return "\##{cmnt}\n" if cmnt && self.comment_only?

  # column 1 uses the seqid escaping rule, columns 2-8 the normal rule
  fields = [ escape_seqid(column_to_s(@seqname)) ]
  [ @source, @feature, @start, @end,
    @score, @strand, @frame ].each do |column|
    fields.push escape(column_to_s(column))
  end
  fields.push attributes_to_s(@attributes)
  line = fields.join("\t")
  # the comment, when present, is appended as an extra tab-separated field
  line + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
1161
+
1162
+ # Bio::GFF::GFF3::Record::Target is a class to store
1163
+ # data of "Target" attribute.
1164
class Target
  include GFF3::Escape

  # Creates a new Target object.
  # _start_ and _endpos_ are converted with to_i unless they are nil
  # (or false), in which case nil is stored.
  def initialize(target_id, start, endpos, strand = nil)
    @target_id = target_id
    @start = start ? start.to_i : nil
    @end = endpos ? endpos.to_i : nil
    @strand = strand
  end

  # target ID
  attr_accessor :target_id

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # strand (optional). Normally, "+" or "-", or nil.
  attr_accessor :strand

  # parses "target_id start end [strand]"-style string
  # (for example, "ABC789 123 456 +")
  # and creates a new Target object.
  # The string is split on spaces into at most four fields and "%HH"
  # sequences in each field are decoded.
  def self.parse(str)
    target_id, start, endpos, strand =
      str.split(/ +/, 4).collect { |x| percent_decode(x) }
    self.new(target_id, start, endpos, strand)
  end

  # (private) Replaces every "%HH" sequence in _str_ by the byte it
  # denotes; other text, including "+", is kept. Used instead of
  # URI.unescape, which is obsolete and no longer available in
  # Ruby 3.0 and later.
  def self.percent_decode(str)
    enc = str.respond_to?(:encoding) ? str.encoding : nil
    enc = Encoding::UTF_8 if enc && enc == Encoding::US_ASCII
    str.gsub(/%[a-fA-F\d]{2}/) do |seq|
      byte = [ seq[1, 2] ].pack('H2')
      enc ? byte.force_encoding(enc) : byte
    end
  end
  private_class_method :percent_decode

  # returns a string in "target_id start end [strand]" form;
  # the strand (and its leading space) is omitted when empty
  def to_s
    i = escape_seqid(column_to_s(@target_id))
    s = escape_attribute(column_to_s(@start))
    e = escape_attribute(column_to_s(@end))
    strnd = escape_attribute(@strand.to_s)
    strnd = " " + strnd unless strnd.empty?
    "#{i} #{s} #{e}#{strnd}"
  end

  # Returns true if self == other (same class, target_id, start, end
  # and strand). Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.target_id == self.target_id and
        other.start == self.start and
        other.end == self.end and
        other.strand == self.strand then
      true
    else
      false
    end
  end
end #class Target
1220
+
1221
+ # Bio::GFF::GFF3::Record::Gap is a class to store
1222
+ # data of "Gap" attribute.
1223
+ class Gap
1224
+
1225
+ # Code is a class to store length of single-letter code.
1226
Code = Struct.new(:code, :length)

# Code pairs a single-letter operation code with its length,
# i.e. one element of a "Gap" attribute such as "M8".
class Code
  # 1-letter code (Symbol). One of :M, :I, :D, :F, or :R is expected.
  attr_reader :code if false #dummy for RDoc

  # length (Integer)
  attr_reader :length if false #dummy for RDoc

  # Text form: the code immediately followed by the length, e.g. "M8".
  def to_s
    code.to_s + length.to_s
  end
end #class code
1240
+
1241
+ # Creates a new Gap object.
1242
+ #
1243
+ # ---
1244
+ # *Arguments*:
1245
+ # * _str_: a formatted string, or nil.
1246
def initialize(str = nil)
  # Builds the list of Code objects from a Gap string such as
  # "M8 D3 M6". Without a string the list starts empty.
  if str
    @data = str.split(/ +/).collect do |x|
      # a token is one uppercase letter followed by a decimal length
      if /\A([A-Z])([0-9]+)\z/ =~ x.strip
        Code.new($1.intern, $2.to_i)
      else
        # unrecognized tokens are dropped; in verbose mode the
        # offending token is reported in its inspected form
        warn "ignored unknown token: #{x.inspect}" if $VERBOSE
        nil
      end
    end
    @data.compact!
  else
    @data = []
  end
end
1261
+
1262
+ # Same as new(str).
1263
def self.parse(str)
  # plain synonym for new(str)
  new(str)
end
1266
+
1267
+ # (private method)
1268
+ # Scans gaps and returns an array of Code objects
1269
def __scan_gap(str, gap_regexp = /[^a-zA-Z]/,
               code_i = :I, code_m = :M)
  # Splits _str_ into alternating runs: text not matching gap_regexp
  # becomes a Code with code_m, consecutive gap matches are merged into
  # one Code with code_i. Returns the Code objects in order.
  scanner = StringScanner.new(str)
  codes = []
  loop do
    # advance up to and including the next gap match
    consumed = scanner.skip_until(gap_regexp)
    break unless consumed
    residues = consumed - scanner.matched_size
    codes.push Code.new(code_m, residues) if residues > 0
    gap = Code.new(code_i, scanner.matched_size)
    # absorb gap matches that follow immediately
    while (extra = scanner.skip(gap_regexp))
      gap.length += extra
    end
    codes.push gap
  end
  # whatever is left after the last gap is a final non-gap run
  tail = scanner.rest_size
  codes.push Code.new(code_m, tail) if tail > 0
  codes
end
1288
+ private :__scan_gap
1289
+
1290
+ # (private method)
1291
+ # Parses given reference-target sequence alignment and
1292
+ # initializes self. Existing data will be erased.
1293
def __initialize_from_sequences_na(reference, target,
                                   gap_regexp = /[^a-zA-Z]/)
  # Both sequences are first reduced to runs: in the reference a gap
  # run is :I, in the target a gap run is :D, anything else is :M.
  ref_codes = __scan_gap(reference, gap_regexp, :I, :M)
  tgt_codes = __scan_gap(target, gap_regexp, :D, :M)
  merged = []

  until ref_codes.empty? || tgt_codes.empty?
    ref = ref_codes.shift
    tgt = tgt_codes.shift
    # Cut the longer run down to the shorter one's length and push the
    # remainder back, so both runs cover the same columns.
    surplus = ref.length - tgt.length
    if surplus > 0
      ref_codes.unshift Code.new(ref.code, surplus)
      ref.length = tgt.length
    elsif surplus < 0
      tgt_codes.unshift Code.new(tgt.code, -surplus)
      tgt.length = ref.length
    end
    if ref.code == :M
      if tgt.code == :M
        merged.push ref
      elsif tgt.code == :D
        merged.push tgt
      else
        raise 'Bug: should not reach here.'
      end
    elsif ref.code == :I
      if tgt.code == :M
        merged.push ref
      elsif tgt.code == :D
        # This site is ignored,
        # because both reference and target are gap
      else
        raise 'Bug: should not reach here.'
      end
    end
  end

  # reference residues beyond the end of the target: one :D run
  ref_rest = ref_codes.inject(0) do |sum, c|
    c.code == :M ? sum + c.length : sum
  end
  merged.push Code.new(:D, ref_rest) if ref_rest > 0

  # target residues beyond the end of the reference: one :I run
  tgt_rest = tgt_codes.inject(0) do |sum, c|
    c.code == :M ? sum + c.length : sum
  end
  merged.push Code.new(:I, tgt_rest) if tgt_rest > 0

  @data = merged
  true
end
1350
+ private :__initialize_from_sequences_na
1351
+
1352
+ # Creates a new Gap object from given sequence alignment.
1353
+ #
1354
+ # Note that sites of which both reference and target are gaps
1355
+ # are silently removed.
1356
+ #
1357
+ # ---
1358
+ # *Arguments*:
1359
+ # * _reference_: reference sequence (nucleotide sequence)
1360
+ # * _target_: target sequence (nucleotide sequence)
1361
+ # * <I>gap_regexp</I>: regexp to identify gap
1362
def self.new_from_sequences_na(reference, target,
                               gap_regexp = /[^a-zA-Z]/)
  # create an empty object, then let its private initializer fill it
  # from the alignment
  gap = self.new
  gap.__send__(:__initialize_from_sequences_na,
               reference, target, gap_regexp)
  gap
end
1371
+
1372
+ # (private method)
1373
+ # scans a codon or gap in reference sequence
1374
def __scan_codon(sc_ref,
                 gap_regexp, space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  # Reads characters from the scanner until one codon is complete.
  # A codon needs 3 characters, plus one more per forward frameshift
  # mark and one less per reverse frameshift mark met on the way.
  # Returns [number of gap characters, net frameshift count].
  taken = 0   # characters counted towards the codon
  gaps = 0
  shift = 0

  while taken < 3 + shift
    char = sc_ref.scan(/./mn)
    break unless char
    if space_regexp === char
      # ignored
    elsif forward_frameshift_regexp === char
      # next char is forward frameshift
      shift += 1
    elsif reverse_frameshift_regexp === char
      # next char is reverse frameshift
      shift -= 1
    else
      gaps += 1 if gap_regexp === char
      taken += 1
    end
  end

  # input ended early: the missing characters count as gaps
  wanted = 3 + shift
  gaps += wanted - taken if taken < wanted
  return gaps, shift
end
1404
+ private :__scan_codon
1405
+
1406
+ # (private method)
1407
+ # internal use only
1408
def __push_code_to_data(cur, data, code, len)
  # Same code as the current run: just lengthen that run.
  if cur && cur.code == code
    cur.length += len
    return cur
  end
  # Otherwise start a new run, append it, and make it the current one.
  fresh = Code.new(code, len)
  data.push fresh
  return fresh
end
1417
+ private :__push_code_to_data
1418
+
1419
+ # (private method)
1420
+ # Parses given reference(nuc)-target(amino) sequence alignment and
1421
+ # initializes self. Existing data will be erased.
1422
def __initialize_from_sequences_na_aa(reference, target,
                                      gap_regexp = /[^a-zA-Z]/,
                                      space_regexp = /\s/,
                                      forward_frameshift_regexp =
                                      /\>/,
                                      reverse_frameshift_regexp =
                                      /\</)

  # The target (amino acid) sequence drives the scan; for each target
  # position one codon is consumed from the reference (nucleotide)
  # sequence through __scan_codon.
  codes = []
  ref_scanner = StringScanner.new(reference)
  tgt_scanner = StringScanner.new(target)
  one_char = /./mn

  # current (last) run; __push_code_to_data extends it or starts a new one
  last = nil
  emit = lambda do |code, count|
    last = __push_code_to_data(last, codes, code, count)
  end
  # net frameshift: negative means reverse (:R), positive forward (:F)
  emit_shift = lambda do |shift|
    if shift < 0
      emit.call(:R, -shift)
    elsif shift > 0
      emit.call(:F, shift)
    end
  end
  next_codon = lambda do
    __scan_codon(ref_scanner,
                 gap_regexp,
                 space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  end

  until tgt_scanner.eos?
    if tgt_scanner.skip(space_regexp)
      # ignored
    elsif (count = tgt_scanner.skip(forward_frameshift_regexp))
      # forward frameshift in the target: skip as many reference chars
      emit.call(:F, count)
      count.times { ref_scanner.scan(one_char) }

    elsif (count = tgt_scanner.skip(reverse_frameshift_regexp))
      # reverse frameshift in the target: step the reference back,
      # but never before its beginning
      emit.call(:R, count)
      rewound = ref_scanner.pos - count
      if rewound < 0
        warn "Incorrect reverse frameshift" if $VERBOSE
        rewound = 0
      end
      ref_scanner.pos = rewound

    elsif (count = tgt_scanner.skip(gap_regexp))
      # gap in the target: one reference codon per gap character
      count.times do
        ref_gaps, ref_fs = next_codon.call
        case ref_gaps
        when 3
          # both ref and tgt are gap. ignored the site
        when 2, 1
          # forward frameshift inserted
          ref_fs += (3 - ref_gaps)
        when 0
          emit.call(:D, 1)
        else
          raise 'Bug: should not reach here'
        end
        emit_shift.call(ref_fs)
      end
    elsif tgt_scanner.skip(one_char)
      # always 1-letter
      ref_gaps, ref_fs = next_codon.call
      case ref_gaps
      when 3
        emit.call(:I, 1)
      when 2, 1, 0
        # reverse frameshift inserted when gaps exist
        ref_fs -= ref_gaps
        # normal site
        emit.call(:M, 1)
      else
        raise 'Bug: should not reach here'
      end
      emit_shift.call(ref_fs)
    else
      raise 'Bug: should not reach here'
    end
  end

  # Reference left over after the target ended: strip spaces, frameshift
  # marks and gaps; whole codons become :D, a remainder becomes :F.
  if ref_scanner.rest_size > 0
    rest = ref_scanner.scan(/.*/mn)
    rest.gsub!(space_regexp, '')
    rest.gsub!(forward_frameshift_regexp, '')
    rest.gsub!(reverse_frameshift_regexp, '')
    rest.gsub!(gap_regexp, '')
    whole = rest.length.div(3)
    emit.call(:D, whole) if whole > 0
    partial = rest.length % 3
    emit.call(:F, partial) if partial > 0
  end

  @data = codes
  self
end
1520
+ private :__initialize_from_sequences_na_aa
1521
+
1522
+ # Creates a new Gap object from given sequence alignment.
1523
+ #
1524
+ # Note that sites of which both reference and target are gaps
1525
+ # are silently removed.
1526
+ #
1527
+ # For incorrect alignments that break 3:1 rule,
1528
+ # gap positions will be moved inside codons,
1529
+ # unwanted gaps will be removed, and
1530
+ # some forward or reverse frameshift will be inserted.
1531
+ #
1532
+ # For example,
1533
+ # atgg-taagac-att
1534
+ # M V K - I
1535
+ # is treated as:
1536
+ # atggt<aagacatt
1537
+ # M V K >>I
1538
+ #
1539
+ # Incorrect combination of frameshift with frameshift or gap
1540
+ # may cause undefined behavior.
1541
+ #
1542
+ # Forward frameshifts are recommended to be indicated in the
1543
+ # target sequence.
1544
+ # Reverse frameshifts can be indicated in the reference sequence
1545
+ # or the target sequence.
1546
+ #
1547
+ # Priority of regular expressions:
1548
+ # space > forward/reverse frameshift > gap
1549
+ #
1550
+ # ---
1551
+ # *Arguments*:
1552
+ # * _reference_: reference sequence (nucleotide sequence)
1553
+ # * _target_: target sequence (amino acid sequence)
1554
+ # * <I>gap_regexp</I>: regexp to identify gap
1555
+ # * <I>space_regexp</I>: regexp to identify space character which is completely ignored
1556
+ # * <I>forward_frameshift_regexp</I>: regexp to identify forward frameshift
1557
+ # * <I>reverse_frameshift_regexp</I>: regexp to identify reverse frameshift
1558
def self.new_from_sequences_na_aa(reference, target,
                                  gap_regexp = /[^a-zA-Z]/,
                                  space_regexp = /\s/,
                                  forward_frameshift_regexp = /\>/,
                                  reverse_frameshift_regexp = /\</)
  # The real work is done by a private instance method, so it is
  # invoked through send on a freshly created object.
  gap = self.new
  gap.send(:__initialize_from_sequences_na_aa,
           reference, target,
           gap_regexp,
           space_regexp,
           forward_frameshift_regexp,
           reverse_frameshift_regexp)
  gap
end
1573
+
1574
+ # string representation
1575
def to_s
  # every element of @data is stringified and the pieces are joined
  # with a single space
  pieces = @data.map { |code| code.to_s }
  pieces.join(" ")
end
1578
+
1579
# Internal data (the array assigned to @data by the parsing and
# initializing methods). Users must not use it.
attr_reader :data
# The reader is protected rather than private so that @data can be
# read by other Gap instances, as the == method does with other.data.
protected :data
1583
+
1584
+ # If self == other, returns true.
1585
+ # otherwise, returns false.
1586
def ==(other)
  # objects of a different class are never equal
  return false unless other.class == self.class
  # same class: equality is decided by the internal data alone
  (@data == other.data) ? true : false
end
1594
+
1595
+ # duplicates sequences
1596
def dup_seqs(*arg)
  # (private method)
  # Returns an array holding a duplicate of each given sequence.
  # An argument that responds to seq (a sequence-like object) is first
  # replaced by the value of its seq method; anything else is duplicated
  # as it is.
  #
  # respond_to? is used instead of rescuing NoMethodError, so that a
  # NoMethodError raised *inside* a seq implementation is reported to
  # the caller and not silently swallowed.
  arg.collect do |s|
    s = s.seq if s.respond_to?(:seq)
    s.dup
  end
end
private :dup_seqs
1606
+
1607
+ # (private method)
1608
+ # inserts gaps into the sequences according to the gap rules stored in this object
1609
def __process_sequences(s_ref, s_tgt,
                        ref_gap, tgt_gap,
                        ref_increment, tgt_increment,
                        forward_frameshift,
                        reverse_frameshift)
  # Walks through @data and inserts gap / frameshift strings into s_ref
  # and s_tgt in place. Returns [s_ref, s_tgt]; raises RuntimeError when
  # a sequence is too short for the stored codes.
  ref_pos = 0
  tgt_pos = 0

  # Zero-width insertion of text at pos; an out-of-range position is
  # reported with the given message.
  insert_at = lambda do |seq, pos, text, message|
    begin
      seq[pos, 0] = text
    rescue IndexError
      raise message
    end
  end

  @data.each do |op|
    case op.code
    when :M
      # match: only the positions move
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :I
      # a gap is inserted into the reference sequence
      insert_at.call(s_ref, ref_pos, ref_gap * op.length,
                     'reference sequence too short')
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :D
      # a gap is inserted into the target (deletion from the reference)
      insert_at.call(s_tgt, tgt_pos, tgt_gap * op.length,
                     'target sequence too short')
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :F
      # forward frameshift: marked in the target, both positions
      # advance by the plain length (no increment factor)
      insert_at.call(s_tgt, tgt_pos, forward_frameshift * op.length,
                     'target sequence too short')
      ref_pos += op.length
      tgt_pos += op.length
    when :R
      # reverse frameshift: marked in the reference before the current
      # position; the positions themselves are left untouched
      shift_pos = ref_pos - op.length
      if shift_pos < 0
        raise 'too short reference sequence, or too many reverse frameshifts'
      end
      insert_at.call(s_ref, shift_pos, reverse_frameshift * op.length,
                     'reference sequence too short')
    else
      warn "ignored #{op.to_s.inspect}" if $VERBOSE
    end
  end

  # both sequences must reach the final positions
  raise 'reference sequence too short' if s_ref.length < ref_pos
  raise 'target sequence too short' if s_tgt.length < tgt_pos
  [ s_ref, s_tgt ]
end
private :__process_sequences
1673
+
1674
+ # Processes nucleotide sequences and
1675
+ # returns gapped sequences as an array of sequences.
1676
+ #
1677
+ # Note for forward/reverse frameshift:
1678
+ # Forward/Reverse_frameshift is simply treated as
1679
+ # gap insertion to the target/reference sequence.
1680
+ #
1681
+ # ---
1682
+ # *Arguments*:
1683
+ # * _reference_: reference sequence (nucleotide sequence)
1684
+ # * _target_: target sequence (nucleotide sequence)
1685
+ # * <I>gap_char</I>: gap character
1686
def process_sequences_na(reference, target, gap_char = '-')
  # work on copies so that the arguments are never modified
  ref_copy, tgt_copy = dup_seqs(reference, target)

  # nucleotide vs. nucleotide: one letter per position on both sides,
  # and frameshifts are drawn with the ordinary gap character
  gapped_ref, gapped_tgt = __process_sequences(ref_copy, tgt_copy,
                                               gap_char, gap_char,
                                               1, 1,
                                               gap_char, gap_char)

  warn "returned sequences not equal length" if $VERBOSE and
    gapped_ref.length != gapped_tgt.length
  return gapped_ref, gapped_tgt
end
1699
+
1700
+ # Processes sequences and
1701
+ # returns gapped sequences as an array of sequences.
1702
+ # reference must be a nucleotide sequence, and
1703
+ # target must be an amino acid sequence.
1704
+ #
1705
+ # Note for reverse frameshift:
1706
+ # Reverse_frameshift characters are inserted in the
1707
+ # reference sequence.
1708
+ # For example, alignment of "Gap=M3 R1 M2" is:
1709
+ # atgaagat<aatgtc
1710
+ # M K I N V
1711
+ # Alignment of "Gap=M3 R3 M3" is:
1712
+ # atgaag<<<attaatgtc
1713
+ # M K I I N V
1714
+ #
1715
+ # ---
1716
+ # *Arguments*:
1717
+ # * _reference_: reference sequence (nucleotide sequence)
1718
+ # * _target_: target sequence (amino acid sequence)
1719
+ # * <I>gap_char</I>: gap character
1720
+ # * <I>space_char</I>: space character inserted to amino sequence for matching na-aa alignment
1721
+ # * <I>forward_frameshift</I>: forward frameshift character
1722
+ # * <I>reverse_frameshift</I>: reverse frameshift character
1723
def process_sequences_na_aa(reference, target,
                            gap_char = '-',
                            space_char = ' ',
                            forward_frameshift = '>',
                            reverse_frameshift = '<')
  # Inserts gaps and frameshift marks into a nucleotide reference and an
  # amino acid target, and returns [gapped reference, gapped target].
  #
  # * reference: nucleotide sequence
  # * target: amino acid sequence
  # * gap_char: gap character
  # * space_char: padding written twice after each amino acid, so that
  #   one residue lines up with one codon
  # * forward_frameshift / reverse_frameshift: frameshift characters
  s_ref, s_tgt = dup_seqs(reference, target)

  # Pad every residue with two space_char. The block form of gsub is
  # used on purpose: in a replacement *string*, a backslash contained in
  # space_char would be interpreted as an escape (e.g. "\\\\" collapses
  # to one backslash), and the padded target would no longer agree with
  # tgt_increment below.
  padding = "#{space_char}#{space_char}"
  s_tgt = s_tgt.gsub(/./) { |residue| "#{residue}#{padding}" }

  ref_increment = 3
  tgt_increment = 1 + space_char.length * 2
  ref_gap = gap_char * 3
  tgt_gap = "#{gap_char}#{space_char}#{space_char}"
  return __process_sequences(s_ref, s_tgt,
                             ref_gap, tgt_gap,
                             ref_increment, tgt_increment,
                             forward_frameshift,
                             reverse_frameshift)
end
1740
+ end #class Gap
1741
+
1742
+ private
1743
def parse_attributes(string)
  # Splits a GFF3 attribute column into an array of [ key, value ] pairs.
  # A key with several comma separated values yields one pair per value.
  return [] if !string or string == '.'
  string.split(';').inject([]) do |pairs, field|
    key, raw = field.split('=', 2)
    key = unescape(key)
    parsed = raw.to_s.split(',').collect do |v|
      # Target and Gap values are parsed into objects;
      # everything else is only unescaped
      case key
      when 'Target' then Target.parse(v)
      when 'Gap'    then Gap.parse(v)
      else unescape(v)
      end
    end
    pairs.concat(parsed.collect { |v| [ key, v ] })
  end
end # method parse_attributes
1762
+
1763
+ # Return the attributes as a string as it appears at the end of
1764
+ # a GFF3 line
1765
def attributes_to_s(attr)
  # Builds the attribute column of a GFF3 line from [ key, value ] pairs.
  # Values sharing a key are merged (comma separated) at the position
  # where the key first appeared.
  return '.' if !attr or attr.empty?

  order = []     # keys in order of first appearance
  grouped = {}   # key => array of its values
  attr.each do |pair|
    key, val = pair[0], pair[1]
    unless grouped[key]
      order.push(key)
      grouped[key] = []
    end
    grouped[key].push(val)
  end

  fields = order.collect do |key|
    joined = grouped[key].collect { |v|
      # Target objects are written with their own to_s, without escaping;
      # any other value is stringified and escaped
      v.kind_of?(Target) ? v.to_s : escape_attribute(v.to_s)
    }.join(',')
    "#{escape_attribute(key)}=#{joined}"
  end
  fields.join(';')
end
1788
+
1789
+ end # class GFF3::Record
1790
+
1791
+ # This is a dummy record corresponding to the "###" metadata.
1792
class RecordBoundary < GFF3::Record
  # Creates the record exactly as GFF3::Record does, then freezes it.
  def initialize(*arg)
    super
    freeze
  end

  # A boundary is always written as the "###" directive line.
  def to_s
    "###\n"
  end
end #class RecordBoundary
1802
+
1803
+ # stores GFF3 MetaData
1804
+ MetaData = GFF2::MetaData
1805
+
1806
+ # parses metadata
1807
def parse_metadata(directive, line)
  # Dispatches one meta-data line according to its directive name.
  # Always returns true.
  case directive
  when 'gff-version'
    # only the first gff-version seen is kept
    @gff_version ||= line.split(/\s+/)[1]
  when 'FASTA' then @in_fasta = true
  when 'sequence-region' then @sequence_regions.push(SequenceRegion.parse(line))
  when '#' then @records.push(RecordBoundary.new)   # "###" directive
  else @metadata.push(MetaData.parse(line))
  end
  true
end
private :parse_metadata
1823
+
1824
+ end #class GFF3
1825
+
1826
+ end # class GFF
155
1827
 
156
1828
  end # module Bio
157
1829