bio 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,86 @@
1
+ #
2
+ # = bio/db/genbank/genbank_to_biosequence.rb - Bio::GenBank to Bio::Sequence adapter module
3
+ #
4
+ # Copyright:: Copyright (C) 2008
5
+ # Naohisa Goto <ng@bioruby.org>,
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+
11
+ require 'bio/sequence'
12
+ require 'bio/sequence/adapter'
13
+
14
+ # Internal use only. Normal users should not use this module.
15
+ #
16
+ # Bio::GenBank to Bio::Sequence adapter module.
17
+ # It is internally used in Bio::GenBank#to_biosequence.
18
+ #
19
+ module Bio::Sequence::Adapter::GenBank
20
+
21
+ extend Bio::Sequence::Adapter
22
+
23
+ private
24
+
25
+ def_biosequence_adapter :seq
26
+
27
+ def_biosequence_adapter :id_namespace do |orig|
28
+ if /\_/ =~ orig.accession.to_s then
29
+ 'RefSeq'
30
+ else
31
+ 'GenBank'
32
+ end
33
+ end
34
+
35
+ def_biosequence_adapter :entry_id
36
+
37
+ def_biosequence_adapter :primary_accession, :accession
38
+
39
+ def_biosequence_adapter :secondary_accessions do |orig|
40
+ orig.accessions - [ orig.accession ]
41
+ end
42
+
43
+ def_biosequence_adapter :other_seqids do |orig|
44
+ if /GI\:(.+)/ =~ orig.gi.to_s then
45
+ [ Bio::Sequence::DBLink.new('GI', $1) ]
46
+ else
47
+ nil
48
+ end
49
+ end
50
+
51
+ def_biosequence_adapter :molecule_type, :natype
52
+
53
+ def_biosequence_adapter :division
54
+
55
+ def_biosequence_adapter :topology, :circular
56
+
57
+ def_biosequence_adapter :strandedness
58
+
59
+ def_biosequence_adapter :sequence_version, :version
60
+
61
+ #--
62
+ #sequence.date_created = nil #????
63
+ #++
64
+
65
+ def_biosequence_adapter :date_modified
66
+
67
+ def_biosequence_adapter :definition
68
+
69
+ def_biosequence_adapter :keywords
70
+
71
+ def_biosequence_adapter :species, :organism
72
+
73
+ def_biosequence_adapter :classification
74
+
75
+ #--
76
+ #sequence.organelle = nil # yet unsupported
77
+ #++
78
+
79
+ def_biosequence_adapter :comments, :comment
80
+
81
+ def_biosequence_adapter :references
82
+
83
+ def_biosequence_adapter :features
84
+
85
+ end #module Bio::Sequence::Adapter::GenBank
86
+
@@ -4,154 +4,1826 @@
4
4
  # Copyright:: Copyright (C) 2003, 2005
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
7
+ # 2008 Naohisa Goto <ng@bioruby.org>
7
8
  # License:: The Ruby License
8
9
  #
9
- # $Id: gff.rb,v 1.9 2007/05/18 15:23:42 k Exp $
10
+ # $Id:$
10
11
  #
12
+ require 'uri'
13
+ require 'strscan'
14
+ require 'enumerator'
15
+ require 'bio/db/fasta'
11
16
 
12
17
  module Bio
13
- # == DESCRIPTION
14
- # The Bio::GFF and Bio::GFF::Record classes describe data contained in a
15
- # GFF-formatted file. For information on the GFF format, see
16
- # http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
17
- # delimited format, including
18
- # * seqname
19
- # * source
20
- # * feature
21
- # * start
22
- # * end
23
- # * score
24
- # * strand
25
- # * frame
26
- # * attributes (optional)
27
- #
28
- # For example:
29
- # SEQ1 EMBL atg 103 105 . + 0
30
- # SEQ1 EMBL exon 103 172 . + 0
31
- # SEQ1 EMBL splice5 172 173 . + .
32
- # SEQ1 netgene splice5 172 173 0.94 + .
33
- # SEQ1 genie sp5-20 163 182 2.3 + .
34
- # SEQ1 genie sp5-10 168 177 2.1 + .
35
- # SEQ1 grail ATG 17 19 2.1 - 0
36
- #
37
- # The Bio::GFF object is a container for Bio::GFF::Record objects, each
38
- # representing a single line in the GFF file.
39
- class GFF
40
- # Creates a Bio::GFF object by building a collection of Bio::GFF::Record
41
- # objects.
18
+ # == DESCRIPTION
19
+ # The Bio::GFF and Bio::GFF::Record classes describe data contained in a
20
+ # GFF-formatted file. For information on the GFF format, see
21
+ # http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
22
+ # delimited format, including
23
+ # * seqname
24
+ # * source
25
+ # * feature
26
+ # * start
27
+ # * end
28
+ # * score
29
+ # * strand
30
+ # * frame
31
+ # * attributes (optional)
42
32
  #
43
- # Create a Bio::GFF object the hard way
44
- # this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
45
- # this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
46
- # this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
47
- # this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
48
- # this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
49
- # this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
50
- # this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
51
- # p Bio::GFF.new(this_gff)
52
- #
53
- # or create one based on a GFF-formatted file:
54
- # p Bio::GFF.new(File.open('my_data.gff')
55
- # ---
56
- # *Arguments*:
57
- # * _str_: string in GFF format
58
- # *Returns*:: Bio::GFF object
59
- def initialize(str = '')
60
- @records = Array.new
61
- str.each_line do |line|
62
- @records << Record.new(line)
33
+ # For example:
34
+ # SEQ1 EMBL atg 103 105 . + 0
35
+ # SEQ1 EMBL exon 103 172 . + 0
36
+ # SEQ1 EMBL splice5 172 173 . + .
37
+ # SEQ1 netgene splice5 172 173 0.94 + .
38
+ # SEQ1 genie sp5-20 163 182 2.3 + .
39
+ # SEQ1 genie sp5-10 168 177 2.1 + .
40
+ # SEQ1 grail ATG 17 19 2.1 - 0
41
+ #
42
+ # The Bio::GFF object is a container for Bio::GFF::Record objects, each
43
+ # representing a single line in the GFF file.
44
+ class GFF
45
+ # Creates a Bio::GFF object by building a collection of Bio::GFF::Record
46
+ # objects.
47
+ #
48
+ # Create a Bio::GFF object the hard way
49
+ # this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
50
+ # this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
51
+ # this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
52
+ # this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
53
+ # this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
54
+ # this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
55
+ # this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
56
+ # p Bio::GFF.new(this_gff)
57
+ #
58
+ # or create one based on a GFF-formatted file:
59
+ # p Bio::GFF.new(File.open('my_data.gff'))
60
+ # ---
61
+ # *Arguments*:
62
+ # * _str_: string in GFF format
63
+ # *Returns*:: Bio::GFF object
64
+ def initialize(str = '')
65
+ @records = Array.new
66
+ str.each_line do |line|
67
+ @records << Record.new(line)
68
+ end
63
69
  end
64
- end
65
70
 
66
- # An array of Bio::GFF::Record objects.
67
- attr_accessor :records
71
+ # An array of Bio::GFF::Record objects.
72
+ attr_accessor :records
68
73
 
69
- # Represents a single line of a GFF-formatted file. See Bio::GFF for more
70
- # information.
71
- class Record
74
+ # Represents a single line of a GFF-formatted file. See Bio::GFF for more
75
+ # information.
76
+ class Record
72
77
 
73
- # Name of the reference sequence
74
- attr_accessor :seqname
78
+ # Name of the reference sequence
79
+ attr_accessor :seqname
75
80
 
76
- # Name of the source of the feature (e.g. program that did prediction)
77
- attr_accessor :source
81
+ # Name of the source of the feature (e.g. program that did prediction)
82
+ attr_accessor :source
78
83
 
79
- # Name of the feature
80
- attr_accessor :feature
84
+ # Name of the feature
85
+ attr_accessor :feature
81
86
 
82
- # Start position of feature on reference sequence
83
- attr_accessor :start
87
+ # Start position of feature on reference sequence
88
+ attr_accessor :start
84
89
 
85
- # End position of feature on reference sequence
86
- attr_accessor :end
90
+ # End position of feature on reference sequence
91
+ attr_accessor :end
87
92
 
88
- # Score of annotation (e.g. e-value for BLAST search)
89
- attr_accessor :score
93
+ # Score of annotation (e.g. e-value for BLAST search)
94
+ attr_accessor :score
90
95
 
91
- # Strand that feature is located on
92
- attr_accessor :strand
96
+ # Strand that feature is located on
97
+ attr_accessor :strand
93
98
 
94
- # For features of type 'exon': indicates where feature begins in the reading frame
95
- attr_accessor :frame
99
+ # For features of type 'exon': indicates where feature begins in the reading frame
100
+ attr_accessor :frame
96
101
 
97
- # List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
98
- attr_accessor :attributes
102
+ # List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
103
+ attr_accessor :attributes
99
104
 
100
- # Comments for the GFF record
101
- attr_accessor :comments
105
+ # Comments for the GFF record
106
+ attr_accessor :comment
102
107
 
103
- # Creates a Bio::GFF::Record object. Is typically not called directly, but
104
- # is called automatically when creating a Bio::GFF object.
105
- # ---
106
- # *Arguments*:
107
- # * _str_: a tab-delimited line in GFF format
108
- def initialize(str)
109
- @comments = str.chomp[/#.*/]
110
- return if /^#/.match(str)
111
- @seqname, @source, @feature, @start, @end, @score, @strand, @frame,
112
- attributes, = str.chomp.split("\t")
113
- @attributes = parse_attributes(attributes) if attributes
114
- end
108
+ # "comments" is deprecated. Instead, use "comment".
109
+ def comments
110
+ #warn "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"." if $VERBOSE
111
+ self.comment
112
+ end
115
113
 
116
- private
114
+ # "comments=" is deprecated. Instead, use "comment=".
115
+ def comments=(str)
116
+ #warn "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"." if $VERBOSE
117
+ self.comment = str
118
+ end
117
119
 
118
- def parse_attributes(attributes)
119
- hash = Hash.new
120
- attributes.split(/[^\\];/).each do |atr|
121
- key, value = atr.split(' ', 2)
122
- hash[key] = value
120
+ # Creates a Bio::GFF::Record object. Is typically not called directly, but
121
+ # is called automatically when creating a Bio::GFF object.
122
+ # ---
123
+ # *Arguments*:
124
+ # * _str_: a tab-delimited line in GFF format
125
+ def initialize(str)
126
+ @comment = str.chomp[/#.*/]
127
+ return if /^#/.match(str)
128
+ @seqname, @source, @feature, @start, @end, @score, @strand, @frame,
129
+ attributes, = str.chomp.split("\t")
130
+ @attributes = parse_attributes(attributes) if attributes
123
131
  end
124
- return hash
125
- end
126
- end
127
132
 
128
- # = DESCRIPTION
129
- # Represents version 2 of GFF specification. Is completely implemented by the
130
- # Bio::GFF class.
131
- class GFF2 < GFF
132
- VERSION = 2
133
- end
133
+ private
134
134
 
135
- # = DESCRIPTION
136
- # Represents version 3 of GFF specification. Is completely implemented by the
137
- # Bio::GFF class. For more information on version GFF3, see
138
- # http://flybase.bio.indiana.edu/annot/gff3.html
139
- class GFF3 < GFF
140
- VERSION = 3
135
+ def parse_attributes(attributes)
136
+ hash = Hash.new
141
137
 
142
- private
138
+ sc = StringScanner.new(attributes)
139
+ attrs = []
140
+ token = ''
141
+ while !sc.eos?
142
+ if sc.scan(/[^\\\;\"]+/) then
143
+ token.concat sc.matched
144
+ elsif sc.scan(/\;/) then
145
+ attrs.push token unless token.empty?
146
+ token = ''
147
+ elsif sc.scan(/\"/) then
148
+ origtext = sc.matched
149
+ while !sc.eos?
150
+ if sc.scan(/[^\\\"]+/) then
151
+ origtext.concat sc.matched
152
+ elsif sc.scan(/\"/) then
153
+ origtext.concat sc.matched
154
+ break
155
+ elsif sc.scan(/\\([\"\\])/) then
156
+ origtext.concat sc.matched
157
+ elsif sc.scan(/\\/) then
158
+ origtext.concat sc.matched
159
+ else
160
+ raise 'Bug: should not reach here'
161
+ end
162
+ end
163
+ token.concat origtext
164
+ elsif sc.scan(/\\\;/) then
165
+ token.concat sc.matched
166
+ elsif sc.scan(/\\/) then
167
+ token.concat sc.matched
168
+ else
169
+ raise 'Bug: should not reach here'
170
+ end #if
171
+ end #while
172
+ attrs.push token unless token.empty?
143
173
 
144
- def parse_attributes(attributes)
145
- hash = Hash.new
146
- attributes.split(/[^\\];/).each do |atr|
147
- key, value = atr.split('=', 2)
148
- hash[key] = value
174
+ attrs.each do |x|
175
+ key, value = x.split(' ', 2)
176
+ key.strip!
177
+ value.strip! if value
178
+ hash[key] = value
179
+ end
180
+ hash
149
181
  end
150
- return hash
151
- end
152
- end
153
182
 
154
- end # class GFF
183
+ end #Class Record
184
+
185
+ # = DESCRIPTION
186
+ # Represents version 2 of GFF specification.
187
+ # Its behavior is somehow different from Bio::GFF,
188
+ # especially for attributes.
189
+ #
190
+ class GFF2 < GFF
191
+ VERSION = 2
192
+
193
+ # string representation of the whole entry.
194
+ def to_s
195
+ ver = @gff_version || VERSION.to_s
196
+ ver = ver.gsub(/[\r\n]+/, ' ')
197
+ ([ "##gff-version #{ver}\n" ] +
198
+ @metadata.collect { |m| m.to_s } +
199
+ @records.collect{ |r| r.to_s }).join('')
200
+ end
201
+
202
+ # Private methods for GFF2 escaping characters.
203
+ # Internal only. Users should not use this module directly.
204
+ module Escape
205
+ # unsafe characters to be escaped
206
+ UNSAFE_GFF2 = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] \x80-\xfd><;=,%^&\|`]/n
207
+
208
+ # GFF2 standard identifier
209
+ IDENTIFIER_GFF2 = /\A[A-Za-z][A-Za-z0-9_]*\z/n
210
+
211
+ # GFF2 numeric value
212
+ NUMERIC_GFF2 = /\A[-+]?([0-9]+|[0-9]*\.[0-9]*)([eE][+-]?[0-9]+)?\z/n
213
+
214
+ # List of 1-letter special backslash code.
215
+ # The letters other than listed here are the same as
216
+ # those of without backslash, except for "x" and digits.
217
+ # (Note that \u (unicode) is not supported.)
218
+ BACKSLASH = {
219
+ 't' => "\t",
220
+ 'n' => "\n",
221
+ 'r' => "\r",
222
+ 'f' => "\f",
223
+ 'b' => "\b",
224
+ 'a' => "\a",
225
+ 'e' => "\e",
226
+ 'v' => "\v",
227
+ # 's' => " ",
228
+ }.freeze
229
+
230
+ # inverted hash of BACKSLASH
231
+ CHAR2BACKSLASH = BACKSLASH.invert.freeze
232
+
233
+ # inverted hash of BACKSLASH, including double quote and backslash
234
+ CHAR2BACKSLASH_EXTENDED =
235
+ CHAR2BACKSLASH.merge({ '"' => '"', "\\" => "\\" }).freeze
236
+
237
+ # prohibited characters in GFF2 columns
238
+ PROHIBITED_GFF2_COLUMNS = /[\t\r\n\x00-\x1f\x7f\xfe\xff]/
239
+
240
+ # prohibited characters in GFF2 attribute tags
241
+ PROHIBITED_GFF2_TAGS = /[\s\"\;\t\r\n\x00-\x1f\x7f\xfe\xff]/
242
+
243
+ private
244
+ # (private) escapes GFF2 free text string
245
+ def escape_gff2_freetext(str)
246
+ '"' + str.gsub(UNSAFE_GFF2) do |x|
247
+ "\\" + (CHAR2BACKSLASH_EXTENDED[x] || char2octal(x))
248
+ end + '"'
249
+ end
250
+
251
+ # (private) "x" => "\\oXXX"
252
+ # "x" must be a letter.
253
+ # If "x" is consisted of two bytes or more, joined with "\\".
254
+ def char2octal(x)
255
+ x.enum_for(:each_byte).collect { |y|
256
+ sprintf("%03o", y) }.join("\\")
257
+ end
258
+
259
+ # (private) escapes GFF2 attribute value string
260
+ def escape_gff2_attribute_value(str)
261
+ freetext?(str) ? escape_gff2_freetext(str) : str
262
+ end
263
+
264
+ # (private) check if the given string is a free text to be quoted
265
+ # by double-qoute.
266
+ def freetext?(str)
267
+ if IDENTIFIER_GFF2 =~ str or
268
+ NUMERIC_GFF2 =~ str then
269
+ false
270
+ else
271
+ true
272
+ end
273
+ end
274
+
275
+ # (private) escapes normal columns in GFF2
276
+ def gff2_column_to_s(str)
277
+ str = str.to_s
278
+ str = str.empty? ? '.' : str
279
+ str = str.gsub(PROHIBITED_GFF2_COLUMNS) do |x|
280
+ "\\" + (CHAR2BACKSLASH[x] || char2octal(x))
281
+ end
282
+ if str[0, 1] == '#' then
283
+ str[0, 1] = "\\043"
284
+ end
285
+ str
286
+ end
287
+
288
+ # (private) escapes GFF2 attribute tag string
289
+ def escape_gff2_attribute_tag(str)
290
+ str = str.to_s
291
+ str = str.empty? ? '.' : str
292
+ str = str.gsub(PROHIBITED_GFF2_TAGS) do |x|
293
+ "\\" + (CHAR2BACKSLASH[x] || char2octal(x))
294
+ end
295
+ if str[0, 1] == '#' then
296
+ str[0, 1] = "\\043"
297
+ end
298
+ str
299
+ end
300
+
301
+ # (private) dummy method, will be redefined in GFF3.
302
+ def unescape(str)
303
+ str
304
+ end
305
+ end #module Escape
306
+
307
+ # Stores GFF2 record.
308
+ class Record < GFF::Record
309
+
310
+ include Escape
311
+
312
+ # Stores GFF2 attribute's value.
313
+ class Value
314
+
315
+ include Escape
316
+
317
+ # Creates a new Value object.
318
+ # Note that the given array _values_ is directly stored in
319
+ # the object.
320
+ #
321
+ # ---
322
+ # *Arguments*:
323
+ # * (optional) _values_: Array containing String objects.
324
+ # *Returns*:: Value object.
325
+ def initialize(values = [])
326
+ @values = values
327
+ end
328
+
329
+ # Returns string representation of this Value object.
330
+ # ---
331
+ # *Returns*:: String
332
+ def to_s
333
+ @values.collect do |str|
334
+ escape_gff2_attribute_value(str)
335
+ end.join(' ')
336
+ end
337
+
338
+ # Returns all values in this object.
339
+ #
340
+ # Note that modification of the returned array would affect
341
+ # original Value object.
342
+ # ---
343
+ # *Returns*:: Array
344
+ def values
345
+ @values
346
+ end
347
+ alias to_a values
348
+
349
+ # Returns true if other == self.
350
+ # Otherwise, returns false.
351
+ def ==(other)
352
+ return false unless other.kind_of?(self.class) or
353
+ self.kind_of?(other.class)
354
+ self.values == other.values rescue super(other)
355
+ end
356
+ end #class Value
357
+
358
+
359
+ # Parses a GFF2-formatted line and returns a new
360
+ # Bio::GFF::GFF2::Record object.
361
+ def self.parse(str)
362
+ self.new.parse(str)
363
+ end
364
+
365
# Creates a Bio::GFF::GFF2::Record object.
# Is typically not called directly, but
# is called automatically when creating a Bio::GFF::GFF2 object.
#
# ---
# *Arguments* (one-argument form):
# * _str_: a tab-delimited line in GFF2 format
# *Arguments* (multi-argument form):
# * _seqname_: seqname (String or nil)
# * _source_: source (String or nil)
# * _feature_: feature type (String)
# * _start_position_: start (Integer)
# * _end_position_: end (Integer)
# * _score_: score (Float or nil)
# * _strand_: strand (String or nil)
# * _frame_: frame (Integer or nil)
# * _attributes_: attributes (Array or nil)
def initialize(*arg)
  if arg.size == 1 then
    parse(arg[0])
  else
    # score is taken into a local variable (like start, end and frame)
    # so that the conversion below does not depend on an accessor.
    @seqname, @source, @feature,
      start, endp, score, @strand, frame,
      @attributes = arg
    @start = start ? start.to_i : nil
    @end   = endp  ? endp.to_i  : nil
    @score = score ? score.to_f : nil
    @frame = frame ? frame.to_i : nil
  end
  # attributes are always an Array, even when omitted
  @attributes ||= []
end
396
+
397
+ # Comment for the GFF record
398
+ attr_accessor :comment
399
+
400
# "comments" is deprecated. Instead, use "comment".
# Emits a deprecation warning and returns the value of #comment.
def comments
  message = "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"."
  warn message
  self.comment
end
405
+
406
# "comments=" is deprecated. Instead, use "comment=".
# Emits a deprecation warning and stores _str_ through #comment=.
def comments=(str)
  message = "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"."
  warn message
  self.comment = str
end
411
+
412
# Parses a GFF2-formatted line and stores data from the string.
# Note that all existing data is wiped out, including the comment.
#
# A line whose first non-space character is "#" is stored as a
# comment-only record. Otherwise the line is split on tabs; columns
# 1-8 are unescaped ("." becomes nil), column 9 is parsed as attributes,
# and the text after "#" in a 10th column, if any, becomes the comment.
# ---
# *Arguments*:
# * _string_: a line in GFF2 format
def parse(string)
  if /^\s*\#/ =~ string then
    @comment = string[/\#(.*)/, 1].chomp
    columns = []
  else
    columns = string.chomp.split("\t", 10)
    # The 10th column may be absent or may lack "#": in both cases
    # there is no comment, and any previous comment is discarded.
    trailing = columns[9] ? columns[9][/\#(.*)/, 1] : nil
    @comment = trailing ? trailing.chomp : nil
  end

  @seqname, @source, @feature,
    start, endp, score, @strand, frame =
    columns[0, 8].collect { |x|
      str = unescape(x)
      str == '.' ? nil : str
    }
  @start = start ? start.to_i : nil
  @end   = endp  ? endp.to_i  : nil
  @score = score ? score.to_f : nil
  @frame = frame ? frame.to_i : nil

  @attributes = parse_attributes(columns[8])
end
436
+
437
# Returns true if the entry is empty except for comment.
# Otherwise, returns false.
#
# "Empty" means that all eight leading columns are nil/false and
# that there are no attributes.
def comment_only?
  columns = [ @seqname, @source, @feature, @start,
              @end, @score, @strand, @frame ]
  (columns.none? && @attributes.empty?) ? true : false
end
454
+
455
# Return the record as a GFF2 compatible string.
#
# A comment-only record with a non-blank comment is written as a single
# "#comment" line. Otherwise the nine columns are joined with tabs and
# a non-blank comment is appended as an extra tab-separated field.
# Line breaks inside the comment are replaced with a space.
def to_s
  cmnt = false
  if @comment and !@comment.to_s.strip.empty? then
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{cmnt}\n" if self.comment_only? and cmnt

  columns = [ @seqname, @source, @feature, @start,
              @end, @score, @strand, @frame ].collect do |col|
    gff2_column_to_s(col)
  end
  columns.push attributes_to_s(@attributes)
  columns.join("\t") + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
476
+
477
# Returns true if self == other. Otherwise, returns false.
#
# Two records are equal when the inherited comparison says so, or
# when they have the same class and all nine columns are equal.
def ==(other)
  inherited = super
  return inherited if inherited
  return false unless self.class == other.class
  same = self.seqname == other.seqname &&
         self.source == other.source &&
         self.feature == other.feature &&
         self.start == other.start &&
         self.end == other.end &&
         self.score == other.score &&
         self.strand == other.strand &&
         self.frame == other.frame &&
         self.attributes == other.attributes
  same ? true : false
end
491
+
492
# Gets the attribute value for the given tag.
#
# Note that if two or more tag-value pairs with the same name found,
# only the first value is returned.
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: String, Bio::GFF::GFF2::Record::Value object, or nil.
def get_attribute(tag)
  pair = @attributes.assoc(tag)
  return nil unless pair
  pair[1]
end
alias attribute get_attribute
505
+
506
# Gets the attribute values for the given tag.
# This method always returns an array (empty when the tag is absent).
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: Array containing String or \
# Bio::GFF::GFF2::Record::Value objects.
def get_attributes(tag)
  matching = @attributes.select { |pair| pair[0] == tag }
  matching.map { |pair| pair[1] }
end
520
+
521
# Sets value for the given tag.
# If the tag exists, the value of the tag is replaced with _value_.
# Note that if two or more tag-value pairs with the same name found,
# only the first tag-value pair is replaced.
#
# If the tag does not exist, the tag-value pair is newly added
# (the tag is stored as a copy made with String.new).
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: _value_
def set_attribute(tag, value)
  existing = @attributes.find { |pair| pair[0] == tag }
  if existing then
    existing[1] = value
  else
    @attributes.push [ String.new(tag), value ]
  end
  value
end
544
+
545
# Replaces values for the given tags with new values.
# Existing values for the tag are completely wiped out and
# replaced by new tag-value pairs.
# If the tag does not exist, the tag-value pairs are newly added.
#
# Existing pairs are reused in place (in their current order) for as
# many values as are given; surplus pairs are removed and surplus
# values are appended at the end.
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _values_: String or Bio::GFF::GFF2::Record::Value objects.
# *Returns*:: _self_
def replace_attributes(tag, *values)
  used = 0
  @attributes.reject! do |pair|
    next false unless pair[0] == tag
    next true if used >= values.size
    pair[1] = values[used]
    used += 1
    false
  end
  values[used..-1].each do |val|
    @attributes.push [ String.new(tag), val ]
  end
  self
end
575
+
576
# Adds a new tag-value pair at the end of the attributes.
# The tag is stored as a copy made with String.new.
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: the attributes Array after the pair has been appended
def add_attribute(tag, value)
  pair = [ String.new(tag), value ]
  @attributes.push(pair)
end
585
+
586
# Removes a specific tag-value pair.
#
# Note that if two or more tag-value pairs found,
# only the first tag-value pair is removed.
#
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: if removed, _value_. Otherwise, nil.
def delete_attribute(tag, value)
  position = @attributes.index([ tag, value ])
  return nil unless position
  @attributes.delete_at(position)[1]
end
604
+
605
# Removes all attributes with the specified tag.
#
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: if removed, self. Otherwise, nil.
def delete_attributes(tag)
  changed = @attributes.reject! { |pair| pair[0] == tag }
  changed ? self : nil
end
616
+
617
# Sorts attributes order by given tag name's order.
# If a block is given, the argument _tags_ is ignored, and
# yields two tag names like Array#sort!.
#
# Tags not listed in _tags_ are placed after the listed ones.
# Pairs that compare as equal are ordered by the position they
# had before the sort.
# ---
# *Arguments*:
# * (required or optional) _tags_: Array containing String objects
# *Returns*:: _self_
def sort_attributes_by_tag!(tags = nil)
  position = {}
  total = @attributes.size
  @attributes.each_with_index { |pair, idx| position[pair] = idx }
  # tie-breaker: compare the positions recorded before sorting
  by_position = lambda do |a, b|
    (position[a] || total) <=> (position[b] || total)
  end

  if block_given? then
    @attributes.sort! do |a, b|
      order = yield a[0], b[0]
      order == 0 ? by_position.call(a, b) : order
    end
  else
    unless tags then
      raise ArgumentError, 'wrong number of arguments (0 for 1) or wrong argument value'
    end
    @attributes.sort! do |a, b|
      order = (tags.index(a[0]) || tags.size) <=>
        (tags.index(b[0]) || tags.size)
      order == 0 ? by_position.call(a, b) : order
    end
  end
  self
end
652
+
653
# Returns hash representation of attributes.
#
# Note: If two or more tag-value pairs with same tag names exist,
# only the first tag-value pair is used for each tag
# (a stored nil or false value does not count as "already set").
#
# ---
# *Returns*:: Hash object
def attributes_to_hash
  @attributes.inject({}) do |hash, (key, val)|
    hash[key] = val unless hash[key]
    hash
  end
end
668
+
669
+ private
670
+
671
# (private) Parses attributes.
# Returns an array of [ tag, value ] pairs. A tag followed by exactly
# one token gets that token as its value; any other number of tokens
# is wrapped in a Value object. Returns [] for nil or ".".
def parse_attributes(str)
  return [] if !str || str == '.'
  parse_attributes_string(str).collect! do |tokens|
    tag = tokens.shift
    value = tokens.size == 1 ? tokens[0] : Value.new(tokens)
    [ tag, value ]
  end
end
683
+
684
# (private) Parses attributes string.
# Returns an array of token arrays, one per ";"-separated attribute.
# Tokens are separated by whitespace; double-quoted text forms a
# single token (see parse_attributes_freetext).
def parse_attributes_string(str)
  scanner = StringScanner.new(str)
  pairs = []
  tokens = []
  token = ''
  # closes the token being built, if any
  flush = lambda do
    tokens.push token unless token.empty?
    token = ''
  end

  until scanner.eos?
    if scanner.scan(/[^\\\;\"\s]+/) then
      token.concat scanner.matched
    elsif scanner.scan(/\s+/) then
      flush.call
    elsif scanner.scan(/\;/) then
      flush.call
      pairs.push tokens
      tokens = []
    elsif scanner.scan(/\"/) then
      flush.call
      tokens.push parse_attributes_freetext(scanner)
    elsif scanner.scan(/\\/) then
      # a backslash outside of quoted text is kept literally
      token.concat scanner.matched
    else
      raise 'Bug: should not reach here'
    end
  end
  flush.call
  pairs.push tokens unless tokens.empty?
  pairs
end

# (private) Reads quoted free text from _scanner_, which must be
# positioned just after the opening double quote. Stops after the
# closing quote or at the end of the input, and returns the text
# with backslash escapes resolved.
def parse_attributes_freetext(scanner)
  text = ''
  until scanner.eos?
    if scanner.scan(/[^\\\"]+/) then
      text.concat scanner.matched
    elsif scanner.scan(/\"/) then
      break
    elsif scanner.scan(/\\([\"\\])/) then
      text.concat scanner[1]
    elsif scanner.scan(/\\x([0-9a-fA-F][0-9a-fA-F])/n) then
      text.concat scanner[1].to_i(16).chr
    elsif scanner.scan(/\\([0-7][0-7][0-7])/n) then
      text.concat scanner[1].to_i(8).chr
    elsif scanner.scan(/\\([^x0-9])/n) then
      text.concat(Escape::BACKSLASH[scanner[1]] || scanner.matched)
    elsif scanner.scan(/\\/) then
      text.concat scanner.matched
    else
      raise 'Bug: should not reach here'
    end
  end
  text
end
743
+
744
# (private) string representation of attributes.
# Each pair is written as "tag value" and pairs are joined with " ; ".
# A tag that does not match Escape::IDENTIFIER_GFF2 triggers a
# warning when $VERBOSE is set.
def attributes_to_s(attr)
  rendered = attr.collect do |tag, val|
    unless Escape::IDENTIFIER_GFF2 =~ tag
      warn "Illegal GFF2 attribute tag: #{tag.inspect}" if $VERBOSE
    end
    tagstr = gff2_column_to_s(tag)
    valstr = val.kind_of?(Value) ? val.to_s : escape_gff2_attribute_value(val)
    "#{tagstr} #{valstr}"
  end
  rendered.join(' ; ')
end
760
+ end #class Record
761
+
762
# Stores GFF2 meta-data (a "##directive data" line).
class MetaData
  # Directive. Usually, one of "feature-ontology", "attribute-ontology",
  # or "source-ontology".
  attr_accessor :directive

  # data of this entry
  attr_accessor :data

  # Creates a new MetaData object
  def initialize(directive, data = nil)
    @directive = directive
    @data = data
  end

  # Parses a line: the first whitespace-separated word (without its
  # leading "##") becomes the directive, the rest becomes the data.
  def self.parse(line)
    directive, data = line.chomp.split(/\s+/, 2)
    directive = directive.sub(/\A\#\#/, '') if directive
    new(directive, data)
  end

  # String representation of this meta-data, terminated by a newline.
  # Line breaks inside the directive or the data are replaced by a space.
  def to_s
    directive_text = @directive.to_s.gsub(/[\r\n]+/, ' ')
    data_text = @data.to_s
    suffix = data_text.empty? ? '' : ' ' + data_text.gsub(/[\r\n]+/, ' ')
    "\#\##{directive_text}#{suffix}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    return false unless self.class == other.class
    return false unless self.directive == other.directive
    self.data == other.data ? true : false
  end
end #class MetaData
802
+
803
# (private) parses metadata.
# The first "gff-version" directive sets the version (later ones are
# ignored); every other directive is stored as a MetaData object.
# Always returns true.
def parse_metadata(directive, line)
  if 'gff-version' == directive then
    @gff_version ||= line.split(/\s+/)[1]
  else
    @metadata.push MetaData.parse(line)
  end
  true
end
private :parse_metadata
814
+
815
# Creates a Bio::GFF::GFF2 object by building a collection of
# Bio::GFF::GFF2::Record (and metadata) objects.
#
# ---
# *Arguments*:
# * _str_: string in GFF format (optional; nothing is parsed when omitted)
# *Returns*:: Bio::GFF::GFF2 object
def initialize(str = nil)
  @gff_version = nil
  @records, @metadata = [], []
  parse(str) if str
end
828
+
829
+ # GFF2 version string (String or nil). nil means "2".
830
+ attr_reader :gff_version
831
+
832
+ # Metadata (except "##gff-version").
833
+ # Must be an array of Bio::GFF::GFF2::MetaData objects.
834
+ attr_accessor :metadata
835
+
836
# Parses GFF2 entries and appends the parsed data to this object.
#
# Lines starting with "##" followed by a directive name are handled
# as metadata; every other line becomes a GFF2::Record.
# ---
# *Arguments*:
# * _str_: string in GFF format
# *Returns*:: self
def parse(str)
  str.each_line do |line|
    directive = line[/^\#\#([^\s]+)/, 1]
    if directive then
      parse_metadata(directive, line)
    else
      @records << GFF2::Record.new(line)
    end
  end
  self
end
853
+
854
+ end #class GFF2
855
+
856
+ # = DESCRIPTION
857
+ # Represents version 3 of GFF specification.
858
+ # For more information on version GFF3, see
859
+ # http://song.sourceforge.net/gff3.shtml
860
+ #--
861
+ # obsolete URL:
862
+ # http://flybase.bio.indiana.edu/annot/gff3.html
863
+ #++
864
+ class GFF3 < GFF
865
+ VERSION = 3
866
+
867
# Creates a Bio::GFF::GFF3 object by building a collection of
# Bio::GFF::GFF3::Record (and metadata) objects.
#
# ---
# *Arguments*:
# * _str_: string in GFF format (optional; nothing is parsed when omitted)
# *Returns*:: Bio::GFF object
def initialize(str = nil)
  @gff_version = nil
  @in_fasta = false
  @records, @sequence_regions, @metadata, @sequences = [], [], [], []
  parse(str) if str
end
883
+
884
+ # GFF3 version string (String or nil). nil means "3".
885
+ attr_reader :gff_version
886
+
887
+ # Metadata of "##sequence-region".
888
+ # Must be an array of Bio::GFF::GFF3::SequenceRegion objects.
889
+ attr_accessor :sequence_regions
890
+
891
+ # Metadata (except "##sequence-region", "##gff-version", "###").
892
+ # Must be an array of Bio::GFF::GFF3::MetaData objects.
893
+ attr_accessor :metadata
894
+
895
+ # Sequences bundled within GFF3.
896
+ # Must be an array of Bio::Sequence objects.
897
+ attr_accessor :sequences
898
+
899
# Parses GFF3 entries and appends the parsed data to this object.
#
# Note that after "##FASTA" line is given,
# only fasta-formatted text is accepted.
#
# ---
# *Arguments*:
# * _str_: string in GFF format, or an IO-like object (anything
#   responding to gets)
# *Returns*:: self
def parse(str)
  # if already after the ##FASTA line, parses fasta format and return
  if @in_fasta then
    parse_fasta(str)
    return self
  end

  if str.respond_to?(:gets) then
    # str is a IO-like object
    fst = nil
  else
    # str is a String: split it into the GFF part and the fasta part
    gff, sep, fst = str.split(/^(\>|##FASTA.*)/n, 2)
    # a ">" separator is the beginning of the first fasta entry
    fst = sep + fst if sep == '>' and fst
    # String#split returns [] for an empty string, which leaves gff nil
    str = gff || ''
  end

  # parses GFF lines
  str.each_line do |line|
    if /^\#\#([^\s]+)/ =~ line then
      parse_metadata($1, line)
      parse_fasta(str) if @in_fasta
    elsif /^\>/ =~ line then
      @in_fasta = true
      parse_fasta(str, line)
    else
      @records << GFF3::Record.new(line)
    end
  end

  # parses fasta format when str is a String and fasta data exists
  if fst then
    @in_fasta = true
    parse_fasta(fst)
  end

  self
end
946
+
947
# (private) Parses fasta formatted data and appends the sequences
# to @sequences.
#
# The text is read in chunks ending with "\n>". When _line_ is given
# it is prepended to the first chunk. Each sequence's entry_id is the
# unescaped first word of the definition line.
def parse_fasta(str, line = nil)
  str.each_line("\n>") do |seqstr|
    if line then
      seqstr = line + seqstr
      line = nil
    end
    stripped = seqstr.strip
    next if stripped.empty? || stripped == '>'
    fasta = Bio::FastaFormat.new(seqstr)
    sequence = fasta.to_seq
    first_word = fasta.definition.strip.split(/\s/, 2)[0].to_s
    sequence.entry_id = unescape(first_word)
    @sequences.push sequence
  end
end
private :parse_fasta
961
+
962
# String representation of whole entry: the "##gff-version" line,
# metadata, sequence regions and records, followed by a "##FASTA"
# section when sequences are present.
def to_s
  ver = @gff_version || VERSION.to_s
  seqs = ''
  if @sequences.size > 0 then
    seqs = "##FASTA\n" +
      @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('')
  end

  lines = [ "##gff-version #{escape(ver)}\n" ]
  lines.concat(@metadata.collect { |m| m.to_s })
  lines.concat(@sequence_regions.collect { |m| m.to_s })
  lines.concat(@records.collect { |r| r.to_s })
  lines.join('') + seqs
end
977
+
978
# Private methods for escaping characters.
# Internal only. Users should not use this module directly.
#
# Percent-encoding is implemented here directly, because
# URI.escape and URI.unescape are not available in Ruby 3.0 or later.
module Escape
  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  private

  # If str is empty, returns '.'. Otherwise, returns str.
  def column_to_s(str)
    str = str.to_s
    str.empty? ? '.' : str
  end

  # Return the string corresponding to these characters unescaped:
  # every "%XX" (two hexadecimal digits) is replaced by that byte.
  def unescape(string)
    string.gsub(/%([0-9a-fA-F]{2})/) do
      byte = [ $1 ].pack('H2')
      # keep the decoded byte in the encoding of the source string
      # (Ruby 1.8 strings have no encoding)
      if byte.respond_to?(:force_encoding) then
        byte.force_encoding(string.encoding)
      end
      byte
    end
  end

  # Replaces every character matching _unsafe_ with the "%XX"
  # (uppercase hexadecimal) form of each of its bytes.
  def percent_escape(string, unsafe)
    string.gsub(unsafe) do |chars|
      chars.unpack('C*').collect { |byte| '%%%02X' % byte }.join
    end
  end

  # Escape a column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape(string)
    percent_escape(string, UNSAFE)
  end

  # Escape seqid column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape_seqid(string)
    percent_escape(string, UNSAFE_SEQID)
  end

  # Escape attribute according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  # In addition to the normal escape rule, the following characters
  # are escaped: ",=;".
  # Returns the string corresponding to these characters escaped.
  def escape_attribute(string)
    percent_escape(string, UNSAFE_ATTRIBUTE)
  end
end #module Escape
1025
+
1026
+ include Escape
1027
+
1028
# Stores meta-data "##sequence-region seqid start end".
class SequenceRegion
  include Escape

  # creates a new SequenceRegion object
  def initialize(seqid, start, endpos)
    @seqid = seqid
    @start = start ? start.to_i : nil
    @end = endpos ? endpos.to_i : nil
  end

  # Parses given string and returns a new SequenceRegion object.
  # The first whitespace-separated word (the directive itself) is
  # ignored; the next three are seqid, start and end.
  def self.parse(str)
    dummy, seqid, start, endpos =
      str.chomp.split(/\s+/, 4).collect { |x| percent_decode(x) }
    self.new(seqid, start, endpos)
  end

  # Replaces every "%XX" (two hexadecimal digits) with that byte.
  # Implemented here because URI.unescape is not available in
  # Ruby 3.0 or later.
  def self.percent_decode(str)
    str.gsub(/%([0-9a-fA-F]{2})/) do
      byte = [ $1 ].pack('H2')
      # keep the decoded byte in the encoding of the source string
      # (Ruby 1.8 strings have no encoding)
      byte.force_encoding(str.encoding) if byte.respond_to?(:force_encoding)
      byte
    end
  end
  private_class_method :percent_decode

  # sequence ID
  attr_accessor :seqid

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # string representation
  def to_s
    i = escape_seqid(column_to_s(@seqid))
    s = escape_seqid(column_to_s(@start))
    e = escape_seqid(column_to_s(@end))
    "##sequence-region #{i} #{s} #{e}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.seqid == self.seqid and
        other.start == self.start and
        other.end == self.end then
      true
    else
      false
    end
  end
end #class SequenceRegion
1075
+
1076
+ # Represents a single line of a GFF3-formatted file.
1077
+ # See Bio::GFF::GFF3 for more information.
1078
+ class Record < GFF2::Record
1079
+
1080
+ include GFF3::Escape
1081
+
1082
# Shortcut to the ID attribute: returns the value of the first "ID"
# attribute, as given by get_attribute.
def id
  get_attribute('ID')
end
1086
+
1087
# Sets the ID attribute to _str_ by means of set_attribute.
def id=(str)
  set_attribute('ID', str)
end
1091
+
1092
+ # aliases for Column 1 (formerly "seqname")
1093
+ alias seqid seqname
1094
+ alias seqid= seqname=
1095
+
1096
+ # aliases for Column 3 (formerly "feature").
1097
+ # In the GFF3 document http://song.sourceforge.net/gff3.shtml,
1098
+ # column3 is called "type", but we used "feature_type"
1099
+ # because "type" is already used by Ruby itself.
1100
+ alias feature_type feature
1101
+ alias feature_type= feature=
1102
+
1103
+ # aliases for Column 8
1104
+ alias phase frame
1105
+ alias phase= frame=
1106
+
1107
# Parses a GFF3-formatted line and returns a new
# Bio::GFF::GFF3::Record object.
# ---
# *Arguments*:
# * _str_: a tab-delimited line in GFF3 format
# *Returns*:: the newly created record
def self.parse(str)
  record = self.new
  # The value of the instance method #parse is not relied upon here:
  # the record itself is what this method is documented to return.
  record.parse(str)
  record
end
1112
+
1113
# Creates a Bio::GFF::GFF3::Record object.
# Is typically not called directly, but
# is called automatically when creating a Bio::GFF::GFF3 object.
#
# All arguments are handed to the superclass unchanged.
# ---
# *Arguments* (one-argument form):
# * _str_: a tab-delimited line in GFF3 format
# *Arguments* (multi-argument form):
# * _seqid_: sequence ID (String or nil)
# * _source_: source (String or nil)
# * _feature_type_: type of feature (String)
# * _start_position_: start (Integer)
# * _end_position_: end (Integer)
# * _score_: score (Float or nil)
# * _strand_: strand (String or nil)
# * _phase_: phase (Integer or nil)
# * _attributes_: attributes (Array or nil)
def initialize(*arg)
  super
end
1133
+
1134
# Parses a GFF3-formatted line and stores data from the string.
# Note that all existing data is wiped out.
# The work is delegated to the superclass implementation.
def parse(string)
  super(string)
end
1139
+
1140
# Return the record as a GFF3 compatible string.
#
# A comment-only record with a non-blank comment is written as a single
# "#comment" line. Otherwise column 1 is escaped with escape_seqid,
# columns 2-8 with escape, and the attributes are appended; a non-blank
# comment follows as an extra tab-separated field.
def to_s
  cmnt = false
  if @comment and !@comment.to_s.strip.empty? then
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{cmnt}\n" if self.comment_only? and cmnt

  columns = [ escape_seqid(column_to_s(@seqname)) ]
  [ @source, @feature, @start, @end,
    @score, @strand, @frame ].each do |col|
    columns.push escape(column_to_s(col))
  end
  columns.push attributes_to_s(@attributes)
  columns.join("\t") + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
1161
+
1162
# Bio::GFF::GFF3::Record::Target is a class to store
# data of "Target" attribute.
class Target
  include GFF3::Escape

  # Creates a new Target object.
  def initialize(target_id, start, endpos, strand = nil)
    @target_id = target_id
    @start = start ? start.to_i : nil
    @end = endpos ? endpos.to_i : nil
    @strand = strand
  end

  # target ID
  attr_accessor :target_id

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # strand (optional). Normally, "+" or "-", or nil.
  attr_accessor :strand

  # parses "target_id start end [strand]"-style string
  # (for example, "ABC789 123 456 +")
  # and creates a new Target object.
  # Each field is percent-decoded.
  def self.parse(str)
    target_id, start, endpos, strand =
      str.split(/ +/, 4).collect { |x| percent_decode(x) }
    self.new(target_id, start, endpos, strand)
  end

  # Replaces every "%XX" (two hexadecimal digits) with that byte.
  # Implemented here because URI.unescape is not available in
  # Ruby 3.0 or later.
  def self.percent_decode(str)
    str.gsub(/%([0-9a-fA-F]{2})/) do
      byte = [ $1 ].pack('H2')
      # keep the decoded byte in the encoding of the source string
      # (Ruby 1.8 strings have no encoding)
      byte.force_encoding(str.encoding) if byte.respond_to?(:force_encoding)
      byte
    end
  end
  private_class_method :percent_decode

  # returns a string
  def to_s
    i = escape_seqid(column_to_s(@target_id))
    s = escape_attribute(column_to_s(@start))
    e = escape_attribute(column_to_s(@end))
    strnd = escape_attribute(@strand.to_s)
    strnd = " " + strnd unless strnd.empty?
    "#{i} #{s} #{e}#{strnd}"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.target_id == self.target_id and
        other.start == self.start and
        other.end == self.end and
        other.strand == self.strand then
      true
    else
      false
    end
  end
end #class Target
1220
+
1221
+ # Bio::GFF::GFF3::Record::Gap is a class to store
1222
+ # data of "Gap" attribute.
1223
+ class Gap
1224
+
1225
# Code is a class to store length of single-letter code.
Code = Struct.new(:code, :length)

# Code pairs a single-letter code with its length.
class Code
  # 1-letter code (Symbol). One of :M, :I, :D, :F, or :R is expected.
  attr_reader :code if false #dummy for RDoc

  # length (Integer)
  attr_reader :length if false #dummy for RDoc

  # Returns the code immediately followed by the length, e.g. "M3".
  def to_s
    code.to_s + length.to_s
  end
end #class code
1240
+
1241
# Creates a new Gap object.
#
# The string is split on spaces; every token made of one uppercase
# letter followed by digits (for example "M3") becomes a Code object.
# Other tokens are ignored, with a warning when $VERBOSE is set.
# ---
# *Arguments*:
# * _str_: a formatted string, or nil.
def initialize(str = nil)
  @data = []
  return unless str
  str.split(/ +/).each do |x|
    if /\A([A-Z])([0-9]+)\z/ =~ x.strip then
      @data.push Code.new($1.intern, $2.to_i)
    else
      # show the offending token itself, quoted
      warn "ignored unknown token: #{x.inspect}" if $VERBOSE
    end
  end
end
1261
+
1262
# Same as new(str): builds a Gap object from the formatted string.
def self.parse(str)
  new(str)
end
1266
+
1267
# (private method)
# Scans gaps and returns an array of Code objects.
#
# Each run of characters matching _gap_regexp_ becomes one Code with
# _code_i_; each run of other characters becomes one Code with _code_m_.
def __scan_gap(str, gap_regexp = /[^a-zA-Z]/,
               code_i = :I, code_m = :M)
  scanner = StringScanner.new(str)
  codes = []
  while (advanced = scanner.skip_until(gap_regexp))
    gap_len = scanner.matched_size
    # characters skipped before the first gap character
    plain_len = advanced - gap_len
    codes.push Code.new(code_m, plain_len) if plain_len > 0
    # extend the gap while further gap characters follow directly
    while (more = scanner.skip(gap_regexp))
      gap_len += more
    end
    codes.push Code.new(code_i, gap_len)
  end
  remaining = scanner.rest_size
  codes.push Code.new(code_m, remaining) if remaining > 0
  codes
end
private :__scan_gap
1289
+
1290
# (private method)
# Parses given reference-target sequence alignment and
# initializes self. Existing data will be erased.
#
# Both sequences are turned into runs with __scan_gap (gaps in the
# reference are :I, gaps in the target are :D) and the two run lists
# are walked in parallel, cutting the longer run so that both sides
# always cover the same number of columns. Returns true.
def __initialize_from_sequences_na(reference, target,
                                   gap_regexp = /[^a-zA-Z]/)
  ref_runs = __scan_gap(reference, gap_regexp, :I, :M)
  tgt_runs = __scan_gap(target, gap_regexp, :D, :M)
  merged = []

  until ref_runs.empty? || tgt_runs.empty?
    ref = ref_runs.shift
    tgt = tgt_runs.shift
    # put back the part of the longer run that is not covered yet
    if ref.length > tgt.length then
      ref_runs.unshift Code.new(ref.code, ref.length - tgt.length)
      ref.length = tgt.length
    elsif ref.length < tgt.length then
      tgt_runs.unshift Code.new(tgt.code, tgt.length - ref.length)
      tgt.length = ref.length
    end

    case ref.code
    when :M
      case tgt.code
      when :M then merged.push ref
      when :D then merged.push tgt
      else raise 'Bug: should not reach here.'
      end
    when :I
      case tgt.code
      when :M then merged.push ref
      when :D
        # This site is ignored,
        # because both reference and target are gap
      else raise 'Bug: should not reach here.'
      end
    end
  end

  # rest of the reference: its non-gap columns are recorded as :D
  ref_rest = ref_runs.inject(0) do |sum, ref|
    ref.code == :M ? sum + ref.length : sum
  end
  merged.push Code.new(:D, ref_rest) if ref_rest > 0

  # rest of the target: its non-gap columns are recorded as :I
  tgt_rest = tgt_runs.inject(0) do |sum, tgt|
    tgt.code == :M ? sum + tgt.length : sum
  end
  merged.push Code.new(:I, tgt_rest) if tgt_rest > 0

  @data = merged
  true
end
private :__initialize_from_sequences_na
1351
+
1352
# Creates a new Gap object from given sequence alignment.
#
# Note that sites of which both reference and target are gaps
# are silently removed.
#
# ---
# *Arguments*:
# * _reference_: reference sequence (nucleotide sequence)
# * _target_: target sequence (nucleotide sequence)
# * <I>gap_regexp</I>: regexp to identify gap
# *Returns*:: the new Gap object
def self.new_from_sequences_na(reference, target,
                               gap_regexp = /[^a-zA-Z]/)
  gap = self.new
  # the initializer is private, hence the explicit __send__
  gap.__send__(:__initialize_from_sequences_na,
               reference, target, gap_regexp)
  gap
end
1371
+
1372
# (private method)
# Scans a codon or gap in reference sequence.
#
# Characters are read from _sc_ref_ one at a time until three
# (adjusted by the frameshift marks seen so far) have been taken.
# Spaces are ignored, frameshift marks only adjust the count, and
# everything else is taken, gap characters being counted as gaps.
# Positions missing at the end of the input are counted as gaps.
# Returns [ number of gaps, frameshift count ].
def __scan_codon(sc_ref,
                 gap_regexp, space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  taken = 0
  gaps = 0
  shift = 0

  while taken < 3 + shift && (char = sc_ref.scan(/./mn))
    if space_regexp === char then
      # ignored
    elsif forward_frameshift_regexp === char then
      # next char is forward frameshift
      shift += 1
    elsif reverse_frameshift_regexp === char then
      # next char is reverse frameshift
      shift -= 1
    else
      gaps += 1 if gap_regexp === char
      taken += 1
    end
  end

  missing = (3 + shift) - taken
  gaps += missing if missing > 0
  return gaps, shift
end
private :__scan_codon
1405
+
1406
# (private method)
# internal use only
#
# Extends _cur_ by _len_ when it already has the given _code_;
# otherwise appends a new Code to _data_. Returns the Code object
# that now holds the length (to be passed as _cur_ next time).
def __push_code_to_data(cur, data, code, len)
  if cur && cur.code == code then
    cur.length += len
    return cur
  end
  fresh = Code.new(code, len)
  data.push fresh
  fresh
end
private :__push_code_to_data
1418
+
1419
+ # (private method) Builds @data (an array of Code objects) from a nucleotide reference aligned to an amino acid target.
1419
+ # Parses given reference(nuc)-target(amino) sequence alignment: the target is scanned from left to right and, for each residue or gap, one codon is read from the reference with __scan_codon; frameshift marks in the target move the reference position forward or backward.
1420
+ # Then initializes self: existing data will be erased (@data is replaced). Reference text left over after the target ends is recorded as :D per whole codon plus :F for the remaining letters. Returns self.
1421
+ def __initialize_from_sequences_na_aa(reference, target,
1422
+ gap_regexp = /[^a-zA-Z]/,
1423
+ space_regexp = /\s/,
1424
+ forward_frameshift_regexp =
1425
+ /\>/,
1426
+ reverse_frameshift_regexp =
1427
+ /\</)
1428
+
1429
+ data = []
1430
+ sc_ref = StringScanner.new(reference)
1431
+ sc_tgt = StringScanner.new(target)
1432
+
1433
+ re_one = /./mn
1434
+
1435
+ while !sc_tgt.eos?
1436
+ if len = sc_tgt.skip(space_regexp) then
1437
+ # space in the target is ignored; the reference is not touched
1438
+ elsif len = sc_tgt.skip(forward_frameshift_regexp) then
1439
+ cur = __push_code_to_data(cur, data, :F, len)
1440
+ len.times { sc_ref.scan(re_one) }
1441
+
1442
+ elsif len = sc_tgt.skip(reverse_frameshift_regexp) then
1443
+ cur = __push_code_to_data(cur, data, :R, len)
1444
+ pos = sc_ref.pos
1445
+ pos -= len
1446
+ if pos < 0 then
1447
+ warn "Incorrect reverse frameshift" if $VERBOSE
1448
+ pos = 0
1449
+ end
1450
+ sc_ref.pos = pos
1451
+
1452
+ elsif len = sc_tgt.skip(gap_regexp) then
1453
+ len.times do
1454
+ ref_gaps, ref_fs = __scan_codon(sc_ref,
1455
+ gap_regexp,
1456
+ space_regexp,
1457
+ forward_frameshift_regexp,
1458
+ reverse_frameshift_regexp)
1459
+ case ref_gaps
1460
+ when 3
1461
+ # both ref and tgt are gaps; the site is ignored
1462
+ when 2, 1
1463
+ # forward frameshift inserted: the non-gap letters of the partial codon are added to the frameshift count
1464
+ ref_fs += (3 - ref_gaps)
1465
+ when 0
1466
+ cur = __push_code_to_data(cur, data, :D, 1)
1467
+ else
1468
+ raise 'Bug: should not reach here'
1469
+ end
1470
+ if ref_fs < 0 then
1471
+ cur = __push_code_to_data(cur, data, :R, -ref_fs)
1472
+ elsif ref_fs > 0 then
1473
+ cur = __push_code_to_data(cur, data, :F, ref_fs)
1474
+ end
1475
+ end #len.times
1476
+ elsif len = sc_tgt.skip(re_one) then
1477
+ # any other target character is one residue (always 1-letter)
1478
+ ref_gaps, ref_fs = __scan_codon(sc_ref,
1479
+ gap_regexp,
1480
+ space_regexp,
1481
+ forward_frameshift_regexp,
1482
+ reverse_frameshift_regexp)
1483
+ case ref_gaps
1484
+ when 3
1485
+ cur = __push_code_to_data(cur, data, :I, 1)
1486
+ when 2, 1, 0
1487
+ # reverse frameshift inserted when gaps exist (the gap count is subtracted from the frameshift count)
1488
+ ref_fs -= ref_gaps
1489
+ # normal site
1490
+ cur = __push_code_to_data(cur, data, :M, 1)
1491
+ else
1492
+ raise 'Bug: should not reach here'
1493
+ end
1494
+ if ref_fs < 0 then
1495
+ cur = __push_code_to_data(cur, data, :R, -ref_fs)
1496
+ elsif ref_fs > 0 then
1497
+ cur = __push_code_to_data(cur, data, :F, ref_fs)
1498
+ end
1499
+ else
1500
+ raise 'Bug: should not reach here'
1501
+ end
1502
+ end #while
1503
+
1504
+ if sc_ref.rest_size > 0 then
1505
+ rest = sc_ref.scan(/.*/mn)
1506
+ rest.gsub!(space_regexp, '')
1507
+ rest.gsub!(forward_frameshift_regexp, '')
1508
+ rest.gsub!(reverse_frameshift_regexp, '')
1509
+ rest.gsub!(gap_regexp, '')
1510
+ len = rest.length.div(3)
1511
+ cur = __push_code_to_data(cur, data, :D, len) if len > 0
1512
+ len = rest.length % 3
1513
+ cur = __push_code_to_data(cur, data, :F, len) if len > 0
1514
+ end
1515
+
1516
+ @data = data
1517
+ self
1518
+ end
1519
+ private :__initialize_from_sequences_na_aa
1521
+
1522
+ # Creates a new Gap object from given sequence alignment.
1523
+ #
1524
+ # Note that sites of which both reference and target are gaps
1525
+ # are silently removed.
1526
+ #
1527
+ # For incorrect alignments that break 3:1 rule,
1528
+ # gap positions will be moved inside codons,
1529
+ # unwanted gaps will be removed, and
1530
+ # some forward or reverse frameshift will be inserted.
1531
+ #
1532
+ # For example,
1533
+ # atgg-taagac-att
1534
+ # M V K - I
1535
+ # is treated as:
1536
+ # atggt<aagacatt
1537
+ # M V K >>I
1538
+ #
1539
+ # Incorrect combination of frameshift with frameshift or gap
1540
+ # may cause undefined behavior.
1541
+ #
1542
+ # Forward frameshifts are recommended to be indicated in the
1543
+ # target sequence.
1544
+ # Reverse frameshifts can be indicated in the reference sequence
1545
+ # or the target sequence.
1546
+ #
1547
+ # Priority of regular expressions:
1548
+ # space > forward/reverse frameshift > gap
1549
+ #
1550
+ # ---
1551
+ # *Arguments*:
1552
+ # * _reference_: reference sequence (nucleotide sequence)
1553
+ # * _target_: target sequence (amino acid sequence)
1554
+ # * <I>gap_regexp</I>: regexp to identify gap
1555
+ # * <I>space_regexp</I>: regexp to identify space character which is completely ignored
1556
+ # * <I>forward_frameshift_regexp</I>: regexp to identify forward frameshift
1557
+ # * <I>reverse_frameshift_regexp</I>: regexp to identify reverse frameshift
1558
+ def self.new_from_sequences_na_aa(reference, target,
1559
+ gap_regexp = /[^a-zA-Z]/,
1560
+ space_regexp = /\s/,
1561
+ forward_frameshift_regexp = /\>/,
1562
+ reverse_frameshift_regexp = /\</)
1563
+ gap = self.new
1564
+ gap.instance_eval {
1565
+ __initialize_from_sequences_na_aa(reference, target,
1566
+ gap_regexp,
1567
+ space_regexp,
1568
+ forward_frameshift_regexp,
1569
+ reverse_frameshift_regexp)
1570
+ }
1571
+ gap
1572
+ end
1573
+
1574
+ # string representation
1575
+ def to_s
1576
+ @data.collect { |x| x.to_s }.join(" ")
1577
+ end
1578
+
1579
+ # Internal data. Users must not use it.
1580
+ attr_reader :data
1581
+ # @data can be read by other Gap instances
1582
+ protected :data
1583
+
1584
+ # If self == other, returns true.
1585
+ # otherwise, returns false.
1586
+ def ==(other)
1587
+ if other.class == self.class and
1588
+ @data == other.data then
1589
+ true
1590
+ else
1591
+ false
1592
+ end
1593
+ end
1594
+
1595
+ # duplicates sequences
1596
+ def dup_seqs(*arg)
1597
+ arg.collect do |s|
1598
+ begin
1599
+ s = s.seq
1600
+ rescue NoMethodError
1601
+ end
1602
+ s.dup
1603
+ end
1604
+ end
1605
+ private :dup_seqs
1606
+
1607
+ # (private method)
1608
+ # Inserts gaps into the sequences according to the gap rule stored in the object.
1609
+ def __process_sequences(s_ref, s_tgt,
1610
+ ref_gap, tgt_gap,
1611
+ ref_increment, tgt_increment,
1612
+ forward_frameshift,
1613
+ reverse_frameshift)
1614
+ p_ref = 0
1615
+ p_tgt = 0
1616
+ @data.each do |c|
1617
+ #$stderr.puts c.inspect
1618
+ #$stderr.puts "p_ref=#{p_ref} s_ref=#{s_ref.inspect}"
1619
+ #$stderr.puts "p_tgt=#{p_tgt} s_tgt=#{s_tgt.inspect}"
1620
+ case c.code
1621
+ when :M # match
1622
+ p_ref += c.length * ref_increment
1623
+ p_tgt += c.length * tgt_increment
1624
+ when :I # insert a gap into the reference sequence
1625
+ begin
1626
+ s_ref[p_ref, 0] = ref_gap * c.length
1627
+ rescue IndexError
1628
+ raise 'reference sequence too short'
1629
+ end
1630
+ p_ref += c.length * ref_increment
1631
+ p_tgt += c.length * tgt_increment
1632
+ when :D # insert a gap into the target (delete from reference)
1633
+ begin
1634
+ s_tgt[p_tgt, 0] = tgt_gap * c.length
1635
+ rescue IndexError
1636
+ raise 'target sequence too short'
1637
+ end
1638
+ p_ref += c.length * ref_increment
1639
+ p_tgt += c.length * tgt_increment
1640
+ when :F # frameshift forward in the reference sequence
1641
+ begin
1642
+ s_tgt[p_tgt, 0] = forward_frameshift * c.length
1643
+ rescue IndexError
1644
+ raise 'target sequence too short'
1645
+ end
1646
+ p_ref += c.length
1647
+ p_tgt += c.length
1648
+ when :R # frameshift reverse in the reference sequence
1649
+ p_rev_frm = p_ref - c.length
1650
+ if p_rev_frm < 0 then
1651
+ raise 'too short reference sequence, or too many reverse frameshifts'
1652
+ end
1653
+ begin
1654
+ s_ref[p_rev_frm, 0] = reverse_frameshift * c.length
1655
+ rescue IndexError
1656
+ raise 'reference sequence too short'
1657
+ end
1658
+
1659
+ else
1660
+ warn "ignored #{c.to_s.inspect}" if $VERBOSE
1661
+ end
1662
+ end
1663
+
1664
+ if s_ref.length < p_ref then
1665
+ raise 'reference sequence too short'
1666
+ end
1667
+ if s_tgt.length < p_tgt then
1668
+ raise 'target sequence too short'
1669
+ end
1670
+ return s_ref, s_tgt
1671
+ end
1672
+ private :__process_sequences
1673
+
1674
+ # Processes nucleotide sequences and
1675
+ # returns gapped sequences as an array of sequences.
1676
+ #
1677
+ # Note for forward/reverse frameshift:
1678
+ # Forward/Reverse_frameshift is simply treated as
1679
+ # gap insertion to the target/reference sequence.
1680
+ #
1681
+ # ---
1682
+ # *Arguments*:
1683
+ # * _reference_: reference sequence (nucleotide sequence)
1684
+ # * _target_: target sequence (nucleotide sequence)
1685
+ # * <I>gap_char</I>: gap character
1686
+ def process_sequences_na(reference, target, gap_char = '-')
1687
+ s_ref, s_tgt = dup_seqs(reference, target)
1688
+
1689
+ s_ref, s_tgt = __process_sequences(s_ref, s_tgt,
1690
+ gap_char, gap_char,
1691
+ 1, 1,
1692
+ gap_char, gap_char)
1693
+
1694
+ if $VERBOSE and s_ref.length != s_tgt.length then
1695
+ warn "returned sequences not equal length"
1696
+ end
1697
+ return s_ref, s_tgt
1698
+ end
1699
+
1700
+ # Processes sequences and
1701
+ # returns gapped sequences as an array of sequences.
1702
+ # reference must be a nucleotide sequence, and
1703
+ # target must be an amino acid sequence.
1704
+ #
1705
+ # Note for reverse frameshift:
1706
+ # Reverse_frameshift characters are inserted in the
1707
+ # reference sequence.
1708
+ # For example, alignment of "Gap=M3 R1 M2" is:
1709
+ # atgaagat<aatgtc
1710
+ # M K I N V
1711
+ # Alignment of "Gap=M3 R3 M3" is:
1712
+ # atgaag<<<attaatgtc
1713
+ # M K I I N V
1714
+ #
1715
+ # ---
1716
+ # *Arguments*:
1717
+ # * _reference_: reference sequence (nucleotide sequence)
1718
+ # * _target_: target sequence (amino acid sequence)
1719
+ # * <I>gap_char</I>: gap character
1720
+ # * <I>space_char</I>: space character inserted to amino sequence for matching na-aa alignment
1721
+ # * <I>forward_frameshift</I>: forward frameshift character
1722
+ # * <I>reverse_frameshift</I>: reverse frameshift character
1723
+ def process_sequences_na_aa(reference, target,
1724
+ gap_char = '-',
1725
+ space_char = ' ',
1726
+ forward_frameshift = '>',
1727
+ reverse_frameshift = '<')
1728
+ s_ref, s_tgt = dup_seqs(reference, target)
1729
+ s_tgt = s_tgt.gsub(/./, "\\0#{space_char}#{space_char}")
1730
+ ref_increment = 3
1731
+ tgt_increment = 1 + space_char.length * 2
1732
+ ref_gap = gap_char * 3
1733
+ tgt_gap = "#{gap_char}#{space_char}#{space_char}"
1734
+ return __process_sequences(s_ref, s_tgt,
1735
+ ref_gap, tgt_gap,
1736
+ ref_increment, tgt_increment,
1737
+ forward_frameshift,
1738
+ reverse_frameshift)
1739
+ end
1740
+ end #class Gap
1741
+
1742
+ private
1743
+ def parse_attributes(string)
1744
+ return [] if !string or string == '.'
1745
+ attr_pairs = []
1746
+ string.split(';').each do |pair|
1747
+ key, value = pair.split('=', 2)
1748
+ key = unescape(key)
1749
+ values = value.to_s.split(',')
1750
+ case key
1751
+ when 'Target'
1752
+ values.collect! { |v| Target.parse(v) }
1753
+ when 'Gap'
1754
+ values.collect! { |v| Gap.parse(v) }
1755
+ else
1756
+ values.collect! { |v| unescape(v) }
1757
+ end
1758
+ attr_pairs.concat values.collect { |v| [ key, v ] }
1759
+ end
1760
+ return attr_pairs
1761
+ end # method parse_attributes
1762
+
1763
+ # Return the attributes as a string as it appears at the end of
1764
+ # a GFF3 line
1765
+ def attributes_to_s(attr)
1766
+ return '.' if !attr or attr.empty?
1767
+ keys = []
1768
+ hash = {}
1769
+ attr.each do |pair|
1770
+ key = pair[0]
1771
+ val = pair[1]
1772
+ keys.push key unless hash[key]
1773
+ hash[key] ||= []
1774
+ hash[key].push val
1775
+ end
1776
+ keys.collect do |key|
1777
+ values = hash[key]
1778
+ val = values.collect do |v|
1779
+ if v.kind_of?(Target) then
1780
+ v.to_s
1781
+ else
1782
+ escape_attribute(v.to_s)
1783
+ end
1784
+ end.join(',')
1785
+ "#{escape_attribute(key)}=#{val}"
1786
+ end.join(';')
1787
+ end
1788
+
1789
+ end # class GFF3::Record
1790
+
1791
+ # This is a dummy record corresponding to the "###" metadata.
1792
+ class RecordBoundary < GFF3::Record
1793
+ def initialize(*arg)
1794
+ super(*arg)
1795
+ self.freeze
1796
+ end
1797
+
1798
+ def to_s
1799
+ "###\n"
1800
+ end
1801
+ end #class RecordBoundary
1802
+
1803
+ # stores GFF3 MetaData
1804
+ MetaData = GFF2::MetaData
1805
+
1806
+ # parses metadata
1807
+ def parse_metadata(directive, line)
1808
+ case directive
1809
+ when 'gff-version'
1810
+ @gff_version ||= line.split(/\s+/)[1]
1811
+ when 'FASTA'
1812
+ @in_fasta = true
1813
+ when 'sequence-region'
1814
+ @sequence_regions.push SequenceRegion.parse(line)
1815
+ when '#' # "###" directive
1816
+ @records.push RecordBoundary.new
1817
+ else
1818
+ @metadata.push MetaData.parse(line)
1819
+ end
1820
+ true
1821
+ end
1822
+ private :parse_metadata
1823
+
1824
+ end #class GFF3
1825
+
1826
+ end # class GFF
155
1827
 
156
1828
  end # module Bio
157
1829