bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/genbank/genbank_to_biosequence.rb - Bio::GenBank to Bio::Sequence adapter module
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
# $Id:$
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require 'bio/sequence'
|
|
12
|
+
require 'bio/sequence/adapter'
|
|
13
|
+
|
|
14
|
+
# Internal use only. Normal users should not use this module.
|
|
15
|
+
#
|
|
16
|
+
# Bio::GenBank to Bio::Sequence adapter module.
|
|
17
|
+
# It is internally used in Bio::GenBank#to_biosequence.
|
|
18
|
+
#
|
|
19
|
+
module Bio::Sequence::Adapter::GenBank
|
|
20
|
+
|
|
21
|
+
extend Bio::Sequence::Adapter
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
def_biosequence_adapter :seq
|
|
26
|
+
|
|
27
|
+
def_biosequence_adapter :id_namespace do |orig|
|
|
28
|
+
if /\_/ =~ orig.accession.to_s then
|
|
29
|
+
'RefSeq'
|
|
30
|
+
else
|
|
31
|
+
'GenBank'
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def_biosequence_adapter :entry_id
|
|
36
|
+
|
|
37
|
+
def_biosequence_adapter :primary_accession, :accession
|
|
38
|
+
|
|
39
|
+
def_biosequence_adapter :secondary_accessions do |orig|
|
|
40
|
+
orig.accessions - [ orig.accession ]
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def_biosequence_adapter :other_seqids do |orig|
|
|
44
|
+
if /GI\:(.+)/ =~ orig.gi.to_s then
|
|
45
|
+
[ Bio::Sequence::DBLink.new('GI', $1) ]
|
|
46
|
+
else
|
|
47
|
+
nil
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def_biosequence_adapter :molecule_type, :natype
|
|
52
|
+
|
|
53
|
+
def_biosequence_adapter :division
|
|
54
|
+
|
|
55
|
+
def_biosequence_adapter :topology, :circular
|
|
56
|
+
|
|
57
|
+
def_biosequence_adapter :strandedness
|
|
58
|
+
|
|
59
|
+
def_biosequence_adapter :sequence_version, :version
|
|
60
|
+
|
|
61
|
+
#--
|
|
62
|
+
#sequence.date_created = nil #????
|
|
63
|
+
#++
|
|
64
|
+
|
|
65
|
+
def_biosequence_adapter :date_modified
|
|
66
|
+
|
|
67
|
+
def_biosequence_adapter :definition
|
|
68
|
+
|
|
69
|
+
def_biosequence_adapter :keywords
|
|
70
|
+
|
|
71
|
+
def_biosequence_adapter :species, :organism
|
|
72
|
+
|
|
73
|
+
def_biosequence_adapter :classification
|
|
74
|
+
|
|
75
|
+
#--
|
|
76
|
+
#sequence.organelle = nil # yet unsupported
|
|
77
|
+
#++
|
|
78
|
+
|
|
79
|
+
def_biosequence_adapter :comments, :comment
|
|
80
|
+
|
|
81
|
+
def_biosequence_adapter :references
|
|
82
|
+
|
|
83
|
+
def_biosequence_adapter :features
|
|
84
|
+
|
|
85
|
+
end #module Bio::Sequence::Adapter::GenBank
|
|
86
|
+
|
data/lib/bio/db/gff.rb
CHANGED
|
@@ -4,154 +4,1826 @@
|
|
|
4
4
|
# Copyright:: Copyright (C) 2003, 2005
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
|
6
6
|
# 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
|
|
7
|
+
# 2008 Naohisa Goto <ng@bioruby.org>
|
|
7
8
|
# License:: The Ruby License
|
|
8
9
|
#
|
|
9
|
-
# $Id
|
|
10
|
+
# $Id:$
|
|
10
11
|
#
|
|
12
|
+
require 'uri'
|
|
13
|
+
require 'strscan'
|
|
14
|
+
require 'enumerator'
|
|
15
|
+
require 'bio/db/fasta'
|
|
11
16
|
|
|
12
17
|
module Bio
|
|
13
|
-
# == DESCRIPTION
|
|
14
|
-
# The Bio::GFF and Bio::GFF::Record classes describe data contained in a
|
|
15
|
-
# GFF-formatted file. For information on the GFF format, see
|
|
16
|
-
# http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
|
|
17
|
-
# delimited format, including
|
|
18
|
-
# * seqname
|
|
19
|
-
# * source
|
|
20
|
-
# * feature
|
|
21
|
-
# * start
|
|
22
|
-
# * end
|
|
23
|
-
# * score
|
|
24
|
-
# * strand
|
|
25
|
-
# * frame
|
|
26
|
-
# * attributes (optional)
|
|
27
|
-
#
|
|
28
|
-
# For example:
|
|
29
|
-
# SEQ1 EMBL atg 103 105 . + 0
|
|
30
|
-
# SEQ1 EMBL exon 103 172 . + 0
|
|
31
|
-
# SEQ1 EMBL splice5 172 173 . + .
|
|
32
|
-
# SEQ1 netgene splice5 172 173 0.94 + .
|
|
33
|
-
# SEQ1 genie sp5-20 163 182 2.3 + .
|
|
34
|
-
# SEQ1 genie sp5-10 168 177 2.1 + .
|
|
35
|
-
# SEQ1 grail ATG 17 19 2.1 - 0
|
|
36
|
-
#
|
|
37
|
-
# The Bio::GFF object is a container for Bio::GFF::Record objects, each
|
|
38
|
-
# representing a single line in the GFF file.
|
|
39
|
-
class GFF
|
|
40
|
-
# Creates a Bio::GFF object by building a collection of Bio::GFF::Record
|
|
41
|
-
# objects.
|
|
18
|
+
# == DESCRIPTION
|
|
19
|
+
# The Bio::GFF and Bio::GFF::Record classes describe data contained in a
|
|
20
|
+
# GFF-formatted file. For information on the GFF format, see
|
|
21
|
+
# http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
|
|
22
|
+
# delimited format, including
|
|
23
|
+
# * seqname
|
|
24
|
+
# * source
|
|
25
|
+
# * feature
|
|
26
|
+
# * start
|
|
27
|
+
# * end
|
|
28
|
+
# * score
|
|
29
|
+
# * strand
|
|
30
|
+
# * frame
|
|
31
|
+
# * attributes (optional)
|
|
42
32
|
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
48
|
-
#
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
33
|
+
# For example:
|
|
34
|
+
# SEQ1 EMBL atg 103 105 . + 0
|
|
35
|
+
# SEQ1 EMBL exon 103 172 . + 0
|
|
36
|
+
# SEQ1 EMBL splice5 172 173 . + .
|
|
37
|
+
# SEQ1 netgene splice5 172 173 0.94 + .
|
|
38
|
+
# SEQ1 genie sp5-20 163 182 2.3 + .
|
|
39
|
+
# SEQ1 genie sp5-10 168 177 2.1 + .
|
|
40
|
+
# SEQ1 grail ATG 17 19 2.1 - 0
|
|
41
|
+
#
|
|
42
|
+
# The Bio::GFF object is a container for Bio::GFF::Record objects, each
|
|
43
|
+
# representing a single line in the GFF file.
|
|
44
|
+
class GFF
|
|
45
|
+
# Creates a Bio::GFF object by building a collection of Bio::GFF::Record
|
|
46
|
+
# objects.
|
|
47
|
+
#
|
|
48
|
+
# Create a Bio::GFF object the hard way
|
|
49
|
+
# this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
|
|
50
|
+
# this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
|
|
51
|
+
# this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
|
|
52
|
+
# this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
|
|
53
|
+
# this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
|
|
54
|
+
# this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
|
|
55
|
+
# this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
|
|
56
|
+
# p Bio::GFF.new(this_gff)
|
|
57
|
+
#
|
|
58
|
+
# or create one based on a GFF-formatted file:
|
|
59
|
+
# p Bio::GFF.new(File.open('my_data.gff'))
|
|
60
|
+
# ---
|
|
61
|
+
# *Arguments*:
|
|
62
|
+
# * _str_: string in GFF format
|
|
63
|
+
# *Returns*:: Bio::GFF object
|
|
64
|
+
def initialize(str = '')
|
|
65
|
+
@records = Array.new
|
|
66
|
+
str.each_line do |line|
|
|
67
|
+
@records << Record.new(line)
|
|
68
|
+
end
|
|
63
69
|
end
|
|
64
|
-
end
|
|
65
70
|
|
|
66
|
-
|
|
67
|
-
|
|
71
|
+
# An array of Bio::GFF::Record objects.
|
|
72
|
+
attr_accessor :records
|
|
68
73
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
74
|
+
# Represents a single line of a GFF-formatted file. See Bio::GFF for more
|
|
75
|
+
# information.
|
|
76
|
+
class Record
|
|
72
77
|
|
|
73
|
-
|
|
74
|
-
|
|
78
|
+
# Name of the reference sequence
|
|
79
|
+
attr_accessor :seqname
|
|
75
80
|
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
# Name of the source of the feature (e.g. program that did prediction)
|
|
82
|
+
attr_accessor :source
|
|
78
83
|
|
|
79
|
-
|
|
80
|
-
|
|
84
|
+
# Name of the feature
|
|
85
|
+
attr_accessor :feature
|
|
81
86
|
|
|
82
|
-
|
|
83
|
-
|
|
87
|
+
# Start position of feature on reference sequence
|
|
88
|
+
attr_accessor :start
|
|
84
89
|
|
|
85
|
-
|
|
86
|
-
|
|
90
|
+
# End position of feature on reference sequence
|
|
91
|
+
attr_accessor :end
|
|
87
92
|
|
|
88
|
-
|
|
89
|
-
|
|
93
|
+
# Score of annotation (e.g. e-value for BLAST search)
|
|
94
|
+
attr_accessor :score
|
|
90
95
|
|
|
91
|
-
|
|
92
|
-
|
|
96
|
+
# Strand that feature is located on
|
|
97
|
+
attr_accessor :strand
|
|
93
98
|
|
|
94
|
-
|
|
95
|
-
|
|
99
|
+
# For features of type 'exon': indicates where feature begins in the reading frame
|
|
100
|
+
attr_accessor :frame
|
|
96
101
|
|
|
97
|
-
|
|
98
|
-
|
|
102
|
+
# List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
|
|
103
|
+
attr_accessor :attributes
|
|
99
104
|
|
|
100
|
-
|
|
101
|
-
|
|
105
|
+
# Comments for the GFF record
|
|
106
|
+
attr_accessor :comment
|
|
102
107
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def initialize(str)
|
|
109
|
-
@comments = str.chomp[/#.*/]
|
|
110
|
-
return if /^#/.match(str)
|
|
111
|
-
@seqname, @source, @feature, @start, @end, @score, @strand, @frame,
|
|
112
|
-
attributes, = str.chomp.split("\t")
|
|
113
|
-
@attributes = parse_attributes(attributes) if attributes
|
|
114
|
-
end
|
|
108
|
+
# "comments" is deprecated. Instead, use "comment".
|
|
109
|
+
def comments
|
|
110
|
+
#warn "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"." if $VERBOSE
|
|
111
|
+
self.comment
|
|
112
|
+
end
|
|
115
113
|
|
|
116
|
-
|
|
114
|
+
# "comments=" is deprecated. Instead, use "comment=".
|
|
115
|
+
def comments=(str)
|
|
116
|
+
#warn "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"." if $VERBOSE
|
|
117
|
+
self.comment = str
|
|
118
|
+
end
|
|
117
119
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
120
|
+
# Creates a Bio::GFF::Record object. Is typically not called directly, but
|
|
121
|
+
# is called automatically when creating a Bio::GFF object.
|
|
122
|
+
# ---
|
|
123
|
+
# *Arguments*:
|
|
124
|
+
# * _str_: a tab-delimited line in GFF format
|
|
125
|
+
def initialize(str)
|
|
126
|
+
@comment = str.chomp[/#.*/]
|
|
127
|
+
return if /^#/.match(str)
|
|
128
|
+
@seqname, @source, @feature, @start, @end, @score, @strand, @frame,
|
|
129
|
+
attributes, = str.chomp.split("\t")
|
|
130
|
+
@attributes = parse_attributes(attributes) if attributes
|
|
123
131
|
end
|
|
124
|
-
return hash
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
132
|
|
|
128
|
-
|
|
129
|
-
# Represents version 2 of GFF specification. Is completely implemented by the
|
|
130
|
-
# Bio::GFF class.
|
|
131
|
-
class GFF2 < GFF
|
|
132
|
-
VERSION = 2
|
|
133
|
-
end
|
|
133
|
+
private
|
|
134
134
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
# Bio::GFF class. For more information on version GFF3, see
|
|
138
|
-
# http://flybase.bio.indiana.edu/annot/gff3.html
|
|
139
|
-
class GFF3 < GFF
|
|
140
|
-
VERSION = 3
|
|
135
|
+
def parse_attributes(attributes)
|
|
136
|
+
hash = Hash.new
|
|
141
137
|
|
|
142
|
-
|
|
138
|
+
sc = StringScanner.new(attributes)
|
|
139
|
+
attrs = []
|
|
140
|
+
token = ''
|
|
141
|
+
while !sc.eos?
|
|
142
|
+
if sc.scan(/[^\\\;\"]+/) then
|
|
143
|
+
token.concat sc.matched
|
|
144
|
+
elsif sc.scan(/\;/) then
|
|
145
|
+
attrs.push token unless token.empty?
|
|
146
|
+
token = ''
|
|
147
|
+
elsif sc.scan(/\"/) then
|
|
148
|
+
origtext = sc.matched
|
|
149
|
+
while !sc.eos?
|
|
150
|
+
if sc.scan(/[^\\\"]+/) then
|
|
151
|
+
origtext.concat sc.matched
|
|
152
|
+
elsif sc.scan(/\"/) then
|
|
153
|
+
origtext.concat sc.matched
|
|
154
|
+
break
|
|
155
|
+
elsif sc.scan(/\\([\"\\])/) then
|
|
156
|
+
origtext.concat sc.matched
|
|
157
|
+
elsif sc.scan(/\\/) then
|
|
158
|
+
origtext.concat sc.matched
|
|
159
|
+
else
|
|
160
|
+
raise 'Bug: should not reach here'
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
token.concat origtext
|
|
164
|
+
elsif sc.scan(/\\\;/) then
|
|
165
|
+
token.concat sc.matched
|
|
166
|
+
elsif sc.scan(/\\/) then
|
|
167
|
+
token.concat sc.matched
|
|
168
|
+
else
|
|
169
|
+
raise 'Bug: should not reach here'
|
|
170
|
+
end #if
|
|
171
|
+
end #while
|
|
172
|
+
attrs.push token unless token.empty?
|
|
143
173
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
174
|
+
attrs.each do |x|
|
|
175
|
+
key, value = x.split(' ', 2)
|
|
176
|
+
key.strip!
|
|
177
|
+
value.strip! if value
|
|
178
|
+
hash[key] = value
|
|
179
|
+
end
|
|
180
|
+
hash
|
|
149
181
|
end
|
|
150
|
-
return hash
|
|
151
|
-
end
|
|
152
|
-
end
|
|
153
182
|
|
|
154
|
-
end #
|
|
183
|
+
end #Class Record
|
|
184
|
+
|
|
185
|
+
# = DESCRIPTION
|
|
186
|
+
# Represents version 2 of GFF specification.
|
|
187
|
+
# Its behavior is somewhat different from Bio::GFF,
|
|
188
|
+
# especially for attributes.
|
|
189
|
+
#
|
|
190
|
+
class GFF2 < GFF
|
|
191
|
+
VERSION = 2
|
|
192
|
+
|
|
193
|
+
# string representation of the whole entry.
|
|
194
|
+
def to_s
|
|
195
|
+
ver = @gff_version || VERSION.to_s
|
|
196
|
+
ver = ver.gsub(/[\r\n]+/, ' ')
|
|
197
|
+
([ "##gff-version #{ver}\n" ] +
|
|
198
|
+
@metadata.collect { |m| m.to_s } +
|
|
199
|
+
@records.collect{ |r| r.to_s }).join('')
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Private methods for GFF2 escaping characters.
|
|
203
|
+
# Internal only. Users should not use this module directly.
|
|
204
|
+
module Escape
|
|
205
|
+
# unsafe characters to be escaped
|
|
206
|
+
UNSAFE_GFF2 = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] \x80-\xfd><;=,%^&\|`]/n
|
|
207
|
+
|
|
208
|
+
# GFF2 standard identifier
|
|
209
|
+
IDENTIFIER_GFF2 = /\A[A-Za-z][A-Za-z0-9_]*\z/n
|
|
210
|
+
|
|
211
|
+
# GFF2 numeric value
|
|
212
|
+
NUMERIC_GFF2 = /\A[-+]?([0-9]+|[0-9]*\.[0-9]*)([eE][+-]?[0-9]+)?\z/n
|
|
213
|
+
|
|
214
|
+
# List of 1-letter special backslash codes.
|
|
215
|
+
# Letters other than those listed here mean the same as
|
|
216
|
+
# the letter without a backslash, except for "x" and digits.
|
|
217
|
+
# (Note that \u (unicode) is not supported.)
|
|
218
|
+
BACKSLASH = {
|
|
219
|
+
't' => "\t",
|
|
220
|
+
'n' => "\n",
|
|
221
|
+
'r' => "\r",
|
|
222
|
+
'f' => "\f",
|
|
223
|
+
'b' => "\b",
|
|
224
|
+
'a' => "\a",
|
|
225
|
+
'e' => "\e",
|
|
226
|
+
'v' => "\v",
|
|
227
|
+
# 's' => " ",
|
|
228
|
+
}.freeze
|
|
229
|
+
|
|
230
|
+
# inverted hash of BACKSLASH
|
|
231
|
+
CHAR2BACKSLASH = BACKSLASH.invert.freeze
|
|
232
|
+
|
|
233
|
+
# inverted hash of BACKSLASH, including double quote and backslash
|
|
234
|
+
CHAR2BACKSLASH_EXTENDED =
|
|
235
|
+
CHAR2BACKSLASH.merge({ '"' => '"', "\\" => "\\" }).freeze
|
|
236
|
+
|
|
237
|
+
# prohibited characters in GFF2 columns
|
|
238
|
+
PROHIBITED_GFF2_COLUMNS = /[\t\r\n\x00-\x1f\x7f\xfe\xff]/
|
|
239
|
+
|
|
240
|
+
# prohibited characters in GFF2 attribute tags
|
|
241
|
+
PROHIBITED_GFF2_TAGS = /[\s\"\;\t\r\n\x00-\x1f\x7f\xfe\xff]/
|
|
242
|
+
|
|
243
|
+
private
|
|
244
|
+
# (private) escapes GFF2 free text string
|
|
245
|
+
def escape_gff2_freetext(str)
|
|
246
|
+
'"' + str.gsub(UNSAFE_GFF2) do |x|
|
|
247
|
+
"\\" + (CHAR2BACKSLASH_EXTENDED[x] || char2octal(x))
|
|
248
|
+
end + '"'
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
# (private) "x" => "\\oXXX"
|
|
252
|
+
# "x" must be a letter.
|
|
253
|
+
# If "x" consists of two or more bytes, their octal codes are joined with "\\".
|
|
254
|
+
def char2octal(x)
|
|
255
|
+
x.enum_for(:each_byte).collect { |y|
|
|
256
|
+
sprintf("%03o", y) }.join("\\")
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# (private) escapes GFF2 attribute value string
|
|
260
|
+
def escape_gff2_attribute_value(str)
|
|
261
|
+
freetext?(str) ? escape_gff2_freetext(str) : str
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# (private) check if the given string is a free text to be quoted
|
|
265
|
+
# by double quotes.
|
|
266
|
+
def freetext?(str)
|
|
267
|
+
if IDENTIFIER_GFF2 =~ str or
|
|
268
|
+
NUMERIC_GFF2 =~ str then
|
|
269
|
+
false
|
|
270
|
+
else
|
|
271
|
+
true
|
|
272
|
+
end
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
# (private) escapes normal columns in GFF2
|
|
276
|
+
def gff2_column_to_s(str)
|
|
277
|
+
str = str.to_s
|
|
278
|
+
str = str.empty? ? '.' : str
|
|
279
|
+
str = str.gsub(PROHIBITED_GFF2_COLUMNS) do |x|
|
|
280
|
+
"\\" + (CHAR2BACKSLASH[x] || char2octal(x))
|
|
281
|
+
end
|
|
282
|
+
if str[0, 1] == '#' then
|
|
283
|
+
str[0, 1] = "\\043"
|
|
284
|
+
end
|
|
285
|
+
str
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
# (private) escapes GFF2 attribute tag string
|
|
289
|
+
def escape_gff2_attribute_tag(str)
|
|
290
|
+
str = str.to_s
|
|
291
|
+
str = str.empty? ? '.' : str
|
|
292
|
+
str = str.gsub(PROHIBITED_GFF2_TAGS) do |x|
|
|
293
|
+
"\\" + (CHAR2BACKSLASH[x] || char2octal(x))
|
|
294
|
+
end
|
|
295
|
+
if str[0, 1] == '#' then
|
|
296
|
+
str[0, 1] = "\\043"
|
|
297
|
+
end
|
|
298
|
+
str
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
# (private) dummy method, will be redefined in GFF3.
|
|
302
|
+
def unescape(str)
|
|
303
|
+
str
|
|
304
|
+
end
|
|
305
|
+
end #module Escape
|
|
306
|
+
|
|
307
|
+
# Stores GFF2 record.
|
|
308
|
+
class Record < GFF::Record
|
|
309
|
+
|
|
310
|
+
include Escape
|
|
311
|
+
|
|
312
|
+
# Stores GFF2 attribute's value.
|
|
313
|
+
class Value
|
|
314
|
+
|
|
315
|
+
include Escape
|
|
316
|
+
|
|
317
|
+
# Creates a new Value object.
|
|
318
|
+
# Note that the given array _values_ is directly stored in
|
|
319
|
+
# the object.
|
|
320
|
+
#
|
|
321
|
+
# ---
|
|
322
|
+
# *Arguments*:
|
|
323
|
+
# * (optional) _values_: Array containing String objects.
|
|
324
|
+
# *Returns*:: Value object.
|
|
325
|
+
def initialize(values = [])
|
|
326
|
+
@values = values
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
# Returns string representation of this Value object.
|
|
330
|
+
# ---
|
|
331
|
+
# *Returns*:: String
|
|
332
|
+
def to_s
|
|
333
|
+
@values.collect do |str|
|
|
334
|
+
escape_gff2_attribute_value(str)
|
|
335
|
+
end.join(' ')
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
# Returns all values in this object.
|
|
339
|
+
#
|
|
340
|
+
# Note that modification of the returned array would affect
|
|
341
|
+
# original Value object.
|
|
342
|
+
# ---
|
|
343
|
+
# *Returns*:: Array
|
|
344
|
+
def values
|
|
345
|
+
@values
|
|
346
|
+
end
|
|
347
|
+
alias to_a values
|
|
348
|
+
|
|
349
|
+
# Returns true if other == self.
|
|
350
|
+
# Otherwise, returns false.
|
|
351
|
+
def ==(other)
|
|
352
|
+
return false unless other.kind_of?(self.class) or
|
|
353
|
+
self.kind_of?(other.class)
|
|
354
|
+
self.values == other.values rescue super(other)
|
|
355
|
+
end
|
|
356
|
+
end #class Value
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# Parses a GFF2-formatted line and returns a new
|
|
360
|
+
# Bio::GFF::GFF2::Record object.
|
|
361
|
+
def self.parse(str)
|
|
362
|
+
self.new.parse(str)
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
# Creates a Bio::GFF::GFF2::Record object.
|
|
366
|
+
# Is typically not called directly, but
|
|
367
|
+
# is called automatically when creating a Bio::GFF::GFF2 object.
|
|
368
|
+
#
|
|
369
|
+
# ---
|
|
370
|
+
# *Arguments*:
|
|
371
|
+
# * _str_: a tab-delimited line in GFF2 format
|
|
372
|
+
# *Arguments* (alternative form, when the column values are given separately):
|
|
373
|
+
# * _seqname_: seqname (String or nil)
|
|
374
|
+
# * _source_: source (String or nil)
|
|
375
|
+
# * _feature_: feature type (String)
|
|
376
|
+
# * _start_position_: start (Integer)
|
|
377
|
+
# * _end_position_: end (Integer)
|
|
378
|
+
# * _score_: score (Float or nil)
|
|
379
|
+
# * _strand_: strand (String or nil)
|
|
380
|
+
# * _frame_: frame (Integer or nil)
|
|
381
|
+
# * _attributes_: attributes (Array or nil)
|
|
382
|
+
def initialize(*arg)
  if arg.size == 1 then
    # Single argument: a tab-delimited GFF2 line.
    parse(arg[0])
  else
    # Column-wise form. The score is taken into a local variable like the
    # other numeric columns, so its conversion below does not depend on
    # the "score" reader method.
    @seqname, @source, @feature,
    start, endp, score, @strand, frame,
    @attributes = arg
    @start = start ? start.to_i : nil
    @end   = endp  ? endp.to_i  : nil
    @score = score ? score.to_f : nil
    @frame = frame ? frame.to_i : nil
  end
  # Attributes are always an array, even when none were given.
  @attributes ||= []
end
|
|
396
|
+
|
|
397
|
+
# Comment for the GFF record
|
|
398
|
+
attr_accessor :comment
|
|
399
|
+
|
|
400
|
+
# "comments" is deprecated. Instead, use "comment".
|
|
401
|
+
def comments
  # Emit the deprecation notice, then delegate to the current reader.
  notice = "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"."
  warn notice
  self.comment
end
|
|
405
|
+
|
|
406
|
+
# "comments=" is deprecated. Instead, use "comment=".
|
|
407
|
+
def comments=(str)
  # Emit the deprecation notice, then delegate to the current writer.
  notice = "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"."
  warn notice
  self.comment = str
end
|
|
411
|
+
|
|
412
|
+
# Parses a GFF2-formatted line and stores data from the string.
|
|
413
|
+
# Note that all existing data is wiped out.
|
|
414
|
+
def parse(string)
  # Existing data is wiped out, including a comment left over from a
  # previously parsed line.
  @comment = nil
  if /^\s*\#/ =~ string then
    # Whole line is a comment: no columns at all.
    @comment = string[/\#(.*)/, 1].chomp
    columns = []
  else
    columns = string.chomp.split("\t", 10)
    # A 10th column only carries a comment when it contains '#';
    # otherwise the pattern yields nil and there is nothing to store.
    trailing = columns[9] ? columns[9][/\#(.*)/, 1] : nil
    @comment = trailing.chomp if trailing
  end

  # Columns 1-8; a lone "." means "no value".
  @seqname, @source, @feature,
  start, endp, score, @strand, frame =
    columns[0, 8].collect { |x|
    str = unescape(x)
    str == '.' ? nil : str
  }
  @start = start ? start.to_i : nil
  @end   = endp  ? endp.to_i  : nil
  @score = score ? score.to_f : nil
  @frame = frame ? frame.to_i : nil

  # Column 9: attributes.
  @attributes = parse_attributes(columns[8])
end
|
|
436
|
+
|
|
437
|
+
# Returns true if the entry is empty except for comment.
|
|
438
|
+
# Otherwise, returns false.
|
|
439
|
+
def comment_only?
  # Any populated column means the record is more than a comment.
  columns = [ @seqname, @source, @feature, @start,
              @end, @score, @strand, @frame ]
  return false if columns.any?
  @attributes.empty? ? true : false
end
|
|
454
|
+
|
|
455
|
+
# Return the record as a GFF2 compatible string
|
|
456
|
+
def to_s
  # Comment text with line breaks flattened, or false when there is none.
  cmnt = false
  if @comment and !@comment.to_s.strip.empty? then
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{cmnt}\n" if self.comment_only? and cmnt

  # Columns 1-8 followed by the attribute column, tab-separated.
  fields = [ @seqname, @source, @feature, @start,
             @end, @score, @strand, @frame ].collect do |column|
    gff2_column_to_s(column)
  end
  fields.push attributes_to_s(@attributes)
  line = fields.join("\t")
  line + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
|
|
476
|
+
|
|
477
|
+
# Returns true if self == other. Otherwise, returns false.
|
|
478
|
+
def ==(other)
  # The inherited comparison wins whenever it already says "equal".
  inherited = super
  return inherited if inherited
  # Otherwise both objects must be of exactly the same class and agree
  # on every column.
  return false unless self.class == other.class
  return false unless self.seqname == other.seqname
  return false unless self.source == other.source
  return false unless self.feature == other.feature
  return false unless self.start == other.start
  return false unless self.end == other.end
  return false unless self.score == other.score
  return false unless self.strand == other.strand
  return false unless self.frame == other.frame
  return false unless self.attributes == other.attributes
  true
end
|
|
491
|
+
|
|
492
|
+
# Gets the attribute value for the given tag.
|
|
493
|
+
#
|
|
494
|
+
# Note that if two or more tag-value pairs with the same name found,
|
|
495
|
+
# only the first value is returned.
|
|
496
|
+
# ---
|
|
497
|
+
# *Arguments*:
|
|
498
|
+
# * (required) _tag_: String
|
|
499
|
+
# *Returns*:: String, Bio::GFF::GFF2::Record::Value object, or nil.
|
|
500
|
+
def get_attribute(tag)
  # Array#assoc finds the first pair whose tag equals the given one.
  pair = @attributes.assoc(tag)
  return nil unless pair
  pair[1]
end
alias attribute get_attribute
|
|
505
|
+
|
|
506
|
+
# Gets the attribute values for the given tag.
|
|
507
|
+
# This method always returns an array.
|
|
508
|
+
# ---
|
|
509
|
+
# *Arguments*:
|
|
510
|
+
# * (required) _tag_: String
|
|
511
|
+
# *Returns*:: Array containing String or \
|
|
512
|
+
# Bio::GFF::GFF2::Record::Value objects.
|
|
513
|
+
def get_attributes(tag)
  # Keep the pairs carrying this tag, then reduce each pair to its value.
  matching = @attributes.select { |pair| pair[0] == tag }
  matching.collect { |pair| pair[1] }
end
|
|
520
|
+
|
|
521
|
+
# Sets value for the given tag.
|
|
522
|
+
# If the tag exists, the value of the tag is replaced with _value_.
|
|
523
|
+
# Note that if two or more tag-value pairs with the same name found,
|
|
524
|
+
# only the first tag-value pair is replaced.
|
|
525
|
+
#
|
|
526
|
+
# If the tag does not exist, the tag-value pair is newly added.
|
|
527
|
+
# ---
|
|
528
|
+
# *Arguments*:
|
|
529
|
+
# * (required) _tag_: String
|
|
530
|
+
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
|
|
531
|
+
# *Returns*:: _value_
|
|
532
|
+
def set_attribute(tag, value)
  existing = @attributes.detect { |pair| pair[0] == tag }
  if existing then
    # Replace the value of the first pair carrying this tag.
    existing[1] = value
  else
    # Unknown tag: append a new pair, with a private copy of the tag.
    @attributes << [ String.new(tag), value ]
  end
  value
end
|
|
544
|
+
|
|
545
|
+
# Replaces values for the given tags with new values.
|
|
546
|
+
# Existing values for the tag are completely wiped out and
|
|
547
|
+
# replaced by new tag-value pairs.
|
|
548
|
+
# If the tag does not exist, the tag-value pairs are newly added.
|
|
549
|
+
#
|
|
550
|
+
# ---
|
|
551
|
+
# *Arguments*:
|
|
552
|
+
# * (required) _tag_: String
|
|
553
|
+
# * (required) _values_: String or Bio::GFF::GFF2::Record::Value objects.
|
|
554
|
+
# *Returns*:: _self_
|
|
555
|
+
def replace_attributes(tag, *values)
  filled = 0
  @attributes.reject! do |pair|
    # Pairs with other tags are always kept.
    next false unless pair[0] == tag
    # More existing pairs than new values: drop the surplus.
    next true if filled >= values.size
    # Reuse the existing slot for the next new value.
    pair[1] = values[filled]
    filled += 1
    false
  end
  # Remaining new values are appended as fresh pairs.
  values[filled..-1].each do |extra|
    @attributes.push [ String.new(tag), extra ]
  end
  self
end
|
|
575
|
+
|
|
576
|
+
# Adds a new tag-value pair.
|
|
577
|
+
# ---
|
|
578
|
+
# *Arguments*:
|
|
579
|
+
# * (required) _tag_: String
|
|
580
|
+
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
|
|
581
|
+
# *Returns*:: Array of all tag-value pairs (the internal attributes array)
|
|
582
|
+
def add_attribute(tag, value)
  # The tag is copied; the pair is always appended, even when the tag
  # already exists. The result is the internal attributes array.
  pair = [ String.new(tag), value ]
  @attributes << pair
end
|
|
585
|
+
|
|
586
|
+
# Removes a specific tag-value pair.
|
|
587
|
+
#
|
|
588
|
+
# Note that if two or more tag-value pairs found,
|
|
589
|
+
# only the first tag-value pair is removed.
|
|
590
|
+
#
|
|
591
|
+
# ---
|
|
592
|
+
# *Arguments*:
|
|
593
|
+
# * (required) _tag_: String
|
|
594
|
+
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
|
|
595
|
+
# *Returns*:: if removed, _value_. Otherwise, nil.
|
|
596
|
+
def delete_attribute(tag, value)
  # Position of the first pair equal to [ tag, value ], if any.
  position = @attributes.index([ tag, value ])
  return nil unless position
  @attributes.delete_at(position)[1]
end
|
|
604
|
+
|
|
605
|
+
# Removes all attributes with the specified tag.
|
|
606
|
+
#
|
|
607
|
+
# ---
|
|
608
|
+
# *Arguments*:
|
|
609
|
+
# * (required) _tag_: String
|
|
610
|
+
# *Returns*:: if removed, self. Otherwise, nil.
|
|
611
|
+
def delete_attributes(tag)
  # Array#reject! gives nil when nothing was removed.
  changed = @attributes.reject! { |pair| pair[0] == tag }
  changed ? self : nil
end
|
|
616
|
+
|
|
617
|
+
# Sorts attributes order by given tag name's order.
|
|
618
|
+
# If a block is given, the argument _tags_ is ignored, and
|
|
619
|
+
# yields two tag names like Array#sort!.
|
|
620
|
+
#
|
|
621
|
+
# ---
|
|
622
|
+
# *Arguments*:
|
|
623
|
+
# * (required or optional) _tags_: Array containing String objects
|
|
624
|
+
# *Returns*:: _self_
|
|
625
|
+
def sort_attributes_by_tag!(tags = nil)
  # Remember where each pair currently is, so that pairs the primary
  # comparison considers equal keep their present relative order.
  position = {}
  @attributes.each_with_index { |pair, idx| position[pair] = idx }
  fallback = @attributes.size

  unless block_given? or tags then
    raise ArgumentError, 'wrong number of arguments (0 for 1) or wrong argument value'
  end

  @attributes.sort! do |a, b|
    order = if block_given? then
              # A block takes precedence; _tags_ is ignored.
              yield a[0], b[0]
            else
              # Tags missing from _tags_ sort after all listed ones.
              (tags.index(a[0]) || tags.size) <=>
                (tags.index(b[0]) || tags.size)
            end
    if order == 0 then
      order = (position[a] || fallback) <=> (position[b] || fallback)
    end
    order
  end
  self
end
|
|
652
|
+
|
|
653
|
+
# Returns hash representation of attributes.
|
|
654
|
+
#
|
|
655
|
+
# Note: If two or more tag-value pairs with same tag names exist,
|
|
656
|
+
# only the first tag-value pair is used for each tag.
|
|
657
|
+
#
|
|
658
|
+
# ---
|
|
659
|
+
# *Returns*:: Hash object
|
|
660
|
+
def attributes_to_hash
  h = {}
  @attributes.each do |key, val|
    # Only the first pair of each tag is used. Presence of the key is
    # checked (not truthiness of the stored value), so a first value of
    # nil or false is not overwritten by a later pair with the same tag.
    h[key] = val unless h.key?(key)
  end
  h
end
|
|
668
|
+
|
|
669
|
+
private
|
|
670
|
+
|
|
671
|
+
# (private) Parses attributes.
|
|
672
|
+
# Returns arrays
|
|
673
|
+
def parse_attributes(str)
  # No attribute column, or the "." placeholder: no attributes.
  return [] unless str
  return [] if str == '.'
  parse_attributes_string(str).collect! do |tokens|
    # First token is the tag; exactly one remaining token is kept as a
    # plain value, anything else is wrapped in a Value object.
    tag = tokens.shift
    value = (tokens.size == 1) ? tokens[0] : Value.new(tokens)
    [ tag, value ]
  end
end
|
|
683
|
+
|
|
684
|
+
# (private) Parses attributes string.
|
|
685
|
+
# Returns arrays
|
|
686
|
+
def parse_attributes_string(str)
  scanner = StringScanner.new(str)
  pairs = []    # finished groups, one array of tokens per ";" section
  tokens = []   # tokens of the group currently being read
  word = ''     # unquoted token currently being accumulated

  # Moves the pending unquoted token (if any) into the current group.
  flush_word = lambda do
    tokens.push word unless word.empty?
    word = ''
  end

  until scanner.eos?
    if scanner.scan(/[^\\\;\"\s]+/) then
      word.concat scanner.matched
    elsif scanner.scan(/\s+/) then
      # whitespace ends the current token
      flush_word.call
    elsif scanner.scan(/\;/) then
      # semicolon ends the current group (even when it is empty)
      flush_word.call
      pairs.push tokens
      tokens = []
    elsif scanner.scan(/\"/) then
      # quoted free text: read up to the closing quote or end of input
      flush_word.call
      text = ''
      until scanner.eos?
        if scanner.scan(/[^\\\"]+/) then
          text.concat scanner.matched
        elsif scanner.scan(/\"/) then
          break
        elsif scanner.scan(/\\([\"\\])/) then
          # escaped quote or backslash
          text.concat scanner[1]
        elsif scanner.scan(/\\x([0-9a-fA-F][0-9a-fA-F])/n) then
          # two-digit hexadecimal character code
          text.concat scanner[1].to_i(16).chr
        elsif scanner.scan(/\\([0-7][0-7][0-7])/n) then
          # three-digit octal character code
          text.concat scanner[1].to_i(8).chr
        elsif scanner.scan(/\\([^x0-9])/n) then
          # looked up in Escape::BACKSLASH; unknown ones are kept verbatim
          text.concat(Escape::BACKSLASH[scanner[1]] || scanner.matched)
        elsif scanner.scan(/\\/) then
          text.concat scanner.matched
        else
          raise 'Bug: should not reach here'
        end
      end
      tokens.push text
    elsif scanner.scan(/\\/) then
      # outside of quotes a backslash is an ordinary character
      word.concat scanner.matched
    else
      raise 'Bug: should not reach here'
    end
  end

  flush_word.call
  pairs.push tokens unless tokens.empty?
  return pairs
end
|
|
743
|
+
|
|
744
|
+
# (private) string representation of attributes
|
|
745
|
+
def attributes_to_s(attr)
  rendered = attr.collect do |tag, val|
    # Only warns about tags that are not legal GFF2 identifiers; the
    # tag is still written out.
    unless Escape::IDENTIFIER_GFF2 =~ tag then
      warn "Illegal GFF2 attribute tag: #{tag.inspect}" if $VERBOSE
    end
    tagstr = gff2_column_to_s(tag)
    # Value objects render themselves; plain values are escaped here.
    valstr = val.kind_of?(Value) ? val.to_s : escape_gff2_attribute_value(val)
    "#{tagstr} #{valstr}"
  end
  rendered.join(' ; ')
end
|
|
760
|
+
end #class Record
|
|
761
|
+
|
|
762
|
+
# Stores GFF2 meta-data.
|
|
763
|
+
class MetaData
  # Directive. Usually, one of "feature-ontology", "attribute-ontology",
  # or "source-ontology".
  attr_accessor :directive

  # data of this entry
  attr_accessor :data

  # Creates a new MetaData object from a directive name and
  # optional data.
  def initialize(directive, data = nil)
    @directive, @data = directive, data
  end

  # Parses a "##directive data" line and returns a new MetaData object.
  # The leading "##" is removed from the directive; everything after
  # the first run of whitespace becomes the data.
  def self.parse(line)
    name, rest = line.chomp.split(/\s+/, 2)
    name = name.sub(/\A\#\#/, '') if name
    self.new(name, rest)
  end

  # String representation of this meta-data ("##directive data\n").
  # Line breaks inside directive and data are replaced by spaces, and
  # the data part is left out when it is empty.
  def to_s
    name = @directive.to_s.gsub(/[\r\n]+/, ' ')
    payload = @data.to_s
    tail = payload.empty? ? '' : ' ' + payload.gsub(/[\r\n]+/, ' ')
    "\#\##{name}#{tail}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    return false unless self.class == other.class
    return false unless self.directive == other.directive
    self.data == other.data ? true : false
  end
end #class MetaData
|
|
802
|
+
|
|
803
|
+
# (private) parses metadata
|
|
804
|
+
def parse_metadata(directive, line)
  if 'gff-version' == directive then
    # Only the first version line is honored; later ones are ignored.
    @gff_version ||= line.split(/\s+/)[1]
  else
    # Every other directive is stored as a MetaData object.
    @metadata.push MetaData.parse(line)
  end
  true
end
private :parse_metadata
|
|
814
|
+
|
|
815
|
+
# Creates a Bio::GFF::GFF2 object by building a collection of
|
|
816
|
+
# Bio::GFF::GFF2::Record (and metadata) objects.
|
|
817
|
+
#
|
|
818
|
+
# ---
|
|
819
|
+
# *Arguments*:
|
|
820
|
+
# * _str_: string in GFF format
|
|
821
|
+
# *Returns*:: Bio::GFF::GFF2 object
|
|
822
|
+
def initialize(str = nil)
  # Start empty; the version stays nil until a "##gff-version" line is
  # parsed.
  @gff_version = nil
  @records, @metadata = [], []
  parse(str) if str
end
|
|
828
|
+
|
|
829
|
+
# GFF2 version string (String or nil). nil means "2".
|
|
830
|
+
attr_reader :gff_version
|
|
831
|
+
|
|
832
|
+
# Metadata (except "##gff-version").
|
|
833
|
+
# Must be an array of Bio::GFF::GFF2::MetaData objects.
|
|
834
|
+
attr_accessor :metadata
|
|
835
|
+
|
|
836
|
+
# Parses GFF2 entries and appends the parsed data to this object.
|
|
837
|
+
#
|
|
838
|
+
# ---
|
|
839
|
+
# *Arguments*:
|
|
840
|
+
# * _str_: string in GFF format
|
|
841
|
+
# *Returns*:: self
|
|
842
|
+
def parse(str)
  str.each_line do |line|
    directive = /^\#\#([^\s]+)/.match(line)
    if directive then
      # "##name ..." lines are metadata
      parse_metadata(directive[1], line)
    else
      # everything else (including "#" comments) becomes a record
      @records << GFF2::Record.new(line)
    end
  end
  self
end
|
|
853
|
+
|
|
854
|
+
end #class GFF2
|
|
855
|
+
|
|
856
|
+
# = DESCRIPTION
|
|
857
|
+
# Represents version 3 of GFF specification.
|
|
858
|
+
# For more information on version GFF3, see
|
|
859
|
+
# http://song.sourceforge.net/gff3.shtml
|
|
860
|
+
#--
|
|
861
|
+
# obsolete URL:
|
|
862
|
+
# http://flybase.bio.indiana.edu/annot/gff3.html
|
|
863
|
+
#++
|
|
864
|
+
class GFF3 < GFF
|
|
865
|
+
VERSION = 3
|
|
866
|
+
|
|
867
|
+
# Creates a Bio::GFF::GFF3 object by building a collection of
|
|
868
|
+
# Bio::GFF::GFF3::Record (and metadata) objects.
|
|
869
|
+
#
|
|
870
|
+
# ---
|
|
871
|
+
# *Arguments*:
|
|
872
|
+
# * _str_: string in GFF format
|
|
873
|
+
# *Returns*:: Bio::GFF::GFF3 object
|
|
874
|
+
def initialize(str = nil)
  # Start empty; the version stays nil until a "##gff-version" line is
  # parsed.
  @gff_version = nil
  # Becomes true once the FASTA section has been reached.
  @in_fasta = false
  @records, @sequence_regions, @metadata, @sequences = [], [], [], []
  parse(str) if str
end
|
|
883
|
+
|
|
884
|
+
# GFF3 version string (String or nil). nil means "3".
|
|
885
|
+
attr_reader :gff_version
|
|
886
|
+
|
|
887
|
+
# Metadata of "##sequence-region".
|
|
888
|
+
# Must be an array of Bio::GFF::GFF3::SequenceRegion objects.
|
|
889
|
+
attr_accessor :sequence_regions
|
|
890
|
+
|
|
891
|
+
# Metadata (except "##sequence-region", "##gff-version", "###").
|
|
892
|
+
# Must be an array of Bio::GFF::GFF3::MetaData objects.
|
|
893
|
+
attr_accessor :metadata
|
|
894
|
+
|
|
895
|
+
# Sequences bundled within GFF3.
|
|
896
|
+
# Must be an array of Bio::Sequence objects.
|
|
897
|
+
attr_accessor :sequences
|
|
898
|
+
|
|
899
|
+
# Parses GFF3 entries and appends the parsed data to this object.
|
|
900
|
+
#
|
|
901
|
+
# Note that after "##FASTA" line is given,
|
|
902
|
+
# only fasta-formatted text is accepted.
|
|
903
|
+
#
|
|
904
|
+
# ---
|
|
905
|
+
# *Arguments*:
|
|
906
|
+
# * _str_: string in GFF format
|
|
907
|
+
# *Returns*:: self
|
|
908
|
+
def parse(str)
  # if already after the ##FASTA line, parses fasta format and return
  if @in_fasta then
    parse_fasta(str)
    return self
  end

  if str.respond_to?(:gets) then
    # str is a IO-like object
    fst = nil
  else
    # str is a String: cut it at the first ">" or "##FASTA" line.
    gff, sep, fst = str.split(/^(\>|##FASTA.*)/n, 2)
    # A ">" separator belongs to the first fasta header; put it back.
    fst = sep + fst if sep == '>' and fst
    # String#split returns [] for an empty string, which leaves gff nil;
    # treat that as "no GFF lines" instead of failing on nil below.
    str = gff || ''
  end

  # parses GFF lines
  str.each_line do |line|
    if /^\#\#([^\s]+)/ =~ line then
      parse_metadata($1, line)
      # IO-like input: after "##FASTA" the rest of the stream is fasta
      parse_fasta(str) if @in_fasta
    elsif /^\>/ =~ line then
      # a fasta header without a preceding "##FASTA" line
      @in_fasta = true
      parse_fasta(str, line)
    else
      @records << GFF3::Record.new(line)
    end
  end

  # parses fasta format when str is a String and fasta data exists
  if fst then
    @in_fasta = true
    parse_fasta(fst)
  end

  self
end
|
|
946
|
+
|
|
947
|
+
# parses fasta formatted data
|
|
948
|
+
def parse_fasta(str, line = nil)
  # Entries are split on "\n>" boundaries.
  str.each_line("\n>") do |chunk|
    # A header line that was already consumed by the caller is put back
    # in front of the first chunk only.
    if line then
      chunk = line + chunk
      line = nil
    end
    stripped = chunk.strip
    next if stripped.empty? or stripped == '>'
    fasta = Bio::FastaFormat.new(chunk)
    sequence = fasta.to_seq
    # The entry ID is the first whitespace-delimited word of the
    # definition line, unescaped.
    first_word = fasta.definition.strip.split(/\s/, 2)[0].to_s
    sequence.entry_id = unescape(first_word)
    @sequences.push sequence
  end
end
private :parse_fasta
|
|
961
|
+
|
|
962
|
+
# string representation of whole entry.
|
|
963
|
+
def to_s
  ver = @gff_version || VERSION.to_s

  # Bundled sequences go last, after a "##FASTA" line.
  seqs = ''
  if @sequences.size > 0 then
    seqs = "##FASTA\n" +
      @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('')
  end

  # Version line, metadata, sequence regions, then the records.
  lines = [ "##gff-version #{escape(ver)}\n" ]
  lines += @metadata.collect { |m| m.to_s }
  lines += @sequence_regions.collect { |m| m.to_s }
  lines += @records.collect { |r| r.to_s }
  lines.join('') + seqs
end
|
|
977
|
+
|
|
978
|
+
# Private methods for escaping characters.
|
|
979
|
+
# Internal only. Users should not use this module directly.
|
|
980
|
+
module Escape
  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  private

  # If str is empty, returns '.'. Otherwise, returns str.
  # The argument is converted with to_s first, so nil also gives '.'.
  def column_to_s(str)
    str = str.to_s
    str.empty? ? '.' : str
  end

  # Return the string corresponding to these characters unescaped.
  #
  # Every "%XX" sequence (XX being two hexadecimal digits) is replaced
  # by the byte with that value. Implemented here instead of calling
  # URI.unescape, which is obsolete and no longer exists in Ruby 3.0
  # and later.
  def unescape(string)
    # Decoded bytes are tagged with the encoding of the input (UTF-8 for
    # US-ASCII input). Rubies without String#encoding skip this step.
    enc = string.respond_to?(:encoding) ? string.encoding : nil
    enc = Encoding::UTF_8 if enc and enc == Encoding::US_ASCII
    string.gsub(/%[0-9a-fA-F]{2}/) do |esc|
      byte = [ esc[1, 2] ].pack('H2')
      enc ? byte.force_encoding(enc) : byte
    end
  end

  # Escape a column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape(string)
    percent_escape(string, UNSAFE)
  end

  # Escape seqid column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape_seqid(string)
    percent_escape(string, UNSAFE_SEQID)
  end

  # Escape attribute according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  # In addition to the normal escape rule, the following characters
  # are escaped: ",=;".
  # Returns the string corresponding to these characters escaped.
  def escape_attribute(string)
    percent_escape(string, UNSAFE_ATTRIBUTE)
  end

  # Replaces every character matched by the regular expression _unsafe_
  # with "%XX" for each of its bytes (XX in upper-case hexadecimal).
  # Replacement for URI.escape, which is obsolete and no longer exists
  # in Ruby 3.0 and later.
  def percent_escape(string, unsafe)
    string.gsub(unsafe) do |chars|
      chars.unpack('C*').collect { |byte| format('%%%02X', byte) }.join
    end
  end
end #module Escape
|
|
1025
|
+
|
|
1026
|
+
include Escape
|
|
1027
|
+
|
|
1028
|
+
# Stores meta-data "##sequence-region seqid start end".
|
|
1029
|
+
class SequenceRegion
  include Escape

  # Creates a new SequenceRegion object.
  # _start_ and _endpos_ are converted to Integer unless they are nil
  # (or false).
  def initialize(seqid, start, endpos)
    @seqid = seqid
    @start = start ? start.to_i : nil
    @end   = endpos ? endpos.to_i : nil
  end

  # Parses a "##sequence-region seqid start end" line and returns a new
  # SequenceRegion object. The first word (the directive itself) is
  # discarded; the other fields are percent-decoded.
  def self.parse(str)
    _directive, seqid, start, endpos =
      str.chomp.split(/\s+/, 4).collect { |x| percent_decode(x) }
    self.new(seqid, start, endpos)
  end

  # Replaces every "%XX" sequence (two hexadecimal digits) by the byte
  # with that value. Used instead of URI.unescape, which is obsolete
  # and no longer exists in Ruby 3.0 and later.
  def self.percent_decode(str)
    # Decoded bytes are tagged with the encoding of the input (UTF-8 for
    # US-ASCII input). Rubies without String#encoding skip this step.
    enc = str.respond_to?(:encoding) ? str.encoding : nil
    enc = Encoding::UTF_8 if enc and enc == Encoding::US_ASCII
    str.gsub(/%[0-9a-fA-F]{2}/) do |esc|
      byte = [ esc[1, 2] ].pack('H2')
      enc ? byte.force_encoding(enc) : byte
    end
  end
  private_class_method :percent_decode

  # sequence ID
  attr_accessor :seqid

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # string representation ("##sequence-region seqid start end\n");
  # missing fields are written as "."
  def to_s
    i = escape_seqid(column_to_s(@seqid))
    s = escape_seqid(column_to_s(@start))
    e = escape_seqid(column_to_s(@end))
    "##sequence-region #{i} #{s} #{e}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.seqid == self.seqid and
        other.start == self.start and
        other.end == self.end then
      true
    else
      false
    end
  end
end #class SequenceRegion
|
|
1075
|
+
|
|
1076
|
+
# Represents a single line of a GFF3-formatted file.
|
|
1077
|
+
# See Bio::GFF::GFF3 for more information.
|
|
1078
|
+
class Record < GFF2::Record
|
|
1079
|
+
|
|
1080
|
+
include GFF3::Escape
|
|
1081
|
+
|
|
1082
|
+
# shortcut to the ID attribute
|
|
1083
|
+
def id
  # value of the first "ID" attribute, or nil when there is none
  return get_attribute('ID')
end
|
|
1086
|
+
|
|
1087
|
+
# set ID attribute
|
|
1088
|
+
def id=(str)
  # replaces the first "ID" attribute, or adds one when there is none
  set_attribute('ID', str)
end
|
|
1091
|
+
|
|
1092
|
+
# aliases for Column 1 (formerly "seqname")
|
|
1093
|
+
alias seqid seqname
|
|
1094
|
+
alias seqid= seqname=
|
|
1095
|
+
|
|
1096
|
+
# aliases for Column 3 (formerly "feature").
|
|
1097
|
+
# In the GFF3 document http://song.sourceforge.net/gff3.shtml,
|
|
1098
|
+
# column3 is called "type", but we used "feature_type"
|
|
1099
|
+
# because "type" is already used by Ruby itself.
|
|
1100
|
+
alias feature_type feature
|
|
1101
|
+
alias feature_type= feature=
|
|
1102
|
+
|
|
1103
|
+
# aliases for Column 8
|
|
1104
|
+
alias phase frame
|
|
1105
|
+
alias phase= frame=
|
|
1106
|
+
|
|
1107
|
+
# Parses a GFF3-formatted line and returns a new
|
|
1108
|
+
# Bio::GFF::GFF3::Record object.
|
|
1109
|
+
def self.parse(str)
  # The return value of the instance-level #parse is not the record
  # itself, so the new record is returned explicitly to honor the
  # documented contract.
  record = self.new
  record.parse(str)
  record
end
|
|
1112
|
+
|
|
1113
|
+
# Creates a Bio::GFF::GFF3::Record object.
|
|
1114
|
+
# Is typically not called directly, but
|
|
1115
|
+
# is called automatically when creating a Bio::GFF::GFF3 object.
|
|
1116
|
+
#
|
|
1117
|
+
# ---
|
|
1118
|
+
# *Arguments*:
|
|
1119
|
+
# * _str_: a tab-delimited line in GFF3 format
|
|
1120
|
+
# *Arguments* (alternative form, one argument per column):
|
|
1121
|
+
# * _seqid_: sequence ID (String or nil)
|
|
1122
|
+
# * _source_: source (String or nil)
|
|
1123
|
+
# * _feature_type_: type of feature (String)
|
|
1124
|
+
# * _start_position_: start (Integer)
|
|
1125
|
+
# * _end_position_: end (Integer)
|
|
1126
|
+
# * _score_: score (Float or nil)
|
|
1127
|
+
# * _strand_: strand (String or nil)
|
|
1128
|
+
# * _phase_: phase (Integer or nil)
|
|
1129
|
+
# * _attributes_: attributes (Array or nil)
|
|
1130
|
+
def initialize(*arg)
  # All arguments are handed unchanged to the parent class.
  super
end
|
|
1133
|
+
|
|
1134
|
+
# Parses a GFF3-formatted line and stores data from the string.
|
|
1135
|
+
# Note that all existing data is wiped out.
|
|
1136
|
+
def parse(string)
  # Parsing is done entirely by the parent class.
  super(string)
end
|
|
1139
|
+
|
|
1140
|
+
# Return the record as a GFF3 compatible string
|
|
1141
|
+
def to_s
  # Comment text with line breaks flattened, or false when there is none.
  cmnt = false
  if @comment and !@comment.to_s.strip.empty? then
    cmnt = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{cmnt}\n" if self.comment_only? and cmnt

  # Column 1 uses the seqid escaping rule, columns 2-8 the normal one.
  fields = [ escape_seqid(column_to_s(@seqname)) ]
  [ @source, @feature, @start, @end,
    @score, @strand, @frame ].each do |column|
    fields.push escape(column_to_s(column))
  end
  # Column 9: attributes
  fields.push attributes_to_s(@attributes)

  fields.join("\t") + (cmnt ? "\t\##{cmnt}\n" : "\n")
end
|
|
1161
|
+
|
|
1162
|
+
# Bio::GFF::GFF3::Record::Target is a class to store
|
|
1163
|
+
# data of "Target" attribute.
|
|
1164
|
+
class Target
  include GFF3::Escape

  # Creates a new Target object.
  # _start_ and _endpos_ are converted to Integer unless they are nil
  # (or false).
  def initialize(target_id, start, endpos, strand = nil)
    @target_id = target_id
    @start = start ? start.to_i : nil
    @end   = endpos ? endpos.to_i : nil
    @strand = strand
  end

  # target ID
  attr_accessor :target_id

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # strand (optional). Normally, "+" or "-", or nil.
  attr_accessor :strand

  # parses "target_id start end [strand]"-style string
  # (for example, "ABC789 123 456 +")
  # and creates a new Target object.
  # Each space-separated field is percent-decoded.
  #
  def self.parse(str)
    target_id, start, endpos, strand =
      str.split(/ +/, 4).collect { |x| percent_decode(x) }
    self.new(target_id, start, endpos, strand)
  end

  # Replaces every "%XX" sequence (two hexadecimal digits) by the byte
  # with that value. Used instead of URI.unescape, which is obsolete
  # and no longer exists in Ruby 3.0 and later.
  def self.percent_decode(str)
    # Decoded bytes are tagged with the encoding of the input (UTF-8 for
    # US-ASCII input). Rubies without String#encoding skip this step.
    enc = str.respond_to?(:encoding) ? str.encoding : nil
    enc = Encoding::UTF_8 if enc and enc == Encoding::US_ASCII
    str.gsub(/%[0-9a-fA-F]{2}/) do |esc|
      byte = [ esc[1, 2] ].pack('H2')
      enc ? byte.force_encoding(enc) : byte
    end
  end
  private_class_method :percent_decode

  # returns a string ("target_id start end [strand]");
  # a missing ID or position is written as "."
  def to_s
    i = escape_seqid(column_to_s(@target_id))
    s = escape_attribute(column_to_s(@start))
    e = escape_attribute(column_to_s(@end))
    strnd = escape_attribute(@strand.to_s)
    strnd = " " + strnd unless strnd.empty?
    "#{i} #{s} #{e}#{strnd}"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.target_id == self.target_id and
        other.start == self.start and
        other.end == self.end and
        other.strand == self.strand then
      true
    else
      false
    end
  end
end #class Target
|
|
1220
|
+
|
|
1221
|
+
# Bio::GFF::GFF3::Record::Gap is a class to store
|
|
1222
|
+
# data of "Gap" attribute.
|
|
1223
|
+
class Gap
|
|
1224
|
+
|
|
1225
|
+
# Code is a class to store length of single-letter code.
|
|
1226
|
+
Code = Struct.new(:code, :length)

# Code is a class to store length of single-letter code.
class Code
  # 1-letter code (Symbol). One of :M, :I, :D, :F, or :R is expected.
  attr_reader :code if false #dummy for RDoc

  # length (Integer)
  attr_reader :length if false #dummy for RDoc

  # The code immediately followed by its length, for example "M3".
  def to_s
    code.to_s + length.to_s
  end
end #class code
|
|
1240
|
+
|
|
1241
|
+
# Creates a new Gap object.
|
|
1242
|
+
#
|
|
1243
|
+
# ---
|
|
1244
|
+
# *Arguments*:
|
|
1245
|
+
# * _str_: a formatted string, or nil.
|
|
1246
|
+
def initialize(str = nil)
  if str then
    # Tokens are separated by spaces; each one must be a single
    # upper-case letter followed by a decimal length.
    @data = str.split(/ +/).collect do |x|
      if /\A([A-Z])([0-9]+)\z/ =~ x.strip then
        Code.new($1.intern, $2.to_i)
      else
        # Report the offending token itself; the call to inspect has to
        # be inside the interpolation to take effect.
        warn "ignored unknown token: #{x.inspect}" if $VERBOSE
        nil
      end
    end
    # Unknown tokens are dropped.
    @data.compact!
  else
    @data = []
  end
end
|
|
1261
|
+
|
|
1262
|
+
# Same as new(str).
|
|
1263
|
+
def self.parse(str)
  # Plain synonym of new(str).
  new(str)
end
|
|
1266
|
+
|
|
1267
|
+
# (private method)
|
|
1268
|
+
# Scans gaps and returns an array of Code objects
|
|
1269
|
+
def __scan_gap(str, gap_regexp = /[^a-zA-Z]/,
               code_i = :I, code_m = :M)
  scanner = StringScanner.new(str)
  codes = []
  while (consumed = scanner.skip_until(gap_regexp))
    # _consumed_ covers the non-gap run plus the first gap match.
    first_gap = scanner.matched_size
    run = consumed - first_gap
    codes.push Code.new(code_m, run) if run > 0
    # Extend the gap over all directly following gap matches.
    gap = Code.new(code_i, first_gap)
    while (more = scanner.skip(gap_regexp))
      gap.length += more
    end
    codes.push gap
  end
  # Whatever is left after the last gap is one final non-gap run.
  tail = scanner.rest_size
  codes.push Code.new(code_m, tail) if tail > 0
  codes
end
private :__scan_gap
|
|
1289
|
+
|
|
1290
|
+
# (private method)
|
|
1291
|
+
# Parses given reference-target sequence alignment and
|
|
1292
|
+
# initializes self. Existing data will be erased.
|
|
1293
|
+
def __initialize_from_sequences_na(reference, target,
                                   gap_regexp = /[^a-zA-Z]/)
  # Rebuilds @data from a pair of gapped sequences and returns true.
  # Each sequence is first reduced to runs by __scan_gap (:I/:M for the
  # reference, :D/:M for the target); the two run lists are then walked
  # in lockstep and merged into one list of Code objects.
  ref_runs = __scan_gap(reference, gap_regexp, :I, :M)
  tgt_runs = __scan_gap(target, gap_regexp, :D, :M)
  merged = []

  until ref_runs.empty? || tgt_runs.empty?
    r = ref_runs.shift
    t = tgt_runs.shift

    # Cut the longer run down to the shorter one and put the surplus
    # back at the head of its list so both runs cover the same span.
    surplus = r.length - t.length
    if surplus > 0
      ref_runs.unshift Code.new(r.code, surplus)
      r.length = t.length
    elsif surplus < 0
      tgt_runs.unshift Code.new(t.code, -surplus)
      t.length = r.length
    end

    if r.code == :M || r.code == :I
      if t.code == :M
        # target has residues here: keep the reference run (:M or :I)
        merged.push r
      elsif t.code == :D
        # target is a gap: a deletion when the reference has residues;
        # when both sides are gaps the site is dropped
        merged.push t if r.code == :M
      else
        raise 'Bug: should not reach here.'
      end
    end
  end

  # Reference residues left over once the target ran out become :D.
  ref_left = ref_runs.inject(0) do |sum, run|
    run.code == :M ? sum + run.length : sum
  end
  merged.push Code.new(:D, ref_left) if ref_left > 0

  # Target residues left over once the reference ran out become :I.
  tgt_left = tgt_runs.inject(0) do |sum, run|
    run.code == :M ? sum + run.length : sum
  end
  merged.push Code.new(:I, tgt_left) if tgt_left > 0

  @data = merged
  true
end
|
|
1350
|
+
private :__initialize_from_sequences_na
|
|
1351
|
+
|
|
1352
|
+
# Creates a new Gap object from given sequence alignment.
|
|
1353
|
+
#
|
|
1354
|
+
# Note that sites of which both reference and target are gaps
|
|
1355
|
+
# are silently removed.
|
|
1356
|
+
#
|
|
1357
|
+
# ---
|
|
1358
|
+
# *Arguments*:
|
|
1359
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
|
1360
|
+
# * _target_: target sequence (nucleotide sequence)
|
|
1361
|
+
# * <I>gap_regexp</I>: regexp to identify gap
|
|
1362
|
+
def self.new_from_sequences_na(reference, target,
                               gap_regexp = /[^a-zA-Z]/)
  # Creates an empty instance, fills it through the private
  # __initialize_from_sequences_na, and returns the instance.
  built = new
  # __send__ is used because the initializer is a private method.
  built.__send__(:__initialize_from_sequences_na,
                 reference, target, gap_regexp)
  built
end
|
|
1371
|
+
|
|
1372
|
+
# (private method)
|
|
1373
|
+
# scans a codon or gap in reference sequence
|
|
1374
|
+
def __scan_codon(sc_ref,
                 gap_regexp, space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  # Consumes characters from the scanner sc_ref until 3 + (net
  # frameshift) counted characters have been read or input runs out.
  # Returns [gap_count, fs_count].
  taken = 0       # characters counted towards the codon
  gap_count = 0
  fs_count = 0    # +1 per forward, -1 per reverse frameshift mark

  while taken < 3 + fs_count
    ch = sc_ref.scan(/./mn)
    break unless ch
    if space_regexp === ch
      # spacing characters are skipped without being counted
    elsif forward_frameshift_regexp === ch
      fs_count += 1
    elsif reverse_frameshift_regexp === ch
      fs_count -= 1
    else
      gap_count += 1 if gap_regexp === ch
      taken += 1
    end
  end

  # Input ended early: the missing positions are counted as gaps.
  shortfall = (3 + fs_count) - taken
  gap_count += shortfall if shortfall > 0
  [gap_count, fs_count]
end
|
|
1404
|
+
private :__scan_codon
|
|
1405
|
+
|
|
1406
|
+
# (private method)
|
|
1407
|
+
# internal use only
|
|
1408
|
+
def __push_code_to_data(cur, data, code, len)
  # Extends cur by len when it already carries the same code;
  # otherwise appends a new Code(code, len) to data. Returns the Code
  # object that received the length.
  unless cur && cur.code == code
    fresh = Code.new(code, len)
    data.push fresh
    return fresh
  end
  cur.length += len
  cur
end
|
|
1417
|
+
private :__push_code_to_data
|
|
1418
|
+
|
|
1419
|
+
# (private method)
|
|
1420
|
+
# Parses given reference(nuc)-target(amino) sequence alignment and
|
|
1421
|
+
# initializes self. Existing data will be erased.
|
|
1422
|
+
def __initialize_from_sequences_na_aa(reference, target,
                                      gap_regexp = /[^a-zA-Z]/,
                                      space_regexp = /\s/,
                                      forward_frameshift_regexp =
                                      /\>/,
                                      reverse_frameshift_regexp =
                                      /\</)
  # Rebuilds @data from a reference/target alignment in which each
  # target character is matched against one codon-sized chunk of the
  # reference (as read by __scan_codon). Returns self.
  codes = []
  cur = nil   # last Code pushed, so adjacent equal codes are merged
  ref_sc = StringScanner.new(reference)
  tgt_sc = StringScanner.new(target)
  one_char = /./mn

  # Reads the next codon-sized chunk from the reference.
  next_codon = lambda do
    __scan_codon(ref_sc, gap_regexp, space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  end

  # Records a net frameshift: negative as :R, positive as :F,
  # zero as nothing.
  emit_shift = lambda do |delta|
    if delta < 0
      cur = __push_code_to_data(cur, codes, :R, -delta)
    elsif delta > 0
      cur = __push_code_to_data(cur, codes, :F, delta)
    end
  end

  until tgt_sc.eos?
    if tgt_sc.skip(space_regexp)
      # spacing in the target is ignored
    elsif (n = tgt_sc.skip(forward_frameshift_regexp))
      cur = __push_code_to_data(cur, codes, :F, n)
      # move the reference forward by the same number of characters
      n.times { ref_sc.scan(one_char) }
    elsif (n = tgt_sc.skip(reverse_frameshift_regexp))
      cur = __push_code_to_data(cur, codes, :R, n)
      # step the reference back, clamping at the start of the string
      rewound = ref_sc.pos - n
      if rewound < 0
        warn "Incorrect reverse frameshift" if $VERBOSE
        rewound = 0
      end
      ref_sc.pos = rewound
    elsif (n = tgt_sc.skip(gap_regexp))
      n.times do
        gaps, fs = next_codon.call
        case gaps
        when 3
          # reference and target are both gaps: the site is dropped
        when 1, 2
          # partially gapped codon: counted as a forward frameshift
          fs += 3 - gaps
        when 0
          cur = __push_code_to_data(cur, codes, :D, 1)
        else
          raise 'Bug: should not reach here'
        end
        emit_shift.call(fs)
      end
    elsif tgt_sc.skip(one_char)
      # exactly one target character was consumed
      gaps, fs = next_codon.call
      case gaps
      when 3
        cur = __push_code_to_data(cur, codes, :I, 1)
      when 0, 1, 2
        # gaps inside the codon become a reverse frameshift
        fs -= gaps
        cur = __push_code_to_data(cur, codes, :M, 1)
      else
        raise 'Bug: should not reach here'
      end
      emit_shift.call(fs)
    else
      raise 'Bug: should not reach here'
    end
  end

  # Unconsumed reference: after removing spacing, frameshift marks and
  # gaps, whole triplets become :D and a remainder becomes :F.
  if ref_sc.rest_size > 0
    leftover = ref_sc.scan(/.*/mn)
    [space_regexp, forward_frameshift_regexp,
     reverse_frameshift_regexp, gap_regexp].each do |re|
      leftover.gsub!(re, '')
    end
    triplets, extra = leftover.length.divmod(3)
    cur = __push_code_to_data(cur, codes, :D, triplets) if triplets > 0
    cur = __push_code_to_data(cur, codes, :F, extra) if extra > 0
  end

  @data = codes
  self
end
|
|
1520
|
+
private :__initialize_from_sequences_na_aa
|
|
1521
|
+
|
|
1522
|
+
# Creates a new Gap object from given sequence alignment.
|
|
1523
|
+
#
|
|
1524
|
+
# Note that sites of which both reference and target are gaps
|
|
1525
|
+
# are silently removed.
|
|
1526
|
+
#
|
|
1527
|
+
# For incorrect alignments that break 3:1 rule,
|
|
1528
|
+
# gap positions will be moved inside codons,
|
|
1529
|
+
# unwanted gaps will be removed, and
|
|
1530
|
+
# some forward or reverse frameshift will be inserted.
|
|
1531
|
+
#
|
|
1532
|
+
# For example,
|
|
1533
|
+
# atgg-taagac-att
|
|
1534
|
+
# M V K - I
|
|
1535
|
+
# is treated as:
|
|
1536
|
+
# atggt<aagacatt
|
|
1537
|
+
# M V K >>I
|
|
1538
|
+
#
|
|
1539
|
+
# Incorrect combination of frameshift with frameshift or gap
|
|
1540
|
+
# may cause undefined behavior.
|
|
1541
|
+
#
|
|
1542
|
+
# Forward frameshifts are recommended to be indicated in the
|
|
1543
|
+
# target sequence.
|
|
1544
|
+
# Reverse frameshifts can be indicated in the reference sequence
|
|
1545
|
+
# or the target sequence.
|
|
1546
|
+
#
|
|
1547
|
+
# Priority of regular expressions:
|
|
1548
|
+
# space > forward/reverse frameshift > gap
|
|
1549
|
+
#
|
|
1550
|
+
# ---
|
|
1551
|
+
# *Arguments*:
|
|
1552
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
|
1553
|
+
# * _target_: target sequence (amino acid sequence)
|
|
1554
|
+
# * <I>gap_regexp</I>: regexp to identify gap
|
|
1555
|
+
# * <I>space_regexp</I>: regexp to identify space character which is completely ignored
|
|
1556
|
+
# * <I>forward_frameshift_regexp</I>: regexp to identify forward frameshift
|
|
1557
|
+
# * <I>reverse_frameshift_regexp</I>: regexp to identify reverse frameshift
|
|
1558
|
+
def self.new_from_sequences_na_aa(reference, target,
                                  gap_regexp = /[^a-zA-Z]/,
                                  space_regexp = /\s/,
                                  forward_frameshift_regexp = /\>/,
                                  reverse_frameshift_regexp = /\</)
  # Creates an empty instance, fills it through the private
  # __initialize_from_sequences_na_aa, and returns the instance.
  built = new
  # __send__ is used because the initializer is a private method.
  built.__send__(:__initialize_from_sequences_na_aa,
                 reference, target,
                 gap_regexp,
                 space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  built
end
|
|
1573
|
+
|
|
1574
|
+
# string representation
|
|
1575
|
+
def to_s
  # One token per stored element, separated by single spaces.
  tokens = []
  @data.each { |code| tokens << code.to_s }
  tokens.join(" ")
end
|
|
1578
|
+
|
|
1579
|
+
# Internal data. Users must not use it.
|
|
1580
|
+
attr_reader :data
|
|
1581
|
+
# @data can be read by other Gap instances
|
|
1582
|
+
protected :data
|
|
1583
|
+
|
|
1584
|
+
# If self == other, returns true.
|
|
1585
|
+
# otherwise, returns false.
|
|
1586
|
+
def ==(other)
  # Equal only when other has exactly the same class and its internal
  # data compares equal to ours; always returns true or false.
  return false unless other.class == self.class
  @data == other.data ? true : false
end
|
|
1594
|
+
|
|
1595
|
+
# duplicates sequences
|
|
1596
|
+
def dup_seqs(*arg)
  # Returns a duplicate of each argument. When calling seq on an
  # argument succeeds, the result of seq is duplicated instead; a
  # NoMethodError from that call means the argument itself is used.
  arg.map do |item|
    raw = begin
      item.seq
    rescue NoMethodError
      item
    end
    raw.dup
  end
end
|
|
1605
|
+
private :dup_seqs
|
|
1606
|
+
|
|
1607
|
+
# (private method)
|
|
1608
|
+
# Inserts gaps into the sequences according to the gap rule stored in this object.
|
|
1609
|
+
def __process_sequences(s_ref, s_tgt,
                        ref_gap, tgt_gap,
                        ref_increment, tgt_increment,
                        forward_frameshift,
                        reverse_frameshift)
  # Applies every Code in @data to the two strings, modifying them in
  # place, and returns [s_ref, s_tgt]. ref_pos / tgt_pos are the
  # current offsets into each string; the *_increment arguments give
  # how far one unit of :M, :I or :D advances each offset.
  # Raises RuntimeError when a sequence is too short for the stored
  # codes.
  ref_pos = 0
  tgt_pos = 0

  # Zero-width insertion of filler at offset at; an out-of-range
  # offset (IndexError) is re-raised with the given message.
  splice = lambda do |seq, at, filler, complaint|
    begin
      seq[at, 0] = filler
    rescue IndexError
      raise complaint
    end
  end

  @data.each do |op|
    case op.code
    when :M
      # match: nothing is inserted, both offsets advance
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :I
      # gap goes into the reference
      splice.call(s_ref, ref_pos, ref_gap * op.length,
                  'reference sequence too short')
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :D
      # gap goes into the target
      splice.call(s_tgt, tgt_pos, tgt_gap * op.length,
                  'target sequence too short')
      ref_pos += op.length * ref_increment
      tgt_pos += op.length * tgt_increment
    when :F
      # forward frameshift marks go into the target; offsets advance
      # by the raw length, not scaled by the increments
      splice.call(s_tgt, tgt_pos, forward_frameshift * op.length,
                  'target sequence too short')
      ref_pos += op.length
      tgt_pos += op.length
    when :R
      # reverse frameshift marks go into the reference, op.length
      # characters before the current offset; offsets stay unchanged
      back = ref_pos - op.length
      if back < 0
        raise 'too short reference sequence, or too many reverse frameshifts'
      end
      splice.call(s_ref, back, reverse_frameshift * op.length,
                  'reference sequence too short')
    else
      warn "ignored #{op.to_s.inspect}" if $VERBOSE
    end
  end

  # Final offsets must still lie within the (possibly grown) strings.
  raise 'reference sequence too short' if s_ref.length < ref_pos
  raise 'target sequence too short' if s_tgt.length < tgt_pos
  [s_ref, s_tgt]
end
|
|
1672
|
+
private :__process_sequences
|
|
1673
|
+
|
|
1674
|
+
# Processes nucleotide sequences and
|
|
1675
|
+
# returns gapped sequences as an array of sequences.
|
|
1676
|
+
#
|
|
1677
|
+
# Note for forward/reverse frameshift:
|
|
1678
|
+
# Forward/Reverse_frameshift is simply treated as
|
|
1679
|
+
# gap insertion to the target/reference sequence.
|
|
1680
|
+
#
|
|
1681
|
+
# ---
|
|
1682
|
+
# *Arguments*:
|
|
1683
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
|
1684
|
+
# * _target_: target sequence (nucleotide sequence)
|
|
1685
|
+
# * <I>gap_char</I>: gap character
|
|
1686
|
+
def process_sequences_na(reference, target, gap_char = '-')
  # Works on copies from dup_seqs, so the caller's objects are not
  # modified. gap_char is used for gaps and for both frameshift kinds,
  # and both offsets advance one character per unit.
  ref_copy, tgt_copy = dup_seqs(reference, target)
  gapped_ref, gapped_tgt = __process_sequences(ref_copy, tgt_copy,
                                               gap_char, gap_char,
                                               1, 1,
                                               gap_char, gap_char)

  # Differing lengths are only reported in verbose mode.
  if $VERBOSE && gapped_ref.length != gapped_tgt.length
    warn "returned sequences not equal length"
  end
  [gapped_ref, gapped_tgt]
end
|
|
1699
|
+
|
|
1700
|
+
# Processes sequences and
|
|
1701
|
+
# returns gapped sequences as an array of sequences.
|
|
1702
|
+
# reference must be a nucleotide sequence, and
|
|
1703
|
+
# target must be an amino acid sequence.
|
|
1704
|
+
#
|
|
1705
|
+
# Note for reverse frameshift:
|
|
1706
|
+
# Reverse_frameshift characters are inserted in the
|
|
1707
|
+
# reference sequence.
|
|
1708
|
+
# For example, alignment of "Gap=M3 R1 M2" is:
|
|
1709
|
+
# atgaagat<aatgtc
|
|
1710
|
+
# M K I N V
|
|
1711
|
+
# Alignment of "Gap=M3 R3 M3" is:
|
|
1712
|
+
# atgaag<<<attaatgtc
|
|
1713
|
+
# M K I I N V
|
|
1714
|
+
#
|
|
1715
|
+
# ---
|
|
1716
|
+
# *Arguments*:
|
|
1717
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
|
1718
|
+
# * _target_: target sequence (amino acid sequence)
|
|
1719
|
+
# * <I>gap_char</I>: gap character
|
|
1720
|
+
# * <I>space_char</I>: space character inserted to amino sequence for matching na-aa alignment
|
|
1721
|
+
# * <I>forward_frameshift</I>: forward frameshift character
|
|
1722
|
+
# * <I>reverse_frameshift</I>: reverse frameshift character
|
|
1723
|
+
def process_sequences_na_aa(reference, target,
                            gap_char = '-',
                            space_char = ' ',
                            forward_frameshift = '>',
                            reverse_frameshift = '<')
  # Works on copies from dup_seqs. Every target character is first
  # padded with two space_char so that one target unit lines up with
  # three reference characters; the gap strings are padded likewise.
  ref_copy, tgt_copy = dup_seqs(reference, target)
  padding = "#{space_char}#{space_char}"
  padded_tgt = tgt_copy.gsub(/./, "\\0#{padding}")

  __process_sequences(ref_copy, padded_tgt,
                      gap_char * 3,              # gap in the reference
                      "#{gap_char}#{padding}",   # gap in the target
                      3,                         # reference step per unit
                      1 + space_char.length * 2, # target step per unit
                      forward_frameshift,
                      reverse_frameshift)
end
|
|
1740
|
+
end #class Gap
|
|
1741
|
+
|
|
1742
|
+
private
|
|
1743
|
+
def parse_attributes(string)
  # Turns an attribute column into an array of [key, value] pairs.
  # Fields are split on ';', key and value on the first '=', and a
  # value is split on ',' into one pair per item. 'Target' and 'Gap'
  # items are converted with Target.parse / Gap.parse; all other items
  # are passed through unescape. A missing column or '.' yields [].
  return [] if !string || string == '.'

  pairs = []
  string.split(';').each do |field|
    key, raw = field.split('=', 2)
    key = unescape(key)
    items = raw.to_s.split(',')
    converted = case key
                when 'Target' then items.map { |v| Target.parse(v) }
                when 'Gap' then items.map { |v| Gap.parse(v) }
                else items.map { |v| unescape(v) }
                end
    converted.each { |v| pairs << [key, v] }
  end
  pairs
end # method parse_attributes
|
|
1762
|
+
|
|
1763
|
+
# Return the attributes as a string as it appears at the end of
|
|
1764
|
+
# a GFF3 line
|
|
1765
|
+
def attributes_to_s(attr)
  # Serialises an array of [key, value] pairs into "key=v1,v2;key2=v".
  # Values sharing a key are grouped under the key's first occurrence,
  # and keys keep their first-seen order. Target values are written
  # with to_s as they are; everything else goes through
  # escape_attribute. Returns '.' when attr is nil, false or empty.
  return '.' if !attr || attr.empty?

  order = []     # keys in first-seen order
  grouped = {}   # key => array of values
  attr.each do |pair|
    key, val = pair[0], pair[1]
    unless grouped.key?(key)
      order << key
      grouped[key] = []
    end
    grouped[key] << val
  end

  fields = order.map do |key|
    rendered = grouped[key].map do |v|
      v.kind_of?(Target) ? v.to_s : escape_attribute(v.to_s)
    end
    "#{escape_attribute(key)}=#{rendered.join(',')}"
  end
  fields.join(';')
end
|
|
1788
|
+
|
|
1789
|
+
end # class GFF3::Record
|
|
1790
|
+
|
|
1791
|
+
# This is a dummy record corresponding to the "###" metadata.
|
|
1792
|
+
class RecordBoundary < GFF3::Record
  # Passes all arguments on to GFF3::Record#initialize, then freezes
  # the new object.
  def initialize(*arg)
    super
    freeze
  end

  # Always rendered as the "###" directive line.
  def to_s
    "###\n"
  end
end #class RecordBoundary
|
|
1802
|
+
|
|
1803
|
+
# stores GFF3 MetaData
|
|
1804
|
+
MetaData = GFF2::MetaData
|
|
1805
|
+
|
|
1806
|
+
# parses metadata
|
|
1807
|
+
def parse_metadata(directive, line)
  # Dispatches one metadata line by its directive name and always
  # returns true.
  if directive == 'gff-version'
    # only the first version seen is kept
    @gff_version = line.split(/\s+/)[1] unless @gff_version
  elsif directive == 'FASTA'
    @in_fasta = true
  elsif directive == 'sequence-region'
    @sequence_regions.push SequenceRegion.parse(line)
  elsif directive == '#'
    # the "###" directive is stored as a boundary among the records
    @records.push RecordBoundary.new
  else
    @metadata.push MetaData.parse(line)
  end
  true
end
|
|
1822
|
+
private :parse_metadata
|
|
1823
|
+
|
|
1824
|
+
end #class GFF3
|
|
1825
|
+
|
|
1826
|
+
end # class GFF
|
|
155
1827
|
|
|
156
1828
|
end # module Bio
|
|
157
1829
|
|