bio 1.2.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,86 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/genbank/genbank_to_biosequence.rb - Bio::GenBank to Bio::Sequence adapter module
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2008
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# $Id:$
|
9
|
+
#
|
10
|
+
|
11
|
+
require 'bio/sequence'
|
12
|
+
require 'bio/sequence/adapter'
|
13
|
+
|
14
|
+
# Internal use only. Normal users should not use this module.
|
15
|
+
#
|
16
|
+
# Bio::GenBank to Bio::Sequence adapter module.
|
17
|
+
# It is internally used in Bio::GenBank#to_biosequence.
|
18
|
+
#
|
19
|
+
module Bio::Sequence::Adapter::GenBank
|
20
|
+
|
21
|
+
extend Bio::Sequence::Adapter
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def_biosequence_adapter :seq
|
26
|
+
|
27
|
+
def_biosequence_adapter :id_namespace do |orig|
|
28
|
+
if /\_/ =~ orig.accession.to_s then
|
29
|
+
'RefSeq'
|
30
|
+
else
|
31
|
+
'GenBank'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def_biosequence_adapter :entry_id
|
36
|
+
|
37
|
+
def_biosequence_adapter :primary_accession, :accession
|
38
|
+
|
39
|
+
def_biosequence_adapter :secondary_accessions do |orig|
|
40
|
+
orig.accessions - [ orig.accession ]
|
41
|
+
end
|
42
|
+
|
43
|
+
def_biosequence_adapter :other_seqids do |orig|
|
44
|
+
if /GI\:(.+)/ =~ orig.gi.to_s then
|
45
|
+
[ Bio::Sequence::DBLink.new('GI', $1) ]
|
46
|
+
else
|
47
|
+
nil
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def_biosequence_adapter :molecule_type, :natype
|
52
|
+
|
53
|
+
def_biosequence_adapter :division
|
54
|
+
|
55
|
+
def_biosequence_adapter :topology, :circular
|
56
|
+
|
57
|
+
def_biosequence_adapter :strandedness
|
58
|
+
|
59
|
+
def_biosequence_adapter :sequence_version, :version
|
60
|
+
|
61
|
+
#--
|
62
|
+
#sequence.date_created = nil #????
|
63
|
+
#++
|
64
|
+
|
65
|
+
def_biosequence_adapter :date_modified
|
66
|
+
|
67
|
+
def_biosequence_adapter :definition
|
68
|
+
|
69
|
+
def_biosequence_adapter :keywords
|
70
|
+
|
71
|
+
def_biosequence_adapter :species, :organism
|
72
|
+
|
73
|
+
def_biosequence_adapter :classification
|
74
|
+
|
75
|
+
#--
|
76
|
+
#sequence.organelle = nil # yet unsupported
|
77
|
+
#++
|
78
|
+
|
79
|
+
def_biosequence_adapter :comments, :comment
|
80
|
+
|
81
|
+
def_biosequence_adapter :references
|
82
|
+
|
83
|
+
def_biosequence_adapter :features
|
84
|
+
|
85
|
+
end #module Bio::Sequence::Adapter::GenBank
|
86
|
+
|
data/lib/bio/db/gff.rb
CHANGED
@@ -4,154 +4,1826 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2003, 2005
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
|
7
|
+
# 2008 Naohisa Goto <ng@bioruby.org>
|
7
8
|
# License:: The Ruby License
|
8
9
|
#
|
9
|
-
# $Id
|
10
|
+
# $Id:$
|
10
11
|
#
|
12
|
+
require 'uri'
|
13
|
+
require 'strscan'
|
14
|
+
require 'enumerator'
|
15
|
+
require 'bio/db/fasta'
|
11
16
|
|
12
17
|
module Bio
|
13
|
-
# == DESCRIPTION
|
14
|
-
# The Bio::GFF and Bio::GFF::Record classes describe data contained in a
|
15
|
-
# GFF-formatted file. For information on the GFF format, see
|
16
|
-
# http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
|
17
|
-
# delimited format, including
|
18
|
-
# * seqname
|
19
|
-
# * source
|
20
|
-
# * feature
|
21
|
-
# * start
|
22
|
-
# * end
|
23
|
-
# * score
|
24
|
-
# * strand
|
25
|
-
# * frame
|
26
|
-
# * attributes (optional)
|
27
|
-
#
|
28
|
-
# For example:
|
29
|
-
# SEQ1 EMBL atg 103 105 . + 0
|
30
|
-
# SEQ1 EMBL exon 103 172 . + 0
|
31
|
-
# SEQ1 EMBL splice5 172 173 . + .
|
32
|
-
# SEQ1 netgene splice5 172 173 0.94 + .
|
33
|
-
# SEQ1 genie sp5-20 163 182 2.3 + .
|
34
|
-
# SEQ1 genie sp5-10 168 177 2.1 + .
|
35
|
-
# SEQ1 grail ATG 17 19 2.1 - 0
|
36
|
-
#
|
37
|
-
# The Bio::GFF object is a container for Bio::GFF::Record objects, each
|
38
|
-
# representing a single line in the GFF file.
|
39
|
-
class GFF
|
40
|
-
# Creates a Bio::GFF object by building a collection of Bio::GFF::Record
|
41
|
-
# objects.
|
18
|
+
# == DESCRIPTION
|
19
|
+
# The Bio::GFF and Bio::GFF::Record classes describe data contained in a
|
20
|
+
# GFF-formatted file. For information on the GFF format, see
|
21
|
+
# http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab-
|
22
|
+
# delimited format, including
|
23
|
+
# * seqname
|
24
|
+
# * source
|
25
|
+
# * feature
|
26
|
+
# * start
|
27
|
+
# * end
|
28
|
+
# * score
|
29
|
+
# * strand
|
30
|
+
# * frame
|
31
|
+
# * attributes (optional)
|
42
32
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
33
|
+
# For example:
|
34
|
+
# SEQ1 EMBL atg 103 105 . + 0
|
35
|
+
# SEQ1 EMBL exon 103 172 . + 0
|
36
|
+
# SEQ1 EMBL splice5 172 173 . + .
|
37
|
+
# SEQ1 netgene splice5 172 173 0.94 + .
|
38
|
+
# SEQ1 genie sp5-20 163 182 2.3 + .
|
39
|
+
# SEQ1 genie sp5-10 168 177 2.1 + .
|
40
|
+
# SEQ1 grail ATG 17 19 2.1 - 0
|
41
|
+
#
|
42
|
+
# The Bio::GFF object is a container for Bio::GFF::Record objects, each
|
43
|
+
# representing a single line in the GFF file.
|
44
|
+
class GFF
|
45
|
+
# Creates a Bio::GFF object by building a collection of Bio::GFF::Record
|
46
|
+
# objects.
|
47
|
+
#
|
48
|
+
# Create a Bio::GFF object the hard way
|
49
|
+
# this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n"
|
50
|
+
# this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n"
|
51
|
+
# this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n"
|
52
|
+
# this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n"
|
53
|
+
# this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n"
|
54
|
+
# this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n"
|
55
|
+
# this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n"
|
56
|
+
# p Bio::GFF.new(this_gff)
|
57
|
+
#
|
58
|
+
# or create one based on a GFF-formatted file:
|
59
|
+
# p Bio::GFF.new(File.open('my_data.gff'))
|
60
|
+
# ---
|
61
|
+
# *Arguments*:
|
62
|
+
# * _str_: string in GFF format
|
63
|
+
# *Returns*:: Bio::GFF object
|
64
|
+
def initialize(str = '')
|
65
|
+
@records = Array.new
|
66
|
+
str.each_line do |line|
|
67
|
+
@records << Record.new(line)
|
68
|
+
end
|
63
69
|
end
|
64
|
-
end
|
65
70
|
|
66
|
-
|
67
|
-
|
71
|
+
# An array of Bio::GFF::Record objects.
|
72
|
+
attr_accessor :records
|
68
73
|
|
69
|
-
|
70
|
-
|
71
|
-
|
74
|
+
# Represents a single line of a GFF-formatted file. See Bio::GFF for more
|
75
|
+
# information.
|
76
|
+
class Record
|
72
77
|
|
73
|
-
|
74
|
-
|
78
|
+
# Name of the reference sequence
|
79
|
+
attr_accessor :seqname
|
75
80
|
|
76
|
-
|
77
|
-
|
81
|
+
# Name of the source of the feature (e.g. program that did prediction)
|
82
|
+
attr_accessor :source
|
78
83
|
|
79
|
-
|
80
|
-
|
84
|
+
# Name of the feature
|
85
|
+
attr_accessor :feature
|
81
86
|
|
82
|
-
|
83
|
-
|
87
|
+
# Start position of feature on reference sequence
|
88
|
+
attr_accessor :start
|
84
89
|
|
85
|
-
|
86
|
-
|
90
|
+
# End position of feature on reference sequence
|
91
|
+
attr_accessor :end
|
87
92
|
|
88
|
-
|
89
|
-
|
93
|
+
# Score of annotation (e.g. e-value for BLAST search)
|
94
|
+
attr_accessor :score
|
90
95
|
|
91
|
-
|
92
|
-
|
96
|
+
# Strand that feature is located on
|
97
|
+
attr_accessor :strand
|
93
98
|
|
94
|
-
|
95
|
-
|
99
|
+
# For features of type 'exon': indicates where feature begins in the reading frame
|
100
|
+
attr_accessor :frame
|
96
101
|
|
97
|
-
|
98
|
-
|
102
|
+
# List of tag=value pairs (e.g. to store name of the feature: ID=my_id)
|
103
|
+
attr_accessor :attributes
|
99
104
|
|
100
|
-
|
101
|
-
|
105
|
+
# Comments for the GFF record
|
106
|
+
attr_accessor :comment
|
102
107
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
def initialize(str)
|
109
|
-
@comments = str.chomp[/#.*/]
|
110
|
-
return if /^#/.match(str)
|
111
|
-
@seqname, @source, @feature, @start, @end, @score, @strand, @frame,
|
112
|
-
attributes, = str.chomp.split("\t")
|
113
|
-
@attributes = parse_attributes(attributes) if attributes
|
114
|
-
end
|
108
|
+
# "comments" is deprecated. Instead, use "comment".
|
109
|
+
def comments
|
110
|
+
#warn "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"." if $VERBOSE
|
111
|
+
self.comment
|
112
|
+
end
|
115
113
|
|
116
|
-
|
114
|
+
# "comments=" is deprecated. Instead, use "comment=".
|
115
|
+
def comments=(str)
|
116
|
+
#warn "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"." if $VERBOSE
|
117
|
+
self.comment = str
|
118
|
+
end
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
120
|
+
# Creates a Bio::GFF::Record object. Is typically not called directly, but
|
121
|
+
# is called automatically when creating a Bio::GFF object.
|
122
|
+
# ---
|
123
|
+
# *Arguments*:
|
124
|
+
# * _str_: a tab-delimited line in GFF format
|
125
|
+
def initialize(str)
|
126
|
+
@comment = str.chomp[/#.*/]
|
127
|
+
return if /^#/.match(str)
|
128
|
+
@seqname, @source, @feature, @start, @end, @score, @strand, @frame,
|
129
|
+
attributes, = str.chomp.split("\t")
|
130
|
+
@attributes = parse_attributes(attributes) if attributes
|
123
131
|
end
|
124
|
-
return hash
|
125
|
-
end
|
126
|
-
end
|
127
132
|
|
128
|
-
|
129
|
-
# Represents version 2 of GFF specification. Is completely implemented by the
|
130
|
-
# Bio::GFF class.
|
131
|
-
class GFF2 < GFF
|
132
|
-
VERSION = 2
|
133
|
-
end
|
133
|
+
private
|
134
134
|
|
135
|
-
|
136
|
-
|
137
|
-
# Bio::GFF class. For more information on version GFF3, see
|
138
|
-
# http://flybase.bio.indiana.edu/annot/gff3.html
|
139
|
-
class GFF3 < GFF
|
140
|
-
VERSION = 3
|
135
|
+
def parse_attributes(attributes)
|
136
|
+
hash = Hash.new
|
141
137
|
|
142
|
-
|
138
|
+
sc = StringScanner.new(attributes)
|
139
|
+
attrs = []
|
140
|
+
token = ''
|
141
|
+
while !sc.eos?
|
142
|
+
if sc.scan(/[^\\\;\"]+/) then
|
143
|
+
token.concat sc.matched
|
144
|
+
elsif sc.scan(/\;/) then
|
145
|
+
attrs.push token unless token.empty?
|
146
|
+
token = ''
|
147
|
+
elsif sc.scan(/\"/) then
|
148
|
+
origtext = sc.matched
|
149
|
+
while !sc.eos?
|
150
|
+
if sc.scan(/[^\\\"]+/) then
|
151
|
+
origtext.concat sc.matched
|
152
|
+
elsif sc.scan(/\"/) then
|
153
|
+
origtext.concat sc.matched
|
154
|
+
break
|
155
|
+
elsif sc.scan(/\\([\"\\])/) then
|
156
|
+
origtext.concat sc.matched
|
157
|
+
elsif sc.scan(/\\/) then
|
158
|
+
origtext.concat sc.matched
|
159
|
+
else
|
160
|
+
raise 'Bug: should not reach here'
|
161
|
+
end
|
162
|
+
end
|
163
|
+
token.concat origtext
|
164
|
+
elsif sc.scan(/\\\;/) then
|
165
|
+
token.concat sc.matched
|
166
|
+
elsif sc.scan(/\\/) then
|
167
|
+
token.concat sc.matched
|
168
|
+
else
|
169
|
+
raise 'Bug: should not reach here'
|
170
|
+
end #if
|
171
|
+
end #while
|
172
|
+
attrs.push token unless token.empty?
|
143
173
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
174
|
+
attrs.each do |x|
|
175
|
+
key, value = x.split(' ', 2)
|
176
|
+
key.strip!
|
177
|
+
value.strip! if value
|
178
|
+
hash[key] = value
|
179
|
+
end
|
180
|
+
hash
|
149
181
|
end
|
150
|
-
return hash
|
151
|
-
end
|
152
|
-
end
|
153
182
|
|
154
|
-
end #
|
183
|
+
end #Class Record
|
184
|
+
|
185
|
+
# = DESCRIPTION
|
186
|
+
# Represents version 2 of GFF specification.
|
187
|
+
# Its behavior is somewhat different from Bio::GFF,
|
188
|
+
# especially for attributes.
|
189
|
+
#
|
190
|
+
class GFF2 < GFF
|
191
|
+
VERSION = 2
|
192
|
+
|
193
|
+
# string representation of the whole entry.
|
194
|
+
def to_s
|
195
|
+
ver = @gff_version || VERSION.to_s
|
196
|
+
ver = ver.gsub(/[\r\n]+/, ' ')
|
197
|
+
([ "##gff-version #{ver}\n" ] +
|
198
|
+
@metadata.collect { |m| m.to_s } +
|
199
|
+
@records.collect{ |r| r.to_s }).join('')
|
200
|
+
end
|
201
|
+
|
202
|
+
# Private methods for GFF2 escaping characters.
|
203
|
+
# Internal only. Users should not use this module directly.
|
204
|
+
module Escape
|
205
|
+
# unsafe characters to be escaped
|
206
|
+
UNSAFE_GFF2 = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] \x80-\xfd><;=,%^&\|`]/n
|
207
|
+
|
208
|
+
# GFF2 standard identifier
|
209
|
+
IDENTIFIER_GFF2 = /\A[A-Za-z][A-Za-z0-9_]*\z/n
|
210
|
+
|
211
|
+
# GFF2 numeric value
|
212
|
+
NUMERIC_GFF2 = /\A[-+]?([0-9]+|[0-9]*\.[0-9]*)([eE][+-]?[0-9]+)?\z/n
|
213
|
+
|
214
|
+
# List of 1-letter special backslash code.
|
215
|
+
# Letters other than those listed here are the same as
|
216
|
+
# those without the backslash, except for "x" and digits.
|
217
|
+
# (Note that \u (unicode) is not supported.)
|
218
|
+
BACKSLASH = {
|
219
|
+
't' => "\t",
|
220
|
+
'n' => "\n",
|
221
|
+
'r' => "\r",
|
222
|
+
'f' => "\f",
|
223
|
+
'b' => "\b",
|
224
|
+
'a' => "\a",
|
225
|
+
'e' => "\e",
|
226
|
+
'v' => "\v",
|
227
|
+
# 's' => " ",
|
228
|
+
}.freeze
|
229
|
+
|
230
|
+
# inverted hash of BACKSLASH
|
231
|
+
CHAR2BACKSLASH = BACKSLASH.invert.freeze
|
232
|
+
|
233
|
+
# inverted hash of BACKSLASH, including double quote and backslash
|
234
|
+
CHAR2BACKSLASH_EXTENDED =
|
235
|
+
CHAR2BACKSLASH.merge({ '"' => '"', "\\" => "\\" }).freeze
|
236
|
+
|
237
|
+
# prohibited characters in GFF2 columns
|
238
|
+
PROHIBITED_GFF2_COLUMNS = /[\t\r\n\x00-\x1f\x7f\xfe\xff]/
|
239
|
+
|
240
|
+
# prohibited characters in GFF2 attribute tags
|
241
|
+
PROHIBITED_GFF2_TAGS = /[\s\"\;\t\r\n\x00-\x1f\x7f\xfe\xff]/
|
242
|
+
|
243
|
+
private
|
244
|
+
# (private) escapes GFF2 free text string
|
245
|
+
def escape_gff2_freetext(str)
|
246
|
+
'"' + str.gsub(UNSAFE_GFF2) do |x|
|
247
|
+
"\\" + (CHAR2BACKSLASH_EXTENDED[x] || char2octal(x))
|
248
|
+
end + '"'
|
249
|
+
end
|
250
|
+
|
251
|
+
# (private) "x" => "\\oXXX"
|
252
|
+
# "x" must be a letter.
|
253
|
+
# If "x" consists of two or more bytes, the octal codes are joined with "\\".
|
254
|
+
def char2octal(x)
|
255
|
+
x.enum_for(:each_byte).collect { |y|
|
256
|
+
sprintf("%03o", y) }.join("\\")
|
257
|
+
end
|
258
|
+
|
259
|
+
# (private) escapes GFF2 attribute value string
|
260
|
+
def escape_gff2_attribute_value(str)
|
261
|
+
freetext?(str) ? escape_gff2_freetext(str) : str
|
262
|
+
end
|
263
|
+
|
264
|
+
# (private) check if the given string is a free text to be quoted
|
265
|
+
# by double quotes.
|
266
|
+
def freetext?(str)
|
267
|
+
if IDENTIFIER_GFF2 =~ str or
|
268
|
+
NUMERIC_GFF2 =~ str then
|
269
|
+
false
|
270
|
+
else
|
271
|
+
true
|
272
|
+
end
|
273
|
+
end
|
274
|
+
|
275
|
+
# (private) escapes normal columns in GFF2
|
276
|
+
def gff2_column_to_s(str)
|
277
|
+
str = str.to_s
|
278
|
+
str = str.empty? ? '.' : str
|
279
|
+
str = str.gsub(PROHIBITED_GFF2_COLUMNS) do |x|
|
280
|
+
"\\" + (CHAR2BACKSLASH[x] || char2octal(x))
|
281
|
+
end
|
282
|
+
if str[0, 1] == '#' then
|
283
|
+
str[0, 1] = "\\043"
|
284
|
+
end
|
285
|
+
str
|
286
|
+
end
|
287
|
+
|
288
|
+
# (private) escapes GFF2 attribute tag string
|
289
|
+
def escape_gff2_attribute_tag(str)
|
290
|
+
str = str.to_s
|
291
|
+
str = str.empty? ? '.' : str
|
292
|
+
str = str.gsub(PROHIBITED_GFF2_TAGS) do |x|
|
293
|
+
"\\" + (CHAR2BACKSLASH[x] || char2octal(x))
|
294
|
+
end
|
295
|
+
if str[0, 1] == '#' then
|
296
|
+
str[0, 1] = "\\043"
|
297
|
+
end
|
298
|
+
str
|
299
|
+
end
|
300
|
+
|
301
|
+
# (private) dummy method, will be redefined in GFF3.
|
302
|
+
def unescape(str)
|
303
|
+
str
|
304
|
+
end
|
305
|
+
end #module Escape
|
306
|
+
|
307
|
+
# Stores GFF2 record.
|
308
|
+
class Record < GFF::Record
|
309
|
+
|
310
|
+
include Escape
|
311
|
+
|
312
|
+
# Stores GFF2 attribute's value.
|
313
|
+
class Value
|
314
|
+
|
315
|
+
include Escape
|
316
|
+
|
317
|
+
# Creates a new Value object.
|
318
|
+
# Note that the given array _values_ is directly stored in
|
319
|
+
# the object.
|
320
|
+
#
|
321
|
+
# ---
|
322
|
+
# *Arguments*:
|
323
|
+
# * (optional) _values_: Array containing String objects.
|
324
|
+
# *Returns*:: Value object.
|
325
|
+
def initialize(values = [])
|
326
|
+
@values = values
|
327
|
+
end
|
328
|
+
|
329
|
+
# Returns string representation of this Value object.
|
330
|
+
# ---
|
331
|
+
# *Returns*:: String
|
332
|
+
def to_s
|
333
|
+
@values.collect do |str|
|
334
|
+
escape_gff2_attribute_value(str)
|
335
|
+
end.join(' ')
|
336
|
+
end
|
337
|
+
|
338
|
+
# Returns all values in this object.
|
339
|
+
#
|
340
|
+
# Note that modification of the returned array would affect
|
341
|
+
# original Value object.
|
342
|
+
# ---
|
343
|
+
# *Returns*:: Array
|
344
|
+
def values
|
345
|
+
@values
|
346
|
+
end
|
347
|
+
alias to_a values
|
348
|
+
|
349
|
+
# Returns true if other == self.
|
350
|
+
# Otherwise, returns false.
|
351
|
+
def ==(other)
|
352
|
+
return false unless other.kind_of?(self.class) or
|
353
|
+
self.kind_of?(other.class)
|
354
|
+
self.values == other.values rescue super(other)
|
355
|
+
end
|
356
|
+
end #class Value
|
357
|
+
|
358
|
+
|
359
|
+
# Parses a GFF2-formatted line and returns a new
|
360
|
+
# Bio::GFF::GFF2::Record object.
|
361
|
+
def self.parse(str)
|
362
|
+
self.new.parse(str)
|
363
|
+
end
|
364
|
+
|
365
|
+
# Creates a Bio::GFF::GFF2::Record object.
|
366
|
+
# Is typically not called directly, but
|
367
|
+
# is called automatically when creating a Bio::GFF::GFF2 object.
|
368
|
+
#
|
369
|
+
# ---
|
370
|
+
# *Arguments*:
|
371
|
+
# * _str_: a tab-delimited line in GFF2 format
|
372
|
+
# *Arguments* (alternative form, when more than one argument is given):
|
373
|
+
# * _seqname_: seqname (String or nil)
|
374
|
+
# * _source_: source (String or nil)
|
375
|
+
# * _feature_: feature type (String)
|
376
|
+
# * _start_position_: start (Integer)
|
377
|
+
# * _end_position_: end (Integer)
|
378
|
+
# * _score_: score (Float or nil)
|
379
|
+
# * _strand_: strand (String or nil)
|
380
|
+
# * _frame_: frame (Integer or nil)
|
381
|
+
# * _attributes_: attributes (Array or nil)
|
382
|
+
def initialize(*arg)
|
383
|
+
if arg.size == 1 then
|
384
|
+
parse(arg[0])
|
385
|
+
else
|
386
|
+
@seqname, @source, @feature,
|
387
|
+
start, endp, @score, @strand, frame,
|
388
|
+
@attributes = arg
|
389
|
+
@start = start ? start.to_i : nil
|
390
|
+
@end = endp ? endp.to_i : nil
|
391
|
+
@score = score ? score.to_f : nil
|
392
|
+
@frame = frame ? frame.to_i : nil
|
393
|
+
end
|
394
|
+
@attributes ||= []
|
395
|
+
end
|
396
|
+
|
397
|
+
# Comment for the GFF record
|
398
|
+
attr_accessor :comment
|
399
|
+
|
400
|
+
# "comments" is deprecated. Instead, use "comment".
|
401
|
+
def comments
|
402
|
+
warn "#{self.class.to_s}#comments is deprecated. Instead, use \"comment\"."
|
403
|
+
self.comment
|
404
|
+
end
|
405
|
+
|
406
|
+
# "comments=" is deprecated. Instead, use "comment=".
|
407
|
+
def comments=(str)
|
408
|
+
warn "#{self.class.to_s}#comments= is deprecated. Instead, use \"comment=\"."
|
409
|
+
self.comment = str
|
410
|
+
end
|
411
|
+
|
412
|
+
# Parses a GFF2-formatted line and stores data from the string.
# The eight fixed columns and the attributes are always overwritten.
#
# A line whose first non-blank character is "#" is treated as a
# comment-only line. Otherwise the line is split on tabs; text after "#"
# in a tenth column is stored as the comment. A tenth column that has no
# "#" is ignored (it used to raise NoMethodError).
#
# ---
# *Arguments*:
# * _string_: a line in GFF2 format
# *Returns*:: the parsed attributes (Array)
def parse(string)
  if /^\s*\#/ =~ string then
    @comment = string[/\#(.*)/, 1].chomp
    columns = []
  else
    columns = string.chomp.split("\t", 10)
    if columns[9] then
      # The capture is nil when the extra column holds no "#".
      trailing = columns[9][/\#(.*)/, 1]
      @comment = trailing.chomp if trailing
    end
  end

  # "." marks an empty column and is stored as nil.
  @seqname, @source, @feature,
  start, endp, score, @strand, frame =
    columns[0, 8].collect { |x|
      str = unescape(x)
      str == '.' ? nil : str
    }
  @start = start ? start.to_i : nil
  @end   = endp  ? endp.to_i  : nil
  @score = score ? score.to_f : nil
  @frame = frame ? frame.to_i : nil

  @attributes = parse_attributes(columns[8])
end
|
436
|
+
|
437
|
+
# Tells whether the record carries nothing but a comment, i.e. all eight
# fixed columns are unset and the attribute list is empty.
# Returns true or false.
def comment_only?
  fixed_columns = [ @seqname, @source, @feature, @start,
                    @end, @score, @strand, @frame ]
  return false if fixed_columns.any?
  @attributes.empty?
end
|
454
|
+
|
455
|
+
# Returns the record as a GFF2 compatible string (one line, terminated
# by a newline). A comment-only record is rendered as "#comment".
def to_s
  note = false
  if @comment and !@comment.to_s.strip.empty? then
    # Line breaks inside a comment would split the record.
    note = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{note}\n" if self.comment_only? and note

  fields = [ @seqname, @source, @feature, @start,
             @end, @score, @strand, @frame ].collect do |column|
    gff2_column_to_s(column)
  end
  fields.push attributes_to_s(@attributes)

  line = fields.join("\t")
  line + (note ? "\t\##{note}\n" : "\n")
end
|
476
|
+
|
477
|
+
# Returns true if self == other. Otherwise, returns false.
# Two records are equal when the inherited comparison says so, or when
# they have the same class and all nine columns compare equal.
def ==(other)
  inherited = super
  return inherited if inherited
  return false unless self.class == other.class
  columns = [ :seqname, :source, :feature, :start, :end,
              :score, :strand, :frame, :attributes ]
  columns.all? { |name| self.__send__(name) == other.__send__(name) }
end
|
491
|
+
|
492
|
+
# Looks up the value stored for the given tag.
#
# When several tag-value pairs share the tag, only the value of the
# first one is returned.
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: String, Bio::GFF::GFF2::Record::Value object, or nil.
def get_attribute(tag)
  pair = @attributes.assoc(tag)
  return nil unless pair
  pair[1]
end
alias attribute get_attribute
|
505
|
+
|
506
|
+
# Collects every value stored for the given tag, in order.
# The result is always an array (empty when the tag is absent).
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: Array containing String or \
# Bio::GFF::GFF2::Record::Value objects.
def get_attributes(tag)
  values = []
  @attributes.each do |pair|
    values.push(pair[1]) if pair[0] == tag
  end
  values
end
|
520
|
+
|
521
|
+
# Stores _value_ under the given tag.
# An existing pair with that tag has its value replaced; when several
# pairs share the tag, only the first one is touched.
#
# When the tag is absent, a new tag-value pair (with a copy of the tag
# string) is appended.
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: _value_
def set_attribute(tag, value)
  position = @attributes.index { |pair| pair[0] == tag }
  if position
    @attributes[position][1] = value
  else
    @attributes.push [ String.new(tag), value ]
  end
  value
end
|
544
|
+
|
545
|
+
# Replaces all values of the given tag with the new values.
# Existing pairs with the tag are reused in order; surplus existing pairs
# are removed, and surplus new values are appended as new pairs.
# When the tag is absent, all values are appended.
#
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _values_: String or Bio::GFF::GFF2::Record::Value objects.
# *Returns*:: _self_
def replace_attributes(tag, *values)
  used = 0
  @attributes.reject! do |pair|
    next false unless pair[0] == tag
    # No replacement left for this pair: drop it.
    next true if used >= values.size
    pair[1] = values[used]
    used += 1
    false
  end
  # Values that found no existing pair become new pairs.
  values.drop(used).each do |value|
    @attributes.push [ String.new(tag), value ]
  end
  self
end
|
575
|
+
|
576
|
+
# Adds a new tag-value pair to the end of the attribute list.
# The tag string is copied before it is stored.
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: _value_
def add_attribute(tag, value)
  @attributes.push([ String.new(tag), value ])
  # Return the value as documented (Array#push would return the whole
  # attribute list).
  value
end
|
585
|
+
|
586
|
+
# Removes one specific tag-value pair.
#
# When the same pair occurs more than once, only the first occurrence
# is removed.
#
# ---
# *Arguments*:
# * (required) _tag_: String
# * (required) _value_: String or Bio::GFF::GFF2::Record::Value object.
# *Returns*:: if removed, _value_. Otherwise, nil.
def delete_attribute(tag, value)
  position = @attributes.index([ tag, value ])
  return nil unless position
  @attributes.delete_at(position)[1]
end
|
604
|
+
|
605
|
+
# Removes every tag-value pair that has the specified tag.
#
# ---
# *Arguments*:
# * (required) _tag_: String
# *Returns*:: if removed, self. Otherwise, nil.
def delete_attributes(tag)
  # Array#reject! yields nil when nothing was removed.
  changed = @attributes.reject! { |pair| pair[0] == tag }
  return nil unless changed
  self
end
|
616
|
+
|
617
|
+
# Reorders the attributes by tag name.
# Without a block, pairs are ordered by the position of their tag in
# _tags_; tags not listed go last. With a block, _tags_ is ignored and
# the block receives two tag names, like Array#sort!.
# Pairs that compare equal keep their previous relative order.
#
# ---
# *Arguments*:
# * (required or optional) _tags_: Array containing String objects
# *Returns*:: _self_
def sort_attributes_by_tag!(tags = nil)
  # Remember where each pair was, to break ties by original position.
  position = {}
  fallback = @attributes.size
  @attributes.each_with_index { |pair, idx| position[pair] = idx }
  by_position = lambda do |a, b|
    (position[a] || fallback) <=> (position[b] || fallback)
  end

  if block_given? then
    @attributes.sort! do |a, b|
      order = yield a[0], b[0]
      order == 0 ? by_position.call(a, b) : order
    end
  else
    unless tags then
      raise ArgumentError, 'wrong number of arguments (0 for 1) or wrong argument value'
    end
    @attributes.sort! do |a, b|
      order = (tags.index(a[0]) || tags.size) <=>
        (tags.index(b[0]) || tags.size)
      order == 0 ? by_position.call(a, b) : order
    end
  end
  self
end
|
652
|
+
|
653
|
+
# Builds a Hash from the attribute list.
#
# Note: when several pairs share a tag, a later value only replaces an
# earlier one if the earlier value is nil or false; normally the first
# pair wins.
#
# ---
# *Returns*:: Hash object
def attributes_to_hash
  @attributes.inject({}) do |hash, (key, val)|
    hash[key] ||= val
    hash
  end
end
|
668
|
+
|
669
|
+
private
|
670
|
+
|
671
|
+
# (private) Parses the attributes column.
# Returns an array of [ tag, value ] pairs. A tag followed by exactly one
# token gets that token as its value; any other token count is wrapped
# in a Value object. nil and "." give an empty array.
def parse_attributes(str)
  return [] if !str or str == '.'
  parse_attributes_string(str).collect do |tokens|
    tag, *rest = tokens
    value = (rest.size == 1) ? rest[0] : Value.new(rest)
    [ tag, value ]
  end
end
|
683
|
+
|
684
|
+
# (private) Tokenizes an attributes string.
# Returns an array of token arrays, one per ";"-separated group. Tokens
# are separated by whitespace; double-quoted free text forms one token
# and may contain backslash escapes (\", \\, \xHH, \OOO, and the
# single-letter escapes listed in Escape::BACKSLASH).
def parse_attributes_string(str)
  scanner = StringScanner.new(str)
  groups = []
  group = []
  word = ''
  until scanner.eos?
    if scanner.scan(/[^\\\;\"\s]+/) then
      # plain run of characters: part of the current bare token
      word.concat scanner.matched
    elsif scanner.scan(/\s+/) then
      # whitespace ends the current bare token
      group.push word unless word.empty?
      word = ''
    elsif scanner.scan(/\;/) then
      # semicolon ends the current group
      group.push word unless word.empty?
      word = ''
      groups.push group
      group = []
    elsif scanner.scan(/\"/) then
      # opening quote: read free text up to the closing quote (or the end)
      group.push word unless word.empty?
      word = ''
      text = ''
      until scanner.eos?
        if scanner.scan(/[^\\\"]+/) then
          text.concat scanner.matched
        elsif scanner.scan(/\"/) then
          break
        elsif scanner.scan(/\\([\"\\])/) then
          text.concat scanner[1]
        elsif scanner.scan(/\\x([0-9a-fA-F][0-9a-fA-F])/n) then
          text.concat scanner[1].to_i(16).chr
        elsif scanner.scan(/\\([0-7][0-7][0-7])/n) then
          text.concat scanner[1].to_i(8).chr
        elsif scanner.scan(/\\([^x0-9])/n) then
          # unknown escapes are kept verbatim, backslash included
          text.concat(Escape::BACKSLASH[scanner[1]] || scanner.matched)
        elsif scanner.scan(/\\/) then
          text.concat scanner.matched
        else
          raise 'Bug: should not reach here'
        end
      end
      group.push text
    elsif scanner.scan(/\\/) then
      # outside free text a backslash is an ordinary character
      # ("\;" is deliberately not treated as an escape here)
      word.concat scanner.matched
    else
      raise 'Bug: should not reach here'
    end
  end
  group.push word unless word.empty?
  groups.push group unless group.empty?
  return groups
end
|
743
|
+
|
744
|
+
# (private) Renders a list of [ tag, value ] pairs as a GFF2 attributes
# column: "tag value" items joined by " ; ".
# In verbose mode a warning is printed for tags that do not match
# Escape::IDENTIFIER_GFF2.
def attributes_to_s(attr)
  items = attr.collect do |tag, val|
    unless Escape::IDENTIFIER_GFF2 =~ tag
      warn "Illegal GFF2 attribute tag: #{tag.inspect}" if $VERBOSE
    end
    tagstr = gff2_column_to_s(tag)
    # Value objects render themselves; anything else is escaped here.
    valstr = val.kind_of?(Value) ? val.to_s : escape_gff2_attribute_value(val)
    "#{tagstr} #{valstr}"
  end
  items.join(' ; ')
end
|
760
|
+
end #class Record
|
761
|
+
|
762
|
+
# Stores one GFF2 meta-data line ("##directive data").
class MetaData
  # Directive. Usually, one of "feature-ontology", "attribute-ontology",
  # or "source-ontology".
  attr_accessor :directive

  # data of this entry
  attr_accessor :data

  # Creates a new MetaData object from a directive name and optional data.
  def initialize(directive, data = nil)
    @directive, @data = directive, data
  end

  # Parses a line: the first whitespace-separated word (minus a leading
  # "##") becomes the directive, the rest of the line becomes the data.
  def self.parse(line)
    name, payload = line.chomp.split(/\s+/, 2)
    name = name.sub(/\A\#\#/, '') if name
    new(name, payload)
  end

  # String representation of this meta-data, newline terminated.
  # Line breaks inside the directive or the data are turned into spaces.
  def to_s
    name = @directive.to_s.gsub(/[\r\n]+/, ' ')
    payload = @data.to_s
    suffix = payload.empty? ? nil : ' ' + payload.gsub(/[\r\n]+/, ' ')
    "\#\##{name}#{suffix}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    same = (self.class == other.class and
            self.directive == other.directive and
            self.data == other.data)
    same ? true : false
  end
end #class MetaData
|
802
|
+
|
803
|
+
# (private) Handles one "##" meta-data line.
# "gff-version" sets the version (first occurrence wins); every other
# directive is stored as a MetaData object. Always returns true.
def parse_metadata(directive, line)
  if 'gff-version' == directive
    @gff_version ||= line.split(/\s+/)[1]
  else
    @metadata.push MetaData.parse(line)
  end
  true
end
private :parse_metadata
|
814
|
+
|
815
|
+
# Creates a Bio::GFF::GFF2 object, a collection of
# Bio::GFF::GFF2::Record (and metadata) objects.
# When a string is given, it is parsed immediately.
#
# ---
# *Arguments*:
# * _str_: string in GFF format
# *Returns*:: Bio::GFF::GFF2 object
def initialize(str = nil)
  @gff_version, @records, @metadata = nil, [], []
  parse(str) if str
end
|
828
|
+
|
829
|
+
# GFF2 version string (String or nil). nil means "2".
|
830
|
+
attr_reader :gff_version
|
831
|
+
|
832
|
+
# Metadata (except "##gff-version").
|
833
|
+
# Must be an array of Bio::GFF::GFF2::MetaData objects.
|
834
|
+
attr_accessor :metadata
|
835
|
+
|
836
|
+
# Parses GFF2 entries and appends the parsed data to this object.
# Lines starting with "##" followed by a non-space character are handled
# as meta-data; every other line becomes a record.
#
# ---
# *Arguments*:
# * _str_: string in GFF format
# *Returns*:: self
def parse(str)
  str.each_line do |line|
    directive = line[/^\#\#([^\s]+)/, 1]
    if directive
      parse_metadata(directive, line)
    else
      @records << GFF2::Record.new(line)
    end
  end
  self
end
|
853
|
+
|
854
|
+
end #class GFF2
|
855
|
+
|
856
|
+
# = DESCRIPTION
|
857
|
+
# Represents version 3 of GFF specification.
|
858
|
+
# For more information on version GFF3, see
|
859
|
+
# http://song.sourceforge.net/gff3.shtml
|
860
|
+
#--
|
861
|
+
# obsolete URL:
|
862
|
+
# http://flybase.bio.indiana.edu/annot/gff3.html
|
863
|
+
#++
|
864
|
+
class GFF3 < GFF
|
865
|
+
VERSION = 3
|
866
|
+
|
867
|
+
# Creates a Bio::GFF::GFF3 object, a collection of
# Bio::GFF::GFF3::Record (and metadata) objects.
# When a string is given, it is parsed immediately.
#
# ---
# *Arguments*:
# * _str_: string in GFF format
# *Returns*:: Bio::GFF object
def initialize(str = nil)
  @gff_version = nil
  @records, @sequence_regions, @metadata, @sequences = [], [], [], []
  # becomes true once fasta data has started
  @in_fasta = false
  parse(str) if str
end
|
883
|
+
|
884
|
+
# GFF3 version string (String or nil). nil means "3".
|
885
|
+
attr_reader :gff_version
|
886
|
+
|
887
|
+
# Metadata of "##sequence-region".
|
888
|
+
# Must be an array of Bio::GFF::GFF3::SequenceRegion objects.
|
889
|
+
attr_accessor :sequence_regions
|
890
|
+
|
891
|
+
# Metadata (except "##sequence-region", "##gff-version", "###").
|
892
|
+
# Must be an array of Bio::GFF::GFF3::MetaData objects.
|
893
|
+
attr_accessor :metadata
|
894
|
+
|
895
|
+
# Sequences bundled within GFF3.
|
896
|
+
# Must be an array of Bio::Sequence objects.
|
897
|
+
attr_accessor :sequences
|
898
|
+
|
899
|
+
# Parses GFF3 entries and appends the parsed data to this object.
#
# Note that once fasta data has started (a "##FASTA" line or a line
# beginning with ">"), only fasta-formatted text is accepted.
# An empty string is accepted and changes nothing.
#
# ---
# *Arguments*:
# * _str_: string in GFF format (an object responding to "gets" is
#   read line by line instead of being split)
# *Returns*:: self
def parse(str)
  # if already after the ##FASTA line, parses fasta format and return
  if @in_fasta then
    parse_fasta(str)
    return self
  end

  if str.respond_to?(:gets) then
    # str is a IO-like object
    fst = nil
  else
    # str is a String: cut it at the first ">" or "##FASTA" line
    gff, sep, fst = str.split(/^(\>|##FASTA.*)/n, 2)
    # a ">" separator belongs to the fasta part
    fst = sep + fst if sep == '>' and fst
    # String#split returns [] for an empty string, leaving gff nil
    str = gff || ''
  end

  # parses GFF lines
  str.each_line do |line|
    if /^\#\#([^\s]+)/ =~ line then
      parse_metadata($1, line)
      parse_fasta(str) if @in_fasta
    elsif /^\>/ =~ line then
      @in_fasta = true
      parse_fasta(str, line)
    else
      @records << GFF3::Record.new(line)
    end
  end

  # parses fasta format when str is a String and fasta data exists
  if fst then
    @in_fasta = true
    parse_fasta(fst)
  end

  self
end
|
946
|
+
|
947
|
+
# (private) Parses fasta formatted data and appends the sequences to
# @sequences. The text is read in chunks separated by "\n>".
# When _line_ is given, it is prepended to the first chunk.
# The entry ID is the unescaped first word of the definition line.
def parse_fasta(str, line = nil)
  str.each_line("\n>") do |chunk|
    if line then
      chunk = line + chunk
      line = nil
    end
    stripped = chunk.strip
    # skip chunks without any sequence entry
    next if stripped.empty? or stripped == '>'
    entry = Bio::FastaFormat.new(chunk)
    sequence = entry.to_seq
    first_word = entry.definition.strip.split(/\s/, 2)[0].to_s
    sequence.entry_id = unescape(first_word)
    @sequences.push sequence
  end
end
private :parse_fasta
|
961
|
+
|
962
|
+
# String representation of the whole entry: the "##gff-version" line,
# metadata, sequence regions and records, followed by a "##FASTA"
# section when sequences are present.
def to_s
  version = @gff_version || VERSION.to_s
  fasta = ''
  if @sequences.size > 0 then
    fasta = "##FASTA\n" +
      @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('')
  end

  lines = [ "##gff-version #{escape(version)}\n" ]
  lines.concat @metadata.collect { |m| m.to_s }
  lines.concat @sequence_regions.collect { |m| m.to_s }
  lines.concat @records.collect { |r| r.to_s }
  lines.join('') + fasta
end
|
977
|
+
|
978
|
+
# Private methods for escaping characters.
# Internal only. Users should not use this module directly.
#
# Escaping is percent-encoding ("%XX", upper-case hex) of every byte that
# matches the "unsafe" pattern for the column type. It is implemented
# here because URI.escape and URI.unescape no longer exist in Ruby 3.0
# and later.
module Escape
  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  private

  # If str is empty, returns '.'. Otherwise, returns str.
  # The argument is converted with to_s first, so nil gives '.'.
  def column_to_s(str)
    str = str.to_s
    str.empty? ? '.' : str
  end

  # Returns the string with every "%XX" sequence replaced by the byte
  # it stands for. The result keeps the encoding of the argument
  # (UTF-8 when the argument is US-ASCII).
  def unescape(string)
    encoding = string.encoding
    encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII
    # Work on bytes so that decoded high bytes can be inserted freely.
    decoded = string.dup.force_encoding(Encoding::BINARY).
      gsub(/%([0-9a-fA-F]{2})/) { $1.hex.chr }
    decoded.force_encoding(encoding)
  end

  # Escape a column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape(string)
    percent_escape(string, UNSAFE)
  end

  # Escape seqid column according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  def escape_seqid(string)
    percent_escape(string, UNSAFE_SEQID)
  end

  # Escape attribute according to the specification at
  # http://song.sourceforge.net/gff3.shtml.
  # In addition to the normal escape rule, the following characters
  # are escaped: ",=;".
  # Returns the string corresponding to these characters escaped.
  def escape_attribute(string)
    percent_escape(string, UNSAFE_ATTRIBUTE)
  end

  # Replaces every byte of _string_ that matches _unsafe_ with "%XX".
  # Matching is done on a binary copy, so the binary (/n) patterns above
  # also work for strings holding non-ASCII characters. The result keeps
  # the encoding of the argument.
  def percent_escape(string, unsafe)
    escaped = string.dup.force_encoding(Encoding::BINARY).gsub(unsafe) do |byte|
      format('%%%02X', byte.unpack('C')[0])
    end
    escaped.force_encoding(string.encoding)
  end
end #module Escape
|
1025
|
+
|
1026
|
+
include Escape
|
1027
|
+
|
1028
|
+
# Stores meta-data "##sequence-region seqid start end".
class SequenceRegion
  include Escape

  # Creates a new SequenceRegion object.
  # _start_ and _endpos_ are converted to Integer unless nil or false.
  def initialize(seqid, start, endpos)
    @seqid = seqid
    @start = start ? start.to_i : nil
    @end   = endpos ? endpos.to_i : nil
  end

  # Parses a "##sequence-region seqid start end" line and returns a new
  # SequenceRegion object. Each word is percent-decoded.
  def self.parse(str)
    _directive, seqid, start, endpos =
      str.chomp.split(/\s+/, 4).collect { |x| percent_decode(x) }
    self.new(seqid, start, endpos)
  end

  # Replaces every "%XX" sequence by the byte it stands for.
  # Done locally because URI.unescape no longer exists in Ruby 3.0
  # and later. The result keeps the encoding of the argument
  # (UTF-8 when the argument is US-ASCII).
  def self.percent_decode(text)
    encoding = text.encoding
    encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII
    text.dup.force_encoding(Encoding::BINARY).
      gsub(/%([0-9a-fA-F]{2})/) { $1.hex.chr }.
      force_encoding(encoding)
  end
  private_class_method :percent_decode

  # sequence ID
  attr_accessor :seqid

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # String representation ("##sequence-region ..." line with newline).
  # Empty values are written as ".".
  def to_s
    i = escape_seqid(column_to_s(@seqid))
    s = escape_seqid(column_to_s(@start))
    e = escape_seqid(column_to_s(@end))
    "##sequence-region #{i} #{s} #{e}\n"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.seqid == self.seqid and
        other.start == self.start and
        other.end == self.end then
      true
    else
      false
    end
  end
end #class SequenceRegion
|
1075
|
+
|
1076
|
+
# Represents a single line of a GFF3-formatted file.
|
1077
|
+
# See Bio::GFF::GFF3 for more information.
|
1078
|
+
class Record < GFF2::Record
|
1079
|
+
|
1080
|
+
include GFF3::Escape
|
1081
|
+
|
1082
|
+
# Shortcut reader for the "ID" attribute
# (same as get_attribute('ID')).
def id
  self.get_attribute('ID')
end
|
1086
|
+
|
1087
|
+
# Shortcut writer for the "ID" attribute
# (same as set_attribute('ID', str)).
def id=(str)
  self.set_attribute('ID', str)
end
|
1091
|
+
|
1092
|
+
# aliases for Column 1 (formerly "seqname")
|
1093
|
+
alias seqid seqname
|
1094
|
+
alias seqid= seqname=
|
1095
|
+
|
1096
|
+
# aliases for Column 3 (formerly "feature").
|
1097
|
+
# In the GFF3 document http://song.sourceforge.net/gff3.shtml,
|
1098
|
+
# column3 is called "type", but we used "feature_type"
|
1099
|
+
# because "type" is already used by Ruby itself.
|
1100
|
+
alias feature_type feature
|
1101
|
+
alias feature_type= feature=
|
1102
|
+
|
1103
|
+
# aliases for Column 8
|
1104
|
+
alias phase frame
|
1105
|
+
alias phase= frame=
|
1106
|
+
|
1107
|
+
# Parses a GFF3-formatted line and returns a new
# Bio::GFF::GFF3::Record object.
def self.parse(str)
  # The instance method #parse does not return the record, so keep a
  # reference to it and return that.
  record = self.new
  record.parse(str)
  record
end
|
1112
|
+
|
1113
|
+
# Creates a Bio::GFF::GFF3::Record object.
# Usually this is invoked for you while a Bio::GFF::GFF3 object is built.
# All arguments are passed unchanged to the superclass constructor.
#
# ---
# *Arguments* (one-argument form):
# * _str_: a tab-delimited line in GFF3 format
# *Arguments* (positional form):
# * _seqid_: sequence ID (String or nil)
# * _source_: source (String or nil)
# * _feature_type_: type of feature (String)
# * _start_position_: start (Integer)
# * _end_position_: end (Integer)
# * _score_: score (Float or nil)
# * _strand_: strand (String or nil)
# * _phase_: phase (Integer or nil)
# * _attributes_: attributes (Array or nil)
def initialize(*arg)
  super
end
|
1133
|
+
|
1134
|
+
# Parses a GFF3-formatted line and stores data from the string.
# The work is delegated to the superclass implementation.
def parse(string)
  super(string)
end
|
1139
|
+
|
1140
|
+
# Returns the record as a GFF3 compatible string (one line, terminated
# by a newline). A comment-only record is rendered as "#comment".
def to_s
  note = false
  if @comment and !@comment.to_s.strip.empty? then
    # Line breaks inside a comment would split the record.
    note = @comment.gsub(/[\r\n]+/, ' ')
  end
  return "\##{note}\n" if self.comment_only? and note

  # Column 1 has its own escaping rule; columns 2-8 share the normal one.
  fields = [ escape_seqid(column_to_s(@seqname)) ]
  [ @source, @feature, @start, @end,
    @score, @strand, @frame ].each do |column|
    fields.push escape(column_to_s(column))
  end
  fields.push attributes_to_s(@attributes)

  line = fields.join("\t")
  line + (note ? "\t\##{note}\n" : "\n")
end
|
1161
|
+
|
1162
|
+
# Bio::GFF::GFF3::Record::Target is a class to store
# data of "Target" attribute.
class Target
  include GFF3::Escape

  # Creates a new Target object.
  # _start_ and _endpos_ are converted to Integer unless nil or false.
  def initialize(target_id, start, endpos, strand = nil)
    @target_id = target_id
    @start = start ? start.to_i : nil
    @end   = endpos ? endpos.to_i : nil
    @strand = strand
  end

  # target ID
  attr_accessor :target_id

  # start position
  attr_accessor :start

  # end position
  attr_accessor :end

  # strand (optional). Normally, "+" or "-", or nil.
  attr_accessor :strand

  # parses "target_id start end [strand]"-style string
  # (for example, "ABC789 123 456 +")
  # and creates a new Target object.
  # Each space-separated word is percent-decoded.
  def self.parse(str)
    target_id, start, endpos, strand =
      str.split(/ +/, 4).collect { |x| percent_decode(x) }
    self.new(target_id, start, endpos, strand)
  end

  # Replaces every "%XX" sequence by the byte it stands for.
  # Done locally because URI.unescape no longer exists in Ruby 3.0
  # and later. The result keeps the encoding of the argument
  # (UTF-8 when the argument is US-ASCII).
  def self.percent_decode(text)
    encoding = text.encoding
    encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII
    text.dup.force_encoding(Encoding::BINARY).
      gsub(/%([0-9a-fA-F]{2})/) { $1.hex.chr }.
      force_encoding(encoding)
  end
  private_class_method :percent_decode

  # Returns "target_id start end" with the strand appended when present.
  # Empty ID, start or end are written as ".".
  def to_s
    i = escape_seqid(column_to_s(@target_id))
    s = escape_attribute(column_to_s(@start))
    e = escape_attribute(column_to_s(@end))
    strnd = escape_attribute(@strand.to_s)
    strnd = " " + strnd unless strnd.empty?
    "#{i} #{s} #{e}#{strnd}"
  end

  # Returns true if self == other. Otherwise, returns false.
  def ==(other)
    if other.class == self.class and
        other.target_id == self.target_id and
        other.start == self.start and
        other.end == self.end and
        other.strand == self.strand then
      true
    else
      false
    end
  end
end #class Target
|
1220
|
+
|
1221
|
+
# Bio::GFF::GFF3::Record::Gap is a class to store
|
1222
|
+
# data of "Gap" attribute.
|
1223
|
+
class Gap
|
1224
|
+
|
1225
|
+
# Code pairs a single-letter operation code with its length.
Code = Struct.new(:code, :length)

# Code pairs a single-letter operation code with its length.
class Code
  # 1-letter code (Symbol). One of :M, :I, :D, :F, or :R is expected.
  attr_reader :code if false #dummy for RDoc

  # length (Integer)
  attr_reader :length if false #dummy for RDoc

  # Code immediately followed by length, for example "M3".
  def to_s
    format('%s%s', code, length)
  end
end #class code
|
1240
|
+
|
1241
|
+
# Creates a new Gap object.
# The string is split on spaces; each token made of one capital letter
# followed by digits (for example "M8") becomes a Code object. Other
# tokens are skipped, with a warning in verbose mode.
#
# ---
# *Arguments*:
# * _str_: a formatted string, or nil.
def initialize(str = nil)
  @data = []
  return unless str
  str.split(/ +/).each do |token|
    if /\A([A-Z])([0-9]+)\z/ =~ token.strip then
      @data.push Code.new($1.intern, $2.to_i)
    else
      # show the offending token itself, quoted
      warn "ignored unknown token: #{token.inspect}" if $VERBOSE
    end
  end
end
|
1261
|
+
|
1262
|
+
# Same as new(str): builds a Gap object from a formatted string.
def self.parse(str)
  new(str)
end
|
1266
|
+
|
1267
|
+
# (private method)
# Splits a sequence string into runs and returns an array of Code
# objects: runs of characters matching _gap_regexp_ get _code_i_, the
# stretches in between get _code_m_.
def __scan_gap(str, gap_regexp = /[^a-zA-Z]/,
               code_i = :I, code_m = :M)
  scanner = StringScanner.new(str)
  codes = []
  while (advanced = scanner.skip_until(gap_regexp))
    # everything skipped before the matched gap is a non-gap run
    leading = advanced - scanner.matched_size
    codes.push Code.new(code_m, leading) if leading > 0
    gap = Code.new(code_i, scanner.matched_size)
    # extend the gap run while further gaps follow directly
    while (more = scanner.skip(gap_regexp))
      gap.length += more
    end
    codes.push gap
  end
  # whatever is left after the last gap is a non-gap run
  codes.push Code.new(code_m, scanner.rest_size) if scanner.rest_size > 0
  codes
end
private :__scan_gap
|
1289
|
+
|
1290
|
+
# (private method)
# Builds the gap data from an aligned reference/target sequence pair
# and stores it in @data, replacing what was there. Returns true.
# Gap runs in the reference become :I, gap runs in the target become :D;
# positions where both are gaps are dropped.
def __initialize_from_sequences_na(reference, target,
                                   gap_regexp = /[^a-zA-Z]/)

  ref_codes = __scan_gap(reference, gap_regexp, :I, :M)
  tgt_codes = __scan_gap(target,    gap_regexp, :D, :M)
  merged = []

  until ref_codes.empty? or tgt_codes.empty?
    ref = ref_codes.shift
    tgt = tgt_codes.shift
    # Cut the longer run down to the shorter one and put the remainder
    # back at the front of its queue.
    surplus = ref.length - tgt.length
    if surplus > 0 then
      ref_codes.unshift Code.new(ref.code, surplus)
      ref.length = tgt.length
    elsif surplus < 0 then
      tgt_codes.unshift Code.new(tgt.code, -surplus)
      tgt.length = ref.length
    end

    if ref.code == :M or ref.code == :I then
      case tgt.code
      when :M
        merged.push ref
      when :D
        # With a gap in the reference as well, the site is ignored.
        merged.push tgt if ref.code == :M
      else
        raise 'Bug: should not reach here.'
      end
    end
  end

  # Non-gap runs left over in the reference count as deletions.
  ref_rest = ref_codes.inject(0) do |sum, c|
    c.code == :M ? sum + c.length : sum
  end
  merged.push Code.new(:D, ref_rest) if ref_rest > 0

  # Non-gap runs left over in the target count as insertions.
  tgt_rest = tgt_codes.inject(0) do |sum, c|
    c.code == :M ? sum + c.length : sum
  end
  merged.push Code.new(:I, tgt_rest) if tgt_rest > 0

  @data = merged
  true
end
private :__initialize_from_sequences_na
|
1351
|
+
|
1352
|
+
# Creates a new Gap object from the given sequence alignment.
#
# Sites where both reference and target are gaps are silently removed.
#
# ---
# *Arguments*:
# * _reference_: reference sequence (nucleotide sequence)
# * _target_: target sequence (nucleotide sequence)
# * <I>gap_regexp</I>: regexp to identify gap
def self.new_from_sequences_na(reference, target,
                               gap_regexp = /[^a-zA-Z]/)
  gap = new
  # the initializer is private, hence the explicit dispatch
  gap.__send__(:__initialize_from_sequences_na,
               reference, target, gap_regexp)
  gap
end
|
1371
|
+
|
1372
|
+
# (private method)
# Reads one codon (three characters, adjusted by frameshift marks) from
# the reference scanner.
# Space characters are skipped; a forward frameshift mark raises the
# number of characters to read by one, a reverse mark lowers it by one.
# Returns [ gap_count, fs_count ]: the number of gap characters read
# (plus the characters missing when the input ends early) and the net
# frameshift.
def __scan_codon(sc_ref,
                 gap_regexp, space_regexp,
                 forward_frameshift_regexp,
                 reverse_frameshift_regexp)
  taken = 0
  gap_count = 0
  fs_count = 0

  while taken < 3 + fs_count
    char = sc_ref.scan(/./mn)
    break unless char
    if space_regexp === char then
      # ignored
    elsif forward_frameshift_regexp === char then
      fs_count += 1
    elsif reverse_frameshift_regexp === char then
      fs_count -= 1
    elsif gap_regexp === char then
      taken += 1
      gap_count += 1
    else
      taken += 1
    end
  end

  # characters missing at the end of input count as gaps
  missing = (3 + fs_count) - taken
  gap_count += missing if missing > 0
  [ gap_count, fs_count ]
end
private :__scan_codon
|
1405
|
+
|
1406
|
+
# (private method)
|
1407
|
+
# internal use only
|
1408
|
+
# Records len units of the given code, merging with cur when possible.
#
# If cur is non-nil and already carries the same code, its length is
# extended in place and nothing is added to data.  Otherwise a new Code
# is created and pushed onto data.  Returns the Code that was extended
# or created, so the caller can pass it back as cur on the next call.
def __push_code_to_data(cur, data, code, len)
  if cur and cur.code == code then
    cur.length += len
    return cur
  end
  fresh = Code.new(code, len)
  data.push fresh
  return fresh
end
|
1417
|
+
private :__push_code_to_data
|
1418
|
+
|
1419
|
+
# (private method)
|
1420
|
+
# Parses given reference(nuc)-target(amino) sequence alignment and
|
1421
|
+
# initializes self. Existing data will be erased.
|
1422
|
+
def __initialize_from_sequences_na_aa(reference, target,
                                      gap_regexp = /[^a-zA-Z]/,
                                      space_regexp = /\s/,
                                      forward_frameshift_regexp =
                                      /\>/,
                                      reverse_frameshift_regexp =
                                      /\</)

  data = []
  cur = nil   # Code most recently extended or pushed
  sc_ref = StringScanner.new(reference)
  sc_tgt = StringScanner.new(target)

  re_one = /./mn

  # The target is walked one token at a time; for each gap or residue
  # one codon is read from the reference.
  until sc_tgt.eos?
    if sc_tgt.skip(space_regexp) then
      # ignored
    elsif len = sc_tgt.skip(forward_frameshift_regexp) then
      cur = __push_code_to_data(cur, data, :F, len)
      # advance the reference by the same number of characters
      len.times { sc_ref.scan(re_one) }

    elsif len = sc_tgt.skip(reverse_frameshift_regexp) then
      cur = __push_code_to_data(cur, data, :R, len)
      # move the reference position backwards, clamped at the beginning
      pos = sc_ref.pos - len
      if pos < 0 then
        warn "Incorrect reverse frameshift" if $VERBOSE
        pos = 0
      end
      sc_ref.pos = pos

    elsif len = sc_tgt.skip(gap_regexp) then
      len.times do
        ref_gaps, ref_fs = __scan_codon(sc_ref,
                                        gap_regexp,
                                        space_regexp,
                                        forward_frameshift_regexp,
                                        reverse_frameshift_regexp)
        case ref_gaps
        when 3
          # both ref and tgt are gap. ignored the site
        when 2, 1
          # forward frameshift inserted
          ref_fs += (3 - ref_gaps)
        when 0
          cur = __push_code_to_data(cur, data, :D, 1)
        else
          raise 'Bug: should not reach here'
        end
        if ref_fs < 0 then
          cur = __push_code_to_data(cur, data, :R, -ref_fs)
        elsif ref_fs > 0 then
          cur = __push_code_to_data(cur, data, :F, ref_fs)
        end
      end #len.times

    elsif sc_tgt.skip(re_one) then
      # always 1-letter
      ref_gaps, ref_fs = __scan_codon(sc_ref,
                                      gap_regexp,
                                      space_regexp,
                                      forward_frameshift_regexp,
                                      reverse_frameshift_regexp)
      case ref_gaps
      when 3
        cur = __push_code_to_data(cur, data, :I, 1)
      when 2, 1, 0
        # reverse frameshift inserted when gaps exist
        ref_fs -= ref_gaps
        # normal site
        cur = __push_code_to_data(cur, data, :M, 1)
      else
        raise 'Bug: should not reach here'
      end
      if ref_fs < 0 then
        cur = __push_code_to_data(cur, data, :R, -ref_fs)
      elsif ref_fs > 0 then
        cur = __push_code_to_data(cur, data, :F, ref_fs)
      end

    else
      raise 'Bug: should not reach here'
    end
  end #until

  # Reference characters left after the target is exhausted: whole
  # codons become deletions, the remainder a forward frameshift.
  if sc_ref.rest_size > 0 then
    rest = sc_ref.scan(/.*/mn)
    rest.gsub!(space_regexp, '')
    rest.gsub!(forward_frameshift_regexp, '')
    rest.gsub!(reverse_frameshift_regexp, '')
    rest.gsub!(gap_regexp, '')
    len = rest.length.div(3)
    cur = __push_code_to_data(cur, data, :D, len) if len > 0
    len = rest.length % 3
    cur = __push_code_to_data(cur, data, :F, len) if len > 0
  end

  @data = data
  self
end
|
1520
|
+
private :__initialize_from_sequences_na_aa
|
1521
|
+
|
1522
|
+
# Creates a new Gap object from given sequence alignment.
|
1523
|
+
#
|
1524
|
+
# Note that sites of which both reference and target are gaps
|
1525
|
+
# are silently removed.
|
1526
|
+
#
|
1527
|
+
# For incorrect alignments that break 3:1 rule,
|
1528
|
+
# gap positions will be moved inside codons,
|
1529
|
+
# unwanted gaps will be removed, and
|
1530
|
+
# some forward or reverse frameshift will be inserted.
|
1531
|
+
#
|
1532
|
+
# For example,
|
1533
|
+
# atgg-taagac-att
|
1534
|
+
# M V K - I
|
1535
|
+
# is treated as:
|
1536
|
+
# atggt<aagacatt
|
1537
|
+
# M V K >>I
|
1538
|
+
#
|
1539
|
+
# Incorrect combination of frameshift with frameshift or gap
|
1540
|
+
# may cause undefined behavior.
|
1541
|
+
#
|
1542
|
+
# Forward frameshifts are recommended to be indicated in the
|
1543
|
+
# target sequence.
|
1544
|
+
# Reverse frameshifts can be indicated in the reference sequence
|
1545
|
+
# or the target sequence.
|
1546
|
+
#
|
1547
|
+
# Priority of regular expressions:
|
1548
|
+
# space > forward/reverse frameshift > gap
|
1549
|
+
#
|
1550
|
+
# ---
|
1551
|
+
# *Arguments*:
|
1552
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
1553
|
+
# * _target_: target sequence (amino acid sequence)
|
1554
|
+
# * <I>gap_regexp</I>: regexp to identify gap
|
1555
|
+
# * <I>space_regexp</I>: regexp to identify space character which is completely ignored
|
1556
|
+
# * <I>forward_frameshift_regexp</I>: regexp to identify forward frameshift
|
1557
|
+
# * <I>reverse_frameshift_regexp</I>: regexp to identify reverse frameshift
|
1558
|
+
def self.new_from_sequences_na_aa(reference, target,
                                  gap_regexp = /[^a-zA-Z]/,
                                  space_regexp = /\s/,
                                  forward_frameshift_regexp = /\>/,
                                  reverse_frameshift_regexp = /\</)
  gap = new
  # The parser is a private instance method, so it is invoked with send
  # on the freshly created object.
  gap.send(:__initialize_from_sequences_na_aa, reference, target,
           gap_regexp,
           space_regexp,
           forward_frameshift_regexp,
           reverse_frameshift_regexp)
  gap
end
|
1573
|
+
|
1574
|
+
# string representation
|
1575
|
+
def to_s
  # each stored element is rendered with its own to_s, separated by spaces
  @data.map { |code| code.to_s }.join(" ")
end
|
1578
|
+
|
1579
|
+
# Internal data. Users must not use it.
|
1580
|
+
attr_reader :data
|
1581
|
+
# @data can be read by other Gap instances
|
1582
|
+
protected :data
|
1583
|
+
|
1584
|
+
# If self == other, returns true.
|
1585
|
+
# otherwise, returns false.
|
1586
|
+
def ==(other)
  # objects of a different class are never equal
  return false unless other.class == self.class
  @data == other.data ? true : false
end
|
1594
|
+
|
1595
|
+
# duplicates sequences
|
1596
|
+
def dup_seqs(*arg)
  arg.map do |item|
    # use item.seq when available; fall back to the object itself
    # when the call raises NoMethodError
    str = begin
            item.seq
          rescue NoMethodError
            item
          end
    str.dup
  end
end
|
1605
|
+
private :dup_seqs
|
1606
|
+
|
1607
|
+
# (private method)
|
1608
|
+
# inserts gaps according to the gap rule stored in this object
|
1609
|
+
# Walks @data and inserts gap / frameshift strings into the sequences.
#
# s_ref and s_tgt are modified in place and also returned as
# [ s_ref, s_tgt ].  ref_increment and tgt_increment are the number of
# characters one unit of :M, :I or :D occupies in each string.
# Raises RuntimeError when a sequence is too short for the stored codes.
def __process_sequences(s_ref, s_tgt,
                        ref_gap, tgt_gap,
                        ref_increment, tgt_increment,
                        forward_frameshift,
                        reverse_frameshift)
  p_ref = 0
  p_tgt = 0
  @data.each do |c|
    case c.code
    when :M # match
      p_ref += c.length * ref_increment
      p_tgt += c.length * tgt_increment
    when :I # insert a gap into the reference sequence
      begin
        s_ref[p_ref, 0] = ref_gap * c.length
      rescue IndexError
        raise 'reference sequence too short'
      end
      p_ref += c.length * ref_increment
      p_tgt += c.length * tgt_increment
    when :D # insert a gap into the target (delete from reference)
      begin
        s_tgt[p_tgt, 0] = tgt_gap * c.length
      rescue IndexError
        raise 'target sequence too short'
      end
      p_ref += c.length * ref_increment
      p_tgt += c.length * tgt_increment
    when :F # frameshift forward in the reference sequence
      begin
        s_tgt[p_tgt, 0] = forward_frameshift * c.length
      rescue IndexError
        raise 'target sequence too short'
      end
      # both positions advance by c.length characters, not by the
      # per-unit increments
      p_ref += c.length
      p_tgt += c.length
    when :R # frameshift reverse in the reference sequence
      p_rev_frm = p_ref - c.length
      if p_rev_frm < 0 then
        raise 'too short reference sequence, or too many reverse frameshifts'
      end
      # marks are inserted c.length characters before the current
      # reference position; p_ref and p_tgt are left unchanged
      begin
        s_ref[p_rev_frm, 0] = reverse_frameshift * c.length
      rescue IndexError
        raise 'reference sequence too short'
      end

    else
      warn "ignored #{c.to_s.inspect}" if $VERBOSE
    end
  end

  if s_ref.length < p_ref then
    raise 'reference sequence too short'
  end
  if s_tgt.length < p_tgt then
    raise 'target sequence too short'
  end
  return s_ref, s_tgt
end
|
1672
|
+
private :__process_sequences
|
1673
|
+
|
1674
|
+
# Processes nucleotide sequences and
|
1675
|
+
# returns gapped sequences as an array of sequences.
|
1676
|
+
#
|
1677
|
+
# Note for forward/reverse frameshift:
|
1678
|
+
# Forward/Reverse_frameshift is simply treated as
|
1679
|
+
# gap insertion to the target/reference sequence.
|
1680
|
+
#
|
1681
|
+
# ---
|
1682
|
+
# *Arguments*:
|
1683
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
1684
|
+
# * _target_: target sequence (nucleotide sequence)
|
1685
|
+
# * <I>gap_char</I>: gap character
|
1686
|
+
def process_sequences_na(reference, target, gap_char = '-')
  s_ref, s_tgt = dup_seqs(reference, target)

  # One character per site in both sequences; frameshifts are written
  # with the ordinary gap character.
  s_ref, s_tgt = __process_sequences(s_ref, s_tgt,
                                     gap_char, gap_char,
                                     1, 1,
                                     gap_char, gap_char)

  if $VERBOSE and s_ref.length != s_tgt.length then
    warn "returned sequences not equal length"
  end
  return s_ref, s_tgt
end
|
1699
|
+
|
1700
|
+
# Processes sequences and
|
1701
|
+
# returns gapped sequences as an array of sequences.
|
1702
|
+
# reference must be a nucleotide sequence, and
|
1703
|
+
# target must be an amino acid sequence.
|
1704
|
+
#
|
1705
|
+
# Note for reverse frameshift:
|
1706
|
+
# Reverse_frameshift characters are inserted in the
|
1707
|
+
# reference sequence.
|
1708
|
+
# For example, alignment of "Gap=M3 R1 M2" is:
|
1709
|
+
# atgaagat<aatgtc
|
1710
|
+
# M K I N V
|
1711
|
+
# Alignment of "Gap=M3 R3 M3" is:
|
1712
|
+
# atgaag<<<attaatgtc
|
1713
|
+
# M K I I N V
|
1714
|
+
#
|
1715
|
+
# ---
|
1716
|
+
# *Arguments*:
|
1717
|
+
# * _reference_: reference sequence (nucleotide sequence)
|
1718
|
+
# * _target_: target sequence (amino acid sequence)
|
1719
|
+
# * <I>gap_char</I>: gap character
|
1720
|
+
# * <I>space_char</I>: space character inserted to amino sequence for matching na-aa alignment
|
1721
|
+
# * <I>forward_frameshift</I>: forward frameshift character
|
1722
|
+
# * <I>reverse_frameshift</I>: reverse frameshift character
|
1723
|
+
def process_sequences_na_aa(reference, target,
                            gap_char = '-',
                            space_char = ' ',
                            forward_frameshift = '>',
                            reverse_frameshift = '<')
  s_ref, s_tgt = dup_seqs(reference, target)
  # pad every target character with two space_chars, so that each amino
  # acid takes the same width as a codon when space_char is one character
  s_tgt = s_tgt.gsub(/./, "\\0#{space_char}#{space_char}")
  ref_increment = 3
  tgt_increment = 1 + space_char.length * 2
  ref_gap = gap_char * 3
  tgt_gap = "#{gap_char}#{space_char}#{space_char}"
  return __process_sequences(s_ref, s_tgt,
                             ref_gap, tgt_gap,
                             ref_increment, tgt_increment,
                             forward_frameshift,
                             reverse_frameshift)
end
|
1740
|
+
end #class Gap
|
1741
|
+
|
1742
|
+
private
|
1743
|
+
# Parses the attributes column into an array of [ key, value ] pairs.
#
# Pairs are separated by ';', key and value by the first '=', and
# multiple values by ','.  Values of 'Target' and 'Gap' are parsed with
# Target.parse and Gap.parse; all other values are unescaped.
# Returns an empty array when string is nil/false or '.'.
def parse_attributes(string)
  return [] if !string or string == '.'
  attr_pairs = []
  string.split(';').each do |pair|
    key, value = pair.split('=', 2)
    key = unescape(key)
    # a pair without '=' has no value and contributes nothing
    values = value.to_s.split(',')
    case key
    when 'Target'
      values.collect! { |v| Target.parse(v) }
    when 'Gap'
      values.collect! { |v| Gap.parse(v) }
    else
      values.collect! { |v| unescape(v) }
    end
    # one [ key, value ] pair per value, in order of appearance
    values.each { |v| attr_pairs.push [ key, v ] }
  end
  return attr_pairs
end # method parse_attributes
|
1762
|
+
|
1763
|
+
# Return the attributes as a string as it appears at the end of
|
1764
|
+
# a GFF3 line
|
1765
|
+
# Builds the attribute string from an array of [ key, value ] pairs.
#
# Values sharing a key are joined with ',' under a single key; keys keep
# the order of their first appearance.  Target values are written with
# their own to_s, all other values and every key go through
# escape_attribute.  Returns '.' when attr is nil/false or empty.
def attributes_to_s(attr)
  return '.' if !attr or attr.empty?
  keys = []     # keys in order of first appearance
  hash = {}     # key => array of values
  attr.each do |pair|
    key = pair[0]
    val = pair[1]
    keys.push key unless hash[key]
    hash[key] ||= []
    hash[key].push val
  end
  fields = keys.collect do |key|
    vals = hash[key].collect do |v|
      if v.kind_of?(Target) then
        v.to_s
      else
        escape_attribute(v.to_s)
      end
    end
    "#{escape_attribute(key)}=#{vals.join(',')}"
  end
  fields.join(';')
end
|
1788
|
+
|
1789
|
+
end # class GFF3::Record
|
1790
|
+
|
1791
|
+
# This is a dummy record corresponding to the "###" metadata.
|
1792
|
+
class RecordBoundary < GFF3::Record
  # Initializes through the superclass with the given arguments and
  # then freezes the object.
  def initialize(*arg)
    super(*arg)
    self.freeze
  end

  # Always returns the "###" line, terminated by a newline.
  def to_s
    "###\n"
  end
end #class RecordBoundary
|
1802
|
+
|
1803
|
+
# stores GFF3 MetaData
|
1804
|
+
MetaData = GFF2::MetaData
|
1805
|
+
|
1806
|
+
# parses metadata
|
1807
|
+
# Handles one metadata line according to its directive name.
#
# * 'gff-version': stores the second whitespace-separated field of line
#   in @gff_version, unless a version is already set
# * 'FASTA': sets @in_fasta to true
# * 'sequence-region': appends SequenceRegion.parse(line) to
#   @sequence_regions
# * '#': appends a new RecordBoundary to @records
# * anything else: appends MetaData.parse(line) to @metadata
#
# Always returns true.
def parse_metadata(directive, line)
  case directive
  when 'gff-version'
    @gff_version ||= line.split(/\s+/)[1]
  when 'FASTA'
    @in_fasta = true
  when 'sequence-region'
    @sequence_regions.push SequenceRegion.parse(line)
  when '#' # "###" directive
    @records.push RecordBoundary.new
  else
    @metadata.push MetaData.parse(line)
  end
  true
end
|
1822
|
+
private :parse_metadata
|
1823
|
+
|
1824
|
+
end #class GFF3
|
1825
|
+
|
1826
|
+
end # class GFF
|
155
1827
|
|
156
1828
|
end # module Bio
|
157
1829
|
|