bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,1906 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/alignment.rb - multiple alignment of sequences
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2003, 2005
|
|
5
|
+
# GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
|
|
6
|
+
#
|
|
7
|
+
# License:: LGPL
|
|
8
|
+
#
|
|
9
|
+
# $Id: alignment.rb,v 1.14 2005/12/02 12:01:28 ngoto Exp $
|
|
10
|
+
#
|
|
11
|
+
#--
|
|
12
|
+
# This library is free software; you can redistribute it and/or
|
|
13
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
14
|
+
# License as published by the Free Software Foundation; either
|
|
15
|
+
# version 2 of the License, or (at your option) any later version.
|
|
16
|
+
#
|
|
17
|
+
# This library is distributed in the hope that it will be useful,
|
|
18
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
20
|
+
# Lesser General Public License for more details.
|
|
21
|
+
#
|
|
22
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
23
|
+
# License along with this library; if not, write to the Free Software
|
|
24
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
25
|
+
#++
|
|
26
|
+
#
|
|
27
|
+
# = About Bio::Alignment
|
|
28
|
+
#
|
|
29
|
+
# Please refer to the documentation of the Bio::Alignment module.
|
|
30
|
+
#
|
|
31
|
+
# = References
|
|
32
|
+
#
|
|
33
|
+
# * Bio::Align::AlignI class of the BioPerl.
|
|
34
|
+
# http://doc.bioperl.org/releases/bioperl-1.4/Bio/Align/AlignI.html
|
|
35
|
+
#
|
|
36
|
+
# * Bio::SimpleAlign class of the BioPerl.
|
|
37
|
+
# http://doc.bioperl.org/releases/bioperl-1.4/Bio/SimpleAlign.html
|
|
38
|
+
#
|
|
39
|
+
|
|
40
|
+
require 'bio/sequence'
|
|
41
|
+
|
|
42
|
+
module Bio
|
|
43
|
+
|
|
44
|
+
=begin rdoc
|
|
45
|
+
|
|
46
|
+
= About Bio::Alignment
|
|
47
|
+
|
|
48
|
+
Bio::Alignment is a namespace of classes/modules for multiple sequence
|
|
49
|
+
alignment.
|
|
50
|
+
|
|
51
|
+
= Multiple alignment container classes
|
|
52
|
+
|
|
53
|
+
== Bio::Alignment::OriginalAlignment
|
|
54
|
+
|
|
55
|
+
== Bio::Alignment::SequenceArray
|
|
56
|
+
|
|
57
|
+
== Bio::Alignment::SequenceHash
|
|
58
|
+
|
|
59
|
+
= Bio::Alignment::Site
|
|
60
|
+
|
|
61
|
+
= Modules
|
|
62
|
+
|
|
63
|
+
== Bio::Alignment::EnumerableExtension
|
|
64
|
+
|
|
65
|
+
Mix-in for classes that include Enumerable.
|
|
66
|
+
|
|
67
|
+
== Bio::Alignment::ArrayExtension
|
|
68
|
+
|
|
69
|
+
Mix-in for Array or Array-like classes.
|
|
70
|
+
|
|
71
|
+
== Bio::Alignment::HashExtension
|
|
72
|
+
|
|
73
|
+
Mix-in for Hash or Hash-like classes.
|
|
74
|
+
|
|
75
|
+
== Bio::Alignment::SiteMethods
|
|
76
|
+
|
|
77
|
+
== Bio::Alignment::PropertyMethods
|
|
78
|
+
|
|
79
|
+
= Bio::Alignment::GAP
|
|
80
|
+
|
|
81
|
+
= Compatibility from older BioRuby
|
|
82
|
+
|
|
83
|
+
=end
|
|
84
|
+
module Alignment
|
|
85
|
+
|
|
86
|
+
# Bio::Alignment::PropertyMethods is a set of methods to treat
|
|
87
|
+
# the gap character and so on.
|
|
88
|
+
module PropertyMethods
  # Default regular expression for detecting gaps
  # (any non-alphabetic character).
  GAP_REGEXP = /[^a-zA-Z]/
  # Default gap character.
  GAP_CHAR = '-'.freeze
  # Default character for a missing or unknown site.
  MISSING_CHAR = '?'.freeze

  # Writers for the per-object settings. Each reader below falls back
  # to its default while the corresponding value is unset (nil/false).
  # The value given to seqclass= must be String or one of its derivatives.
  attr_writer :gap_regexp, :gap_char, :missing_char, :seqclass

  # Returns true if the given character matches gap_regexp,
  # false otherwise.
  # <em>s</em> must be a String containing a single character.
  def is_gap?(s)
    !!(gap_regexp =~ s)
  end

  # Regular expression used for checking gaps.
  def gap_regexp
    @gap_regexp || GAP_REGEXP
  end

  # Gap character.
  def gap_char
    @gap_char || GAP_CHAR
  end

  # Character used when the site is missing or unknown.
  def missing_char
    @missing_char || MISSING_CHAR
  end

  # Class of the sequence: the value set through seqclass=,
  # or String when nothing has been set.
  def seqclass
    @seqclass || String
  end

  # Returns, as a Hash keyed by property name, every property whose
  # instance variable has been assigned in this object (even to nil).
  def get_all_property
    [:gap_regexp, :gap_char, :missing_char, :seqclass].inject({}) do |props, name|
      ivar = "@#{name}"
      props[name] = instance_variable_get(ivar) if instance_variable_defined?(ivar)
      props
    end
  end

  # Sets properties from the given hash; keys that are absent leave the
  # current setting untouched. Returns self.
  # <em>hash</em> would be a return value of <tt>get_all_property</tt>.
  def set_all_property(hash)
    [:gap_regexp, :gap_char, :missing_char, :seqclass].each do |name|
      instance_variable_set("@#{name}", hash[name]) if hash.has_key?(name)
    end
    self
  end
end #module PropertyMethods
|
|
165
|
+
|
|
166
|
+
# Bio::Alignment::SiteMethods is a set of methods for
|
|
167
|
+
# Bio::Alignment::Site.
|
|
168
|
+
# It can also be used for extending an array of single-letter strings.
|
|
169
|
+
module SiteMethods
  include PropertyMethods

  # Returns true if at least one element of the site is a gap
  # (as judged by is_gap?), false otherwise.
  def has_gap?
    any? { |x| is_gap?(x) }
  end

  # Removes gaps in the site. (destructive method)
  # Returns self when at least one gap was removed, nil otherwise.
  # Note that nil elements are dropped as well, because removal is
  # implemented by nil-ing the gaps and then calling compact!.
  def remove_gaps!
    removed = nil
    collect! do |x|
      if is_gap?(x)
        removed = self
        nil
      else
        x
      end
    end
    compact!
    removed
  end

  # Returns the consensus character of the site.
  # <em>threshold</em> is the minimum fraction of the site the most
  # frequent character must occupy.
  # If consensus is found, returns a single-letter string.
  # If not (or if the site is empty), returns nil.
  def consensus_string(threshold = 1.0)
    return nil if size <= 0
    return self[0] if sort.uniq.size == 1
    counts = Hash.new(0)
    each { |x| counts[x] += 1 }
    # Most frequent first; ties go to the character seen earliest.
    ranked = counts.to_a.sort do |x, y|
      z = (y[1] <=> x[1])
      z = (index(x[0]) <=> index(y[0])) if z == 0
      z
    end
    char, count = ranked[0]
    size * threshold <= count ? char : nil
  end

  # IUPAC nucleotide groups. Internal use only.
  # In each entry the first element is the code and the rest are the
  # characters it covers.
  IUPAC_NUC = [
    %w( t u ),
    %w( m a c ),
    %w( r a g ),
    %w( w a t u ),
    %w( s c g ),
    %w( y c t u ),
    %w( k g t u ),
    %w( v a c g m r s ),
    %w( h a c t u m w y ),
    %w( d a g t u r w k ),
    %w( b c g t u s y k ),
    %w( n a c g t u m r w s y k v h d b )
  ]

  # Returns an IUPAC consensus base for the site.
  # If consensus is found, returns a single-letter string.
  # If not (including when the site is empty), returns nil.
  def consensus_iupac
    # An empty site has no base to report. Without this guard the
    # subset test below is vacuously true and would answer 't'.
    return nil if empty?
    bases = collect { |x| x.downcase }.sort.uniq
    if bases.size == 1
      base = bases[0]
      case base
      when 'a', 'c', 'g', 't'
        base
      when 'u'
        't'
      else
        # a lone ambiguity code is its own consensus
        IUPAC_NUC.find { |grp| base == grp[0] } ? base : nil
      end
    elsif (group = IUPAC_NUC.find { |grp| (bases - grp).empty? })
      # first group that covers every distinct character in the site
      group[0]
    end
  end

  # Table of strongly conserved amino-acid groups.
  #
  # The value of the tables are taken from BioPerl
  # (Bio/SimpleAlign.pm in BioPerl 1.0),
  # and the BioPerl's document says that
  # it is taken from Clustalw documentation and
  # These are all the positively scoring groups that occur in the
  # Gonnet Pam250 matrix. The strong and weak groups are
  # defined as strong score >0.5 and weak score =<0.5 respectively.
  #
  StrongConservationGroups = %w(STA NEQK NHQK NDEQ QHRK MILV MILF
    HY FYW).collect { |x| x.split('').sort }

  # Table of weakly conserved amino-acid groups.
  #
  # Please refer to the StrongConservationGroups document
  # for the origin of the table.
  WeakConservationGroups = %w(CSA ATV SAG STNK STPA SGND SNDEQK
    NDEQHK NEQHRK FVLIM HFY).collect { |x| x.split('').sort }

  # Returns the match-line character for the site.
  # This is amino-acid version.
  #
  #   opt[:match_line_char]   ==> 100% equal    default: '*'
  #   opt[:strong_match_char] ==> strong match  default: ':'
  #   opt[:weak_match_char]   ==> weak match    default: '.'
  #   opt[:mismatch_char]     ==> mismatch      default: ' '
  #
  # A site that is empty or contains a gap yields the mismatch character.
  def match_line_amino(opt = {})
    mlc = (opt[:match_line_char]   || '*')
    smc = (opt[:strong_match_char] || ':')
    wmc = (opt[:weak_match_char]   || '.')
    mmc = (opt[:mismatch_char]     || ' ')
    residues = collect { |c| c.upcase }.sort.uniq
    residues.extend(SiteMethods)
    # The empty check comes first: an empty set is a subset of every
    # conservation group and would otherwise count as a strong match.
    if residues.empty? || residues.has_gap?
      mmc
    elsif residues.size == 1
      mlc
    elsif StrongConservationGroups.find { |grp| (residues - grp).empty? }
      smc
    elsif WeakConservationGroups.find { |grp| (residues - grp).empty? }
      wmc
    else
      mmc
    end
  end

  # Returns the match-line character for the site.
  # This is nucleic-acid version.
  #
  #   opt[:match_line_char]   ==> 100% equal    default: '*'
  #   opt[:mismatch_char]     ==> mismatch      default: ' '
  def match_line_nuc(opt = {})
    mlc = (opt[:match_line_char]   || '*')
    mmc = (opt[:mismatch_char]     || ' ')
    bases = collect { |c| c.upcase }.sort.uniq
    bases.extend(SiteMethods)
    if bases.has_gap?
      mmc
    elsif bases.size == 1
      mlc
    else
      mmc
    end
  end
end #module SiteMethods
|
|
309
|
+
|
|
310
|
+
# Bio::Alignment::Site stores bases or amino-acids in a
|
|
311
|
+
# site of the alignment.
|
|
312
|
+
# It would store multiple String objects of length 1.
|
|
313
|
+
# Please refer to the document of Array and SiteMethods for methods.
|
|
314
|
+
class Site < Array
# Array subclass; all site-specific behavior is mixed in from SiteMethods.
|
|
315
|
+
include SiteMethods
|
|
316
|
+
end # class Site
|
|
317
|
+
|
|
318
|
+
# The module Bio::Alignment::EnumerableExtension is a set of useful
|
|
319
|
+
# methods for multiple sequence alignment.
|
|
320
|
+
# It can be included by any classes or can be extended to any objects.
|
|
321
|
+
# The classes or objects must have methods defined in Enumerable,
|
|
322
|
+
# and must have the <tt>each</tt> method
|
|
323
|
+
# which iterates over each sequence (or string) and yields
|
|
324
|
+
# a sequence (or string) object.
|
|
325
|
+
#
|
|
326
|
+
# Optionally, if <tt>each_seq</tt> method is defined,
|
|
327
|
+
# which iterates over each sequence (or string) and yields
|
|
328
|
+
# each sequence (or string) object, it is used instead of <tt>each</tt>.
|
|
329
|
+
#
|
|
330
|
+
# Note that the <tt>each</tt> or <tt>each_seq</tt> method would be
|
|
331
|
+
# called multiple times.
|
|
332
|
+
# This means that the module is not suitable for IO objects.
|
|
333
|
+
# In addition, <tt>break</tt> would be used in the given block and
|
|
334
|
+
# destructive methods would be used to the sequences.
|
|
335
|
+
#
|
|
336
|
+
# For Array or Hash objects, you had better use
|
|
337
|
+
# ArrayExtension or HashExtension modules, respectively.
|
|
338
|
+
# They would have built-in <tt>each_seq</tt> method and/or
|
|
339
|
+
# some methods would be redefined.
|
|
340
|
+
#
|
|
341
|
+
module EnumerableExtension
|
|
342
|
+
include PropertyMethods
|
|
343
|
+
|
|
344
|
+
# Iterates over each sequence.
|
|
345
|
+
# Yields a sequence.
|
|
346
|
+
# It acts the same as Enumerable#each.
|
|
347
|
+
#
|
|
348
|
+
# You would redefine the method suitable for the class/object.
|
|
349
|
+
# Hands the given block straight to +each+, so by default every
# element yielded by +each+ is treated as a sequence.
def each_seq(&blk) #:yields: seq
  each(&blk)
end
|
|
352
|
+
|
|
353
|
+
# Returns class of the sequence.
|
|
354
|
+
# If instance variable @seqclass (which can be
|
|
355
|
+
# set by 'seqclass=' method) is set, simply returns the value.
|
|
356
|
+
# Otherwise, returns the first sequence's class.
|
|
357
|
+
# If no sequences are found, returns String.
|
|
358
|
+
# Class of the sequences: the value assigned through seqclass= when
# set, otherwise the class of the first sequence that is neither nil
# nor false, otherwise String.
def seqclass
  return @seqclass if @seqclass
  each_seq do |s|
    next unless s
    found = s.class
    return found if found
  end
  String
end
|
|
372
|
+
|
|
373
|
+
# Returns the alignment length.
|
|
374
|
+
# Returns the longest length of the sequence in the alignment.
|
|
375
|
+
# Length of the alignment, i.e. the length of its longest sequence
# (0 when there are no sequences).
def alignment_length
  longest = 0
  each_seq { |s| longest = [longest, s.length].max }
  longest
end
alias seq_length alignment_length
|
|
384
|
+
|
|
385
|
+
# Gets a site of the position.
|
|
386
|
+
# Returns a Bio::Alignment::Site object.
|
|
387
|
+
#
|
|
388
|
+
# If the position is out of range, it returns the site
|
|
389
|
+
# of which all are gaps.
|
|
390
|
+
#
|
|
391
|
+
# It is a private method.
|
|
392
|
+
# Only difference from public alignment_site method is
|
|
393
|
+
# it does not do <tt>set_all_property(get_all_property)</tt>.
|
|
394
|
+
# Builds the Site for one column without copying the alignment's
# properties onto it. A sequence that has no character at
# <em>position</em> contributes a gap made with seqclass.new(gap_char).
def _alignment_site(position)
  column = Site.new
  each_seq do |seq|
    residue = seq[position, 1]
    # nil or '' means the sequence does not reach this position
    residue = seqclass.new(gap_char) if residue.to_s.empty?
    column.push(residue)
  end
  column
end
private :_alignment_site
|
|
406
|
+
|
|
407
|
+
# Gets a site of the position.
|
|
408
|
+
# Returns a Bio::Alignment::Site object.
|
|
409
|
+
#
|
|
410
|
+
# If the position is out of range, it returns the site
|
|
411
|
+
# of which all are gaps.
|
|
412
|
+
# Returns the Site at <em>position</em>, carrying a copy of this
# alignment's properties (see get_all_property).
def alignment_site(position)
  _alignment_site(position).tap do |site|
    site.set_all_property(get_all_property)
  end
end
|
|
417
|
+
|
|
418
|
+
# Iterates over each site of the alignment.
|
|
419
|
+
# It yields a Bio::Alignment::Site object (which inherits Array).
|
|
420
|
+
# It returns self.
|
|
421
|
+
# Yields a Site for every column from 0 up to alignment_length - 1,
# each carrying the alignment's properties. Returns self.
def each_site
  props = get_all_property
  alignment_length.times do |pos|
    site = _alignment_site(pos)
    site.set_all_property(props)
    yield site
  end
  self
end
|
|
430
|
+
|
|
431
|
+
# Iterates over each site of the alignment, with specifying
|
|
432
|
+
# start, stop positions and step.
|
|
433
|
+
# It yields Bio::Alignment::Site object (which inherits Array).
|
|
434
|
+
# It returns self.
|
|
435
|
+
# It is same as
|
|
436
|
+
# <tt>start.step(stop, step) { |i| yield alignment_site(i) }</tt>.
|
|
437
|
+
# Yields the Site at every position produced by
# start.step(stop, step), each carrying the alignment's properties.
# Returns self.
def each_site_step(start, stop, step = 1)
  props = get_all_property
  start.step(stop, step) do |pos|
    yield _alignment_site(pos).tap { |site| site.set_all_property(props) }
  end
  self
end
|
|
446
|
+
|
|
447
|
+
# Iterates over each sequence and results running blocks
|
|
448
|
+
# are collected and returns a new alignment as a
|
|
449
|
+
# Bio::Alignment::SequenceArray object.
|
|
450
|
+
#
|
|
451
|
+
# Note that it would be redefined if you want to change
|
|
452
|
+
# return value's class.
|
|
453
|
+
#
|
|
454
|
+
# Runs the block on every sequence and gathers the results into a new
# SequenceArray that carries this alignment's properties.
def alignment_collect
  collected = SequenceArray.new
  collected.set_all_property(get_all_property)
  each_seq { |seq| collected << yield(seq) }
  collected
end
|
|
462
|
+
|
|
463
|
+
# Returns specified range of the alignment.
|
|
464
|
+
# For each sequence, the '[]' method (it may be String#[])
|
|
465
|
+
# is executed, and returns a new alignment
|
|
466
|
+
# as a Bio::Alignment::SequenceArray object.
|
|
467
|
+
#
|
|
468
|
+
# Unlike alignment_slice method, the resulting alignment is
|
|
469
|
+
# guaranteed to contain String object if the range specified
|
|
470
|
+
# is out of range.
|
|
471
|
+
#
|
|
472
|
+
# If you want to change return value's class, you should redefine
|
|
473
|
+
# alignment_collect method.
|
|
474
|
+
#
|
|
475
|
+
# Applies [] with the given arguments to every sequence and collects
# the pieces via alignment_collect. Where [] gives nil or false, an
# empty seqclass instance is used instead.
def alignment_window(*arg)
  alignment_collect { |seq| seq[*arg] || seqclass.new('') }
end
alias window alignment_window
|
|
481
|
+
|
|
482
|
+
# Iterates over each sliding window of the alignment.
|
|
483
|
+
# window_size is the size of sliding window.
|
|
484
|
+
# step is the step of each sliding.
|
|
485
|
+
# It yields a Bio::Alignment::SequenceArray object which contains
|
|
486
|
+
# each sliding window.
|
|
487
|
+
# It returns a Bio::Alignment::SequenceArray object which contains
|
|
488
|
+
# remainder alignment at the terminal end.
|
|
489
|
+
# If window_size is smaller than 0, it returns nil.
|
|
490
|
+
# Iterates over each sliding window of the alignment.
#
# window_size:: width of every window.
# step_size::   distance between window starts. A non-negative value
#               slides from the left end; a negative value slides from
#               the right end towards the left.
#
# Yields the result of alignment_window(start, window_size) for each
# window. Returns the part of the alignment not reached by the last
# window (the tail when sliding rightwards, the head when sliding
# leftwards). When no window fits at all, the whole alignment is
# returned as the remainder.
# Returns nil if window_size is smaller than 0.
def each_window(window_size, step_size = 1)
  return nil if window_size < 0
  last_fit = alignment_length - window_size
  if step_size >= 0
    # Block parameters are local to the block, so the start of the
    # last window has to be recorded explicitly.
    last_start = nil
    0.step(last_fit, step_size) do |pos|
      yield alignment_window(pos, window_size)
      last_start = pos
    end
    if last_start
      alignment_window((last_start + window_size)..-1)
    else
      alignment_window(0..-1)
    end
  else
    # Alignment shorter than one window: nothing to yield.
    return alignment_window(0..-1) if last_fit < 0
    pos = last_fit
    while pos >= 0
      yield alignment_window(pos, window_size)
      pos += step_size
    end
    # pos has overshot by one step; undo it to get the last start.
    alignment_window(0...(pos - step_size))
  end
end
|
|
507
|
+
|
|
508
|
+
# Iterates over each site of the alignment and results running the
|
|
509
|
+
# block are collected and returns an array.
|
|
510
|
+
# It yields a Bio::Alignment::Site object.
|
|
511
|
+
# Runs the block on every Site yielded by each_site and returns the
# block results as an Array, in site order.
def collect_each_site
  results = []
  each_site { |site| results.push(yield(site)) }
  results
end
|
|
518
|
+
|
|
519
|
+
# Helper method for calculating consensus sequence.
|
|
520
|
+
# It iterates over each site of the alignment.
|
|
521
|
+
# In each site, gaps will be removed if specified with opt.
|
|
522
|
+
# It yields a Bio::Alignment::Site object.
|
|
523
|
+
# Results running the block (String objects are expected)
|
|
524
|
+
# are joined to a string and it returns the string.
|
|
525
|
+
#
|
|
526
|
+
# opt[:gap_mode] ==> 0 -- gaps are regarded as normal characters
|
|
527
|
+
# 1 -- a site within gaps is regarded as a gap
|
|
528
|
+
# -1 -- gaps are eliminated from consensus calculation
|
|
529
|
+
# default: 0
|
|
530
|
+
#
|
|
531
|
+
# Builds a consensus string one site at a time. The block receives
# each Site and returns that site's character; a nil/false result is
# replaced by opt[:missing_char] (default: missing_char).
#
#   opt[:gap_mode] ==> 0 or nil -- gaps are treated as normal characters
#                      1  -- a site containing a gap becomes gap_char
#                      -1 -- gaps are removed first; a site left empty
#                            becomes gap_char
#
# Raises RuntimeError for any other :gap_mode value.
def consensus_each_site(opt = {})
  mchar = opt[:missing_char] || missing_char
  chars =
    case opt[:gap_mode]
    when 0, nil
      collect_each_site { |site| yield(site) || mchar }
    when 1
      collect_each_site do |site|
        site.has_gap? ? gap_char : (yield(site) || mchar)
      end
    when -1
      collect_each_site do |site|
        site.remove_gaps!
        site.empty? ? gap_char : (yield(site) || mchar)
      end
    else
      raise ':gap_mode must be 0, 1 or -1'
    end
  chars.join('')
end
|
|
552
|
+
|
|
553
|
+
# Returns the consensus string of the alignment.
|
|
554
|
+
# 0.0 <= threshold <= 1.0 is expected.
|
|
555
|
+
#
|
|
556
|
+
# It resembles the BioPerl's AlignI::consensus_string method.
|
|
557
|
+
#
|
|
558
|
+
# Please refer to the consensus_each_site method for opt.
|
|
559
|
+
#
|
|
560
|
+
# Consensus string of the alignment: each site's own
# consensus_string(threshold), assembled by consensus_each_site(opt).
def consensus_string(threshold = 1.0, opt = {})
  consensus_each_site(opt) { |site| site.consensus_string(threshold) }
end
|
|
565
|
+
|
|
566
|
+
# Returns the IUPAC consensus string of the alignment
|
|
567
|
+
# of nucleic-acid sequences.
|
|
568
|
+
#
|
|
569
|
+
# It resembles the BioPerl's AlignI::consensus_iupac method.
|
|
570
|
+
#
|
|
571
|
+
# Please refer to the consensus_each_site method for opt.
|
|
572
|
+
#
|
|
573
|
+
# IUPAC consensus string of the alignment: each site's own
# consensus_iupac, assembled by consensus_each_site(opt).
def consensus_iupac(opt = {})
  consensus_each_site(opt) { |site| site.consensus_iupac }
end
|
|
578
|
+
|
|
579
|
+
# Returns the match line string of the alignment
|
|
580
|
+
# of amino-acid sequences.
|
|
581
|
+
#
|
|
582
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
|
583
|
+
#
|
|
584
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
|
585
|
+
# opt[:strong_match_char] ==> strong match default: ':'
|
|
586
|
+
# opt[:weak_match_char] ==> weak match default: '.'
|
|
587
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
|
588
|
+
#
|
|
589
|
+
# More opt can be accepted.
|
|
590
|
+
# Please refer to the consensus_each_site method for opt.
|
|
591
|
+
#
|
|
592
|
+
# Amino-acid match line: the match_line_amino(opt) character of every
# site, joined into one string.
def match_line_amino(opt = {})
  marks = collect_each_site { |site| site.match_line_amino(opt) }
  marks.join('')
end
|
|
597
|
+
|
|
598
|
+
# Returns the match line string of the alignment
|
|
599
|
+
# of nucleic-acid sequences.
|
|
600
|
+
#
|
|
601
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
|
602
|
+
#
|
|
603
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
|
604
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
|
605
|
+
#
|
|
606
|
+
# More opt can be accepted.
|
|
607
|
+
# Please refer to the consensus_each_site method for opt.
|
|
608
|
+
#
|
|
609
|
+
# Nucleic-acid match line: the match_line_nuc(opt) character of every
# site, joined into one string.
def match_line_nuc(opt = {})
  marks = collect_each_site { |site| site.match_line_nuc(opt) }
  marks.join('')
end
|
|
614
|
+
|
|
615
|
+
# Returns the match line string of the alignment
|
|
616
|
+
# of nucleic- or amino-acid sequences.
|
|
617
|
+
# The type of the sequence is automatically determined
|
|
618
|
+
# or you can specify with opt[:type].
|
|
619
|
+
#
|
|
620
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
|
621
|
+
#
|
|
622
|
+
# opt[:type] ==> :na or :aa (or determined by sequence class)
|
|
623
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
|
624
|
+
# opt[:strong_match_char] ==> strong match default: ':'
|
|
625
|
+
# opt[:weak_match_char] ==> weak match default: '.'
|
|
626
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
|
627
|
+
# :strong_ and :weak_match_char are used only in amino mode (:aa)
|
|
628
|
+
#
|
|
629
|
+
# More opt can be accepted.
|
|
630
|
+
# Please refer to the consensus_each_site method for opt.
|
|
631
|
+
#
|
|
632
|
+
def match_line(opt = {})
  amino =
    case opt[:type]
    when :aa
      true
    when :na, :dna, :rna
      false
    else
      # No explicit type given: decide from the sequence class first,
      # then fall back to looking for letters E, F, I, L, P or Q.
      if seqclass == Bio::Sequence::AA
        true
      elsif seqclass == Bio::Sequence::NA
        false
      elsif self.find { |x| /[EFILPQ]/i =~ x }
        true
      end
    end
  # An undetermined type (nil) is handled as nucleic acid.
  amino ? match_line_amino(opt) : match_line_nuc(opt)
end
|
|
655
|
+
|
|
656
|
+
# This is the BioPerl's AlignI::match like method.
|
|
657
|
+
#
|
|
658
|
+
# Changes second to last sequences' sites to match_char(default: '.')
|
|
659
|
+
# when a site is equal to the first sequence's corresponding site.
|
|
660
|
+
#
|
|
661
|
+
# Note that it is a destructive method.
|
|
662
|
+
#
|
|
663
|
+
# For Hash, please use it carefully because
|
|
664
|
+
# the order of the sequences is inconstant.
|
|
665
|
+
#
|
|
666
|
+
def convert_match(match_char = '.')
  #(BioPerl) AlignI::match like method
  width = alignment_length
  reference = nil
  each_seq do |seq|
    if reference
      (0...width).each do |pos|
        # Skip positions past the end of this sequence, positions that
        # differ from the reference, and reference gaps.
        next unless seq[pos]
        next unless reference[pos] == seq[pos]
        next if is_gap?(reference[pos..pos])
        seq[pos..pos] = match_char
      end
    else
      # The first (truthy) sequence becomes the reference and is left untouched.
      reference = seq
    end
  end
  self
end
|
|
683
|
+
|
|
684
|
+
# This is the BioPerl's AlignI::unmatch like method.
|
|
685
|
+
#
|
|
686
|
+
# Changes second to last sequences' sites match_char(default: '.')
|
|
687
|
+
# to original sites' characters.
|
|
688
|
+
#
|
|
689
|
+
# Note that it is a destructive method.
|
|
690
|
+
#
|
|
691
|
+
# For Hash, please use it carefully because
|
|
692
|
+
# the order of the sequences is inconstant.
|
|
693
|
+
#
|
|
694
|
+
def convert_unmatch(match_char = '.')
  #(BioPerl) AlignI::unmatch like method
  width = alignment_length
  reference = nil
  each_seq do |seq|
    if reference
      (0...width).each do |pos|
        next unless seq[pos..pos] == match_char
        # Restore the reference character; keep match_char when the
        # reference has nothing at this position.
        seq[pos..pos] = (reference[pos..pos] or match_char)
      end
    else
      # The first (truthy) sequence becomes the reference and is left untouched.
      reference = seq
    end
  end
  self
end
|
|
711
|
+
|
|
712
|
+
# Fills gaps to the tail of each sequence if the length of
|
|
713
|
+
# the sequence is shorter than the alignment length.
|
|
714
|
+
#
|
|
715
|
+
# Note that it is a destructive method.
|
|
716
|
+
def alignment_normalize!
  #(original)
  target = alignment_length
  each_seq do |seq|
    shortfall = target - seq.length
    # Pad only sequences that are shorter than the alignment.
    seq << (gap_char * shortfall) if shortfall > 0
  end
  self
end
|
|
724
|
+
alias normalize! alignment_normalize!
|
|
725
|
+
|
|
726
|
+
# Removes excess gaps in the tail of the sequences.
|
|
727
|
+
# If removes nothing, returns nil.
|
|
728
|
+
# Otherwise, returns self.
|
|
729
|
+
#
|
|
730
|
+
# Note that it is a destructive method.
|
|
731
|
+
def alignment_rstrip!
  #(String-like)
  len = alignment_length
  trailing = 0
  # Walk the columns from the last one backwards and count how many
  # consist only of gaps; stop at the first column with a residue.
  each_site_step(len - 1, 0, -1) do |site|
    site.remove_gaps!
    break unless site.empty?
    trailing += 1
  end
  return nil if trailing <= 0
  newlen = len - trailing
  each_seq do |seq|
    seq[newlen..-1] = '' if seq.length > newlen
  end
  self
end
|
|
749
|
+
alias rstrip! alignment_rstrip!
|
|
750
|
+
|
|
751
|
+
# Removes excess gaps in the head of the sequences.
|
|
752
|
+
# If removes nothing, returns nil.
|
|
753
|
+
# Otherwise, returns self.
|
|
754
|
+
#
|
|
755
|
+
# Note that it is a destructive method.
|
|
756
|
+
def alignment_lstrip!
  #(String-like)
  leading = 0
  # Count the gap-only columns at the head; stop at the first column
  # that still holds a residue after its gaps are removed.
  each_site do |site|
    site.remove_gaps!
    break unless site.empty?
    leading += 1
  end
  return nil if leading <= 0
  each_seq { |seq| seq[0, leading] = '' }
  self
end
|
|
771
|
+
alias lstrip! alignment_lstrip!
|
|
772
|
+
|
|
773
|
+
# Removes excess gaps in the sequences.
|
|
774
|
+
# If removes nothing, returns nil.
|
|
775
|
+
# Otherwise, returns self.
|
|
776
|
+
#
|
|
777
|
+
# Note that it is a destructive method.
|
|
778
|
+
def alignment_strip!
  #(String-like)
  # Both sides are always stripped; the results are combined afterwards
  # so that neither call is skipped.
  stripped_tail = alignment_rstrip!
  stripped_head = alignment_lstrip!
  stripped_tail || stripped_head
end
|
|
784
|
+
alias strip! alignment_strip!
|
|
785
|
+
|
|
786
|
+
# Completely removes ALL gaps in the sequences.
|
|
787
|
+
# If removes nothing, returns nil.
|
|
788
|
+
# Otherwise, returns self.
|
|
789
|
+
#
|
|
790
|
+
# Note that it is a destructive method.
|
|
791
|
+
def remove_all_gaps!
  changed = false
  each_seq do |seq|
    # String#gsub! returns nil when nothing was replaced.
    changed = true if seq.gsub!(gap_regexp, '')
  end
  changed ? self : nil
end
|
|
799
|
+
|
|
800
|
+
# Returns the specified range of the alignment.
|
|
801
|
+
# For each sequence, the 'slice' method (it may be String#slice,
|
|
802
|
+
# which is the same as String#[]) is executed, and
|
|
803
|
+
# returns a new alignment as a Bio::Alignment::SequenceArray object.
|
|
804
|
+
#
|
|
805
|
+
# Unlike alignment_window method, the result alignment
|
|
806
|
+
# might contain nil.
|
|
807
|
+
#
|
|
808
|
+
# If you want to change return value's class, you should redefine
|
|
809
|
+
# alignment_collect method.
|
|
810
|
+
#
|
|
811
|
+
def alignment_slice(*arg)
  #(String-like)
  #(BioPerl) AlignI::slice like method
  # Each sequence is sliced with the same arguments; alignment_collect
  # decides the class of the returned alignment.
  alignment_collect { |seq| seq.slice(*arg) }
end
|
|
818
|
+
alias slice alignment_slice
|
|
819
|
+
|
|
820
|
+
# For each sequence, the 'subseq' method (Bio::Sequence#subseq is
|
|
821
|
+
# expected) is executed, and returns a new alignment as
|
|
822
|
+
# a Bio::Alignment::SequenceArray object.
|
|
823
|
+
#
|
|
824
|
+
# All sequences in the alignment are expected to be kind of
|
|
825
|
+
# Bio::Sequence objects.
|
|
826
|
+
#
|
|
827
|
+
# Unlike alignment_window method, the result alignment
|
|
828
|
+
# might contain nil.
|
|
829
|
+
#
|
|
830
|
+
# If you want to change return value's class, you should redefine
|
|
831
|
+
# alignment_collect method.
|
|
832
|
+
#
|
|
833
|
+
def alignment_subseq(*arg)
  #(original)
  # Each sequence's subseq is called with the same arguments;
  # alignment_collect decides the class of the returned alignment.
  alignment_collect { |seq| seq.subseq(*arg) }
end
|
|
839
|
+
alias subseq alignment_subseq
|
|
840
|
+
|
|
841
|
+
# Concatenates the given alignment.
|
|
842
|
+
# <em>align</em> must have <tt>each_seq</tt>
|
|
843
|
+
# or <tt>each</tt> method.
|
|
844
|
+
#
|
|
845
|
+
# Returns self.
|
|
846
|
+
#
|
|
847
|
+
# Note that it is a destructive method.
|
|
848
|
+
#
|
|
849
|
+
# For Hash, please use it carefully because
|
|
850
|
+
# the order of the sequences is inconstant and
|
|
851
|
+
# key information is completely ignored.
|
|
852
|
+
#
|
|
853
|
+
def alignment_concat(align)
  own = []
  each_seq { |s| own << s }
  idx = 0
  # Appends one incoming sequence to the sequence at the current
  # position (when both exist) and advances the position.
  append = lambda do |seq|
    target = own[idx]
    target.concat(seq) if target and seq
    idx += 1
  end

  started = nil
  begin
    align.each_seq do |seq|
      started = true
      append.call(seq)
    end
    return self
  rescue NoMethodError, ArgumentError => err
    # An error raised before the first element was yielded is taken to
    # mean that each_seq is not usable; later errors are re-raised.
    raise err if started
  end

  # Fallback for objects that only provide each.
  align.each { |seq| append.call(seq) }
  self
end
|
|
874
|
+
end #module EnumerableExtension
|
|
875
|
+
|
|
876
|
+
# ClustalWFormatter is a module to create ClustalW-formatted text
|
|
877
|
+
# from an alignment object.
|
|
878
|
+
#
|
|
879
|
+
# It will be obsoleted and the methods will be frequently changed.
|
|
880
|
+
module ClustalWFormatter
  # Checks whether any sequence names collide once they are reduced to
  # the form compared here: the first token (split on whitespace or NUL),
  # cut to len characters, with each of : ; , ( ) replaced by '_'.
  #
  # array:: names of the sequences (array of string)
  # len::   length to check (default:30)
  #
  # Returns a sorted array of the indexes of all colliding names,
  # or false when every name is unique.
  def have_same_name?(array, len = 30)
    na30 = array.collect do |k|
      # The character class matches each special character individually,
      # the same pattern clustalw_formatter uses for escaping.
      k.to_s.split(/[\x00\s]/)[0].to_s[0, len].gsub(/[\:\;\,\(\)]/, '_').to_s
    end
    # Group indexes by reduced name; a group with 2+ members is a collision.
    groups = Hash.new { |h, name| h[name] = [] }
    na30.each_with_index { |name, i| groups[name] << i }
    dupidx = groups.values.select { |idxs| idxs.size > 1 }.flatten.sort
    dupidx.empty? ? false : dupidx
  end
  private :have_same_name?

  # Changes sequence names if there are conflicted names.
  #
  # array:: names of the sequences (array of string)
  # len::   length to check (default:30)
  #
  # Returns a new array of names; CR, LF and NUL are replaced by spaces.
  def avoid_same_name(array, len = 30)
    na = array.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    if dupidx = have_same_name?(na, len)
      # Renaming strategies, tried in order on the conflicting names only.
      procs = [
        # Replace whitespace within the first len characters with '_'.
        Proc.new { |s, i|
          s[0, len].to_s.gsub(/\s/, '_') + s[len..-1].to_s
        },
      ]
      procs.each do |pr|
        dupidx.each do |i|
          na[i] = pr.call(array[i].to_s, i)
        end
        dupidx = have_same_name?(na, len)
        break unless dupidx
      end
      # Last resort: prefix every name with its index.
      if dupidx then
        na.each_with_index do |s, i|
          na[i] = "#{i}_#{s}"
        end
      end
    end
    na
  end
  private :avoid_same_name

  # Generates ClustalW-formatted text
  # seqs::    sequences (must be an alignment object)
  # names::   names of the sequences
  # options:: options
  #
  # Options read here: :replace_space, :escape, :split and
  # :avoid_same_name (the last three are applied unless explicitly set
  # to false/nil), :gap_char (default '-'), :type, :match_line, :case
  # and :seqnos.
  # A string given as options[:match_line] is copied before formatting,
  # so the caller's object is not modified.
  def clustalw_formatter(seqs, names, options = {})
    #(original)
    aln = [ "CLUSTAL (0.00) multiple sequence alignment\n\n" ]
    len = seqs.seq_length
    sn = names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
    if options[:replace_space]
      sn.collect! { |x| x.gsub(/\s/, '_') }
    end
    if !options.has_key?(:escape) or options[:escape]
      sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
    end
    if !options.has_key?(:split) or options[:split]
      sn.collect! { |x| x.split(/\s/)[0].to_s }
    end
    if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
      sn = avoid_same_name(sn)
    end

    # Long names get a wider name column and a narrower sequence column.
    if sn.find { |x| x.length > 10 } then
      seqwidth = 50
      namewidth = 30
      sep = ' ' * 6
    else
      seqwidth = 60
      namewidth = 10
      sep = ' ' * 6
    end
    seqregexp = Regexp.new("(.{1,#{seqwidth}})")
    gchar = (options[:gap_char] or '-')

    case options[:type].to_s
    when /protein/i, /aa/i
      mopt = { :type => :aa }
    when /na/i
      mopt = { :type => :na }
    else
      mopt = {}
    end
    mline = (options[:match_line] or seqs.match_line(mopt))

    aseqs = seqs.collect do |s|
      s.to_s.gsub(seqs.gap_regexp, gchar)
    end
    case options[:case].to_s
    when /lower/i
      aseqs.each { |s| s.downcase! }
    when /upper/i
      aseqs.each { |s| s.upcase! }
    end

    # The rows are padded and split in place below, so work on a copy
    # of the match line rather than on the caller's string.
    aseqs << mline.dup
    aseqs.collect! do |s|
      # The match line is the last row and has no name (snx is nil).
      snx = sn.shift
      head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth] + sep
      s << (gchar * (len - s.length))
      s.gsub!(seqregexp, "\\1\n")
      a = s.split(/^/)
      if options[:seqnos] and snx then
        # Append the running count of non-gap characters to each line.
        i = 0
        a.each do |x|
          x.chomp!
          l = x.tr(gchar, '').length
          i += l
          x.concat(l > 0 ? " #{i}\n" : "\n")
        end
      end
      a.collect { |x| head + x }
    end
    # Interleave: one line from every row per block of seqwidth columns.
    lines = (len + seqwidth - 1).div(seqwidth)
    lines.times do
      aln << "\n"
      aseqs.each { |a| aln << a.shift }
    end
    aln.join('')
  end
  private :clustalw_formatter
end #module ClustalWFormatter
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
# Bio::Alignment::ArrayExtension is a set of useful methods for
|
|
1031
|
+
# multiple sequence alignment.
|
|
1032
|
+
# It is designed to be extended to array objects or
|
|
1033
|
+
# included in your own classes which inherit Array.
|
|
1034
|
+
# (It can also be included in Array, though not recommended.)
|
|
1035
|
+
#
|
|
1036
|
+
# It possesses all methods defined in EnumerableExtension.
|
|
1037
|
+
# For usage of methods, please refer to EnumerableExtension.
|
|
1038
|
+
module ArrayExtension
  include EnumerableExtension
  include ClustalWFormatter

  # Iterates over each sequences.
  # Yields a sequence.
  #
  # It works the same as Array#each.
  def each_seq(&block) #:yields: seq
    each(&block)
  end

  # Returns a string of Clustal W formatted text of the alignment.
  # The positional indexes 0, 1, 2, ... are used as the sequence names.
  def to_clustal(options = {})
    index_names = Array.new(self.size) { |i| i }
    clustalw_formatter(self, index_names, options)
  end
end #module ArrayExtension
|
|
1055
|
+
|
|
1056
|
+
# Bio::Alignment::HashExtension is a set of useful methods for
|
|
1057
|
+
# multiple sequence alignment.
|
|
1058
|
+
# It is designed to be extended to hash objects or
|
|
1059
|
+
# included in your own classes which inherit Hash.
|
|
1060
|
+
# (It can also be included in Hash, though not recommended.)
|
|
1061
|
+
#
|
|
1062
|
+
# It possesses all methods defined in EnumerableExtension.
|
|
1063
|
+
# For usage of methods, please refer to EnumerableExtension.
|
|
1064
|
+
#
|
|
1065
|
+
# Because SequenceHash#alignment_collect is redefined,
|
|
1066
|
+
# some methods' return value's class are changed to
|
|
1067
|
+
# SequenceHash instead of SequenceArray.
|
|
1068
|
+
#
|
|
1069
|
+
# Because the order of the objects in a hash is inconstant,
|
|
1070
|
+
# some methods strictly affected with the order of objects
|
|
1071
|
+
# might not work correctly,
|
|
1072
|
+
# e.g. EnumerableExtension#convert_match and #convert_unmatch.
|
|
1073
|
+
module HashExtension
  include EnumerableExtension
  include ClustalWFormatter

  # Iterates over each sequences.
  # Yields a sequence.
  #
  # It works the same as Hash#each_value.
  def each_seq(&block) #:yields: seq
    each_value(&block)
  end

  # Runs the block on every sequence and stores each result under the
  # sequence's key in a new Bio::Alignment::SequenceHash, which is
  # returned. Properties are copied to the new object.
  #
  # Redefine this method to change the class of the return value.
  def alignment_collect
    result = SequenceHash.new
    result.set_all_property(get_all_property)
    each_pair { |key, str| result.store(key, yield(str)) }
    result
  end

  # Concatenates the given alignment.
  # If <em>align</em> provides <tt>each_pair</tt>, sequences of the
  # same keys are concatenated. Otherwise <tt>each_seq</tt>, then
  # <tt>each</tt>, is used and sequences are matched by position.
  #
  # Returns self.
  #
  # Note that it is a destructive method.
  def alignment_concat(align)
    started = nil
    begin
      align.each_pair do |key, seq|
        started = true
        target = self[key]
        target.concat(seq) if target
      end
      return self
    rescue NoMethodError, ArgumentError => err
      # An error raised before the first pair was yielded is taken to
      # mean that each_pair is not usable; later errors are re-raised.
      raise err if started
    end

    own = values
    idx = 0
    # Appends one incoming sequence to the sequence at the current
    # position (when both exist) and advances the position.
    append = lambda do |seq|
      target = own[idx]
      target.concat(seq) if target and seq
      idx += 1
    end

    begin
      align.each_seq do |seq|
        started = true
        append.call(seq)
      end
      return self
    rescue NoMethodError, ArgumentError => err
      raise err if started
    end

    align.each { |seq| append.call(seq) }
    self
  end

  # Returns a string of Clustal W formatted text of the alignment.
  # The keys are used as the sequence names.
  def to_clustal(options = {})
    names = self.keys
    seqs = names.inject(SequenceArray.new) { |acc, k| acc << self[k] }
    clustalw_formatter(seqs, names, options)
  end
end #module HashExtension
|
|
1154
|
+
|
|
1155
|
+
# Bio::Alignment::SequenceArray is a container class of
|
|
1156
|
+
# multiple sequence alignment.
|
|
1157
|
+
# Since it inherits Array, it acts completely same as Array.
|
|
1158
|
+
# In addition, methods defined in ArrayExtension and EnumerableExtension
|
|
1159
|
+
# can be used.
|
|
1160
|
+
class SequenceArray < Array
  # All alignment behaviour comes from the mixin.
  include ArrayExtension
end #class SequenceArray
|
|
1163
|
+
|
|
1164
|
+
# Bio::Alignment::SequenceHash is a container class of
|
|
1165
|
+
# multiple sequence alignment.
|
|
1166
|
+
# Since it inherits Hash, it acts completely same as Hash.
|
|
1167
|
+
# In addition, methods defined in HashExtension and EnumerableExtension
|
|
1168
|
+
# can be used.
|
|
1169
|
+
class SequenceHash < Hash
  # All alignment behaviour comes from the mixin.
  include HashExtension
end #class SequenceHash
|
|
1172
|
+
|
|
1173
|
+
# Bio::Alignment::OriginalPrivate is a set of private methods
|
|
1174
|
+
# for Bio::Alignment::OriginalAlignment.
|
|
1175
|
+
module OriginalPrivate

  # Gets the sequence from given object.
  # A Bio::Sequence is returned as is. Otherwise the first truthy result
  # of obj.seq, obj.naseq or obj.aaseq is returned, and obj itself when
  # none of them gives one.
  def extract_seq(obj)
    return obj if obj.is_a?(Bio::Sequence)
    [ :seq, :naseq, :aaseq ].each do |m|
      found = begin
                obj.send(m)
              rescue NameError, ArgumentError
                # Method missing or not callable without arguments.
                nil
              end
      return found if found
    end
    obj
  end
  module_function :extract_seq

  # Gets the name or the definition of the sequence from given object.
  # Tries obj.definition, then obj.entry_id, stopping at the first
  # truthy result.
  def extract_key(obj)
    sn = nil
    [ :definition, :entry_id ].each do |m|
      sn = begin
             obj.send(m)
           rescue NameError, ArgumentError
             # Method missing or not callable without arguments.
             nil
           end
      break if sn
    end
    sn
  end
  module_function :extract_key
end #module OriginalPrivate
|
|
1212
|
+
|
|
1213
|
+
# Bio::Alignment::OriginalAlignment is
|
|
1214
|
+
# the BioRuby original multiple sequence alignment container class.
|
|
1215
|
+
# It includes HashExtension.
|
|
1216
|
+
#
|
|
1217
|
+
# It is recommended only to use methods defined in EnumerableExtension
|
|
1218
|
+
# (and the each_seq method).
|
|
1219
|
+
# The method only defined in this class might be obsoleted in the future.
|
|
1220
|
+
#
|
|
1221
|
+
class OriginalAlignment
|
|
1222
|
+
|
|
1223
|
+
include Enumerable
|
|
1224
|
+
include HashExtension
|
|
1225
|
+
include OriginalPrivate
|
|
1226
|
+
|
|
1227
|
+
# Read files and creates a new alignment object.
|
|
1228
|
+
#
|
|
1229
|
+
# It will be obsoleted.
|
|
1230
|
+
def self.readfiles(*files)
  # Loaded lazily, only when files are actually read.
  require 'bio/io/flatfile'
  files.inject(self.new) do |aln, fn|
    # Every entry of every file is added to the same alignment.
    Bio::FlatFile.open(nil, fn) { |ff| aln.add_sequences(ff) }
    aln
  end
end
|
|
1240
|
+
|
|
1241
|
+
# Creates a new alignment object from given arguments.
|
|
1242
|
+
#
|
|
1243
|
+
# It will be obsoleted.
|
|
1244
|
+
def self.new2(*arg)
  # The arguments are collected into one array and passed on as a whole.
  new(arg)
end
|
|
1247
|
+
|
|
1248
|
+
# Creates a new alignment object.
|
|
1249
|
+
# <em>seqs</em> may be one of follows:
|
|
1250
|
+
# an array of sequences (or strings),
|
|
1251
|
+
# an array of sequence database objects,
|
|
1252
|
+
# an alignment object.
|
|
1253
|
+
def initialize(seqs = [])
  # @keys holds the keys in order; @seqs maps key => sequence.
  @keys = []
  @seqs = {}
  add_sequences(seqs)
end
|
|
1258
|
+
|
|
1259
|
+
# If <em>x</em> is the same value, returns true.
|
|
1260
|
+
# Otherwise, returns false.
|
|
1261
|
+
def ==(x)
  #(original)
  # Only objects of this class (or a subclass) can be equal; the
  # comparison is done on the underlying key => sequence hashes.
  return false unless x.is_a?(self.class)
  self.to_hash == x.to_hash
end
|
|
1269
|
+
|
|
1270
|
+
# convert to hash
|
|
1271
|
+
def to_hash
  #(Hash-like)
  # Returns the internal hash itself, not a copy.
  @seqs
end
|
|
1275
|
+
|
|
1276
|
+
# Adds sequences to the alignment.
|
|
1277
|
+
# <em>seqs</em> may be one of follows:
|
|
1278
|
+
# an array of sequences (or strings),
|
|
1279
|
+
# an array of sequence database objects,
|
|
1280
|
+
# an alignment object.
|
|
1281
|
+
def add_sequences(seqs)
  if block_given?
    # The block returns [sequence, key] for every element.
    seqs.each do |x|
      s, key = yield x
      self.store(key, s)
    end
  elsif seqs.is_a?(self.class)
    # Another alignment of this class: copy key/sequence pairs as they are.
    seqs.each_pair { |k, s| self.store(k, s) }
  elsif seqs.respond_to?(:each_pair)
    # Hash-like: keep the keys, extract the sequence from each value.
    seqs.each_pair { |k, x| self.store(k, extract_seq(x)) }
  else
    # Plain list: both sequence and key are extracted from each element.
    seqs.each do |x|
      s = extract_seq(x)
      k = extract_key(x)
      self.store(k, s)
    end
  end
  self
end
|
|
1307
|
+
|
|
1308
|
+
# identifiers (or definitions or names) of the sequences
|
|
1309
|
+
attr_reader :keys
|
|
1310
|
+
|
|
1311
|
+
# stores a sequences with the name
|
|
1312
|
+
# key:: name of the sequence
|
|
1313
|
+
# seq:: sequence
|
|
1314
|
+
def __store__(key, seq)
  #(Hash-like)
  entry = { key => seq }
  # Take the key back out of the one-entry hash so that @keys holds
  # the same key object that ends up in @seqs.
  stored_key = entry.keys[0]
  @keys.push(stored_key)
  @seqs.update(entry)
  seq
end
|
|
1321
|
+
|
|
1322
|
+
# stores a sequence with <em>key</em>
|
|
1323
|
+
# (name or definition of the sequence).
|
|
1324
|
+
# Unlike <tt>__store__</tt> method, the method doesn't allow
|
|
1325
|
+
# same keys.
|
|
1326
|
+
# If the key is already used (or is nil), a serial number is used as the key instead.
|
|
1327
|
+
# When succeeded, returns key.
|
|
1328
|
+
def store(key, seq)
  #(Hash-like) returns key instead of seq
  # A key that is already in use is discarded (the existing entry is
  # kept) and replaced by a serial number below.
  key = nil if @seqs.has_key?(key)
  unless key
    @serial = 0 unless defined?(@serial)
    # Start no lower than the current size, then skip numbers in use.
    @serial = [ @serial, @seqs.size ].max
    @serial += 1 while @seqs.has_key?(@serial)
    key = @serial
  end
  self.__store__(key, seq)
  key
end
|
|
1348
|
+
|
|
1349
|
+
# Reconstructs internal data structure.
|
|
1350
|
+
# (Like Hash#rehash)
|
|
1351
|
+
def rehash
  @seqs.rehash
  remaining = @seqs.keys
  # Keep, in their current order, the keys that still exist in @seqs
  # (each only once); Array#delete returns nil for the others.
  kept = @keys.collect { |k| remaining.delete(k) }.compact
  # Keys present in @seqs but missing from @keys go to the end.
  # @keys is updated in place so that the same array object is kept.
  @keys.replace(kept + remaining)
  self
end
|
|
1362
|
+
|
|
1363
|
+
# Prepends seq (with key) to the front of the alignment.
|
|
1364
|
+
# (Like Array#unshift)
|
|
1365
|
+
def unshift(key, seq)
  #(Array-like)
  self.store(key, seq)
  # store appended the new key to the end; move it to the front.
  newest = @keys.pop
  @keys.unshift(newest)
  newest
end
|
|
1372
|
+
|
|
1373
|
+
# Removes the first sequence in the alignment and
|
|
1374
|
+
# returns [ key, seq ].
|
|
1375
|
+
def shift
  first_key = @keys.shift
  # Nothing to remove (or the first key is nil/false).
  return nil unless first_key
  [ first_key, @seqs.delete(first_key) ]
end
|
|
1384
|
+
|
|
1385
|
+
# Gets the <em>n</em>-th sequence.
|
|
1386
|
+
# If not found, returns nil.
|
|
1387
|
+
def order(n)
  #(original)
  # Look up the n-th key first, then the sequence stored under it.
  key = @keys[n]
  @seqs[key]
end
|
|
1391
|
+
|
|
1392
|
+
# Removes the sequence whose key is <em>key</em>.
|
|
1393
|
+
# Returns the removed sequence.
|
|
1394
|
+
# If not found, returns nil.
|
|
1395
|
+
def delete(key)
  #(Hash-like)
  # Drop the key from the ordering, then from the hash; the value
  # removed from the hash (or nil) is the result.
  @keys.delete(key)
  removed = @seqs.delete(key)
  removed
end
|
|
1400
|
+
|
|
1401
|
+
# Returns sequences. (Like Hash#values)
|
|
1402
|
+
def values
  #(Hash-like)
  # Sequences are returned in key order, not in hash order.
  @seqs.values_at(*@keys)
end
|
|
1406
|
+
|
|
1407
|
+
# Adds a sequence without key.
|
|
1408
|
+
# The key is automatically determined.
|
|
1409
|
+
def <<(seq)
  #(Array-like)
  # A nil key makes store pick a serial key.
  store(nil, seq)
  self
end
|
|
1414
|
+
|
|
1415
|
+
# Gets a sequence. (Like Hash#[])
|
|
1416
|
+
def [](*arg)
  #(Hash-like)
  # Arguments are handed to the internal hash unchanged.
  @seqs[*arg]
end
|
|
1420
|
+
|
|
1421
|
+
# Number of sequences in the alignment.
|
|
1422
|
+
def size
  #(Hash&Array-like)
  @seqs.length
end
|
|
1426
|
+
|
|
1427
|
+
# If the key exists, returns true. Otherwise, returns false.
|
|
1428
|
+
# (Like Hash#has_key?)
|
|
1429
|
+
def has_key?(key)
  #(Hash-like)
  @seqs.key?(key)
end
|
|
1433
|
+
|
|
1434
|
+
# Iterates over each sequence.
|
|
1435
|
+
# (Like Array#each)
|
|
1436
|
+
def each
  #(Array-like)
  # Sequences are yielded in key order.
  @keys.each do |key|
    seq = @seqs[key]
    yield seq
  end
end
|
|
1442
|
+
alias each_seq each
|
|
1443
|
+
|
|
1444
|
+
# Iterates over each key and sequence.
|
|
1445
|
+
# (Like Hash#each_pair)
|
|
1446
|
+
def each_pair
  #(Hash-like)
  # Pairs are yielded in key order.
  @keys.each do |key|
    seq = @seqs[key]
    yield key, seq
  end
end
|
|
1452
|
+
|
|
1453
|
+
# Iterates over each sequence, replacing the sequence with the
|
|
1454
|
+
# value returned by the block.
|
|
1455
|
+
def collect!
  #(Array-like)
  @keys.each do |key|
    # The block's result replaces the sequence stored under this key.
    replacement = yield @seqs[key]
    @seqs[key] = replacement
  end
end
|
|
1461
|
+
|
|
1462
|
+
###--
|
|
1463
|
+
### note that 'collect' and 'to_a' is defined in Enumerable
|
|
1464
|
+
###
|
|
1465
|
+
### instance-variable-related methods
|
|
1466
|
+
###++
|
|
1467
|
+
|
|
1468
|
+
# Creates new alignment. Internal use only.
|
|
1469
|
+
def new(*arg)
  # Build a sibling object of the same class and hand over this
  # alignment's properties to it.
  sibling = self.class.new(*arg)
  sibling.set_all_property(get_all_property)
  sibling
end
|
|
1474
|
+
protected :new
|
|
1475
|
+
|
|
1476
|
+
# Duplicates the alignment
|
|
1477
|
+
def dup
  #(Hash-like)
  # A new alignment is built from this one via the instance-level new.
  copy = self.new(self)
  copy
end
|
|
1481
|
+
|
|
1482
|
+
#--
|
|
1483
|
+
# methods below should not access instance variables
|
|
1484
|
+
#++
|
|
1485
|
+
|
|
1486
|
+
# Merges given alignment and returns a new alignment.
|
|
1487
|
+
def merge(*other)
  #(Hash-like)
  # Work on a copy so that the receiver is left unchanged.
  merged = self.new(self)
  merged.merge!(*other)
  merged
end
|
|
1493
|
+
|
|
1494
|
+
# Merge given alignment.
|
|
1495
|
+
# Note that it is destructive method.
|
|
1496
|
+
def merge!(*other)
  #(Hash-like)
  resolve_with_block = block_given?
  other.each do |aln|
    aln.each_pair do |k, s|
      if !self.has_key?(k)
        self.store(k, s)
      elsif resolve_with_block
        # Key conflict: the block picks the value, which replaces the
        # existing one in the underlying hash.
        s = yield k, self[k], s
        self.to_hash.store(k, s)
      else
        # Key conflict without a block: the incoming sequence replaces
        # the old entry (delete, then store again).
        self.delete(k)
        self.store(k, s)
      end
    end
  end
  self
end
|
|
1519
|
+
|
|
1520
|
+
# Returns the key for a given sequence. If not found, returns nil.
|
|
1521
|
+
def index(seq)
  #(Hash-like)
  # Returns the key of the first sequence equal to seq, or nil when no
  # sequence matches. Objects of the same class are compared with ==;
  # otherwise their string representations are compared.
  self.each_pair do |k, s|
    matched = if s.class == seq.class then
                s == seq
              else
                s.to_s == seq.to_s
              end
    # Return from inside the block: a block parameter is local to the
    # block, so the key cannot be read from a variable outside it.
    return k if matched
  end
  nil
end
|
|
1534
|
+
|
|
1535
|
+
# Sequences in the alignment are duplicated.
|
|
1536
|
+
# If keys are given to the argument, sequences of given keys are
|
|
1537
|
+
# duplicated.
|
|
1538
|
+
#
|
|
1539
|
+
# It will be obsoleted.
|
|
1540
|
+
def isolate(*arg)
  #(original)
  # Replaces sequences with fresh copies built by seqclass.new.
  # Without arguments every sequence is replaced; otherwise only the
  # sequences stored under the given keys (unknown keys are ignored).
  # Returns self.
  if arg.empty?
    self.collect! { |s| seqclass.new(s) }
  else
    arg.each do |k|
      next unless self.has_key?(k)
      # Remove the entry for this key and store a new copy under the
      # same key.
      s = self.delete(k)
      self.store(k, seqclass.new(s))
    end
  end
  self
end
|
|
1556
|
+
|
|
1557
|
+
# Iterates over each sequence and each results running block
|
|
1558
|
+
# are collected and returns a new alignment.
|
|
1559
|
+
#
|
|
1560
|
+
# The method name 'collect_align' will be obsoleted.
|
|
1561
|
+
# Please use 'alignment_collect' instead.
|
|
1562
|
+
def alignment_collect
  #(original)
  # The result has the same class and properties as the receiver and
  # keeps each sequence's key.
  result = self.class.new
  result.set_all_property(get_all_property)
  self.each_pair { |k, s| result.store(k, yield(s)) }
  result
end
|
|
1571
|
+
alias collect_align alignment_collect
|
|
1572
|
+
|
|
1573
|
+
# Removes empty sequences or nil in the alignment.
|
|
1574
|
+
# (Like Array#compact!)
|
|
1575
|
+
def compact!
  #(Array-like)
  # Collect the keys first and delete afterwards, so nothing is removed
  # while iterating.
  doomed = []
  self.each_pair do |k, s|
    doomed << k unless s and !s.empty?
  end
  doomed.each { |k| self.delete(k) }
  # Returns the removed keys, or nil when nothing was removed.
  return nil if doomed.empty?
  doomed
end
|
|
1588
|
+
|
|
1589
|
+
# Removes empty sequences or nil and returns new alignment.
|
|
1590
|
+
# (Like Array#compact)
|
|
1591
|
+
def compact
  #(Array-like)
  # Compact a duplicate and return it, whether or not anything was removed.
  copy = self.dup
  copy.compact!
  copy
end
|
|
1597
|
+
|
|
1598
|
+
# Adds a sequence to the alignment.
|
|
1599
|
+
# Returns key if succeeded.
|
|
1600
|
+
# If the key is nil or already used, the sequence is added under an automatically assigned serial key.
|
|
1601
|
+
#
|
|
1602
|
+
# It resembles BioPerl's AlignI::add_seq method.
|
|
1603
|
+
def add_seq(seq, key = nil)
  #(BioPerl) AlignI::add_seq like method
  # Anything that is not a Bio::Sequence goes through extract_seq, and
  # supplies the key via extract_key when the caller gave none.
  if !seq.is_a?(Bio::Sequence) then
    extracted = extract_seq(seq)
    key ||= extract_key(seq)
    seq = extracted
  end
  # The return value is whatever store returns.
  store(key, seq)
end
|
|
1612
|
+
|
|
1613
|
+
# Removes given sequence from the alignment.
|
|
1614
|
+
# Returns removed sequence. If nothing removed, returns nil.
|
|
1615
|
+
#
|
|
1616
|
+
# It resembles BioPerl's AlignI::remove_seq.
|
|
1617
|
+
def remove_seq(seq)
  #(BioPerl) AlignI::remove_seq like method
  # Look up the key under which seq is stored; nothing to do if absent.
  found_key = index(seq)
  return nil unless found_key
  delete(found_key)
end
|
|
1625
|
+
|
|
1626
|
+
# Removes sequences from the alignment by given keys.
|
|
1627
|
+
# Returns an alignment object consisting of the removed sequences.
|
|
1628
|
+
#
|
|
1629
|
+
# It resembles BioPerl's AlignI::purge method.
|
|
1630
|
+
def purge(*arg)
  #(BioPerl) AlignI::purge like method
  # arg:: keys of the sequences to remove from this alignment.
  # Returns a new alignment of the same class holding the removed
  # sequences under their keys. Keys without a (truthy) sequence are
  # skipped.
  #
  # Fixed: the container used to be created with self.new, but 'new' is a
  # class method, not an instance method; self.class.new is used instead
  # (as alignment_collect does).
  purged = self.class.new
  arg.each do |k|
    if self[k] then
      purged.store(k, self.delete(k))
    end
  end
  purged
end
|
|
1640
|
+
|
|
1641
|
+
# If block is given, it acts like Array#select (Enumerable#select).
|
|
1642
|
+
# Returns a new alignment containing all sequences of the alignment
|
|
1643
|
+
# for which return value of given block is not false nor nil.
|
|
1644
|
+
#
|
|
1645
|
+
# If no block is given, it acts like the BioPerl's AlignI::select.
|
|
1646
|
+
# Returns a new alignment containing sequences of given keys.
|
|
1647
|
+
#
|
|
1648
|
+
# The BioPerl's AlignI::select-like action will be obsoleted.
|
|
1649
|
+
def select(*arg)
  #(original)
  # With a block: returns a new alignment holding every sequence for which
  # the block returns a true value ('arg' is ignored).
  # Without a block: returns a new alignment holding the sequences stored
  # under the given keys, in the order the keys are given; keys without a
  # (truthy) sequence are skipped.
  #
  # Fixed: the result used to be created with self.new ('new' is a class
  # method, not an instance method), and the block branch called
  # self.each_pair.each, i.e. each_pair without a block, unlike every other
  # use of each_pair in this class.
  na = self.class.new
  if block_given? then
    # nearly same action as Array#select (Enumerable#select)
    self.each_pair do |k, s|
      na.store(k, s) if yield(s)
    end
  else
    # BioPerl's AlignI::select like function
    arg.each do |k|
      s = self[k]
      na.store(k, s) if s
    end
  end
  na
end
|
|
1668
|
+
|
|
1669
|
+
# The method name <tt>slice</tt> will be obsoleted.
|
|
1670
|
+
# Please use <tt>alignment_slice</tt> instead.
|
|
1671
|
+
alias slice alignment_slice # deprecated name for alignment_slice
|
|
1672
|
+
|
|
1673
|
+
# The method name <tt>subseq</tt> will be obsoleted.
|
|
1674
|
+
# Please use <tt>alignment_subseq</tt> instead.
|
|
1675
|
+
alias subseq alignment_subseq # deprecated name for alignment_subseq
|
|
1676
|
+
|
|
1677
|
+
# Not-destructive version of alignment_normalize!.
|
|
1678
|
+
# Returns a new alignment.
|
|
1679
|
+
def normalize
  #(original)
  # NOTE(review): unlike rstrip/lstrip/strip, the duplicate is not passed
  # through isolate before the destructive call -- confirm that
  # alignment_normalize! cannot modify sequences shared with the receiver.
  copy = dup
  copy.alignment_normalize!
  copy
end
|
|
1685
|
+
|
|
1686
|
+
# Not-destructive version of alignment_rstrip!.
|
|
1687
|
+
# Returns a new alignment.
|
|
1688
|
+
def rstrip
  #(String-like)
  # Duplicate, re-create the duplicate's sequences with isolate, then
  # apply the destructive variant to the duplicate only.
  copy = dup
  copy.isolate
  copy.alignment_rstrip!
  copy
end
|
|
1695
|
+
|
|
1696
|
+
# Not-destructive version of alignment_lstrip!.
|
|
1697
|
+
# Returns a new alignment.
|
|
1698
|
+
def lstrip
  #(String-like)
  # Duplicate, re-create the duplicate's sequences with isolate, then
  # apply the destructive variant to the duplicate only.
  copy = dup
  copy.isolate
  copy.alignment_lstrip!
  copy
end
|
|
1705
|
+
|
|
1706
|
+
# Not-destructive version of alignment_strip!.
|
|
1707
|
+
# Returns a new alignment.
|
|
1708
|
+
def strip
  #(String-like)
  # Duplicate, re-create the duplicate's sequences with isolate, then
  # apply the destructive variant to the duplicate only.
  copy = dup
  copy.isolate
  copy.alignment_strip!
  copy
end
|
|
1715
|
+
|
|
1716
|
+
# Non-destructive version of remove_all_gaps!.
|
|
1717
|
+
# Returns a new alignment.
|
|
1718
|
+
#
|
|
1719
|
+
# The method name 'remove_gap' will be obsoleted.
|
|
1720
|
+
# Please use 'remove_all_gaps' instead.
|
|
1721
|
+
def remove_all_gaps
  #(original)
  # Duplicate, re-create the duplicate's sequences with isolate, then
  # run remove_all_gaps! on the duplicate only.
  copy = dup
  copy.isolate
  copy.remove_all_gaps!
  copy
end
|
|
1728
|
+
|
|
1729
|
+
# Concatenates a string or an alignment.
|
|
1730
|
+
# Returns self.
|
|
1731
|
+
#
|
|
1732
|
+
# Note that the method will be obsoleted.
|
|
1733
|
+
# Please use <tt>each_seq { |s| s << str }</tt> for concatenating
|
|
1734
|
+
# a string and
|
|
1735
|
+
# <tt>alignment_concat(aln)</tt> for concatenating an alignment.
|
|
1736
|
+
def concat(aln)
  #(String-like)
  # Anything that is not string-like is handed to alignment_concat,
  # whose return value is passed through.
  return alignment_concat(aln) unless aln.respond_to?(:to_str) #aln.is_a?(String)
  # String-like argument: append it to every sequence in place.
  each { |seq| seq << aln }
  self
end
|
|
1747
|
+
|
|
1748
|
+
# Replaces the specified region of the alignment with aln.
|
|
1749
|
+
# aln:: String or Bio::Alignment object
|
|
1750
|
+
# arg:: same format as String#slice
|
|
1751
|
+
#
|
|
1752
|
+
# It will be obsoleted.
|
|
1753
|
+
def replace_slice(aln, *arg)
  #(original)
  # Three input shapes are accepted:
  # * string-like: the same text is written into every sequence;
  # * an alignment of the same class: each of its sequences is written
  #   into the sequence stored under the same key;
  # * anything else responding to each: items are matched to sequences
  #   by position via order(i).
  # In every case the region is addressed with seq[*arg] = ... and self
  # is returned.
  if aln.respond_to?(:to_str) then #aln.is_a?(String)
    each { |seq| seq[*arg] = aln }
  elsif aln.is_a?(self.class) then
    aln.each_pair { |key, piece| self[key][*arg] = piece }
  else
    position = 0
    aln.each do |piece|
      order(position)[*arg] = piece
      position += 1
    end
  end
  self
end
|
|
1772
|
+
|
|
1773
|
+
# Performs multiple alignment by using external program.
|
|
1774
|
+
def do_align(factory)
  # factory:: object whose query(alignment) returns a report responding
  #           to 'alignment'.
  # Returns a new alignment of the same class holding the aligned
  # sequences under this alignment's original keys.
  # Raises RuntimeError when the result lacks one of the input sequences.
  #
  # The sequences are sent to the factory under their positional index
  # (0, 1, 2, ...) instead of their real keys; the result is looked up by
  # the index converted to a string.
  a0 = self.class.new
  (0...self.size).each { |i| a0.store(i, self.order(i)) }
  r = factory.query(a0)
  a1 = r.alignment
  indexes = a0.keys
  indexes.each do |k|
    unless a1[k.to_s] then
      raise 'alignment result is inconsistent with input data'
    end
  end
  # Fixed: the result used to be created with self.new, but 'new' is a
  # class method, not an instance method.
  a2 = self.class.new
  original_keys = self.keys
  indexes.each do |k|
    a2.store(original_keys[k], a1[k.to_s])
  end
  a2
end
|
|
1790
|
+
|
|
1791
|
+
# Convert to fasta format and returns an array of strings.
|
|
1792
|
+
#
|
|
1793
|
+
# It will be obsoleted.
|
|
1794
|
+
def to_fasta_array(*arg)
  #(original)
  # Accepted call shapes: (), (width), (options), (width, options).
  # A leading Integer is the line width; otherwise options[:width] is used.
  width = arg[0].is_a?(Integer) ? arg.shift : nil
  options = (arg.shift or {})
  width ||= options[:width]
  # One definition-line name per key, in key order.
  names =
    if options[:avoid_same_name] then
      avoid_same_name(self.keys, 30)
    else
      # CR, LF and NUL are replaced by spaces.
      self.keys.collect { |key| key.to_s.gsub(/[\r\n\x00]/, ' ') }
    end
  self.collect do |seq|
    header = ">#{names.shift}\n"
    body =
      if width then
        # Insert a newline after every run of up to 'width' characters.
        seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
      else
        seq.to_s + "\n"
      end
    header + body
  end
end
|
|
1817
|
+
|
|
1818
|
+
# Converts to fasta format and returns an array of FastaFormat objects.
|
|
1819
|
+
#
|
|
1820
|
+
# It will be obsoleted.
|
|
1821
|
+
def to_fastaformat_array(*arg)
  #(original)
  # The FastaFormat class is loaded on demand.
  require 'bio/db/fasta'
  # Arguments are passed through to to_fasta_array unchanged; each
  # resulting text entry is then wrapped in a Bio::FastaFormat object.
  entries = to_fasta_array(*arg)
  entries.collect! { |text| Bio::FastaFormat.new(text) }
  entries
end
|
|
1830
|
+
|
|
1831
|
+
# Converts to fasta format and returns a string.
|
|
1832
|
+
#
|
|
1833
|
+
# The specification of the argument will be changed.
|
|
1834
|
+
def to_fasta(*arg)
  #(original)
  # Arguments are passed through to to_fasta_array; the entries are
  # joined without any separator.
  entries = to_fasta_array(*arg)
  entries.join('')
end
|
|
1838
|
+
|
|
1839
|
+
include ClustalWFormatter
|
|
1840
|
+
# Returns a string of Clustal W formatted text of the alignment.
|
|
1841
|
+
def to_clustal(options = {})
  # Delegates to clustalw_formatter, passing this alignment, its keys
  # and the options through unchanged.
  key_list = keys
  clustalw_formatter(self, key_list, options)
end
|
|
1844
|
+
|
|
1845
|
+
# The method name <tt>consensus</tt> will be obsoleted.
|
|
1846
|
+
# Please use <tt>consensus_string</tt> instead.
|
|
1847
|
+
alias consensus consensus_string # deprecated name for consensus_string
|
|
1848
|
+
end #class OriginalAlignment
|
|
1849
|
+
|
|
1850
|
+
# Bio::Alignment::GAP is a set of class methods for
|
|
1851
|
+
# gap-related position translation.
|
|
1852
|
+
module GAP
  # Translates a position on the gapped sequence into the corresponding
  # position on the sequence with gaps removed.
  #<em>seq</em>:: sequence
  #<em>pos</em>:: position with gaps
  #<em>gap_regexp</em>:: regular expression to specify gaps
  def ungapped_pos(seq, pos, gap_regexp)
    # Number of non-gap characters in seq[0..pos]; the result is that
    # count minus one, or 0 when there is no non-gap character.
    residues = seq[0..pos].gsub(gap_regexp, '').length
    residues > 0 ? residues - 1 : residues
  end
  module_function :ungapped_pos

  # Translates a position on the ungapped sequence into the corresponding
  # position on the sequence with gaps.
  #<em>seq</em>:: sequence
  #<em>pos</em>:: position without gaps
  #<em>gap_regexp</em>:: regular expression to specify gaps
  def gapped_pos(seq, pos, gap_regexp)
    # Clamp pos to the ungapped length; negative values count from the end.
    ungapped_length = seq.gsub(gap_regexp, '').length
    pos = ungapped_length if pos >= ungapped_length
    pos += ungapped_length if pos < 0

    # Walk forward in windows: 'wanted' is how many non-gap characters are
    # still needed, and each window is exactly that many characters long.
    cursor = 0
    wanted = pos + 1
    until wanted <= 0 || cursor >= seq.length
      found = seq[cursor, wanted].gsub(gap_regexp, '').length
      cursor += wanted
      wanted -= found
    end
    # cursor is one past the last inspected character.
    cursor > 0 ? cursor - 1 : cursor
  end
  module_function :gapped_pos
end # module GAP
|
|
1885
|
+
|
|
1886
|
+
# creates a new Bio::Alignment::OriginalAlignment object.
|
|
1887
|
+
# Please refer document of OriginalAlignment.new.
|
|
1888
|
+
def self.new(*args)
  # All arguments are handed to OriginalAlignment.new unchanged.
  OriginalAlignment.new(*args)
end
|
|
1891
|
+
|
|
1892
|
+
# creates a new Bio::Alignment::OriginalAlignment object.
|
|
1893
|
+
# Please refer document of OriginalAlignment.new2.
|
|
1894
|
+
def self.new2(*args)
  # All arguments are handed to OriginalAlignment.new2 unchanged.
  OriginalAlignment.new2(*args)
end
|
|
1897
|
+
|
|
1898
|
+
# creates a new Bio::Alignment::OriginalAlignment object.
|
|
1899
|
+
# Please refer document of OriginalAlignment.readfiles.
|
|
1900
|
+
def self.readfiles(*files)
  # The file list is handed to OriginalAlignment.readfiles unchanged.
  file_list = files
  OriginalAlignment.readfiles(*file_list)
end
|
|
1903
|
+
end #module Alignment
|
|
1904
|
+
|
|
1905
|
+
end #module Bio
|
|
1906
|
+
|