bio 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
@@ -0,0 +1,1906 @@
|
|
1
|
+
#
|
2
|
+
# = bio/alignment.rb - multiple alignment of sequences
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2003, 2005
|
5
|
+
# GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
|
6
|
+
#
|
7
|
+
# License:: LGPL
|
8
|
+
#
|
9
|
+
# $Id: alignment.rb,v 1.14 2005/12/02 12:01:28 ngoto Exp $
|
10
|
+
#
|
11
|
+
#--
|
12
|
+
# This library is free software; you can redistribute it and/or
|
13
|
+
# modify it under the terms of the GNU Lesser General Public
|
14
|
+
# License as published by the Free Software Foundation; either
|
15
|
+
# version 2 of the License, or (at your option) any later version.
|
16
|
+
#
|
17
|
+
# This library is distributed in the hope that it will be useful,
|
18
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
19
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
20
|
+
# Lesser General Public License for more details.
|
21
|
+
#
|
22
|
+
# You should have received a copy of the GNU Lesser General Public
|
23
|
+
# License along with this library; if not, write to the Free Software
|
24
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
25
|
+
#++
|
26
|
+
#
|
27
|
+
# = About Bio::Alignment
|
28
|
+
#
|
29
|
+
# Please refer to the documentation of the Bio::Alignment module.
|
30
|
+
#
|
31
|
+
# = References
|
32
|
+
#
|
33
|
+
# * Bio::Align::AlignI class of the BioPerl.
|
34
|
+
# http://doc.bioperl.org/releases/bioperl-1.4/Bio/Align/AlignI.html
|
35
|
+
#
|
36
|
+
# * Bio::SimpleAlign class of the BioPerl.
|
37
|
+
# http://doc.bioperl.org/releases/bioperl-1.4/Bio/SimpleAlign.html
|
38
|
+
#
|
39
|
+
|
40
|
+
require 'bio/sequence'
|
41
|
+
|
42
|
+
module Bio
|
43
|
+
|
44
|
+
=begin rdoc
|
45
|
+
|
46
|
+
= About Bio::Alignment
|
47
|
+
|
48
|
+
Bio::Alignment is a namespace of classes/modules for multiple sequence
|
49
|
+
alignment.
|
50
|
+
|
51
|
+
= Multiple alignment container classes
|
52
|
+
|
53
|
+
== Bio::Alignment::OriginalAlignment
|
54
|
+
|
55
|
+
== Bio::Alignment::SequenceArray
|
56
|
+
|
57
|
+
== Bio::Alignment::SequenceHash
|
58
|
+
|
59
|
+
= Bio::Alignment::Site
|
60
|
+
|
61
|
+
= Modules
|
62
|
+
|
63
|
+
== Bio::Alignment::EnumerableExtension
|
64
|
+
|
65
|
+
Mix-in for classes that include Enumerable.
|
66
|
+
|
67
|
+
== Bio::Alignment::ArrayExtension
|
68
|
+
|
69
|
+
Mix-in for Array or Array-like classes.
|
70
|
+
|
71
|
+
== Bio::Alignment::HashExtension
|
72
|
+
|
73
|
+
Mix-in for Hash or Hash-like classes.
|
74
|
+
|
75
|
+
== Bio::Alignment::SiteMethods
|
76
|
+
|
77
|
+
== Bio::Alignment::PropertyMethods
|
78
|
+
|
79
|
+
= Bio::Alignment::GAP
|
80
|
+
|
81
|
+
= Compatibility from older BioRuby
|
82
|
+
|
83
|
+
=end
|
84
|
+
module Alignment
|
85
|
+
|
86
|
+
# Bio::Alignment::PropertyMethods is a set of methods to treat
|
87
|
+
# the gap character and so on.
|
88
|
+
module PropertyMethods
  # Default regular expression for detecting gaps:
  # any single character that is not an ASCII letter.
  GAP_REGEXP = /[^a-zA-Z]/
  # Default gap character.
  GAP_CHAR = '-'.freeze
  # Default character for a missing or unknown site.
  MISSING_CHAR = '?'.freeze

  # Returns true if the given string matches gap_regexp,
  # otherwise returns false.
  # <em>s</em> is expected to be a String containing a single character.
  def is_gap?(s)
    (gap_regexp =~ s) ? true : false
  end

  # Returns the regular expression used for checking gaps:
  # the value set by <tt>gap_regexp=</tt>, or GAP_REGEXP if unset.
  def gap_regexp
    @gap_regexp || GAP_REGEXP
  end
  # Sets the regular expression used for checking gaps.
  attr_writer :gap_regexp

  # Returns the gap character:
  # the value set by <tt>gap_char=</tt>, or GAP_CHAR if unset.
  def gap_char
    @gap_char || GAP_CHAR
  end
  # Sets the gap character.
  attr_writer :gap_char

  # Returns the character used when a site is missing or unknown:
  # the value set by <tt>missing_char=</tt>, or MISSING_CHAR if unset.
  def missing_char
    @missing_char || MISSING_CHAR
  end
  # Sets the character used when a site is missing or unknown.
  attr_writer :missing_char

  # Returns the class of the sequence.
  # If instance variable @seqclass (which can be set by the
  # <tt>seqclass=</tt> method) is set, returns that value.
  # Otherwise, returns String.
  def seqclass
    @seqclass || String
  end

  # Sets the class of the sequence.
  # The value must be String or its derivatives.
  attr_writer :seqclass

  # Returns the properties defined in the object as a hash.
  # Only properties whose instance variable has been assigned are
  # included; the keys are :gap_regexp, :gap_char, :missing_char
  # and :seqclass.
  def get_all_property
    ret = {}
    ret[:gap_regexp]   = @gap_regexp   if defined? @gap_regexp
    ret[:gap_char]     = @gap_char     if defined? @gap_char
    ret[:missing_char] = @missing_char if defined? @missing_char
    ret[:seqclass]     = @seqclass     if defined? @seqclass
    ret
  end

  # Sets properties from the given hash and returns self.
  # <em>hash</em> would be a return value of the
  # <tt>get_all_property</tt> method.
  # Keys that are absent from <em>hash</em> leave the corresponding
  # property untouched.
  def set_all_property(hash)
    @gap_regexp   = hash[:gap_regexp]   if hash.has_key?(:gap_regexp)
    @gap_char     = hash[:gap_char]     if hash.has_key?(:gap_char)
    @missing_char = hash[:missing_char] if hash.has_key?(:missing_char)
    @seqclass     = hash[:seqclass]     if hash.has_key?(:seqclass)
    self
  end
end #module PropertyMethods
|
165
|
+
|
166
|
+
# Bio::Alignment::SiteMethods is a set of methods for
|
167
|
+
# Bio::Alignment::Site.
|
168
|
+
# It can also be used for extending an array of single-letter strings.
|
169
|
+
module SiteMethods
  include PropertyMethods

  # If there are gaps (as judged by is_gap?), returns true.
  # Otherwise, returns false.
  def has_gap?
    any? { |x| is_gap?(x) }
  end

  # Removes gaps in the site. (destructive method)
  # Returns self if at least one gap was found, otherwise nil.
  # Note that nil elements are also dropped, because removal is done
  # by replacing gaps with nil and then calling compact!.
  def remove_gaps!
    found = false
    collect! do |x|
      if is_gap?(x)
        found = true
        nil
      else
        x
      end
    end
    compact!
    found ? self : nil
  end

  # Returns consensus character of the site.
  # If consensus is found, returns a single-letter string.
  # If not, returns nil.
  #
  # The most frequent element is the consensus when its count is at
  # least <em>threshold</em> * (number of elements). Ties are broken
  # in favour of the element that appears first in the site.
  def consensus_string(threshold = 1.0)
    return nil if size <= 0
    # uniq alone is enough here; sorting first is unnecessary and
    # would raise when the elements are not mutually comparable.
    return self[0] if uniq.size == 1
    counts = Hash.new(0)
    each { |x| counts[x] += 1 }
    best_char, best_count =
      counts.sort_by { |c, n| [ -n, index(c) ] }.first
    (size * threshold <= best_count) ? best_char : nil
  end

  # IUPAC nucleotide groups. Internal use only.
  # The first element of each group is the IUPAC code and the rest
  # are the characters the code stands for.
  IUPAC_NUC = [
    %w( t u ),
    %w( m a c ),
    %w( r a g ),
    %w( w a t u ),
    %w( s c g ),
    %w( y c t u ),
    %w( k g t u ),
    %w( v a c g m r s ),
    %w( h a c t u m w y ),
    %w( d a g t u r w k ),
    %w( b c g t u s y k ),
    %w( n a c g t u m r w s y k v h d b )
  ]

  # Returns an IUPAC consensus base for the site.
  # If consensus is found, returns a single-letter string (lower case).
  # If not (including when the site is empty), returns nil.
  def consensus_iupac
    a = collect { |x| x.downcase }.uniq
    # Without this guard an empty site would match the first group
    # and wrongly yield 't'.
    return nil if a.empty?
    if a.size == 1
      case a[0]
      when 'a', 'c', 'g', 't'
        a[0]
      when 'u'
        't'
      else
        # a lone ambiguity code is kept only if it is a known IUPAC code
        IUPAC_NUC.find { |x| a[0] == x[0] } ? a[0] : nil
      end
    elsif (r = IUPAC_NUC.find { |x| (a - x).empty? })
      # first (i.e. narrowest listed) group that covers every base
      r[0]
    else
      nil
    end
  end

  # Table of strongly conserved amino-acid groups.
  #
  # The value of the tables are taken from BioPerl
  # (Bio/SimpleAlign.pm in BioPerl 1.0),
  # and the BioPerl's document says that
  # it is taken from Clustalw documentation and
  #   These are all the positively scoring groups that occur in the
  #   Gonnet Pam250 matrix. The strong and weak groups are
  #   defined as strong score >0.5 and weak score =<0.5 respectively.
  #
  StrongConservationGroups = %w(STA NEQK NHQK NDEQ QHRK MILV MILF
    HY FYW).collect { |x| x.split('').sort }

  # Table of weakly conserved amino-acid groups.
  #
  # Please refer StrongConservationGroups document
  # for the origin of the table.
  WeakConservationGroups = %w(CSA ATV SAG STNK STPA SGND SNDEQK
    NDEQHK NEQHRK FVLIM HFY).collect { |x| x.split('').sort }

  # Returns the match-line character for the site.
  # This is amino-acid version.
  #
  #   opt[:match_line_char]   ==> 100% equal    default: '*'
  #   opt[:strong_match_char] ==> strong match  default: ':'
  #   opt[:weak_match_char]   ==> weak  match   default: '.'
  #   opt[:mismatch_char]     ==> mismatch      default: ' '
  def match_line_amino(opt = {})
    mlc = (opt[:match_line_char]   || '*')
    smc = (opt[:strong_match_char] || ':')
    wmc = (opt[:weak_match_char]   || '.')
    mmc = (opt[:mismatch_char]     || ' ')
    # Gaps are tested on the site itself so that a gap_regexp
    # configured on this site is honoured.
    return mmc if has_gap?
    a = collect { |c| c.upcase }.uniq
    if a.size == 1
      mlc
    elsif StrongConservationGroups.find { |x| (a - x).empty? }
      smc
    elsif WeakConservationGroups.find { |x| (a - x).empty? }
      wmc
    else
      mmc
    end
  end

  # Returns the match-line character for the site.
  # This is nucleic-acid version.
  #
  #   opt[:match_line_char]   ==> 100% equal    default: '*'
  #   opt[:mismatch_char]     ==> mismatch      default: ' '
  def match_line_nuc(opt = {})
    mlc = (opt[:match_line_char] || '*')
    mmc = (opt[:mismatch_char]   || ' ')
    # Gaps are tested on the site itself so that a gap_regexp
    # configured on this site is honoured.
    return mmc if has_gap?
    a = collect { |c| c.upcase }.uniq
    (a.size == 1) ? mlc : mmc
  end
end #module SiteMethods
|
309
|
+
|
310
|
+
# Bio::Alignment::Site stores bases or amino-acids in a
|
311
|
+
# site of the alignment.
|
312
|
+
# It would store multiple String objects of length 1.
|
313
|
+
# Please refer to the document of Array and SiteMethods for methods.
|
314
|
+
class Site < Array
  # All site-specific behaviour comes from SiteMethods;
  # storage and iteration come from Array.
  include SiteMethods
end #class Site
|
317
|
+
|
318
|
+
# The module Bio::Alignment::EnumerableExtension is a set of useful
|
319
|
+
# methods for multiple sequence alignment.
|
320
|
+
# It can be included by any classes or can be extended to any objects.
|
321
|
+
# The classes or objects must have methods defined in Enumerable,
|
322
|
+
# and must have the <tt>each</tt> method
|
323
|
+
# which iterates over each sequence (or string) and yields
|
324
|
+
# a sequence (or string) object.
|
325
|
+
#
|
326
|
+
# Optionally, if <tt>each_seq</tt> method is defined,
|
327
|
+
# which iterates over each sequence (or string) and yields
|
328
|
+
# each sequence (or string) object, it is used instead of <tt>each</tt>.
|
329
|
+
#
|
330
|
+
# Note that the <tt>each</tt> or <tt>each_seq</tt> method would be
|
331
|
+
# called multiple times.
|
332
|
+
# This means that the module is not suitable for IO objects.
|
333
|
+
# In addition, <tt>break</tt> would be used in the given block and
|
334
|
+
# destructive methods would be used to the sequences.
|
335
|
+
#
|
336
|
+
# For Array or Hash objects, you had better use
|
337
|
+
# ArrayExtension or HashExtension modules, respectively.
|
338
|
+
# They would have built-in <tt>each_seq</tt> method and/or
|
339
|
+
# some methods would be redefined.
|
340
|
+
#
|
341
|
+
module EnumerableExtension
|
342
|
+
include PropertyMethods
|
343
|
+
|
344
|
+
# Iterates over each sequences.
|
345
|
+
# Yields a sequence.
|
346
|
+
# It acts the same as Enumerable#each.
|
347
|
+
#
|
348
|
+
# You would redefine the method suitable for the class/object.
|
349
|
+
def each_seq(&block) #:yields: seq
  # Default implementation: simply delegates to each, passing the
  # given block through unchanged.
  each(&block)
end
|
352
|
+
|
353
|
+
# Returns class of the sequence.
|
354
|
+
# If instance variable @seqclass (which can be
|
355
|
+
# set by 'seqclass=' method) is set, simply returns the value.
|
356
|
+
# Otherwise, returns the first sequence's class.
|
357
|
+
# If no sequences are found, returns String.
|
358
|
+
def seqclass
  # An explicitly configured class always wins.
  return @seqclass if @seqclass
  # Otherwise use the class of the first non-nil/non-false sequence.
  each_seq do |s|
    return s.class if s
  end
  # No usable sequence was found: fall back to String.
  String
end
|
372
|
+
|
373
|
+
# Returns the alignment length.
|
374
|
+
# Returns the longest length of the sequence in the alignment.
|
375
|
+
def alignment_length
  # Track the largest length reported by any sequence;
  # stays 0 when there are no sequences.
  longest = 0
  each_seq do |s|
    len = s.length
    longest = len if len > longest
  end
  longest
end
alias seq_length alignment_length
|
384
|
+
|
385
|
+
# Gets a site of the position.
|
386
|
+
# Returns a Bio::Alignment::Site object.
|
387
|
+
#
|
388
|
+
# If the position is out of range, it returns the site
|
389
|
+
# of which all are gaps.
|
390
|
+
#
|
391
|
+
# It is a private method.
|
392
|
+
# Only difference from public alignment_site method is
|
393
|
+
# it does not do <tt>set_all_property(get_all_property)</tt>.
|
394
|
+
def _alignment_site(position)
  site = Site.new
  each_seq do |s|
    c = s[position, 1]
    # s[position, 1] is nil or '' when position is outside the
    # sequence; such positions are filled with a gap of seqclass.
    c = seqclass.new(gap_char) if c.to_s.empty?
    site << c
  end
  site
end
private :_alignment_site
|
406
|
+
|
407
|
+
# Gets a site of the position.
|
408
|
+
# Returns a Bio::Alignment::Site object.
|
409
|
+
#
|
410
|
+
# If the position is out of range, it returns the site
|
411
|
+
# of which all are gaps.
|
412
|
+
def alignment_site(position)
  site = _alignment_site(position)
  # Copy this alignment's properties (as returned by
  # get_all_property) onto the new site.
  site.set_all_property(get_all_property)
  site
end
|
417
|
+
|
418
|
+
# Iterates over each site of the alignment.
|
419
|
+
# It yields a Bio::Alignment::Site object (which inherits Array).
|
420
|
+
# It returns self.
|
421
|
+
def each_site
  # Properties are fetched once and applied to every site.
  props = get_all_property
  alignment_length.times do |i|
    site = _alignment_site(i)
    site.set_all_property(props)
    yield(site)
  end
  self
end
|
430
|
+
|
431
|
+
# Iterates over each site of the alignment, with specifying
|
432
|
+
# start, stop positions and step.
|
433
|
+
# It yields Bio::Alignment::Site object (which inherits Array).
|
434
|
+
# It returns self.
|
435
|
+
# It is same as
|
436
|
+
# <tt>start.step(stop, step) { |i| yield alignment_site(i) }</tt>.
|
437
|
+
def each_site_step(start, stop, step = 1)
  # Properties are fetched once and applied to every site.
  props = get_all_property
  # Positions are generated by start.step(stop, step).
  start.step(stop, step) do |i|
    site = _alignment_site(i)
    site.set_all_property(props)
    yield(site)
  end
  self
end
|
446
|
+
|
447
|
+
# Iterates over each sequence and results running blocks
|
448
|
+
# are collected and returns a new alignment as a
|
449
|
+
# Bio::Alignment::SequenceArray object.
|
450
|
+
#
|
451
|
+
# Note that it would be redefined if you want to change
|
452
|
+
# return value's class.
|
453
|
+
#
|
454
|
+
def alignment_collect
  result = SequenceArray.new
  # The new alignment gets this alignment's properties.
  result.set_all_property(get_all_property)
  # Each sequence is passed to the block; the block's return value
  # becomes the corresponding element of the result.
  each_seq { |seq| result << yield(seq) }
  result
end
|
462
|
+
|
463
|
+
# Returns specified range of the alignment.
|
464
|
+
# For each sequence, the '[]' method (it may be String#[])
|
465
|
+
# is executed, and returns a new alignment
|
466
|
+
# as a Bio::Alignment::SequenceArray object.
|
467
|
+
#
|
468
|
+
# Unlike alignment_slice method, the result alignment are
|
469
|
+
# guaranteed to contain String object if the range specified
|
470
|
+
# is out of range.
|
471
|
+
#
|
472
|
+
# If you want to change return value's class, you should redefine
|
473
|
+
# alignment_collect method.
|
474
|
+
#
|
475
|
+
def alignment_window(*arg)
  alignment_collect do |s|
    # s[*arg] is nil when the requested range lies outside the
    # sequence; an empty seqclass instance is used instead.
    s[*arg] || seqclass.new('')
  end
end
alias window alignment_window
|
481
|
+
|
482
|
+
# Iterates over each sliding window of the alignment.
|
483
|
+
# window_size is the size of sliding window.
|
484
|
+
# step is the step of each sliding.
|
485
|
+
# It yields a Bio::Alignment::SequenceArray object which contains
|
486
|
+
# each sliding window.
|
487
|
+
# It returns a Bio::Alignment::SequenceArray object which contains
|
488
|
+
# remainder alignment at the terminal end.
|
489
|
+
# If window_size is smaller than 0, it returns nil.
|
490
|
+
def each_window(window_size, step_size = 1)
  # window_size::  size of each sliding window (nil is returned
  #                if it is negative)
  # step_size::    distance between window starts; a negative value
  #                slides from the tail towards the head
  # Yields the result of alignment_window(start, window_size) for
  # each window and returns the remainder that no window covered.
  # If no window fits at all, the whole alignment is the remainder.
  return nil if window_size < 0
  raise ArgumentError, 'step_size must not be 0' if step_size == 0
  # The last yielded start is recorded explicitly: a block parameter
  # does not update an outer local variable on Ruby 1.9 and later.
  last_start = nil
  if step_size > 0
    0.step(alignment_length - window_size, step_size) do |start|
      yield alignment_window(start, window_size)
      last_start = start
    end
    return alignment_window(0..-1) unless last_start
    alignment_window((last_start + window_size)..-1)
  else
    start = alignment_length - window_size
    while start >= 0
      yield alignment_window(start, window_size)
      last_start = start
      start += step_size
    end
    return alignment_window(0..-1) unless last_start
    alignment_window(0...last_start)
  end
end
|
507
|
+
|
508
|
+
# Iterates over each site of the alignment and results running the
|
509
|
+
# block are collected and returns an array.
|
510
|
+
# It yields a Bio::Alignment::Site object.
|
511
|
+
def collect_each_site
  # Gathers the block's return value for every site, in site order.
  results = []
  each_site { |site| results << yield(site) }
  results
end
|
518
|
+
|
519
|
+
# Helper method for calculating consensus sequence.
|
520
|
+
# It iterates over each site of the alignment.
|
521
|
+
# In each site, gaps will be removed if specified with opt.
|
522
|
+
# It yields a Bio::Alignment::Site object.
|
523
|
+
# Results running the block (String objects are expected)
|
524
|
+
# are joined to a string and it returns the string.
|
525
|
+
#
|
526
|
+
# opt[:gap_mode] ==> 0 -- gaps are regarded as normal characters
|
527
|
+
# 1 -- a site within gaps is regarded as a gap
|
528
|
+
# -1 -- gaps are eliminated from consensus calculation
|
529
|
+
# default: 0
|
530
|
+
#
|
531
|
+
def consensus_each_site(opt = {})
  # opt[:missing_char] overrides missing_char for sites where the
  # block returns nil/false. opt[:gap_mode] must be 0 (or nil), 1
  # or -1; anything else raises RuntimeError before any site is
  # visited.
  mchar = (opt[:missing_char] || missing_char)
  gap_mode = opt[:gap_mode]
  gap_mode = 0 if gap_mode.nil?
  unless [0, 1, -1].include?(gap_mode)
    raise ':gap_mode must be 0, 1 or -1'
  end
  collect_each_site do |site|
    case gap_mode
    when 1
      # a site containing any gap is reported as a gap
      next gap_char if site.has_gap?
    when -1
      # gaps are dropped first; a site made only of gaps is a gap
      site.remove_gaps!
      next gap_char if site.empty?
    end
    yield(site) || mchar
  end.join('')
end
|
552
|
+
|
553
|
+
# Returns the consensus string of the alignment.
|
554
|
+
# 0.0 <= threshold <= 1.0 is expected.
|
555
|
+
#
|
556
|
+
# It resembles the BioPerl's AlignI::consensus_string method.
|
557
|
+
#
|
558
|
+
# Please refer to the consensus_each_site method for opt.
|
559
|
+
#
|
560
|
+
def consensus_string(threshold = 1.0, opt = {})
  # Per-site consensus is delegated to the site's own
  # consensus_string; opt is passed on to consensus_each_site.
  consensus_each_site(opt) { |site| site.consensus_string(threshold) }
end
|
565
|
+
|
566
|
+
# Returns the IUPAC consensus string of the alignment
|
567
|
+
# of nucleic-acid sequences.
|
568
|
+
#
|
569
|
+
# It resembles the BioPerl's AlignI::consensus_iupac method.
|
570
|
+
#
|
571
|
+
# Please refer to the consensus_each_site method for opt.
|
572
|
+
#
|
573
|
+
def consensus_iupac(opt = {})
  # Per-site consensus is delegated to the site's own
  # consensus_iupac; opt is passed on to consensus_each_site.
  consensus_each_site(opt) { |site| site.consensus_iupac }
end
|
578
|
+
|
579
|
+
# Returns the match line string of the alignment
|
580
|
+
# of amino-acid sequences.
|
581
|
+
#
|
582
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
583
|
+
#
|
584
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
585
|
+
# opt[:strong_match_char] ==> strong match default: ':'
|
586
|
+
# opt[:weak_match_char] ==> weak match default: '.'
|
587
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
588
|
+
#
|
589
|
+
# More opt can be accepted.
|
590
|
+
# Please refer to the consensus_each_site method for opt.
|
591
|
+
#
|
592
|
+
def match_line_amino(opt = {})
  # One match-line character per site, joined into a single string.
  chars = collect_each_site { |site| site.match_line_amino(opt) }
  chars.join('')
end
|
597
|
+
|
598
|
+
# Returns the match line string of the alignment
|
599
|
+
# of nucleic-acid sequences.
|
600
|
+
#
|
601
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
602
|
+
#
|
603
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
604
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
605
|
+
#
|
606
|
+
# More opt can be accepted.
|
607
|
+
# Please refer to the consensus_each_site method for opt.
|
608
|
+
#
|
609
|
+
def match_line_nuc(opt = {})
  # One match-line character per site, joined into a single string.
  chars = collect_each_site { |site| site.match_line_nuc(opt) }
  chars.join('')
end
|
614
|
+
|
615
|
+
# Returns the match line string of the alignment
|
616
|
+
# of nucleic- or amino-acid sequences.
|
617
|
+
# The type of the sequence is automatically determined
|
618
|
+
# or you can specify with opt[:type].
|
619
|
+
#
|
620
|
+
# It resembles the BioPerl's AlignI::match_line method.
|
621
|
+
#
|
622
|
+
# opt[:type] ==> :na or :aa (or determined by sequence class)
|
623
|
+
# opt[:match_line_char] ==> 100% equal default: '*'
|
624
|
+
# opt[:strong_match_char] ==> strong match default: ':'
|
625
|
+
# opt[:weak_match_char] ==> weak match default: '.'
|
626
|
+
# opt[:mismatch_char] ==> mismatch default: ' '
|
627
|
+
# :strong_ and :weak_match_char are used only in amino mode (:aa)
|
628
|
+
#
|
629
|
+
# More opt can be accepted.
|
630
|
+
# Please refer to the consensus_each_site method for opt.
|
631
|
+
#
|
632
|
+
def match_line(opt = {})
  # Decide between the amino-acid and the nucleic-acid match line.
  # An explicit opt[:type] wins; otherwise the sequence class is
  # consulted, and as a last resort the sequences are scanned for
  # the letters E, F, I, L, P or Q (case-insensitive).
  amino =
    case opt[:type]
    when :aa
      true
    when :na, :dna, :rna
      false
    else
      klass = seqclass
      if klass == Bio::Sequence::AA
        true
      elsif klass == Bio::Sequence::NA
        false
      else
        # Iterate with each_seq (not each) so that containers whose
        # each yields something other than a sequence are handled.
        found = false
        each_seq do |s|
          if /[EFILPQ]/i =~ s
            found = true
            break
          end
        end
        found
      end
    end
  amino ? match_line_amino(opt) : match_line_nuc(opt)
end
|
655
|
+
|
656
|
+
# This is the BioPerl's AlignI::match like method.
|
657
|
+
#
|
658
|
+
# Changes second to last sequences' sites to match_char(default: '.')
|
659
|
+
# when a site is equal to the first sequence's corresponding site.
|
660
|
+
#
|
661
|
+
# Note that it is a destructive method.
|
662
|
+
#
|
663
|
+
# For Hash, please use it carefully because
|
664
|
+
# the order of the sequences is inconstant.
|
665
|
+
#
|
666
|
+
def convert_match(match_char = '.')
  #(BioPerl) AlignI::match like method
  # The first sequence yielded by each_seq is the reference and is left
  # untouched; in every later sequence a site equal to the reference's
  # (non-gap) site is overwritten with match_char. Destructive.
  len = alignment_length
  reference = nil
  each_seq do |seq|
    unless reference
      reference = seq
      next
    end
    (0...len).each do |i|
      next unless seq[i] && reference[i] == seq[i]
      next if is_gap?(reference[i..i])
      seq[i..i] = match_char
    end
  end
  self
end
|
683
|
+
|
684
|
+
# This is the BioPerl's AlignI::unmatch like method.
|
685
|
+
#
|
686
|
+
# Changes second to last sequences' sites match_char(default: '.')
|
687
|
+
# to original sites' characters.
|
688
|
+
#
|
689
|
+
# Note that it is a destructive method.
|
690
|
+
#
|
691
|
+
# For Hash, please use it carefully because
|
692
|
+
# the order of the sequences is inconstant.
|
693
|
+
#
|
694
|
+
def convert_unmatch(match_char = '.')
  #(BioPerl) AlignI::unmatch like method
  # Inverse of convert_match: in every sequence after the first, a site
  # equal to match_char is replaced by the first sequence's character at
  # that position. Destructive.
  len = alignment_length
  reference = nil
  each_seq do |seq|
    unless reference
      reference = seq
      next
    end
    (0...len).each do |i|
      next unless seq[i..i] == match_char
      seq[i..i] = (reference[i..i] or match_char)
    end
  end
  self
end
|
711
|
+
|
712
|
+
# Fills gaps to the tail of each sequence if the length of
|
713
|
+
# the sequence is shorter than the alignment length.
|
714
|
+
#
|
715
|
+
# Note that it is a destructive method.
|
716
|
+
def alignment_normalize!
  #(original)
  # Pads every sequence shorter than the alignment length with gap_char
  # at its tail. Destructive; returns self.
  len = alignment_length
  each_seq do |seq|
    shortfall = len - seq.length
    seq << (gap_char * shortfall) if shortfall > 0
  end
  self
end
|
724
|
+
alias normalize! alignment_normalize!
|
725
|
+
|
726
|
+
# Removes excess gaps in the tail of the sequences.
|
727
|
+
# If removes nothing, returns nil.
|
728
|
+
# Otherwise, returns self.
|
729
|
+
#
|
730
|
+
# Note that it is a destructive method.
|
731
|
+
def alignment_rstrip!
  #(String-like)
  # Walks the sites from the last column backwards and counts how many
  # trailing columns contain nothing but gaps, then cuts every sequence
  # down to the remaining length. Returns nil when nothing was removed.
  len = alignment_length
  newlen = len
  each_site_step(len - 1, 0, -1) do |site|
    site.remove_gaps!
    break unless site.empty?
    newlen -= 1
  end
  return nil if newlen >= len
  each_seq do |seq|
    seq[newlen..-1] = '' if seq.length > newlen
  end
  self
end
|
749
|
+
alias rstrip! alignment_rstrip!
|
750
|
+
|
751
|
+
# Removes excess gaps in the head of the sequences.
|
752
|
+
# If removes nothing, returns nil.
|
753
|
+
# Otherwise, returns self.
|
754
|
+
#
|
755
|
+
# Note that it is a destructive method.
|
756
|
+
def alignment_lstrip!
  #(String-like)
  # Counts the leading columns that contain nothing but gaps and removes
  # that many characters from the head of every sequence.
  # Returns nil when nothing was removed.
  pos = 0
  each_site do |site|
    site.remove_gaps!
    break unless site.empty?
    pos += 1
  end
  return nil if pos <= 0
  each_seq { |seq| seq[0, pos] = '' }
  self
end
|
771
|
+
alias lstrip! alignment_lstrip!
|
772
|
+
|
773
|
+
# Removes excess gaps in the sequences.
|
774
|
+
# If removes nothing, returns nil.
|
775
|
+
# Otherwise, returns self.
|
776
|
+
#
|
777
|
+
# Note that it is a destructive method.
|
778
|
+
def alignment_strip!
  #(String-like)
  # Both strips are always executed (tail first, then head); the result
  # is non-nil when at least one of them removed something.
  stripped_tail = alignment_rstrip!
  stripped_head = alignment_lstrip!
  (stripped_tail or stripped_head)
end
|
784
|
+
alias strip! alignment_strip!
|
785
|
+
|
786
|
+
# Completely removes ALL gaps in the sequences.
|
787
|
+
# If removes nothing, returns nil.
|
788
|
+
# Otherwise, returns self.
|
789
|
+
#
|
790
|
+
# Note that it is a destructive method.
|
791
|
+
def remove_all_gaps!
  # Deletes every match of gap_regexp from every sequence in place.
  # Returns self when at least one sequence changed, nil otherwise
  # (String#gsub! returns nil when it made no substitution).
  changed = false
  each_seq do |seq|
    changed = true if seq.gsub!(gap_regexp, '')
  end
  changed ? self : nil
end
|
799
|
+
|
800
|
+
# Returns the specified range of the alignment.
|
801
|
+
# For each sequence, the 'slice' method (it may be String#slice,
|
802
|
+
# which is the same as String#[]) is executed, and
|
803
|
+
# returns a new alignment as a Bio::Alignment::SequenceArray object.
|
804
|
+
#
|
805
|
+
# Unlike alignment_window method, the result alignment
|
806
|
+
# might contain nil.
|
807
|
+
#
|
808
|
+
# If you want to change return value's class, you should redefine
|
809
|
+
# alignment_collect method.
|
810
|
+
#
|
811
|
+
def alignment_slice(*arg)
  #(String-like)
  #(BioPerl) AlignI::slice like method
  # Calls slice(*arg) on every sequence and gathers the results through
  # alignment_collect, so individual entries may be nil.
  alignment_collect { |seq| seq.slice(*arg) }
end
|
818
|
+
alias slice alignment_slice
|
819
|
+
|
820
|
+
# For each sequence, the 'subseq' method (Bio::Sequence#subseq is
|
821
|
+
# expected) is executed, and returns a new alignment as
|
822
|
+
# a Bio::Alignment::SequenceArray object.
|
823
|
+
#
|
824
|
+
# All sequences in the alignment are expected to be kind of
|
825
|
+
# Bio::Sequence objects.
|
826
|
+
#
|
827
|
+
# Unlike alignment_window method, the result alignment
|
828
|
+
# might contain nil.
|
829
|
+
#
|
830
|
+
# If you want to change return value's class, you should redefine
|
831
|
+
# alignment_collect method.
|
832
|
+
#
|
833
|
+
def alignment_subseq(*arg)
  #(original)
  # Calls subseq(*arg) on every sequence and gathers the results through
  # alignment_collect.
  alignment_collect { |seq| seq.subseq(*arg) }
end
|
839
|
+
alias subseq alignment_subseq
|
840
|
+
|
841
|
+
# Concatenates the given alignment.
|
842
|
+
# <em>align</em> must have <tt>each_seq</tt>
|
843
|
+
# or <tt>each</tt> method.
|
844
|
+
#
|
845
|
+
# Returns self.
|
846
|
+
#
|
847
|
+
# Note that it is a destructive method.
|
848
|
+
#
|
849
|
+
# For Hash, please use it carefully because
|
850
|
+
# the order of the sequences is inconstant and
|
851
|
+
# key information is completely ignored.
|
852
|
+
#
|
853
|
+
def alignment_concat(align)
  # Appends the i-th sequence of align to the i-th sequence of self.
  # align.each_seq is tried first; when that call fails with
  # NoMethodError/ArgumentError before yielding anything, align.each is
  # used instead. Positions where either side is missing are skipped.
  targets = []
  each_seq { |s| targets << s }
  i = 0
  yielded = false
  begin
    align.each_seq do |seq|
      yielded = true
      targets[i].concat(seq) if targets[i] and seq
      i += 1
    end
    return self
  rescue NoMethodError, ArgumentError => evar
    # Once a sequence has been yielded the error came from the
    # concatenation itself, not from a missing each_seq: re-raise it.
    raise evar if yielded
  end
  align.each do |seq|
    targets[i].concat(seq) if targets[i] and seq
    i += 1
  end
  self
end
|
874
|
+
end #module EnumerableExtension
|
875
|
+
|
876
|
+
# ClustalWFormatter is a module to create ClustalW-formatted text
|
877
|
+
# from an alignment object.
|
878
|
+
#
|
879
|
+
# It will be obsoleted and the methods will be frequently changed.
|
880
|
+
module ClustalWFormatter
  # Check whether there are same names.
  # Names are compared after cutting them at the first whitespace/NUL,
  # truncating to +len+ characters and replacing each of ":;,()" by "_".
  #
  # array:: names of the sequences (array of string)
  # len::   length to check (default:30)
  #
  # Returns a sorted array of the indexes whose names collide,
  # or false when all names are distinct.
  def have_same_name?(array, len = 30)
    normalized = array.collect do |k|
      # character class: every one of : ; , ( ) is replaced individually
      k.to_s.split(/[\x00\s]/)[0].to_s[0, len].gsub(/[\:\;\,\(\)]/, '_').to_s
    end
    positions = Hash.new { |h, name| h[name] = [] }
    normalized.each_with_index { |name, i| positions[name] << i }
    dupidx = positions.values.select { |idx| idx.size > 1 }.flatten.sort
    dupidx.empty? ? false : dupidx
  end
  private :have_same_name?

  # Changes sequence names if there are conflicted names.
  # First, whitespace inside the first +len+ characters of conflicting
  # names is replaced by "_"; when names still collide, every name is
  # prefixed with its index.
  #
  # array:: names of the sequences (array of string)
  # len::   length to check (default:30)
  #
  # Returns a new array of names.
  def avoid_same_name(array, len = 30)
    na = array.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    dupidx = have_same_name?(na, len)
    return na unless dupidx

    dupidx.each do |i|
      s = na[i]
      na[i] = s[0, len].to_s.gsub(/\s/, '_') + s[len..-1].to_s
    end
    if have_same_name?(na, len)
      na.each_with_index { |s, i| na[i] = "#{i}_#{s}" }
    end
    na
  end
  private :avoid_same_name

  # Generates ClustalW-formatted text
  # seqs::    sequences (must be an alignment object)
  # names::   names of the sequences
  # options:: options; the keys read here are :replace_space, :escape,
  #           :split, :avoid_same_name, :gap_char, :type, :match_line,
  #           :case and :seqnos
  def clustalw_formatter(seqs, names, options = {})
    #(original)
    aln = [ "CLUSTAL (0.00) multiple sequence alignment\n\n" ]
    len = seqs.seq_length
    sn = names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
    if options[:replace_space]
      sn.collect! { |x| x.gsub(/\s/, '_') }
    end
    # :escape, :split and :avoid_same_name are on unless explicitly disabled
    if !options.has_key?(:escape) or options[:escape]
      sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
    end
    if !options.has_key?(:split) or options[:split]
      sn.collect! { |x| x.split(/\s/)[0].to_s }
    end
    if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
      sn = avoid_same_name(sn)
    end

    # long names switch to the wide-name layout
    if sn.find { |x| x.length > 10 }
      seqwidth = 50
      namewidth = 30
    else
      seqwidth = 60
      namewidth = 10
    end
    sep = ' ' * 6
    seqregexp = Regexp.new("(.{1,#{seqwidth}})")
    gchar = (options[:gap_char] or '-')

    case options[:type].to_s
    when /protein/i, /aa/i
      mopt = { :type => :aa }
    when /na/i
      mopt = { :type => :na }
    else
      mopt = {}
    end
    # dup: the line is padded and wrapped in place below, which must not
    # touch (or fail on a frozen) string supplied by the caller
    mline = (options[:match_line] or seqs.match_line(mopt)).dup

    aseqs = seqs.collect do |s|
      s.to_s.gsub(seqs.gap_regexp, gchar)
    end
    case options[:case].to_s
    when /lower/i
      aseqs.each { |s| s.downcase! }
    when /upper/i
      aseqs.each { |s| s.upcase! }
    end

    # the match line is formatted like one more (nameless) sequence
    aseqs << mline
    aseqs.collect! do |s|
      snx = sn.shift
      head = snx.to_s.ljust(namewidth)[0, namewidth] + sep
      s << (gchar * (len - s.length)) if s.length < len
      s.gsub!(seqregexp, "\\1\n")
      a = s.split(/^/)
      if options[:seqnos] and snx
        # append the running count of non-gap residues to each line
        i = 0
        a.each do |x|
          x.chomp!
          l = x.tr(gchar, '').length
          i += l
          x.concat(l > 0 ? " #{i}\n" : "\n")
        end
      end
      a.collect { |x| head + x }
    end
    lines = (len + seqwidth - 1).div(seqwidth)
    lines.times do
      aln << "\n"
      aseqs.each { |a| aln << a.shift }
    end
    aln.join('')
  end
  private :clustalw_formatter
end #module ClustalWFormatter
|
1028
|
+
|
1029
|
+
|
1030
|
+
# Bio::Alignment::ArrayExtension is a set of useful methods for
|
1031
|
+
# multiple sequence alignment.
|
1032
|
+
# It is designed to be extended to array objects or
|
1033
|
+
# included in your own classes which inherit Array.
|
1034
|
+
# (It can also be included in Array, though not recommended.)
|
1035
|
+
#
|
1036
|
+
# It possesses all methods defined in EnumerableExtension.
|
1037
|
+
# For usage of methods, please refer to EnumerableExtension.
|
1038
|
+
module ArrayExtension
  include EnumerableExtension

  # Iterates over each sequences.
  # Yields a sequence.
  #
  # It works the same as Array#each.
  def each_seq(&block) #:yields: seq
    each(&block)
  end

  include ClustalWFormatter

  # Returns a string of Clustal W formatted text of the alignment.
  # The element indexes (0, 1, 2, ...) are used as the sequence names.
  def to_clustal(options = {})
    clustalw_formatter(self, (0...(self.size)).to_a, options)
  end
end #module ArrayExtension
|
1055
|
+
|
1056
|
+
# Bio::Alignment::HashExtension is a set of useful methods for
|
1057
|
+
# multiple sequence alignment.
|
1058
|
+
# It is designed to be extended to hash objects or
|
1059
|
+
# included in your own classes which inherit Hash.
|
1060
|
+
# (It can also be included in Hash, though not recommended.)
|
1061
|
+
#
|
1062
|
+
# It possesses all methods defined in EnumerableExtension.
|
1063
|
+
# For usage of methods, please refer to EnumerableExtension.
|
1064
|
+
#
|
1065
|
+
# Because SequenceHash#alignment_collect is redefined,
|
1066
|
+
# some methods' return value's class are changed to
|
1067
|
+
# SequenceHash instead of SequenceArray.
|
1068
|
+
#
|
1069
|
+
# Because the order of the objects in a hash is inconstant,
|
1070
|
+
# some methods strictly affected with the order of objects
|
1071
|
+
# might not work correctly,
|
1072
|
+
# e.g. EnumerableExtension#convert_match and #convert_unmatch.
|
1073
|
+
module HashExtension
  include EnumerableExtension

  # Iterates over each sequences.
  # Yields a sequence.
  #
  # It works the same as Hash#each_value.
  def each_seq(&block) #:yields: seq
    each_value(&block)
  end

  # Iterates over each sequence and each results running block
  # are collected and returns a new alignment as a
  # Bio::Alignment::SequenceHash object (keys are kept, properties are
  # copied from self).
  #
  # Note that it would be redefined if you want to change
  # return value's class.
  def alignment_collect
    a = SequenceHash.new
    a.set_all_property(get_all_property)
    each_pair do |key, str|
      a.store(key, yield(str))
    end
    a
  end

  # Concatenates the given alignment.
  # If <em>align</em> has <tt>each_pair</tt> (Hash or SequenceHash),
  # sequences of same keys are concatenated; keys unknown to self are
  # ignored.
  # Otherwise, <tt>each_seq</tt> and then <tt>each</tt> are tried and
  # sequences are concatenated by position.
  #
  # Returns self.
  #
  # Note that it is a destructive method.
  def alignment_concat(align)
    yielded = false
    begin
      align.each_pair do |key, seq|
        yielded = true
        origseq = self[key]
        origseq.concat(seq) if origseq
      end
      return self
    rescue NoMethodError, ArgumentError => evar
      # an error after the first yield is a real failure, not a
      # missing each_pair
      raise evar if yielded
    end

    targets = values
    i = 0
    begin
      align.each_seq do |seq|
        yielded = true
        targets[i].concat(seq) if targets[i] and seq
        i += 1
      end
      return self
    rescue NoMethodError, ArgumentError => evar
      raise evar if yielded
    end

    align.each do |seq|
      targets[i].concat(seq) if targets[i] and seq
      i += 1
    end
    self
  end

  include ClustalWFormatter

  # Returns a string of Clustal W formatted text of the alignment.
  # The hash keys are used as the sequence names.
  def to_clustal(options = {})
    seqs = SequenceArray.new
    names = self.keys
    names.each { |k| seqs << self[k] }
    clustalw_formatter(seqs, names, options)
  end
end #module HashExtension
|
1154
|
+
|
1155
|
+
# Bio::Alignment::SequenceArray is a container class of
|
1156
|
+
# multiple sequence alignment.
|
1157
|
+
# Since it inherits Array, it acts completely same as Array.
|
1158
|
+
# In addition, methods defined in ArrayExtension and EnumerableExtension
|
1159
|
+
# can be used.
|
1160
|
+
class SequenceArray < Array
  # All alignment behaviour comes from the mixin.
  include ArrayExtension
end #class SequenceArray
|
1163
|
+
|
1164
|
+
# Bio::Alignment::SequenceHash is a container class of
|
1165
|
+
# multiple sequence alignment.
|
1166
|
+
# Since it inherits Hash, it acts completely same as Hash.
|
1167
|
+
# In addition, methods defined in HashExtension and EnumerableExtension
|
1168
|
+
# can be used.
|
1169
|
+
class SequenceHash < Hash
  # All alignment behaviour comes from the mixin.
  include HashExtension
end #class SequenceHash
|
1172
|
+
|
1173
|
+
# Bio::Alignment::OriginalPrivate is a set of private methods
|
1174
|
+
# for Bio::Alignment::OriginalAlignment.
|
1175
|
+
module OriginalPrivate

  # Gets the sequence from given object.
  # A Bio::Sequence is returned as is. Otherwise the methods seq, naseq
  # and aaseq are tried in this order and the first non-nil/false result
  # is returned; when none of them yields a value, obj itself is returned.
  def extract_seq(obj)
    return obj if obj.is_a?(Bio::Sequence)
    [ :seq, :naseq, :aaseq ].each do |m|
      begin
        seq = obj.send(m)
      rescue NameError, ArgumentError
        # obj does not provide this accessor (or needs arguments)
        seq = nil
      end
      return seq if seq
    end
    obj
  end
  module_function :extract_seq

  # Gets the name or the definition of the sequence from given object.
  # The methods definition and entry_id are tried in this order;
  # returns nil when neither yields a value.
  def extract_key(obj)
    [ :definition, :entry_id ].each do |m|
      begin
        sn = obj.send(m)
      rescue NameError, ArgumentError
        sn = nil
      end
      return sn if sn
    end
    nil
  end
  module_function :extract_key
end #module OriginalPrivate
|
1212
|
+
|
1213
|
+
# Bio::Alignment::OriginalAlignment is
|
1214
|
+
# the BioRuby original multiple sequence alignment container class.
|
1215
|
+
# It includes HashExtension.
|
1216
|
+
#
|
1217
|
+
# It is recommended only to use methods defined in EnumerableExtension
|
1218
|
+
# (and the each_seq method).
|
1219
|
+
# The method only defined in this class might be obsoleted in the future.
|
1220
|
+
#
|
1221
|
+
class OriginalAlignment
|
1222
|
+
|
1223
|
+
include Enumerable
|
1224
|
+
include HashExtension
|
1225
|
+
include OriginalPrivate
|
1226
|
+
|
1227
|
+
# Read files and creates a new alignment object.
|
1228
|
+
#
|
1229
|
+
# It will be obsoleted.
|
1230
|
+
def self.readfiles(*files)
  # Opens every given file with Bio::FlatFile and adds all of its
  # entries to one new alignment, which is returned.
  require 'bio/io/flatfile'
  aln = self.new
  files.each do |fn|
    Bio::FlatFile.open(nil, fn) { |ff| aln.add_sequences(ff) }
  end
  aln
end
|
1240
|
+
|
1241
|
+
# Creates a new alignment object from given arguments.
|
1242
|
+
#
|
1243
|
+
# It will be obsoleted.
|
1244
|
+
def self.new2(*arg)
  # Same as new, but the sequences are given as separate arguments
  # instead of one array.
  self.new(arg)
end
|
1247
|
+
|
1248
|
+
# Creates a new alignment object.
|
1249
|
+
# <em>seqs</em> may be one of follows:
|
1250
|
+
# an array of sequences (or strings),
|
1251
|
+
# an array of sequence database objects,
|
1252
|
+
# an alignment object.
|
1253
|
+
def initialize(seqs = [])
  # @seqs maps key => sequence; @keys keeps the keys in insertion order.
  @seqs = {}
  @keys = []
  self.add_sequences(seqs)
end
|
1258
|
+
|
1259
|
+
# If <em>x</em> is the same value, returns true.
|
1260
|
+
# Otherwise, returns false.
|
1261
|
+
def ==(x)
  #(original)
  # Equal only to objects of the same class (or a subclass) whose
  # key => sequence hash is equal; the key order is not compared.
  if x.is_a?(self.class)
    self.to_hash == x.to_hash
  else
    false
  end
end
|
1269
|
+
|
1270
|
+
# convert to hash
|
1271
|
+
def to_hash
  #(Hash-like)
  # Returns the internal key => sequence hash itself (not a copy).
  @seqs
end
|
1275
|
+
|
1276
|
+
# Adds sequences to the alignment.
|
1277
|
+
# <em>seqs</em> may be one of follows:
|
1278
|
+
# an array of sequences (or strings),
|
1279
|
+
# an array of sequence database objects,
|
1280
|
+
# an alignment object.
|
1281
|
+
def add_sequences(seqs)
  # Four input shapes are handled:
  # * with a block: each element is passed to the block, which must
  #   return [ sequence, key ];
  # * an alignment of the same class: pairs are stored unchanged;
  # * anything with each_pair: values go through extract_seq;
  # * anything else with each: extract_seq / extract_key are used.
  # Returns self.
  if block_given?
    seqs.each do |x|
      s, key = yield x
      self.store(key, s)
    end
  elsif seqs.is_a?(self.class)
    seqs.each_pair { |k, s| self.store(k, s) }
  elsif seqs.respond_to?(:each_pair)
    seqs.each_pair { |k, x| self.store(k, extract_seq(x)) }
  else
    seqs.each do |x|
      s = extract_seq(x)
      k = extract_key(x)
      self.store(k, s)
    end
  end
  self
end
|
1307
|
+
|
1308
|
+
# identifiers (or definitions or names) of the sequences
|
1309
|
+
attr_reader :keys
|
1310
|
+
|
1311
|
+
# stores a sequences with the name
|
1312
|
+
# key:: name of the sequence
|
1313
|
+
# seq:: sequence
|
1314
|
+
def __store__(key, seq)
  #(Hash-like)
  # The key is taken back out of a one-element hash so that @keys holds
  # exactly the object a Hash stores as key.
  # No duplicate check is done here; an existing key is appended to
  # @keys again.
  pair = { key => seq }
  @keys << pair.keys[0]
  @seqs.update(pair)
  seq
end
|
1321
|
+
|
1322
|
+
# stores a sequence with <em>key</em>
|
1323
|
+
# (name or definition of the sequence).
|
1324
|
+
# Unlike <tt>__store__</tt> method, the method doesn't allow
|
1325
|
+
# same keys.
|
1326
|
+
# If the key is already used, returns nil.
|
1327
|
+
# When succeeded, returns key.
|
1328
|
+
def store(key, seq)
  #(Hash-like) returns key instead of seq
  # A nil key, or a key that is already in use, is replaced by the next
  # free integer serial number, so the existing entry is preserved and
  # the new sequence is still stored.
  # NOTE(review): the comment above this method says nil is returned for
  # a used key; the code returns the serial key instead -- confirm which
  # one is intended.
  key = nil if @seqs.has_key?(key)
  unless key
    @serial = 0 unless defined?(@serial)
    @serial = @seqs.size if @seqs.size > @serial
    @serial += 1 while @seqs.has_key?(@serial)
    key = @serial
  end
  self.__store__(key, seq)
  key
end
|
1348
|
+
|
1349
|
+
# Reconstructs internal data structure.
|
1350
|
+
# (Like Hash#rehash)
|
1351
|
+
def rehash
  # Rebuilds the hash index and re-synchronises @keys with it:
  # keys no longer present in @seqs are dropped (order of the others is
  # kept), keys present only in @seqs are appended.
  @seqs.rehash
  remaining = @seqs.keys
  @keys.collect! { |k| remaining.delete(k) }
  @keys.compact!
  @keys.concat(remaining)
  self
end
|
1362
|
+
|
1363
|
+
# Prepends seq (with key) to the front of the alignment.
|
1364
|
+
# (Like Array#unshift)
|
1365
|
+
def unshift(key, seq)
  #(Array-like)
  # store appends the key at the tail of @keys; move it to the head.
  # Returns the key actually used.
  self.store(key, seq)
  moved = @keys.pop
  @keys.unshift(moved)
  moved
end
|
1372
|
+
|
1373
|
+
# Removes the first sequence in the alignment and
|
1374
|
+
# returns [ key, seq ].
|
1375
|
+
def shift
  # Takes the first key off @keys and removes its sequence from @seqs.
  # Returns nil when there is no key left.
  k = @keys.shift
  return nil unless k
  [ k, @seqs.delete(k) ]
end
|
1384
|
+
|
1385
|
+
# Gets the <em>n</em>-th sequence.
|
1386
|
+
# If not found, returns nil.
|
1387
|
+
def order(n)
  #(original)
  # Looks up the n-th key in insertion order and returns its sequence.
  @seqs[@keys[n]]
end
|
1391
|
+
|
1392
|
+
# Removes the sequence whose key is <em>key</em>.
|
1393
|
+
# Returns the removed sequence.
|
1394
|
+
# If not found, returns nil.
|
1395
|
+
def delete(key)
  #(Hash-like)
  # Removes the key from the ordered key list and from the hash;
  # the value of Hash#delete (the sequence or nil) is returned.
  @keys.delete(key)
  @seqs.delete(key)
end
|
1400
|
+
|
1401
|
+
# Returns sequences. (Like Hash#values)
|
1402
|
+
def values
  #(Hash-like)
  # Sequences in the order of @keys.
  @keys.collect { |k| @seqs[k] }
end
|
1406
|
+
|
1407
|
+
# Adds a sequence without key.
|
1408
|
+
# The key is automatically determined.
|
1409
|
+
def <<(seq)
  #(Array-like)
  # Stores with a nil key, so store picks the key. Returns self,
  # which allows chaining.
  self.store(nil, seq)
  self
end
|
1414
|
+
|
1415
|
+
# Gets a sequence. (Like Hash#[])
|
1416
|
+
def [](*arg)
  #(Hash-like)
  # Delegates to the internal hash.
  @seqs[*arg]
end
|
1420
|
+
|
1421
|
+
# Number of sequences in the alignment.
|
1422
|
+
def size
  #(Hash&Array-like)
  # Number of entries in the internal hash.
  @seqs.size
end
|
1426
|
+
|
1427
|
+
# If the key exists, returns true. Otherwise, returns false.
|
1428
|
+
# (Like Hash#has_key?)
|
1429
|
+
def has_key?(key)
  #(Hash-like)
  # Delegates to the internal hash.
  @seqs.has_key?(key)
end
|
1433
|
+
|
1434
|
+
# Iterates over each sequence.
|
1435
|
+
# (Like Array#each)
|
1436
|
+
def each
  #(Array-like)
  # Yields the sequences in the order of @keys.
  @keys.each { |k| yield @seqs[k] }
end
|
1442
|
+
alias each_seq each
|
1443
|
+
|
1444
|
+
# Iterates over each key and sequence.
|
1445
|
+
# (Like Hash#each_pair)
|
1446
|
+
def each_pair
  #(Hash-like)
  # Yields key and sequence in the order of @keys.
  @keys.each { |k| yield k, @seqs[k] }
end
|
1452
|
+
|
1453
|
+
# Iterates over each sequence, replacing the sequence with the
|
1454
|
+
# value returned by the block.
|
1455
|
+
def collect!
  #(Array-like)
  # Replaces every sequence, in the order of @keys, by the block's
  # result for it. Keys are left unchanged.
  @keys.each do |k|
    @seqs[k] = yield @seqs[k]
  end
end
|
1461
|
+
|
1462
|
+
###--
|
1463
|
+
### note that 'collect' and 'to_a' is defined in Enumerable
|
1464
|
+
###
|
1465
|
+
### instance-variable-related methods
|
1466
|
+
###++
|
1467
|
+
|
1468
|
+
# Creates new alignment. Internal use only.
|
1469
|
+
def new(*arg)
  # Instance-level factory: builds another object of the receiver's
  # class and copies the receiver's properties onto it.
  created = self.class.new(*arg)
  created.set_all_property(get_all_property)
  created
end
|
1474
|
+
protected :new
|
1475
|
+
|
1476
|
+
# Duplicates the alignment
|
1477
|
+
def dup
  #(Hash-like)
  # Builds a new alignment from self through the instance-level new;
  # the sequence objects themselves are shared, not copied.
  self.new(self)
end
|
1481
|
+
|
1482
|
+
#--
|
1483
|
+
# methods below should not access instance variables
|
1484
|
+
#++
|
1485
|
+
|
1486
|
+
# Merges given alignment and returns a new alignment.
|
1487
|
+
def merge(*other)
  #(Hash-like)
  # Non-destructive merge!: works on a new alignment built from self.
  # NOTE(review): a block given to merge is not forwarded to merge!.
  merged = self.new(self)
  merged.merge!(*other)
  merged
end
|
1493
|
+
|
1494
|
+
# Merge given alignment.
|
1495
|
+
# Note that it is destructive method.
|
1496
|
+
def merge!(*other)
  #(Hash-like)
  # For every pair of every given alignment:
  # * unknown key: the pair is stored;
  # * known key, block given: the block receives (key, old, new) and its
  #   result replaces the sequence in place (position is kept);
  # * known key, no block: the old entry is deleted and the new one is
  #   stored, so the key moves to the end.
  # Returns self.
  other.each do |aln|
    aln.each_pair do |k, s|
      if !self.has_key?(k)
        self.store(k, s)
      elsif block_given?
        s = yield k, self[k], s
        self.to_hash.store(k, s)
      else
        self.delete(k)
        self.store(k, s)
      end
    end
  end
  self
end
|
1519
|
+
|
1520
|
+
# Returns the key for a given sequence. If not found, returns nil.
|
1521
|
+
def index(seq)
  #(Hash-like)
  # Returns the key of the first sequence equal to seq, or nil.
  # Sequences of the same class are compared with ==, others by their
  # string representation.
  #
  # The key is returned from inside the block: block parameters are
  # local to the block, so an outer variable is never updated by the
  # iteration.
  self.each_pair do |key, s|
    matched = if s.class == seq.class
                s == seq
              else
                s.to_s == seq.to_s
              end
    return key if matched
  end
  nil
end
|
1534
|
+
|
1535
|
+
# Sequences in the alignment are duplicated.
|
1536
|
+
# If keys are given to the argument, sequences of given keys are
|
1537
|
+
# duplicated.
|
1538
|
+
#
|
1539
|
+
# It will be obsoleted.
|
1540
|
+
def isolate(*arg)
  #(original)
  # Replaces sequences by fresh seqclass copies so that later in-place
  # edits do not touch the objects shared with another alignment.
  # Without arguments every sequence is replaced; with arguments only
  # the sequences of the given (existing) keys.
  if arg.empty?
    self.collect! { |s| seqclass.new(s) }
  else
    arg.each do |k|
      next unless self.has_key?(k)
      # delete + store re-inserts the key, so it moves to the end
      s = self.delete(k)
      self.store(k, seqclass.new(s))
    end
  end
  self
end
|
1556
|
+
|
1557
|
+
# Iterates over each sequence and each results running block
|
1558
|
+
# are collected and returns a new alignment.
|
1559
|
+
#
|
1560
|
+
# The method name 'collect_align' will be obsoleted.
|
1561
|
+
# Please use 'alignment_collect' instead.
|
1562
|
+
def alignment_collect
  #(original)
  # Builds a new alignment of the receiver's class, copies the
  # properties and stores the block's result for every sequence under
  # the original key.
  collected = self.class.new
  collected.set_all_property(get_all_property)
  self.each_pair do |k, s|
    collected.store(k, yield(s))
  end
  collected
end
|
1571
|
+
alias collect_align alignment_collect
|
1572
|
+
|
1573
|
+
# Removes empty sequences or nil in the alignment.
|
1574
|
+
# (Like Array#compact!)
|
1575
|
+
def compact!
  #(Array-like)
  # Collects the keys of nil/false/empty sequences first and deletes
  # them afterwards, so the alignment is not modified while iterating.
  # Returns the array of removed keys, or nil when nothing was removed.
  doomed = []
  self.each_pair do |k, s|
    doomed << k if !s or s.empty?
  end
  doomed.each { |k| self.delete(k) }
  doomed.empty? ? nil : doomed
end
|
1588
|
+
|
1589
|
+
# Removes empty sequences or nil and returns new alignment.
|
1590
|
+
# (Like Array#compact)
|
1591
|
+
def compact
  #(Array-like)
  # Non-destructive compact!: always returns the duplicate, also when
  # nothing had to be removed.
  copy = self.dup
  copy.compact!
  copy
end
|
1597
|
+
|
1598
|
+
# Adds a sequence to the alignment.
|
1599
|
+
# Returns key if succeeded.
|
1600
|
+
# Returns nil (and not added to the alignment) if key is already used.
|
1601
|
+
#
|
1602
|
+
# It resembles BioPerl's AlignI::add_seq method.
|
1603
|
+
def add_seq(seq, key = nil)
  #(BioPerl) AlignI::add_seq like method
  # A Bio::Sequence is stored as given. For any other object the
  # sequence is taken with extract_seq and, unless a key was passed,
  # the key with extract_key. Returns the value of store.
  unless seq.is_a?(Bio::Sequence)
    extracted = extract_seq(seq)
    key = extract_key(seq) unless key
    seq = extracted
  end
  self.store(key, seq)
end
|
1612
|
+
|
1613
|
+
# Removes given sequence from the alignment.
|
1614
|
+
# Returns removed sequence. If nothing removed, returns nil.
|
1615
|
+
#
|
1616
|
+
# It resembles BioPerl's AlignI::remove_seq.
|
1617
|
+
def remove_seq(seq)
  #(BioPerl) AlignI::remove_seq like method
  # Finds the key of seq with index and deletes that entry.
  k = self.index(seq)
  k ? self.delete(k) : nil
end
|
1625
|
+
|
1626
|
+
# Removes sequences from the alignment by given keys.
|
1627
|
+
# Returns an alignment object consists of removed sequences.
|
1628
|
+
#
|
1629
|
+
# It resembles BioPerl's AlignI::purge method.
|
1630
|
+
def purge(*arg)
  #(BioPerl) AlignI::purge like method
  # Moves the sequences of the given keys out of self into a new
  # alignment. Keys whose sequence is nil/false (or missing) are skipped.
  purged = self.new
  arg.each do |k|
    purged.store(k, self.delete(k)) if self[k]
  end
  purged
end
|
1640
|
+
|
1641
|
+
# If block is given, it acts like Array#select (Enumerable#select).
|
1642
|
+
# Returns a new alignment containing all sequences of the alignment
|
1643
|
+
# for which return value of given block is not false nor nil.
|
1644
|
+
#
|
1645
|
+
# If no block is given, it acts like the BioPerl's AlignI::select.
|
1646
|
+
# Returns a new alignment containing sequences of given keys.
|
1647
|
+
#
|
1648
|
+
# The BioPerl's AlignI::select-like action will be obsoleted.
|
1649
|
+
def select(*arg)
  #(original)
  # With a block: returns a new alignment holding the pairs whose
  # sequence makes the block return a true value ('arg' is ignored).
  # Without a block: returns a new alignment holding the sequences of
  # the given keys (BioPerl's AlignI::select like function).
  selected = self.new
  if block_given?
    # each_pair must be called with the block itself: it yields
    # unconditionally and cannot be used without one
    self.each_pair do |k, s|
      selected.store(k, s) if yield(s)
    end
  else
    arg.each do |k|
      s = self[k]
      selected.store(k, s) if s
    end
  end
  selected
end
|
1668
|
+
|
1669
|
+
# The method name <tt>slice</tt> will be obsoleted.
|
1670
|
+
# Please use <tt>alignment_slice</tt> instead.
|
1671
|
+
alias slice alignment_slice
|
1672
|
+
|
1673
|
+
# The method name <tt>subseq</tt> will be obsoleted.
|
1674
|
+
# Please use <tt>alignment_subseq</tt> instead.
|
1675
|
+
alias subseq alignment_subseq
|
1676
|
+
|
1677
|
+
# Not-destructive version of alignment_normalize!.
|
1678
|
+
# Returns a new alignment.
|
1679
|
+
def normalize
  #(original)
  # Runs alignment_normalize! on a duplicate and returns the duplicate.
  copy = self.dup
  copy.alignment_normalize!
  copy
end
|
1685
|
+
|
1686
|
+
# Not-destructive version of alignment_rstrip!.
|
1687
|
+
# Returns a new alignment.
|
1688
|
+
def rstrip
  #(String-like)
  # The duplicate is isolated first, so the in-place strip works on
  # copies of the sequences and not on the receiver's objects.
  copy = self.dup
  copy.isolate
  copy.alignment_rstrip!
  copy
end
|
1695
|
+
|
1696
|
+
# Non-destructive version of alignment_lstrip!.
# Returns a new alignment.
def lstrip
  #(String-like)
  dup.tap do |copy|
    # isolate is called before the in-place strip -- presumably so the
    # copy stops sharing sequence objects with self (confirm in isolate).
    copy.isolate
    copy.alignment_lstrip!
  end
end
|
1705
|
+
|
1706
|
+
# Non-destructive version of alignment_strip!.
# Returns a new alignment.
def strip
  #(String-like)
  dup.tap do |copy|
    # isolate is called before the in-place strip -- presumably so the
    # copy stops sharing sequence objects with self (confirm in isolate).
    copy.isolate
    copy.alignment_strip!
  end
end
|
1715
|
+
|
1716
|
+
# Non-destructive version of remove_all_gaps!.
# Returns a new alignment.
#
# The method name 'remove_gap' will be obsoleted.
# Please use 'remove_all_gaps' instead.
def remove_all_gaps
  #(original)
  dup.tap do |copy|
    # isolate is called before the in-place edit -- presumably so the
    # copy stops sharing sequence objects with self (confirm in isolate).
    copy.isolate
    copy.remove_all_gaps!
  end
end
|
1728
|
+
|
1729
|
+
# Concatenates a string or an alignment.
#
# When aln responds to to_str, it is appended (with <<) to every
# sequence yielded by each, and self is returned.
# Otherwise the call is delegated to alignment_concat(aln) and its
# return value is returned.
#
# Note that the method will be obsoleted.
# Please use <tt>each_seq { |s| s << str }</tt> for concatenating
# a string and
# <tt>alignment_concat(aln)</tt> for concatenating an alignment.
def concat(aln)
  #(String-like)
  return alignment_concat(aln) unless aln.respond_to?(:to_str)

  each { |seq| seq << aln }
  self
end
|
1747
|
+
|
1748
|
+
# Replaces the specified region of every sequence with aln.
# aln:: String or Bio::Alignment object
# arg:: same format as String#slice
#
# How aln is applied depends on its type:
# * responds to to_str: the same value is assigned to the region of
#   every sequence yielded by each;
# * instance of self's class: each of its sequences is assigned to the
#   region of the sequence stored under the same key in self;
# * anything else: its elements (via each) are assigned, in order, to
#   the region of order(0), order(1), ...
#
# Returns self.
#
# It will be obsoleted.
def replace_slice(aln, *arg)
  #(original)
  if aln.respond_to?(:to_str) #aln.is_a?(String)
    each { |seq| seq[*arg] = aln }
  elsif aln.is_a?(self.class)
    aln.each_pair { |key, seq| self[key][*arg] = seq }
  else
    # Only #each is required of aln here, so the index is counted by hand.
    index = 0
    aln.each do |seq|
      order(index)[*arg] = seq
      index += 1
    end
  end
  self
end
|
1772
|
+
|
1773
|
+
# Performs multiple alignment by using external program.
#
# factory:: object whose query(alignment) returns a result that
#           responds to alignment
#
# The sequences are handed to the factory under the integer keys
# 0, 1, 2, ... and are looked up in the result by the string form of
# those integers. The returned new alignment stores each aligned
# sequence under the corresponding key of self.
#
# Raises RuntimeError when the result has no sequence for one of the
# submitted keys.
def do_align(factory)
  numbered = self.class.new
  size.times { |idx| numbered.store(idx, order(idx)) }
  aligned = factory.query(numbered).alignment
  numbered.keys.each do |idx|
    next if aligned[idx.to_s]
    raise 'alignment result is inconsistent with input data'
  end
  result = self.new
  numbered.keys.each do |idx|
    # Map the temporary integer key back to the original key.
    result.store(self.keys[idx], aligned[idx.to_s])
  end
  result
end
|
1790
|
+
|
1791
|
+
# Converts to fasta format and returns an array of strings,
# one string (">name\n" followed by the sequence text) per sequence.
#
# Arguments:
# * an optional leading Integer is taken as the line width;
# * the next argument, if any, is an options hash:
#   <tt>:width</tt>:: line width, used when no Integer was given
#   <tt>:avoid_same_name</tt>:: when true, names are produced by
#                               avoid_same_name(keys, 30)
#
# Without <tt>:avoid_same_name</tt>, each name is the key converted by
# to_s with CR, LF and NUL characters replaced by spaces.
# When a width is set, a newline is inserted after every run of up to
# width characters; otherwise a single newline ends the sequence.
#
# It will be obsoleted.
def to_fasta_array(*arg)
  #(original)
  width = arg.shift if arg[0].is_a?(Integer)
  options = arg.shift || {}
  width ||= options[:width]
  names =
    if options[:avoid_same_name]
      avoid_same_name(self.keys, 30)
    else
      self.keys.collect { |key| key.to_s.gsub(/[\r\n\x00]/, ' ') }
    end
  self.collect do |seq|
    # names is consumed in step with the sequences yielded by each.
    header = ">#{names.shift}\n"
    body =
      if width
        seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
      else
        seq.to_s + "\n"
      end
    header + body
  end
end
|
1817
|
+
|
1818
|
+
# Converts to fasta format and returns an array of FastaFormat objects.
# The arguments are passed unchanged to to_fasta_array, and each
# resulting string is wrapped in a Bio::FastaFormat object.
#
# It will be obsoleted.
def to_fastaformat_array(*arg)
  #(original)
  # Loaded here, at call time, rather than at the top of the file.
  require 'bio/db/fasta'
  to_fasta_array(*arg).collect! { |entry| Bio::FastaFormat.new(entry) }
end
|
1830
|
+
|
1831
|
+
# Converts to fasta format and returns a string: the strings returned
# by to_fasta_array (called with the same arguments) joined together
# with no separator.
#
# The specification of the argument will be changed.
def to_fasta(*arg)
  #(original)
  records = to_fasta_array(*arg)
  records.join('')
end
|
1838
|
+
|
1839
|
+
include ClustalWFormatter

# Returns a string of Clustal W formatted text of the alignment.
# Calls clustalw_formatter with self, the keys of self, and the
# given options hash.
def to_clustal(options = {})
  clustalw_formatter(self, keys, options)
end
|
1844
|
+
|
1845
|
+
# <tt>consensus</tt> is another name for <tt>consensus_string</tt>.
# The name <tt>consensus</tt> will be obsoleted;
# please use <tt>consensus_string</tt> instead.
alias_method :consensus, :consensus_string
|
1848
|
+
end #class OriginalAlignment
|
1849
|
+
|
1850
|
+
# Bio::Alignment::GAP is a set of module functions for
# gap-related position translation.
module GAP
  # Every method defined below is made a module function.
  module_function

  # Translates a position in the gapped sequence into the
  # corresponding position in the sequence without gaps.
  #<em>seq</em>:: sequence
  #<em>pos</em>:: position with gaps
  #<em>gap_regexp</em>:: regular expression to specify gaps
  def ungapped_pos(seq, pos, gap_regexp)
    # Number of non-gap characters in seq[0..pos].
    residues = seq[0..pos].gsub(gap_regexp, '').length
    # Turn the count into a zero-based index; a count of 0 stays 0.
    residues > 0 ? residues - 1 : residues
  end

  # Translates a position in the sequence without gaps into the
  # corresponding position in the gapped sequence.
  #<em>seq</em>:: sequence
  #<em>pos</em>:: position without gaps
  #<em>gap_regexp</em>:: regular expression to specify gaps
  def gapped_pos(seq, pos, gap_regexp)
    residues = seq.gsub(gap_regexp, '').length
    # A position at or past the number of non-gap characters is set to
    # that number; a negative position is then offset by that number.
    pos = residues if pos >= residues
    pos += residues if pos < 0

    index = 0
    remaining = pos + 1
    # Advance by the number of non-gap characters still needed, then
    # subtract however many non-gap characters that stretch contained.
    while remaining > 0 && index < seq.length
      found = seq[index, remaining].gsub(gap_regexp, '').length
      index += remaining
      remaining -= found
    end
    # index is one past the target whenever the loop advanced.
    index > 0 ? index - 1 : index
  end
end # module GAP
|
1885
|
+
|
1886
|
+
class << self
  # Creates a new Bio::Alignment::OriginalAlignment object.
  # All arguments are passed to OriginalAlignment.new;
  # please refer to the documentation of OriginalAlignment.new.
  def new(*arg)
    OriginalAlignment.new(*arg)
  end

  # Creates a new Bio::Alignment::OriginalAlignment object.
  # All arguments are passed to OriginalAlignment.new2;
  # please refer to the documentation of OriginalAlignment.new2.
  def new2(*arg)
    OriginalAlignment.new2(*arg)
  end

  # Creates a new Bio::Alignment::OriginalAlignment object.
  # All arguments are passed to OriginalAlignment.readfiles;
  # please refer to the documentation of OriginalAlignment.readfiles.
  def readfiles(*files)
    OriginalAlignment.readfiles(*files)
  end
end
|
1903
|
+
end #module Alignment
|
1904
|
+
|
1905
|
+
end #module Bio
|
1906
|
+
|