bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/appl/sim4.rb - sim4 wrapper class
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2004 GOTO Naohisa <ng@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
#--
|
|
8
|
+
# This library is free software; you can redistribute it and/or
|
|
9
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
10
|
+
# License as published by the Free Software Foundation; either
|
|
11
|
+
# version 2 of the License, or (at your option) any later version.
|
|
12
|
+
#
|
|
13
|
+
# This library is distributed in the hope that it will be useful,
|
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
16
|
+
# Lesser General Public License for more details.
|
|
17
|
+
#
|
|
18
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
19
|
+
# License along with this library; if not, write to the Free Software
|
|
20
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
21
|
+
#++
|
|
22
|
+
#
|
|
23
|
+
# $Id: sim4.rb,v 1.5 2005/12/18 15:58:40 k Exp $
|
|
24
|
+
#
|
|
25
|
+
# The sim4 execution wrapper class.
|
|
26
|
+
#
|
|
27
|
+
# == References
|
|
28
|
+
#
|
|
29
|
+
# * Florea, L., et al., A Computer program for aligning a cDNA sequence
|
|
30
|
+
# with a genomic DNA sequence, Genome Research, 8, 967--974, 1998.
|
|
31
|
+
# http://www.genome.org/cgi/content/abstract/8/9/967
|
|
32
|
+
#
|
|
33
|
+
|
|
34
|
+
require 'open3'
|
|
35
|
+
require 'tempfile'
|
|
36
|
+
|
|
37
|
+
module Bio
|
|
38
|
+
|
|
39
|
+
# The sim4 execution wrapper class.
|
|
40
|
+
class Sim4

  autoload :Report, 'bio/appl/sim4/report'

  # Creates a new sim4 execution wrapper object.
  # [+program+]  Program name. Usually 'sim4' in UNIX.
  # [+database+] Default file name of database ('seq2').
  # [+option+]   Options (array of strings).
  def initialize(program = 'sim4', database = nil, option = [])
    @program  = program
    @option   = option
    @database = database # seq2
    @command  = nil
    @output   = nil
    @report   = nil
    @log      = nil
  end

  # default file name of database ('seq2')
  attr_accessor :database

  # name of the program (usually 'sim4' in UNIX)
  attr_reader :program

  # options
  attr_reader :option

  # last command-line strings executed by the object
  attr_reader :command

  # last messages of program reported to the STDERR
  attr_reader :log

  # last result text (String)
  attr_reader :output

  # last result. Returns a Bio::Sim4::Report object.
  attr_reader :report

  # Executes the sim4 program.
  # <tt>seq1</tt> is written to a temporary file via
  # <tt>seq1.to_fasta('seq1', 70)</tt> and aligned against
  # <tt>self.database</tt>.
  # Returns whatever exec_local returns (a Bio::Sim4::Report object).
  def query(seq1)
    tf = Tempfile.open('sim4')
    begin
      tf.print seq1.to_fasta('seq1', 70)
      tf.close(false)
      exec_local(tf.path)
    ensure
      # remove the temporary file even when exec_local raises
      tf.close(true)
    end
  end

  # Executes the sim4 program.
  # Perform mRNA-genome alignment between given sequences.
  # <tt>seq1</tt> and <tt>seq2</tt> are each written to a temporary
  # file via their <tt>to_fasta</tt> method.
  # Returns whatever exec_local returns (a Bio::Sim4::Report object).
  def query_pairwise(seq1, seq2)
    tf = Tempfile.open('sim4')
    tf2 = nil
    begin
      tf.print seq1.to_fasta('seq1', 70)
      tf.close(false)
      tf2 = Tempfile.open('seq2')
      # the second file must hold seq2 (it used to be filled from seq1)
      tf2.print seq2.to_fasta('seq2', 70)
      tf2.close(false)
      exec_local(tf.path, tf2.path)
    ensure
      # remove both temporary files even when something above raises
      tf.close(true)
      tf2.close(true) if tf2
    end
  end

  # Executes the sim4 program.
  # Perform mRNA-genome alignment between sequences in given files.
  # <tt>filename1</tt> and <tt>filename2</tt> should be file name strings.
  # If <tt>filename2</tt> is not specified, using <tt>self.database</tt>.
  # Sets #command, #output, #log and #report as side effects and
  # returns the report.
  def exec_local(filename1, filename2 = nil)
    @command = [ @program, filename1, (filename2 or @database), *@option ]
    @output = nil
    @log = nil
    @report = nil
    Open3.popen3(*@command) do |din, dout, derr|
      din.close
      derr.sync = true
      # read stderr in a separate thread while stdout is read below
      t = Thread.start { @log = derr.read }
      begin
        @output = dout.read
        @report = Bio::Sim4::Report.new(@output)
      ensure
        t.join
      end
    end
    @report
  end
  alias exec exec_local

end #class Sim4
|
|
133
|
+
end #module Bio
|
|
134
|
+
|
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/appl/sim4/report.rb - sim4 result parser
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2004 GOTO Naohisa <ng@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
#--
|
|
8
|
+
# This library is free software; you can redistribute it and/or
|
|
9
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
10
|
+
# License as published by the Free Software Foundation; either
|
|
11
|
+
# version 2 of the License, or (at your option) any later version.
|
|
12
|
+
#
|
|
13
|
+
# This library is distributed in the hope that it will be useful,
|
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
16
|
+
# Lesser General Public License for more details.
|
|
17
|
+
#
|
|
18
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
19
|
+
# License along with this library; if not, write to the Free Software
|
|
20
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
21
|
+
#++
|
|
22
|
+
#
|
|
23
|
+
# $Id: report.rb,v 1.7 2005/12/18 15:58:40 k Exp $
|
|
24
|
+
#
|
|
25
|
+
# The sim4 report parser classes.
|
|
26
|
+
#
|
|
27
|
+
# == References
|
|
28
|
+
#
|
|
29
|
+
# * Florea, L., et al., A Computer program for aligning a cDNA sequence
|
|
30
|
+
# with a genomic DNA sequence, Genome Research, 8, 967--974, 1998.
|
|
31
|
+
# http://www.genome.org/cgi/content/abstract/8/9/967
|
|
32
|
+
#
|
|
33
|
+
|
|
34
|
+
module Bio
|
|
35
|
+
class Sim4
|
|
36
|
+
|
|
37
|
+
# Bio::Sim4::Report is the sim4 report parser class.
|
|
38
|
+
# Its object may contain some Bio::Sim4::Report::Hit objects.
|
|
39
|
+
class Report #< DB
|
|
40
|
+
#--
|
|
41
|
+
# format: A=0, A=3, or A=4
|
|
42
|
+
#++
|
|
43
|
+
|
|
44
|
+
# Delimiter of each entry. Bio::FlatFile uses it.
|
|
45
|
+
# In Bio::Sim4::Report, it is nil (1 entry 1 file).
|
|
46
|
+
DELIMITER = RS = nil # 1 entry 1 file
|
|
47
|
+
|
|
48
|
+
# Creates new Bio::Sim4::Report object from String.
|
|
49
|
+
# You can use Bio::FlatFile to read a file.
|
|
50
|
+
# Currently, format A=0, A=3, and A=4 are supported.
|
|
51
|
+
# (A=1, A=2, A=5 are NOT supported yet.)
|
|
52
|
+
#
|
|
53
|
+
# Note that 'seq1' in sim4 result is always regarded as 'query',
|
|
54
|
+
# and 'seq2' is always regarded as 'subject'(target, hit).
|
|
55
|
+
#
|
|
56
|
+
# Note that first 'seq1' informations are used for
|
|
57
|
+
# Bio::Sim4::Report#query_id, #query_def, #query_len, and #seq1 methods.
|
|
58
|
+
def initialize(text)
  @hits = []
  @all_hits = []
  overrun = ''
  # String#each(separator) no longer exists in Ruby 1.9 and later;
  # each_line accepts the same separator argument.
  text.each_line("\n\nseq1 = ") do |str|
    str = str.sub(/\A\s+/, '')
    str.sub!(/\n(^seq1 \= .*)/m, "\n") # remove trailing hits for sure
    # the part cut off above belongs to the next chunk
    tmp = $1.to_s
    hit = Hit.new(overrun + str)
    overrun = tmp
    # only hits that still have data after their header are kept in @hits
    unless hit.instance_eval { @data.empty? } then
      @hits << hit
    end
    @all_hits << hit
  end
  # with empty text no hit exists; @seq1 is left nil in that case
  first_hit = @all_hits[0]
  @seq1 = (first_hit ? first_hit.seq1 : nil)
end
|
|
75
|
+
|
|
76
|
+
# Returns hits of the entry.
|
|
77
|
+
# Unlike Bio::Sim4::Report#all_hits, it returns
|
|
78
|
+
# hits which have alignments.
|
|
79
|
+
# Returns an Array of Bio::Sim4::Report::Hit objects.
|
|
80
|
+
attr_reader :hits
|
|
81
|
+
|
|
82
|
+
# Returns all hits of the entry.
|
|
83
|
+
# Unlike Bio::Sim4::Report#hits, it returns
|
|
84
|
+
# results of all trials of pairwise alignment.
|
|
85
|
+
# This would be a Bio::Sim4 specific method.
|
|
86
|
+
# Returns an Array of Bio::Sim4::Report::Hit objects.
|
|
87
|
+
attr_reader :all_hits
|
|
88
|
+
|
|
89
|
+
# Returns sequence informations of 'seq1'.
|
|
90
|
+
# Returns a Bio::Sim4::Report::SeqDesc object.
|
|
91
|
+
# This would be a Bio::Sim4 specific method.
|
|
92
|
+
attr_reader :seq1
|
|
93
|
+
|
|
94
|
+
# Bio::Sim4::Report::SeqDesc stores sequence information of
|
|
95
|
+
# query or subject of sim4 report.
|
|
96
|
+
class SeqDesc
  #--
  # description/definitions of a sequence
  #++

  # Creates a new object.
  # It is designed to be called internally from Bio::Sim4::Report object.
  # Users shall not use it directly.
  def initialize(seqid, seqdef, len, filename)
    @entry_id   = seqid
    @definition = seqdef
    @len        = len
    @filename   = filename
  end

  # identifier of the sequence
  attr_reader :entry_id
  # definition of the sequence
  attr_reader :definition
  # sequence length of the sequence
  attr_reader :len
  # filename of the sequence
  attr_reader :filename

  # Parses part of sim4 result text and creates new SeqDesc object.
  # It is designed to be called internally from Bio::Sim4::Report object.
  # Users shall not use it directly.
  #
  # <tt>str</tt> is a line of the form
  # "seq1 = FILENAME, N bp" or "seq1 = FILENAME (ID), N bp".
  # <tt>str2</tt>, if given, is a ">definition" line.
  #
  # When <tt>str</tt> does not match, the identifier and filename are nil
  # and the length is 0.
  def self.parse(str, str2 = nil)
    # The filename group is non-greedy so that an optional trailing
    # " (ID)" is captured as the identifier instead of being swallowed
    # into the filename.
    m = (str ? /^seq[12] \= (.*?)(?: \(([^()]*)\))?\,\s*(\d+)\s*bp\s*$/.match(str) : nil)
    filename = (m ? m[1] : nil)
    seqid    = (m ? m[2] : nil)
    len      = (m ? m[3].to_i : 0)
    if str2 then
      seqdef = str2.sub(/^\>\s*/, '')
      # fall back to the first word of the definition line
      seqid = seqdef.split(/\s+/, 2)[0] unless seqid
    else
      seqdef = (seqid or filename)
      seqid = filename unless seqid
    end
    self.new(seqid, seqdef, len, filename)
  end
end #class SeqDesc
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Sequence segment pair of the sim4 result.
|
|
140
|
+
# Similar to Bio::Blast::Report::HSP but lacks many methods.
|
|
141
|
+
# For mRNA-genome mapping programs,
|
|
142
|
+
# unlike other homology search programs,
|
|
143
|
+
# the class is used not only for exons but also for introns.
|
|
144
|
+
# (Note that intron data would not be available according to run-time
|
|
145
|
+
# options of the program.)
|
|
146
|
+
class SegmentPair
  #--
  # segment pair (like Bio::BLAST::*::Report::HSP)
  #++

  # Builds a SegmentPair from two Segment objects plus the optional
  # midline, percent identity and direction.
  # Meant to be called from Bio::Sim4::Report::Hit only;
  # users shall not use it directly.
  def initialize(seq1, seq2, midline = nil,
                 percent_identity = nil, direction = nil)
    @seq1, @seq2 = seq1, seq2
    @midline = midline
    @percent_identity = percent_identity
    @direction = direction
  end

  # Segment information of 'seq1' (a Bio::Sim4::Report::Segment object).
  # This would be a Bio::Sim4 specific method.
  attr_reader :seq1

  # Segment information of 'seq2' (a Bio::Sim4::Report::Segment object).
  # This would be a Bio::Sim4 specific method.
  attr_reader :seq2

  # The "midline" of the segment pair,
  # or nil if no alignment data are available.
  attr_reader :midline

  # Percent identity of the segment pair.
  attr_reader :percent_identity

  # Direction of mapping.
  # Maybe one of "->", "<-" or "" or nil.
  # This would be a Bio::Sim4 specific method.
  attr_reader :direction

  # Parses one exon line of sim4 result text
  # ("from-to (from-to) identity% direction") and creates a new
  # SegmentPair object. <tt>aln</tt> is indexed as
  # [ seq1 text, midline, seq2 text ].
  # Meant to be called from Bio::Sim4::Report::Hit only;
  # users shall not use it directly.
  def self.parse(str, aln)
    /^(\d+)\-(\d+)\s*\((\d+)\-(\d+)\)\s*([\d\.]+)\%\s*([\-\<\>]*)/ =~ str
    matched = $~
    # every field is nil when the line does not match
    q_from, q_to, h_from, h_to, identity, arrow =
      (matched ? matched.captures : [])
    query  = Segment.new(q_from, q_to, aln[0])
    target = Segment.new(h_from, h_to, aln[2])
    self.new(query, target, aln[1], identity, arrow)
  end

  # Creates a new SegmentPair object for an intron on seq1,
  # located between the exons <tt>prev_e</tt> and <tt>e</tt>.
  # Meant to be called from Bio::Sim4::Report::Hit only;
  # users shall not use it directly.
  def self.seq1_intron(prev_e, e, aln)
    query  = Segment.new(prev_e.seq1.to+1, e.seq1.from-1, aln[0])
    target = Segment.new(nil, nil, aln[2])
    self.new(query, target, aln[1])
  end

  # Creates a new SegmentPair object for an intron on seq2,
  # located between the exons <tt>prev_e</tt> and <tt>e</tt>.
  # Meant to be called from Bio::Sim4::Report::Hit only;
  # users shall not use it directly.
  def self.seq2_intron(prev_e, e, aln)
    query  = Segment.new(nil, nil, aln[0])
    target = Segment.new(prev_e.seq2.to+1, e.seq2.from-1, aln[2])
    self.new(query, target, aln[1])
  end

  #--
  # Bio::BLAST::*::Report::Hsp compatible methods
  # Methods already defined: midline, percent_identity
  #++

  # start position of the query (the first position is 1)
  def query_from
    @seq1.from
  end

  # end position of the query (including its position)
  def query_to
    @seq1.to
  end

  # query sequence (with gaps) of the alignment of the segment pair.
  def qseq
    @seq1.seq
  end

  # start position of the hit(target) (the first position is 1)
  def hit_from
    @seq2.from
  end

  # end position of the hit(target) (including its position)
  def hit_to
    @seq2.to
  end

  # hit(target) sequence (with gaps) of the alignment
  # of the segment pair.
  def hseq
    @seq2.seq
  end

  # Alignment length of the segment pair (length of the midline).
  # Returns nil if the midline or either sequence text is missing.
  def align_len
    return nil unless @midline and @seq1.seq and @seq2.seq
    @midline.length
  end
end #class SegmentPair
|
|
247
|
+
|
|
248
|
+
# Segment informations of a segment pair.
|
|
249
|
+
class Segment
  #--
  # the segment of a sequence
  #++

  # start position of the segment (the first position is 1),
  # end position of the segment (including its position),
  # and sequence (with gaps) of the segment
  attr_reader :from, :to, :seq

  # Creates a new Segment object.
  # Both positions are converted with to_i (so nil becomes 0).
  # Meant to be called from Bio::Sim4::Report::SegmentPair only;
  # users shall not use it directly.
  def initialize(pos_st, pos_ed, seq = nil)
    @from, @to = pos_st.to_i, pos_ed.to_i
    @seq = seq
  end
end #class Segment
|
|
270
|
+
|
|
271
|
+
# Hit object of the sim4 result.
|
|
272
|
+
# Similar to Bio::Blast::Report::Hit but lacks many methods.
|
|
273
|
+
class Hit
|
|
274
|
+
|
|
275
|
+
# Parses part of sim4 result text and creates a new Hit object.
|
|
276
|
+
# It is designed to be called internally from Bio::Sim4::Report class.
|
|
277
|
+
# Users shall not use it directly.
|
|
278
|
+
def initialize(str)
|
|
279
|
+
@data = str.split(/\n(?:\r?\n)+/)
|
|
280
|
+
parse_seqdesc
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
# Parses sequence descriptions.
|
|
284
|
+
def parse_seqdesc
|
|
285
|
+
# seq1: query, seq2: target(hit)
|
|
286
|
+
a0 = @data.shift.split(/\r?\n/)
|
|
287
|
+
if @data[0].to_s =~ /^\>/ then
|
|
288
|
+
a1 = @data.shift.split(/\r?\n/)
|
|
289
|
+
else
|
|
290
|
+
a1 = []
|
|
291
|
+
end
|
|
292
|
+
@seq1 = SeqDesc.parse(a0[0], a1[0])
|
|
293
|
+
@seq2 = SeqDesc.parse(a0[1], a1[1])
|
|
294
|
+
|
|
295
|
+
if @data[0].to_s.sub!(/\A\(complement\)\s*$/, '') then
|
|
296
|
+
@complement = true
|
|
297
|
+
@data.shift if @data[0].strip.empty?
|
|
298
|
+
else
|
|
299
|
+
@complement = nil
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
private :parse_seqdesc
|
|
303
|
+
|
|
304
|
+
# Returns sequence informations of 'seq1'.
|
|
305
|
+
# Returns a Bio::Sim4::Report::SeqDesc object.
|
|
306
|
+
# This would be Bio::Sim4 specific method.
|
|
307
|
+
attr_reader :seq1
|
|
308
|
+
|
|
309
|
+
# Returns sequence informations of 'seq2'.
|
|
310
|
+
# Returns a Bio::Sim4::Report::SeqDesc object.
|
|
311
|
+
# This would be Bio::Sim4 specific method.
|
|
312
|
+
attr_reader :seq2
|
|
313
|
+
|
|
314
|
+
# Returns true if the hit reports '-'(complemental) strand
|
|
315
|
+
# search result.
|
|
316
|
+
# Otherwise, return false or nil.
|
|
317
|
+
# This would be a Bio::Sim4 specific method.
|
|
318
|
+
def complement?
|
|
319
|
+
@complement
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
# Parses segment pair.
|
|
323
|
+
def parse_segmentpairs
|
|
324
|
+
aln = (self.align ? self.align.dup : [])
|
|
325
|
+
exo = [] #exons
|
|
326
|
+
itr = [] #introns
|
|
327
|
+
sgp = [] #segmentpairs
|
|
328
|
+
prev_e = nil
|
|
329
|
+
return unless @data[0]
|
|
330
|
+
@data[0].split(/\r?\n/).each do |str|
|
|
331
|
+
ai = (prev_e ? aln.shift : nil)
|
|
332
|
+
a = (aln.shift or [])
|
|
333
|
+
e = SegmentPair.parse(str, a)
|
|
334
|
+
exo << e
|
|
335
|
+
if ai then
|
|
336
|
+
# intron data in alignment
|
|
337
|
+
if ai[2].strip.empty? then
|
|
338
|
+
i = SegmentPair.seq1_intron(prev_e, e, ai)
|
|
339
|
+
else
|
|
340
|
+
i = SegmentPair.seq2_intron(prev_e, e, ai)
|
|
341
|
+
end
|
|
342
|
+
itr << i
|
|
343
|
+
sgp << i
|
|
344
|
+
end
|
|
345
|
+
sgp << e
|
|
346
|
+
prev_e = e
|
|
347
|
+
end
|
|
348
|
+
@exons = exo
|
|
349
|
+
@introns = itr
|
|
350
|
+
@segmentpairs = sgp
|
|
351
|
+
end
|
|
352
|
+
private :parse_segmentpairs
|
|
353
|
+
|
|
354
|
+
# Parses alignment.
|
|
355
|
+
def parse_align
|
|
356
|
+
s1 = []; ml = []; s2 = []
|
|
357
|
+
dat = @data[1..-1]
|
|
358
|
+
return unless dat
|
|
359
|
+
dat.each do |str|
|
|
360
|
+
a = str.split(/\r?\n/)
|
|
361
|
+
a.shift
|
|
362
|
+
if /^(\s*\d+\s*)(.+)$/ =~ a[0] then
|
|
363
|
+
range = ($1.length)..($1.length + $2.strip.length - 1)
|
|
364
|
+
a.collect! { |x| x[range] }
|
|
365
|
+
s1 << a.shift
|
|
366
|
+
ml << a.shift
|
|
367
|
+
s2 << a.shift
|
|
368
|
+
end
|
|
369
|
+
end #each
|
|
370
|
+
alx = ml.join('').split(/([\<\>]+\.+[\<\>]+)/)
|
|
371
|
+
seq1 = s1.join(''); seq2 = s2.join('')
|
|
372
|
+
i = 0
|
|
373
|
+
alx.collect! do |x|
|
|
374
|
+
len = x.length
|
|
375
|
+
y = [ seq1[i, len], x, seq2[i, len] ]
|
|
376
|
+
i += len
|
|
377
|
+
y
|
|
378
|
+
end
|
|
379
|
+
@align = alx
|
|
380
|
+
end
|
|
381
|
+
private :parse_align
|
|
382
|
+
|
|
383
|
+
# Returns exons of the hit.
|
|
384
|
+
# Each exon is a Bio::Sim4::Report::SegmentPair object.
|
|
385
|
+
def exons
|
|
386
|
+
unless defined?(@exons); parse_segmentpairs; end
|
|
387
|
+
@exons
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
# Returns segment pairs (exons and introns) of the hit.
|
|
391
|
+
# Each segment pair is a Bio::Sim4::Report::SegmentPair object.
|
|
392
|
+
# Returns an array of Bio::Sim4::Report::SegmentPair objects.
|
|
393
|
+
# (Note that intron data is not always available
|
|
394
|
+
# according to run-time options of the program.)
|
|
395
|
+
def segmentpairs
|
|
396
|
+
unless defined?(@segmentpairs); parse_segmentpairs; end
|
|
397
|
+
@segmentpairs
|
|
398
|
+
end
|
|
399
|
+
|
|
400
|
+
# Returns introns of the hit.
|
|
401
|
+
# Some of them would contain untranscribed regions.
|
|
402
|
+
# Returns an array of Bio::Sim4::Report::SegmentPair objects.
|
|
403
|
+
# (Note that intron data is not always available
|
|
404
|
+
# according to run-time options of the program.)
|
|
405
|
+
def introns
|
|
406
|
+
unless defined?(@introns); parse_segmentpairs; end
|
|
407
|
+
@introns
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
# Returns alignments.
|
|
411
|
+
# Returns an Array of arrays.
|
|
412
|
+
# Each array contains sequence of seq1, midline, sequence of seq2,
|
|
413
|
+
# respectively.
|
|
414
|
+
# This would be a Bio::Sim4 specific method.
|
|
415
|
+
def align
|
|
416
|
+
unless defined?(@align); parse_align; end
|
|
417
|
+
@align
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
#--
|
|
421
|
+
# Bio::BLAST::*::Report::Hit compatible methods
|
|
422
|
+
#++
|
|
423
|
+
|
|
424
|
+
# Length of the query sequence.
|
|
425
|
+
# Same as Bio::Sim4::Report#query_len.
|
|
426
|
+
def query_len; seq1.len; end
|
|
427
|
+
|
|
428
|
+
# Identifier of the query sequence.
|
|
429
|
+
# Same as Bio::Sim4::Report#query_id.
|
|
430
|
+
def query_id; seq1.entry_id; end
|
|
431
|
+
|
|
432
|
+
# Definition of the query sequence
|
|
433
|
+
# Same as Bio::Sim4::Report#query_def.
|
|
434
|
+
def query_def; seq1.definition; end
|
|
435
|
+
|
|
436
|
+
# length of the hit(target) sequence
|
|
437
|
+
def target_len; seq2.len; end
|
|
438
|
+
|
|
439
|
+
# Identifier of the hit(target) sequence
|
|
440
|
+
def target_id; seq2.entry_id; end
|
|
441
|
+
|
|
442
|
+
# Definition of the hit(target) sequence
|
|
443
|
+
def target_def; seq2.definition; end
|
|
444
|
+
|
|
445
|
+
alias hit_id target_id
|
|
446
|
+
alias len target_len
|
|
447
|
+
alias definition target_def
|
|
448
|
+
|
|
449
|
+
alias hsps exons
|
|
450
|
+
|
|
451
|
+
# Iterates over each exon of the hit.
|
|
452
|
+
# Yields a Bio::Sim4::Report::SegmentPair object.
|
|
453
|
+
def each(&x) #:yields: segmentpair
|
|
454
|
+
exons.each(&x)
|
|
455
|
+
end
|
|
456
|
+
end #class Hit
|
|
457
|
+
|
|
458
|
+
#--
|
|
459
|
+
#Bio::BLAST::*::Report compatible methods
|
|
460
|
+
#++
|
|
461
|
+
|
|
462
|
+
# Returns number of hits.
|
|
463
|
+
# Same as hits.size.
|
|
464
|
+
def num_hits
  @hits.size
end
|
|
465
|
+
|
|
466
|
+
# Iterates over each hits of the sim4 result.
|
|
467
|
+
# Same as hits.each.
|
|
468
|
+
# Yields a Bio::Sim4::Report::Hit object.
|
|
469
|
+
def each_hit(&blk) #:yields: hit
  @hits.each(&blk)
end
alias each each_hit
|
|
473
|
+
|
|
474
|
+
# Returns the definition of query sequence.
|
|
475
|
+
# The value will be filename or (first word of) sequence definition
|
|
476
|
+
# according to sim4 run-time options.
|
|
477
|
+
def query_def
  @seq1.definition
end
|
|
478
|
+
|
|
479
|
+
# Returns the identifier of query sequence.
|
|
480
|
+
# The value will be filename or (first word of) sequence definition
|
|
481
|
+
# according to sim4 run-time options.
|
|
482
|
+
def query_id
  @seq1.entry_id
end
|
|
483
|
+
|
|
484
|
+
# Returns the length of query sequence.
|
|
485
|
+
def query_len
  @seq1.len
end
|
|
486
|
+
end #class Report
|
|
487
|
+
|
|
488
|
+
end #class Sim4
|
|
489
|
+
end #module Bio
|
|
490
|
+
|
|
491
|
+
=begin
|
|
492
|
+
|
|
493
|
+
= Bio::Sim4::Report
|
|
494
|
+
|
|
495
|
+
= References
|
|
496
|
+
|
|
497
|
+
* ((<URL:http://www.genome.org/cgi/content/abstract/8/9/967>))
|
|
498
|
+
Florea, L., et al., A Computer program for aligning a cDNA sequence
|
|
499
|
+
with a genomic DNA sequence, Genome Research, 8, 967--974, 1998.
|
|
500
|
+
|
|
501
|
+
=end
|