bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bio/appl/blast/xmlparser.rb - BLAST XML output (-m 7) parser by XMLParser
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2001 Mitsuteru C. Nakao <n@bioruby.org>
|
|
5
|
+
# Copyright (C) 2003 KATAYAMA Toshiaki <k@bioruby.org>
|
|
6
|
+
#
|
|
7
|
+
# This library is free software; you can redistribute it and/or
|
|
8
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
9
|
+
# License as published by the Free Software Foundation; either
|
|
10
|
+
# version 2 of the License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This library is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
15
|
+
# Lesser General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
18
|
+
# License along with this library; if not, write to the Free Software
|
|
19
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
20
|
+
#
|
|
21
|
+
# $Id: xmlparser.rb,v 1.13 2005/09/08 01:22:08 k Exp $
|
|
22
|
+
#
|
|
23
|
+
|
|
24
|
+
begin
|
|
25
|
+
require 'xmlparser'
|
|
26
|
+
rescue LoadError
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
module Bio
|
|
30
|
+
class Blast
|
|
31
|
+
class Report
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
# Parses BLAST XML output (-m 7) with the event-driven XMLParser and
# fills this report in place.
#
# Start tags create the Iteration / Hit / Hsp containers, character data
# is accumulated in a scratch hash keyed by the innermost open tag, and
# end tags hand the scratch hash to the matching xmlparser_parse_*
# helper before it is reset.
#
# xml:: BLAST XML text handed to XMLParser#parse.
#
# A XMLParserError is not propagated; the position and message are
# printed to standard output instead.
def xmlparser_parse(xml)
  parser = XMLParser.new
  def parser.default; end

  begin
    open_tags = []
    hash = {}

    parser.parse(xml) do |type, name, data|
      #print "type=#{type.inspect} name=#{name.inspect} data=#{data.inspect}\n" # for DEBUG
      case type
      when XMLParser::START_ELEM
        open_tags.push(name)
        # for a start tag, data is merged into the scratch hash as-is
        hash.update(data)
        if name == 'Iteration'
          @iterations.push(Iteration.new)
        elsif name == 'Hit'
          new_hit = Hit.new
          new_hit.query_id = @query_id
          new_hit.query_def = @query_def
          new_hit.query_len = @query_len
          @iterations.last.hits.push(new_hit)
        elsif name == 'Hsp'
          @iterations.last.hits.last.hsps.push(Hsp.new)
        end
      when XMLParser::END_ELEM
        # the scratch hash is reset only after a helper has consumed it
        consumed = true
        case name
        when /^BlastOutput/
          xmlparser_parse_program(name, hash)
        when /^Parameters$/
          xmlparser_parse_parameters(hash)
        when /^Iteration/
          xmlparser_parse_iteration(name, hash)
        when /^Hit/
          xmlparser_parse_hit(name, hash)
        when /^Hsp$/
          xmlparser_parse_hsp(hash)
        when /^Statistics$/
          xmlparser_parse_statistics(hash)
        else
          consumed = false
        end
        hash = {} if consumed
        open_tags.pop
      when XMLParser::CDATA
        current_tag = open_tags.last
        if hash[current_tag].nil?
          # whitespace-only text never starts a value
          hash[current_tag] = data unless data.strip.empty?
        else
          # later chunks of the same text node are appended
          hash[current_tag].concat(data) if data
        end
      when XMLParser::PI
        # processing instructions are ignored
      end
    end
  rescue XMLParserError
    line = parser.line
    column = parser.column
    print "Parse error at #{line}(#{column}) : #{$!}\n"
  end
end
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Stores the content of a closed BlastOutput_* element in the matching
# instance variable (@program, @version, @reference, @db, @query_id,
# @query_def, @query_len). Other tags are ignored.
#
# tag::  name of the element that was just closed.
# hash:: scratch hash holding the collected text under the key +tag+.
def xmlparser_parse_program(tag, hash)
  value = hash[tag]
  case tag
  when 'BlastOutput_program'
    @program = value
  when 'BlastOutput_version'
    @version = value
  when 'BlastOutput_reference'
    @reference = value
  when 'BlastOutput_db'
    # to_s guards against an empty element, for which no text was
    # collected and value is nil (nil.strip would raise NoMethodError).
    @db = value.to_s.strip
  when 'BlastOutput_query-ID'
    @query_id = value
  when 'BlastOutput_query-def'
    @query_def = value
  when 'BlastOutput_query-len'
    @query_len = value.to_i
  end
end
|
|
120
|
+
|
|
121
|
+
# Copies the contents of the Parameters_* elements into @parameters,
# keyed by the short parameter name.
#
# hash:: scratch hash holding the collected text of each Parameters_*
#        element under its full tag name.
#
# 'matrix' and 'filter' are stored as String, 'expect' and 'include' as
# Float, everything else as Integer. Missing elements become "", 0.0
# or 0 respectively.
def xmlparser_parse_parameters(hash)
  labels = {
    'matrix'      => 'Parameters_matrix',
    'expect'      => 'Parameters_expect',
    'include'     => 'Parameters_include',
    'sc-match'    => 'Parameters_sc-match',
    'sc-mismatch' => 'Parameters_sc-mismatch',
    'gap-open'    => 'Parameters_gap-open',
    'gap-extend'  => 'Parameters_gap-extend',
    'filter'      => 'Parameters_filter',
    'pattern'     => 'Parameters_pattern',
    'entrez-query'=> 'Parameters_entrez-query',
  }
  labels.each do |k, v|
    case k
    when 'filter', 'matrix'
      @parameters[k] = hash[v].to_s
    when 'expect', 'include'
      # Thresholds are real numbers (e.g. "1e-5" or "0.001"); to_i would
      # truncate them to 1 or 0.
      @parameters[k] = hash[v].to_f
    else
      # NOTE(review): 'pattern' and 'entrez-query' look like free text
      # rather than numbers - confirm before changing their conversion.
      @parameters[k] = hash[v].to_i
    end
  end
end
|
|
143
|
+
|
|
144
|
+
# Stores the content of a closed Iteration_* element on the most
# recently opened iteration (@iterations.last).
#
# tag::  name of the element that was just closed.
# hash:: scratch hash holding the collected text under the key +tag+.
#
# Only 'Iteration_iter-num' (stored as Integer) and 'Iteration_message'
# (stored as String) are handled; any other tag is ignored.
def xmlparser_parse_iteration(tag, hash)
  if tag == 'Iteration_iter-num'
    @iterations.last.num = hash[tag].to_i
  elsif tag == 'Iteration_message'
    @iterations.last.message = hash[tag].to_s
  end
end
|
|
152
|
+
|
|
153
|
+
# Stores the content of a closed Hit_* element on the most recently
# opened hit of the most recently opened iteration.
#
# tag::  name of the element that was just closed.
# hash:: scratch hash holding the collected text under the key +tag+.
#
# Numeric fields are converted with to_i; textual fields are stored as
# a copy of the collected text. Unhandled tags are ignored.
def xmlparser_parse_hit(tag, hash)
  current = @iterations.last.hits.last
  if tag == 'Hit_num'
    current.num = hash[tag].to_i
  elsif tag == 'Hit_id'
    current.hit_id = hash[tag].clone
  elsif tag == 'Hit_def'
    current.definition = hash[tag].clone
  elsif tag == 'Hit_accession'
    current.accession = hash[tag].clone
  elsif tag == 'Hit_len'
    current.len = hash[tag].clone.to_i
  end
end
|
|
168
|
+
|
|
169
|
+
# Copies the collected Hsp_* element contents onto the most recently
# opened HSP (last hsp of the last hit of the last iteration).
#
# hash:: scratch hash holding the collected text of each Hsp_* element
#        under its full tag name.
#
# Numeric fields are converted with to_i / to_f (missing elements thus
# become 0 / 0.0); the three alignment strings are stored unconverted.
def xmlparser_parse_hsp(hash)
  hsp = @iterations.last.hits.last.hsps.last

  # [attribute, XML tag, conversion] in assignment order
  numeric_fields = [
    [:num,          'Hsp_num',          :to_i],
    [:bit_score,    'Hsp_bit-score',    :to_f],
    [:score,        'Hsp_score',        :to_i],
    [:evalue,       'Hsp_evalue',       :to_f],
    [:query_from,   'Hsp_query-from',   :to_i],
    [:query_to,     'Hsp_query-to',     :to_i],
    [:hit_from,     'Hsp_hit-from',     :to_i],
    [:hit_to,       'Hsp_hit-to',       :to_i],
    [:pattern_from, 'Hsp_pattern-from', :to_i],
    [:pattern_to,   'Hsp_pattern-to',   :to_i],
    [:query_frame,  'Hsp_query-frame',  :to_i],
    [:hit_frame,    'Hsp_hit-frame',    :to_i],
    [:identity,     'Hsp_identity',     :to_i],
    [:positive,     'Hsp_positive',     :to_i],
    [:gaps,         'Hsp_gaps',         :to_i],
    [:align_len,    'Hsp_align-len',    :to_i],
    [:density,      'Hsp_density',      :to_i]
  ]
  numeric_fields.each do |attribute, xml_tag, conversion|
    hsp.__send__("#{attribute}=", hash[xml_tag].__send__(conversion))
  end

  hsp.qseq = hash['Hsp_qseq']
  hsp.hseq = hash['Hsp_hseq']
  hsp.midline = hash['Hsp_midline']
end
|
|
192
|
+
|
|
193
|
+
# Copies the collected Statistics_* element contents into the statistics
# hash of the most recently opened iteration, keyed by the short name.
#
# hash:: scratch hash holding the collected text of each Statistics_*
#        element under its full tag name.
#
# 'db-num', 'db-len' and 'hsp-len' are stored as Integer, the remaining
# values as Float. Missing elements become 0 or 0.0.
def xmlparser_parse_statistics(hash)
  integer_names = ['db-num', 'db-len', 'hsp-len']
  labels = {
    'db-num'    => 'Statistics_db-num',
    'db-len'    => 'Statistics_db-len',
    'hsp-len'   => 'Statistics_hsp-len',
    'eff-space' => 'Statistics_eff-space',
    'kappa'     => 'Statistics_kappa',
    'lambda'    => 'Statistics_lambda',
    'entropy'   => 'Statistics_entropy'
  }
  labels.each do |short_name, xml_tag|
    raw = hash[xml_tag]
    if integer_names.include?(short_name)
      @iterations.last.statistics[short_name] = raw.to_i
    else
      @iterations.last.statistics[short_name] = raw.to_f
    end
  end
end
|
|
212
|
+
|
|
213
|
+
end
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
=begin
|
|
219
|
+
|
|
220
|
+
This file is automatically loaded by bio/appl/blast/report.rb
|
|
221
|
+
|
|
222
|
+
=end
|
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/appl/blat/report.rb - BLAT result parser
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2004 GOTO Naohisa <ng@bioruby.org>
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
#
|
|
7
|
+
#--
|
|
8
|
+
# This library is free software; you can redistribute it and/or
|
|
9
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
10
|
+
# License as published by the Free Software Foundation; either
|
|
11
|
+
# version 2 of the License, or (at your option) any later version.
|
|
12
|
+
#
|
|
13
|
+
# This library is distributed in the hope that it will be useful,
|
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
16
|
+
# Lesser General Public License for more details.
|
|
17
|
+
#
|
|
18
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
19
|
+
# License along with this library; if not, write to the Free Software
|
|
20
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
21
|
+
#++
|
|
22
|
+
#
|
|
23
|
+
# $Id: report.rb,v 1.6 2005/12/18 15:58:39 k Exp $
|
|
24
|
+
#
|
|
25
|
+
# BLAT result parser (psl / pslx format).
|
|
26
|
+
#
|
|
27
|
+
# == Important Notes
|
|
28
|
+
#
|
|
29
|
+
# In BLAT results, the start position of a sequence is numbered as 0.
|
|
30
|
+
# On the other hand, in many other homology search programs,
|
|
31
|
+
# the start position of a sequence is numbered as 1.
|
|
32
|
+
# To keep compatibility, the BLAT parser adds 1 to every position number.
|
|
33
|
+
#
|
|
34
|
+
# == References
|
|
35
|
+
#
|
|
36
|
+
# * Kent, W.J., BLAT--the BLAST-like alignment tool,
|
|
37
|
+
# Genome Research, 12, 656--664, 2002.
|
|
38
|
+
# http://www.genome.org/cgi/content/abstract/12/4/656
|
|
39
|
+
#
|
|
40
|
+
|
|
41
|
+
require 'bio'
|
|
42
|
+
|
|
43
|
+
module Bio
|
|
44
|
+
class Blat
|
|
45
|
+
|
|
46
|
+
# Bio::Blat::Report is a BLAT report parser class.
|
|
47
|
+
# Its object may contain some Bio::Blat::Report::Hits objects.
|
|
48
|
+
#
|
|
49
|
+
# In BLAT results, the start position of a sequence is numbered as 0.
|
|
50
|
+
# On the other hand, in many other homology search programs,
|
|
51
|
+
# the start position of a sequence is numbered as 1.
|
|
52
|
+
# To keep compatibility, the BLAT parser adds 1 to every position number.
|
|
53
|
+
#
|
|
54
|
+
# Note that Bio::Blat::Report#query_def, #query_id, #query_len methods
|
|
55
|
+
# simply return first hit's query_*.
|
|
56
|
+
# If multiple query sequences are given, these values
|
|
57
|
+
# will be incorrect.
|
|
58
|
+
#
|
|
59
|
+
class Report #< DB
  # Delimiter of each entry. Bio::FlatFile uses it.
  # In Bio::Blat::Report, it is nil (1 entry 1 file).
  DELIMITER = RS = nil # 1 file 1 entry

  # Creates a new Bio::Blat::Report object from BLAT result text (String).
  # You can use Bio::FlatFile to read a file.
  # Currently, results created with options -out=psl (default) or
  # -out=pslx are supported.
  #
  # Lines before the separator line made of '-' characters are treated
  # as the header; every non-blank line after it becomes a Hit.
  def initialize(text)
    in_body = false
    head = []
    @hits = []
    # String#each no longer exists since Ruby 1.9, so each_line is
    # preferred; objects that only provide each (e.g. an Array of
    # lines) are still accepted.
    iterator = text.respond_to?(:each_line) ? :each_line : :each
    text.__send__(iterator) do |line|
      if in_body
        # A blank line (typically at the end of the text) carries no
        # data and must not become an empty Hit.
        @hits << Hit.new(line) unless line.strip.empty?
      else
        line = line.chomp
        if /\A\-+\s*\z/ =~ line
          in_body = true
        else
          head << line
        end
      end
    end
    @columns = parse_header(head)
  end

  # hits of the result.
  # Returns an Array of Bio::Blat::Report::Hit objects.
  attr_reader :hits

  # Returns descriptions of columns.
  # Returns an Array.
  # This would be a Bio::Blat specific method.
  attr_reader :columns

  # Parses headers.
  # The first header line is dropped; the remaining lines are split on
  # tabs and joined column by column. A column fragment ending with '-'
  # is joined to the next fragment without a space (e.g. "mis-" and
  # "match" become "mismatch").
  # Note that the given array is modified (its first element is removed).
  def parse_header(ary)
    ary.shift # first line is removed
    rows = ary.collect { |x| x.split(/\t/) }
    names = []
    rows.each do |row|
      row.each_index do |i|
        fragment = row[i].strip
        names[i] = names[i].to_s +
          (fragment.sub!(/\-\z/, '') ? fragment : fragment + ' ')
      end
    end
    names.each { |x| x.strip! }
    names
  end
  private :parse_header

  # Bio::Blat::Report::SeqDesc stores sequence information of
  # query or subject of the BLAT report.
  # It also includes some hit information.
  class SeqDesc
    # Creates a new SeqDesc object.
    # It is designed to be called internally from Bio::Blat::Report class.
    # Users shall not use it directly.
    #
    # Numeric arguments are converted with to_i; +starts+ must be
    # a collection whose elements respond to to_i.
    def initialize(gap_count, gap_bases, name, size,
                   st, ed, starts, seqs)
      @gap_count = gap_count.to_i
      @gap_bases = gap_bases.to_i
      @name = name
      @size = size.to_i
      @start = st.to_i
      @end = ed.to_i
      @starts = starts.collect { |x| x.to_i }
      @seqs = seqs
    end
    # gap count
    attr_reader :gap_count
    # gap bases
    attr_reader :gap_bases
    # name of the sequence
    attr_reader :name
    # length of the sequence
    attr_reader :size
    # start position of the first segment
    attr_reader :start
    # end position of the final segment
    attr_reader :end
    # start positions of segments.
    # Returns an array of numbers.
    attr_reader :starts
    # sequences of segments, exactly as given to the constructor.
    # When created by Bio::Blat::Report::Hit, this is an array of String,
    # which is empty if the result has no sequence data (psl format).
    attr_reader :seqs
  end #class SeqDesc

  # Sequence segment pair of BLAT result.
  # Similar to Bio::Blast::Report::Hsp but lacks many methods.
  class SegmentPair
    # Creates a new SegmentPair object.
    # It is designed to be called internally from Bio::Blat::Report class.
    # Users shall not use it directly.
    #
    # query_len:: length of the query sequence.
    # strand::    '-' when the query is on the minus strand; anything
    #             else is treated as plus.
    # blksize::   length of the block.
    # qstart::    0-based start of the block on the query.
    # tstart::    0-based start of the block on the target.
    # qseq::      query sequence of the block, or nil.
    # tseq::      target sequence of the block, or nil.
    def initialize(query_len, strand,
                   blksize, qstart, tstart, qseq, tseq)
      @blocksize = blksize
      @qseq = qseq
      # The target sequence arrives as +tseq+. (Assigning from +hseq+
      # here would call the reader below and always store nil.)
      @hseq = tseq
      @hit_strand = 'plus'
      # To keep compatibility with other homology search programs,
      # we add 1 to each position number.
      @hit_from = tstart + 1
      @hit_to = tstart + blksize # - 1 + 1
      case strand
      when '-'
        # query is minus strand
        @query_strand = 'minus'
        # convert positions
        @query_from = query_len - qstart
        @query_to = query_len - qstart - blksize + 1
      else #when '+'
        @query_strand = 'plus'
        @query_from = qstart + 1
        @query_to = qstart + blksize # - 1 + 1
      end
    end

    # Returns query start position.
    # CAUTION: In Blat's raw result(psl format), first position is 0.
    # To keep compatibility, the parser adds 1 to the position.
    attr_reader :query_from

    # Returns query end position.
    # CAUTION: In Blat's raw result(psl format), first position is 0.
    # To keep compatibility, the parser adds 1 to the position.
    attr_reader :query_to

    # Returns query sequence.
    # If sequence data is not available, returns nil.
    attr_reader :qseq

    # Returns strand information of the query.
    # Returns 'plus' or 'minus'.
    attr_reader :query_strand

    # Returns target (subject, hit) start position.
    # CAUTION: In Blat's raw result(psl format), first position is 0.
    # To keep compatibility, the parser adds 1 to the position.
    attr_reader :hit_from

    # Returns target (subject, hit) end position.
    # CAUTION: In Blat's raw result(psl format), first position is 0.
    # To keep compatibility, the parser adds 1 to the position.
    attr_reader :hit_to

    # Returns the target (subject, hit) sequence.
    # If sequence data is not available, returns nil.
    attr_reader :hseq

    # Returns strand information of the target (subject, hit).
    # Always 'plus' in this implementation.
    attr_reader :hit_strand

    # Returns block size (length) of the segment pair.
    # This would be a Bio::Blat specific method.
    attr_reader :blocksize

    # Returns alignment length of the segment pair.
    # Returns nil if no alignment data are available.
    def align_len
      @qseq ? @qseq.size : nil
    end
  end #class SegmentPair

  # Hit class for the BLAT result parser.
  # Similar to Bio::Blast::Report::Hit but lacks many methods.
  # Its object may contain some Bio::Blat::Report::SegmentPair objects.
  class Hit
    # Creates a new Hit object from a piece of BLAT result text.
    # It is designed to be called internally from Bio::Blat::Report object.
    # Users shall not use it directly.
    def initialize(str)
      @data = str.chomp.split(/\t/)
    end

    # Raw data of the hit: the tab-separated columns of the line.
    # (Note that it doesn't add 1 to position numbers.)
    attr_reader :data

    # Splits comma-separated text into an array of String.
    # Trailing commas are dropped first; nil gives an empty array.
    def split_comma(str)
      str.to_s.sub(/\s*\,+\s*\z/, '').split(/\s*\,\s*/)
    end
    private :split_comma

    # Returns sequence information of the query.
    # Returns a Bio::Blat::Report::SeqDesc object.
    # This would be Bio::Blat specific method.
    def query
      unless defined?(@query)
        d = @data
        @query = SeqDesc.new(d[4], d[5], d[9], d[10], d[11], d[12],
                             split_comma(d[19]), split_comma(d[21]))
      end
      @query
    end

    # Returns sequence information of the target(hit).
    # Returns a Bio::Blat::Report::SeqDesc object.
    # This would be Bio::Blat specific method.
    def target
      unless defined?(@target)
        d = @data
        @target = SeqDesc.new(d[6], d[7], d[13], d[14], d[15], d[16],
                              split_comma(d[20]), split_comma(d[22]))
      end
      @target
    end

    # Match nucleotides.
    def match;     @data[0].to_i; end
    # Mismatch nucleotides.
    def mismatch;  @data[1].to_i; end
    # rep. match (???)
    def rep_match; @data[2].to_i; end
    # N's (???)
    def n_s;       @data[3].to_i; end

    # Returns strand information of the hit.
    # Returns '+' or '-'.
    # This would be a Bio::Blat specific method.
    def strand; @data[8]; end

    # Number of blocks(exons, segment pairs).
    def block_count; @data[17].to_i; end

    # Sizes of all blocks(exons, segment pairs).
    # Returns an array of numbers.
    def block_sizes
      unless defined?(@block_sizes) then
        @block_sizes = split_comma(@data[18]).collect { |x| x.to_i }
      end
      @block_sizes
    end

    # Returns blocks(exons, segment pairs) of the hit.
    # Returns an array of Bio::Blat::Report::SegmentPair objects.
    def blocks
      unless defined?(@blocks)
        bs    = block_sizes
        qst   = query.starts
        tst   = target.starts
        qseqs = query.seqs
        tseqs = target.seqs
        @blocks = (0...block_count).collect do |i|
          SegmentPair.new(query.size, strand, bs[i],
                          qst[i], tst[i], qseqs[i], tseqs[i])
        end
      end
      @blocks
    end
    alias exons blocks

    #--
    # Bio::BLAST::*::Report::Hit compatible methods
    #++
    alias hsps blocks

    # Returns the length of query sequence.
    def query_len; query.size; end

    # Returns the name of query sequence.
    def query_def; query.name; end
    alias query_id query_def

    # Returns the length of the target(subject) sequence.
    def target_len; target.size; end
    alias len target_len

    # Returns the name of the target(subject) sequence.
    def target_def; target.name; end
    alias target_id target_def
    alias definition target_def

    # Iterates over each block(exon, segment pair) of the hit.
    # Yields a Bio::Blat::Report::SegmentPair object.
    def each(&x) #:yields: segmentpair
      exons.each(&x)
    end
  end #class Hit

  #--
  #Bio::BLAST::*::Report compatible methods
  #++

  # Returns number of hits.
  # Same as hits.size.
  def num_hits; @hits.size; end

  # Iterates over each Bio::Blat::Report::Hit object.
  # Same as hits.each.
  def each_hit(&x) #:yields: hit
    @hits.each(&x)
  end
  alias each each_hit

  # Returns the name of query sequence.
  # CAUTION: query_* methods simply return first hit's query_*.
  # If multiple query sequences are given, these values
  # will be incorrect.
  # Returns nil when there are no hits.
  def query_def; (x = @hits.first) ? x.query_def : nil; end

  # Returns the length of query sequence.
  # CAUTION: query_* methods simply return first hit's query_*.
  # If multiple query sequences are given, these values
  # will be incorrect.
  # Returns nil when there are no hits.
  def query_len; (x = @hits.first) ? x.query_len : nil; end
  alias query_id query_def
end #class Report
|
|
376
|
+
|
|
377
|
+
end #class Blat
|
|
378
|
+
end #module Bio
|
|
379
|
+
|
|
380
|
+
=begin
|
|
381
|
+
|
|
382
|
+
= Bio::Blat::Report
|
|
383
|
+
|
|
384
|
+
BLAT result parser. (psl / pslx format)
|
|
385
|
+
|
|
386
|
+
= References
|
|
387
|
+
|
|
388
|
+
* ((<URL:http://www.genome.org/cgi/content/abstract/12/4/656>))
|
|
389
|
+
Kent, W.J., BLAT--the BLAST-like alignment tool,
|
|
390
|
+
Genome Research, 12, 656--664, 2002.
|
|
391
|
+
|
|
392
|
+
=end
|