bio 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +107 -0
- data/bin/br_biofetch.rb +59 -0
- data/bin/br_bioflat.rb +294 -0
- data/bin/br_biogetseq.rb +57 -0
- data/bin/br_pmfetch.rb +431 -0
- data/doc/BioRuby.rd.ja +225 -0
- data/doc/Changes-0.7.rd +236 -0
- data/doc/Design.rd.ja +341 -0
- data/doc/KEGG_API.rd +1437 -0
- data/doc/KEGG_API.rd.ja +1399 -0
- data/doc/TODO.rd.ja +138 -0
- data/doc/Tutorial.rd +1138 -0
- data/doc/Tutorial.rd.ja +2110 -0
- data/etc/bioinformatics/seqdatabase.ini +210 -0
- data/lib/bio.rb +256 -0
- data/lib/bio/alignment.rb +1906 -0
- data/lib/bio/appl/bl2seq/report.rb +350 -0
- data/lib/bio/appl/blast.rb +269 -0
- data/lib/bio/appl/blast/format0.rb +1402 -0
- data/lib/bio/appl/blast/format8.rb +95 -0
- data/lib/bio/appl/blast/report.rb +652 -0
- data/lib/bio/appl/blast/rexml.rb +151 -0
- data/lib/bio/appl/blast/wublast.rb +553 -0
- data/lib/bio/appl/blast/xmlparser.rb +222 -0
- data/lib/bio/appl/blat/report.rb +392 -0
- data/lib/bio/appl/clustalw.rb +191 -0
- data/lib/bio/appl/clustalw/report.rb +154 -0
- data/lib/bio/appl/emboss.rb +68 -0
- data/lib/bio/appl/fasta.rb +262 -0
- data/lib/bio/appl/fasta/format10.rb +428 -0
- data/lib/bio/appl/fasta/format6.rb +37 -0
- data/lib/bio/appl/genscan/report.rb +570 -0
- data/lib/bio/appl/hmmer.rb +129 -0
- data/lib/bio/appl/hmmer/report.rb +556 -0
- data/lib/bio/appl/mafft.rb +222 -0
- data/lib/bio/appl/mafft/report.rb +119 -0
- data/lib/bio/appl/psort.rb +555 -0
- data/lib/bio/appl/psort/report.rb +473 -0
- data/lib/bio/appl/sim4.rb +134 -0
- data/lib/bio/appl/sim4/report.rb +501 -0
- data/lib/bio/appl/sosui/report.rb +166 -0
- data/lib/bio/appl/spidey/report.rb +604 -0
- data/lib/bio/appl/targetp/report.rb +283 -0
- data/lib/bio/appl/tmhmm/report.rb +238 -0
- data/lib/bio/command.rb +166 -0
- data/lib/bio/data/aa.rb +354 -0
- data/lib/bio/data/codontable.rb +740 -0
- data/lib/bio/data/na.rb +226 -0
- data/lib/bio/db.rb +340 -0
- data/lib/bio/db/aaindex.rb +280 -0
- data/lib/bio/db/embl/common.rb +332 -0
- data/lib/bio/db/embl/embl.rb +446 -0
- data/lib/bio/db/embl/sptr.rb +954 -0
- data/lib/bio/db/embl/swissprot.rb +32 -0
- data/lib/bio/db/embl/trembl.rb +31 -0
- data/lib/bio/db/embl/uniprot.rb +32 -0
- data/lib/bio/db/fantom.rb +604 -0
- data/lib/bio/db/fasta.rb +869 -0
- data/lib/bio/db/genbank/common.rb +299 -0
- data/lib/bio/db/genbank/ddbj.rb +34 -0
- data/lib/bio/db/genbank/genbank.rb +354 -0
- data/lib/bio/db/genbank/genpept.rb +73 -0
- data/lib/bio/db/genbank/refseq.rb +31 -0
- data/lib/bio/db/gff.rb +106 -0
- data/lib/bio/db/go.rb +497 -0
- data/lib/bio/db/kegg/brite.rb +51 -0
- data/lib/bio/db/kegg/cell.rb +88 -0
- data/lib/bio/db/kegg/compound.rb +130 -0
- data/lib/bio/db/kegg/enzyme.rb +125 -0
- data/lib/bio/db/kegg/expression.rb +173 -0
- data/lib/bio/db/kegg/genes.rb +293 -0
- data/lib/bio/db/kegg/genome.rb +362 -0
- data/lib/bio/db/kegg/glycan.rb +213 -0
- data/lib/bio/db/kegg/keggtab.rb +418 -0
- data/lib/bio/db/kegg/kgml.rb +299 -0
- data/lib/bio/db/kegg/ko.rb +178 -0
- data/lib/bio/db/kegg/reaction.rb +97 -0
- data/lib/bio/db/litdb.rb +131 -0
- data/lib/bio/db/medline.rb +317 -0
- data/lib/bio/db/nbrf.rb +199 -0
- data/lib/bio/db/pdb.rb +38 -0
- data/lib/bio/db/pdb/atom.rb +60 -0
- data/lib/bio/db/pdb/chain.rb +117 -0
- data/lib/bio/db/pdb/model.rb +106 -0
- data/lib/bio/db/pdb/pdb.rb +1682 -0
- data/lib/bio/db/pdb/residue.rb +122 -0
- data/lib/bio/db/pdb/utils.rb +234 -0
- data/lib/bio/db/prosite.rb +616 -0
- data/lib/bio/db/rebase.rb +417 -0
- data/lib/bio/db/transfac.rb +387 -0
- data/lib/bio/feature.rb +201 -0
- data/lib/bio/io/brdb.rb +103 -0
- data/lib/bio/io/das.rb +471 -0
- data/lib/bio/io/dbget.rb +212 -0
- data/lib/bio/io/ddbjxml.rb +614 -0
- data/lib/bio/io/fastacmd.rb +123 -0
- data/lib/bio/io/fetch.rb +114 -0
- data/lib/bio/io/flatfile.rb +496 -0
- data/lib/bio/io/flatfile/bdb.rb +266 -0
- data/lib/bio/io/flatfile/index.rb +1308 -0
- data/lib/bio/io/flatfile/indexer.rb +778 -0
- data/lib/bio/io/higet.rb +92 -0
- data/lib/bio/io/keggapi.rb +863 -0
- data/lib/bio/io/pubmed.rb +189 -0
- data/lib/bio/io/registry.rb +308 -0
- data/lib/bio/io/soapwsdl.rb +114 -0
- data/lib/bio/io/sql.rb +428 -0
- data/lib/bio/location.rb +650 -0
- data/lib/bio/pathway.rb +991 -0
- data/lib/bio/reference.rb +308 -0
- data/lib/bio/sequence.rb +593 -0
- data/lib/bio/shell.rb +51 -0
- data/lib/bio/shell/core.rb +512 -0
- data/lib/bio/shell/plugin/codon.rb +228 -0
- data/lib/bio/shell/plugin/entry.rb +85 -0
- data/lib/bio/shell/plugin/flatfile.rb +119 -0
- data/lib/bio/shell/plugin/keggapi.rb +187 -0
- data/lib/bio/shell/plugin/midi.rb +448 -0
- data/lib/bio/shell/plugin/obda.rb +63 -0
- data/lib/bio/shell/plugin/seq.rb +238 -0
- data/lib/bio/shell/session.rb +214 -0
- data/lib/bio/util/color_scheme.rb +214 -0
- data/lib/bio/util/color_scheme/buried.rb +78 -0
- data/lib/bio/util/color_scheme/helix.rb +78 -0
- data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
- data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
- data/lib/bio/util/color_scheme/strand.rb +78 -0
- data/lib/bio/util/color_scheme/taylor.rb +69 -0
- data/lib/bio/util/color_scheme/turn.rb +78 -0
- data/lib/bio/util/color_scheme/zappo.rb +69 -0
- data/lib/bio/util/contingency_table.rb +337 -0
- data/lib/bio/util/sirna.rb +306 -0
- data/lib/bioruby.rb +34 -0
- data/sample/biofetch.rb +475 -0
- data/sample/color_scheme_na.rb +99 -0
- data/sample/dbget +37 -0
- data/sample/fasta2tab.rb +99 -0
- data/sample/fsplit.rb +51 -0
- data/sample/gb2fasta.rb +31 -0
- data/sample/gb2tab.rb +325 -0
- data/sample/gbtab2mysql.rb +161 -0
- data/sample/genes2nuc.rb +33 -0
- data/sample/genes2pep.rb +33 -0
- data/sample/genes2tab.rb +81 -0
- data/sample/genome2rb.rb +29 -0
- data/sample/genome2tab.rb +76 -0
- data/sample/goslim.rb +311 -0
- data/sample/gt2fasta.rb +47 -0
- data/sample/pmfetch.rb +42 -0
- data/sample/pmsearch.rb +42 -0
- data/sample/psortplot_html.rb +222 -0
- data/sample/ssearch2tab.rb +96 -0
- data/sample/tdiary.rb +158 -0
- data/sample/tfastx2tab.rb +100 -0
- data/sample/vs-genes.rb +212 -0
- data/test/data/SOSUI/sample.report +11 -0
- data/test/data/TMHMM/sample.report +21 -0
- data/test/data/blast/eco:b0002.faa +15 -0
- data/test/data/blast/eco:b0002.faa.m0 +128 -0
- data/test/data/blast/eco:b0002.faa.m7 +65 -0
- data/test/data/blast/eco:b0002.faa.m8 +1 -0
- data/test/data/embl/AB090716.embl +65 -0
- data/test/data/genscan/sample.report +63 -0
- data/test/data/prosite/prosite.dat +2233 -0
- data/test/data/refseq/nm_126355.entret +64 -0
- data/test/data/uniprot/p53_human.uniprot +1456 -0
- data/test/runner.rb +10 -0
- data/test/unit/bio/appl/blast/test_report.rb +427 -0
- data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
- data/test/unit/bio/appl/genscan/test_report.rb +195 -0
- data/test/unit/bio/appl/sosui/test_report.rb +94 -0
- data/test/unit/bio/appl/targetp/test_report.rb +159 -0
- data/test/unit/bio/appl/test_blast.rb +159 -0
- data/test/unit/bio/appl/test_fasta.rb +142 -0
- data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
- data/test/unit/bio/data/test_aa.rb +103 -0
- data/test/unit/bio/data/test_codontable.rb +120 -0
- data/test/unit/bio/data/test_na.rb +89 -0
- data/test/unit/bio/db/embl/test_common.rb +130 -0
- data/test/unit/bio/db/embl/test_embl.rb +227 -0
- data/test/unit/bio/db/embl/test_sptr.rb +268 -0
- data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
- data/test/unit/bio/db/kegg/test_genes.rb +58 -0
- data/test/unit/bio/db/test_fasta.rb +263 -0
- data/test/unit/bio/db/test_gff.rb +140 -0
- data/test/unit/bio/db/test_prosite.rb +1450 -0
- data/test/unit/bio/io/test_ddbjxml.rb +87 -0
- data/test/unit/bio/io/test_soapwsdl.rb +45 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
- data/test/unit/bio/test_alignment.rb +1028 -0
- data/test/unit/bio/test_command.rb +71 -0
- data/test/unit/bio/test_db.rb +109 -0
- data/test/unit/bio/test_feature.rb +128 -0
- data/test/unit/bio/test_location.rb +51 -0
- data/test/unit/bio/test_pathway.rb +485 -0
- data/test/unit/bio/test_sequence.rb +386 -0
- data/test/unit/bio/test_shell.rb +31 -0
- data/test/unit/bio/util/test_color_scheme.rb +45 -0
- data/test/unit/bio/util/test_contingency_table.rb +106 -0
- data/test/unit/bio/util/test_sirna.rb +258 -0
- metadata +295 -0
data/lib/bio/db/fasta.rb
ADDED
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bio/db/fasta.rb - FASTA format class
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2001 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
|
|
5
|
+
# Copyright (C) 2001, 2002 KATAYAMA Toshiaki <k@bioruby.org>
|
|
6
|
+
#
|
|
7
|
+
# This library is free software; you can redistribute it and/or
|
|
8
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
9
|
+
# License as published by the Free Software Foundation; either
|
|
10
|
+
# version 2 of the License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This library is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
15
|
+
# Lesser General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
18
|
+
# License along with this library; if not, write to the Free Software
|
|
19
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
20
|
+
#
|
|
21
|
+
# $Id: fasta.rb,v 1.21 2005/09/26 13:00:06 k Exp $
|
|
22
|
+
#
|
|
23
|
+
|
|
24
|
+
require 'bio/db'
|
|
25
|
+
require 'bio/sequence'
|
|
26
|
+
|
|
27
|
+
module Bio
|
|
28
|
+
|
|
29
|
+
# Holds one FASTA formatted entry.
#
# The first line of the given text becomes the definition (a leading '>'
# is dropped); everything after the first line is kept as raw sequence
# data.  Sequence and identifier information are extracted lazily.
class FastaFormat < DB

  # Separator between two consecutive entries in a FASTA file.
  DELIMITER = RS = "\n>"

  # str:: text of a FASTA entry.  When the text contains further entries,
  #       everything from the next line beginning with '>' is removed
  #       from the data and kept in #entry_overrun (nil when nothing
  #       was removed).
  def initialize(str)
    @definition = str[/.*/].sub(/^>/, '').strip  # 1st line
    @data = str.sub(/.*/, '')                    # rests
    @data.sub!(/^>.*/m, '')  # remove trailing entries for sure
    @entry_overrun = $&      # nil when sub! above did not match
  end
  attr_accessor :definition, :data
  attr_reader :entry_overrun

  # Rebuilds the entry as FASTA text: '>' + definition, a newline, the
  # stripped data and a final newline.  The text is also stored in @entry.
  def entry
    @entry = ">#{@definition}\n#{@data.strip}\n"
  end
  alias to_s entry

  # Passes this entry (as FASTA text) to factory.query and returns the
  # result.
  #
  # The text is obtained through #entry instead of reading @entry
  # directly: @entry is only assigned inside #entry, so reading it here
  # handed nil to the factory whenever #entry had not been called before.
  def query(factory)
    factory.query(entry)
  end
  alias fasta query
  alias blast query

  # Returns the sequence as a Sequence object (built once, then cached).
  #
  # Blanks, tabs, line breaks and digits are removed from the data.
  # When the data begins with a line starting with '#', such lines are
  # collected as comments (see #comment) instead of being treated as
  # sequence.
  def seq
    unless defined?(@seq)
      unless /\A\s*^\#/ =~ @data then
        @seq = Sequence.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
      else
        a = @data.split(/(^\#.*$)/)
        i = 0       # number of sequence characters seen so far
        cmnt = {}   # offset in the sequence => comment text
        s = []
        a.each do |x|
          if /^# ?(.*)$/ =~ x then
            # comments found at the same offset are joined by newlines
            cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1
          else
            x.tr!(" \t\r\n0-9", '') # lazy clean up
            i += x.length
            s << x
          end
        end
        @comment = cmnt
        @seq = Bio::Sequence.new(s.join(''))
      end
    end
    @seq
  end

  # Returns the comments found by #seq as a Hash (sequence offset =>
  # comment text), or nil when #seq did not take the comment branch.
  def comment
    seq
    @comment
  end

  # Length of the cleaned-up sequence.
  def length
    seq.length
  end

  # The sequence wrapped in Sequence::NA.
  def naseq
    Sequence::NA.new(seq)
  end

  # Length of #naseq.
  def nalen
    self.naseq.length
  end

  # The sequence wrapped in Sequence::AA.
  def aaseq
    Sequence::AA.new(seq)
  end

  # Length of #aaseq.
  def aalen
    self.aaseq.length
  end

  # Returns a FastaDefline built from the definition line (created on
  # first use and cached).
  def identifiers
    unless defined?(@ids) then
      @ids = FastaDefline.new(@definition)
    end
    @ids
  end

  # The following methods simply delegate to #identifiers.

  def entry_id
    identifiers.entry_id
  end

  def gi
    identifiers.gi
  end

  def accession
    identifiers.accession
  end

  def accessions
    identifiers.accessions
  end

  def acc_version
    identifiers.acc_version
  end

  def locus
    identifiers.locus
  end

end #class FastaFormat
|
|
135
|
+
|
|
136
|
+
# FASTA-like entry whose body is a whitespace separated list of
# integers instead of a sequence.
class FastaNumericFormat < FastaFormat

  # Sequence oriented methods inherited from FastaFormat are removed.
  undef query, blast, fasta, seq, naseq, nalen, aaseq, aalen

  # Returns the numbers of the entry as an Array of integers.  The raw
  # text is parsed on the first call and the resulting list is reused
  # afterwards.
  def data
    @list ||= @data.strip.split(/\s+/).map { |token| token.to_i }
  end

  # Number of values in the list.
  def length
    data.length
  end

  # Yields each value of the list in order.
  def each
    data.each { |value| yield value }
  end

  # Returns the n-th value of the list.
  def [](n)
    data[n]
  end

end #class FastaNumericFormat
|
|
162
|
+
|
|
163
|
+
# Parses a FASTA definition line and extracts the identifiers and the
# description it contains.
#
# Three kinds of definition lines are recognized: NSIDs ('|' separated
# identifiers such as "gi|9910844|sp|Q9UWG2|RL3_METVA"), ':' separated
# identifiers (such as "sce:YBR160W") and plain identifiers.  Several
# definitions joined by "\x01" are parsed one by one.
class FastaDefline

  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

  # database tag => names of the fields following the tag
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                      # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],      # GenBank
    'emb' => [ 'acc_version', 'locus' ],      # EMBL
    'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],       # PDB
    'bbs' => [ 'number' ],                  # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                # Local Sequence identifier

    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],   # PIR
    'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents

    # WU-BLAST only
    'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],      # GenPept
    'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # str:: definition line(s); multiple definitions are separated
  #       by "\x01".
  def initialize(str)
    @deflines = []   # one parsed record per definition
    @info = {}       # cache of looked-up values ('ec', 'organism', ...)
    @list_ids = []   # every identifier found, as [ tag, value, ... ]

    @entry_id = nil

    lines = str.split("\x01")
    lines.each do |line|
      add_defline(line)
    end
  end #def initialize

  attr_reader :list_ids
  attr_reader :entry_id

  # Parses one definition and appends the result to the stored
  # definitions.  The identifier of the first definition added becomes
  # #entry_id.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      i = $1
      d = $2
      tks = i.split('|')
      tks << '' if i[-1,1] == '|'
      a = parse_NSIDs(tks)
      # When nothing could be parsed (e.g. ">|foo"), keep the raw
      # identifier text instead of failing on a[0] being nil.
      i = a[0].join('|') unless a.empty?
      a.unshift('|')
      # tokens left over by parse_NSIDs belong to the description
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /\A[A-Z]/ then
          di = [ x ]
          @list_ids << di
          @info['organism'] = x unless @info['organism']
        end
      end

    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      # >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a , d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /:/ then
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
          @list_ids << [ $1 ]
        end
      end

    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      # >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end

    @deflines << this_line
    @entry_id = i unless @entry_id
  end

  # Looks for EC numbers written as "EC:n.n.n.n" (each part being digits
  # and/or '-') in str.  Returns [ 'EC', number ] for the last one found,
  # or nil.  Unless write_flag is false, every number found is added to
  # the identifier list and @info['ec'] is set when it is still empty or
  # holds a number containing '-'.
  def match_EC(str, write_flag = true)
    di = nil
    # (?: ) are non-capturing groups; the only capture is the EC number.
    str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |x|
      di = [ 'EC', $1 ]
      if write_flag then
        @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns the texts enclosed in square brackets in str.
  def parse_square_brackets(str)
    r = []
    str.scan(/\[([^\]]*)\]/) do |x|
      r << x[0]
    end
    r
  end
  private :parse_square_brackets

  # Splits "db:id" at the first ':', records the pair in the identifier
  # list and returns it.
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Consumes '|' separated tokens from ary and groups them according to
  # NSIDs.  Returns the list of [ tag, field, ... ] groups, which are
  # also added to the identifier list.  Tokens remaining in ary after
  # the call were not recognized as part of an identifier.
  def parse_NSIDs(ary)
    # this method destroys ary
    data = []
    while token = ary.shift
      if labels = self.class::NSIDs[token] then
        di = [ token ]
        idtype = token
        labels.each do |x|
          token = ary.shift
          break unless token
          if self.class::NSIDs[token] then
            # the next identifier starts here; leave it for the while loop
            ary.unshift(token)
            break #each
          end
          if token.length > 0 then
            di << token
          else
            di << nil
          end
        end
        data << di
      else
        if token.length > 0 then
          # UCID (uncontrolled identifiers)
          di = [ token ]
          data << di
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
    end #while
    @list_ids.concat data
    data
  end #def parse_NSIDs
  private :parse_NSIDs

  # Rebuilds the definition line(s); definitions are joined by "\x01".
  def to_s
    @deflines.collect { |a|
      s = a[0]
      (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Description of the first definition.
  def description
    @deflines[0].to_a[-1]
  end

  # Descriptions of all definitions.
  def descriptions
    @deflines.collect do |a|
      a[-1]
    end
  end

  # Returns the identifier strings found in the definitions, followed
  # by words of the descriptions which look like identifiers.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2 then
        r.concat a[1..-1].find_all { |x| x }
      else
        if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
    end
    r.concat( words(true, []).find_all do |x|
                x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
                  x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
              end)
    r
  end

  # Words dropped by #words (compared in lower case).
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Words matching one of these patterns are dropped by #words.
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Returns the sorted, unique words of the descriptions.
  #
  # case_sensitive:: words are converted to lower case unless true
  # kill_regexp::    words matching any of these regexps are dropped
  # kwhash::         words whose lower case form is a key are dropped
  #
  # Words of one character or less are always dropped.
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1 then
        nil
      elsif kwhash[x.downcase] then
        nil
      else
        if kill_regexp.find { |expr| expr =~ x } then
          nil
        else
          x
        end
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns the identifier for the database tag db (String or Symbol),
  # or nil.  Values are cached in @info once found.
  def get(db)
    db = db.to_s
    r = nil
    unless r = @info[db] then
      di = @list_ids.find { |x| x[0] == db.to_s }
      if di and di.size <= 2 then
        r = di[-1]
      elsif di then
        # several fields: pick the most ID-like field which is present
        labels = self.class::NSIDs[db]
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number'].each do |x|
          if i = labels.index(x) then
            r = di[i+1]
            break if r
          end
        end
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
    end
    r
  end

  # Returns the field named tstr (e.g. 'locus') of the first identifier
  # whose database defines such a field, or nil.
  def get_by_type(tstr)
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        if i = labels.index(tstr) then
          return x[i+1]
        end
      end
    end
    nil
  end

  # Returns all non-nil fields having one of the given names.
  def get_all_by_type(*tstrarg)
    d = []
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        tstrarg.each do |y|
          if i = labels.index(y) then
            d << x[i+1] if x[i+1]
          end
        end
      end
    end
    d
  end

  # Locus name, or nil.
  def locus
    unless defined?(@locus)
      @locus = get_by_type('locus')
    end
    @locus
  end

  # GI number (as a string), or nil.
  def gi
    unless defined?(@gi) then
      @gi = get_by_type('gi')
    end
    @gi
  end

  # Accession with version number, or nil.
  def acc_version
    unless defined?(@acc_version) then
      @acc_version = get_by_type('acc_version')
    end
    @acc_version
  end

  # All accession numbers, with version suffixes removed.
  def accessions
    unless defined?(@accessions) then
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # First accession number without version, or nil.
  def accession
    unless defined?(@accession) then
      if acc_version then
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Unknown methods are treated as database tags: defline.sp is the
  # same as defline.get('sp').  A RuntimeError is raised when nothing is
  # found and the name is not a tag listed in NSIDs.
  def method_missing(name, *args)
    # raise ArgumentError,
    # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r and !(self.class::NSIDs[name.to_s]) then
      raise "NameError: undefined method `#{name.inspect}'"
    end
    r
  end

  # Tells Ruby which names method_missing answers without raising.
  # Without this, implicit conversion probes (to_ary from Array#flatten
  # or puts, to_str, ...) reach method_missing and fail with its
  # RuntimeError instead of being silently skipped.
  def respond_to_missing?(name, include_private = false)
    key = name.to_s
    # @info / @list_ids may be unset on an object not built by initialize
    known = (@info and @list_ids) ? (get(key) || self.class::NSIDs[key]) : nil
    known ? true : super
  end

end #class FastaDefline
|
|
528
|
+
|
|
529
|
+
end #module Bio
|
|
530
|
+
|
|
531
|
+
# Demonstration run when this file is executed directly: parses a sample
# FASTA entry and a sample numeric entry and prints what each accessor
# returns.
if __FILE__ == $0

  f_str = <<END
>sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
>sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
KTGDPLEWRRLFKKISTICRDIILIPN
END

  # Only the first of the two entries above is used by FastaFormat.new.
  f = Bio::FastaFormat.new(f_str)
  puts "### FastaFormat"
  puts "# entry"
  puts f.entry
  puts "# entry_id"
  p f.entry_id
  puts "# definition"
  p f.definition
  puts "# data"
  p f.data
  puts "# seq"
  p f.seq
  # Object#type no longer exists in Ruby 1.9 and later; #class reports
  # the same information on every Ruby version.
  puts "# seq.class"
  p f.seq.class
  puts "# length"
  p f.length
  puts "# aaseq"
  p f.aaseq
  puts "# aaseq.class"
  p f.aaseq.class
  puts "# aaseq.composition"
  p f.aaseq.composition
  puts "# aalen"
  p f.aalen

  puts

  n_str = <<END
>CRA3575282.F
24 15 23 29 20 13 20 21 21 23 22 25 13 22 17 15 25 27 32 26
32 29 29 25
END

  n = Bio::FastaNumericFormat.new(n_str)
  puts "### FastaNumericFormat"
  puts "# entry"
  puts n.entry
  puts "# entry_id"
  p n.entry_id
  puts "# definition"
  p n.definition
  puts "# data"
  p n.data
  puts "# length"
  p n.length
  puts "# percent to ratio by yield"
  n.each do |x|
    p x/100.0
  end
  puts "# first three"
  p n[0]
  p n[1]
  p n[2]
  puts "# last one"
  p n[-1]

end
|
|
612
|
+
|
|
613
|
+
=begin
|
|
614
|
+
|
|
615
|
+
= Bio::FastaFormat
|
|
616
|
+
|
|
617
|
+
Treats a FASTA formatted entry, such as:
|
|
618
|
+
|
|
619
|
+
>id and/or some comments <== comment line
|
|
620
|
+
ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
|
|
621
|
+
ATGCATGCATGCATGCATGCATGCATGCATGCATGC
|
|
622
|
+
ATGCATGCATGC
|
|
623
|
+
|
|
624
|
+
The leading '>' can be omitted and the trailing '>' will be removed
|
|
625
|
+
automatically.
|
|
626
|
+
|
|
627
|
+
--- Bio::FastaFormat.new(entry)
|
|
628
|
+
|
|
629
|
+
Stores the comment and sequence information from one entry of the
|
|
630
|
+
FASTA format string. If the argument contains more than one
|
|
631
|
+
entry, only the first entry is used.
|
|
632
|
+
|
|
633
|
+
--- Bio::FastaFormat#entry
|
|
634
|
+
|
|
635
|
+
Returns the stored one entry as a FASTA format. (same as to_s)
|
|
636
|
+
|
|
637
|
+
--- Bio::FastaFormat#definition
|
|
638
|
+
|
|
639
|
+
Returns the comment line of the FASTA formatted data.
|
|
640
|
+
|
|
641
|
+
--- Bio::FastaFormat#seq
|
|
642
|
+
|
|
643
|
+
Returns a joined sequence line as a String.
|
|
644
|
+
|
|
645
|
+
--- Bio::FastaFormat#query(factory)
|
|
646
|
+
--- Bio::FastaFormat#fasta(factory)
|
|
647
|
+
--- Bio::FastaFormat#blast(factory)
|
|
648
|
+
|
|
649
|
+
Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast
|
|
650
|
+
factory object.
|
|
651
|
+
|
|
652
|
+
#!/usr/bin/env ruby
|
|
653
|
+
|
|
654
|
+
require 'bio'
|
|
655
|
+
|
|
656
|
+
factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
|
|
657
|
+
flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
|
|
658
|
+
flatfile.each do |entry|
|
|
659
|
+
p entry.definition
|
|
660
|
+
result = entry.fasta(factory)
|
|
661
|
+
result.each do |hit|
|
|
662
|
+
print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
|
|
663
|
+
p hit.lap_at
|
|
664
|
+
end
|
|
665
|
+
end
|
|
666
|
+
|
|
667
|
+
--- Bio::FastaFormat#length
|
|
668
|
+
|
|
669
|
+
Returns sequence length.
|
|
670
|
+
|
|
671
|
+
--- Bio::FastaFormat#naseq
|
|
672
|
+
--- Bio::FastaFormat#nalen
|
|
673
|
+
--- Bio::FastaFormat#aaseq
|
|
674
|
+
--- Bio::FastaFormat#aalen
|
|
675
|
+
|
|
676
|
+
If you know whether the sequence is NA or AA, use these methods.
|
|
677
|
+
'naseq' and 'aaseq' methods return the Bio::Sequence::NA or
|
|
678
|
+
Bio::Sequence::AA object respectively. 'nalen' and 'aalen' methods
|
|
679
|
+
return the length of them.
|
|
680
|
+
|
|
681
|
+
--- Bio::FastaFormat#identifiers
|
|
682
|
+
|
|
683
|
+
Parsing FASTA Defline, and extract IDs.
|
|
684
|
+
IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
|
685
|
+
or ":"-separated IDs.
|
|
686
|
+
It returns a Bio::FastaDefline instance.
|
|
687
|
+
|
|
688
|
+
--- Bio::FastaFormat#entry_id
|
|
689
|
+
|
|
690
|
+
Parsing FASTA Defline (using #identifiers method), and
|
|
691
|
+
shows a possibly unique identifier.
|
|
692
|
+
It returns a string.
|
|
693
|
+
|
|
694
|
+
--- Bio::FastaFormat#gi
|
|
695
|
+
--- Bio::FastaFormat#locus
|
|
696
|
+
--- Bio::FastaFormat#accession
|
|
697
|
+
--- Bio::FastaFormat#acc_version
|
|
698
|
+
|
|
699
|
+
Parsing FASTA Defline (using #identifiers method), and
|
|
700
|
+
shows GI/locus/accession/accession with version number.
|
|
701
|
+
If an entry has two or more such IDs,
|
|
702
|
+
only the first ID is shown.
|
|
703
|
+
It returns a string or nil.
|
|
704
|
+
|
|
705
|
+
--- Bio::FastaFormat#accessions
|
|
706
|
+
|
|
707
|
+
Parsing FASTA Defline (using #identifiers method), and
|
|
708
|
+
shows accession numbers.
|
|
709
|
+
It returns an array of strings.
|
|
710
|
+
|
|
711
|
+
--- Bio::FastaFormat
|
|
712
|
+
|
|
713
|
+
= Bio::FastaNumericFormat
|
|
714
|
+
|
|
715
|
+
Treats a FASTA formatted numerical entry, such as:
|
|
716
|
+
|
|
717
|
+
>id and/or some comments <== comment line
|
|
718
|
+
24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
|
|
719
|
+
22 17 15 25 27 32 26 32 29 29 25
|
|
720
|
+
|
|
721
|
+
The leading '>' can be omitted and the trailing '>' will be removed
|
|
722
|
+
automatically.
|
|
723
|
+
|
|
724
|
+
--- Bio::FastaNumericFormat.new(entry)
|
|
725
|
+
|
|
726
|
+
Stores the comment and the list of the numerical data.
|
|
727
|
+
|
|
728
|
+
--- Bio::FastaNumericFormat#definition
|
|
729
|
+
|
|
730
|
+
The comment line of the FASTA formatted data.
|
|
731
|
+
|
|
732
|
+
--- Bio::FastaNumericFormat#data
|
|
733
|
+
|
|
734
|
+
Returns the list of the numerical data (typically the quality score
|
|
735
|
+
of its corresponding sequence) as an Array.
|
|
736
|
+
|
|
737
|
+
--- Bio::FastaNumericFormat#length
|
|
738
|
+
|
|
739
|
+
Returns the number of elements in the numerical data.
|
|
740
|
+
|
|
741
|
+
--- Bio::FastaNumericFormat#each
|
|
742
|
+
|
|
743
|
+
Yields each element of the numerical data.
|
|
744
|
+
|
|
745
|
+
--- Bio::FastaNumericFormat#[](n)
|
|
746
|
+
|
|
747
|
+
Returns the n-th element.
|
|
748
|
+
|
|
749
|
+
--- Bio::FastaNumericFormat#identifiers
|
|
750
|
+
--- Bio::FastaNumericFormat#entry_id
|
|
751
|
+
--- Bio::FastaNumericFormat#gi
|
|
752
|
+
--- Bio::FastaNumericFormat#locus
|
|
753
|
+
--- Bio::FastaNumericFormat#accession
|
|
754
|
+
--- Bio::FastaNumericFormat#acc_version
|
|
755
|
+
--- Bio::FastaNumericFormat#accessions
|
|
756
|
+
|
|
757
|
+
Same as Bio::FastaFormat.
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
= Bio::FastaDefline
|
|
761
|
+
|
|
762
|
+
Parses a FASTA defline and extracts IDs and other information.
|
|
763
|
+
IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
|
764
|
+
or ":"-separated IDs.
|
|
765
|
+
|
|
766
|
+
--- see also:
|
|
767
|
+
ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
|
768
|
+
http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
|
769
|
+
|
|
770
|
+
--- Bio::FastaDefline.new(str)
|
|
771
|
+
|
|
772
|
+
Parses given string.
|
|
773
|
+
|
|
774
|
+
--- Bio::FastaDefline#entry_id
|
|
775
|
+
|
|
776
|
+
Shows a possibly unique identifier.
|
|
777
|
+
Returns a string.
|
|
778
|
+
|
|
779
|
+
--- Bio::FastaDefline#gi
|
|
780
|
+
--- Bio::FastaDefline#locus
|
|
781
|
+
--- Bio::FastaDefline#accession
|
|
782
|
+
--- Bio::FastaDefline#acc_version
|
|
783
|
+
|
|
784
|
+
Shows GI/locus/accession/accession with version number.
|
|
785
|
+
If the entry has two or more such IDs,
|
|
786
|
+
only the first ID is shown.
|
|
787
|
+
Returns a string or nil.
|
|
788
|
+
|
|
789
|
+
--- Bio::FastaDefline#accessions
|
|
790
|
+
|
|
791
|
+
Shows accession numbers.
|
|
792
|
+
Returns an array of strings.
|
|
793
|
+
|
|
794
|
+
--- Bio::FastaDefline#add_defline(str)
|
|
795
|
+
|
|
796
|
+
Parses given string and adds parsed data.
|
|
797
|
+
|
|
798
|
+
--- Bio::FastaDefline#to_s
|
|
799
|
+
|
|
800
|
+
Shows original string.
|
|
801
|
+
Note that the result of this method may be different from
|
|
802
|
+
the original string that was given to the FastaDefline.new method.
|
|
803
|
+
|
|
804
|
+
--- Bio::FastaDefline#id_strings
|
|
805
|
+
|
|
806
|
+
Shows ID-like strings.
|
|
807
|
+
Returns an array of strings.
|
|
808
|
+
|
|
809
|
+
--- Bio::FastaDefline#list_ids
|
|
810
|
+
|
|
811
|
+
Shows array that contains IDs (or ID-like strings).
|
|
812
|
+
Returns an array of arrays of strings.
|
|
813
|
+
|
|
814
|
+
--- Bio::FastaDefline#description
|
|
815
|
+
--- Bio::FastaDefline#descriptions
|
|
816
|
+
|
|
817
|
+
--- Bio::FastaDefline#words(case_sensitive = nil,
|
|
818
|
+
kill_words_regexp_array, kill_words_hash)
|
|
819
|
+
|
|
820
|
+
--- Bio::FastaDefline#get(tag_of_id)
|
|
821
|
+
|
|
822
|
+
--- Bio::FastaDefline#get_by_type(type_of_id)
|
|
823
|
+
|
|
824
|
+
--- Bio::FastaDefline#get_all_by_type(type_of_id)
|
|
825
|
+
|
|
826
|
+
--- examples:
|
|
827
|
+
rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
|
|
828
|
+
rub.entry_id ==> 'gi|671595'
|
|
829
|
+
rub.get('emb') ==> 'CAA85678.1'
|
|
830
|
+
rub.emb ==> 'CAA85678.1'
|
|
831
|
+
rub.gi ==> '671595'
|
|
832
|
+
rub.accession ==> 'CAA85678'
|
|
833
|
+
rub.accessions ==> [ 'CAA85678' ]
|
|
834
|
+
rub.acc_version ==> 'CAA85678.1'
|
|
835
|
+
rub.locus ==> nil
|
|
836
|
+
rub.list_ids ==> [["gi", "671595"],
|
|
837
|
+
["emb", "CAA85678.1", nil],
|
|
838
|
+
["Perovskia abrotanoides"]]
|
|
839
|
+
|
|
840
|
+
ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
|
841
|
+
ckr.entry_id ==> "gi|2495000"
|
|
842
|
+
ckr.sp ==> "CCKR_CAVPO"
|
|
843
|
+
ckr.pir ==> "I51898"
|
|
844
|
+
ckr.gb ==> "AAB29504.1"
|
|
845
|
+
ckr.gi ==> "2495000"
|
|
846
|
+
ckr.accession ==> "AAB29504"
|
|
847
|
+
ckr.accessions ==> ["Q63931", "AAB29504"]
|
|
848
|
+
ckr.acc_version ==> "AAB29504.1"
|
|
849
|
+
ckr.locus ==> nil
|
|
850
|
+
ckr.description ==>
|
|
851
|
+
"CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
|
852
|
+
ckr.descriptions ==>
|
|
853
|
+
["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
|
854
|
+
"cholecystokinin A receptor - guinea pig",
|
|
855
|
+
"cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
|
856
|
+
ckr.words ==>
|
|
857
|
+
["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
|
858
|
+
"receptor", "type"]
|
|
859
|
+
ckr.id_strings ==>
|
|
860
|
+
["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
|
861
|
+
"544724", "AAB29504.1", "Cavia"]
|
|
862
|
+
ckr.list_ids ==>
|
|
863
|
+
[["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
|
864
|
+
["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
|
865
|
+
["gb", "AAB29504.1", nil], ["Cavia"]]
|
|
866
|
+
|
|
867
|
+
=end
|
|
868
|
+
|
|
869
|
+
|