bio 1.2.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
data/lib/bio/db/embl/sptr.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id
|
7
|
+
# $Id:$
|
8
8
|
#
|
9
9
|
# == Description
|
10
10
|
#
|
@@ -241,7 +241,7 @@ class SPTR < EMBLDB
|
|
241
241
|
records = gn_line.split(/\s*and\s*/)
|
242
242
|
records.each do |record|
|
243
243
|
gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
|
244
|
-
record.
|
244
|
+
record.each_line(';') do |element|
|
245
245
|
case element
|
246
246
|
when /Name=/ then
|
247
247
|
gene_hash[:name] = $'[0..-2]
|
@@ -505,11 +505,10 @@ class SPTR < EMBLDB
|
|
505
505
|
else
|
506
506
|
hash['journal'] = value
|
507
507
|
end
|
508
|
-
when 'RX' # PUBMED, MEDLINE
|
509
|
-
value.
|
510
|
-
tag, xref = item.split(/; /).map {|i| i.strip }
|
508
|
+
when 'RX' # PUBMED, MEDLINE, DOI
|
509
|
+
value.each do |tag, xref|
|
511
510
|
hash[ tag.downcase ] = xref
|
512
|
-
|
511
|
+
end
|
513
512
|
end
|
514
513
|
}
|
515
514
|
Reference.new(hash)
|
@@ -678,17 +677,17 @@ class SPTR < EMBLDB
|
|
678
677
|
when 'COFACTOR'
|
679
678
|
return @data['CC'][topic]
|
680
679
|
when 'DEVELOPMENTAL STAGE'
|
681
|
-
return @data['CC'][topic].
|
680
|
+
return @data['CC'][topic].join('')
|
682
681
|
when 'DISEASE'
|
683
|
-
return @data['CC'][topic].
|
682
|
+
return @data['CC'][topic].join('')
|
684
683
|
when 'DOMAIN'
|
685
684
|
return @data['CC'][topic]
|
686
685
|
when 'ENZYME REGULATION'
|
687
|
-
return @data['CC'][topic].
|
686
|
+
return @data['CC'][topic].join('')
|
688
687
|
when 'FUNCTION'
|
689
|
-
return @data['CC'][topic].
|
688
|
+
return @data['CC'][topic].join('')
|
690
689
|
when 'INDUCTION'
|
691
|
-
return @data['CC'][topic].
|
690
|
+
return @data['CC'][topic].join('')
|
692
691
|
when 'INTERACTION'
|
693
692
|
return cc_interaction(@data['CC'][topic])
|
694
693
|
when 'MASS SPECTROMETRY'
|
@@ -749,7 +748,7 @@ class SPTR < EMBLDB
|
|
749
748
|
|
750
749
|
|
751
750
|
def cc_alternative_products(data)
|
752
|
-
ap = data.
|
751
|
+
ap = data.join('')
|
753
752
|
return ap unless ap
|
754
753
|
|
755
754
|
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
|
@@ -822,7 +821,7 @@ class SPTR < EMBLDB
|
|
822
821
|
|
823
822
|
|
824
823
|
def cc_caution(data)
|
825
|
-
data.
|
824
|
+
data.join('')
|
826
825
|
end
|
827
826
|
private :cc_caution
|
828
827
|
|
@@ -831,7 +830,7 @@ class SPTR < EMBLDB
|
|
831
830
|
#
|
832
831
|
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
833
832
|
def cc_interaction(data)
|
834
|
-
str = data.
|
833
|
+
str = data.join('')
|
835
834
|
it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
|
836
835
|
it.map {|ent|
|
837
836
|
ent.map! {|x| x.strip }
|
@@ -894,7 +893,7 @@ class SPTR < EMBLDB
|
|
894
893
|
|
895
894
|
|
896
895
|
def cc_rna_editing(data)
|
897
|
-
|
896
|
+
data = data.join('')
|
898
897
|
entry = {'Modified_positions' => [], 'Note' => ""}
|
899
898
|
if data =~ /Modified_positions=(.+?)(\.|;)/
|
900
899
|
entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
|
@@ -961,7 +960,7 @@ class SPTR < EMBLDB
|
|
961
960
|
unless key
|
962
961
|
embl_dr
|
963
962
|
else
|
964
|
-
embl_dr[key].map {|x|
|
963
|
+
(embl_dr[key] or []).map {|x|
|
965
964
|
{'Accession' => x[0],
|
966
965
|
'Version' => x[1],
|
967
966
|
' ' => x[2],
|
data/lib/bio/db/fantom.rb
CHANGED
@@ -4,13 +4,11 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2003 GOTO Naohisa <ng@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id
|
7
|
+
# $Id:$
|
8
8
|
#
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
rescue LoadError
|
13
|
-
end
|
10
|
+
require 'rexml/document'
|
11
|
+
require 'cgi'
|
14
12
|
require 'uri'
|
15
13
|
require 'net/http'
|
16
14
|
|
@@ -32,17 +30,17 @@ module Bio
|
|
32
30
|
def get_by_id(idstr, http_proxy = nil)
|
33
31
|
addr = 'fantom.gsc.riken.go.jp'
|
34
32
|
port = 80
|
35
|
-
path = "/db/maxml/maxmlseq.cgi?masterid=#{
|
33
|
+
path = "/db/maxml/maxmlseq.cgi?masterid=#{CGI.escape(idstr.to_s)}&style=xml"
|
36
34
|
xml = ''
|
37
35
|
if http_proxy then
|
38
36
|
proxy = URI.parse(http_proxy.to_s)
|
39
37
|
Net::HTTP.start(addr, port, proxy.host, proxy.port) do |http|
|
40
|
-
response
|
38
|
+
response = http.get(path)
|
41
39
|
xml = response.body
|
42
40
|
end
|
43
41
|
else
|
44
42
|
Bio::Command.start_http(addr, port) do |http|
|
45
|
-
response
|
43
|
+
response = http.get(path)
|
46
44
|
xml = response.body
|
47
45
|
end
|
48
46
|
end
|
data/lib/bio/db/fasta.rb
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# = bio/db/fasta.rb - FASTA format class
|
3
3
|
#
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002
|
5
|
-
#
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id: fasta.rb,v 1.28
|
9
|
+
# $Id: fasta.rb,v 1.28.2.3 2008/06/20 13:43:36 ngoto Exp $
|
10
10
|
#
|
11
11
|
# == Description
|
12
12
|
#
|
@@ -14,45 +14,7 @@
|
|
14
14
|
#
|
15
15
|
# == Examples
|
16
16
|
#
|
17
|
-
#
|
18
|
-
# rub.entry_id ==> 'gi|671595'
|
19
|
-
# rub.get('emb') ==> 'CAA85678.1'
|
20
|
-
# rub.emb ==> 'CAA85678.1'
|
21
|
-
# rub.gi ==> '671595'
|
22
|
-
# rub.accession ==> 'CAA85678'
|
23
|
-
# rub.accessions ==> [ 'CAA85678' ]
|
24
|
-
# rub.acc_version ==> 'CAA85678.1'
|
25
|
-
# rub.locus ==> nil
|
26
|
-
# rub.list_ids ==> [["gi", "671595"],
|
27
|
-
# ["emb", "CAA85678.1", nil],
|
28
|
-
# ["Perovskia abrotanoides"]]
|
29
|
-
#
|
30
|
-
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
31
|
-
# ckr.entry_id ==> "gi|2495000"
|
32
|
-
# ckr.sp ==> "CCKR_CAVPO"
|
33
|
-
# ckr.pir ==> "I51898"
|
34
|
-
# ckr.gb ==> "AAB29504.1"
|
35
|
-
# ckr.gi ==> "2495000"
|
36
|
-
# ckr.accession ==> "AAB29504"
|
37
|
-
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
38
|
-
# ckr.acc_version ==> "AAB29504.1"
|
39
|
-
# ckr.locus ==> nil
|
40
|
-
# ckr.description ==>
|
41
|
-
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
42
|
-
# ckr.descriptions ==>
|
43
|
-
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
44
|
-
# "cholecystokinin A receptor - guinea pig",
|
45
|
-
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
46
|
-
# ckr.words ==>
|
47
|
-
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
48
|
-
# "receptor", "type"]
|
49
|
-
# ckr.id_strings ==>
|
50
|
-
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
51
|
-
# "544724", "AAB29504.1", "Cavia"]
|
52
|
-
# ckr.list_ids ==>
|
53
|
-
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
54
|
-
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
55
|
-
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
17
|
+
# See documents of Bio::FastaFormat class.
|
56
18
|
#
|
57
19
|
# == References
|
58
20
|
#
|
@@ -65,6 +27,8 @@
|
|
65
27
|
|
66
28
|
require 'bio/db'
|
67
29
|
require 'bio/sequence'
|
30
|
+
require 'bio/sequence/dblink'
|
31
|
+
require 'bio/db/fasta/defline'
|
68
32
|
|
69
33
|
module Bio
|
70
34
|
|
@@ -81,7 +45,7 @@ module Bio
|
|
81
45
|
#
|
82
46
|
# === Examples
|
83
47
|
#
|
84
|
-
# f_str = <<
|
48
|
+
# f_str = <<END_OF_STRING
|
85
49
|
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
|
86
50
|
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
|
87
51
|
# VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
|
@@ -101,7 +65,7 @@ module Bio
|
|
101
65
|
# CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
|
102
66
|
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
|
103
67
|
# KTGDPLEWRRLFKKISTICRDIILIPN
|
104
|
-
#
|
68
|
+
# END_OF_STRING
|
105
69
|
#
|
106
70
|
# f = Bio::FastaFormat.new(f_str)
|
107
71
|
# puts "### FastaFormat"
|
@@ -253,12 +217,10 @@ module Bio
|
|
253
217
|
# might also be changed (but not always be changed)
|
254
218
|
# because of efficiency.
|
255
219
|
#
|
256
|
-
def
|
257
|
-
|
258
|
-
obj = Bio::Sequence.new(@seq)
|
259
|
-
obj.definition = self.definition
|
260
|
-
obj
|
220
|
+
def to_biosequence
|
221
|
+
Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat)
|
261
222
|
end
|
223
|
+
alias to_seq to_biosequence
|
262
224
|
|
263
225
|
# Parsing FASTA Defline, and extract IDs.
|
264
226
|
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
@@ -362,465 +324,6 @@ module Bio
|
|
362
324
|
|
363
325
|
end #class FastaNumericFormat
|
364
326
|
|
365
|
-
|
366
|
-
# Parsing FASTA Defline, and extract IDs and other informations.
|
367
|
-
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
368
|
-
# or ":"-separated IDs.
|
369
|
-
#
|
370
|
-
# specs are described in:
|
371
|
-
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
372
|
-
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
373
|
-
#
|
374
|
-
# === Examples
|
375
|
-
#
|
376
|
-
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
|
377
|
-
# rub.entry_id ==> 'gi|671595'
|
378
|
-
# rub.get('emb') ==> 'CAA85678.1'
|
379
|
-
# rub.emb ==> 'CAA85678.1'
|
380
|
-
# rub.gi ==> '671595'
|
381
|
-
# rub.accession ==> 'CAA85678'
|
382
|
-
# rub.accessions ==> [ 'CAA85678' ]
|
383
|
-
# rub.acc_version ==> 'CAA85678.1'
|
384
|
-
# rub.locus ==> nil
|
385
|
-
# rub.list_ids ==> [["gi", "671595"],
|
386
|
-
# ["emb", "CAA85678.1", nil],
|
387
|
-
# ["Perovskia abrotanoides"]]
|
388
|
-
#
|
389
|
-
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
390
|
-
# ckr.entry_id ==> "gi|2495000"
|
391
|
-
# ckr.sp ==> "CCKR_CAVPO"
|
392
|
-
# ckr.pir ==> "I51898"
|
393
|
-
# ckr.gb ==> "AAB29504.1"
|
394
|
-
# ckr.gi ==> "2495000"
|
395
|
-
# ckr.accession ==> "AAB29504"
|
396
|
-
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
397
|
-
# ckr.acc_version ==> "AAB29504.1"
|
398
|
-
# ckr.locus ==> nil
|
399
|
-
# ckr.description ==>
|
400
|
-
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
401
|
-
# ckr.descriptions ==>
|
402
|
-
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
403
|
-
# "cholecystokinin A receptor - guinea pig",
|
404
|
-
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
405
|
-
# ckr.words ==>
|
406
|
-
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
407
|
-
# "receptor", "type"]
|
408
|
-
# ckr.id_strings ==>
|
409
|
-
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
410
|
-
# "544724", "AAB29504.1", "Cavia"]
|
411
|
-
# ckr.list_ids ==>
|
412
|
-
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
413
|
-
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
414
|
-
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
415
|
-
#
|
416
|
-
# === Refereneces
|
417
|
-
#
|
418
|
-
# * Fasta format description (NCBI)
|
419
|
-
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
420
|
-
#
|
421
|
-
# * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
|
422
|
-
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
423
|
-
#
|
424
|
-
# * README.formatdb
|
425
|
-
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
426
|
-
#
|
427
|
-
class FastaDefline
|
428
|
-
|
429
|
-
NSIDs = {
|
430
|
-
# NCBI and WU-BLAST
|
431
|
-
'gi' => [ 'gi' ], # NCBI GI
|
432
|
-
'gb' => [ 'acc_version', 'locus' ], # GenBank
|
433
|
-
'emb' => [ 'acc_version', 'locus' ], # EMBL
|
434
|
-
'dbj' => [ 'acc_version', 'locus' ], # DDBJ
|
435
|
-
'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
|
436
|
-
'pdb' => [ 'entry_id', 'chain' ], # PDB
|
437
|
-
'bbs' => [ 'number' ], # GenInfo Backbone Id
|
438
|
-
'gnl' => [ 'database' , 'entry_id' ], # General database identifier
|
439
|
-
'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
|
440
|
-
'lcl' => [ 'entry_id' ], # Local Sequence identifier
|
441
|
-
|
442
|
-
# WU-BLAST and NCBI
|
443
|
-
'pir' => [ 'accession', 'entry_id' ], # PIR
|
444
|
-
'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
|
445
|
-
'pat' => [ 'country', 'number', 'serial' ], # Patents
|
446
|
-
|
447
|
-
# WU-BLAST only
|
448
|
-
'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
|
449
|
-
'gim' => [ 'number' ], # NCBI GenInfo Import identifier
|
450
|
-
'gp' => [ 'acc_version', 'locus' ], # GenPept
|
451
|
-
'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
|
452
|
-
'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
|
453
|
-
'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
|
454
|
-
'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
|
455
|
-
|
456
|
-
# Original
|
457
|
-
'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
|
458
|
-
}
|
459
|
-
|
460
|
-
# Shows array that contains IDs (or ID-like strings).
|
461
|
-
# Returns an array of arrays of strings.
|
462
|
-
attr_reader :list_ids
|
463
|
-
|
464
|
-
# Shows a possibly unique identifier.
|
465
|
-
# Returns a string.
|
466
|
-
attr_reader :entry_id
|
467
|
-
|
468
|
-
# Parses given string.
|
469
|
-
def initialize(str)
|
470
|
-
@deflines = []
|
471
|
-
@info = {}
|
472
|
-
@list_ids = []
|
473
|
-
|
474
|
-
@entry_id = nil
|
475
|
-
|
476
|
-
lines = str.split("\x01")
|
477
|
-
lines.each do |line|
|
478
|
-
add_defline(line)
|
479
|
-
end
|
480
|
-
end #def initialize
|
481
|
-
|
482
|
-
# Parses given string and adds parsed data.
|
483
|
-
def add_defline(str)
|
484
|
-
case str
|
485
|
-
when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
|
486
|
-
# NSIDs
|
487
|
-
# examples:
|
488
|
-
# >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
|
489
|
-
#
|
490
|
-
# note: regexp (:?) means grouping without backreferences
|
491
|
-
i = $1
|
492
|
-
d = $2
|
493
|
-
tks = i.split('|')
|
494
|
-
tks << '' if i[-1,1] == '|'
|
495
|
-
a = parse_NSIDs(tks)
|
496
|
-
i = a[0].join('|')
|
497
|
-
a.unshift('|')
|
498
|
-
d = tks.join('|') + ' ' + d unless tks.empty?
|
499
|
-
a << d
|
500
|
-
this_line = a
|
501
|
-
match_EC(d)
|
502
|
-
parse_square_brackets(d).each do |x|
|
503
|
-
if !match_EC(x, false) and x =~ /\A[A-Z]/ then
|
504
|
-
di = [ x ]
|
505
|
-
@list_ids << di
|
506
|
-
@info['organism'] = x unless @info['organism']
|
507
|
-
end
|
508
|
-
end
|
509
|
-
|
510
|
-
when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
|
511
|
-
# examples:
|
512
|
-
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
|
513
|
-
# >emb:CACDC28 [X80034] C.albicans CDC28 gene
|
514
|
-
i = $1
|
515
|
-
d = $2
|
516
|
-
a = parse_ColonSepID(i)
|
517
|
-
i = a.join(':')
|
518
|
-
this_line = [ ':', a , d ]
|
519
|
-
match_EC(d)
|
520
|
-
parse_square_brackets(d).each do |x|
|
521
|
-
if !match_EC(x, false) and x =~ /:/ then
|
522
|
-
parse_ColonSepID(x)
|
523
|
-
elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
|
524
|
-
@list_ids << [ $1 ]
|
525
|
-
end
|
526
|
-
end
|
527
|
-
|
528
|
-
when /^\>?\s*(\S+)(?:\s+(.+))?$/
|
529
|
-
# examples:
|
530
|
-
# >ABC12345 this is test
|
531
|
-
i = $1
|
532
|
-
d = $2.to_s
|
533
|
-
@list_ids << [ i.chomp('.') ]
|
534
|
-
this_line = [ '', [ i ], d ]
|
535
|
-
match_EC(d)
|
536
|
-
else
|
537
|
-
i = str
|
538
|
-
d = ''
|
539
|
-
match_EC(i)
|
540
|
-
this_line = [ '', [ i ], d ]
|
541
|
-
end
|
542
|
-
|
543
|
-
@deflines << this_line
|
544
|
-
@entry_id = i unless @entry_id
|
545
|
-
end
|
546
|
-
|
547
|
-
def match_EC(str, write_flag = true)
|
548
|
-
di = nil
|
549
|
-
str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
|
550
|
-
di = [ 'EC', $1 ]
|
551
|
-
if write_flag then
|
552
|
-
@info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
|
553
|
-
@list_ids << di
|
554
|
-
end
|
555
|
-
end
|
556
|
-
di
|
557
|
-
end
|
558
|
-
private :match_EC
|
559
|
-
|
560
|
-
def parse_square_brackets(str)
|
561
|
-
r = []
|
562
|
-
str.scan(/\[([^\]]*)\]/) do |x|
|
563
|
-
r << x[0]
|
564
|
-
end
|
565
|
-
r
|
566
|
-
end
|
567
|
-
private :parse_square_brackets
|
568
|
-
|
569
|
-
def parse_ColonSepID(str)
|
570
|
-
di = str.split(':', 2)
|
571
|
-
di << nil if di.size <= 1
|
572
|
-
@list_ids << di
|
573
|
-
di
|
574
|
-
end
|
575
|
-
private :parse_ColonSepID
|
576
|
-
|
577
|
-
def parse_NSIDs(ary)
|
578
|
-
# this method destroys ary
|
579
|
-
data = []
|
580
|
-
while token = ary.shift
|
581
|
-
if labels = self.class::NSIDs[token] then
|
582
|
-
di = [ token ]
|
583
|
-
idtype = token
|
584
|
-
labels.each do |x|
|
585
|
-
token = ary.shift
|
586
|
-
break unless token
|
587
|
-
if self.class::NSIDs[token] then
|
588
|
-
ary.unshift(token)
|
589
|
-
break #each
|
590
|
-
end
|
591
|
-
if token.length > 0 then
|
592
|
-
di << token
|
593
|
-
else
|
594
|
-
di << nil
|
595
|
-
end
|
596
|
-
end
|
597
|
-
data << di
|
598
|
-
else
|
599
|
-
if token.length > 0 then
|
600
|
-
# UCID (uncontrolled identifiers)
|
601
|
-
di = [ token ]
|
602
|
-
data << di
|
603
|
-
@info['ucid'] = token unless @info['ucid']
|
604
|
-
end
|
605
|
-
break #while
|
606
|
-
end
|
607
|
-
end #while
|
608
|
-
@list_ids.concat data
|
609
|
-
data
|
610
|
-
end #def parse_NSIDs
|
611
|
-
private :parse_NSIDs
|
612
|
-
|
613
|
-
|
614
|
-
# Shows original string.
|
615
|
-
# Note that the result of this method may be different from
|
616
|
-
# original string which is given in FastaDefline.new method.
|
617
|
-
def to_s
|
618
|
-
@deflines.collect { |a|
|
619
|
-
s = a[0]
|
620
|
-
(a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
|
621
|
-
}.join("\x01")
|
622
|
-
end
|
623
|
-
|
624
|
-
# Shows description.
|
625
|
-
def description
|
626
|
-
@deflines[0].to_a[-1]
|
627
|
-
end
|
628
|
-
|
629
|
-
# Returns descriptions.
|
630
|
-
def descriptions
|
631
|
-
@deflines.collect do |a|
|
632
|
-
a[-1]
|
633
|
-
end
|
634
|
-
end
|
635
|
-
|
636
|
-
# Shows ID-like strings.
|
637
|
-
# Returns an array of strings.
|
638
|
-
def id_strings
|
639
|
-
r = []
|
640
|
-
@list_ids.each do |a|
|
641
|
-
if a.size >= 2 then
|
642
|
-
r.concat a[1..-1].find_all { |x| x }
|
643
|
-
else
|
644
|
-
if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
|
645
|
-
r << a[0]
|
646
|
-
end
|
647
|
-
end
|
648
|
-
end
|
649
|
-
r.concat( words(true, []).find_all do |x|
|
650
|
-
x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
|
651
|
-
x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
|
652
|
-
end)
|
653
|
-
r
|
654
|
-
end
|
655
|
-
|
656
|
-
KillWords = [
|
657
|
-
'an', 'the', 'this', 'that',
|
658
|
-
'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
|
659
|
-
'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
|
660
|
-
'from', 'and', 'or', 'not',
|
661
|
-
'dna', 'rna', 'mrna', 'cdna', 'orf',
|
662
|
-
'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
|
663
|
-
'similar', 'involved', 'identical', 'identity',
|
664
|
-
'cds', 'clone', 'library', 'contig', 'contigs',
|
665
|
-
'homolog', 'homologue', 'homologs', 'homologous',
|
666
|
-
'protein', 'proteins', 'gene', 'genes',
|
667
|
-
'product', 'products', 'sequence', 'sequences',
|
668
|
-
'strain', 'strains', 'region', 'regions',
|
669
|
-
]
|
670
|
-
KillWordsHash = {}
|
671
|
-
KillWords.each { |x| KillWordsHash[x] = true }
|
672
|
-
|
673
|
-
KillRegexpArray = [
|
674
|
-
/\A\d{1,3}\%?\z/,
|
675
|
-
/\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
|
676
|
-
/\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
|
677
|
-
]
|
678
|
-
|
679
|
-
# Shows words used in the defline. Returns a sorted Array without
# duplicates.
#
# case_sensitive:: when false or nil, the resulting words are downcased.
# kill_regexp::    patterns; a word matching any of them is dropped.
# kwhash::         Hash keyed by lower-case words; a word whose
#                  downcased form has a true value is dropped.
#
# The descriptions are joined with a space and split on punctuation,
# whitespace and control characters. Each word then loses leading
# '$', '*', '-', '+' and trailing '$', '*', '-', '=' characters, and
# words of one character or less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  separators = /[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/
  kept = []
  descriptions.join(' ').split(separators).each do |token|
    word = token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
    next if word.size <= 1
    next if kwhash[word.downcase]
    next if kill_regexp.find { |expr| expr =~ word }
    kept << word
  end
  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end
|
704
|
-
|
705
|
-
# Returns an identifier by a database name.
#
# dbname:: database name; converted with to_s before the lookup.
#
# A value already stored in @info for the name is returned as is.
# Otherwise the first entry of @list_ids whose first field equals the
# name is used:
# * with two fields or fewer, its last field is the result;
# * with more fields, the field labelled (in NSIDs for this database)
#   'acc_version', 'entry_id', 'locus', 'accession' or 'number' is
#   tried in that order, and the first non-nil field after the
#   database name is the fallback.
# A found value is stored in @info. Returns nil when nothing is found.
def get(dbname)
  db = dbname.to_s
  r = @info[db]
  return r if r

  di = @list_ids.find { |x| x[0] == db }
  if di
    if di.size <= 2
      r = di[-1]
    else
      # The database may have no NSIDs entry; in that case skip the
      # label lookup instead of calling index on nil.
      labels = self.class::NSIDs[db]
      if labels
        ['acc_version', 'entry_id',
         'locus', 'accession', 'number'].each do |label|
          i = labels.index(label)
          next unless i
          r = di[i + 1]
          break if r
        end
      end
      r ||= di[1..-1].find { |x| x }
    end
  end
  @info[db] = r if r
  r
end
|
728
|
-
|
729
|
-
# Returns an identifier by given type.
#
# Walks @list_ids in order. For the first entry whose database name
# (first field) has labels in NSIDs that include type_str, returns the
# field at the matching position (which may be nil). Returns nil when
# no entry has such a label.
def get_by_type(type_str)
  @list_ids.each do |fields|
    labels = self.class::NSIDs[fields[0]]
    next unless labels
    position = labels.index(type_str)
    return fields[position + 1] if position
  end
  nil
end
|
740
|
-
|
741
|
-
# Returns identifiers by given type.
#
# Accepts any number of type names. For every entry of @list_ids whose
# database name (first field) has labels in NSIDs, each given type is
# looked up in turn and the matching non-nil field is collected.
# Returns an Array (empty when nothing matches).
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |fields|
    labels = self.class::NSIDs[fields[0]]
    next unless labels
    type_strarg.each do |type_str|
      position = labels.index(type_str)
      next unless position
      value = fields[position + 1]
      found << value if value
    end
  end
  found
end
|
755
|
-
|
756
|
-
# Shows locus.
# If the entry has more than one such ID,
# only the first ID is shown.
# Returns a string or nil.
# The result (even nil) is kept in @locus, so the lookup runs once.
def locus
  return @locus if defined?(@locus)
  @locus = get_by_type('locus')
end
|
766
|
-
|
767
|
-
# Shows GI.
# If the entry has more than one such ID,
# only the first ID is shown.
# Returns a string or nil.
# The result (even nil) is kept in @gi, so the lookup runs once.
def gi
  return @gi if defined?(@gi)
  @gi = get_by_type('gi')
end
|
777
|
-
|
778
|
-
# Shows accession with version number.
# If the entry has more than one such ID,
# only the first ID is shown.
# Returns a string or nil.
# The result (even nil) is kept in @acc_version, so the lookup runs once.
def acc_version
  return @acc_version if defined?(@acc_version)
  @acc_version = get_by_type('acc_version')
end
|
788
|
-
|
789
|
-
# Shows accession numbers.
# Returns an array of strings.
# Collects every 'accession' and 'acc_version' identifier and cuts each
# one at its first '.', dropping the version part. The array is kept in
# @accessions and reused on later calls.
def accessions
  return @accessions if defined?(@accessions)
  versioned = get_all_by_type('accession', 'acc_version')
  @accessions = versioned.map { |id| id.sub(/\..*\z/, '') }
end
|
798
|
-
|
799
|
-
# Shows an accession number.
# When acc_version is available, returns the part of it before the
# first '.'; otherwise returns the first element of accessions.
# The result (even nil) is kept in @accession and reused on later calls.
def accession
  return @accession if defined?(@accession)
  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end
|
810
|
-
|
811
|
-
# Treats an unknown method name as a database name and looks it up
# with #get, so that e.g. a call named after a database returns its
# identifier.
#
# The arguments are passed through to #get unchanged; no argument-count
# check is done here.
#
# Returns the value found by #get. When nothing is found, the (false or
# nil) result is still returned if the name is a key of NSIDs;
# otherwise a RuntimeError with a "NameError: undefined method" message
# is raised.
def method_missing(name, *args)
  value = get(name, *args)
  return value if value || self.class::NSIDs[name.to_s]
  raise "NameError: undefined method `#{name.inspect}'"
end
|
820
|
-
|
821
|
-
|
822
|
-
end #class FastaDefline
|
823
|
-
|
824
327
|
end #module Bio
|
825
328
|
|
826
329
|
if __FILE__ == $0
|