bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
data/lib/bio/db/embl/sptr.rb
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
|
|
5
5
|
# License:: The Ruby License
|
|
6
6
|
#
|
|
7
|
-
# $Id
|
|
7
|
+
# $Id:$
|
|
8
8
|
#
|
|
9
9
|
# == Description
|
|
10
10
|
#
|
|
@@ -241,7 +241,7 @@ class SPTR < EMBLDB
|
|
|
241
241
|
records = gn_line.split(/\s*and\s*/)
|
|
242
242
|
records.each do |record|
|
|
243
243
|
gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
|
|
244
|
-
record.
|
|
244
|
+
record.each_line(';') do |element|
|
|
245
245
|
case element
|
|
246
246
|
when /Name=/ then
|
|
247
247
|
gene_hash[:name] = $'[0..-2]
|
|
@@ -505,11 +505,10 @@ class SPTR < EMBLDB
|
|
|
505
505
|
else
|
|
506
506
|
hash['journal'] = value
|
|
507
507
|
end
|
|
508
|
-
when 'RX' # PUBMED, MEDLINE
|
|
509
|
-
value.
|
|
510
|
-
tag, xref = item.split(/; /).map {|i| i.strip }
|
|
508
|
+
when 'RX' # PUBMED, MEDLINE, DOI
|
|
509
|
+
value.each do |tag, xref|
|
|
511
510
|
hash[ tag.downcase ] = xref
|
|
512
|
-
|
|
511
|
+
end
|
|
513
512
|
end
|
|
514
513
|
}
|
|
515
514
|
Reference.new(hash)
|
|
@@ -678,17 +677,17 @@ class SPTR < EMBLDB
|
|
|
678
677
|
when 'COFACTOR'
|
|
679
678
|
return @data['CC'][topic]
|
|
680
679
|
when 'DEVELOPMENTAL STAGE'
|
|
681
|
-
return @data['CC'][topic].
|
|
680
|
+
return @data['CC'][topic].join('')
|
|
682
681
|
when 'DISEASE'
|
|
683
|
-
return @data['CC'][topic].
|
|
682
|
+
return @data['CC'][topic].join('')
|
|
684
683
|
when 'DOMAIN'
|
|
685
684
|
return @data['CC'][topic]
|
|
686
685
|
when 'ENZYME REGULATION'
|
|
687
|
-
return @data['CC'][topic].
|
|
686
|
+
return @data['CC'][topic].join('')
|
|
688
687
|
when 'FUNCTION'
|
|
689
|
-
return @data['CC'][topic].
|
|
688
|
+
return @data['CC'][topic].join('')
|
|
690
689
|
when 'INDUCTION'
|
|
691
|
-
return @data['CC'][topic].
|
|
690
|
+
return @data['CC'][topic].join('')
|
|
692
691
|
when 'INTERACTION'
|
|
693
692
|
return cc_interaction(@data['CC'][topic])
|
|
694
693
|
when 'MASS SPECTROMETRY'
|
|
@@ -749,7 +748,7 @@ class SPTR < EMBLDB
|
|
|
749
748
|
|
|
750
749
|
|
|
751
750
|
def cc_alternative_products(data)
|
|
752
|
-
ap = data.
|
|
751
|
+
ap = data.join('')
|
|
753
752
|
return ap unless ap
|
|
754
753
|
|
|
755
754
|
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
|
|
@@ -822,7 +821,7 @@ class SPTR < EMBLDB
|
|
|
822
821
|
|
|
823
822
|
|
|
824
823
|
def cc_caution(data)
|
|
825
|
-
data.
|
|
824
|
+
data.join('')
|
|
826
825
|
end
|
|
827
826
|
private :cc_caution
|
|
828
827
|
|
|
@@ -831,7 +830,7 @@ class SPTR < EMBLDB
|
|
|
831
830
|
#
|
|
832
831
|
# CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
|
|
833
832
|
def cc_interaction(data)
|
|
834
|
-
str = data.
|
|
833
|
+
str = data.join('')
|
|
835
834
|
it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
|
|
836
835
|
it.map {|ent|
|
|
837
836
|
ent.map! {|x| x.strip }
|
|
@@ -894,7 +893,7 @@ class SPTR < EMBLDB
|
|
|
894
893
|
|
|
895
894
|
|
|
896
895
|
def cc_rna_editing(data)
|
|
897
|
-
|
|
896
|
+
data = data.join('')
|
|
898
897
|
entry = {'Modified_positions' => [], 'Note' => ""}
|
|
899
898
|
if data =~ /Modified_positions=(.+?)(\.|;)/
|
|
900
899
|
entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
|
|
@@ -961,7 +960,7 @@ class SPTR < EMBLDB
|
|
|
961
960
|
unless key
|
|
962
961
|
embl_dr
|
|
963
962
|
else
|
|
964
|
-
embl_dr[key].map {|x|
|
|
963
|
+
(embl_dr[key] or []).map {|x|
|
|
965
964
|
{'Accession' => x[0],
|
|
966
965
|
'Version' => x[1],
|
|
967
966
|
' ' => x[2],
|
data/lib/bio/db/fantom.rb
CHANGED
|
@@ -4,13 +4,11 @@
|
|
|
4
4
|
# Copyright:: Copyright (C) 2003 GOTO Naohisa <ng@bioruby.org>
|
|
5
5
|
# License:: The Ruby License
|
|
6
6
|
#
|
|
7
|
-
# $Id
|
|
7
|
+
# $Id:$
|
|
8
8
|
#
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
rescue LoadError
|
|
13
|
-
end
|
|
10
|
+
require 'rexml/document'
|
|
11
|
+
require 'cgi'
|
|
14
12
|
require 'uri'
|
|
15
13
|
require 'net/http'
|
|
16
14
|
|
|
@@ -32,17 +30,17 @@ module Bio
|
|
|
32
30
|
def get_by_id(idstr, http_proxy = nil)
|
|
33
31
|
addr = 'fantom.gsc.riken.go.jp'
|
|
34
32
|
port = 80
|
|
35
|
-
path = "/db/maxml/maxmlseq.cgi?masterid=#{
|
|
33
|
+
path = "/db/maxml/maxmlseq.cgi?masterid=#{CGI.escape(idstr.to_s)}&style=xml"
|
|
36
34
|
xml = ''
|
|
37
35
|
if http_proxy then
|
|
38
36
|
proxy = URI.parse(http_proxy.to_s)
|
|
39
37
|
Net::HTTP.start(addr, port, proxy.host, proxy.port) do |http|
|
|
40
|
-
response
|
|
38
|
+
response = http.get(path)
|
|
41
39
|
xml = response.body
|
|
42
40
|
end
|
|
43
41
|
else
|
|
44
42
|
Bio::Command.start_http(addr, port) do |http|
|
|
45
|
-
response
|
|
43
|
+
response = http.get(path)
|
|
46
44
|
xml = response.body
|
|
47
45
|
end
|
|
48
46
|
end
|
data/lib/bio/db/fasta.rb
CHANGED
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
# = bio/db/fasta.rb - FASTA format class
|
|
3
3
|
#
|
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002
|
|
5
|
-
#
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
|
7
7
|
# License:: The Ruby License
|
|
8
8
|
#
|
|
9
|
-
# $Id: fasta.rb,v 1.28
|
|
9
|
+
# $Id: fasta.rb,v 1.28.2.3 2008/06/20 13:43:36 ngoto Exp $
|
|
10
10
|
#
|
|
11
11
|
# == Description
|
|
12
12
|
#
|
|
@@ -14,45 +14,7 @@
|
|
|
14
14
|
#
|
|
15
15
|
# == Examples
|
|
16
16
|
#
|
|
17
|
-
#
|
|
18
|
-
# rub.entry_id ==> 'gi|671595'
|
|
19
|
-
# rub.get('emb') ==> 'CAA85678.1'
|
|
20
|
-
# rub.emb ==> 'CAA85678.1'
|
|
21
|
-
# rub.gi ==> '671595'
|
|
22
|
-
# rub.accession ==> 'CAA85678'
|
|
23
|
-
# rub.accessions ==> [ 'CAA85678' ]
|
|
24
|
-
# rub.acc_version ==> 'CAA85678.1'
|
|
25
|
-
# rub.locus ==> nil
|
|
26
|
-
# rub.list_ids ==> [["gi", "671595"],
|
|
27
|
-
# ["emb", "CAA85678.1", nil],
|
|
28
|
-
# ["Perovskia abrotanoides"]]
|
|
29
|
-
#
|
|
30
|
-
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
|
31
|
-
# ckr.entry_id ==> "gi|2495000"
|
|
32
|
-
# ckr.sp ==> "CCKR_CAVPO"
|
|
33
|
-
# ckr.pir ==> "I51898"
|
|
34
|
-
# ckr.gb ==> "AAB29504.1"
|
|
35
|
-
# ckr.gi ==> "2495000"
|
|
36
|
-
# ckr.accession ==> "AAB29504"
|
|
37
|
-
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
|
38
|
-
# ckr.acc_version ==> "AAB29504.1"
|
|
39
|
-
# ckr.locus ==> nil
|
|
40
|
-
# ckr.description ==>
|
|
41
|
-
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
|
42
|
-
# ckr.descriptions ==>
|
|
43
|
-
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
|
44
|
-
# "cholecystokinin A receptor - guinea pig",
|
|
45
|
-
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
|
46
|
-
# ckr.words ==>
|
|
47
|
-
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
|
48
|
-
# "receptor", "type"]
|
|
49
|
-
# ckr.id_strings ==>
|
|
50
|
-
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
|
51
|
-
# "544724", "AAB29504.1", "Cavia"]
|
|
52
|
-
# ckr.list_ids ==>
|
|
53
|
-
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
|
54
|
-
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
|
55
|
-
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
|
17
|
+
# See documents of Bio::FastaFormat class.
|
|
56
18
|
#
|
|
57
19
|
# == References
|
|
58
20
|
#
|
|
@@ -65,6 +27,8 @@
|
|
|
65
27
|
|
|
66
28
|
require 'bio/db'
|
|
67
29
|
require 'bio/sequence'
|
|
30
|
+
require 'bio/sequence/dblink'
|
|
31
|
+
require 'bio/db/fasta/defline'
|
|
68
32
|
|
|
69
33
|
module Bio
|
|
70
34
|
|
|
@@ -81,7 +45,7 @@ module Bio
|
|
|
81
45
|
#
|
|
82
46
|
# === Examples
|
|
83
47
|
#
|
|
84
|
-
# f_str = <<
|
|
48
|
+
# f_str = <<END_OF_STRING
|
|
85
49
|
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
|
|
86
50
|
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
|
|
87
51
|
# VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
|
|
@@ -101,7 +65,7 @@ module Bio
|
|
|
101
65
|
# CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
|
|
102
66
|
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
|
|
103
67
|
# KTGDPLEWRRLFKKISTICRDIILIPN
|
|
104
|
-
#
|
|
68
|
+
# END_OF_STRING
|
|
105
69
|
#
|
|
106
70
|
# f = Bio::FastaFormat.new(f_str)
|
|
107
71
|
# puts "### FastaFormat"
|
|
@@ -253,12 +217,10 @@ module Bio
|
|
|
253
217
|
# might also be changed (but not always be changed)
|
|
254
218
|
# because of efficiency.
|
|
255
219
|
#
|
|
256
|
-
def
|
|
257
|
-
|
|
258
|
-
obj = Bio::Sequence.new(@seq)
|
|
259
|
-
obj.definition = self.definition
|
|
260
|
-
obj
|
|
220
|
+
def to_biosequence
|
|
221
|
+
Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat)
|
|
261
222
|
end
|
|
223
|
+
alias to_seq to_biosequence
|
|
262
224
|
|
|
263
225
|
# Parsing FASTA Defline, and extract IDs.
|
|
264
226
|
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
|
@@ -362,465 +324,6 @@ module Bio
|
|
|
362
324
|
|
|
363
325
|
end #class FastaNumericFormat
|
|
364
326
|
|
|
365
|
-
|
|
366
|
-
# Parsing FASTA Defline, and extract IDs and other informations.
|
|
367
|
-
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
|
368
|
-
# or ":"-separated IDs.
|
|
369
|
-
#
|
|
370
|
-
# specs are described in:
|
|
371
|
-
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
|
372
|
-
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
|
373
|
-
#
|
|
374
|
-
# === Examples
|
|
375
|
-
#
|
|
376
|
-
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
|
|
377
|
-
# rub.entry_id ==> 'gi|671595'
|
|
378
|
-
# rub.get('emb') ==> 'CAA85678.1'
|
|
379
|
-
# rub.emb ==> 'CAA85678.1'
|
|
380
|
-
# rub.gi ==> '671595'
|
|
381
|
-
# rub.accession ==> 'CAA85678'
|
|
382
|
-
# rub.accessions ==> [ 'CAA85678' ]
|
|
383
|
-
# rub.acc_version ==> 'CAA85678.1'
|
|
384
|
-
# rub.locus ==> nil
|
|
385
|
-
# rub.list_ids ==> [["gi", "671595"],
|
|
386
|
-
# ["emb", "CAA85678.1", nil],
|
|
387
|
-
# ["Perovskia abrotanoides"]]
|
|
388
|
-
#
|
|
389
|
-
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
|
390
|
-
# ckr.entry_id ==> "gi|2495000"
|
|
391
|
-
# ckr.sp ==> "CCKR_CAVPO"
|
|
392
|
-
# ckr.pir ==> "I51898"
|
|
393
|
-
# ckr.gb ==> "AAB29504.1"
|
|
394
|
-
# ckr.gi ==> "2495000"
|
|
395
|
-
# ckr.accession ==> "AAB29504"
|
|
396
|
-
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
|
397
|
-
# ckr.acc_version ==> "AAB29504.1"
|
|
398
|
-
# ckr.locus ==> nil
|
|
399
|
-
# ckr.description ==>
|
|
400
|
-
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
|
401
|
-
# ckr.descriptions ==>
|
|
402
|
-
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
|
403
|
-
# "cholecystokinin A receptor - guinea pig",
|
|
404
|
-
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
|
405
|
-
# ckr.words ==>
|
|
406
|
-
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
|
407
|
-
# "receptor", "type"]
|
|
408
|
-
# ckr.id_strings ==>
|
|
409
|
-
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
|
410
|
-
# "544724", "AAB29504.1", "Cavia"]
|
|
411
|
-
# ckr.list_ids ==>
|
|
412
|
-
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
|
413
|
-
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
|
414
|
-
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
|
415
|
-
#
|
|
416
|
-
# === Refereneces
|
|
417
|
-
#
|
|
418
|
-
# * Fasta format description (NCBI)
|
|
419
|
-
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
|
420
|
-
#
|
|
421
|
-
# * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
|
|
422
|
-
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
|
423
|
-
#
|
|
424
|
-
# * README.formatdb
|
|
425
|
-
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
|
426
|
-
#
|
|
427
|
-
class FastaDefline
|
|
428
|
-
|
|
429
|
-
NSIDs = {
|
|
430
|
-
# NCBI and WU-BLAST
|
|
431
|
-
'gi' => [ 'gi' ], # NCBI GI
|
|
432
|
-
'gb' => [ 'acc_version', 'locus' ], # GenBank
|
|
433
|
-
'emb' => [ 'acc_version', 'locus' ], # EMBL
|
|
434
|
-
'dbj' => [ 'acc_version', 'locus' ], # DDBJ
|
|
435
|
-
'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
|
|
436
|
-
'pdb' => [ 'entry_id', 'chain' ], # PDB
|
|
437
|
-
'bbs' => [ 'number' ], # GenInfo Backbone Id
|
|
438
|
-
'gnl' => [ 'database' , 'entry_id' ], # General database identifier
|
|
439
|
-
'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
|
|
440
|
-
'lcl' => [ 'entry_id' ], # Local Sequence identifier
|
|
441
|
-
|
|
442
|
-
# WU-BLAST and NCBI
|
|
443
|
-
'pir' => [ 'accession', 'entry_id' ], # PIR
|
|
444
|
-
'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
|
|
445
|
-
'pat' => [ 'country', 'number', 'serial' ], # Patents
|
|
446
|
-
|
|
447
|
-
# WU-BLAST only
|
|
448
|
-
'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
|
|
449
|
-
'gim' => [ 'number' ], # NCBI GenInfo Import identifier
|
|
450
|
-
'gp' => [ 'acc_version', 'locus' ], # GenPept
|
|
451
|
-
'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
|
|
452
|
-
'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
|
|
453
|
-
'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
|
|
454
|
-
'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
|
|
455
|
-
|
|
456
|
-
# Original
|
|
457
|
-
'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
|
|
458
|
-
}
|
|
459
|
-
|
|
460
|
-
# Shows array that contains IDs (or ID-like strings).
|
|
461
|
-
# Returns an array of arrays of strings.
|
|
462
|
-
attr_reader :list_ids
|
|
463
|
-
|
|
464
|
-
# Shows a possibly unique identifier.
|
|
465
|
-
# Returns a string.
|
|
466
|
-
attr_reader :entry_id
|
|
467
|
-
|
|
468
|
-
# Parses given string.
|
|
469
|
-
def initialize(str)
|
|
470
|
-
@deflines = []
|
|
471
|
-
@info = {}
|
|
472
|
-
@list_ids = []
|
|
473
|
-
|
|
474
|
-
@entry_id = nil
|
|
475
|
-
|
|
476
|
-
lines = str.split("\x01")
|
|
477
|
-
lines.each do |line|
|
|
478
|
-
add_defline(line)
|
|
479
|
-
end
|
|
480
|
-
end #def initialize
|
|
481
|
-
|
|
482
|
-
# Parses given string and adds parsed data.
|
|
483
|
-
def add_defline(str)
|
|
484
|
-
case str
|
|
485
|
-
when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
|
|
486
|
-
# NSIDs
|
|
487
|
-
# examples:
|
|
488
|
-
# >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
|
|
489
|
-
#
|
|
490
|
-
# note: regexp (:?) means grouping without backreferences
|
|
491
|
-
i = $1
|
|
492
|
-
d = $2
|
|
493
|
-
tks = i.split('|')
|
|
494
|
-
tks << '' if i[-1,1] == '|'
|
|
495
|
-
a = parse_NSIDs(tks)
|
|
496
|
-
i = a[0].join('|')
|
|
497
|
-
a.unshift('|')
|
|
498
|
-
d = tks.join('|') + ' ' + d unless tks.empty?
|
|
499
|
-
a << d
|
|
500
|
-
this_line = a
|
|
501
|
-
match_EC(d)
|
|
502
|
-
parse_square_brackets(d).each do |x|
|
|
503
|
-
if !match_EC(x, false) and x =~ /\A[A-Z]/ then
|
|
504
|
-
di = [ x ]
|
|
505
|
-
@list_ids << di
|
|
506
|
-
@info['organism'] = x unless @info['organism']
|
|
507
|
-
end
|
|
508
|
-
end
|
|
509
|
-
|
|
510
|
-
when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
|
|
511
|
-
# examples:
|
|
512
|
-
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
|
|
513
|
-
# >emb:CACDC28 [X80034] C.albicans CDC28 gene
|
|
514
|
-
i = $1
|
|
515
|
-
d = $2
|
|
516
|
-
a = parse_ColonSepID(i)
|
|
517
|
-
i = a.join(':')
|
|
518
|
-
this_line = [ ':', a , d ]
|
|
519
|
-
match_EC(d)
|
|
520
|
-
parse_square_brackets(d).each do |x|
|
|
521
|
-
if !match_EC(x, false) and x =~ /:/ then
|
|
522
|
-
parse_ColonSepID(x)
|
|
523
|
-
elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
|
|
524
|
-
@list_ids << [ $1 ]
|
|
525
|
-
end
|
|
526
|
-
end
|
|
527
|
-
|
|
528
|
-
when /^\>?\s*(\S+)(?:\s+(.+))?$/
|
|
529
|
-
# examples:
|
|
530
|
-
# >ABC12345 this is test
|
|
531
|
-
i = $1
|
|
532
|
-
d = $2.to_s
|
|
533
|
-
@list_ids << [ i.chomp('.') ]
|
|
534
|
-
this_line = [ '', [ i ], d ]
|
|
535
|
-
match_EC(d)
|
|
536
|
-
else
|
|
537
|
-
i = str
|
|
538
|
-
d = ''
|
|
539
|
-
match_EC(i)
|
|
540
|
-
this_line = [ '', [ i ], d ]
|
|
541
|
-
end
|
|
542
|
-
|
|
543
|
-
@deflines << this_line
|
|
544
|
-
@entry_id = i unless @entry_id
|
|
545
|
-
end
|
|
546
|
-
|
|
547
|
-
def match_EC(str, write_flag = true)
|
|
548
|
-
di = nil
|
|
549
|
-
str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
|
|
550
|
-
di = [ 'EC', $1 ]
|
|
551
|
-
if write_flag then
|
|
552
|
-
@info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
|
|
553
|
-
@list_ids << di
|
|
554
|
-
end
|
|
555
|
-
end
|
|
556
|
-
di
|
|
557
|
-
end
|
|
558
|
-
private :match_EC
|
|
559
|
-
|
|
560
|
-
def parse_square_brackets(str)
|
|
561
|
-
r = []
|
|
562
|
-
str.scan(/\[([^\]]*)\]/) do |x|
|
|
563
|
-
r << x[0]
|
|
564
|
-
end
|
|
565
|
-
r
|
|
566
|
-
end
|
|
567
|
-
private :parse_square_brackets
|
|
568
|
-
|
|
569
|
-
def parse_ColonSepID(str)
|
|
570
|
-
di = str.split(':', 2)
|
|
571
|
-
di << nil if di.size <= 1
|
|
572
|
-
@list_ids << di
|
|
573
|
-
di
|
|
574
|
-
end
|
|
575
|
-
private :parse_ColonSepID
|
|
576
|
-
|
|
577
|
-
def parse_NSIDs(ary)
|
|
578
|
-
# this method destroys ary
|
|
579
|
-
data = []
|
|
580
|
-
while token = ary.shift
|
|
581
|
-
if labels = self.class::NSIDs[token] then
|
|
582
|
-
di = [ token ]
|
|
583
|
-
idtype = token
|
|
584
|
-
labels.each do |x|
|
|
585
|
-
token = ary.shift
|
|
586
|
-
break unless token
|
|
587
|
-
if self.class::NSIDs[token] then
|
|
588
|
-
ary.unshift(token)
|
|
589
|
-
break #each
|
|
590
|
-
end
|
|
591
|
-
if token.length > 0 then
|
|
592
|
-
di << token
|
|
593
|
-
else
|
|
594
|
-
di << nil
|
|
595
|
-
end
|
|
596
|
-
end
|
|
597
|
-
data << di
|
|
598
|
-
else
|
|
599
|
-
if token.length > 0 then
|
|
600
|
-
# UCID (uncontrolled identifiers)
|
|
601
|
-
di = [ token ]
|
|
602
|
-
data << di
|
|
603
|
-
@info['ucid'] = token unless @info['ucid']
|
|
604
|
-
end
|
|
605
|
-
break #while
|
|
606
|
-
end
|
|
607
|
-
end #while
|
|
608
|
-
@list_ids.concat data
|
|
609
|
-
data
|
|
610
|
-
end #def parse_NSIDs
|
|
611
|
-
private :parse_NSIDs
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
# Shows original string.
|
|
615
|
-
# Note that the result of this method may be different from
|
|
616
|
-
# original string which is given in FastaDefline.new method.
|
|
617
|
-
def to_s
|
|
618
|
-
@deflines.collect { |a|
|
|
619
|
-
s = a[0]
|
|
620
|
-
(a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
|
|
621
|
-
}.join("\x01")
|
|
622
|
-
end
|
|
623
|
-
|
|
624
|
-
# Shows description.
|
|
625
|
-
def description
  # The description is the last element of the first defline; to_a turns a
  # missing first defline (nil) into an empty array so the result is nil.
  first_defline = @deflines[0].to_a
  first_defline[-1]
end
|
|
628
|
-
|
|
629
|
-
# Returns descriptions.
|
|
630
|
-
def descriptions
  # One entry per defline: its last element.
  @deflines.map { |defline| defline[-1] }
end
|
|
635
|
-
|
|
636
|
-
# Shows ID-like strings.
|
|
637
|
-
# Returns an array of strings.
|
|
638
|
-
def id_strings
  plain_id = /\A[A-Za-z0-9\.\-\_]+\z/
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      # Entry with fields: keep every non-nil field after the leading tag.
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ plain_id
      # Bare entry: keep it only when it consists solely of letters,
      # digits, '.', '-' and '_'.
      ids << entry[0]
    end
  end
  # Add case-sensitive description words (no kill-regexp filtering) that
  # match one of the two ID-like patterns below.
  id_like_words = words(true, []).select { |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  }
  ids.concat(id_like_words)
  ids
end
|
|
655
|
-
|
|
656
|
-
# Lower-case words that the words method drops from descriptions by default.
KillWords = %w[
  an the this that
  is are were was be can may might
  as at by for in of on to with
  from and or not
  dna rna mrna cdna orf
  aa nt pct id ec sp subsp
  similar involved identical identity
  cds clone library contig contigs
  homolog homologue homologs homologous
  protein proteins gene genes
  product products sequence sequences
  strain strains region regions
]

# Lookup table built from KillWords: word => true.
KillWordsHash = KillWords.inject({}) do |table, word|
  table[word] = true
  table
end

# Patterns for words that the words method drops by default.
KillRegexpArray = [
  # one to three digits, optionally followed by '%'
  /\A\d{1,3}\%?\z/,
  # the same two ID-like patterns that id_strings looks for
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
]
|
|
678
|
-
|
|
679
|
-
# Shows words used in the defline. Returns an Array.
|
|
680
|
-
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  # case_sensitive:: when false/nil the surviving words are lower-cased
  # kill_regexp::    words matching any of these patterns are dropped
  # kwhash::         words whose lower-cased form is a key here are dropped
  # Returns a sorted array of unique words.
  separators = /[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/
  kept = []
  descriptions.join(' ').split(separators).each do |word|
    # Trim decoration characters from both ends of the word.
    word.sub!(/\A[\$\*\-\+]+/, '')
    word.sub!(/[\$\*\-\=]+\z/, '')
    next if word.size <= 1
    next if kwhash[word.downcase]
    next if kill_regexp.find { |expr| expr =~ word }
    kept << word
  end
  kept.collect! { |word| word.downcase } unless case_sensitive
  kept.sort!
  kept.uniq!
  kept
end
|
|
704
|
-
|
|
705
|
-
# Returns the identifier for the given database name.
|
|
706
|
-
def get(dbname)
  # dbname is converted with to_s, so symbols and strings both work.
  db = dbname.to_s
  value = @info[db]
  return value if value

  # Not cached in @info: look for the first parsed entry tagged with db.
  entry = @list_ids.find { |ids| ids[0] == db }
  return value unless entry

  if entry.size <= 2
    value = entry[-1]
  else
    # Entry with several fields: prefer the field whose label (from the
    # class-level NSIDs table) comes first in this priority list.
    labels = self.class::NSIDs[db]
    %w[acc_version entry_id locus accession number].each do |label|
      pos = labels.index(label)
      next unless pos
      value = entry[pos + 1]
      break if value
    end
    # Fall back to the first non-nil field.
    value = entry[1..-1].find { |field| field } unless value
  end

  # Cache the result so later lookups are answered from @info.
  @info[db] = value if value
  value
end
|
|
728
|
-
|
|
729
|
-
# Returns an identifier by given type.
|
|
730
|
-
def get_by_type(type_str)
  # Scan the parsed entries in order; the first entry whose database tag
  # has a label equal to type_str decides the result (even when the field
  # at that position is nil).
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels
    pos = labels.index(type_str)
    return entry[pos + 1] if pos
  end
  nil
end
|
|
740
|
-
|
|
741
|
-
# Returns identifiers by given type.
|
|
742
|
-
def get_all_by_type(*type_strarg)
  # For every parsed entry whose database tag is listed in NSIDs, collect
  # the non-nil field stored under each of the requested labels.
  found = []
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels
    type_strarg.each do |type_str|
      pos = labels.index(type_str)
      next unless pos
      field = entry[pos + 1]
      found << field if field
    end
  end
  found
end
|
|
755
|
-
|
|
756
|
-
# Shows locus.
|
|
757
|
-
# If the entry has two or more such IDs,
|
|
758
|
-
# only the first ID is shown.
|
|
759
|
-
# Returns a string or nil.
|
|
760
|
-
def locus
  # Looked up once via get_by_type('locus'); the result (nil included) is
  # remembered in @locus and reused on later calls.
  return @locus if defined?(@locus)
  @locus = get_by_type('locus')
end
|
|
766
|
-
|
|
767
|
-
# Shows GI.
|
|
768
|
-
# If the entry has two or more such IDs,
|
|
769
|
-
# only the first ID is shown.
|
|
770
|
-
# Returns a string or nil.
|
|
771
|
-
def gi
  # Looked up once via get_by_type('gi'); the result (nil included) is
  # remembered in @gi and reused on later calls.
  return @gi if defined?(@gi)
  @gi = get_by_type('gi')
end
|
|
777
|
-
|
|
778
|
-
# Shows accession with version number.
|
|
779
|
-
# If the entry has two or more such IDs,
|
|
780
|
-
# only the first ID is shown.
|
|
781
|
-
# Returns a string or nil.
|
|
782
|
-
def acc_version
  # Looked up once via get_by_type('acc_version'); the result (nil
  # included) is remembered in @acc_version and reused on later calls.
  return @acc_version if defined?(@acc_version)
  @acc_version = get_by_type('acc_version')
end
|
|
788
|
-
|
|
789
|
-
# Shows accession numbers.
|
|
790
|
-
# Returns an array of strings.
|
|
791
|
-
def accessions
  # Computed once and remembered in @accessions.
  return @accessions if defined?(@accessions)
  @accessions = get_all_by_type('accession', 'acc_version')
  # Drop everything from the first '.' onward, in place.
  @accessions.collect! { |acc| acc.sub(/\..*\z/, '') }
  @accessions
end
|
|
798
|
-
|
|
799
|
-
# Shows an accession number.
|
|
800
|
-
def accession
  # Computed once and remembered in @accession.  When acc_version is
  # available its part before the first '.' is used; otherwise the first
  # element of accessions.
  return @accession if defined?(@accession)
  @accession = acc_version ? acc_version.split('.')[0] : accessions[0]
end
|
|
810
|
-
|
|
811
|
-
# Treats an unknown method name as a database name and answers it via get.
# Any extra arguments are forwarded to get unchanged.
#
# Returns the value from get.  When get finds nothing, nil/false is still
# returned if the name is a key of the class-level NSIDs table; otherwise
# an exception is raised.
#
# NOTE(review): the failure is raised as a plain string, i.e. a
# RuntimeError whose message merely starts with "NameError:" -- it is not
# an actual NameError/NoMethodError.  Left as is because callers may be
# rescuing the current exception type.
def method_missing(name, *args)
  value = get(name, *args)
  return value if value
  return value if self.class::NSIDs[name.to_s]
  raise "NameError: undefined method `#{name.inspect}'"
end
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
end #class FastaDefline
|
|
823
|
-
|
|
824
327
|
end #module Bio
|
|
825
328
|
|
|
826
329
|
if __FILE__ == $0
|