bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/fasta/defline.rb - FASTA defline parser class
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001, 2002
|
|
5
|
+
# GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>,
|
|
6
|
+
# Toshiaki Katayama <k@bioruby.org>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
|
|
10
|
+
#
|
|
11
|
+
# == Description
|
|
12
|
+
#
|
|
13
|
+
# Bio::FastaDefline is a parser class for definition line (defline)
|
|
14
|
+
# of the FASTA format.
|
|
15
|
+
#
|
|
16
|
+
# == Examples
|
|
17
|
+
#
|
|
18
|
+
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
|
|
19
|
+
# rub.entry_id ==> 'gi|671595'
|
|
20
|
+
# rub.get('emb') ==> 'CAA85678.1'
|
|
21
|
+
# rub.emb ==> 'CAA85678.1'
|
|
22
|
+
# rub.gi ==> '671595'
|
|
23
|
+
# rub.accession ==> 'CAA85678'
|
|
24
|
+
# rub.accessions ==> [ 'CAA85678' ]
|
|
25
|
+
# rub.acc_version ==> 'CAA85678.1'
|
|
26
|
+
# rub.locus ==> nil
|
|
27
|
+
# rub.list_ids ==> [["gi", "671595"],
|
|
28
|
+
# ["emb", "CAA85678.1", nil],
|
|
29
|
+
# ["Perovskia abrotanoides"]]
|
|
30
|
+
#
|
|
31
|
+
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
|
32
|
+
# ckr.entry_id ==> "gi|2495000"
|
|
33
|
+
# ckr.sp ==> "CCKR_CAVPO"
|
|
34
|
+
# ckr.pir ==> "I51898"
|
|
35
|
+
# ckr.gb ==> "AAB29504.1"
|
|
36
|
+
# ckr.gi ==> "2495000"
|
|
37
|
+
# ckr.accession ==> "AAB29504"
|
|
38
|
+
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
|
39
|
+
# ckr.acc_version ==> "AAB29504.1"
|
|
40
|
+
# ckr.locus ==> nil
|
|
41
|
+
# ckr.description ==>
|
|
42
|
+
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
|
43
|
+
# ckr.descriptions ==>
|
|
44
|
+
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
|
45
|
+
# "cholecystokinin A receptor - guinea pig",
|
|
46
|
+
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
|
47
|
+
# ckr.words ==>
|
|
48
|
+
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
|
49
|
+
# "receptor", "type"]
|
|
50
|
+
# ckr.id_strings ==>
|
|
51
|
+
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
|
52
|
+
# "544724", "AAB29504.1", "Cavia"]
|
|
53
|
+
# ckr.list_ids ==>
|
|
54
|
+
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
|
55
|
+
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
|
56
|
+
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
|
57
|
+
#
|
|
58
|
+
# == References
|
|
59
|
+
#
|
|
60
|
+
# * FASTA format (WikiPedia)
|
|
61
|
+
# http://en.wikipedia.org/wiki/FASTA_format
|
|
62
|
+
#
|
|
63
|
+
# * Fasta format description (NCBI)
|
|
64
|
+
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
|
65
|
+
#
|
|
66
|
+
|
|
67
|
+
module Bio
|
|
68
|
+
|
|
69
|
+
#--
|
|
70
|
+
# split from fasta.rb revision 1.28
|
|
71
|
+
#++
|
|
72
|
+
|
|
73
|
+
# Parses a FASTA defline and extracts IDs and other information.
|
|
74
|
+
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
|
|
75
|
+
# or ":"-separated IDs.
|
|
76
|
+
#
|
|
77
|
+
# specs are described in:
|
|
78
|
+
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
|
79
|
+
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
|
80
|
+
#
|
|
81
|
+
# === Examples
|
|
82
|
+
#
|
|
83
|
+
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
|
|
84
|
+
# rub.entry_id ==> 'gi|671595'
|
|
85
|
+
# rub.get('emb') ==> 'CAA85678.1'
|
|
86
|
+
# rub.emb ==> 'CAA85678.1'
|
|
87
|
+
# rub.gi ==> '671595'
|
|
88
|
+
# rub.accession ==> 'CAA85678'
|
|
89
|
+
# rub.accessions ==> [ 'CAA85678' ]
|
|
90
|
+
# rub.acc_version ==> 'CAA85678.1'
|
|
91
|
+
# rub.locus ==> nil
|
|
92
|
+
# rub.list_ids ==> [["gi", "671595"],
|
|
93
|
+
# ["emb", "CAA85678.1", nil],
|
|
94
|
+
# ["Perovskia abrotanoides"]]
|
|
95
|
+
#
|
|
96
|
+
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
|
|
97
|
+
# ckr.entry_id ==> "gi|2495000"
|
|
98
|
+
# ckr.sp ==> "CCKR_CAVPO"
|
|
99
|
+
# ckr.pir ==> "I51898"
|
|
100
|
+
# ckr.gb ==> "AAB29504.1"
|
|
101
|
+
# ckr.gi ==> "2495000"
|
|
102
|
+
# ckr.accession ==> "AAB29504"
|
|
103
|
+
# ckr.accessions ==> ["Q63931", "AAB29504"]
|
|
104
|
+
# ckr.acc_version ==> "AAB29504.1"
|
|
105
|
+
# ckr.locus ==> nil
|
|
106
|
+
# ckr.description ==>
|
|
107
|
+
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
|
|
108
|
+
# ckr.descriptions ==>
|
|
109
|
+
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
|
|
110
|
+
# "cholecystokinin A receptor - guinea pig",
|
|
111
|
+
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
|
|
112
|
+
# ckr.words ==>
|
|
113
|
+
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
|
|
114
|
+
# "receptor", "type"]
|
|
115
|
+
# ckr.id_strings ==>
|
|
116
|
+
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
|
|
117
|
+
# "544724", "AAB29504.1", "Cavia"]
|
|
118
|
+
# ckr.list_ids ==>
|
|
119
|
+
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
|
|
120
|
+
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
|
121
|
+
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
|
122
|
+
#
|
|
123
|
+
# === References
|
|
124
|
+
#
|
|
125
|
+
# * Fasta format description (NCBI)
|
|
126
|
+
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
|
127
|
+
#
|
|
128
|
+
# * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
|
|
129
|
+
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
|
|
130
|
+
#
|
|
131
|
+
# * README.formatdb
|
|
132
|
+
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
|
|
133
|
+
#
|
|
134
|
+
class FastaDefline

  # Table of recognized identifier tags.
  # Each key is the tag that starts a "|"-separated identifier and each
  # value is the ordered list of field labels that may follow the tag.
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                      # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],    # GenBank
    'emb' => [ 'acc_version', 'locus' ],    # EMBL
    'dbj' => [ 'acc_version', 'locus' ],    # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],       # PDB
    'bbs' => [ 'number' ],                  # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],   # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                # Local Sequence identifier

    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],   # PIR
    'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents

    # WU-BLAST only
    'bbm' => [ 'number' ],                  # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],                  # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],    # GenPept
    'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # Shows array that contains IDs (or ID-like strings).
  # Returns an array of arrays of strings.
  attr_reader :list_ids

  # Shows a possibly unique identifier.
  # Returns a string (nil when nothing has been parsed).
  attr_reader :entry_id

  # Parses the given string.
  # The string may hold several deflines joined by "\x01" (Ctrl-A);
  # each of them is handed to add_defline in order.
  def initialize(str)
    @deflines = []   # one parsed record per defline: [ separator, ids..., description ]
    @info     = {}   # cache of values looked up by name ('organism', 'ec', 'ucid', db tags)
    @list_ids = []
    @entry_id = nil

    str.split("\x01").each { |line| add_defline(line) }
  end #def initialize

  # Parses given string and adds parsed data.
  #
  # Three layouts are recognized, tried in this order:
  # 1. "|"-separated identifiers (NSIDs),
  # 2. a "db:id" colon-separated identifier,
  # 3. a single whitespace-delimited token followed by a description.
  # Anything else is stored verbatim as the identifier.
  #
  # The first defline added determines entry_id.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      raw_id = $1
      d = $2
      tks = raw_id.split('|')
      # String#split drops trailing empty fields; restore one so that
      # a trailing "|" is seen as an empty last field.
      tks << '' if raw_id[-1, 1] == '|'
      a = parse_NSIDs(tks)
      # When no identifier could be parsed (e.g. ">|foo"), keep the raw
      # token as the ID instead of failing on a[0] being nil.
      i = a.empty? ? raw_id : a[0].join('|')
      a.unshift('|')
      # Tokens left unconsumed by parse_NSIDs belong to the description.
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /\A[A-Z]/ then
          @list_ids << [ x ]
          # the first capitalized bracketed text is kept as the organism
          @info['organism'] = x unless @info['organism']
        end
      end

    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      # >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a , d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /:/ then
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
          @list_ids << [ $1 ]
        end
      end

    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      # >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end

    @deflines << this_line
    @entry_id = i unless @entry_id
  end

  # Scans str for EC numbers written as "EC:n.n.n.n", where each field
  # consists of digits and/or "-".
  #
  # When write_flag is true, every hit is appended to the ID list and
  # 'ec' info is set (an earlier value is replaced only while it still
  # contains "-").
  #
  # Returns [ 'EC', number ] for the last hit, or nil when nothing matched.
  def match_EC(str, write_flag = true)
    di = nil
    # (?: ) groups without capturing, so the only capture is the EC number.
    str.scan(/EC\:((?:[\-\d]+\.){3}[\-\d]+)/i) do |x|
      di = [ 'EC', x[0] ]
      if write_flag then
        @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns an array with the contents of every "[...]" found in str.
  def parse_square_brackets(str)
    r = []
    str.scan(/\[([^\]]*)\]/) do |x|
      r << x[0]
    end
    r
  end
  private :parse_square_brackets

  # Splits "db:id" at the first colon, appends the pair to the ID list
  # and returns it. A string without colon yields [ str, nil ].
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Converts "|"-separated tokens into identifier arrays.
  #
  # A token that is a key of NSIDs starts an identifier and takes as many
  # following tokens as the tag has labels (empty tokens become nil).
  # The first token that is not a known tag is stored as an uncontrolled
  # identifier (UCID) and ends parsing; remaining tokens stay in ary.
  #
  # The parsed identifiers are appended to the ID list and returned.
  def parse_NSIDs(ary)
    # this method destroys ary
    data = []
    while token = ary.shift
      if labels = self.class::NSIDs[token] then
        di = [ token ]
        labels.each do |x|
          token = ary.shift
          break unless token
          if self.class::NSIDs[token] then
            # the next identifier starts here; give the token back
            ary.unshift(token)
            break #each
          end
          if token.length > 0 then
            di << token
          else
            di << nil
          end
        end
        data << di
      else
        if token.length > 0 then
          # UCID (uncontrolled identifiers)
          di = [ token ]
          data << di
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
    end #while
    @list_ids.concat data
    data
  end #def parse_NSIDs
  private :parse_NSIDs


  # Shows original string.
  # Note that the result of this method may be different from
  # original string which is given in FastaDefline.new method.
  def to_s
    @deflines.collect { |a|
      s = a[0]
      (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Shows description of the first defline (nil when there is none).
  def description
    @deflines[0].to_a[-1]
  end

  # Returns descriptions of all deflines as an array of strings.
  def descriptions
    @deflines.collect do |a|
      a[-1]
    end
  end

  # Shows ID-like strings.
  # Returns an array of strings: the values of every parsed identifier,
  # followed by description words that look like identifiers.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2 then
        r.concat a[1..-1].compact
      else
        if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
    end
    r.concat( words(true, []).find_all do |x|
                x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
                  x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
              end)
    r
  end

  # Words (compared in lower case) that are dropped by the words method.
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]

  # KillWords as a hash, for lookup by key.
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Patterns of words that are dropped by the words method by default:
  # short numbers (optionally followed by "%") and ID-like tokens.
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Shows words used in the defline. Returns an Array.
  #
  # All descriptions are split at punctuation, whitespace and control
  # characters. Words of one character or less, words listed in kwhash
  # and words matching any of kill_regexp are removed. The result is
  # sorted and unique, and lower-cased unless case_sensitive is true.
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      # strip decoration characters around the word
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1 then
        nil
      elsif kwhash[x.downcase] then
        nil
      else
        if kill_regexp.find { |expr| expr =~ x } then
          nil
        else
          x
        end
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns an identifier by a database name.
  #
  # Cached info ('organism', 'ec', 'ucid', earlier results) is consulted
  # first. Otherwise the first identifier whose tag equals dbname is
  # used: for tags with several fields, the first non-nil field among
  # acc_version, entry_id, locus, accession and number (in this order)
  # is returned, falling back to the first non-nil field.
  # Returns nil when nothing is found.
  def get(dbname)
    db = dbname.to_s
    r = nil
    unless r = @info[db] then
      di = @list_ids.find { |x| x[0] == db.to_s }
      if di and di.size <= 2 then
        r = di[-1]
      elsif di then
        labels = self.class::NSIDs[db]
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number'].each do |x|
          if i = labels.index(x) then
            r = di[i+1]
            break if r
          end
        end
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
    end
    r
  end

  # Returns an identifier by given type (a field label used in NSIDs,
  # e.g. 'locus'). The first identifier whose tag has such a field
  # decides the result, even when that field is nil.
  def get_by_type(type_str)
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        if i = labels.index(type_str) then
          return x[i+1]
        end
      end
    end
    nil
  end

  # Returns identifiers by given types.
  # Collects every non-nil field whose label is one of type_strarg.
  def get_all_by_type(*type_strarg)
    d = []
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        type_strarg.each do |y|
          if i = labels.index(y) then
            d << x[i+1] if x[i+1]
          end
        end
      end
    end
    d
  end

  # Shows locus.
  # If the entry has two or more of such IDs,
  # only the first ID is shown.
  # Returns a string or nil.
  def locus
    unless defined?(@locus)
      @locus = get_by_type('locus')
    end
    @locus
  end

  # Shows GI.
  # If the entry has two or more of such IDs,
  # only the first ID is shown.
  # Returns a string or nil.
  def gi
    unless defined?(@gi) then
      @gi = get_by_type('gi')
    end
    @gi
  end

  # Shows accession with version number.
  # If the entry has two or more of such IDs,
  # only the first ID is shown.
  # Returns a string or nil.
  def acc_version
    unless defined?(@acc_version) then
      @acc_version = get_by_type('acc_version')
    end
    @acc_version
  end

  # Shows accession numbers (version suffix removed).
  # Returns an array of strings.
  def accessions
    unless defined?(@accessions) then
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # Shows an accession number: acc_version without its version suffix
  # when available, otherwise the first element of accessions.
  def accession
    unless defined?(@accession) then
      if acc_version then
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Unknown methods are treated as database names and passed to get,
  # e.g. defline.emb is the same as defline.get('emb').
  # Raises RuntimeError when nothing is found and the name is not a
  # key of NSIDs.
  def method_missing(name, *args)
    # raise ArgumentError,
    # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r and !(self.class::NSIDs[name.to_s]) then
      raise "NameError: undefined method `#{name.inspect}'"
    end
    r
  end

  # Reports which names method_missing can answer without raising.
  # Without this, implicit conversion probes (e.g. to_ary, issued by
  # Array#flatten or puts) would reach method_missing and fail with
  # its RuntimeError.
  def respond_to_missing?(name, include_private = false)
    db = name.to_s
    return true if self.class::NSIDs.key?(db)
    # The instance variables are unset on an allocated but uninitialized
    # object, so check them before delegating to get.
    return true if @info and @list_ids and get(db)
    super
  end


end #class FastaDefline
|
|
530
|
+
|
|
531
|
+
end #module Bio
|
|
532
|
+
|