bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/fasta/fasta_to_biosequence.rb - Bio::FastaFormat to Bio::Sequence adapter module
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
# $Id:$
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require 'bio/sequence'
|
|
12
|
+
require 'bio/sequence/adapter'
|
|
13
|
+
|
|
14
|
+
# Internal use only. Normal users should not use this module.
|
|
15
|
+
#
|
|
16
|
+
# Bio::FastaFormat to Bio::Sequence adapter module.
|
|
17
|
+
# It is internally used in Bio::FastaFormat#to_biosequence.
|
|
18
|
+
#
|
|
19
|
+
module Bio::Sequence::Adapter::FastaFormat
|
|
20
|
+
|
|
21
|
+
extend Bio::Sequence::Adapter
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
def_biosequence_adapter :seq
|
|
26
|
+
|
|
27
|
+
# primary accession
|
|
28
|
+
def_biosequence_adapter :primary_accession do |orig|
|
|
29
|
+
orig.identifiers.accessions.first or orig.identifiers.entry_id
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# secondary accessions
|
|
33
|
+
def_biosequence_adapter :secondary_accessions do |orig|
|
|
34
|
+
orig.identifiers.accessions[1..-1]
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# entry_id
|
|
38
|
+
def_biosequence_adapter :entry_id do |orig|
|
|
39
|
+
orig.identifiers.locus or orig.identifiers.accessions.first or
|
|
40
|
+
orig.identifiers.entry_id
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# NCBI GI is stored on other_seqids
|
|
44
|
+
def_biosequence_adapter :other_seqids do |orig|
|
|
45
|
+
other = []
|
|
46
|
+
if orig.identifiers.gi then
|
|
47
|
+
other.push Bio::Sequence::DBLink.new('GI', orig.identifiers.gi)
|
|
48
|
+
end
|
|
49
|
+
other.empty? ? nil : other
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# definition
|
|
53
|
+
def_biosequence_adapter :definition do |orig|
|
|
54
|
+
if orig.identifiers.accessions.empty? and
|
|
55
|
+
!(orig.identifiers.gi) then
|
|
56
|
+
orig.definition
|
|
57
|
+
else
|
|
58
|
+
orig.identifiers.description
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
end #module Bio::Sequence::Adapter::FastaFormat
|
|
63
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/fasta/format_fasta.rb - Fasta format generator
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2006-2008
|
|
5
|
+
# Toshiaki Katayama <k@bioruby.org>,
|
|
6
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
7
|
+
# Jan Aerts <jan.aerts@bbsrc.ac.uk>
|
|
8
|
+
# License:: The Ruby License
|
|
9
|
+
#
|
|
10
|
+
# $Id: format_fasta.rb,v 1.1.2.1 2008/03/04 11:26:59 ngoto Exp $
|
|
11
|
+
#
|
|
12
|
+
|
|
13
|
+
require 'bio/sequence/format'
|
|
14
|
+
|
|
15
|
+
module Bio::Sequence::Format::Formatter
|
|
16
|
+
|
|
17
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
|
|
18
|
+
# Simple Fasta format output class for Bio::Sequence.
|
|
19
|
+
class Fasta < Bio::Sequence::Format::FormatterBase
|
|
20
|
+
|
|
21
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
|
|
22
|
+
#
|
|
23
|
+
# Creates a new Fasta format generator object from the sequence.
|
|
24
|
+
#
|
|
25
|
+
# ---
|
|
26
|
+
# *Arguments*:
|
|
27
|
+
# * _sequence_: Bio::Sequence object
|
|
28
|
+
# * (optional) :header => _header_: String (default nil)
|
|
29
|
+
# * (optional) :width => _width_: Fixnum (default 70)
|
|
30
|
+
def initialize; end if false # dummy for RDoc
|
|
31
|
+
|
|
32
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
|
|
33
|
+
#
|
|
34
|
+
# Output the FASTA format string of the sequence.
|
|
35
|
+
#
|
|
36
|
+
# Currently, this method is used in Bio::Sequence#output like so,
|
|
37
|
+
#
|
|
38
|
+
# s = Bio::Sequence.new('atgc')
|
|
39
|
+
# puts s.output(:fasta) #=> "> \natgc\n"
|
|
40
|
+
# ---
|
|
41
|
+
# *Returns*:: String object
|
|
42
|
+
def output
|
|
43
|
+
header = @options[:header]
|
|
44
|
+
width = @options.has_key?(:width) ? @options[:width] : 70
|
|
45
|
+
seq = @sequence.seq
|
|
46
|
+
entry_id = @sequence.entry_id ||
|
|
47
|
+
"#{@sequence.primary_accession}.#{@sequence.sequence_version}"
|
|
48
|
+
definition = @sequence.definition
|
|
49
|
+
header ||= "#{entry_id} #{definition}"
|
|
50
|
+
|
|
51
|
+
">#{header}\n" +
|
|
52
|
+
if width
|
|
53
|
+
seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
|
|
54
|
+
else
|
|
55
|
+
seq.to_s + "\n"
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end #class Fasta
|
|
59
|
+
|
|
60
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
|
|
61
|
+
# NCBI-Style Fasta format output class for Bio::Sequence.
|
|
62
|
+
# (like "ncbi" format in EMBOSS)
|
|
63
|
+
#
|
|
64
|
+
# Note that this class is under construction.
|
|
65
|
+
class Fasta_ncbi < Bio::Sequence::Format::FormatterBase
|
|
66
|
+
|
|
67
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
|
|
68
|
+
#
|
|
69
|
+
# Output the FASTA format string of the sequence.
|
|
70
|
+
#
|
|
71
|
+
# Currently, this method is used in Bio::Sequence#output like so,
|
|
72
|
+
#
|
|
73
|
+
# s = Bio::Sequence.new('atgc')
|
|
74
|
+
# puts s.output(:ncbi) #=> "> \natgc\n"
|
|
75
|
+
# ---
|
|
76
|
+
# *Returns*:: String object
|
|
77
|
+
def output
|
|
78
|
+
width = 70
|
|
79
|
+
seq = @sequence.seq
|
|
80
|
+
#gi = @sequence.gi_number
|
|
81
|
+
dbname = 'lcl'
|
|
82
|
+
if @sequence.primary_accession.to_s.empty? then
|
|
83
|
+
idstr = @sequence.entry_id
|
|
84
|
+
else
|
|
85
|
+
idstr = "#{@sequence.primary_accession}.#{@sequence.sequence_version}"
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
definition = @sequence.definition
|
|
89
|
+
header = "#{dbname}|#{idstr} #{definition}"
|
|
90
|
+
|
|
91
|
+
">#{header}\n" + seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
|
|
92
|
+
end
|
|
93
|
+
end #class Fasta_ncbi
|
|
94
|
+
|
|
95
|
+
end #module Bio::Sequence::Format::Formatter
|
|
96
|
+
|
|
97
|
+
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# Copyright:: Copyright (C) 2004 Toshiaki Katayama <k@bioruby.org>
|
|
5
5
|
# License:: The Ruby License
|
|
6
6
|
#
|
|
7
|
-
# $Id: common.rb,v 1.11
|
|
7
|
+
# $Id: common.rb,v 1.11.2.5 2008/06/17 15:53:21 ngoto Exp $
|
|
8
8
|
#
|
|
9
9
|
|
|
10
10
|
require 'bio/db'
|
|
@@ -44,7 +44,7 @@ module Common
|
|
|
44
44
|
|
|
45
45
|
# ACCESSION -- Returns contents of the ACCESSION record as an Array.
|
|
46
46
|
def accessions
|
|
47
|
-
|
|
47
|
+
field_fetch('ACCESSION').strip.split(/\s+/)
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
|
|
@@ -137,9 +137,20 @@ module Common
|
|
|
137
137
|
unless @data['REFERENCE']
|
|
138
138
|
ary = []
|
|
139
139
|
toptag2array(get('REFERENCE')).each do |ref|
|
|
140
|
-
hash = Hash.new
|
|
140
|
+
hash = Hash.new
|
|
141
141
|
subtag2array(ref).each do |field|
|
|
142
142
|
case tag_get(field)
|
|
143
|
+
when /REFERENCE/
|
|
144
|
+
if /(\d+)(\s*\((.+)\))?/m =~ tag_cut(field) then
|
|
145
|
+
hash['embl_gb_record_number'] = $1.to_i
|
|
146
|
+
if $3 and $3 != 'sites' then
|
|
147
|
+
seqpos = $3
|
|
148
|
+
seqpos.sub!(/\A\s*bases\s+/, '')
|
|
149
|
+
seqpos.gsub!(/(\d+)\s+to\s+(\d+)/, "\\1-\\2")
|
|
150
|
+
seqpos.gsub!(/\s*\;\s*/, ', ')
|
|
151
|
+
hash['sequence_position'] = seqpos
|
|
152
|
+
end
|
|
153
|
+
end
|
|
143
154
|
when /AUTHORS/
|
|
144
155
|
authors = truncate(tag_cut(field))
|
|
145
156
|
authors = authors.split(/, /)
|
|
@@ -163,11 +174,14 @@ module Common
|
|
|
163
174
|
hash['medline'] = truncate(tag_cut(field))
|
|
164
175
|
when /PUBMED/
|
|
165
176
|
hash['pubmed'] = truncate(tag_cut(field))
|
|
177
|
+
when /REMARK/
|
|
178
|
+
hash['comments'] ||= []
|
|
179
|
+
hash['comments'].push truncate(tag_cut(field))
|
|
166
180
|
end
|
|
167
181
|
end
|
|
168
182
|
ary.push(Reference.new(hash))
|
|
169
183
|
end
|
|
170
|
-
@data['REFERENCE'] =
|
|
184
|
+
@data['REFERENCE'] = ary.extend(Bio::References::BackwardCompatibility)
|
|
171
185
|
end
|
|
172
186
|
if block_given?
|
|
173
187
|
@data['REFERENCE'].each do |r|
|
|
@@ -181,12 +195,15 @@ module Common
|
|
|
181
195
|
|
|
182
196
|
# COMMENT -- Returns contents of the COMMENT record as a String.
|
|
183
197
|
def comment
|
|
184
|
-
|
|
198
|
+
str = get('COMMENT').to_s.sub(/\ACOMMENT /, '')
|
|
199
|
+
str.gsub!(/^ {12}/, '')
|
|
200
|
+
str.chomp!
|
|
201
|
+
str
|
|
185
202
|
end
|
|
186
203
|
|
|
187
204
|
|
|
188
|
-
# FEATURES -- Returns contents of the FEATURES record as
|
|
189
|
-
#
|
|
205
|
+
# FEATURES -- Returns contents of the FEATURES record as an array of
|
|
206
|
+
# Bio::Feature objects.
|
|
190
207
|
def features
|
|
191
208
|
unless @data['FEATURES']
|
|
192
209
|
ary = []
|
|
@@ -228,7 +245,7 @@ module Common
|
|
|
228
245
|
parse_qualifiers(subary)
|
|
229
246
|
end
|
|
230
247
|
|
|
231
|
-
@data['FEATURES'] =
|
|
248
|
+
@data['FEATURES'] = ary.extend(Bio::Features::BackwardCompatibility)
|
|
232
249
|
end
|
|
233
250
|
if block_given?
|
|
234
251
|
@data['FEATURES'].each do |f|
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/genbank/format_genbank.rb - GenBank format generator
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008 Naohisa Goto <ng@bioruby.org>
|
|
5
|
+
# License:: The Ruby License
|
|
6
|
+
#
|
|
7
|
+
# $Id: format_genbank.rb,v 1.1.2.5 2008/06/17 15:59:24 ngoto Exp $
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
require 'bio/sequence/format'
|
|
11
|
+
|
|
12
|
+
module Bio::Sequence::Format::NucFormatter
|
|
13
|
+
|
|
14
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
|
|
15
|
+
# GenBank format output class for Bio::Sequence.
|
|
16
|
+
class Genbank < Bio::Sequence::Format::FormatterBase
|
|
17
|
+
|
|
18
|
+
# helper methods
|
|
19
|
+
include Bio::Sequence::Format::INSDFeatureHelper
|
|
20
|
+
|
|
21
|
+
private
|
|
22
|
+
|
|
23
|
+
# string wrapper for GenBank format
|
|
24
|
+
def genbank_wrap(str)
|
|
25
|
+
wrap(str.to_s, 67).gsub(/\n/, "\n" + " " * 12)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# wraps the string, appending a dot at the end unless it already ends with one
|
|
29
|
+
def genbank_wrap_dot(str)
|
|
30
|
+
str = str.to_s
|
|
31
|
+
str = str + '.' unless /\.\z/ =~ str
|
|
32
|
+
genbank_wrap(str)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# The given words (an Array of String) are wrapped in GenBank style.
|
|
36
|
+
# A word is never split in the middle.
|
|
37
|
+
def genbank_wrap_words(array)
|
|
38
|
+
width = 67
|
|
39
|
+
result = []
|
|
40
|
+
str = nil
|
|
41
|
+
array.each do |x|
|
|
42
|
+
if str then
|
|
43
|
+
if str.length + 1 + x.length > width then
|
|
44
|
+
str = nil
|
|
45
|
+
else
|
|
46
|
+
str.concat ' '
|
|
47
|
+
str.concat x
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
unless str then
|
|
51
|
+
str = "#{x}"
|
|
52
|
+
result.push str
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
result.join("\n" + " " * 12)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# formats references
|
|
59
|
+
def reference_format_genbank(ref, num)
|
|
60
|
+
pos = ref.sequence_position.to_s.gsub(/\s/, '')
|
|
61
|
+
pos.gsub!(/(\d+)\-(\d+)/, "\\1 to \\2")
|
|
62
|
+
pos.gsub!(/\s*\,\s*/, '; ')
|
|
63
|
+
if pos.empty?
|
|
64
|
+
pos = ''
|
|
65
|
+
else
|
|
66
|
+
pos = " (bases #{pos})"
|
|
67
|
+
end
|
|
68
|
+
volissue = "#{ref.volume.to_s}"
|
|
69
|
+
volissue += " (#{ref.issue})" unless ref.issue.to_s.empty?
|
|
70
|
+
journal = "#{ref.journal.to_s}"
|
|
71
|
+
journal += " #{volissue}" unless volissue.empty?
|
|
72
|
+
journal += ", #{ref.pages}" unless ref.pages.to_s.empty?
|
|
73
|
+
journal += " (#{ref.year})" unless ref.year.to_s.empty?
|
|
74
|
+
|
|
75
|
+
alist = ref.authors.collect do |x|
|
|
76
|
+
y = x.to_s.strip.split(/\, *([^\,]+)\z/)
|
|
77
|
+
y[1].gsub!(/\. +/, '.') if y[1]
|
|
78
|
+
y.join(',')
|
|
79
|
+
end
|
|
80
|
+
lastauthor = alist.pop
|
|
81
|
+
last2author = alist.pop
|
|
82
|
+
alist.each { |x| x.concat ',' }
|
|
83
|
+
alist.push last2author if last2author
|
|
84
|
+
alist.push "and" unless alist.empty?
|
|
85
|
+
alist.push lastauthor.to_s
|
|
86
|
+
result = <<__END_OF_REFERENCE__
|
|
87
|
+
REFERENCE #{ genbank_wrap(sprintf('%-2d%s', num, pos))}
|
|
88
|
+
AUTHORS #{ genbank_wrap_words(alist) }
|
|
89
|
+
TITLE #{ genbank_wrap(ref.title.to_s) }
|
|
90
|
+
JOURNAL #{ genbank_wrap(journal) }
|
|
91
|
+
__END_OF_REFERENCE__
|
|
92
|
+
unless ref.pubmed.to_s.empty? then
|
|
93
|
+
result.concat " PUBMED #{ genbank_wrap(ref.pubmed) }\n"
|
|
94
|
+
end
|
|
95
|
+
if ref.comments and !(ref.comments.empty?) then
|
|
96
|
+
ref.comments.each do |c|
|
|
97
|
+
result.concat " REMARK #{ genbank_wrap(c) }\n"
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
result
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# formats comments lines as GenBank
|
|
104
|
+
def comments_format_genbank(cmnts)
|
|
105
|
+
return '' if !cmnts or cmnts.empty?
|
|
106
|
+
cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
|
|
107
|
+
a = []
|
|
108
|
+
cmnts.each do |str|
|
|
109
|
+
a.push "COMMENT #{ genbank_wrap(str) }\n"
|
|
110
|
+
end
|
|
111
|
+
a.join('')
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# formats sequence lines as GenBank
|
|
115
|
+
def seq_format_genbank(str)
|
|
116
|
+
i = 1
|
|
117
|
+
result = str.gsub(/.{1,60}/) do |s|
|
|
118
|
+
s = s.gsub(/.{1,10}/, ' \0')
|
|
119
|
+
y = sprintf("%9d%s\n", i, s)
|
|
120
|
+
i += 60
|
|
121
|
+
y
|
|
122
|
+
end
|
|
123
|
+
result
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# formats date
|
|
127
|
+
def date_format_genbank
|
|
128
|
+
date_modified || date_created || null_date
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# molecule type
|
|
132
|
+
def mol_type_genbank
|
|
133
|
+
if /(DNA|(t|r|m|u|sn|sno)?RNA)/i =~ molecule_type.to_s then
|
|
134
|
+
$1.sub(/[DR]NA/) { |x| x.upcase }
|
|
135
|
+
else
|
|
136
|
+
'NA'
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# NCBI GI number
|
|
141
|
+
def ncbi_gi_number
|
|
142
|
+
ids = other_seqids
|
|
143
|
+
if ids and r = ids.find { |x| x.database == 'GI' } then
|
|
144
|
+
r.id
|
|
145
|
+
else
|
|
146
|
+
nil
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# strandedness
|
|
151
|
+
def strandedness_genbank
|
|
152
|
+
return nil unless strandedness
|
|
153
|
+
case strandedness
|
|
154
|
+
when 'single'; 'ss-';
|
|
155
|
+
when 'double'; 'ds-';
|
|
156
|
+
when 'mixed'; 'ms-';
|
|
157
|
+
else; nil
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Erb template of GenBank format for Bio::Sequence
|
|
162
|
+
erb_template <<'__END_OF_TEMPLATE__'
|
|
163
|
+
LOCUS <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", strandedness_genbank) %><%= sprintf("%-6s", mol_type_genbank) %> <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= date_format_genbank %>
|
|
164
|
+
DEFINITION <%= genbank_wrap_dot(definition.to_s) %>
|
|
165
|
+
ACCESSION <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
|
|
166
|
+
VERSION <%= primary_accession %>.<%= sequence_version %><% if gi = ncbi_gi_number then %> GI:<%= gi %><% end %>
|
|
167
|
+
KEYWORDS <%= genbank_wrap_dot((keywords or []).join('; ')) %>
|
|
168
|
+
SOURCE <%= genbank_wrap(species) %>
|
|
169
|
+
ORGANISM <%= genbank_wrap(species) %>
|
|
170
|
+
<%= genbank_wrap_dot((classification or []).join('; ')) %>
|
|
171
|
+
<%
|
|
172
|
+
n = 0
|
|
173
|
+
(references or []).each do |ref|
|
|
174
|
+
n += 1
|
|
175
|
+
%><%= reference_format_genbank(ref, n) %><%
|
|
176
|
+
end
|
|
177
|
+
%><%= comments_format_genbank(comments)
|
|
178
|
+
%>FEATURES Location/Qualifiers
|
|
179
|
+
<%= format_features_genbank(features || [])
|
|
180
|
+
%>ORIGIN
|
|
181
|
+
<%= seq_format_genbank(seq)
|
|
182
|
+
%>//
|
|
183
|
+
__END_OF_TEMPLATE__
|
|
184
|
+
|
|
185
|
+
end #class Genbank
|
|
186
|
+
end #module Bio::Sequence::Format::NucFormatter
|
|
187
|
+
|
|
@@ -4,11 +4,14 @@
|
|
|
4
4
|
# Copyright:: Copyright (C) 2000-2005 Toshiaki Katayama <k@bioruby.org>
|
|
5
5
|
# License:: The Ruby License
|
|
6
6
|
#
|
|
7
|
-
# $Id: genbank.rb,v 0.40
|
|
7
|
+
# $Id: genbank.rb,v 0.40.2.4 2008/06/17 15:56:18 ngoto Exp $
|
|
8
8
|
#
|
|
9
9
|
|
|
10
|
+
require 'date'
|
|
10
11
|
require 'bio/db'
|
|
11
12
|
require 'bio/db/genbank/common'
|
|
13
|
+
require 'bio/sequence'
|
|
14
|
+
require 'bio/sequence/dblink'
|
|
12
15
|
|
|
13
16
|
module Bio
|
|
14
17
|
|
|
@@ -121,10 +124,42 @@ class GenBank < NCBIDB
|
|
|
121
124
|
alias naseq seq
|
|
122
125
|
alias nalen length
|
|
123
126
|
|
|
127
|
+
# (obsolete???) length of the sequence
|
|
124
128
|
def seq_len
|
|
125
129
|
seq.length
|
|
126
130
|
end
|
|
127
131
|
|
|
132
|
+
# modified date. Returns Date object, String or nil.
|
|
133
|
+
def date_modified
|
|
134
|
+
begin
|
|
135
|
+
Date.parse(self.date)
|
|
136
|
+
rescue ArgumentError, TypeError, NoMethodError, NameError
|
|
137
|
+
self.date
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Taxonomy classification. Returns an array of strings.
|
|
142
|
+
def classification
|
|
143
|
+
self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Strandedness. Returns one of 'single', 'double', 'mixed', or nil.
|
|
147
|
+
def strandedness
|
|
148
|
+
case self.strand.to_s.downcase
|
|
149
|
+
when 'ss-'; 'single'
|
|
150
|
+
when 'ds-'; 'double'
|
|
151
|
+
when 'ms-'; 'mixed'
|
|
152
|
+
else nil; end
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# converts Bio::GenBank to Bio::Sequence
|
|
156
|
+
# ---
|
|
157
|
+
# *Arguments*:
|
|
158
|
+
# *Returns*:: Bio::Sequence object
|
|
159
|
+
def to_biosequence
|
|
160
|
+
Bio::Sequence.adapter(self, Bio::Sequence::Adapter::GenBank)
|
|
161
|
+
end
|
|
162
|
+
|
|
128
163
|
end # GenBank
|
|
129
164
|
end # Bio
|
|
130
165
|
|