bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
data/lib/bio/sequence/common.rb
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# Ryan Raaum <ryan@raaum.org>
|
|
7
7
|
# License:: The Ruby License
|
|
8
8
|
#
|
|
9
|
-
# $Id
|
|
9
|
+
# $Id:$
|
|
10
10
|
#
|
|
11
11
|
|
|
12
12
|
module Bio
|
|
@@ -37,7 +37,7 @@ class Sequence
|
|
|
37
37
|
# # Create a random sequence with the composition of a current sequence
|
|
38
38
|
# puts dna.randomize
|
|
39
39
|
module Common
|
|
40
|
-
|
|
40
|
+
|
|
41
41
|
# Return sequence as
|
|
42
42
|
# String[http://corelib.rubyonrails.org/classes/String.html].
|
|
43
43
|
# The original sequence is unchanged.
|
|
@@ -65,7 +65,7 @@ module Common
|
|
|
65
65
|
def seq
|
|
66
66
|
self.class.new(self)
|
|
67
67
|
end
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
# Normalize the current sequence, removing all whitespace and
|
|
70
70
|
# transforming all positions to uppercase if the sequence is AA or
|
|
71
71
|
# transforming all positions to lowercase if the sequence is NA.
|
|
@@ -241,53 +241,30 @@ module Common
|
|
|
241
241
|
# * (optional) _hash_: Hash object
|
|
242
242
|
# *Returns*:: new Bio::Sequence::NA/AA object
|
|
243
243
|
def randomize(hash = nil)
|
|
244
|
-
length = self.length
|
|
245
244
|
if hash
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
245
|
+
tmp = ''
|
|
246
|
+
hash.each {|k, v|
|
|
247
|
+
tmp += k * v.to_i
|
|
248
|
+
}
|
|
249
249
|
else
|
|
250
|
-
|
|
250
|
+
tmp = self
|
|
251
251
|
end
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
yield max.first
|
|
264
|
-
else
|
|
265
|
-
seq += max.first
|
|
252
|
+
seq = self.class.new(tmp)
|
|
253
|
+
# Reference: http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
|
|
254
|
+
seq.length.downto(2) do |n|
|
|
255
|
+
k = rand(n)
|
|
256
|
+
c = seq[n - 1]
|
|
257
|
+
seq[n - 1] = seq[k]
|
|
258
|
+
seq[k] = c
|
|
259
|
+
end
|
|
260
|
+
if block_given? then
|
|
261
|
+
(0...seq.length).each do |i|
|
|
262
|
+
yield seq[i, 1]
|
|
266
263
|
end
|
|
264
|
+
return self.class.new('')
|
|
265
|
+
else
|
|
266
|
+
return seq
|
|
267
267
|
end
|
|
268
|
-
return self.class.new(seq)
|
|
269
|
-
end
|
|
270
|
-
|
|
271
|
-
# Generate a new random sequence with the given frequency of bases.
|
|
272
|
-
# The sequence length is determined by their cumulative sum.
|
|
273
|
-
# (See also Bio::Sequence::Common#randomize which creates a new
|
|
274
|
-
# randomized sequence object using the base composition of an existing
|
|
275
|
-
# sequence instance).
|
|
276
|
-
#
|
|
277
|
-
# counts = {'R'=>1,'L'=>2,'E'=>3,'A'=>4}
|
|
278
|
-
# puts Bio::Sequence::AA.randomize(counts) #=> "AAEAELALRE" (for example)
|
|
279
|
-
#
|
|
280
|
-
# You may also feed the output of randomize into a block
|
|
281
|
-
#
|
|
282
|
-
# actual_counts = {'R'=>0,'L'=>0,'E'=>0,'A'=>0}
|
|
283
|
-
# Bio::Sequence::AA.randomize(counts) {|x| actual_counts[x] += 1}
|
|
284
|
-
# actual_counts #=> {"A"=>4, "L"=>2, "E"=>3, "R"=>1}
|
|
285
|
-
# ---
|
|
286
|
-
# *Arguments*:
|
|
287
|
-
# * (optional) _hash_: Hash object
|
|
288
|
-
# *Returns*:: Bio::Sequence::NA/AA object
|
|
289
|
-
def self.randomize(*arg, &block)
|
|
290
|
-
self.new('').randomize(*arg, &block)
|
|
291
268
|
end
|
|
292
269
|
|
|
293
270
|
# Return a new sequence extracted from the original using a GenBank style
|
data/lib/bio/sequence/compat.rb
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# Ryan Raaum <ryan@raaum.org>
|
|
7
7
|
# License:: The Ruby License
|
|
8
8
|
#
|
|
9
|
-
# $Id
|
|
9
|
+
# $Id:$
|
|
10
10
|
#
|
|
11
11
|
|
|
12
12
|
|
|
@@ -30,7 +30,7 @@ class Sequence
|
|
|
30
30
|
# ---
|
|
31
31
|
# *Returns*:: String object
|
|
32
32
|
def to_s
|
|
33
|
-
String.new(
|
|
33
|
+
String.new(self.seq)
|
|
34
34
|
end
|
|
35
35
|
alias to_str to_s
|
|
36
36
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/sequence/dblink.rb - sequence ID with database name
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
# $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require 'bio/sequence'
|
|
12
|
+
|
|
13
|
+
# Bio::Sequence::DBLink stores IDs with the database name.
|
|
14
|
+
# Its main purpose is to store database cross-reference information
|
|
15
|
+
# for a sequence entry.
|
|
16
|
+
class Bio::Sequence::DBLink
|
|
17
|
+
|
|
18
|
+
# creates a new DBLink object
|
|
19
|
+
def initialize(database, primary_id, *secondary_ids)
|
|
20
|
+
@database = database
|
|
21
|
+
@id = primary_id
|
|
22
|
+
@secondary_ids = secondary_ids
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Database name, or namespace identifier (String).
|
|
26
|
+
attr_reader :database
|
|
27
|
+
|
|
28
|
+
# Primary identifier (String)
|
|
29
|
+
attr_reader :id
|
|
30
|
+
|
|
31
|
+
# Secondary identifiers (Array of String)
|
|
32
|
+
attr_reader :secondary_ids
|
|
33
|
+
|
|
34
|
+
#--
|
|
35
|
+
# class methods
|
|
36
|
+
#++
|
|
37
|
+
|
|
38
|
+
# Parses DR line in EMBL entry, and returns a DBLink object.
|
|
39
|
+
def self.parse_embl_DR_line(str)
|
|
40
|
+
str = str.sub(/\.\s*\z/, '')
|
|
41
|
+
str.sub!(/\ADR /, '')
|
|
42
|
+
self.new(*(str.split(/\s*\;\s*/, 3)))
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Parses DR line in UniProt entry, and returns a DBLink object.
|
|
46
|
+
def self.parse_uniprot_DR_line(str)
|
|
47
|
+
str = str.sub(/\.\s*\z/, '')
|
|
48
|
+
str.sub!(/\ADR /, '')
|
|
49
|
+
self.new(*(str.split(/\s*\;\s*/)))
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
end #class Bio::Sequence::DBLink
|
|
53
|
+
|
|
54
|
+
|
data/lib/bio/sequence/format.rb
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
#
|
|
2
2
|
# = bio/sequence/format.rb - various output format of the biological sequence
|
|
3
3
|
#
|
|
4
|
-
# Copyright:: Copyright (C) 2006
|
|
4
|
+
# Copyright:: Copyright (C) 2006-2008
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>,
|
|
6
6
|
# Naohisa Goto <ng@bioruby.org>,
|
|
7
|
-
# Ryan Raaum <ryan@raaum.org
|
|
7
|
+
# Ryan Raaum <ryan@raaum.org>,
|
|
8
|
+
# Jan Aerts <jan.aerts@bbsrc.ac.uk>
|
|
8
9
|
# License:: The Ruby License
|
|
9
10
|
#
|
|
10
11
|
# = TODO
|
|
11
12
|
#
|
|
12
13
|
# porting from N. Goto's feature-output.rb on BioRuby list.
|
|
13
14
|
#
|
|
14
|
-
# $Id: format.rb,v 1.4
|
|
15
|
+
# $Id: format.rb,v 1.4.2.8 2008/06/17 15:50:05 ngoto Exp $
|
|
15
16
|
#
|
|
16
17
|
|
|
18
|
+
require 'erb'
|
|
17
19
|
|
|
18
20
|
module Bio
|
|
19
21
|
|
|
20
|
-
autoload :Sequence, 'bio/sequence'
|
|
21
|
-
|
|
22
22
|
class Sequence
|
|
23
23
|
|
|
24
24
|
# = DESCRIPTION
|
|
@@ -33,149 +33,326 @@ class Sequence
|
|
|
33
33
|
# puts s.output(:embl)
|
|
34
34
|
module Format
|
|
35
35
|
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
36
|
+
# Repository of generic (or both nucleotide and protein) sequence
|
|
37
|
+
# formatter classes
|
|
38
|
+
module Formatter
|
|
39
|
+
|
|
40
|
+
# Raw format generator
|
|
41
|
+
autoload :Raw, 'bio/sequence/format_raw'
|
|
42
|
+
|
|
43
|
+
# Fasta format generator
|
|
44
|
+
autoload :Fasta, 'bio/db/fasta/format_fasta'
|
|
45
|
+
|
|
46
|
+
# NCBI-style Fasta format generator
|
|
47
|
+
# (resembles the EMBOSS "ncbi" format)
|
|
48
|
+
autoload :Fasta_ncbi, 'bio/db/fasta/format_fasta'
|
|
49
|
+
|
|
50
|
+
end #module Formatter
|
|
51
|
+
|
|
52
|
+
# Repository of nucleotide sequence formatter classes
|
|
53
|
+
module NucFormatter
|
|
54
|
+
|
|
55
|
+
# GenBank format generator
|
|
56
|
+
# Note that the name is 'Genbank' and NOT 'GenBank'
|
|
57
|
+
autoload :Genbank, 'bio/db/genbank/format_genbank'
|
|
58
|
+
|
|
59
|
+
# EMBL format generator
|
|
60
|
+
# Note that the name is 'Embl' and NOT 'EMBL'
|
|
61
|
+
autoload :Embl, 'bio/db/embl/format_embl'
|
|
62
|
+
|
|
63
|
+
end #module NucFormatter
|
|
64
|
+
|
|
65
|
+
# Repository of protein sequence formatter classes
|
|
66
|
+
module AminoFormatter
|
|
67
|
+
# currently no formats available
|
|
68
|
+
end #module AminoFormatter
|
|
69
|
+
|
|
70
|
+
# Formatter base class.
|
|
71
|
+
# Any formatter class should inherit this class.
|
|
72
|
+
class FormatterBase
|
|
73
|
+
|
|
74
|
+
# Returns a formatted string of the given sequence
|
|
75
|
+
# ---
|
|
76
|
+
# *Arguments*:
|
|
77
|
+
# * (required) _sequence_: Bio::Sequence object
|
|
78
|
+
# * (optional) _options_: a Hash object
|
|
79
|
+
# *Returns*:: String object
|
|
80
|
+
def self.output(sequence, options = {})
|
|
81
|
+
self.new(sequence, options).output
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# register new Erb template
|
|
85
|
+
def self.erb_template(str)
|
|
86
|
+
erb = ERB.new(str)
|
|
87
|
+
erb.def_method(self, 'output')
|
|
88
|
+
true
|
|
89
|
+
end
|
|
90
|
+
private_class_method :erb_template
|
|
91
|
+
|
|
92
|
+
# generates output data
|
|
93
|
+
# ---
|
|
94
|
+
# *Returns*:: String object
|
|
95
|
+
def output
|
|
96
|
+
raise NotImplementedError, 'should be implemented in subclass'
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# creates a new formatter object for output
|
|
100
|
+
def initialize(sequence, options = {})
|
|
101
|
+
@sequence = sequence
|
|
102
|
+
@options = options
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
private
|
|
106
|
+
|
|
107
|
+
# any unknown methods are delegated to the sequence object
|
|
108
|
+
def method_missing(sym, *args, &block) #:nodoc:
|
|
109
|
+
begin
|
|
110
|
+
@sequence.__send__(sym, *args, &block)
|
|
111
|
+
rescue NoMethodError => evar
|
|
112
|
+
lineno = __LINE__ - 2
|
|
113
|
+
file = __FILE__
|
|
114
|
+
bt_here = [ "#{file}:#{lineno}:in \`__send__\'",
|
|
115
|
+
"#{file}:#{lineno}:in \`method_missing\'"
|
|
116
|
+
]
|
|
117
|
+
if bt_here == evar.backtrace[0, 2] then
|
|
118
|
+
bt = evar.backtrace[2..-1]
|
|
119
|
+
evar = evar.class.new("undefined method \`#{sym.to_s}\' for #{self.inspect}")
|
|
120
|
+
evar.set_backtrace(bt)
|
|
121
|
+
end
|
|
122
|
+
raise(evar)
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end #class FormatterBase
|
|
126
|
+
|
|
127
|
+
# Using Bio::Sequence::Format, return a String with the Bio::Sequence
|
|
128
|
+
# object formatted in the given style.
|
|
45
129
|
#
|
|
46
|
-
#
|
|
130
|
+
# Formats currently implemented are: 'fasta', 'genbank', and 'embl'
|
|
47
131
|
#
|
|
48
132
|
# s = Bio::Sequence.new('atgc')
|
|
49
133
|
# puts s.output(:fasta) #=> "> \natgc\n"
|
|
134
|
+
#
|
|
135
|
+
# The style argument is given as a Ruby
|
|
136
|
+
# Symbol(http://www.ruby-doc.org/core/classes/Symbol.html)
|
|
50
137
|
# ---
|
|
51
|
-
# *Arguments*:
|
|
52
|
-
# * (
|
|
53
|
-
# * (optional) _width_: Fixnum (default nil)
|
|
138
|
+
# *Arguments*:
|
|
139
|
+
# * (optional) _format_: :fasta, :genbank, *or* :embl (default :fasta)
|
|
54
140
|
# *Returns*:: String object
|
|
55
|
-
def
|
|
56
|
-
|
|
141
|
+
def output(format = :fasta, options = {})
|
|
142
|
+
formatter_const = format.to_s.capitalize.intern
|
|
143
|
+
|
|
144
|
+
formatter_class = nil
|
|
145
|
+
get_formatter_repositories.each do |mod|
|
|
146
|
+
begin
|
|
147
|
+
formatter_class = mod.const_get(formatter_const)
|
|
148
|
+
rescue NameError
|
|
149
|
+
end
|
|
150
|
+
break if formatter_class
|
|
151
|
+
end
|
|
152
|
+
unless formatter_class then
|
|
153
|
+
raise "unknown format name #{format.inspect}"
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
formatter_class.output(self, options)
|
|
157
|
+
end
|
|
57
158
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
159
|
+
# Returns a list of available output formats for the sequence
|
|
160
|
+
# ---
|
|
161
|
+
# *Arguments*:
|
|
162
|
+
# *Returns*:: Array of Symbols
|
|
163
|
+
def list_output_formats
|
|
164
|
+
a = get_formatter_repositories.collect { |mod| mod.constants }
|
|
165
|
+
a.flatten!
|
|
166
|
+
a.collect! { |x| x.to_s.downcase.intern }
|
|
167
|
+
a
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
# returns formatter repository modules
|
|
173
|
+
def get_formatter_repositories
|
|
174
|
+
if self.moltype == Bio::Sequence::NA then
|
|
175
|
+
[ NucFormatter, Formatter ]
|
|
176
|
+
elsif self.moltype == Bio::Sequence::AA then
|
|
177
|
+
[ AminoFormatter, Formatter ]
|
|
61
178
|
else
|
|
62
|
-
|
|
179
|
+
[ NucFormatter, AminoFormatter, Formatter ]
|
|
63
180
|
end
|
|
64
181
|
end
|
|
65
182
|
|
|
183
|
+
#---
|
|
184
|
+
|
|
66
185
|
# Not yet implemented :)
|
|
67
186
|
# Remove the nodoc command after implementation!
|
|
68
187
|
# ---
|
|
69
188
|
# *Returns*:: String object
|
|
70
|
-
def format_gff #:nodoc:
|
|
71
|
-
|
|
72
|
-
end
|
|
189
|
+
#def format_gff #:nodoc:
|
|
190
|
+
# raise NotImplementedError
|
|
191
|
+
#end
|
|
192
|
+
|
|
193
|
+
#+++
|
|
194
|
+
|
|
195
|
+
# Formatting helper methods for INSD (NCBI, EMBL, DDBJ) feature table
|
|
196
|
+
module INSDFeatureHelper
|
|
197
|
+
private
|
|
73
198
|
|
|
74
199
|
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
|
|
75
200
|
# case, it would be difficult to successfully call this method outside
|
|
76
201
|
# its expected context).
|
|
77
202
|
#
|
|
78
|
-
# Output the Genbank format string of the sequence.
|
|
203
|
+
# Output the Genbank feature format string of the sequence.
|
|
79
204
|
# Used in Bio::Sequence#output.
|
|
80
205
|
# ---
|
|
81
206
|
# *Returns*:: String object
|
|
82
|
-
def
|
|
207
|
+
def format_features_genbank(features)
|
|
83
208
|
prefix = ' ' * 5
|
|
84
209
|
indent = prefix + ' ' * 16
|
|
85
210
|
fwidth = 79 - indent.length
|
|
86
|
-
|
|
87
|
-
format_features(prefix, indent, fwidth)
|
|
211
|
+
|
|
212
|
+
format_features(features, prefix, indent, fwidth)
|
|
88
213
|
end
|
|
89
214
|
|
|
90
215
|
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
|
|
91
216
|
# case, it would be difficult to successfully call this method outside
|
|
92
217
|
# its expected context).
|
|
93
218
|
#
|
|
94
|
-
# Output the EMBL format string of the sequence.
|
|
219
|
+
# Output the EMBL feature format string of the sequence.
|
|
95
220
|
# Used in Bio::Sequence#output.
|
|
96
221
|
# ---
|
|
97
222
|
# *Returns*:: String object
|
|
98
|
-
def
|
|
223
|
+
def format_features_embl(features)
|
|
99
224
|
prefix = 'FT '
|
|
100
225
|
indent = prefix + ' ' * 16
|
|
101
226
|
fwidth = 80 - indent.length
|
|
102
|
-
|
|
103
|
-
format_features(prefix, indent, fwidth)
|
|
227
|
+
|
|
228
|
+
format_features(features, prefix, indent, fwidth)
|
|
104
229
|
end
|
|
105
230
|
|
|
231
|
+
# format INSD features
|
|
232
|
+
def format_features(features, prefix, indent, width)
|
|
233
|
+
result = []
|
|
234
|
+
features.each do |feature|
|
|
235
|
+
result.push format_feature(feature, prefix, indent, width)
|
|
236
|
+
end
|
|
237
|
+
return result.join('')
|
|
238
|
+
end
|
|
106
239
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
result = ''
|
|
111
|
-
@features.each do |feature|
|
|
112
|
-
result << prefix + sprintf("%-16s", feature.feature)
|
|
113
|
-
|
|
114
|
-
position = feature.position
|
|
115
|
-
#position = feature.locations.to_s
|
|
240
|
+
# format an INSD feature
|
|
241
|
+
def format_feature(feature, prefix, indent, width)
|
|
242
|
+
result = prefix + sprintf("%-16s", feature.feature)
|
|
116
243
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
result << head << line
|
|
120
|
-
head = indent
|
|
121
|
-
end
|
|
244
|
+
position = feature.position
|
|
245
|
+
#position = feature.locations.to_s
|
|
122
246
|
|
|
123
|
-
|
|
124
|
-
|
|
247
|
+
result << wrap_and_split_lines(position, width).join("\n" + indent)
|
|
248
|
+
result << "\n"
|
|
249
|
+
result << format_qualifiers(feature.qualifiers, indent, width)
|
|
125
250
|
return result
|
|
126
251
|
end
|
|
127
252
|
|
|
253
|
+
# format qualifiers
|
|
128
254
|
def format_qualifiers(qualifiers, indent, width)
|
|
129
|
-
qualifiers.
|
|
255
|
+
qualifiers.collect do |qualifier|
|
|
130
256
|
q = qualifier.qualifier
|
|
131
257
|
v = qualifier.value.to_s
|
|
132
258
|
|
|
133
259
|
if v == true
|
|
134
|
-
lines =
|
|
260
|
+
lines = wrap_with_newline('/' + q, width)
|
|
135
261
|
elsif q == 'translation'
|
|
136
|
-
lines = fold(
|
|
262
|
+
lines = fold("/#{q}=\"#{v}\"", width)
|
|
137
263
|
else
|
|
138
|
-
if v[/\D/]
|
|
264
|
+
if v[/\D/] or q == 'chromosome'
|
|
139
265
|
#v.delete!("\x00-\x1f\x7f-\xff")
|
|
140
266
|
v.gsub!(/"/, '""')
|
|
141
267
|
v = '"' + v + '"'
|
|
142
268
|
end
|
|
143
|
-
lines =
|
|
269
|
+
lines = wrap_with_newline('/' + q + '=' + v, width)
|
|
144
270
|
end
|
|
145
271
|
|
|
146
|
-
|
|
147
|
-
|
|
272
|
+
lines.gsub!(/^/, indent)
|
|
273
|
+
lines
|
|
274
|
+
end.join
|
|
148
275
|
end
|
|
149
276
|
|
|
150
277
|
def fold(str, width)
|
|
151
278
|
str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
|
|
152
279
|
end
|
|
153
280
|
|
|
154
|
-
def
|
|
281
|
+
def fold_and_split_lines(str, width)
|
|
282
|
+
str.scan(Regexp.new(".{1,#{width}}"))
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
def wrap_and_split_lines(str, width)
|
|
155
286
|
result = []
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
287
|
+
lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
|
|
288
|
+
lefts.each do |left|
|
|
289
|
+
left.rstrip!
|
|
290
|
+
while left and left.length > width
|
|
291
|
+
line = nil
|
|
292
|
+
width.downto(1) do |i|
|
|
293
|
+
if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
|
|
294
|
+
line = left[0..(i-1)].sub(/ +\z/, '')
|
|
295
|
+
left = left[i..-1].sub(/\A +/, '')
|
|
296
|
+
break
|
|
297
|
+
end
|
|
164
298
|
end
|
|
299
|
+
if line.nil? then
|
|
300
|
+
line = left[0..(width-1)]
|
|
301
|
+
left = left[width..-1]
|
|
302
|
+
end
|
|
303
|
+
result << line
|
|
304
|
+
left = nil if left.to_s.empty?
|
|
165
305
|
end
|
|
166
|
-
if
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
306
|
+
result << left if left
|
|
307
|
+
end
|
|
308
|
+
return result
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
def wrap_with_newline(str, width)
|
|
312
|
+
result = wrap_and_split_lines(str, width)
|
|
313
|
+
result_string = result.join("\n")
|
|
314
|
+
result_string << "\n" unless result_string.empty?
|
|
315
|
+
return result_string
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
def wrap(str, width = 80, prefix = '')
|
|
319
|
+
actual_width = width - prefix.length
|
|
320
|
+
result = wrap_and_split_lines(str, actual_width)
|
|
321
|
+
result_string = result.join("\n#{prefix}")
|
|
322
|
+
result_string = prefix + result_string unless result_string.empty?
|
|
323
|
+
return result_string
|
|
324
|
+
end
|
|
325
|
+
|
|
326
|
+
#--
|
|
327
|
+
# internal use only
|
|
328
|
+
MonthStr = [ nil,
|
|
329
|
+
'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
|
|
330
|
+
'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
|
|
331
|
+
].collect { |x| x.freeze }.freeze
|
|
332
|
+
#++
|
|
333
|
+
|
|
334
|
+
# formats a date from Date, DateTime, or Time object, or String.
|
|
335
|
+
def format_date(d)
|
|
336
|
+
begin
|
|
337
|
+
yy = d.year
|
|
338
|
+
mm = d.month
|
|
339
|
+
dd = d.day
|
|
340
|
+
rescue NoMethodError, NameError, ArgumentError, TypeError
|
|
341
|
+
return sprintf("%-11s", d)
|
|
171
342
|
end
|
|
172
|
-
|
|
173
|
-
|
|
343
|
+
sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
|
|
344
|
+
end
|
|
345
|
+
|
|
346
|
+
# null date
|
|
347
|
+
def null_date
|
|
348
|
+
Date.new(0, 1, 1)
|
|
174
349
|
end
|
|
175
350
|
|
|
176
|
-
end #
|
|
351
|
+
end #module INSDFeatureHelper
|
|
352
|
+
|
|
353
|
+
end #module Format
|
|
177
354
|
|
|
178
|
-
end # Sequence
|
|
355
|
+
end #class Sequence
|
|
179
356
|
|
|
180
|
-
end # Bio
|
|
357
|
+
end #module Bio
|
|
181
358
|
|