bio 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3421 -0
- data/KNOWN_ISSUES.rdoc +88 -0
- data/README.rdoc +252 -0
- data/README_DEV.rdoc +285 -0
- data/Rakefile +143 -0
- data/bin/bioruby +0 -0
- data/bin/br_biofetch.rb +0 -0
- data/bin/br_bioflat.rb +12 -1
- data/bin/br_biogetseq.rb +0 -0
- data/bin/br_pmfetch.rb +4 -3
- data/bioruby.gemspec +477 -0
- data/bioruby.gemspec.erb +117 -0
- data/doc/Changes-0.7.rd +7 -0
- data/doc/Changes-1.3.rdoc +239 -0
- data/doc/Tutorial.rd +296 -184
- data/doc/Tutorial.rd.html +1031 -0
- data/doc/Tutorial.rd.ja +111 -45
- data/doc/Tutorial.rd.ja.html +2225 -0
- data/doc/bioruby.css +281 -0
- data/extconf.rb +2 -0
- data/lib/bio.rb +29 -4
- data/lib/bio/appl/blast.rb +306 -121
- data/lib/bio/appl/blast/ddbj.rb +142 -0
- data/lib/bio/appl/blast/format0.rb +35 -25
- data/lib/bio/appl/blast/format8.rb +2 -2
- data/lib/bio/appl/blast/genomenet.rb +263 -0
- data/lib/bio/appl/blast/ncbioptions.rb +220 -0
- data/lib/bio/appl/blast/remote.rb +106 -0
- data/lib/bio/appl/blast/report.rb +260 -9
- data/lib/bio/appl/blast/rexml.rb +12 -5
- data/lib/bio/appl/blast/rpsblast.rb +277 -0
- data/lib/bio/appl/blast/wublast.rb +133 -12
- data/lib/bio/appl/blast/xmlparser.rb +35 -18
- data/lib/bio/appl/blat/report.rb +46 -5
- data/lib/bio/appl/emboss.rb +62 -13
- data/lib/bio/appl/fasta.rb +9 -11
- data/lib/bio/appl/genscan/report.rb +3 -3
- data/lib/bio/appl/hmmer.rb +1 -1
- data/lib/bio/appl/hmmer/report.rb +10 -10
- data/lib/bio/appl/paml/baseml.rb +95 -0
- data/lib/bio/appl/paml/baseml/report.rb +32 -0
- data/lib/bio/appl/paml/codeml.rb +242 -0
- data/lib/bio/appl/paml/codeml/rates.rb +67 -0
- data/lib/bio/appl/paml/codeml/report.rb +67 -0
- data/lib/bio/appl/paml/common.rb +348 -0
- data/lib/bio/appl/paml/common_report.rb +38 -0
- data/lib/bio/appl/paml/yn00.rb +103 -0
- data/lib/bio/appl/paml/yn00/report.rb +32 -0
- data/lib/bio/appl/psort.rb +2 -2
- data/lib/bio/appl/pts1.rb +5 -5
- data/lib/bio/appl/tmhmm/report.rb +10 -1
- data/lib/bio/command.rb +297 -41
- data/lib/bio/compat/features.rb +157 -0
- data/lib/bio/compat/references.rb +128 -0
- data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
- data/lib/bio/db/biosql/sequence.rb +508 -0
- data/lib/bio/db/embl/common.rb +28 -12
- data/lib/bio/db/embl/embl.rb +107 -9
- data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
- data/lib/bio/db/embl/format_embl.rb +190 -0
- data/lib/bio/db/embl/sptr.rb +15 -16
- data/lib/bio/db/fantom.rb +6 -8
- data/lib/bio/db/fasta.rb +10 -507
- data/lib/bio/db/fasta/defline.rb +532 -0
- data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
- data/lib/bio/db/fasta/format_fasta.rb +97 -0
- data/lib/bio/db/genbank/common.rb +25 -8
- data/lib/bio/db/genbank/format_genbank.rb +187 -0
- data/lib/bio/db/genbank/genbank.rb +36 -1
- data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
- data/lib/bio/db/gff.rb +1791 -119
- data/lib/bio/db/kegg/glycan.rb +2 -6
- data/lib/bio/db/lasergene.rb +3 -3
- data/lib/bio/db/medline.rb +4 -1
- data/lib/bio/db/newick.rb +10 -10
- data/lib/bio/db/pdb/chain.rb +6 -2
- data/lib/bio/db/pdb/pdb.rb +12 -3
- data/lib/bio/db/rebase.rb +7 -8
- data/lib/bio/db/soft.rb +3 -3
- data/lib/bio/feature.rb +1 -88
- data/lib/bio/io/biosql/biodatabase.rb +64 -0
- data/lib/bio/io/biosql/bioentry.rb +29 -0
- data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
- data/lib/bio/io/biosql/bioentry_path.rb +12 -0
- data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
- data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
- data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
- data/lib/bio/io/biosql/biosequence.rb +11 -0
- data/lib/bio/io/biosql/comment.rb +7 -0
- data/lib/bio/io/biosql/config/database.yml +20 -0
- data/lib/bio/io/biosql/dbxref.rb +13 -0
- data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
- data/lib/bio/io/biosql/location.rb +32 -0
- data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
- data/lib/bio/io/biosql/ontology.rb +10 -0
- data/lib/bio/io/biosql/reference.rb +9 -0
- data/lib/bio/io/biosql/seqfeature.rb +32 -0
- data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
- data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
- data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
- data/lib/bio/io/biosql/taxon.rb +12 -0
- data/lib/bio/io/biosql/taxon_name.rb +9 -0
- data/lib/bio/io/biosql/term.rb +27 -0
- data/lib/bio/io/biosql/term_dbxref.rb +11 -0
- data/lib/bio/io/biosql/term_path.rb +12 -0
- data/lib/bio/io/biosql/term_relationship.rb +13 -0
- data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
- data/lib/bio/io/biosql/term_synonym.rb +10 -0
- data/lib/bio/io/das.rb +7 -7
- data/lib/bio/io/ddbjxml.rb +57 -0
- data/lib/bio/io/ensembl.rb +2 -2
- data/lib/bio/io/fetch.rb +28 -14
- data/lib/bio/io/flatfile.rb +17 -853
- data/lib/bio/io/flatfile/autodetection.rb +545 -0
- data/lib/bio/io/flatfile/buffer.rb +237 -0
- data/lib/bio/io/flatfile/index.rb +17 -7
- data/lib/bio/io/flatfile/indexer.rb +30 -12
- data/lib/bio/io/flatfile/splitter.rb +297 -0
- data/lib/bio/io/hinv.rb +442 -0
- data/lib/bio/io/keggapi.rb +2 -2
- data/lib/bio/io/ncbirest.rb +733 -0
- data/lib/bio/io/pubmed.rb +34 -80
- data/lib/bio/io/registry.rb +2 -2
- data/lib/bio/io/sql.rb +178 -357
- data/lib/bio/io/togows.rb +458 -0
- data/lib/bio/location.rb +106 -11
- data/lib/bio/pathway.rb +120 -14
- data/lib/bio/reference.rb +115 -101
- data/lib/bio/sequence.rb +164 -183
- data/lib/bio/sequence/adapter.rb +108 -0
- data/lib/bio/sequence/common.rb +22 -45
- data/lib/bio/sequence/compat.rb +2 -2
- data/lib/bio/sequence/dblink.rb +54 -0
- data/lib/bio/sequence/format.rb +254 -77
- data/lib/bio/sequence/format_raw.rb +23 -0
- data/lib/bio/shell.rb +3 -1
- data/lib/bio/shell/core.rb +2 -2
- data/lib/bio/shell/plugin/entry.rb +33 -4
- data/lib/bio/shell/plugin/ncbirest.rb +64 -0
- data/lib/bio/shell/plugin/togows.rb +40 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
- data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
- data/lib/bio/tree.rb +4 -2
- data/lib/bio/util/color_scheme.rb +2 -2
- data/lib/bio/util/contingency_table.rb +2 -2
- data/lib/bio/util/restriction_enzyme.rb +2 -2
- data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
- data/lib/bio/version.rb +25 -0
- data/rdoc.zsh +8 -0
- data/sample/any2fasta.rb +0 -0
- data/sample/biofetch.rb +0 -0
- data/sample/dbget +0 -0
- data/sample/demo_sequence.rb +158 -0
- data/sample/enzymes.rb +0 -0
- data/sample/fasta2tab.rb +0 -0
- data/sample/fastagrep.rb +72 -0
- data/sample/fastasort.rb +54 -0
- data/sample/fsplit.rb +0 -0
- data/sample/gb2fasta.rb +2 -3
- data/sample/gb2tab.rb +0 -0
- data/sample/gbtab2mysql.rb +0 -0
- data/sample/genes2nuc.rb +0 -0
- data/sample/genes2pep.rb +0 -0
- data/sample/genes2tab.rb +0 -0
- data/sample/genome2rb.rb +0 -0
- data/sample/genome2tab.rb +0 -0
- data/sample/goslim.rb +0 -0
- data/sample/gt2fasta.rb +0 -0
- data/sample/na2aa.rb +34 -0
- data/sample/pmfetch.rb +0 -0
- data/sample/pmsearch.rb +0 -0
- data/sample/ssearch2tab.rb +0 -0
- data/sample/tfastx2tab.rb +0 -0
- data/sample/vs-genes.rb +0 -0
- data/setup.rb +1596 -0
- data/test/data/blast/blastp-multi.m7 +188 -0
- data/test/data/command/echoarg2.bat +1 -0
- data/test/data/paml/codeml/control_file.txt +30 -0
- data/test/data/paml/codeml/output.txt +78 -0
- data/test/data/paml/codeml/rates +217 -0
- data/test/data/rpsblast/misc.rpsblast +193 -0
- data/test/data/soft/GDS100_partial.soft +0 -0
- data/test/data/soft/GSE3457_family_partial.soft +0 -0
- data/test/functional/bio/appl/test_pts1.rb +115 -0
- data/test/functional/bio/io/test_ensembl.rb +123 -80
- data/test/functional/bio/io/test_togows.rb +267 -0
- data/test/functional/bio/sequence/test_output_embl.rb +51 -0
- data/test/functional/bio/test_command.rb +301 -0
- data/test/runner.rb +17 -1
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
- data/test/unit/bio/appl/blast/test_report.rb +753 -35
- data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
- data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
- data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
- data/test/unit/bio/appl/test_blast.rb +135 -4
- data/test/unit/bio/appl/test_fasta.rb +2 -2
- data/test/unit/bio/appl/test_pts1.rb +1 -64
- data/test/unit/bio/db/embl/test_common.rb +15 -15
- data/test/unit/bio/db/embl/test_embl.rb +4 -4
- data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
- data/test/unit/bio/db/embl/test_sptr.rb +38 -1
- data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
- data/test/unit/bio/db/test_gff.rb +1151 -25
- data/test/unit/bio/db/test_medline.rb +127 -0
- data/test/unit/bio/db/test_nexus.rb +5 -1
- data/test/unit/bio/db/test_prosite.rb +4 -4
- data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
- data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
- data/test/unit/bio/io/test_ddbjxml.rb +8 -3
- data/test/unit/bio/io/test_fastacmd.rb +5 -5
- data/test/unit/bio/io/test_flatfile.rb +357 -106
- data/test/unit/bio/io/test_soapwsdl.rb +2 -2
- data/test/unit/bio/io/test_togows.rb +161 -0
- data/test/unit/bio/sequence/test_common.rb +210 -11
- data/test/unit/bio/sequence/test_compat.rb +3 -3
- data/test/unit/bio/sequence/test_dblink.rb +58 -0
- data/test/unit/bio/sequence/test_na.rb +2 -2
- data/test/unit/bio/test_command.rb +111 -50
- data/test/unit/bio/test_feature.rb +29 -1
- data/test/unit/bio/test_location.rb +566 -6
- data/test/unit/bio/test_pathway.rb +91 -65
- data/test/unit/bio/test_reference.rb +67 -13
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
- metadata +202 -167
- data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
data/lib/bio/db/embl/common.rb
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# Mitsuteru C. Nakao <n@bioruby.org>
|
|
6
6
|
# License:: The Ruby License
|
|
7
7
|
#
|
|
8
|
-
# $Id: common.rb,v 1.12
|
|
8
|
+
# $Id: common.rb,v 1.12.2.5 2008/05/07 12:22:10 ngoto Exp $
|
|
9
9
|
#
|
|
10
10
|
# == Description
|
|
11
11
|
#
|
|
@@ -73,6 +73,7 @@
|
|
|
73
73
|
|
|
74
74
|
require 'bio/db'
|
|
75
75
|
require 'bio/reference'
|
|
76
|
+
require 'bio/compat/references'
|
|
76
77
|
|
|
77
78
|
module Bio
|
|
78
79
|
class EMBLDB
|
|
@@ -270,33 +271,48 @@ module Common
|
|
|
270
271
|
def references
|
|
271
272
|
unless @data['references']
|
|
272
273
|
ary = self.ref.map {|ent|
|
|
273
|
-
hash = Hash.new
|
|
274
|
+
hash = Hash.new
|
|
274
275
|
ent.each {|key, value|
|
|
275
276
|
case key
|
|
277
|
+
when 'RN'
|
|
278
|
+
if /\[(\d+)\]/ =~ value.to_s
|
|
279
|
+
hash['embl_gb_record_number'] = $1.to_i
|
|
280
|
+
end
|
|
281
|
+
when 'RC'
|
|
282
|
+
unless value.to_s.strip.empty?
|
|
283
|
+
hash['comments'] ||= []
|
|
284
|
+
hash['comments'].push value
|
|
285
|
+
end
|
|
286
|
+
when 'RP'
|
|
287
|
+
hash['sequence_position'] = value
|
|
276
288
|
when 'RA'
|
|
277
|
-
|
|
289
|
+
a = value.split(/\, /)
|
|
290
|
+
a.each do |x|
|
|
291
|
+
x.sub!(/( [^ ]+)\z/, ",\\1")
|
|
292
|
+
end
|
|
293
|
+
hash['authors'] = a
|
|
278
294
|
when 'RT'
|
|
279
295
|
hash['title'] = value
|
|
280
296
|
when 'RL'
|
|
281
|
-
if
|
|
282
|
-
hash['journal'] = $1
|
|
297
|
+
if /(.*) (\d+) *(\(([^\)]+)\))?(\, |\:)([a-zA-Z\d]+\-[a-zA-Z\d]+) *\((\d+)\)\.?\z/ =~ value.to_s
|
|
298
|
+
hash['journal'] = $1.rstrip
|
|
283
299
|
hash['volume'] = $2
|
|
284
|
-
hash['issue'] = $
|
|
285
|
-
hash['pages'] = $
|
|
286
|
-
hash['year'] = $
|
|
300
|
+
hash['issue'] = $4
|
|
301
|
+
hash['pages'] = $6
|
|
302
|
+
hash['year'] = $7
|
|
287
303
|
else
|
|
288
304
|
hash['journal'] = value
|
|
289
305
|
end
|
|
290
|
-
when 'RX' # PUBMED,
|
|
291
|
-
value.split(
|
|
292
|
-
tag, xref = item.split(
|
|
306
|
+
when 'RX' # PUBMED, DOI, (AGRICOLA)
|
|
307
|
+
value.split(/\. /).each {|item|
|
|
308
|
+
tag, xref = item.split(/\; /).map {|i| i.strip.sub(/\.\z/, '') }
|
|
293
309
|
hash[ tag.downcase ] = xref
|
|
294
310
|
}
|
|
295
311
|
end
|
|
296
312
|
}
|
|
297
313
|
Reference.new(hash)
|
|
298
314
|
}
|
|
299
|
-
@data['references'] =
|
|
315
|
+
@data['references'] = ary.extend(Bio::References::BackwardCompatibility)
|
|
300
316
|
end
|
|
301
317
|
@data['references']
|
|
302
318
|
end
|
data/lib/bio/db/embl/embl.rb
CHANGED
|
@@ -2,10 +2,12 @@
|
|
|
2
2
|
# = bio/db/embl/embl.rb - EMBL database class
|
|
3
3
|
#
|
|
4
4
|
#
|
|
5
|
-
# Copyright:: Copyright (C) 2001-2007
|
|
5
|
+
# Copyright:: Copyright (C) 2001-2007
|
|
6
|
+
# Mitsuteru C. Nakao <n@bioruby.org>
|
|
7
|
+
# Jan Aerts <jan.aerts@bbsrc.ac.uk>
|
|
6
8
|
# License:: The Ruby License
|
|
7
9
|
#
|
|
8
|
-
# $Id: embl.rb,v 1.29
|
|
10
|
+
# $Id: embl.rb,v 1.29.2.7 2008/06/17 16:04:36 ngoto Exp $
|
|
9
11
|
#
|
|
10
12
|
# == Description
|
|
11
13
|
#
|
|
@@ -29,8 +31,13 @@
|
|
|
29
31
|
# http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html
|
|
30
32
|
#
|
|
31
33
|
|
|
34
|
+
require 'date'
|
|
32
35
|
require 'bio/db'
|
|
33
36
|
require 'bio/db/embl/common'
|
|
37
|
+
require 'bio/compat/features'
|
|
38
|
+
require 'bio/compat/references'
|
|
39
|
+
require 'bio/sequence'
|
|
40
|
+
require 'bio/sequence/dblink'
|
|
34
41
|
|
|
35
42
|
module Bio
|
|
36
43
|
class EMBL < EMBLDB
|
|
@@ -120,6 +127,14 @@ class EMBL < EMBLDB
|
|
|
120
127
|
end
|
|
121
128
|
alias molecule_type molecule
|
|
122
129
|
|
|
130
|
+
def data_class
|
|
131
|
+
id_line('DATA_CLASS')
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def topology
|
|
135
|
+
id_line('TOPOLOGY')
|
|
136
|
+
end
|
|
137
|
+
|
|
123
138
|
# returns DIVISION in the ID line.
|
|
124
139
|
# * Bio::EMBL#division -> String
|
|
125
140
|
def division
|
|
@@ -221,8 +236,8 @@ class EMBL < EMBLDB
|
|
|
221
236
|
# RN RC RP RX RA RT RL
|
|
222
237
|
#
|
|
223
238
|
# Bio::EMBLDB#ref
|
|
224
|
-
|
|
225
|
-
|
|
239
|
+
|
|
240
|
+
|
|
226
241
|
##
|
|
227
242
|
# DR Line; database cross-reference (>=0)
|
|
228
243
|
# "DR database_identifier; primary_identifier; secondary_identifier."
|
|
@@ -246,7 +261,6 @@ class EMBL < EMBLDB
|
|
|
246
261
|
# FT Line; feature table data (>=0)
|
|
247
262
|
def ft
|
|
248
263
|
unless @data['FT']
|
|
249
|
-
@data['FT'] = Array.new
|
|
250
264
|
ary = Array.new
|
|
251
265
|
in_quote = false
|
|
252
266
|
@orig['FT'].each_line do |line|
|
|
@@ -276,7 +290,7 @@ class EMBL < EMBLDB
|
|
|
276
290
|
parse_qualifiers(subary)
|
|
277
291
|
end
|
|
278
292
|
|
|
279
|
-
@data['FT'] =
|
|
293
|
+
@data['FT'] = ary.extend(Bio::Features::BackwardCompatibility)
|
|
280
294
|
end
|
|
281
295
|
if block_given?
|
|
282
296
|
@data['FT'].each do |feature|
|
|
@@ -311,9 +325,9 @@ class EMBL < EMBLDB
|
|
|
311
325
|
#
|
|
312
326
|
# CC Line; comments or notes (>=0)
|
|
313
327
|
def cc
|
|
314
|
-
get('CC')
|
|
328
|
+
get('CC').to_s.gsub(/^CC /, '')
|
|
315
329
|
end
|
|
316
|
-
|
|
330
|
+
alias comment cc
|
|
317
331
|
|
|
318
332
|
##
|
|
319
333
|
# XX Line; spacer line (many)
|
|
@@ -355,13 +369,96 @@ class EMBL < EMBLDB
|
|
|
355
369
|
# @orig[''] as sequence
|
|
356
370
|
# bb Line; (blanks) sequence data (>=1)
|
|
357
371
|
def seq
|
|
358
|
-
Sequence::NA.new( fetch('').gsub(/ /,'').gsub(/\d+/,'') )
|
|
372
|
+
Bio::Sequence::NA.new( fetch('').gsub(/ /,'').gsub(/\d+/,'') )
|
|
359
373
|
end
|
|
360
374
|
alias naseq seq
|
|
361
375
|
alias ntseq seq
|
|
362
376
|
|
|
377
|
+
#--
|
|
363
378
|
# // Line; termination line (end; 1/entry)
|
|
379
|
+
#++
|
|
380
|
+
|
|
381
|
+
# modified date. Returns Date object, String or nil.
|
|
382
|
+
def date_modified
|
|
383
|
+
parse_date(self.dt['updated'])
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
# created date. Returns Date object, String or nil.
|
|
387
|
+
def date_created
|
|
388
|
+
parse_date(self.dt['created'])
|
|
389
|
+
end
|
|
390
|
+
|
|
391
|
+
# release number when last updated
|
|
392
|
+
def release_modified
|
|
393
|
+
parse_release_version(self.dt['updated'])[0]
|
|
394
|
+
end
|
|
395
|
+
|
|
396
|
+
# release number when created
|
|
397
|
+
def release_created
|
|
398
|
+
parse_release_version(self.dt['created'])[0]
|
|
399
|
+
end
|
|
364
400
|
|
|
401
|
+
# entry version number numbered by EMBL
|
|
402
|
+
def entry_version
|
|
403
|
+
parse_release_version(self.dt['updated'])[1]
|
|
404
|
+
end
|
|
405
|
+
|
|
406
|
+
# parse date string. Returns Date object.
|
|
407
|
+
def parse_date(str)
|
|
408
|
+
begin
|
|
409
|
+
Date.parse(str)
|
|
410
|
+
rescue ArgumentError, TypeError, NoMethodError, NameError
|
|
411
|
+
str
|
|
412
|
+
end
|
|
413
|
+
end
|
|
414
|
+
private :parse_date
|
|
415
|
+
|
|
416
|
+
# extracts release and version numbers from DT line
|
|
417
|
+
def parse_release_version(str)
|
|
418
|
+
return [ nil, nil ] unless str
|
|
419
|
+
a = str.split(/[\(\,\)]/)
|
|
420
|
+
dstr = a.shift
|
|
421
|
+
rel = nil
|
|
422
|
+
ver = nil
|
|
423
|
+
a.each do |x|
|
|
424
|
+
case x
|
|
425
|
+
when /Rel\.\s*(.+)/
|
|
426
|
+
rel = $1.strip
|
|
427
|
+
when /Version\s*(.+)/
|
|
428
|
+
ver = $1.strip
|
|
429
|
+
end
|
|
430
|
+
end
|
|
431
|
+
[ rel, ver ]
|
|
432
|
+
end
|
|
433
|
+
private :parse_release_version
|
|
434
|
+
|
|
435
|
+
# database references (DR).
|
|
436
|
+
# Returns an array of Bio::Sequence::DBLink objects.
|
|
437
|
+
def dblinks
|
|
438
|
+
get('DR').split(/\n/).collect { |x|
|
|
439
|
+
Bio::Sequence::DBLink.parse_embl_DR_line(x)
|
|
440
|
+
}
|
|
441
|
+
end
|
|
442
|
+
|
|
443
|
+
# species
|
|
444
|
+
def species
|
|
445
|
+
self.fetch('OS')
|
|
446
|
+
end
|
|
447
|
+
|
|
448
|
+
# taxonomy classification
|
|
449
|
+
alias classification oc
|
|
450
|
+
|
|
451
|
+
# features
|
|
452
|
+
alias features ft
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
# converts the entry to Bio::Sequence object
|
|
456
|
+
# ---
|
|
457
|
+
# *Arguments*::
|
|
458
|
+
# *Returns*:: Bio::Sequence object
|
|
459
|
+
def to_biosequence
|
|
460
|
+
Bio::Sequence.adapter(self, Bio::Sequence::Adapter::EMBL)
|
|
461
|
+
end
|
|
365
462
|
|
|
366
463
|
### private methods
|
|
367
464
|
|
|
@@ -400,3 +497,4 @@ class EMBL < EMBLDB
|
|
|
400
497
|
end # class EMBL
|
|
401
498
|
|
|
402
499
|
end # module Bio
|
|
500
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/embl/embl_to_biosequence.rb - Bio::EMBL to Bio::Sequence adapter module
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
# $Id:$
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require 'bio/sequence'
|
|
12
|
+
require 'bio/sequence/adapter'
|
|
13
|
+
|
|
14
|
+
# Internal use only. Normal users should not use this module.
|
|
15
|
+
#
|
|
16
|
+
# Bio::EMBL to Bio::Sequence adapter module.
|
|
17
|
+
# It is internally used in Bio::EMBL#to_biosequence.
|
|
18
|
+
#
|
|
19
|
+
module Bio::Sequence::Adapter::EMBL
|
|
20
|
+
|
|
21
|
+
extend Bio::Sequence::Adapter
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
def_biosequence_adapter :seq
|
|
26
|
+
|
|
27
|
+
def_biosequence_adapter :id_namespace do |orig|
|
|
28
|
+
'EMBL'
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def_biosequence_adapter :entry_id
|
|
32
|
+
|
|
33
|
+
def_biosequence_adapter :primary_accession do |orig|
|
|
34
|
+
orig.accessions[0]
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def_biosequence_adapter :secondary_accessions do |orig|
|
|
38
|
+
orig.accessions[1..-1] || []
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def_biosequence_adapter :molecule_type
|
|
42
|
+
|
|
43
|
+
def_biosequence_adapter :data_class
|
|
44
|
+
|
|
45
|
+
def_biosequence_adapter :definition, :description
|
|
46
|
+
|
|
47
|
+
def_biosequence_adapter :topology
|
|
48
|
+
|
|
49
|
+
def_biosequence_adapter :date_created
|
|
50
|
+
|
|
51
|
+
def_biosequence_adapter :date_modified
|
|
52
|
+
|
|
53
|
+
def_biosequence_adapter :release_created
|
|
54
|
+
|
|
55
|
+
def_biosequence_adapter :release_modified
|
|
56
|
+
|
|
57
|
+
def_biosequence_adapter :entry_version
|
|
58
|
+
|
|
59
|
+
def_biosequence_adapter :division
|
|
60
|
+
|
|
61
|
+
def_biosequence_adapter :sequence_version, :version
|
|
62
|
+
|
|
63
|
+
def_biosequence_adapter :keywords
|
|
64
|
+
|
|
65
|
+
def_biosequence_adapter :species
|
|
66
|
+
|
|
67
|
+
def_biosequence_adapter :classification
|
|
68
|
+
|
|
69
|
+
#--
|
|
70
|
+
# unsupported yet
|
|
71
|
+
# def_biosequence_adapter :organelle do |orig|
|
|
72
|
+
# orig.fetch('OG')
|
|
73
|
+
# end
|
|
74
|
+
#++
|
|
75
|
+
|
|
76
|
+
def_biosequence_adapter :references
|
|
77
|
+
|
|
78
|
+
def_biosequence_adapter :features
|
|
79
|
+
|
|
80
|
+
def_biosequence_adapter :comments, :cc
|
|
81
|
+
|
|
82
|
+
def_biosequence_adapter :dblinks
|
|
83
|
+
|
|
84
|
+
end #module Bio::Sequence::Adapter::EMBL
|
|
85
|
+
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/embl/format_embl.rb - EMBL format generator
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2008
|
|
5
|
+
# Jan Aerts <jandot@bioruby.org>,
|
|
6
|
+
# Naohisa Goto <ng@bioruby.org>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
# $Id: format_embl.rb,v 1.1.2.7 2008/06/19 12:45:15 ngoto Exp $
|
|
10
|
+
#
|
|
11
|
+
|
|
12
|
+
require 'bio/sequence/format'
|
|
13
|
+
|
|
14
|
+
module Bio::Sequence::Format::NucFormatter
|
|
15
|
+
|
|
16
|
+
# INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
|
|
17
|
+
# Embl format output class for Bio::Sequence.
|
|
18
|
+
class Embl < Bio::Sequence::Format::FormatterBase
|
|
19
|
+
|
|
20
|
+
# helper methods
|
|
21
|
+
include Bio::Sequence::Format::INSDFeatureHelper
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
# wrapping with EMBL style
|
|
26
|
+
def embl_wrap(prefix, str)
|
|
27
|
+
wrap(str.to_s, 80, prefix)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Wraps the given words (an Array of String) in EMBL style.
|
|
31
|
+
# A word is never split across lines.
|
|
32
|
+
def embl_wrap_words(prefix, array)
|
|
33
|
+
width = 80
|
|
34
|
+
result = []
|
|
35
|
+
str = nil
|
|
36
|
+
array.each do |x|
|
|
37
|
+
if str then
|
|
38
|
+
if str.length + 1 + x.length > width then
|
|
39
|
+
str = nil
|
|
40
|
+
else
|
|
41
|
+
str.concat ' '
|
|
42
|
+
str.concat x
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
unless str then
|
|
46
|
+
str = prefix + x
|
|
47
|
+
result.push str
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
result.join("\n")
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# format reference
|
|
54
|
+
# ref:: Bio::Reference object
|
|
55
|
+
# hash:: (optional) a hash for RN (reference number) administration
|
|
56
|
+
def reference_format_embl(ref, hash = nil)
|
|
57
|
+
lines = Array.new
|
|
58
|
+
if ref.embl_gb_record_number or hash then
|
|
59
|
+
refno = ref.embl_gb_record_number.to_i
|
|
60
|
+
hash ||= {}
|
|
61
|
+
if refno <= 0 or hash[refno] then
|
|
62
|
+
refno = hash.keys.sort[-1].to_i + 1
|
|
63
|
+
hash[refno] = true
|
|
64
|
+
end
|
|
65
|
+
lines << embl_wrap("RN ", "[#{refno}]")
|
|
66
|
+
end
|
|
67
|
+
if ref.comments then
|
|
68
|
+
ref.comments.each do |cmnt|
|
|
69
|
+
lines << embl_wrap("RC ", cmnt)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
unless ref.sequence_position.to_s.empty? then
|
|
73
|
+
lines << embl_wrap("RP ", "#{ref.sequence_position}")
|
|
74
|
+
end
|
|
75
|
+
unless ref.doi.to_s.empty? then
|
|
76
|
+
lines << embl_wrap("RX ", "DOI; #{ref.doi}.")
|
|
77
|
+
end
|
|
78
|
+
unless ref.pubmed.to_s.empty? then
|
|
79
|
+
lines << embl_wrap("RX ", "PUBMED; #{ref.pubmed}.")
|
|
80
|
+
end
|
|
81
|
+
unless ref.authors.empty? then
|
|
82
|
+
auth = ref.authors.collect do |x|
|
|
83
|
+
y = x.to_s.strip.split(/\, *([^\,]+)\z/)
|
|
84
|
+
y[1].gsub!(/\. +/, '.') if y[1]
|
|
85
|
+
y.join(' ')
|
|
86
|
+
end
|
|
87
|
+
lastauth = auth.pop
|
|
88
|
+
auth.each { |x| x.concat ',' }
|
|
89
|
+
auth.push(lastauth.to_s + ';')
|
|
90
|
+
lines << embl_wrap_words('RA ', auth)
|
|
91
|
+
end
|
|
92
|
+
lines << embl_wrap('RT ',
|
|
93
|
+
(ref.title.to_s.empty? ? '' :
|
|
94
|
+
"\"#{ref.title}\"") + ';')
|
|
95
|
+
unless ref.journal.to_s.empty? then
|
|
96
|
+
volissue = "#{ref.volume.to_s}"
|
|
97
|
+
volissue = "#{volissue}(#{ref.issue})" unless ref.issue.to_s.empty?
|
|
98
|
+
rl = "#{ref.journal}"
|
|
99
|
+
rl += " #{volissue}" unless volissue.empty?
|
|
100
|
+
rl += ":#{ref.pages}" unless ref.pages.to_s.empty?
|
|
101
|
+
rl += "(#{ref.year})" unless ref.year.to_s.empty?
|
|
102
|
+
rl += '.'
|
|
103
|
+
lines << embl_wrap('RL ', rl)
|
|
104
|
+
end
|
|
105
|
+
lines << "XX"
|
|
106
|
+
return lines.join("\n")
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def seq_format_embl(seq)
|
|
110
|
+
counter = 0
|
|
111
|
+
result = seq.gsub(/.{1,60}/) do |x|
|
|
112
|
+
counter += x.length
|
|
113
|
+
x = x.gsub(/.{10}/, '\0 ')
|
|
114
|
+
sprintf(" %-66s%9d\n", x, counter)
|
|
115
|
+
end
|
|
116
|
+
result.chomp!
|
|
117
|
+
result
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def seq_composition(seq)
|
|
121
|
+
{ :a => seq.count('aA'),
|
|
122
|
+
:c => seq.count('cC'),
|
|
123
|
+
:g => seq.count('gG'),
|
|
124
|
+
:t => seq.count('tTuU'),
|
|
125
|
+
:other => seq.count('^aAcCgGtTuU')
|
|
126
|
+
}
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# molecule type
|
|
130
|
+
def mol_type_embl
|
|
131
|
+
if mt = molecule_type then
|
|
132
|
+
mt
|
|
133
|
+
elsif f = (features or []).find { |f| f.feature == 'source' } and
|
|
134
|
+
q = f.qualifiers.find { |q| q.qualifier == 'mol_type' } then
|
|
135
|
+
q.value
|
|
136
|
+
else
|
|
137
|
+
'NA'
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# CC line. Comments.
|
|
142
|
+
def comments_format_embl(cmnts)
|
|
143
|
+
return '' if !cmnts or cmnts.empty?
|
|
144
|
+
cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
|
|
145
|
+
a = []
|
|
146
|
+
cmnts.each do |str|
|
|
147
|
+
a.push embl_wrap('CC ', str)
|
|
148
|
+
end
|
|
149
|
+
unless a.empty? then
|
|
150
|
+
a.push "XX "
|
|
151
|
+
a.push '' # dummy to put "\n" at the end of the string
|
|
152
|
+
end
|
|
153
|
+
a.join("\n")
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Erb template of EMBL format for Bio::Sequence
|
|
158
|
+
erb_template <<'__END_OF_TEMPLATE__'
|
|
159
|
+
ID <%= primary_accession || entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= mol_type_embl %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
|
|
160
|
+
XX
|
|
161
|
+
<%= embl_wrap('AC ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
|
|
162
|
+
XX
|
|
163
|
+
DT <%= format_date(date_created || null_date) %> (Rel. <%= release_created || 0 %>, Created)
|
|
164
|
+
DT <%= format_date(date_modified || null_date) %> (Rel. <%= release_modified || 0 %>, Last updated, Version <%= entry_version || 0 %>)
|
|
165
|
+
XX
|
|
166
|
+
<%= embl_wrap('DE ', definition) %>
|
|
167
|
+
XX
|
|
168
|
+
<%= embl_wrap('KW ', (keywords || []).join('; ') + '.') %>
|
|
169
|
+
XX
|
|
170
|
+
OS <%= species %>
|
|
171
|
+
<%= embl_wrap('OC ', (classification || []).join('; ') + '.') %>
|
|
172
|
+
XX
|
|
173
|
+
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
|
|
174
|
+
<% end %><% (dblinks || []).each do |r|
|
|
175
|
+
%>DR <%= r.database %>; <%= r.id %><% unless r.secondary_ids.empty? %>; <%= r.secondary_ids[0] %><% end %>.
|
|
176
|
+
<% end %><% if dblinks and !dblinks.empty? then
|
|
177
|
+
%>XX
|
|
178
|
+
<% end %><%= comments_format_embl(comments)
|
|
179
|
+
%>FH Key Location/Qualifiers
|
|
180
|
+
FH
|
|
181
|
+
<%= format_features_embl(features || []) %>XX
|
|
182
|
+
SQ Sequence <%= seq.length %> BP; <% c = seq_composition(seq) %><%= c[:a] %> A; <%= c[:c] %> C; <%= c[:g] %> G; <%= c[:t] %> T; <%= c[:other] %> other;
|
|
183
|
+
<%= seq_format_embl(seq) %>
|
|
184
|
+
//
|
|
185
|
+
__END_OF_TEMPLATE__
|
|
186
|
+
|
|
187
|
+
end #class Embl
|
|
188
|
+
|
|
189
|
+
end #module Bio::Sequence::Format::NucFormatter
|
|
190
|
+
|