bio-gff3 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README +65 -0
- data/README.rdoc +19 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/gff3-fetch +99 -0
- data/bio-gff3.gemspec +101 -0
- data/lib/bio-gff3.rb +0 -0
- data/lib/bio/db/gff/gffassemble.rb +300 -0
- data/lib/bio/db/gff/gffdb.rb +40 -0
- data/lib/bio/db/gff/gfffasta.rb +68 -0
- data/lib/bio/db/gff/gfffileiterator.rb +77 -0
- data/lib/bio/db/gff/gffinmemory.rb +63 -0
- data/lib/bio/db/gff/gffnocache.rb +124 -0
- data/lib/bio/db/gff/gffparser.rb +154 -0
- data/lib/bio/system/lruhash.rb +268 -0
- data/spec/gff3_assemble2_spec.rb +73 -0
- data/spec/gff3_assemble3_spec.rb +62 -0
- data/spec/gff3_assemble_spec.rb +291 -0
- data/spec/gff3_fileiterator_spec.rb +43 -0
- data/spec/gffdb_spec.rb +99 -0
- data/test/data/gff/MhA1_Contig1133.fa +2 -0
- data/test/data/gff/MhA1_Contig1133.gff3 +1862 -0
- data/test/data/gff/MhA1_Contig125.fa +673 -0
- data/test/data/gff/MhA1_Contig125.gff3 +2177 -0
- data/test/data/gff/standard.gff3 +25 -0
- data/test/data/gff/test-cds.gff3 +98 -0
- data/test/data/gff/test-ext-fasta.fa +16 -0
- data/test/data/gff/test-ext-fasta.gff3 +57 -0
- data/test/data/gff/test.gff3 +74 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-gff3.rb +7 -0
- metadata +180 -0
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Bundler manifest for bio-gff3.
source "https://rubygems.org"

# Runtime dependency: the library code uses the Bio namespace
# (Bio::FastaFormat, Bio::Sequence::NA), so BioRuby must be available
# whenever the gem is used, not only during development.
gem "bio", ">= 1.4.1"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.2"
  gem "rcov", ">= 0"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
bio (1.4.1)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.5.2)
|
7
|
+
bundler (~> 1.0.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rake (0.8.7)
|
11
|
+
rcov (0.9.9)
|
12
|
+
shoulda (2.11.3)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
bio (>= 1.4.1)
|
19
|
+
bundler (~> 1.0.0)
|
20
|
+
jeweler (~> 1.5.2)
|
21
|
+
rcov
|
22
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Pjotr Prins
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
= GFF3 plugin for BioRuby, aimed at parsing big data =
|
2
|
+
|
3
|
+
Features:
|
4
|
+
|
5
|
+
# Take GFF (genome browser) information and digest mRNA and CDS sequences
|
6
|
+
# Options for low memory use and caching of records
|
7
|
+
# Support for external FASTA files
|
8
|
+
|
9
|
+
You can use this plugin in two ways. First as a standalone program, next as a
|
10
|
+
plugin library to BioRuby.
|
11
|
+
|
12
|
+
For example, fetch mRNA and CDS information from GFF3 files and output to FASTA:
|
13
|
+
|
14
|
+
./bin/gff3-fetch mrna test/data/gff/test.gff3
|
15
|
+
./bin/gff3-fetch cds test/data/gff/test.gff3
|
16
|
+
|
17
|
+
Or clone this repository and add the 'lib' dir to the Ruby search path and
|
18
|
+
|
19
|
+
require 'bio/db/gff/gffdb'
|
20
|
+
|
21
|
+
You can also run RSpec with something like
|
22
|
+
|
23
|
+
ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gffdb_spec.rb
|
24
|
+
|
25
|
+
This implementation depends on BioRuby's basic GFF3 parser, with the possible
|
26
|
+
advantage that the plugin is faster and does not consume all memory. The Gff3
|
27
|
+
specs are based on the output of the Wormbase genome browser.
|
28
|
+
|
29
|
+
For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
|
30
|
+
|
31
|
+
Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
32
|
+
|
33
|
+
-------------------------------------------------------------------------------
|
34
|
+
|
35
|
+
Usage:
|
36
|
+
|
37
|
+
BioRuby GFF3 Plugin Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
38
|
+
|
39
|
+
Fetch and assemble mRNAs, or CDS and print in FASTA format.
|
40
|
+
|
41
|
+
gff3-fetch [--no-cache] mRNA|CDS [filename.fa] filename.gff
|
42
|
+
|
43
|
+
Where:
|
44
|
+
|
45
|
+
--no-cache : do not load everything in memory
|
46
|
+
mRNA : assemble mRNA
|
47
|
+
CDS : assemble CDS
|
48
|
+
|
49
|
+
Multiple GFF3 files can be used. For external FASTA files, always the last
|
50
|
+
one before the GFF file is used.
|
51
|
+
|
52
|
+
Examples:
|
53
|
+
|
54
|
+
Find mRNA and CDS information from test.gff3 (which includes sequence information)
|
55
|
+
|
56
|
+
./bin/gff3-fetch mRNA test/data/gff/test.gff3
|
57
|
+
./bin/gff3-fetch CDS test/data/gff/test.gff3
|
58
|
+
|
59
|
+
Find mRNA from external FASTA file, without loading everything in RAM
|
60
|
+
|
61
|
+
./bin/gff3-fetch --no-cache mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
62
|
+
|
63
|
+
If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475
|
64
|
+
|
65
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= bio-gff3
|
2
|
+
|
3
|
+
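BioRuby GFF3 plugin for big data: takes GFF3 (genome browser) information and digests mRNA and CDS sequences, with options for low memory use, caching of records, and support for external FASTA files.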
|
4
|
+
|
5
|
+
== Contributing to bio-gff3
|
6
|
+
|
7
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
8
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
9
|
+
* Fork the project
|
10
|
+
* Start a feature/bugfix branch
|
11
|
+
* Commit and push until you are happy with your contribution
|
12
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2010 Pjotr Prins. See LICENSE.txt for
|
18
|
+
further details.
|
19
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; on failure print the
# Bundler error plus a hint and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem packaging tasks (jeweler generates the gemspec from this block).
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-gff3"
  gem.homepage = "http://github.com/pjotrp/bioruby-gff3"
  gem.license = "MIT"
  gem.summary = %Q{BioRuby GFF3 plugin for big data}
  gem.description = %Q{GFF3 (genome browser) information and digest mRNA and CDS sequences.
Options for low memory use and caching of records.
Support for external FASTA files.
}
  gem.email = "pjotr.prins@thebird.nl"
  gem.authors = ["Pjotr Prins"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` runs the same test files under rcov.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

# `rake rdoc` builds API documentation from the READMEs and lib/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip so a trailing newline in VERSION does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-gff3 #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.6.0
|
data/bin/gff3-fetch
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
#
|
3
|
+
# Author:: Pjotr Prins
|
4
|
+
# Copyright:: August 2010
|
5
|
+
# License:: Ruby License
|
6
|
+
#
|
7
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
8
|
+
|
9
|
+
|
10
|
+
# Help text printed when the program is started without arguments.
USAGE = <<EOM

  Fetch and assemble mRNAs, or CDS and print in FASTA format.

    gff3-fetch [--no-cache] mRNA|CDS [filename.fa] filename.gff

  Where:

    --no-cache      : do not load everything in memory
    mRNA            : assemble mRNA
    CDS             : assemble CDS

  Multiple GFF3 files can be used. For external FASTA files, always the last
  one before the GFF file is used.

  Examples:

    Find mRNA and CDS information from test.gff3 (which includes sequence information)

      ./bin/gff3-fetch mRNA test/data/gff/test.gff3
      ./bin/gff3-fetch CDS test/data/gff/test.gff3

    Find CDS from external FASTA file

      ./bin/gff3-fetch cds test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3

    Find mRNA from external FASTA file, without loading everything in RAM

      ./bin/gff3-fetch --no-cache mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3

  If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475


EOM

# Put this checkout's lib/ (and a sibling bioruby checkout, if any) on the
# load path so the program runs straight from the source tree.
rootpath = File.dirname(File.dirname(__FILE__))
$: << rootpath+'/lib'
$: << rootpath+'/../bioruby/lib'

require 'bio/db/gff/gffdb'

$stderr.print "BioRuby GFF3 Plugin Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"

# Without arguments there is nothing to do: show the help text and stop,
# instead of falling through to the "Unknown GFF type" error below.
if ARGV.empty?
  print USAGE
  exit 1
end

# First argument is the feature type, optionally preceded by --no-cache.
gfftype = ARGV.shift
caching = true
if gfftype == "--no-cache"
  caching = false
  gfftype = ARGV.shift
end
# Anchored so that only the exact (case-insensitive) type names are accepted.
raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /\A(mrna|cds|exon)\z/i

# Assembler iterator to call for each supported feature type.
iterators = {
  'mrna' => :each_mRNA_seq,
  'exon' => :each_exon_seq,
  'cds'  => :each_CDS_seq
}
iterator = iterators[gfftype.downcase]
raise "Unknown action <#{gfftype}>" unless iterator

fastafn = nil

# Remaining arguments are files. A FASTA file is remembered and applied to
# the GFF files that follow it; every other file is processed as GFF3.
ARGV.each do | fn |
  # extension must start with ".fa" (.fa, .fas, .fasta, ...), not merely contain "fa"
  if File.extname(fn) =~ /\A\.fa/i
    fastafn = fn
    next
  end
  options = {:validate => false}
  options = {:validate => false, :cache_components => :cache_none, :cache_records => :cache_none} unless caching
  options[:fasta_filename] = fastafn if fastafn

  gffdb = Bio::GFFbrowser::GFFdb.new(fn,options)
  gff = gffdb.assembler
  # Print every assembled sequence as a FASTA record on stdout.
  gff.send(iterator) do | id, seq |
    puts ">"+id
    puts seq
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
|
data/bio-gff3.gemspec
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{bio-gff3}
|
8
|
+
s.version = "0.6.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = %q{2010-12-29}
|
13
|
+
s.default_executable = %q{gff3-fetch}
|
14
|
+
s.description = %q{GFF3 (genome browser) information and digest mRNA and CDS sequences.
|
15
|
+
Options for low memory use and caching of records.
|
16
|
+
Support for external FASTA files.
|
17
|
+
}
|
18
|
+
s.email = %q{pjotr.prins@thebird.nl}
|
19
|
+
s.executables = ["gff3-fetch"]
|
20
|
+
s.extra_rdoc_files = [
|
21
|
+
"LICENSE.txt",
|
22
|
+
"README",
|
23
|
+
"README.rdoc"
|
24
|
+
]
|
25
|
+
s.files = [
|
26
|
+
"Gemfile",
|
27
|
+
"Gemfile.lock",
|
28
|
+
"LICENSE.txt",
|
29
|
+
"README",
|
30
|
+
"README.rdoc",
|
31
|
+
"Rakefile",
|
32
|
+
"VERSION",
|
33
|
+
"bin/gff3-fetch",
|
34
|
+
"bio-gff3.gemspec",
|
35
|
+
"lib/bio-gff3.rb",
|
36
|
+
"lib/bio/db/gff/gffassemble.rb",
|
37
|
+
"lib/bio/db/gff/gffdb.rb",
|
38
|
+
"lib/bio/db/gff/gfffasta.rb",
|
39
|
+
"lib/bio/db/gff/gfffileiterator.rb",
|
40
|
+
"lib/bio/db/gff/gffinmemory.rb",
|
41
|
+
"lib/bio/db/gff/gffnocache.rb",
|
42
|
+
"lib/bio/db/gff/gffparser.rb",
|
43
|
+
"lib/bio/system/lruhash.rb",
|
44
|
+
"spec/gff3_assemble2_spec.rb",
|
45
|
+
"spec/gff3_assemble3_spec.rb",
|
46
|
+
"spec/gff3_assemble_spec.rb",
|
47
|
+
"spec/gff3_fileiterator_spec.rb",
|
48
|
+
"spec/gffdb_spec.rb",
|
49
|
+
"test/data/gff/MhA1_Contig1133.fa",
|
50
|
+
"test/data/gff/MhA1_Contig1133.gff3",
|
51
|
+
"test/data/gff/MhA1_Contig125.fa",
|
52
|
+
"test/data/gff/MhA1_Contig125.gff3",
|
53
|
+
"test/data/gff/standard.gff3",
|
54
|
+
"test/data/gff/test-cds.gff3",
|
55
|
+
"test/data/gff/test-ext-fasta.fa",
|
56
|
+
"test/data/gff/test-ext-fasta.gff3",
|
57
|
+
"test/data/gff/test.gff3",
|
58
|
+
"test/helper.rb",
|
59
|
+
"test/test_bio-gff3.rb"
|
60
|
+
]
|
61
|
+
s.homepage = %q{http://github.com/pjotrp/bioruby-gff3}
|
62
|
+
s.licenses = ["MIT"]
|
63
|
+
s.require_paths = ["lib"]
|
64
|
+
s.rubygems_version = %q{1.3.7}
|
65
|
+
s.summary = %q{BioRuby GFF3 plugin for big data}
|
66
|
+
s.test_files = [
|
67
|
+
"spec/gff3_assemble2_spec.rb",
|
68
|
+
"spec/gff3_assemble3_spec.rb",
|
69
|
+
"spec/gff3_assemble_spec.rb",
|
70
|
+
"spec/gff3_fileiterator_spec.rb",
|
71
|
+
"spec/gffdb_spec.rb",
|
72
|
+
"test/helper.rb",
|
73
|
+
"test/test_bio-gff3.rb"
|
74
|
+
]
|
75
|
+
|
76
|
+
if s.respond_to? :specification_version then
|
77
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
78
|
+
s.specification_version = 3
|
79
|
+
|
80
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
81
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
82
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
83
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
84
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
85
|
+
s.add_development_dependency(%q<bio>, [">= 1.4.1"])
|
86
|
+
else
|
87
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
88
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
89
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
90
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
91
|
+
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
92
|
+
end
|
93
|
+
else
|
94
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
95
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
96
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
97
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
98
|
+
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
data/lib/bio-gff3.rb
ADDED
File without changes
|
@@ -0,0 +1,300 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffassemble.rb - Assemble mRNA and CDS from GFF
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file
|
9
|
+
|
10
|
+
module Bio
|
11
|
+
module GFFbrowser
|
12
|
+
|
13
|
+
module Helpers
|
14
|
+
|
15
|
+
# Diagnostic helpers for mixing into the GFF3 helper classes. Both methods
# write a single line to stderr, ending in the id between angle brackets.
module Error
  # Write "Info: <str> <<id>>" to $stderr. Interpolation is used so that
  # non-String messages (symbols, nil, numbers) do not raise.
  def info str, id=''
    $stderr.print "Info: #{str} <#{id}>\n"
  end

  # Emit "Warning: <str> <<id>>" through Kernel.warn. Note this method
  # shadows Kernel#warn inside any class that includes this module.
  def warn str, id=''
    Kernel.warn "Warning: #{str} <#{id}>"
  end
end
|
24
|
+
|
25
|
+
# Helper class for counting IDs: a Hash mapping each id to the number of
# times it has been added.
class Counter < Hash
  # Increment the count stored for +id+ (an unseen id starts at zero) and
  # return the new count.
  def add id
    self[id] = (self[id] || 0) + 1
  end
end
|
32
|
+
|
33
|
+
# Helper class for storing linked records based on a shared ID: a Hash that
# maps each ID to the list of records added under it.
class LinkedRecs < Hash
  include Error

  # Append +rec+ to the list stored under +id+, creating the list on first
  # use. Every addition is reported through Error#info. Returns the list.
  def add id, rec
    info "Adding #{rec.feature_type} <#{id}>"
    (self[id] ||= []) << rec
  end

  # Validate all lists belong to the same container/component: every record
  # in a list must have the seqname of the first record of that list.
  # Raises RuntimeError on the first mismatch.
  def validate_seqname
    each do | _id, recs |
      seqname = recs.first.seqname
      recs.each do | section |
        raise "Non-matching seqname #{section.seqname} in #{seqname}" if section.seqname != seqname
      end
    end
  end

  # Validate all lists share the same parent (if available). First checks
  # for Parent attribute, next for mRNA attribute. An attribute is only
  # checked when the first record of the list carries it. Raises
  # RuntimeError on the first mismatch.
  def validate_shared_parent
    each do | id, recs |
      %w{Parent mRNA}.each do | attribute |
        expected = recs.first.get_attribute(attribute)
        next unless expected
        recs.each do | section |
          actual = section.get_attribute(attribute)
          raise "Non-matching parent #{actual} and #{expected} in #{id}" if actual != expected
        end
      end
    end
  end

  # walk all (CDS) lists for every container/component and
  # validate they do not overlap. Sections are sorted first, so only
  # adjacent pairs are compared; an overlap produces a warning, not an error.
  def validate_nonoverlapping
    each do | id, recs |
      Sections::sort(recs).each_cons(2) do | check, neighbour |
        warn "Overlapping sections for ",id if check.intersection(neighbour)
      end
    end
  end
end
|
87
|
+
|
88
|
+
# An inclusive Range running from a record's start to its end position that
# also keeps a reference to the record it was built from.
class Section < Range
  # The record this section was created from
  attr_reader :rec

  # Build the range rec.start..rec.end and remember +rec+.
  def initialize rec
    super(rec.start,rec.end)
    @rec = rec
  end

  # Return the overlap between this section and +other+ as a plain Range,
  # or nil when the two do not overlap. Also available as +&+.
  # Raises ArgumentError when +other+ is not a Range.
  def intersection(other)
    raise ArgumentError, 'value must be a Range' unless other.kind_of?(Range)
    lower = first
    # self.max is spelled out on purpose: a bare `max` on the right-hand
    # side of an assignment to a local of the same name reads that (still
    # nil) local instead of calling Range#max.
    upper = exclude_end? ? self.max : last
    other_lower = other.first
    other_upper = other.exclude_end? ? other.max : other.last
    new_min = if self === other_lower
                other_lower
              elsif other === lower
                lower
              end
    new_max = if self === other_upper
                other_upper
              elsif other === upper
                upper
              end
    new_min && new_max ? new_min..new_max : nil
  end
  alias_method :&, :intersection

  # Order sections by their start position (used when sorting).
  def <=> other
    first <=> other.first
  end
end
|
107
|
+
|
108
|
+
module Sections
  # Return list of sorted Sections: every record in +rec+ is wrapped in a
  # Section and the result is sorted with Section#<=>.
  def Sections::sort rec
    rec.map { | record | Section.new(record) }.sort
  end
end
|
118
|
+
|
119
|
+
module Record
  include Error
  # Build an identifier for +rec+. The record's own id wins when it has
  # one; otherwise the seqname combined with the start/stop positions is
  # used. When neither is available a note is written to stderr and
  # 'unknown' is returned.
  def Record::formatID rec
    return rec.id if rec.id
    return "#{rec.seqname} #{rec.start} #{rec.end}".strip if rec.seqname

    $stderr.print "Record with unknown ID"+rec.to_s
    'unknown'
  end
end
|
136
|
+
|
137
|
+
module Gff3Component

  include Error

  # Feature types that are treated as components/containers
  COMPONENT_TYPES = %w{
    gene SO:0000704 contig transcript Component region
  }

  # Look up the component/container a record belongs to in @componentlist.
  # Four strategies are tried in order, each reported through Error#info:
  #
  # 1. the record's Parent attribute used as key
  # 2. the record's seqname used as key
  # 3. keys of the form "seqname start stop" that enclose the record
  # 4. components whose own seqname and start/end enclose the record
  #
  # When nothing matches a warning is emitted and nil is returned.
  def find_component rec
    parent_id = rec.get_attribute('Parent')
    by_parent = @componentlist[parent_id]
    if by_parent
      # nice, there is a match
      info "find_component: Matched parent", parent_id
      return by_parent
    end

    seqname = rec.seqname
    by_seqname = @componentlist[seqname]
    if by_seqname
      info "find_component: Matched seqname", seqname
      return by_seqname
    end

    @componentlist.each do | key, candidate |
      # take the key apart into its name and position parts
      name, from, to = key.split(/ /)
      next unless name == seqname
      next unless rec.start >= from.to_i
      next unless rec.end <= to.to_i
      info "find_component: Matched column 0 and location", key
      return candidate
    end

    # Still nothing: probably the record has no parent ID while the
    # component is stored under its own ID. Compare against the seqname
    # and location of every component individually.
    @componentlist.each do | key, candidate |
      next unless candidate.seqname == seqname
      next unless rec.start >= candidate.start
      next unless rec.end <= candidate.end
      info "find_component: Matched (long search) column 0 and location", key
      return candidate
    end

    warn "Could not find container/component for",Record::formatID(rec)
  end
end
|
182
|
+
|
183
|
+
module Gff3Features

  # Ignore the following features (case sensitive?): all component types
  # plus the feature names and SO accessions listed here. Each entry must
  # be separated by whitespace; "intron" and its accession "SO:0000188"
  # were previously fused into one token, so neither was in the list.
  IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
    transposon Match similarity UTR
    TF_binding_site intron SO:0000188 polyA_sequence SO:0000610
    polyA_site SO:0000553
    five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
    exon SO:0000147
  }
end
|
194
|
+
|
195
|
+
module Gff3Sequence
  # Patch a sequence together from a Sequence string and an array
  # of records. Note that rec positions are 1-based coordinates, relative
  # to the landmark given in column 1 - in this case the sequence as it
  # is passed in. +sequence+ may also be a Bio::FastaFormat, in which case
  # its +seq+ is used. +startpos+ is accepted but not used here.
  #
  # The following options are available (the value between brackets is the
  # default that applies when no options hash is passed at all):
  #
  #   :phase           : set phase (true)
  #   :reverse         : do reverse if reverse is indicated (true)
  #   :complement      : do complement if reverse is indicated (false)
  #   :trim            : make sure sequence is multiple of 3 nucleotide bps (false)
  #
  # When an options hash is passed it replaces the defaults as a whole, so
  # any option left out counts as false.
  #
  # there are two special options:
  #
  #   :raw             : raw sequence (all above false)
  #   :codonize        : codon sequence (all above true)
  #
  # Returns the assembled sequence as a String.
  def assemble sequence, startpos, reclist, options = { :phase=>true, :reverse=>true, :trim=>false, :complement=>false }
    do_phase      = options[:phase]
    do_reverse    = options[:reverse]
    do_trim       = options[:trim]
    do_complement = options[:complement]
    if options[:raw]
      do_phase = do_reverse = do_trim = do_complement = false
    elsif options[:codonize]
      do_phase = do_reverse = do_trim = do_complement = true
    end

    sectionlist = Sections::sort(reclist)
    # we assume strand is always the same
    rec0 = sectionlist.first.rec
    reverse = (rec0.strand == '-')
    # fetch phase from the last feature when reversed
    rec0 = sectionlist.last.rec if reverse
    frame = rec0.frame || 0

    # unwrap a FASTA record once, before slicing
    sequence = sequence.seq if sequence.kind_of?(Bio::FastaFormat)

    # concatenate the (1-based, inclusive) slices in sorted order; appending
    # in place avoids building a new string for every section
    seq = "".dup
    sectionlist.each do | section |
      rec = section.rec
      seq << sequence[(rec.start-1)..(rec.end-1)]
    end

    # if strand is negative, reverse
    seq = seq.reverse if do_reverse && reverse

    if do_phase
      # For forward strand features, phase is counted from the start
      # field. For reverse strand features, phase is counted from the end
      # field.
      #
      # With a reverse protein coding string in Wormbase
      # the phase appears to be disregarded - or rather handled
      # by start-stop. This is a hack.
      unless do_reverse && reverse && (seq.size % 3 == 0)
        seq = seq[frame..-1] if frame != 0 # set phase
      end
    end

    # if strand is negative, forward complement
    if do_complement && reverse
      seq = Bio::Sequence::NA.new(seq).forward_complement.upcase
    end

    if do_trim
      # drop trailing nucleotides that do not complete a codon
      reduce = seq.size % 3
      seq = seq[0..(seq.size-1-reduce)] if reduce != 0
    end
    seq
  end

  # Patch a sequence together from a Sequence string and an array
  # of records and translate in the correct direction and frame.
  # Returns the result of Bio::Sequence::NA#translate.
  def assembleAA sequence, startpos, rec
    seq = assemble(sequence, startpos, rec, :phase=>true, :reverse=>true, :complement=>true)
    Bio::Sequence::NA.new(seq).translate
  end

  # Create a description for output, of the form
  # "<id> Sequence:<seqname>_<start>:<end> (<s1>:<e1>, <s2>:<e2>, ...)"
  # where the bracketed list holds the sorted sections of +rec+.
  def description id, component, rec
    spans = Sections::sort(rec).map { |s| "#{s.first}:#{s.last}" }.join(', ')
    "#{id} Sequence:#{component.seqname}_#{component.start}:#{component.end} (#{spans})"
  end

end
|
297
|
+
end # Helpers
|
298
|
+
|
299
|
+
end
|
300
|
+
end
|