bio-gff3 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README +65 -0
- data/README.rdoc +19 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/gff3-fetch +99 -0
- data/bio-gff3.gemspec +101 -0
- data/lib/bio-gff3.rb +0 -0
- data/lib/bio/db/gff/gffassemble.rb +300 -0
- data/lib/bio/db/gff/gffdb.rb +40 -0
- data/lib/bio/db/gff/gfffasta.rb +68 -0
- data/lib/bio/db/gff/gfffileiterator.rb +77 -0
- data/lib/bio/db/gff/gffinmemory.rb +63 -0
- data/lib/bio/db/gff/gffnocache.rb +124 -0
- data/lib/bio/db/gff/gffparser.rb +154 -0
- data/lib/bio/system/lruhash.rb +268 -0
- data/spec/gff3_assemble2_spec.rb +73 -0
- data/spec/gff3_assemble3_spec.rb +62 -0
- data/spec/gff3_assemble_spec.rb +291 -0
- data/spec/gff3_fileiterator_spec.rb +43 -0
- data/spec/gffdb_spec.rb +99 -0
- data/test/data/gff/MhA1_Contig1133.fa +2 -0
- data/test/data/gff/MhA1_Contig1133.gff3 +1862 -0
- data/test/data/gff/MhA1_Contig125.fa +673 -0
- data/test/data/gff/MhA1_Contig125.gff3 +2177 -0
- data/test/data/gff/standard.gff3 +25 -0
- data/test/data/gff/test-cds.gff3 +98 -0
- data/test/data/gff/test-ext-fasta.fa +16 -0
- data/test/data/gff/test-ext-fasta.gff3 +57 -0
- data/test/data/gff/test.gff3 +74 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-gff3.rb +7 -0
- metadata +180 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffdb.rb - GFF database class
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
# Create db from a GFF file
|
9
|
+
|
10
|
+
require 'bio'
|
11
|
+
require 'bio/db/gff/gfffileiterator'
|
12
|
+
require 'bio/db/gff/gfffasta'
|
13
|
+
require 'bio/db/gff/gffassemble'
|
14
|
+
require 'bio/db/gff/gffparser'
|
15
|
+
require 'bio/db/gff/gffinmemory'
|
16
|
+
require 'bio/db/gff/gffnocache'
|
17
|
+
|
18
|
+
module Bio
|
19
|
+
module GFFbrowser
|
20
|
+
# Entry point that picks the record store used to digest a GFF file and
# exposes it through #assembler.
class GFFdb
  include Digest

  attr_reader :assembler

  # filename:: path of the GFF file; passed on to the selected backend
  # options::  Hash passed on unchanged. When options[:cache_records] is
  #            :cache_none the NoCache backend is used; any other value
  #            (including none at all) selects InMemory, the default.
  def initialize(filename, options = {})
    backend = :cache_none == options[:cache_records] ? NoCache : InMemory
    @assembler = backend.new(filename, options)
  end
end # GFFdb
|
38
|
+
end # GFFbrowser
|
39
|
+
end # Bio
|
40
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# = bio/db/gff/gfffasta.rb - Fetch records from a file in FASTA format
|
2
|
+
#
|
3
|
+
# Copyright:: Copyright (C) 2010
|
4
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
5
|
+
# License:: The Ruby License
|
6
|
+
#
|
7
|
+
# This requires a special implementation as it uses an open file and we
|
8
|
+
# retain file seek positions.
|
9
|
+
|
10
|
+
module Bio
|
11
|
+
|
12
|
+
class GFF
|
13
|
+
|
14
|
+
# Read FASTA records from file and store seek positions, which are
|
15
|
+
# used to retrieve the records. Note, this implementation merely retains
|
16
|
+
# records in memory (FIXME)
|
17
|
+
# Reads all FASTA records from an open IO when constructed and keeps the
# sequence text in a Hash, keyed on the value Bio::FastaFormat#definition
# returns for each record header.
class FastaReader
  # fh::      IO-like object responding to each_line
  # io_seek:: accepted for interface compatibility; not used by this class
  def initialize(fh, io_seek = nil)
    @fh = fh
    @h = {}
    parse
  end

  # Consume @fh line by line. Lines starting with '#' are skipped, a line
  # starting with '>' opens a new record, and every other (stripped) line
  # is collected as sequence data for the record currently open.
  def parse
    header = nil
    seqs = []
    @fh.each_line do |line|
      line = line.strip
      next if line =~ /^#/
      if line =~ /^>/ # FASTA record header
        add(header, seqs) # store the record that just ended, if any
        header = line
        seqs = []
      else
        seqs << line
      end
    end
    add(header, seqs) # store the final record
  end

  # Return the stored sequence for +index+, or nil when it is unknown.
  def [](index)
    @h[index]
  end

  # Yield each stored id together with its sequence text.
  def each
    @h.each do |k, v|
      yield k, v
    end
  end

  private

  # Store one record; does nothing while no header has been seen yet.
  def add(header, seqs)
    if header
      id, fastarec = fasta_rec(header, seqs)
      @h[id] = fastarec.data.strip
    end
  end

  # Build a Bio::FastaFormat from the header line and the collected
  # sequence lines; returns [definition, record].
  def fasta_rec(header, buf)
    # Array#to_s only concatenates on Ruby 1.8; from 1.9 on it returns the
    # inspect form (brackets, quotes, commas), which would end up inside the
    # sequence. Array#join concatenates on every Ruby version.
    fst = Bio::FastaFormat.new(header + "\n" + buf.join)
    return fst.definition, fst
  end
end
|
66
|
+
|
67
|
+
end # GFF
|
68
|
+
end # Bio
|
@@ -0,0 +1,77 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gfffileiterator.rb - Fetch records from a file
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
module Bio
|
9
|
+
|
10
|
+
class GFF
|
11
|
+
|
12
|
+
class GFF3
|
13
|
+
|
14
|
+
# A Record that additionally carries the seek position it was created with.
class FileRecord < Record
  attr_accessor :io_seek

  # io_seek:: seek position supplied by the caller, kept in #io_seek
  # buf::     record text, handed on to Record#initialize
  def initialize(io_seek, buf)
    @io_seek = io_seek
    super(buf)
  end
end
|
21
|
+
|
22
|
+
# GFF3::FileIterator takes a file and yields GFF3 records with their
|
23
|
+
# seek position included in the record.
|
24
|
+
# Walks a GFF3 file and hands out FileRecord objects that carry the
# position at which their line starts.
class FileIterator
  attr_accessor :fh
  attr_reader :fasta_io_seek

  # Open +filename+ for reading; the handle stays available through #fh.
  def initialize(filename)
    @fh = File.open(filename)
  end

  # Yield (id, FileRecord) for every line that is neither blank nor a
  # '#' comment. Stops at a "##FASTA" line and remembers the position of
  # that line in #fasta_io_seek.
  def each_rec
    offset = 0
    @fh.each_line do |raw|
      text = raw.strip
      if text == "##FASTA"
        @fasta_io_seek = offset
        break
      end
      unless text.empty? || text =~ /^#/
        record = FileRecord.new(offset, text)
        resume_at = @fh.tell
        yield record.id, record
        # the caller may have moved the file position; put it back
        @fh.seek(resume_at)
      end
      offset = @fh.tell
    end
  end

  # Yield (id, sequence) for every FASTA record read by
  # Bio::GFF::FastaReader. When #fasta_io_seek is already known the file is
  # positioned there; otherwise lines are read from the current position
  # until a "##FASTA" line has been consumed.
  def each_sequence
    if @fasta_io_seek.nil?
      @fh.each_line { |raw| break if raw.strip == "##FASTA" }
    else
      @fh.seek(@fasta_io_seek)
    end
    Bio::GFF::FastaReader.new(@fh).each { |id, fastarec| yield id, fastarec }
  end
end
|
71
|
+
end # GFF3
|
72
|
+
end # GFF
|
73
|
+
end # Bio
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffinmemory.rb - Assemble mRNA and CDS from GFF in RAM
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file
|
9
|
+
|
10
|
+
module Bio
|
11
|
+
module GFFbrowser
|
12
|
+
|
13
|
+
module Digest
|
14
|
+
|
15
|
+
# Record store that loads the complete GFF file through the BioRuby
# in-memory parser and keeps every record in RAM.
class InMemory
  include Parser
  include Gff3Sequence

  attr_reader :sequencelist

  # filename:: GFF file, read completely and handed to Bio::GFF::GFF3
  # options::  Hash kept in @options
  def initialize(filename, options)
    @options = options
    @gff = Bio::GFF::GFF3.new(File.read(filename))
  end

  # Reset all bookkeeping containers, file every parsed record through
  # store_record, copy the parsed sequences into @sequencelist, then run
  # the validation, reporting and read_fasta steps.
  def parse
    info "---- Digest DB and store data in mRNA Hash"
    @count_ids = Counter.new              # ids seen
    @count_seqnames = Counter.new         # seqnames seen
    @componentlist = {}                   # containers, like genes, contigs
    @mrnalist = LinkedRecs.new            # linked mRNA records
    @cdslist = LinkedRecs.new
    @exonlist = LinkedRecs.new
    @sequencelist = {}
    @unrecognized_features = {}
    @gff.records.each { |rec| store_record(rec) }
    @gff.sequences.each do |bioseq|
      @sequencelist[bioseq.entry_id] = bioseq.to_s
    end
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # Yield (id, recs, component) for each entry of +list+; the component is
  # looked up with find_component from the first record of the entry.
  def each_item(list)
    list.each do |id, recs|
      _seqid = recs[0].seqname # evaluated as in the original; value unused
      yield id, recs, find_component(recs[0])
    end
  end
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffnocache.rb - Assemble mRNA and CDS from GFF by fseek
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file without using RAM - also check
|
9
|
+
# out the caching edition, which uses limited amounts of RAM
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module GFFbrowser
|
13
|
+
|
14
|
+
module Digest
|
15
|
+
|
16
|
+
module NoCacheHelpers
|
17
|
+
|
18
|
+
module SeekRec
  # Read the line found at +fpos+ in +fh+ and wrap it in a
  # GFF::GFF3::FileRecord. Returns nil when either argument is nil.
  def self.fetch(fh, fpos)
    return if fh.nil? || fpos.nil?
    fh.seek(fpos)
    GFF::GFF3::FileRecord.new(fpos, fh.gets)
  end
end
|
26
|
+
|
27
|
+
# The hardwired to file RecList
|
28
|
+
# Record list tied to a file handle: it stores only seek positions and
# reads the record back from the file on every lookup.
class SeekRecList
  def initialize(fh)
    @fh = fh
    @h = {}
  end

  # Remember the seek position of +rec+ under +id+.
  # Raises RuntimeError when a position is already stored for +id+.
  def []=(id, rec)
    raise "id #{id} occurs twice!" if @h[id]
    @h[id] = rec.io_seek
  end

  # Fetch the record stored under +id+ through SeekRec::fetch.
  def [](id)
    SeekRec::fetch(@fh, @h[id])
  end

  # Yield (id, record) for every stored id.
  def each
    @h.each_key { |id| yield id, self[id] }
  end
end
|
51
|
+
|
52
|
+
# Hash from id to the list of seek positions of the records sharing it.
class SeekLinkedRecs < Hash
  include Helpers::Error

  # Append the seek position of +rec+ to the list kept for +id+,
  # creating that list on first use.
  def add(id, rec)
    info "Adding #{rec.feature_type} <#{id}>"
    offsets = self[id]
    offsets = (self[id] = []) if offsets.nil?
    offsets << rec.io_seek
  end

  # The three validations are switched off for NoCache: each is a no-op.
  def validate_seqname; end

  def validate_nonoverlapping; end

  def validate_shared_parent; end
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Record store that keeps seek positions instead of records and re-reads
# each record from the file when it is needed.
class NoCache
  include Parser
  include NoCacheHelpers
  include Gff3Sequence

  # filename:: GFF file, opened through Bio::GFF::GFF3::FileIterator
  # options::  Hash kept in @options
  def initialize(filename, options)
    @filename = filename
    @options = options
    @iter = Bio::GFF::GFF3::FileIterator.new(@filename)
  end

  # Pass over the whole file once, filing every record through
  # store_record (the lists keep seek locations rather than the records
  # themselves), collect the sequences, then run the validation,
  # reporting and read_fasta steps.
  def parse
    info "---- Digest DB and store data in mRNA Hash (NoCache)"
    @count_ids = Counter.new                   # ids seen
    @count_seqnames = Counter.new              # seqnames seen
    @componentlist = SeekRecList.new(@iter.fh) # containers, like genes, contigs
    @mrnalist = SeekLinkedRecs.new             # linked mRNA records
    @cdslist = SeekLinkedRecs.new
    @exonlist = SeekLinkedRecs.new
    @sequencelist = {}
    @unrecognized_features = {}
    @iter.each_rec { |_id, rec| store_record(rec) }
    @iter.each_sequence { |id, bioseq| @sequencelist[id] = bioseq.to_s }
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # Yield (id, recs, component) for each entry of +list+. The records are
  # fetched from the file by seek position; the component is looked up
  # with find_component from the first fetched record.
  def each_item(list)
    fh = @iter.fh
    list.each do |id, io_seeklist|
      recs = io_seeklist.map { |fpos| SeekRec::fetch(fh, fpos) }
      _seqid = recs[0].seqname # evaluated as in the original; value unused
      yield id, recs, find_component(recs[0])
    end
  end
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffparser.rb - Parsing logic for GFF3 file
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
|
9
|
+
module Bio
|
10
|
+
module GFFbrowser
|
11
|
+
module Digest
|
12
|
+
|
13
|
+
module Parser
|
14
|
+
|
15
|
+
include Bio::GFFbrowser::Helpers
|
16
|
+
include Bio::GFFbrowser::Helpers::Error
|
17
|
+
include Gff3Component
|
18
|
+
include Gff3Features
|
19
|
+
|
20
|
+
# Classify one GFF record and file it in the matching container.
#
# Comment records are skipped. For every other record the formatted ID is
# counted in @count_ids and the seqname in @count_seqnames. A record whose
# feature type is listed in COMPONENT_TYPES is stored in @componentlist.
# mRNA, CDS and exon records - named either by feature name or by the
# SO accession given below - are added to @mrnalist, @cdslist and
# @exonlist. Any other type not listed in IGNORE_FEATURES is remembered
# in @unrecognized_features.
def store_record rec
  return if rec.comment # skip GFF comments
  id = Record::formatID(rec)
  @count_ids.add(id)
  @count_seqnames.add(rec.seqname)

  if COMPONENT_TYPES.include?(rec.feature_type)
    # check for container ID
    warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
    @componentlist[id] = rec
    info "Added #{rec.feature_type} with component ID #{id}"
  else
    # Alternatives must be comma separated: `when 'a' || 'b'` evaluates
    # the || first and therefore only ever compares against 'a'.
    case rec.feature_type
    when 'mRNA', 'SO:0000234'
      @mrnalist.add(id,rec)
    when 'CDS', 'SO:0000316'
      @cdslist.add(id,rec)
    when 'exon', 'SO:0000147'
      @exonlist.add(id,rec)
    else
      if !IGNORE_FEATURES.include?(rec.feature_type)
        @unrecognized_features[rec.feature_type] = true
      end
    end
  end
end
|
46
|
+
|
47
|
+
# Run the mRNA checks (shared seqname, shared parent) on @mrnalist.
# Does nothing unless @options[:validate] is set.
def validate_mrnas
  if @options[:validate]
    @mrnalist.validate_seqname
    @mrnalist.validate_shared_parent
  end
end
|
53
|
+
|
54
|
+
# Run the CDS checks on @cdslist: shared seqname, non-overlapping
# sections, shared parent. Does nothing unless @options[:validate] is set.
def validate_cdss
  if @options[:validate]
    @cdslist.validate_seqname
    @cdslist.validate_nonoverlapping
    @cdslist.validate_shared_parent
  end
end
|
63
|
+
|
64
|
+
# Emit one "Feature has no match" warning for every non-nil key collected
# in @unrecognized_features.
def show_unrecognized_features
  @unrecognized_features.keys.each do |feature|
    next unless feature
    warn "Feature has no match", feature
  end
end
|
69
|
+
|
70
|
+
# When @options[:fasta_filename] is set, read that file with
# Bio::GFF::FastaReader and store every (id, sequence) pair it yields in
# @sequencelist. Does nothing otherwise.
def read_fasta
  fasta_path = @options[:fasta_filename]
  return unless fasta_path
  File.open(fasta_path) do |f|
    Bio::GFF::FastaReader.new(f).each do |id, fastarec|
      @sequencelist[id] = fastarec
    end
  end
end
|
82
|
+
|
83
|
+
# Yield the id, recs, containing component and sequence of mRNAs
|
84
|
+
# Yield (id, recs, component) for every entry of @mrnalist, calling parse
# first when @mrnalist has not been set yet.
def each_mRNA
  parse unless @mrnalist
  each_item(@mrnalist) do |id, recs, component|
    yield id, recs, component
  end
end
|
88
|
+
|
89
|
+
# Yield the id, recs, and containing component
|
90
|
+
# Yield (id, recs, component) for every entry of @cdslist, calling parse
# first when @cdslist has not been set yet.
def each_CDS
  parse unless @cdslist
  each_item(@cdslist) do |id, recs, component|
    yield id, recs, component
  end
end
|
94
|
+
|
95
|
+
# Yield the id, recs, and containing component
|
96
|
+
# Yield (id, recs, component) for every entry of @exonlist, calling parse
# first when @exonlist has not been set yet.
def each_exon
  parse unless @exonlist
  each_item(@exonlist) do |id, recs, component|
    yield id, recs, component
  end
end
|
100
|
+
|
101
|
+
# Yield a unique description and the sequence
|
102
|
+
# Yield (description, assembled sequence) for every mRNA that has a
# component. The sequence is looked up in @sequencelist by the
# component's seqname; when none is found a warning is issued instead.
def each_mRNA_seq
  each_mRNA do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    if sequence
      header = description(id, component, reclist)
      yield header, assemble(sequence, component.start, reclist)
    else
      warn "No sequence information for", id
    end
  end
end
|
115
|
+
|
116
|
+
# Yield a unique description and the sequence
|
117
|
+
# Yield (description, assembled sequence) for every CDS that has a
# component. The sequence is assembled with :codonize => true; when its
# size is not a multiple of 3 the record list is printed and a warning is
# issued, but the sequence is still yielded. A warning is issued instead
# when @sequencelist holds no sequence for the component's seqname.
def each_CDS_seq
  each_CDS do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    if sequence
      seq = assemble(sequence, component.start, reclist, :codonize => true)
      unless seq.size % 3 == 0
        p reclist # leave this in
        warn "CDS size is not a multiple of 3", id
      end
      yield description(id, component, reclist), seq
    else
      warn "No sequence information for", id
    end
  end
end
|
136
|
+
|
137
|
+
# Yield a unique description and the sequence
|
138
|
+
# Yield (description, assembled sequence) for every exon entry that has a
# component. The sequence is looked up in @sequencelist by the
# component's seqname; when none is found a warning is issued instead.
def each_exon_seq
  each_exon do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    if sequence
      seq = assemble(sequence, component.start, reclist)
      yield description(id, component, reclist), seq
    else
      warn "No sequence information for", id
    end
  end
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|