bio-gff3 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README +65 -0
- data/README.rdoc +19 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/gff3-fetch +99 -0
- data/bio-gff3.gemspec +101 -0
- data/lib/bio-gff3.rb +0 -0
- data/lib/bio/db/gff/gffassemble.rb +300 -0
- data/lib/bio/db/gff/gffdb.rb +40 -0
- data/lib/bio/db/gff/gfffasta.rb +68 -0
- data/lib/bio/db/gff/gfffileiterator.rb +77 -0
- data/lib/bio/db/gff/gffinmemory.rb +63 -0
- data/lib/bio/db/gff/gffnocache.rb +124 -0
- data/lib/bio/db/gff/gffparser.rb +154 -0
- data/lib/bio/system/lruhash.rb +268 -0
- data/spec/gff3_assemble2_spec.rb +73 -0
- data/spec/gff3_assemble3_spec.rb +62 -0
- data/spec/gff3_assemble_spec.rb +291 -0
- data/spec/gff3_fileiterator_spec.rb +43 -0
- data/spec/gffdb_spec.rb +99 -0
- data/test/data/gff/MhA1_Contig1133.fa +2 -0
- data/test/data/gff/MhA1_Contig1133.gff3 +1862 -0
- data/test/data/gff/MhA1_Contig125.fa +673 -0
- data/test/data/gff/MhA1_Contig125.gff3 +2177 -0
- data/test/data/gff/standard.gff3 +25 -0
- data/test/data/gff/test-cds.gff3 +98 -0
- data/test/data/gff/test-ext-fasta.fa +16 -0
- data/test/data/gff/test-ext-fasta.gff3 +57 -0
- data/test/data/gff/test.gff3 +74 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-gff3.rb +7 -0
- metadata +180 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffdb.rb - GFF database class
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
# Create db from a GFF file
|
9
|
+
|
10
|
+
require 'bio'
|
11
|
+
require 'bio/db/gff/gfffileiterator'
|
12
|
+
require 'bio/db/gff/gfffasta'
|
13
|
+
require 'bio/db/gff/gffassemble'
|
14
|
+
require 'bio/db/gff/gffparser'
|
15
|
+
require 'bio/db/gff/gffinmemory'
|
16
|
+
require 'bio/db/gff/gffnocache'
|
17
|
+
|
18
|
+
module Bio
|
19
|
+
module GFFbrowser
|
20
|
+
# Front end for a GFF file: picks the assembler implementation that
# will hold (or seek for) the records.
class GFFdb
  include Digest

  attr_reader :assembler

  # Initialize a GFF parser.
  #
  # filename:: path of the GFF file, passed on to the assembler
  # options::  Hash passed on unchanged; :cache_records => :cache_none
  #            selects NoCache, any other value selects InMemory
  def initialize(filename, options = {})
    backend = :cache_none == options[:cache_records] ? NoCache : InMemory
    @assembler = backend.new(filename, options)
  end
end # GFFdb
|
38
|
+
end # GFFbrowser
|
39
|
+
end # Bio
|
40
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# = bio/db/gff/gfffasta.rb - Fetch records from a file in FASTA format
|
2
|
+
#
|
3
|
+
# Copyright:: Copyright (C) 2010
|
4
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
5
|
+
# License:: The Ruby License
|
6
|
+
#
|
7
|
+
# This requires a special implementation as it uses an open file and we
|
8
|
+
# retain file seek positions.
|
9
|
+
|
10
|
+
module Bio
|
11
|
+
|
12
|
+
class GFF
|
13
|
+
|
14
|
+
# Read FASTA records from an open file handle and keep them in a Hash
# keyed by the record definition (the header line without '>').
# Note, this implementation merely retains the sequences in memory
# (FIXME); no seek positions are stored.
class FastaReader
  # fh::      open IO-like object responding to each_line
  # io_seek:: accepted for backward compatibility, currently unused
  #
  # The whole handle is parsed immediately, from its current position.
  def initialize(fh, io_seek = nil)
    @fh = fh
    @h = {}
    parse
  end

  # Read all FASTA records from the handle. Lines starting with '#'
  # (e.g. the "##FASTA" directive of a GFF3 file) are skipped.
  def parse
    header = nil
    seqs = []
    @fh.each_line do |line|
      line = line.strip
      next if line.start_with?('#')
      if line.start_with?('>') # FASTA record header
        add(header, seqs) # flush the previous record, if any
        header = line
        seqs = []
      else
        seqs << line
      end
    end
    add(header, seqs) # flush the last record
  end

  # Return the sequence String stored under +index+, or nil.
  def [](index)
    @h[index]
  end

  # Yield every stored (id, sequence) pair.
  def each
    @h.each do |id, seq|
      yield id, seq
    end
  end

  private

  # Store one record; a nil header means nothing was read yet.
  def add(header, seqs)
    return unless header
    id, fastarec = fasta_rec(header, seqs)
    @h[id] = fastarec.data.strip
  end

  # Build a Bio::FastaFormat from the header and the sequence lines and
  # return [definition, record].
  def fasta_rec(header, buf)
    # Array#to_s returns the inspect form (["ACGT", "TT"]) on Ruby >= 1.9,
    # which would end up inside the sequence; join concatenates the lines.
    fst = Bio::FastaFormat.new(header + "\n" + buf.join)
    return fst.definition, fst
  end
end
|
66
|
+
|
67
|
+
end # GFF
|
68
|
+
end # Bio
|
@@ -0,0 +1,77 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gfffileiterator.rb - Fetch records from a file
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
module Bio
|
9
|
+
|
10
|
+
class GFF
|
11
|
+
|
12
|
+
class GFF3
|
13
|
+
|
14
|
+
# A GFF3 Record that also carries the file position it was read from.
class FileRecord < Record
  attr_accessor :io_seek

  # io_seek:: file offset stored with the record
  # buf::     raw record text, handed to the parent Record constructor
  def initialize(io_seek, buf)
    @io_seek = io_seek
    super(buf)
  end
end
|
21
|
+
|
22
|
+
# GFF3::FileIterator takes a file and yields GFF3 records with their
# seek position included in the record.
#
# The file handle is opened in the constructor and stays open; it is
# exposed through #fh so records can be re-read by seek position.
class FileIterator
  attr_accessor :fh
  attr_reader :fasta_io_seek

  # filename:: path of the GFF3 file to open for reading
  def initialize(filename)
    @fh = File.open(filename)
  end

  # Iterate over every record in the file, yielding the record ID and
  # (File)Record, which includes the io_seek position in the file.
  # Empty lines and lines starting with '#' are skipped. Iteration stops
  # at the "##FASTA" directive, whose offset is kept in fasta_io_seek.
  def each_rec
    # Start from the handle's actual position rather than assuming 0, so
    # the recorded offsets stay valid when the handle was already advanced.
    fpos = @fh.tell
    @fh.each_line do |line|
      line = line.strip
      if line == "##FASTA"
        @fasta_io_seek = fpos
        break
      end
      unless line.empty? || line.start_with?('#')
        rec = FileRecord.new(fpos, line)
        lastpos = @fh.tell
        yield rec.id, rec
        @fh.seek(lastpos) # reset filepos, just in case it changed
      end
      fpos = @fh.tell
    end
  end

  # Iterate over all contained FASTA sequences, yielding what
  # Bio::GFF::FastaReader#each yields for every record. Normally call
  # each_rec first and you can test for existing FASTA records if
  # fasta_io_seek != nil
  def each_sequence
    if @fasta_io_seek.nil?
      # Find the FASTA location first, reading on from the current position
      @fh.each_line do |line|
        break if line.strip == "##FASTA"
      end
    else
      @fh.seek(@fasta_io_seek)
    end
    fasta = Bio::GFF::FastaReader.new(@fh)
    fasta.each do |id, fastarec|
      yield id, fastarec
    end
  end
end
|
71
|
+
end # GFF3
|
72
|
+
end # GFF
|
73
|
+
end # Bio
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffinmemory.rb - Assemble mRNA and CDS from GFF in RAM
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file
|
9
|
+
|
10
|
+
module Bio
|
11
|
+
module GFFbrowser
|
12
|
+
|
13
|
+
module Digest
|
14
|
+
|
15
|
+
# Assembler that reads the whole GFF file through the BioRuby parser and
# keeps all records in RAM.
class InMemory
  include Parser
  include Gff3Sequence

  attr_reader :sequencelist

  # filename:: path of the GFF file; it is read completely here
  # options::  options Hash, stored for the Parser methods
  def initialize(filename, options)
    @options = options
    # Invoke the BioRuby in memory parser
    @gff = Bio::GFF::GFF3.new(File.read(filename))
  end

  # Digest mRNA from the GFFdb and store in Hash
  # Next yield(id, seq) from Hash
  def parse
    info "---- Digest DB and store data in mRNA Hash"
    @count_ids = Counter.new          # Count ids
    @count_seqnames = Counter.new     # Count seqnames
    @componentlist = {}               # Store containers, like genes, contigs
    @mrnalist = LinkedRecs.new        # Store linked mRNA records
    @cdslist = LinkedRecs.new
    @exonlist = LinkedRecs.new
    @sequencelist = {}
    @unrecognized_features = {}
    @gff.records.each { |rec| store_record(rec) }
    @gff.sequences.each do |bioseq|
      # in Bio::Sequence with contained Bio::FastaFormat
      @sequencelist[bioseq.entry_id] = bioseq.to_s
    end
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # Yield id, records and the component found for the first record of
  # every entry in +list+.
  def each_item(list)
    list.each do |id, recs|
      head = recs[0]
      head.seqname # value is not used; the call is kept as in the original
      yield id, recs, find_component(head)
    end
  end
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffnocache.rb - Assemble mRNA and CDS from GFF by fseek
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file without using RAM - also check
|
9
|
+
# out the caching edition, which uses limited amounts of RAM
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module GFFbrowser
|
13
|
+
|
14
|
+
module Digest
|
15
|
+
|
16
|
+
module NoCacheHelpers
|
17
|
+
|
18
|
+
module SeekRec
  # Fetch a record using fh and file seek position.
  #
  # Returns nil when either argument is nil; otherwise seeks to +fpos+,
  # reads one line and wraps it in a GFF::GFF3::FileRecord.
  def self.fetch(fh, fpos)
    return nil if fh.nil? || fpos.nil?
    fh.seek(fpos)
    line = fh.gets
    GFF::GFF3::FileRecord.new(fpos, line)
  end
end
|
26
|
+
|
27
|
+
# Record list that is hardwired to a file: it stores only the seek
# position per id and re-reads the record from the file on access.
class SeekRecList
  # fh:: file handle used for fetching records
  def initialize(fh)
    @fh = fh
    @h = {}
  end

  # Remember the seek position of +rec+ under +id+.
  # Raises a RuntimeError when the id was stored before.
  def []=(id, rec)
    raise "id #{id} occurs twice!" if @h[id]
    @h[id] = rec.io_seek
  end

  # Fetch the record for +id+ through SeekRec::fetch.
  def [](id)
    SeekRec::fetch(@fh, @h[id])
  end

  # Yield every id together with its freshly fetched record.
  def each
    @h.each_key do |id|
      yield id, self[id]
    end
  end
end
|
51
|
+
|
52
|
+
# Hash of id => list of seek positions; the NoCache counterpart of the
# linked record lists. All validations are no-ops.
class SeekLinkedRecs < Hash
  include Helpers::Error

  # Append the seek position of +rec+ to the list stored under +id+.
  def add(id, rec)
    info "Adding #{rec.feature_type} <#{id}>"
    positions = self[id]
    positions = self[id] = [] if positions == nil
    positions << rec.io_seek
  end

  # validation is switched off for NoCache
  def validate_seqname; end

  # validation is switched off for NoCache
  def validate_nonoverlapping; end

  # validation is switched off for NoCache
  def validate_shared_parent; end
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Assembler that stores seek positions only and re-reads records from
# the file when they are needed.
class NoCache
  include Parser
  include NoCacheHelpers
  include Gff3Sequence

  # filename:: path of the GFF3 file; opened through a FileIterator
  # options::  options Hash, stored for the Parser methods
  def initialize(filename, options)
    @filename = filename
    @options = options
    @iter = Bio::GFF::GFF3::FileIterator.new(@filename)
  end

  # parse the whole file once and store all seek locations,
  # rather than the records themselves
  def parse
    info "---- Digest DB and store data in mRNA Hash (NoCache)"
    @count_ids = Counter.new                    # Count ids
    @count_seqnames = Counter.new               # Count seqnames
    @componentlist = SeekRecList.new(@iter.fh)  # Store containers, like genes, contigs
    @mrnalist = SeekLinkedRecs.new              # Store linked mRNA records
    @cdslist = SeekLinkedRecs.new
    @exonlist = SeekLinkedRecs.new
    @sequencelist = {}
    @unrecognized_features = {}
    @iter.each_rec { |id, rec| store_record(rec) }
    @iter.each_sequence { |id, bioseq| @sequencelist[id] = bioseq.to_s }
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # For every entry of +list+ fetch the records at the stored seek
  # positions and yield id, records and the component found for the
  # first record.
  def each_item(list)
    fh = @iter.fh
    list.each do |id, io_seeklist|
      recs = io_seeklist.map { |fpos| SeekRec::fetch(fh, fpos) }
      head = recs[0]
      head.seqname # value is not used; the call is kept as in the original
      yield id, recs, find_component(head)
    end
  end
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gffparser.rb - Parsing logic for GFF3 file
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
|
9
|
+
module Bio
|
10
|
+
module GFFbrowser
|
11
|
+
module Digest
|
12
|
+
|
13
|
+
module Parser
|
14
|
+
|
15
|
+
include Bio::GFFbrowser::Helpers
|
16
|
+
include Bio::GFFbrowser::Helpers::Error
|
17
|
+
include Gff3Component
|
18
|
+
include Gff3Features
|
19
|
+
|
20
|
+
# Sort one GFF record into the matching collection.
#
# Comment records are skipped. Every other record is counted by id and
# by seqname. Component types go into @componentlist; mRNA, CDS and exon
# records (by name or by Sequence Ontology accession) go into their
# linked lists; any other feature type that is not listed in
# IGNORE_FEATURES is remembered in @unrecognized_features.
def store_record(rec)
  return if rec.comment # skip GFF comments
  id = Record::formatID(rec)
  @count_ids.add(id)
  @count_seqnames.add(rec.seqname)

  if COMPONENT_TYPES.include?(rec.feature_type)
    # check for container ID
    warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id.nil?
    @componentlist[id] = rec
    info "Added #{rec.feature_type} with component ID #{id}"
  else
    # NOTE: `when 'a' || 'b'` only ever tests 'a'; alternatives have to
    # be separated by commas for the SO accessions to match.
    case rec.feature_type
    when 'mRNA', 'SO:0000234'
      @mrnalist.add(id, rec)
    when 'CDS', 'SO:0000316'
      @cdslist.add(id, rec)
    when 'exon', 'SO:0000147'
      @exonlist.add(id, rec)
    else
      unless IGNORE_FEATURES.include?(rec.feature_type)
        @unrecognized_features[rec.feature_type] = true
      end
    end
  end
end
|
46
|
+
|
47
|
+
# Run the mRNA list validations, but only when the :validate option is set.
def validate_mrnas
  if @options[:validate]
    # validate gene/container/component seqname is shared
    @mrnalist.validate_seqname
    @mrnalist.validate_shared_parent
  end
end
|
53
|
+
|
54
|
+
# Run the CDS list validations, but only when the :validate option is set.
def validate_cdss
  if @options[:validate]
    @cdslist.validate_seqname
    # validate CDS sections do not overlap
    @cdslist.validate_nonoverlapping
    # validate sections share the parent
    @cdslist.validate_shared_parent
  end
end
|
63
|
+
|
64
|
+
# Emit one warning for every feature type collected in
# @unrecognized_features (nil/false keys are skipped).
def show_unrecognized_features
  @unrecognized_features.keys.each do |feature|
    next unless feature
    warn "Feature has no match",feature
  end
end
|
69
|
+
|
70
|
+
# When the :fasta_filename option is given, read that file through
# Bio::GFF::FastaReader and store every yielded entry in @sequencelist
# under its id (existing ids are overwritten).
def read_fasta
  return unless @options[:fasta_filename]
  File.open(@options[:fasta_filename]) do |f|
    Bio::GFF::FastaReader.new(f).each do |id, fastarec|
      @sequencelist[id] = fastarec
    end
  end
end
|
82
|
+
|
83
|
+
# Yield the id, recs and containing component of every mRNA.
# Triggers parse first when the mRNA list does not exist yet.
def each_mRNA
  parse unless @mrnalist
  each_item(@mrnalist) do |id, recs, component|
    yield id, recs, component
  end
end
|
88
|
+
|
89
|
+
# Yield the id, recs, and containing component of every CDS.
# Triggers parse first when the CDS list does not exist yet.
def each_CDS
  parse unless @cdslist
  each_item(@cdslist) do |id, recs, component|
    yield id, recs, component
  end
end
|
94
|
+
|
95
|
+
# Yield the id, recs, and containing component of every exon.
# Triggers parse first when the exon list does not exist yet.
def each_exon
  parse unless @exonlist
  each_item(@exonlist) do |id, recs, component|
    yield id, recs, component
  end
end
|
100
|
+
|
101
|
+
# Yield a unique description and the assembled sequence of every mRNA.
# Entries without a component are skipped silently; entries whose
# component has no sequence in @sequencelist produce a warning.
def each_mRNA_seq
  each_mRNA do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    unless sequence
      warn "No sequence information for",id
      next
    end
    yield description(id,component,reclist), assemble(sequence,component.start,reclist)
  end
end
|
115
|
+
|
116
|
+
# Yield a unique description and the assembled sequence of every CDS.
# The sequence is assembled with :codonize => true. A length that is
# not a multiple of 3 is reported (record dump plus warning) but the
# sequence is still yielded. Entries without a component are skipped;
# a missing sequence produces a warning.
def each_CDS_seq
  each_CDS do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    unless sequence
      warn "No sequence information for",id
      next
    end
    seq = assemble(sequence,component.start,reclist,:codonize=>true)
    unless (seq.size % 3).zero?
      p reclist # leave this in
      # raise "CDS size #{seq.size} is not a multiple of 3! <#{seq}>"
      warn "CDS size is not a multiple of 3",id
    end
    yield description(id,component,reclist), seq
  end
end
|
136
|
+
|
137
|
+
# Yield a unique description and the assembled sequence of every exon
# entry. Entries without a component are skipped silently; entries
# whose component has no sequence in @sequencelist produce a warning.
def each_exon_seq
  each_exon do |id, reclist, component|
    next unless component
    sequence = @sequencelist[component.seqname]
    unless sequence
      warn "No sequence information for",id
      next
    end
    seq = assemble(sequence,component.start,reclist)
    yield description(id,component,reclist), seq
  end
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|