bio-gff3 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -2
- data/Rakefile +13 -4
- data/VERSION +1 -1
- data/bin/gff3-fetch +50 -14
- data/bio-gff3.gemspec +15 -20
- data/lib/bio/db/gff/block/gffblockparser.rb +93 -0
- data/lib/bio/db/gff/digest/gffinmemory.rb +2 -0
- data/lib/bio/db/gff/digest/gfflrucache.rb +208 -0
- data/lib/bio/db/gff/digest/gffnocache.rb +28 -9
- data/lib/bio/db/gff/digest/gffparser.rb +1 -1
- data/lib/bio/db/gff/file/gfffileiterator.rb +16 -7
- data/lib/bio/db/gff/gff3.rb +15 -5
- data/lib/bio/db/gff/gff3parserec.rb +1 -6
- data/lib/bio/db/gff/gffcomponent.rb +8 -6
- data/lib/bio/db/gff/gffrecord.rb +13 -8
- data/lib/bio/db/gff/gffsection.rb +0 -1
- data/lib/bio/db/gff/gffsequence.rb +3 -9
- data/lib/bio/db/gff/gffvalidate.rb +1 -1
- data/lib/bio/output/gfflogger.rb +10 -1
- data/spec/gff3_fileiterator_spec.rb +5 -4
- data/spec/gffdb_spec.rb +7 -1
- data/spec/gffparserec.rb +1 -1
- data/test/data/regression/test_ext_gff3.rtest +4 -5
- data/test/data/regression/test_gff3.rtest +4 -5
- data/test/data/regression/test_lrucache_ext_gff3.rtest +64 -0
- data/test/data/regression/test_lrucache_gff3.rtest +68 -0
- data/test/data/regression/test_nocache_ext_gff3.rtest +2 -0
- data/test/data/regression/test_nocache_gff3.rtest +3 -6
- data/test/test_bio-gff3.rb +6 -1
- metadata +37 -77
@@ -21,19 +21,24 @@ module Bio
|
|
21
21
|
# record
|
22
22
|
module SeekRec
|
23
23
|
# Fetch a record using fh and file seek position
|
24
|
-
def SeekRec::fetch(fh,fpos)
|
24
|
+
def SeekRec::fetch(fh,fpos,parser)
|
25
25
|
return nil if fh==nil or fpos==nil
|
26
26
|
fh.seek(fpos)
|
27
|
-
|
27
|
+
if parser == :bioruby
|
28
|
+
GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
|
29
|
+
else
|
30
|
+
GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
|
31
|
+
end
|
28
32
|
end
|
29
33
|
end
|
30
34
|
|
31
35
|
# Helper class which gives Hash-like access to the
|
32
36
|
# no-cache GFF3 file
|
33
37
|
class SeekRecList
|
34
|
-
def initialize fh
|
38
|
+
def initialize fh, parser
|
35
39
|
@fh = fh
|
36
40
|
@h = {}
|
41
|
+
@parser = parser
|
37
42
|
end
|
38
43
|
|
39
44
|
def []= id, rec
|
@@ -44,7 +49,7 @@ module Bio
|
|
44
49
|
|
45
50
|
def [](id)
|
46
51
|
fpos = @h[id]
|
47
|
-
SeekRec::fetch(@fh,fpos)
|
52
|
+
SeekRec::fetch(@fh,fpos,@parser)
|
48
53
|
end
|
49
54
|
|
50
55
|
def each
|
@@ -56,7 +61,7 @@ module Bio
|
|
56
61
|
|
57
62
|
# List of ids
|
58
63
|
class SeekLinkedRecs < Hash
|
59
|
-
include Helpers::
|
64
|
+
include Helpers::Logger
|
60
65
|
def add id, rec
|
61
66
|
info "Adding #{rec.feature_type} <#{id}>"
|
62
67
|
self[id] = [] if self[id] == nil
|
@@ -91,14 +96,22 @@ module Bio
|
|
91
96
|
info "---- Digest DB and store data in mRNA Hash (NoCache)"
|
92
97
|
@count_ids = Counter.new # Count ids
|
93
98
|
@count_seqnames = Counter.new # Count seqnames
|
94
|
-
@componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
|
99
|
+
@componentlist = SeekRecList.new(@iter.fh,@options[:parser]) # Store containers, like genes, contigs
|
95
100
|
@orflist = SeekLinkedRecs.new # Store linked gene records
|
96
101
|
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
97
102
|
@cdslist = SeekLinkedRecs.new
|
98
103
|
@exonlist = SeekLinkedRecs.new
|
99
104
|
@sequencelist = {}
|
100
105
|
@unrecognized_features = {}
|
101
|
-
@iter.each_rec do | id, rec |
|
106
|
+
@iter.each_rec do |fpos, line|
|
107
|
+
rec = case @options[:parser]
|
108
|
+
when :bioruby
|
109
|
+
Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
|
110
|
+
when :line
|
111
|
+
Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
112
|
+
else
|
113
|
+
raise 'Unknown parser'
|
114
|
+
end
|
102
115
|
store_record(rec)
|
103
116
|
end
|
104
117
|
@iter.each_sequence do | id, bioseq |
|
@@ -117,11 +130,17 @@ module Bio
|
|
117
130
|
list.each do | id, io_seeklist |
|
118
131
|
recs = []
|
119
132
|
io_seeklist.each do | fpos |
|
120
|
-
recs << SeekRec::fetch(fh,fpos)
|
133
|
+
recs << SeekRec::fetch(fh,fpos,@options[:parser])
|
121
134
|
end
|
122
135
|
seqid = recs[0].seqname
|
123
136
|
component = find_component(recs[0])
|
124
|
-
|
137
|
+
if @options[:no_assemble]
|
138
|
+
recs.each do | rec |
|
139
|
+
yield id, [rec], component
|
140
|
+
end
|
141
|
+
else
|
142
|
+
yield id, recs, component
|
143
|
+
end
|
125
144
|
end
|
126
145
|
end
|
127
146
|
|
@@ -13,7 +13,7 @@ module Bio
|
|
13
13
|
|
14
14
|
# FileRecord inherits from the BioRuby Record, but
|
15
15
|
# adds the file seek position.
|
16
|
-
class FileRecord < Record
|
16
|
+
class BioRubyFileRecord < Record
|
17
17
|
attr_accessor :io_seek
|
18
18
|
def initialize io_seek, buf
|
19
19
|
@io_seek = io_seek
|
@@ -21,6 +21,16 @@ module Bio
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
+
class FastParserFileRecord < GFFbrowser::FastLineRecord
|
25
|
+
attr_accessor :io_seek
|
26
|
+
|
27
|
+
include Bio::GFFbrowser::FastLineParser
|
28
|
+
def initialize io_seek, buf
|
29
|
+
@io_seek = io_seek
|
30
|
+
super(parse_line_fast(buf))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
24
34
|
# GFF3::FileIterator takes a file and yields GFF3 records with their
|
25
35
|
# seek position included in the record.
|
26
36
|
class FileIterator
|
@@ -31,9 +41,10 @@ module Bio
|
|
31
41
|
@fh = File.open(filename)
|
32
42
|
end
|
33
43
|
|
34
|
-
# Iterate over every record in the file, yielding the
|
35
|
-
#
|
36
|
-
def each_rec
|
44
|
+
# Iterate over every record in the file, yielding the seekpos
|
45
|
+
# and line containing the record
|
46
|
+
def each_rec
|
47
|
+
@fh.seek(0)
|
37
48
|
fpos = 0
|
38
49
|
@fh.each_line do | line |
|
39
50
|
line = line.strip
|
@@ -42,10 +53,8 @@ module Bio
|
|
42
53
|
break
|
43
54
|
end
|
44
55
|
if line.size != 0 and line !~ /^#/
|
45
|
-
rec = FileRecord.new(fpos, line)
|
46
56
|
lastpos = @fh.tell
|
47
|
-
|
48
|
-
yield id, rec
|
57
|
+
yield fpos, line
|
49
58
|
@fh.seek(lastpos) # reset filepos, just in case it changed
|
50
59
|
end
|
51
60
|
fpos = @fh.tell
|
data/lib/bio/db/gff/gff3.rb
CHANGED
@@ -9,6 +9,8 @@
|
|
9
9
|
|
10
10
|
require 'bio/db/gff/digest/gffinmemory'
|
11
11
|
require 'bio/db/gff/digest/gffnocache'
|
12
|
+
require 'bio/db/gff/digest/gfflrucache'
|
13
|
+
require 'bio/db/gff/block/gffblockparser'
|
12
14
|
|
13
15
|
module Bio
|
14
16
|
module GFFbrowser
|
@@ -16,16 +18,24 @@ module Bio
|
|
16
18
|
attr_reader :assembler
|
17
19
|
|
18
20
|
include Digest
|
21
|
+
include Block
|
19
22
|
|
20
23
|
# Initialize a GFF parser
|
21
24
|
def initialize filename, options = {}
|
25
|
+
options[:parser] = :line if options[:parser] == nil
|
22
26
|
cache_recs = options[:cache_records]
|
23
27
|
@assembler =
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
if options[:block]
|
29
|
+
GffBlockParser.new(filename, options)
|
30
|
+
else
|
31
|
+
case cache_recs
|
32
|
+
when :cache_none
|
33
|
+
NoCache.new(filename, options)
|
34
|
+
when :cache_lru
|
35
|
+
LruCache.new(filename, options)
|
36
|
+
else
|
37
|
+
InMemory.new(filename, options) # default
|
38
|
+
end
|
29
39
|
end
|
30
40
|
end
|
31
41
|
|
@@ -24,7 +24,7 @@ module Bio
|
|
24
24
|
#
|
25
25
|
module FastLineParser
|
26
26
|
|
27
|
-
include Helpers::
|
27
|
+
include Helpers::Logger
|
28
28
|
|
29
29
|
# Returns a (partial) record, assuming it is a valid GFF3
|
30
30
|
# format, no validation takes place, other than field counting (!)
|
@@ -44,11 +44,6 @@ module Bio
|
|
44
44
|
return nil
|
45
45
|
end
|
46
46
|
|
47
|
-
fs[GFF3_START] = fs[GFF3_START].to_i
|
48
|
-
fs[GFF3_END] = fs[GFF3_END].to_i
|
49
|
-
fs[GFF3_SCORE] = fs[GFF3_SCORE].to_f
|
50
|
-
fs[GFF3_PHASE] = fs[GFF3_PHASE].to_i
|
51
|
-
fs[GFF3_ATTRIBUTES] = parse_attributes_fast(fs[GFF3_ATTRIBUTES],options)
|
52
47
|
fs
|
53
48
|
end
|
54
49
|
|
@@ -7,13 +7,15 @@
|
|
7
7
|
#
|
8
8
|
# Fetch information from a GFF file
|
9
9
|
|
10
|
+
require 'set'
|
11
|
+
|
10
12
|
module Bio
|
11
13
|
module GFFbrowser
|
12
14
|
|
13
15
|
module Helpers
|
14
16
|
|
15
17
|
module Record
|
16
|
-
include
|
18
|
+
include Logger
|
17
19
|
# Format a record ID by, first, getting the ID attribute. If that fails
|
18
20
|
# the seqname is used with the start/stop positions.
|
19
21
|
def Record::formatID rec
|
@@ -33,11 +35,11 @@ module Bio
|
|
33
35
|
|
34
36
|
module Gff3Component
|
35
37
|
|
36
|
-
include
|
38
|
+
include Logger
|
37
39
|
|
38
|
-
COMPONENT_TYPES = %w{
|
40
|
+
COMPONENT_TYPES = Set.new(%w{
|
39
41
|
gene SO:0000704 contig transcript Component region
|
40
|
-
}
|
42
|
+
})
|
41
43
|
|
42
44
|
# Walk the component list to find a matching component/container for a
|
43
45
|
# record. First use the parent ID. If that is missing go by sequence
|
@@ -80,13 +82,13 @@ module Bio
|
|
80
82
|
module Gff3Features
|
81
83
|
|
82
84
|
# Ignore the following features (case sensitive?)
|
83
|
-
IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
|
85
|
+
IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + Set.new(%w{
|
84
86
|
transposon Match similarity UTR
|
85
87
|
TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
|
86
88
|
polyA_site SO:0000553
|
87
89
|
five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
|
88
90
|
exon SO:0000147
|
89
|
-
}
|
91
|
+
})
|
90
92
|
end
|
91
93
|
|
92
94
|
end
|
data/lib/bio/db/gff/gffrecord.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'bio/db/gff/gff3parserec'
|
2
|
+
|
1
3
|
module Bio
|
2
4
|
module GFFbrowser
|
3
5
|
|
@@ -7,6 +9,9 @@ module Bio
|
|
7
9
|
|
8
10
|
# Using the fast line parser
|
9
11
|
class FastLineRecord < Record
|
12
|
+
|
13
|
+
include FastLineParser
|
14
|
+
|
10
15
|
def initialize fields
|
11
16
|
@fields = fields
|
12
17
|
end
|
@@ -16,27 +21,27 @@ module Bio
|
|
16
21
|
end
|
17
22
|
|
18
23
|
def seqid
|
19
|
-
@fields[GFF3_SEQID]
|
24
|
+
@seqid_ ||= @fields[GFF3_SEQID]
|
20
25
|
end
|
21
26
|
|
22
27
|
alias seqname :seqid
|
23
28
|
|
24
29
|
def phase
|
25
|
-
@fields[GFF3_PHASE]
|
30
|
+
@phase_ ||= @fields[GFF3_PHASE].to_i
|
26
31
|
end
|
27
32
|
|
28
33
|
alias frame :phase
|
29
34
|
|
30
35
|
def start
|
31
|
-
@fields[GFF3_START]
|
36
|
+
@start_ ||= @fields[GFF3_START].to_i
|
32
37
|
end
|
33
38
|
|
34
39
|
def end
|
35
|
-
@fields[GFF3_END]
|
40
|
+
@end_ ||= @fields[GFF3_END].to_i
|
36
41
|
end
|
37
42
|
|
38
43
|
def score
|
39
|
-
@fields[GFF3_SCORE]
|
44
|
+
@score_ ||= @fields[GFF3_SCORE].to_f
|
40
45
|
end
|
41
46
|
|
42
47
|
def strand
|
@@ -44,7 +49,7 @@ module Bio
|
|
44
49
|
end
|
45
50
|
|
46
51
|
def feature
|
47
|
-
@fields[GFF3_TYPE]
|
52
|
+
@feature_ ||= @fields[GFF3_TYPE]
|
48
53
|
end
|
49
54
|
|
50
55
|
alias feature_type :feature
|
@@ -53,7 +58,7 @@ module Bio
|
|
53
58
|
end
|
54
59
|
|
55
60
|
def attributes
|
56
|
-
@fields[GFF3_ATTRIBUTES]
|
61
|
+
@attributes_ ||= parse_attributes_fast(@fields[GFF3_ATTRIBUTES])
|
57
62
|
end
|
58
63
|
|
59
64
|
def get_attribute name
|
@@ -61,7 +66,7 @@ module Bio
|
|
61
66
|
end
|
62
67
|
|
63
68
|
def id
|
64
|
-
attributes['ID']
|
69
|
+
@id_ ||= attributes['ID']
|
65
70
|
end
|
66
71
|
|
67
72
|
alias entry_id :id
|
@@ -15,7 +15,7 @@ module Bio
|
|
15
15
|
|
16
16
|
module Gff3Sequence
|
17
17
|
|
18
|
-
include Bio::GFFbrowser::Helpers::
|
18
|
+
include Bio::GFFbrowser::Helpers::Logger
|
19
19
|
|
20
20
|
|
21
21
|
# Patch a sequence together from a Sequence string and an array
|
@@ -61,13 +61,13 @@ module Bio
|
|
61
61
|
orf_frame = startpos - 1
|
62
62
|
orf_frameshift = orf_frame % 3
|
63
63
|
sectionlist = sectionlist.reverse if orf_reverse
|
64
|
-
|
64
|
+
if do_debug
|
65
65
|
debug options.to_s
|
66
66
|
debug [:reverse,do_reverse].to_s
|
67
67
|
debug [:complement,do_complement].to_s
|
68
68
|
debug [:trim,do_trim].to_s
|
69
69
|
debug [:orf_reverse, orf_reverse, rec0.strand].to_s
|
70
|
-
|
70
|
+
end
|
71
71
|
|
72
72
|
if sequence.kind_of?(Bio::FastaFormat)
|
73
73
|
# BioRuby conversion
|
@@ -80,18 +80,12 @@ module Bio
|
|
80
80
|
if do_reverse and orf_reverse
|
81
81
|
s = s.reverse
|
82
82
|
end
|
83
|
-
# Correct for phase. Unfortunately the use of phase is ambiguous.
|
84
|
-
# Here we check whether rec.start is in line with orf_frame. If it
|
85
|
-
# is, we correct for phase. Otherwise it is ignored.
|
86
83
|
if do_phase and rec.phase
|
87
84
|
phase = rec.phase.to_i
|
88
|
-
# if ((rec.start-startpos) % 3 == 0)
|
89
85
|
s = s[phase..-1]
|
90
|
-
# end
|
91
86
|
end
|
92
87
|
s
|
93
88
|
}
|
94
|
-
# p seq
|
95
89
|
seq = seq.join
|
96
90
|
if do_complement and do_reverse and orf_reverse
|
97
91
|
ntseq = Bio::Sequence::NA.new(seq)
|
data/lib/bio/output/gfflogger.rb
CHANGED
@@ -4,7 +4,7 @@ module Bio
|
|
4
4
|
|
5
5
|
module Helpers
|
6
6
|
|
7
|
-
module
|
7
|
+
module Logger
|
8
8
|
include Bio::Log
|
9
9
|
|
10
10
|
def debug str, id=''
|
@@ -27,6 +27,15 @@ module Bio
|
|
27
27
|
log.error_(str+" <#{id}>",:act => FailOnError.new)
|
28
28
|
|
29
29
|
end
|
30
|
+
|
31
|
+
def log_sys_info msg
|
32
|
+
log = LoggerPlus['bio-gff3']
|
33
|
+
rmem = `ps -o rss= -p #{Process.pid}`.to_i
|
34
|
+
vmem = `ps -o vsz= -p #{Process.pid}`.to_i
|
35
|
+
if rmem or vmem
|
36
|
+
log.info7 "Memory used #{msg} RAM #{rmem/1024}M, VMEM #{vmem/1024}M"
|
37
|
+
end
|
38
|
+
end
|
30
39
|
end
|
31
40
|
end
|
32
41
|
end
|
@@ -13,10 +13,11 @@ TEST2='test/data/gff/standard.gff3'
|
|
13
13
|
|
14
14
|
describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
|
15
15
|
|
16
|
+
|
16
17
|
it "should parse a file and yield records" do
|
17
18
|
iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
|
18
|
-
iter.each_rec do | id, rec |
|
19
|
-
|
19
|
+
iter.each_rec do | fpos, line |
|
20
|
+
rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
20
21
|
rec.io_seek.should == 51
|
21
22
|
break
|
22
23
|
end
|
@@ -25,8 +26,8 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
|
|
25
26
|
it "should handle embedded FASTA records" do
|
26
27
|
iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
|
27
28
|
last = nil
|
28
|
-
iter.each_rec do | id, rec |
|
29
|
-
|
29
|
+
iter.each_rec do | fpos, line |
|
30
|
+
rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
30
31
|
last = rec
|
31
32
|
end
|
32
33
|
last.io_seek.should == 3342
|
data/spec/gffdb_spec.rb
CHANGED
@@ -54,7 +54,13 @@ describe GFF3, "GFF3 API (InMemory) with everything in memory" do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => :cache_none" do
|
57
|
-
|
57
|
+
before :all do
|
58
|
+
# initialize
|
59
|
+
gff3 = Bio::GFFbrowser::GFF3.new(TESTGFF1, :cache_components => :cache_none, :cache_records => :cache_lru)
|
60
|
+
@gff = gff3.assembler
|
61
|
+
end
|
62
|
+
|
63
|
+
iterators_should_be_implemented
|
58
64
|
end
|
59
65
|
|
60
66
|
describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => 1000" do
|