bio-gff3 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -2
- data/Rakefile +13 -4
- data/VERSION +1 -1
- data/bin/gff3-fetch +50 -14
- data/bio-gff3.gemspec +15 -20
- data/lib/bio/db/gff/block/gffblockparser.rb +93 -0
- data/lib/bio/db/gff/digest/gffinmemory.rb +2 -0
- data/lib/bio/db/gff/digest/gfflrucache.rb +208 -0
- data/lib/bio/db/gff/digest/gffnocache.rb +28 -9
- data/lib/bio/db/gff/digest/gffparser.rb +1 -1
- data/lib/bio/db/gff/file/gfffileiterator.rb +16 -7
- data/lib/bio/db/gff/gff3.rb +15 -5
- data/lib/bio/db/gff/gff3parserec.rb +1 -6
- data/lib/bio/db/gff/gffcomponent.rb +8 -6
- data/lib/bio/db/gff/gffrecord.rb +13 -8
- data/lib/bio/db/gff/gffsection.rb +0 -1
- data/lib/bio/db/gff/gffsequence.rb +3 -9
- data/lib/bio/db/gff/gffvalidate.rb +1 -1
- data/lib/bio/output/gfflogger.rb +10 -1
- data/spec/gff3_fileiterator_spec.rb +5 -4
- data/spec/gffdb_spec.rb +7 -1
- data/spec/gffparserec.rb +1 -1
- data/test/data/regression/test_ext_gff3.rtest +4 -5
- data/test/data/regression/test_gff3.rtest +4 -5
- data/test/data/regression/test_lrucache_ext_gff3.rtest +64 -0
- data/test/data/regression/test_lrucache_gff3.rtest +68 -0
- data/test/data/regression/test_nocache_ext_gff3.rtest +2 -0
- data/test/data/regression/test_nocache_gff3.rtest +3 -6
- data/test/test_bio-gff3.rb +6 -1
- metadata +37 -77
@@ -21,19 +21,24 @@ module Bio
|
|
21
21
|
# record
|
22
22
|
module SeekRec
|
23
23
|
# Fetch a record using fh and file seek position
|
24
|
-
def SeekRec::fetch(fh,fpos)
|
24
|
+
def SeekRec::fetch(fh,fpos,parser)
|
25
25
|
return nil if fh==nil or fpos==nil
|
26
26
|
fh.seek(fpos)
|
27
|
-
|
27
|
+
if parser == :bioruby
|
28
|
+
GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
|
29
|
+
else
|
30
|
+
GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
|
31
|
+
end
|
28
32
|
end
|
29
33
|
end
|
30
34
|
|
31
35
|
# Helper class which gives Hash-like access to the
|
32
36
|
# no-cache GFF3 file
|
33
37
|
class SeekRecList
|
34
|
-
def initialize fh
|
38
|
+
def initialize fh, parser
|
35
39
|
@fh = fh
|
36
40
|
@h = {}
|
41
|
+
@parser = parser
|
37
42
|
end
|
38
43
|
|
39
44
|
def []= id, rec
|
@@ -44,7 +49,7 @@ module Bio
|
|
44
49
|
|
45
50
|
def [](id)
|
46
51
|
fpos = @h[id]
|
47
|
-
SeekRec::fetch(@fh,fpos)
|
52
|
+
SeekRec::fetch(@fh,fpos,@parser)
|
48
53
|
end
|
49
54
|
|
50
55
|
def each
|
@@ -56,7 +61,7 @@ module Bio
|
|
56
61
|
|
57
62
|
# List of ids
|
58
63
|
class SeekLinkedRecs < Hash
|
59
|
-
include Helpers::
|
64
|
+
include Helpers::Logger
|
60
65
|
def add id, rec
|
61
66
|
info "Adding #{rec.feature_type} <#{id}>"
|
62
67
|
self[id] = [] if self[id] == nil
|
@@ -91,14 +96,22 @@ module Bio
|
|
91
96
|
info "---- Digest DB and store data in mRNA Hash (NoCache)"
|
92
97
|
@count_ids = Counter.new # Count ids
|
93
98
|
@count_seqnames = Counter.new # Count seqnames
|
94
|
-
@componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
|
99
|
+
@componentlist = SeekRecList.new(@iter.fh,@options[:parser]) # Store containers, like genes, contigs
|
95
100
|
@orflist = SeekLinkedRecs.new # Store linked gene records
|
96
101
|
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
97
102
|
@cdslist = SeekLinkedRecs.new
|
98
103
|
@exonlist = SeekLinkedRecs.new
|
99
104
|
@sequencelist = {}
|
100
105
|
@unrecognized_features = {}
|
101
|
-
@iter.each_rec do |
|
106
|
+
@iter.each_rec do |fpos, line|
|
107
|
+
rec = case @options[:parser]
|
108
|
+
when :bioruby
|
109
|
+
Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
|
110
|
+
when :line
|
111
|
+
Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
112
|
+
else
|
113
|
+
raise 'Unknown parser'
|
114
|
+
end
|
102
115
|
store_record(rec)
|
103
116
|
end
|
104
117
|
@iter.each_sequence do | id, bioseq |
|
@@ -117,11 +130,17 @@ module Bio
|
|
117
130
|
list.each do | id, io_seeklist |
|
118
131
|
recs = []
|
119
132
|
io_seeklist.each do | fpos |
|
120
|
-
recs << SeekRec::fetch(fh,fpos)
|
133
|
+
recs << SeekRec::fetch(fh,fpos,@options[:parser])
|
121
134
|
end
|
122
135
|
seqid = recs[0].seqname
|
123
136
|
component = find_component(recs[0])
|
124
|
-
|
137
|
+
if @options[:no_assemble]
|
138
|
+
recs.each do | rec |
|
139
|
+
yield id, [rec], component
|
140
|
+
end
|
141
|
+
else
|
142
|
+
yield id, recs, component
|
143
|
+
end
|
125
144
|
end
|
126
145
|
end
|
127
146
|
|
@@ -13,7 +13,7 @@ module Bio
|
|
13
13
|
|
14
14
|
# FileRecord inherits from the BioRuby Record, but
|
15
15
|
# adds the file seek position.
|
16
|
-
class
|
16
|
+
class BioRubyFileRecord < Record
|
17
17
|
attr_accessor :io_seek
|
18
18
|
def initialize io_seek, buf
|
19
19
|
@io_seek = io_seek
|
@@ -21,6 +21,16 @@ module Bio
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
+
class FastParserFileRecord < GFFbrowser::FastLineRecord
|
25
|
+
attr_accessor :io_seek
|
26
|
+
|
27
|
+
include Bio::GFFbrowser::FastLineParser
|
28
|
+
def initialize io_seek, buf
|
29
|
+
@io_seek = io_seek
|
30
|
+
super(parse_line_fast(buf))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
24
34
|
# GFF3::FileIterator takes a file and yields GFF3 records with their
|
25
35
|
# seek position included in the record.
|
26
36
|
class FileIterator
|
@@ -31,9 +41,10 @@ module Bio
|
|
31
41
|
@fh = File.open(filename)
|
32
42
|
end
|
33
43
|
|
34
|
-
# Iterate over every record in the file, yielding the
|
35
|
-
#
|
36
|
-
def each_rec
|
44
|
+
# Iterate over every record in the file, yielding the seekpos
|
45
|
+
# and line containing the record
|
46
|
+
def each_rec
|
47
|
+
@fh.seek(0)
|
37
48
|
fpos = 0
|
38
49
|
@fh.each_line do | line |
|
39
50
|
line = line.strip
|
@@ -42,10 +53,8 @@ module Bio
|
|
42
53
|
break
|
43
54
|
end
|
44
55
|
if line.size != 0 and line !~ /^#/
|
45
|
-
rec = FileRecord.new(fpos, line)
|
46
56
|
lastpos = @fh.tell
|
47
|
-
|
48
|
-
yield id, rec
|
57
|
+
yield fpos, line
|
49
58
|
@fh.seek(lastpos) # reset filepos, just in case it changed
|
50
59
|
end
|
51
60
|
fpos = @fh.tell
|
data/lib/bio/db/gff/gff3.rb
CHANGED
@@ -9,6 +9,8 @@
|
|
9
9
|
|
10
10
|
require 'bio/db/gff/digest/gffinmemory'
|
11
11
|
require 'bio/db/gff/digest/gffnocache'
|
12
|
+
require 'bio/db/gff/digest/gfflrucache'
|
13
|
+
require 'bio/db/gff/block/gffblockparser'
|
12
14
|
|
13
15
|
module Bio
|
14
16
|
module GFFbrowser
|
@@ -16,16 +18,24 @@ module Bio
|
|
16
18
|
attr_reader :assembler
|
17
19
|
|
18
20
|
include Digest
|
21
|
+
include Block
|
19
22
|
|
20
23
|
# Initialize a GFF parser
|
21
24
|
def initialize filename, options = {}
|
25
|
+
options[:parser] = :line if options[:parser] == nil
|
22
26
|
cache_recs = options[:cache_records]
|
23
27
|
@assembler =
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
if options[:block]
|
29
|
+
GffBlockParser.new(filename, options)
|
30
|
+
else
|
31
|
+
case cache_recs
|
32
|
+
when :cache_none
|
33
|
+
NoCache.new(filename, options)
|
34
|
+
when :cache_lru
|
35
|
+
LruCache.new(filename, options)
|
36
|
+
else
|
37
|
+
InMemory.new(filename, options) # default
|
38
|
+
end
|
29
39
|
end
|
30
40
|
end
|
31
41
|
|
@@ -24,7 +24,7 @@ module Bio
|
|
24
24
|
#
|
25
25
|
module FastLineParser
|
26
26
|
|
27
|
-
include Helpers::
|
27
|
+
include Helpers::Logger
|
28
28
|
|
29
29
|
# Returns a (partial) record, assuming it is a valid GFF3
|
30
30
|
# format, no validation takes place, other than field counting (!)
|
@@ -44,11 +44,6 @@ module Bio
|
|
44
44
|
return nil
|
45
45
|
end
|
46
46
|
|
47
|
-
fs[GFF3_START] = fs[GFF3_START].to_i
|
48
|
-
fs[GFF3_END] = fs[GFF3_END].to_i
|
49
|
-
fs[GFF3_SCORE] = fs[GFF3_SCORE].to_f
|
50
|
-
fs[GFF3_PHASE] = fs[GFF3_PHASE].to_i
|
51
|
-
fs[GFF3_ATTRIBUTES] = parse_attributes_fast(fs[GFF3_ATTRIBUTES],options)
|
52
47
|
fs
|
53
48
|
end
|
54
49
|
|
@@ -7,13 +7,15 @@
|
|
7
7
|
#
|
8
8
|
# Fetch information from a GFF file
|
9
9
|
|
10
|
+
require 'set'
|
11
|
+
|
10
12
|
module Bio
|
11
13
|
module GFFbrowser
|
12
14
|
|
13
15
|
module Helpers
|
14
16
|
|
15
17
|
module Record
|
16
|
-
include
|
18
|
+
include Logger
|
17
19
|
# Format a record ID by, first, getting the ID attribute. If that fails
|
18
20
|
# the seqname is used with the start/stop positions.
|
19
21
|
def Record::formatID rec
|
@@ -33,11 +35,11 @@ module Bio
|
|
33
35
|
|
34
36
|
module Gff3Component
|
35
37
|
|
36
|
-
include
|
38
|
+
include Logger
|
37
39
|
|
38
|
-
COMPONENT_TYPES = %w{
|
40
|
+
COMPONENT_TYPES = Set.new(%w{
|
39
41
|
gene SO:0000704 contig transcript Component region
|
40
|
-
}
|
42
|
+
})
|
41
43
|
|
42
44
|
# Walk the component list to find a matching component/container for a
|
43
45
|
# record. First use the parent ID. If that is missing go by sequence
|
@@ -80,13 +82,13 @@ module Bio
|
|
80
82
|
module Gff3Features
|
81
83
|
|
82
84
|
# Ignore the following features (case sensitive?)
|
83
|
-
IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
|
85
|
+
IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + Set.new(%w{
|
84
86
|
transposon Match similarity UTR
|
85
87
|
TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
|
86
88
|
polyA_site SO:0000553
|
87
89
|
five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
|
88
90
|
exon SO:0000147
|
89
|
-
}
|
91
|
+
})
|
90
92
|
end
|
91
93
|
|
92
94
|
end
|
data/lib/bio/db/gff/gffrecord.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'bio/db/gff/gff3parserec'
|
2
|
+
|
1
3
|
module Bio
|
2
4
|
module GFFbrowser
|
3
5
|
|
@@ -7,6 +9,9 @@ module Bio
|
|
7
9
|
|
8
10
|
# Using the fast line parser
|
9
11
|
class FastLineRecord < Record
|
12
|
+
|
13
|
+
include FastLineParser
|
14
|
+
|
10
15
|
def initialize fields
|
11
16
|
@fields = fields
|
12
17
|
end
|
@@ -16,27 +21,27 @@ module Bio
|
|
16
21
|
end
|
17
22
|
|
18
23
|
def seqid
|
19
|
-
@fields[GFF3_SEQID]
|
24
|
+
@seqid_ ||= @fields[GFF3_SEQID]
|
20
25
|
end
|
21
26
|
|
22
27
|
alias seqname :seqid
|
23
28
|
|
24
29
|
def phase
|
25
|
-
@fields[GFF3_PHASE]
|
30
|
+
@phase_ ||= @fields[GFF3_PHASE].to_i
|
26
31
|
end
|
27
32
|
|
28
33
|
alias frame :phase
|
29
34
|
|
30
35
|
def start
|
31
|
-
@fields[GFF3_START]
|
36
|
+
@start_ ||= @fields[GFF3_START].to_i
|
32
37
|
end
|
33
38
|
|
34
39
|
def end
|
35
|
-
@fields[GFF3_END]
|
40
|
+
@end_ ||= @fields[GFF3_END].to_i
|
36
41
|
end
|
37
42
|
|
38
43
|
def score
|
39
|
-
@fields[GFF3_SCORE]
|
44
|
+
@score_ ||= @fields[GFF3_SCORE].to_f
|
40
45
|
end
|
41
46
|
|
42
47
|
def strand
|
@@ -44,7 +49,7 @@ module Bio
|
|
44
49
|
end
|
45
50
|
|
46
51
|
def feature
|
47
|
-
@fields[GFF3_TYPE]
|
52
|
+
@feature_ ||= @fields[GFF3_TYPE]
|
48
53
|
end
|
49
54
|
|
50
55
|
alias feature_type :feature
|
@@ -53,7 +58,7 @@ module Bio
|
|
53
58
|
end
|
54
59
|
|
55
60
|
def attributes
|
56
|
-
@fields[GFF3_ATTRIBUTES]
|
61
|
+
@attributes_ ||= parse_attributes_fast(@fields[GFF3_ATTRIBUTES])
|
57
62
|
end
|
58
63
|
|
59
64
|
def get_attribute name
|
@@ -61,7 +66,7 @@ module Bio
|
|
61
66
|
end
|
62
67
|
|
63
68
|
def id
|
64
|
-
attributes['ID']
|
69
|
+
@id_ ||= attributes['ID']
|
65
70
|
end
|
66
71
|
|
67
72
|
alias entry_id :id
|
@@ -15,7 +15,7 @@ module Bio
|
|
15
15
|
|
16
16
|
module Gff3Sequence
|
17
17
|
|
18
|
-
include Bio::GFFbrowser::Helpers::
|
18
|
+
include Bio::GFFbrowser::Helpers::Logger
|
19
19
|
|
20
20
|
|
21
21
|
# Patch a sequence together from a Sequence string and an array
|
@@ -61,13 +61,13 @@ module Bio
|
|
61
61
|
orf_frame = startpos - 1
|
62
62
|
orf_frameshift = orf_frame % 3
|
63
63
|
sectionlist = sectionlist.reverse if orf_reverse
|
64
|
-
|
64
|
+
if do_debug
|
65
65
|
debug options.to_s
|
66
66
|
debug [:reverse,do_reverse].to_s
|
67
67
|
debug [:complement,do_complement].to_s
|
68
68
|
debug [:trim,do_trim].to_s
|
69
69
|
debug [:orf_reverse, orf_reverse, rec0.strand].to_s
|
70
|
-
|
70
|
+
end
|
71
71
|
|
72
72
|
if sequence.kind_of?(Bio::FastaFormat)
|
73
73
|
# BioRuby conversion
|
@@ -80,18 +80,12 @@ module Bio
|
|
80
80
|
if do_reverse and orf_reverse
|
81
81
|
s = s.reverse
|
82
82
|
end
|
83
|
-
# Correct for phase. Unfortunately the use of phase is ambiguous.
|
84
|
-
# Here we check whether rec.start is in line with orf_frame. If it
|
85
|
-
# is, we correct for phase. Otherwise it is ignored.
|
86
83
|
if do_phase and rec.phase
|
87
84
|
phase = rec.phase.to_i
|
88
|
-
# if ((rec.start-startpos) % 3 == 0)
|
89
85
|
s = s[phase..-1]
|
90
|
-
# end
|
91
86
|
end
|
92
87
|
s
|
93
88
|
}
|
94
|
-
# p seq
|
95
89
|
seq = seq.join
|
96
90
|
if do_complement and do_reverse and orf_reverse
|
97
91
|
ntseq = Bio::Sequence::NA.new(seq)
|
data/lib/bio/output/gfflogger.rb
CHANGED
@@ -4,7 +4,7 @@ module Bio
|
|
4
4
|
|
5
5
|
module Helpers
|
6
6
|
|
7
|
-
module
|
7
|
+
module Logger
|
8
8
|
include Bio::Log
|
9
9
|
|
10
10
|
def debug str, id=''
|
@@ -27,6 +27,15 @@ module Bio
|
|
27
27
|
log.error_(str+" <#{id}>",:act => FailOnError.new)
|
28
28
|
|
29
29
|
end
|
30
|
+
|
31
|
+
def log_sys_info msg
|
32
|
+
log = LoggerPlus['bio-gff3']
|
33
|
+
rmem = `ps -o rss= -p #{Process.pid}`.to_i
|
34
|
+
vmem = `ps -o vsz= -p #{Process.pid}`.to_i
|
35
|
+
if rmem or vmem
|
36
|
+
log.info7 "Memory used #{msg} RAM #{rmem/1024}M, VMEM #{vmem/1024}M"
|
37
|
+
end
|
38
|
+
end
|
30
39
|
end
|
31
40
|
end
|
32
41
|
end
|
@@ -13,10 +13,11 @@ TEST2='test/data/gff/standard.gff3'
|
|
13
13
|
|
14
14
|
describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
|
15
15
|
|
16
|
+
|
16
17
|
it "should parse a file and yield records" do
|
17
18
|
iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
|
18
|
-
iter.each_rec do |
|
19
|
-
|
19
|
+
iter.each_rec do | fpos, line |
|
20
|
+
rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
20
21
|
rec.io_seek.should == 51
|
21
22
|
break
|
22
23
|
end
|
@@ -25,8 +26,8 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
|
|
25
26
|
it "should handle embedded FASTA records" do
|
26
27
|
iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
|
27
28
|
last = nil
|
28
|
-
iter.each_rec do |
|
29
|
-
|
29
|
+
iter.each_rec do | fpos, line |
|
30
|
+
rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
30
31
|
last = rec
|
31
32
|
end
|
32
33
|
last.io_seek.should == 3342
|
data/spec/gffdb_spec.rb
CHANGED
@@ -54,7 +54,13 @@ describe GFF3, "GFF3 API (InMemory) with everything in memory" do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => :cache_none" do
|
57
|
-
|
57
|
+
before :all do
|
58
|
+
# initialize
|
59
|
+
gff3 = Bio::GFFbrowser::GFF3.new(TESTGFF1, :cache_components => :cache_none, :cache_records => :cache_lru)
|
60
|
+
@gff = gff3.assembler
|
61
|
+
end
|
62
|
+
|
63
|
+
iterators_should_be_implemented
|
58
64
|
end
|
59
65
|
|
60
66
|
describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => 1000" do
|