bio-gff3 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -2
- data/Rakefile +13 -4
- data/VERSION +1 -1
- data/bin/gff3-fetch +50 -14
- data/bio-gff3.gemspec +15 -20
- data/lib/bio/db/gff/block/gffblockparser.rb +93 -0
- data/lib/bio/db/gff/digest/gffinmemory.rb +2 -0
- data/lib/bio/db/gff/digest/gfflrucache.rb +208 -0
- data/lib/bio/db/gff/digest/gffnocache.rb +28 -9
- data/lib/bio/db/gff/digest/gffparser.rb +1 -1
- data/lib/bio/db/gff/file/gfffileiterator.rb +16 -7
- data/lib/bio/db/gff/gff3.rb +15 -5
- data/lib/bio/db/gff/gff3parserec.rb +1 -6
- data/lib/bio/db/gff/gffcomponent.rb +8 -6
- data/lib/bio/db/gff/gffrecord.rb +13 -8
- data/lib/bio/db/gff/gffsection.rb +0 -1
- data/lib/bio/db/gff/gffsequence.rb +3 -9
- data/lib/bio/db/gff/gffvalidate.rb +1 -1
- data/lib/bio/output/gfflogger.rb +10 -1
- data/spec/gff3_fileiterator_spec.rb +5 -4
- data/spec/gffdb_spec.rb +7 -1
- data/spec/gffparserec.rb +1 -1
- data/test/data/regression/test_ext_gff3.rtest +4 -5
- data/test/data/regression/test_gff3.rtest +4 -5
- data/test/data/regression/test_lrucache_ext_gff3.rtest +64 -0
- data/test/data/regression/test_lrucache_gff3.rtest +68 -0
- data/test/data/regression/test_nocache_ext_gff3.rtest +2 -0
- data/test/data/regression/test_nocache_gff3.rtest +3 -6
- data/test/test_bio-gff3.rb +6 -1
- metadata +37 -77
data/Gemfile
CHANGED
@@ -2,7 +2,8 @@ source "http://rubygems.org"
|
|
2
2
|
|
3
3
|
# Runtime dependencies
|
4
4
|
gem "bio", ">= 1.3.1"
|
5
|
-
gem "
|
5
|
+
gem "log4r", "> 1.1.6"
|
6
|
+
gem "bio-logger", "> 0.8.0"
|
6
7
|
|
7
8
|
# Add dependencies to develop your gem here.
|
8
9
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -11,5 +12,5 @@ group :development do
|
|
11
12
|
gem "bundler", "~> 1.0.0"
|
12
13
|
gem "jeweler", "~> 1.5.2"
|
13
14
|
gem "rcov", ">= 0"
|
14
|
-
gem "rspec", ">= 2.
|
15
|
+
gem "rspec", ">= 2.3.0"
|
15
16
|
end
|
data/Rakefile
CHANGED
@@ -24,10 +24,9 @@ Support for external FASTA files.
|
|
24
24
|
gem.authors = ["Pjotr Prins"]
|
25
25
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
26
26
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
27
|
-
gem.add_runtime_dependency 'bio', '>= 1.4.1'
|
28
|
-
gem.add_runtime_dependency 'log4r', '> 1.1.6'
|
29
|
-
gem.add_runtime_dependency 'bio-logger', '>= 0.
|
30
|
-
gem.add_development_dependency 'rspec', '> 2.0'
|
27
|
+
# gem.add_runtime_dependency 'bio', '>= 1.4.1'
|
28
|
+
# gem.add_runtime_dependency 'log4r', '> 1.1.6'
|
29
|
+
# gem.add_runtime_dependency 'bio-logger', '>= 0.8.0'
|
31
30
|
end
|
32
31
|
Jeweler::RubygemsDotOrgTasks.new
|
33
32
|
|
@@ -36,8 +35,18 @@ Rake::TestTask.new(:test) do |test|
|
|
36
35
|
test.libs << 'lib' << 'test'
|
37
36
|
test.pattern = 'test/**/test_*.rb'
|
38
37
|
test.verbose = true
|
38
|
+
Kernel.system('rspec spec/*.rb')
|
39
39
|
end
|
40
40
|
|
41
|
+
#require 'spec/rake/spectask'
|
42
|
+
#Spec::Rake::SpecTask.new(:spec) do |t|
|
43
|
+
# t.spec_files = Dir.glob('spec/**/*_spec.rb')
|
44
|
+
# t.spec_opts << '--format specdoc'
|
45
|
+
# t.warning = true
|
46
|
+
# t.rcov = true
|
47
|
+
#end
|
48
|
+
|
49
|
+
|
41
50
|
require 'rcov/rcovtask'
|
42
51
|
Rcov::RcovTask.new do |test|
|
43
52
|
test.libs << 'test'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.
|
1
|
+
0.8.6
|
data/bin/gff3-fetch
CHANGED
@@ -25,16 +25,16 @@ USAGE = <<EOM
|
|
25
25
|
CDS : assemble CDS
|
26
26
|
exon : list all exons
|
27
27
|
gene|ORF : list gene ORFs
|
28
|
-
other : use any type from GFF3 definition, e.g. 'Terminate'
|
28
|
+
other : use any type from GFF3 definition, e.g. 'Terminate'
|
29
29
|
|
30
30
|
and the following performance options:
|
31
31
|
|
32
32
|
--parser bioruby : use BioRuby GFF3 parser (slow)
|
33
|
-
--parser line : use GFF3 line parser
|
34
|
-
--
|
33
|
+
--parser line : use GFF3 line parser (faster, default)
|
34
|
+
--block : parse GFF3 by block (optimistic) -- NYI
|
35
35
|
--cache full : load all in RAM (fast, default)
|
36
36
|
--cache none : do not load anything in memory (slow)
|
37
|
-
--cache lru : use
|
37
|
+
--cache lru : use least recently used cache (limit RAM use, fast) -- NYI
|
38
38
|
--max-cpus num : use num threads -- NYI
|
39
39
|
--emboss : use EMBOSS translation (fast) -- NYI
|
40
40
|
|
@@ -74,21 +74,45 @@ USAGE = <<EOM
|
|
74
74
|
|
75
75
|
gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR
|
76
76
|
|
77
|
+
Fine tuning output - show messages matching regex
|
78
|
+
|
79
|
+
gff3-fetch mRNA test/data/gff/test.gff3 --trace '=msg =~ /component/'
|
80
|
+
|
77
81
|
Fine tuning output - write log messages to file.log
|
78
82
|
|
79
83
|
gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR --logger file.log
|
80
84
|
|
85
|
+
For more information on output, see the bioruby-logger plugin.
|
86
|
+
|
81
87
|
== Performance
|
82
88
|
|
83
89
|
time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 2> /dev/null > test.fa
|
84
90
|
|
85
|
-
|
91
|
+
Digesting parser:
|
92
|
+
|
93
|
+
Cache real user sys version RAM
|
86
94
|
------------------------------------------------------------
|
87
95
|
full,bioruby 12m41 12m28 0m09 (0.8.0)
|
88
96
|
full,line 12m13 12m06 0m07 (0.8.5)
|
89
|
-
|
90
|
-
|
91
|
-
|
97
|
+
full,line,lazy 11m51 11m43 0m07 (0.8.6) 6,600M
|
98
|
+
|
99
|
+
none,bioruby 504m 477m 26m50 (0.8.0)
|
100
|
+
none,line 297m 267m 28m36 (0.8.5)
|
101
|
+
none,line,lazy 132m 106m 26m01 (0.8.6) 650M
|
102
|
+
|
103
|
+
lru,bioruby 533m 510m 22m47 (0.8.5)
|
104
|
+
lru,line 353m 326m 26m44 (0.8.5) 1K
|
105
|
+
lru,line 305m 281m 22m30 (0.8.5) 10K
|
106
|
+
lru,line,lazy 182m 161m 21m10 (0.8.6) 10K
|
107
|
+
lru,line,lazy 75m 75m 0m17 (0.8.6) 50K 730M
|
108
|
+
------------------------------------------------------------
|
109
|
+
|
110
|
+
Block parser:
|
111
|
+
|
112
|
+
Cache real user sys gff3 version
|
113
|
+
------------------------------------------------------------
|
114
|
+
in preparation
|
115
|
+
------------------------------------------------------------
|
92
116
|
|
93
117
|
where
|
94
118
|
|
@@ -132,7 +156,7 @@ Bio::Log::CLI.trace('info')
|
|
132
156
|
options = OpenStruct.new()
|
133
157
|
|
134
158
|
# ---- Default options
|
135
|
-
options.parser = :
|
159
|
+
options.parser = :line
|
136
160
|
|
137
161
|
opts = OptionParser.new() { |opts|
|
138
162
|
opts.on_tail("-h", "--help", "Show help and examples") {
|
@@ -141,7 +165,7 @@ opts = OptionParser.new() { |opts|
|
|
141
165
|
exit()
|
142
166
|
}
|
143
167
|
|
144
|
-
opts.on("--parser [bioruby,line]", String, "Parser (default
|
168
|
+
opts.on("--parser [bioruby,line]", String, "Parser (default line)") do |p|
|
145
169
|
case p.downcase
|
146
170
|
when 'bioruby'
|
147
171
|
options.parser = :bioruby
|
@@ -152,12 +176,12 @@ opts = OptionParser.new() { |opts|
|
|
152
176
|
end
|
153
177
|
end
|
154
178
|
|
155
|
-
opts.on("--cache [none
|
179
|
+
opts.on("--cache [full,lru,none]", String, "Caching (default full)") do |cache|
|
156
180
|
case cache.downcase
|
157
181
|
when 'none'
|
158
182
|
options.cache = :cache_none
|
159
|
-
|
160
|
-
|
183
|
+
when 'lru'
|
184
|
+
options.cache = :cache_lru
|
161
185
|
when 'full'
|
162
186
|
options.cache = :cache_full
|
163
187
|
else
|
@@ -165,6 +189,10 @@ opts = OptionParser.new() { |opts|
|
|
165
189
|
end
|
166
190
|
end
|
167
191
|
|
192
|
+
opts.on("--block", "Parse by block") do |b|
|
193
|
+
options.block = true
|
194
|
+
end
|
195
|
+
|
168
196
|
opts.on("--no-assemble", "output sequences without assembling") do |b|
|
169
197
|
options.no_assemble = true
|
170
198
|
end
|
@@ -229,6 +257,7 @@ ARGV.each do | fn |
|
|
229
257
|
opts = {}
|
230
258
|
opts[:validate] = options.validate
|
231
259
|
opts[:parser] = options.parser
|
260
|
+
opts[:block] = options.block
|
232
261
|
opts[:cache_components] = options.cache
|
233
262
|
opts[:cache_records] = options.cache
|
234
263
|
opts[:fasta_filename] = fastafn if fastafn
|
@@ -238,6 +267,10 @@ ARGV.each do | fn |
|
|
238
267
|
opts[:phase] = options.phase
|
239
268
|
opts[:debug] = options.debug
|
240
269
|
|
270
|
+
include Bio::GFFbrowser::Helpers::Logger
|
271
|
+
debug $:.to_s
|
272
|
+
debug opts.to_s
|
273
|
+
log_sys_info("BaseLine")
|
241
274
|
gff3 = Bio::GFFbrowser::GFF3.new(fn,opts)
|
242
275
|
|
243
276
|
gff = gff3.assembler
|
@@ -265,8 +298,11 @@ ARGV.each do | fn |
|
|
265
298
|
writer.put(id,seq)
|
266
299
|
end
|
267
300
|
else
|
268
|
-
|
301
|
+
gff.each_seq(gfftype.downcase) do | id, seq |
|
302
|
+
writer.put(id,seq)
|
303
|
+
end
|
269
304
|
end
|
305
|
+
log_sys_info("Done")
|
270
306
|
fastafn = nil
|
271
307
|
end
|
272
308
|
|
data/bio-gff3.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-gff3}
|
8
|
-
s.version = "0.8.
|
8
|
+
s.version = "0.8.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-17}
|
13
13
|
s.default_executable = %q{gff3-fetch}
|
14
14
|
s.description = %q{GFF3 (genome browser) information and digest mRNA and CDS sequences.
|
15
15
|
Options for low memory use and caching of records.
|
@@ -31,7 +31,9 @@ Support for external FASTA files.
|
|
31
31
|
"bin/gff3-fetch",
|
32
32
|
"bio-gff3.gemspec",
|
33
33
|
"lib/bio-gff3.rb",
|
34
|
+
"lib/bio/db/gff/block/gffblockparser.rb",
|
34
35
|
"lib/bio/db/gff/digest/gffinmemory.rb",
|
36
|
+
"lib/bio/db/gff/digest/gfflrucache.rb",
|
35
37
|
"lib/bio/db/gff/digest/gffnocache.rb",
|
36
38
|
"lib/bio/db/gff/digest/gffparser.rb",
|
37
39
|
"lib/bio/db/gff/file/gfffasta.rb",
|
@@ -65,6 +67,8 @@ Support for external FASTA files.
|
|
65
67
|
"test/data/gff/test.gff3",
|
66
68
|
"test/data/regression/test_ext_gff3.rtest",
|
67
69
|
"test/data/regression/test_gff3.rtest",
|
70
|
+
"test/data/regression/test_lrucache_ext_gff3.rtest",
|
71
|
+
"test/data/regression/test_lrucache_gff3.rtest",
|
68
72
|
"test/data/regression/test_nocache_ext_gff3.rtest",
|
69
73
|
"test/data/regression/test_nocache_gff3.rtest",
|
70
74
|
"test/helper.rb",
|
@@ -94,41 +98,32 @@ Support for external FASTA files.
|
|
94
98
|
|
95
99
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
96
100
|
s.add_runtime_dependency(%q<bio>, [">= 1.3.1"])
|
97
|
-
s.add_runtime_dependency(%q<
|
101
|
+
s.add_runtime_dependency(%q<log4r>, ["> 1.1.6"])
|
102
|
+
s.add_runtime_dependency(%q<bio-logger>, ["> 0.8.0"])
|
98
103
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
99
104
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
100
105
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
101
106
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
102
|
-
s.add_development_dependency(%q<rspec>, [">= 2.
|
103
|
-
s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])
|
104
|
-
s.add_runtime_dependency(%q<log4r>, ["> 1.1.6"])
|
105
|
-
s.add_runtime_dependency(%q<bio-logger>, [">= 0.6.1"])
|
106
|
-
s.add_development_dependency(%q<rspec>, ["> 2.0"])
|
107
|
+
s.add_development_dependency(%q<rspec>, [">= 2.3.0"])
|
107
108
|
else
|
108
109
|
s.add_dependency(%q<bio>, [">= 1.3.1"])
|
109
|
-
s.add_dependency(%q<
|
110
|
+
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
111
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
110
112
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
111
113
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
112
114
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
113
115
|
s.add_dependency(%q<rcov>, [">= 0"])
|
114
|
-
s.add_dependency(%q<rspec>, [">= 2.
|
115
|
-
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
116
|
-
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
117
|
-
s.add_dependency(%q<bio-logger>, [">= 0.6.1"])
|
118
|
-
s.add_dependency(%q<rspec>, ["> 2.0"])
|
116
|
+
s.add_dependency(%q<rspec>, [">= 2.3.0"])
|
119
117
|
end
|
120
118
|
else
|
121
119
|
s.add_dependency(%q<bio>, [">= 1.3.1"])
|
122
|
-
s.add_dependency(%q<
|
120
|
+
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
121
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
123
122
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
124
123
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
125
124
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
126
125
|
s.add_dependency(%q<rcov>, [">= 0"])
|
127
|
-
s.add_dependency(%q<rspec>, [">= 2.
|
128
|
-
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
129
|
-
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
130
|
-
s.add_dependency(%q<bio-logger>, [">= 0.6.1"])
|
131
|
-
s.add_dependency(%q<rspec>, ["> 2.0"])
|
126
|
+
s.add_dependency(%q<rspec>, [">= 2.3.0"])
|
132
127
|
end
|
133
128
|
end
|
134
129
|
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
module Bio
|
3
|
+
module GFFbrowser
|
4
|
+
|
5
|
+
module Block
|
6
|
+
|
7
|
+
# The block parser simplifies parsing, by assuming GFF3 is
|
8
|
+
# organised into blocks. All relevant information is
|
9
|
+
# resolved a block at a time.
|
10
|
+
class GffBlockParser
|
11
|
+
include FastLineParser
|
12
|
+
|
13
|
+
def initialize filename, options
|
14
|
+
info "Starting block parser"
|
15
|
+
@filename = filename
|
16
|
+
@options = options
|
17
|
+
@iter = Bio::GFF::GFF3::FileIterator.new(@filename)
|
18
|
+
end
|
19
|
+
|
20
|
+
def parse(gfftype)
|
21
|
+
@inseqidlist = {}
|
22
|
+
# Fetch FASTA first
|
23
|
+
@sequencelist = {}
|
24
|
+
if @options[:fasta_filename]
|
25
|
+
File.open(@options[:fasta_filename]) do | f |
|
26
|
+
fasta = Bio::GFF::FastaReader.new(f)
|
27
|
+
fasta.each do | id, fastarec |
|
28
|
+
# p fastarec
|
29
|
+
@sequencelist[id] = fastarec
|
30
|
+
end
|
31
|
+
end
|
32
|
+
else
|
33
|
+
# Embedded FASTA
|
34
|
+
@iter.each_sequence do | id, bioseq |
|
35
|
+
@sequencelist[id] = bioseq.to_s
|
36
|
+
end
|
37
|
+
end
|
38
|
+
seqid = nil
|
39
|
+
recs = []
|
40
|
+
@iter.each_rec do | fpos, line |
|
41
|
+
rec = FastLineRecord.new(parse_line_fast(line))
|
42
|
+
if seqid != rec.seqid
|
43
|
+
# starting a new block
|
44
|
+
if @inseqidlist[rec.seqid]
|
45
|
+
# not a well formed GFF3 file, we need
|
46
|
+
# to drop
|
47
|
+
error "GFF3 file not sorted, falling back to line parser"
|
48
|
+
raise "ERROR, bailing out"
|
49
|
+
end
|
50
|
+
parse_block(gfftype,recs,@sequencelist[seqid]) { | id, seq | yield id,seq } if seqid
|
51
|
+
recs = []
|
52
|
+
seqid = rec.seqid
|
53
|
+
@inseqidlist[seqid] = true
|
54
|
+
end
|
55
|
+
recs.push rec
|
56
|
+
end
|
57
|
+
parse_block(gfftype,recs,@sequencelist[seqid]) { | id, seq | yield id,seq } if seqid
|
58
|
+
end
|
59
|
+
|
60
|
+
# Parse sequence objects sharing the same seqid
|
61
|
+
# and yield each +gfftype+ as an id,seq
|
62
|
+
def parse_block gfftype, recs, sequence
|
63
|
+
recs.each do | rec |
|
64
|
+
if rec.feature_type.downcase == gfftype
|
65
|
+
yield rec.id, sequence[rec.start-1..rec.end-1]
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def each_seq(gfftype)
|
71
|
+
parse(gfftype) { | id, seq | yield id,seq }
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
def each_gene_seq
|
76
|
+
each_seq('gene') { | id, seq | yield id,seq }
|
77
|
+
end
|
78
|
+
def each_mRNA_seq
|
79
|
+
each_seq('mrna') { | id, seq | yield id,seq }
|
80
|
+
|
81
|
+
end
|
82
|
+
def each_exon_seq
|
83
|
+
each_seq('exon') { | id, seq | yield id,seq }
|
84
|
+
|
85
|
+
end
|
86
|
+
def each_CDS_seq
|
87
|
+
each_seq('cds') { | id, seq | yield id,seq }
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -0,0 +1,208 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gfflrucache.rb - Assemble mRNA and CDS from GFF by LRU cache
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file without using RAM - using a least
|
9
|
+
# recently used cache 'LRU' - also check out the caching edition,
|
10
|
+
# which uses limited amounts of RAM.
|
11
|
+
#
|
12
|
+
# In effect it is NoCache with parser recs cached in the LRU hash
|
13
|
+
|
14
|
+
require 'bio/db/gff/digest/gffparser'
|
15
|
+
require 'bio/system/lruhash'
|
16
|
+
|
17
|
+
module Bio
|
18
|
+
module GFFbrowser
|
19
|
+
|
20
|
+
module Digest
|
21
|
+
|
22
|
+
module LruCacheHelpers
|
23
|
+
|
24
|
+
# Module to fetch a line from a GFF3 file and return a parsed
|
25
|
+
# record
|
26
|
+
module SeekRec
|
27
|
+
# Fetch a record using fh and file seek position
|
28
|
+
def SeekRec::fetch(fh,fpos,parser)
|
29
|
+
return nil if fh==nil or fpos==nil
|
30
|
+
fh.seek(fpos)
|
31
|
+
if parser == :bioruby
|
32
|
+
GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
|
33
|
+
else
|
34
|
+
GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
module LruRec
|
40
|
+
# Fetch a record using fh and file seek position,
|
41
|
+
# utilising the LRU cache
|
42
|
+
def fetch(fh,fpos,parser)
|
43
|
+
return nil if fh==nil or fpos==nil
|
44
|
+
rec = @lru[fpos]
|
45
|
+
if rec==nil
|
46
|
+
rec = SeekRec::fetch(fh,fpos,parser)
|
47
|
+
@lru[fpos] = rec
|
48
|
+
end
|
49
|
+
rec
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Helper class which gives Hash-like access to the
|
54
|
+
# no-cache GFF3 file
|
55
|
+
class SeekRecList
|
56
|
+
include LruRec
|
57
|
+
|
58
|
+
def initialize fh, parser, lru
|
59
|
+
@fh = fh
|
60
|
+
@h = {}
|
61
|
+
@parser = parser
|
62
|
+
@lru = lru
|
63
|
+
end
|
64
|
+
|
65
|
+
def []= id, rec
|
66
|
+
raise "id #{id} occurs twice!" if @h[id]
|
67
|
+
fpos = rec.io_seek
|
68
|
+
@h[id] = fpos
|
69
|
+
end
|
70
|
+
|
71
|
+
def [](id)
|
72
|
+
fpos = @h[id]
|
73
|
+
fetch(@fh,fpos,@parser)
|
74
|
+
end
|
75
|
+
|
76
|
+
def each
|
77
|
+
@h.each do | id,fpos |
|
78
|
+
yield id, self[id]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# List of ids
|
84
|
+
class SeekLinkedRecs < Hash
|
85
|
+
include Helpers::Logger
|
86
|
+
def add id, rec
|
87
|
+
info "Adding #{rec.feature_type} <#{id}>"
|
88
|
+
self[id] = [] if self[id] == nil
|
89
|
+
self[id] << rec.io_seek
|
90
|
+
end
|
91
|
+
# validation is switched off for LruCache
|
92
|
+
def validate_seqname
|
93
|
+
end
|
94
|
+
# validation is switched off for LruCache
|
95
|
+
def validate_nonoverlapping
|
96
|
+
end
|
97
|
+
# validation is switched off for LruCache
|
98
|
+
def validate_shared_parent
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
class LruTracker
|
104
|
+
include Helpers::Logger
|
105
|
+
attr_accessor :hits, :misses, :calls
|
106
|
+
attr_reader :cache
|
107
|
+
|
108
|
+
def initialize
|
109
|
+
@cache = LRUHash.new 50000
|
110
|
+
@hits = 0
|
111
|
+
@misses = 0
|
112
|
+
@calls = 0
|
113
|
+
end
|
114
|
+
|
115
|
+
def [](name)
|
116
|
+
@calls += 1
|
117
|
+
item = @cache[name]
|
118
|
+
if @cache[name] == nil
|
119
|
+
@misses += 1
|
120
|
+
else
|
121
|
+
@hits += 1
|
122
|
+
end
|
123
|
+
item
|
124
|
+
end
|
125
|
+
|
126
|
+
def []=(name,item)
|
127
|
+
@cache[name] = item
|
128
|
+
end
|
129
|
+
def display msg
|
130
|
+
info "Cache calls #{msg} = #{@calls}"
|
131
|
+
info "Cache hits #{msg} = #{@hits}"
|
132
|
+
info "Cache misses #{msg} = #{@misses}"
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
class LruCache
|
137
|
+
include Parser
|
138
|
+
include LruCacheHelpers
|
139
|
+
include Gff3Sequence
|
140
|
+
include LruRec
|
141
|
+
|
142
|
+
def initialize filename, options
|
143
|
+
@filename = filename
|
144
|
+
@options = options
|
145
|
+
@iter = Bio::GFF::GFF3::FileIterator.new(@filename)
|
146
|
+
@lru = LruTracker.new
|
147
|
+
end
|
148
|
+
|
149
|
+
# parse the whole file once and store all seek locations,
|
150
|
+
# rather than the records themselves
|
151
|
+
def parse
|
152
|
+
info "---- Digest DB and store data in mRNA Hash (LruCache)"
|
153
|
+
@count_ids = Counter.new # Count ids
|
154
|
+
@count_seqnames = Counter.new # Count seqnames
|
155
|
+
@componentlist = SeekRecList.new(@iter.fh,@options[:parser],@lru) # Store containers, like genes, contigs
|
156
|
+
@orflist = SeekLinkedRecs.new # Store linked gene records
|
157
|
+
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
158
|
+
@cdslist = SeekLinkedRecs.new
|
159
|
+
@exonlist = SeekLinkedRecs.new
|
160
|
+
@sequencelist = {}
|
161
|
+
@unrecognized_features = {}
|
162
|
+
@iter.each_rec do |fpos, line|
|
163
|
+
rec = case @options[:parser]
|
164
|
+
when :bioruby
|
165
|
+
Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
|
166
|
+
when :line
|
167
|
+
Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
168
|
+
else
|
169
|
+
raise 'Unknown parser'
|
170
|
+
end
|
171
|
+
store_record(rec)
|
172
|
+
end
|
173
|
+
@iter.each_sequence do | id, bioseq |
|
174
|
+
@sequencelist[id] = bioseq.to_s
|
175
|
+
end
|
176
|
+
validate_mrnas
|
177
|
+
validate_cdss
|
178
|
+
show_unrecognized_features
|
179
|
+
@genelist = @count_ids.keys
|
180
|
+
read_fasta
|
181
|
+
@lru.display('After reading files')
|
182
|
+
end
|
183
|
+
|
184
|
+
def each_item list
|
185
|
+
# p list.class
|
186
|
+
fh = @iter.fh
|
187
|
+
list.each do | id, io_seeklist |
|
188
|
+
recs = []
|
189
|
+
io_seeklist.each do | fpos |
|
190
|
+
recs << fetch(fh,fpos,@options[:parser])
|
191
|
+
end
|
192
|
+
seqid = recs[0].seqname
|
193
|
+
component = find_component(recs[0])
|
194
|
+
if @options[:no_assemble]
|
195
|
+
recs.each do | rec |
|
196
|
+
yield id, [rec], component
|
197
|
+
end
|
198
|
+
else
|
199
|
+
yield id, recs, component
|
200
|
+
end
|
201
|
+
end
|
202
|
+
@lru.display('After iterating')
|
203
|
+
end
|
204
|
+
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|