bio-gff3 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -2
- data/Rakefile +13 -4
- data/VERSION +1 -1
- data/bin/gff3-fetch +50 -14
- data/bio-gff3.gemspec +15 -20
- data/lib/bio/db/gff/block/gffblockparser.rb +93 -0
- data/lib/bio/db/gff/digest/gffinmemory.rb +2 -0
- data/lib/bio/db/gff/digest/gfflrucache.rb +208 -0
- data/lib/bio/db/gff/digest/gffnocache.rb +28 -9
- data/lib/bio/db/gff/digest/gffparser.rb +1 -1
- data/lib/bio/db/gff/file/gfffileiterator.rb +16 -7
- data/lib/bio/db/gff/gff3.rb +15 -5
- data/lib/bio/db/gff/gff3parserec.rb +1 -6
- data/lib/bio/db/gff/gffcomponent.rb +8 -6
- data/lib/bio/db/gff/gffrecord.rb +13 -8
- data/lib/bio/db/gff/gffsection.rb +0 -1
- data/lib/bio/db/gff/gffsequence.rb +3 -9
- data/lib/bio/db/gff/gffvalidate.rb +1 -1
- data/lib/bio/output/gfflogger.rb +10 -1
- data/spec/gff3_fileiterator_spec.rb +5 -4
- data/spec/gffdb_spec.rb +7 -1
- data/spec/gffparserec.rb +1 -1
- data/test/data/regression/test_ext_gff3.rtest +4 -5
- data/test/data/regression/test_gff3.rtest +4 -5
- data/test/data/regression/test_lrucache_ext_gff3.rtest +64 -0
- data/test/data/regression/test_lrucache_gff3.rtest +68 -0
- data/test/data/regression/test_nocache_ext_gff3.rtest +2 -0
- data/test/data/regression/test_nocache_gff3.rtest +3 -6
- data/test/test_bio-gff3.rb +6 -1
- metadata +37 -77
data/Gemfile
CHANGED
@@ -2,7 +2,8 @@ source "http://rubygems.org"
|
|
2
2
|
|
3
3
|
# Runtime dependencies
|
4
4
|
gem "bio", ">= 1.3.1"
|
5
|
-
gem "
|
5
|
+
gem "log4r", "> 1.1.6"
|
6
|
+
gem "bio-logger", "> 0.8.0"
|
6
7
|
|
7
8
|
# Add dependencies to develop your gem here.
|
8
9
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -11,5 +12,5 @@ group :development do
|
|
11
12
|
gem "bundler", "~> 1.0.0"
|
12
13
|
gem "jeweler", "~> 1.5.2"
|
13
14
|
gem "rcov", ">= 0"
|
14
|
-
gem "rspec", ">= 2.
|
15
|
+
gem "rspec", ">= 2.3.0"
|
15
16
|
end
|
data/Rakefile
CHANGED
@@ -24,10 +24,9 @@ Support for external FASTA files.
|
|
24
24
|
gem.authors = ["Pjotr Prins"]
|
25
25
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
26
26
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
27
|
-
gem.add_runtime_dependency 'bio', '>= 1.4.1'
|
28
|
-
gem.add_runtime_dependency 'log4r', '> 1.1.6'
|
29
|
-
gem.add_runtime_dependency 'bio-logger', '>= 0.
|
30
|
-
gem.add_development_dependency 'rspec', '> 2.0'
|
27
|
+
# gem.add_runtime_dependency 'bio', '>= 1.4.1'
|
28
|
+
# gem.add_runtime_dependency 'log4r', '> 1.1.6'
|
29
|
+
# gem.add_runtime_dependency 'bio-logger', '>= 0.8.0'
|
31
30
|
end
|
32
31
|
Jeweler::RubygemsDotOrgTasks.new
|
33
32
|
|
@@ -36,8 +35,18 @@ Rake::TestTask.new(:test) do |test|
|
|
36
35
|
test.libs << 'lib' << 'test'
|
37
36
|
test.pattern = 'test/**/test_*.rb'
|
38
37
|
test.verbose = true
|
38
|
+
Kernel.system('rspec spec/*.rb')
|
39
39
|
end
|
40
40
|
|
41
|
+
#require 'spec/rake/spectask'
|
42
|
+
#Spec::Rake::SpecTask.new(:spec) do |t|
|
43
|
+
# t.spec_files = Dir.glob('spec/**/*_spec.rb')
|
44
|
+
# t.spec_opts << '--format specdoc'
|
45
|
+
# t.warning = true
|
46
|
+
# t.rcov = true
|
47
|
+
#end
|
48
|
+
|
49
|
+
|
41
50
|
require 'rcov/rcovtask'
|
42
51
|
Rcov::RcovTask.new do |test|
|
43
52
|
test.libs << 'test'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.
|
1
|
+
0.8.6
|
data/bin/gff3-fetch
CHANGED
@@ -25,16 +25,16 @@ USAGE = <<EOM
|
|
25
25
|
CDS : assemble CDS
|
26
26
|
exon : list all exons
|
27
27
|
gene|ORF : list gene ORFs
|
28
|
-
other : use any type from GFF3 definition, e.g. 'Terminate'
|
28
|
+
other : use any type from GFF3 definition, e.g. 'Terminate'
|
29
29
|
|
30
30
|
and the following performance options:
|
31
31
|
|
32
32
|
--parser bioruby : use BioRuby GFF3 parser (slow)
|
33
|
-
--parser line : use GFF3 line parser
|
34
|
-
--
|
33
|
+
--parser line : use GFF3 line parser (faster, default)
|
34
|
+
--block : parse GFF3 by block (optimistic) -- NYI
|
35
35
|
--cache full : load all in RAM (fast, default)
|
36
36
|
--cache none : do not load anything in memory (slow)
|
37
|
-
--cache lru : use
|
37
|
+
--cache lru : use least recently used cache (limit RAM use, fast) -- NYI
|
38
38
|
--max-cpus num : use num threads -- NYI
|
39
39
|
--emboss : use EMBOSS translation (fast) -- NYI
|
40
40
|
|
@@ -74,21 +74,45 @@ USAGE = <<EOM
|
|
74
74
|
|
75
75
|
gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR
|
76
76
|
|
77
|
+
Fine tuning output - show messages matching regex
|
78
|
+
|
79
|
+
gff3-fetch mRNA test/data/gff/test.gff3 --trace '=msg =~ /component/'
|
80
|
+
|
77
81
|
Fine tuning output - write log messages to file.log
|
78
82
|
|
79
83
|
gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR --logger file.log
|
80
84
|
|
85
|
+
For more information on output, see the bioruby-logger plugin.
|
86
|
+
|
81
87
|
== Performance
|
82
88
|
|
83
89
|
time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 2> /dev/null > test.fa
|
84
90
|
|
85
|
-
|
91
|
+
Digesting parser:
|
92
|
+
|
93
|
+
Cache real user sys version RAM
|
86
94
|
------------------------------------------------------------
|
87
95
|
full,bioruby 12m41 12m28 0m09 (0.8.0)
|
88
96
|
full,line 12m13 12m06 0m07 (0.8.5)
|
89
|
-
|
90
|
-
|
91
|
-
|
97
|
+
full,line,lazy 11m51 11m43 0m07 (0.8.6) 6,600M
|
98
|
+
|
99
|
+
none,bioruby 504m 477m 26m50 (0.8.0)
|
100
|
+
none,line 297m 267m 28m36 (0.8.5)
|
101
|
+
none,line,lazy 132m 106m 26m01 (0.8.6) 650M
|
102
|
+
|
103
|
+
lru,bioruby 533m 510m 22m47 (0.8.5)
|
104
|
+
lru,line 353m 326m 26m44 (0.8.5) 1K
|
105
|
+
lru,line 305m 281m 22m30 (0.8.5) 10K
|
106
|
+
lru,line,lazy 182m 161m 21m10 (0.8.6) 10K
|
107
|
+
lru,line,lazy 75m 75m 0m17 (0.8.6) 50K 730M
|
108
|
+
------------------------------------------------------------
|
109
|
+
|
110
|
+
Block parser:
|
111
|
+
|
112
|
+
Cache real user sys gff3 version
|
113
|
+
------------------------------------------------------------
|
114
|
+
in preparation
|
115
|
+
------------------------------------------------------------
|
92
116
|
|
93
117
|
where
|
94
118
|
|
@@ -132,7 +156,7 @@ Bio::Log::CLI.trace('info')
|
|
132
156
|
options = OpenStruct.new()
|
133
157
|
|
134
158
|
# ---- Default options
|
135
|
-
options.parser = :
|
159
|
+
options.parser = :line
|
136
160
|
|
137
161
|
opts = OptionParser.new() { |opts|
|
138
162
|
opts.on_tail("-h", "--help", "Show help and examples") {
|
@@ -141,7 +165,7 @@ opts = OptionParser.new() { |opts|
|
|
141
165
|
exit()
|
142
166
|
}
|
143
167
|
|
144
|
-
opts.on("--parser [bioruby,line]", String, "Parser (default
|
168
|
+
opts.on("--parser [bioruby,line]", String, "Parser (default line)") do |p|
|
145
169
|
case p.downcase
|
146
170
|
when 'bioruby'
|
147
171
|
options.parser = :bioruby
|
@@ -152,12 +176,12 @@ opts = OptionParser.new() { |opts|
|
|
152
176
|
end
|
153
177
|
end
|
154
178
|
|
155
|
-
opts.on("--cache [none
|
179
|
+
opts.on("--cache [full,lru,none]", String, "Caching (default full)") do |cache|
|
156
180
|
case cache.downcase
|
157
181
|
when 'none'
|
158
182
|
options.cache = :cache_none
|
159
|
-
|
160
|
-
|
183
|
+
when 'lru'
|
184
|
+
options.cache = :cache_lru
|
161
185
|
when 'full'
|
162
186
|
options.cache = :cache_full
|
163
187
|
else
|
@@ -165,6 +189,10 @@ opts = OptionParser.new() { |opts|
|
|
165
189
|
end
|
166
190
|
end
|
167
191
|
|
192
|
+
opts.on("--block", "Parse by block") do |b|
|
193
|
+
options.block = true
|
194
|
+
end
|
195
|
+
|
168
196
|
opts.on("--no-assemble", "output sequences without assembling") do |b|
|
169
197
|
options.no_assemble = true
|
170
198
|
end
|
@@ -229,6 +257,7 @@ ARGV.each do | fn |
|
|
229
257
|
opts = {}
|
230
258
|
opts[:validate] = options.validate
|
231
259
|
opts[:parser] = options.parser
|
260
|
+
opts[:block] = options.block
|
232
261
|
opts[:cache_components] = options.cache
|
233
262
|
opts[:cache_records] = options.cache
|
234
263
|
opts[:fasta_filename] = fastafn if fastafn
|
@@ -238,6 +267,10 @@ ARGV.each do | fn |
|
|
238
267
|
opts[:phase] = options.phase
|
239
268
|
opts[:debug] = options.debug
|
240
269
|
|
270
|
+
include Bio::GFFbrowser::Helpers::Logger
|
271
|
+
debug $:.to_s
|
272
|
+
debug opts.to_s
|
273
|
+
log_sys_info("BaseLine")
|
241
274
|
gff3 = Bio::GFFbrowser::GFF3.new(fn,opts)
|
242
275
|
|
243
276
|
gff = gff3.assembler
|
@@ -265,8 +298,11 @@ ARGV.each do | fn |
|
|
265
298
|
writer.put(id,seq)
|
266
299
|
end
|
267
300
|
else
|
268
|
-
|
301
|
+
gff.each_seq(gfftype.downcase) do | id, seq |
|
302
|
+
writer.put(id,seq)
|
303
|
+
end
|
269
304
|
end
|
305
|
+
log_sys_info("Done")
|
270
306
|
fastafn = nil
|
271
307
|
end
|
272
308
|
|
data/bio-gff3.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-gff3}
|
8
|
-
s.version = "0.8.
|
8
|
+
s.version = "0.8.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-17}
|
13
13
|
s.default_executable = %q{gff3-fetch}
|
14
14
|
s.description = %q{GFF3 (genome browser) information and digest mRNA and CDS sequences.
|
15
15
|
Options for low memory use and caching of records.
|
@@ -31,7 +31,9 @@ Support for external FASTA files.
|
|
31
31
|
"bin/gff3-fetch",
|
32
32
|
"bio-gff3.gemspec",
|
33
33
|
"lib/bio-gff3.rb",
|
34
|
+
"lib/bio/db/gff/block/gffblockparser.rb",
|
34
35
|
"lib/bio/db/gff/digest/gffinmemory.rb",
|
36
|
+
"lib/bio/db/gff/digest/gfflrucache.rb",
|
35
37
|
"lib/bio/db/gff/digest/gffnocache.rb",
|
36
38
|
"lib/bio/db/gff/digest/gffparser.rb",
|
37
39
|
"lib/bio/db/gff/file/gfffasta.rb",
|
@@ -65,6 +67,8 @@ Support for external FASTA files.
|
|
65
67
|
"test/data/gff/test.gff3",
|
66
68
|
"test/data/regression/test_ext_gff3.rtest",
|
67
69
|
"test/data/regression/test_gff3.rtest",
|
70
|
+
"test/data/regression/test_lrucache_ext_gff3.rtest",
|
71
|
+
"test/data/regression/test_lrucache_gff3.rtest",
|
68
72
|
"test/data/regression/test_nocache_ext_gff3.rtest",
|
69
73
|
"test/data/regression/test_nocache_gff3.rtest",
|
70
74
|
"test/helper.rb",
|
@@ -94,41 +98,32 @@ Support for external FASTA files.
|
|
94
98
|
|
95
99
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
96
100
|
s.add_runtime_dependency(%q<bio>, [">= 1.3.1"])
|
97
|
-
s.add_runtime_dependency(%q<
|
101
|
+
s.add_runtime_dependency(%q<log4r>, ["> 1.1.6"])
|
102
|
+
s.add_runtime_dependency(%q<bio-logger>, ["> 0.8.0"])
|
98
103
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
99
104
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
100
105
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
101
106
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
102
|
-
s.add_development_dependency(%q<rspec>, [">= 2.
|
103
|
-
s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])
|
104
|
-
s.add_runtime_dependency(%q<log4r>, ["> 1.1.6"])
|
105
|
-
s.add_runtime_dependency(%q<bio-logger>, [">= 0.6.1"])
|
106
|
-
s.add_development_dependency(%q<rspec>, ["> 2.0"])
|
107
|
+
s.add_development_dependency(%q<rspec>, [">= 2.3.0"])
|
107
108
|
else
|
108
109
|
s.add_dependency(%q<bio>, [">= 1.3.1"])
|
109
|
-
s.add_dependency(%q<
|
110
|
+
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
111
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
110
112
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
111
113
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
112
114
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
113
115
|
s.add_dependency(%q<rcov>, [">= 0"])
|
114
|
-
s.add_dependency(%q<rspec>, [">= 2.
|
115
|
-
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
116
|
-
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
117
|
-
s.add_dependency(%q<bio-logger>, [">= 0.6.1"])
|
118
|
-
s.add_dependency(%q<rspec>, ["> 2.0"])
|
116
|
+
s.add_dependency(%q<rspec>, [">= 2.3.0"])
|
119
117
|
end
|
120
118
|
else
|
121
119
|
s.add_dependency(%q<bio>, [">= 1.3.1"])
|
122
|
-
s.add_dependency(%q<
|
120
|
+
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
121
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
123
122
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
124
123
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
125
124
|
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
126
125
|
s.add_dependency(%q<rcov>, [">= 0"])
|
127
|
-
s.add_dependency(%q<rspec>, [">= 2.
|
128
|
-
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
129
|
-
s.add_dependency(%q<log4r>, ["> 1.1.6"])
|
130
|
-
s.add_dependency(%q<bio-logger>, [">= 0.6.1"])
|
131
|
-
s.add_dependency(%q<rspec>, ["> 2.0"])
|
126
|
+
s.add_dependency(%q<rspec>, [">= 2.3.0"])
|
132
127
|
end
|
133
128
|
end
|
134
129
|
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
module Bio
|
3
|
+
module GFFbrowser
|
4
|
+
|
5
|
+
module Block
|
6
|
+
|
7
|
+
# The block parser simplifies parsing, by assuming GFF3 is
|
8
|
+
# organised into blocks. All relevant information is
|
9
|
+
# resolved a block at a time.
|
10
|
+
class GffBlockParser
|
11
|
+
include FastLineParser
|
12
|
+
|
13
|
+
def initialize filename, options
|
14
|
+
info "Starting block parser"
|
15
|
+
@filename = filename
|
16
|
+
@options = options
|
17
|
+
@iter = Bio::GFF::GFF3::FileIterator.new(@filename)
|
18
|
+
end
|
19
|
+
|
20
|
+
def parse(gfftype)
|
21
|
+
@inseqidlist = {}
|
22
|
+
# Fetch FASTA first
|
23
|
+
@sequencelist = {}
|
24
|
+
if @options[:fasta_filename]
|
25
|
+
File.open(@options[:fasta_filename]) do | f |
|
26
|
+
fasta = Bio::GFF::FastaReader.new(f)
|
27
|
+
fasta.each do | id, fastarec |
|
28
|
+
# p fastarec
|
29
|
+
@sequencelist[id] = fastarec
|
30
|
+
end
|
31
|
+
end
|
32
|
+
else
|
33
|
+
# Embedded FASTA
|
34
|
+
@iter.each_sequence do | id, bioseq |
|
35
|
+
@sequencelist[id] = bioseq.to_s
|
36
|
+
end
|
37
|
+
end
|
38
|
+
seqid = nil
|
39
|
+
recs = []
|
40
|
+
@iter.each_rec do | fpos, line |
|
41
|
+
rec = FastLineRecord.new(parse_line_fast(line))
|
42
|
+
if seqid != rec.seqid
|
43
|
+
# starting a new block
|
44
|
+
if @inseqidlist[rec.seqid]
|
45
|
+
# not a well formed GFF3 file, we need
|
46
|
+
# to drop
|
47
|
+
error "GFF3 file not sorted, falling back to line parser"
|
48
|
+
raise "ERROR, bailing out"
|
49
|
+
end
|
50
|
+
parse_block(gfftype,recs,@sequencelist[seqid]) { | id, seq | yield id,seq } if seqid
|
51
|
+
recs = []
|
52
|
+
seqid = rec.seqid
|
53
|
+
@inseqidlist[seqid] = true
|
54
|
+
end
|
55
|
+
recs.push rec
|
56
|
+
end
|
57
|
+
parse_block(gfftype,recs,@sequencelist[seqid]) { | id, seq | yield id,seq } if seqid
|
58
|
+
end
|
59
|
+
|
60
|
+
# Parse sequence objects sharing the same seqid
|
61
|
+
# and yield each +gfftype+ as an id,seq
|
62
|
+
def parse_block gfftype, recs, sequence
|
63
|
+
recs.each do | rec |
|
64
|
+
if rec.feature_type.downcase == gfftype
|
65
|
+
yield rec.id, sequence[rec.start-1..rec.end-1]
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def each_seq(gfftype)
|
71
|
+
parse(gfftype) { | id, seq | yield id,seq }
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
def each_gene_seq
|
76
|
+
each_seq('gene') { | id, seq | yield id,seq }
|
77
|
+
end
|
78
|
+
def each_mRNA_seq
|
79
|
+
each_seq('mrna') { | id, seq | yield id,seq }
|
80
|
+
|
81
|
+
end
|
82
|
+
def each_exon_seq
|
83
|
+
each_seq('exon') { | id, seq | yield id,seq }
|
84
|
+
|
85
|
+
end
|
86
|
+
def each_CDS_seq
|
87
|
+
each_seq('cds') { | id, seq | yield id,seq }
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -0,0 +1,208 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/gff/gfflrucache.rb - Assemble mRNA and CDS from GFF by LRU cache
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2010
|
5
|
+
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# Fetch information from a GFF file without using RAM - using a least
|
9
|
+
# recently used cache 'LRU' - also check out the caching edition,
|
10
|
+
# which uses limited amounts of RAM.
|
11
|
+
#
|
12
|
+
# In effect it is NoCache with parser recs cached in the LRU hash
|
13
|
+
|
14
|
+
require 'bio/db/gff/digest/gffparser'
|
15
|
+
require 'bio/system/lruhash'
|
16
|
+
|
17
|
+
module Bio
|
18
|
+
module GFFbrowser
|
19
|
+
|
20
|
+
module Digest
|
21
|
+
|
22
|
+
module LruCacheHelpers
|
23
|
+
|
24
|
+
# Module that fetches a line from a GFF3 file and returns a parsed
|
25
|
+
# record
|
26
|
+
module SeekRec
|
27
|
+
# Fetch a record using fh and file seek position
|
28
|
+
def SeekRec::fetch(fh,fpos,parser)
|
29
|
+
return nil if fh==nil or fpos==nil
|
30
|
+
fh.seek(fpos)
|
31
|
+
if parser == :bioruby
|
32
|
+
GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
|
33
|
+
else
|
34
|
+
GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
module LruRec
|
40
|
+
# Fetch a record using fh and file seek position,
|
41
|
+
# utilising the LRU cache
|
42
|
+
def fetch(fh,fpos,parser)
|
43
|
+
return nil if fh==nil or fpos==nil
|
44
|
+
rec = @lru[fpos]
|
45
|
+
if rec==nil
|
46
|
+
rec = SeekRec::fetch(fh,fpos,parser)
|
47
|
+
@lru[fpos] = rec
|
48
|
+
end
|
49
|
+
rec
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Helper class which gives Hash-like access to the
|
54
|
+
# no-cache GFF3 file
|
55
|
+
class SeekRecList
|
56
|
+
include LruRec
|
57
|
+
|
58
|
+
def initialize fh, parser, lru
|
59
|
+
@fh = fh
|
60
|
+
@h = {}
|
61
|
+
@parser = parser
|
62
|
+
@lru = lru
|
63
|
+
end
|
64
|
+
|
65
|
+
def []= id, rec
|
66
|
+
raise "id #{id} occurs twice!" if @h[id]
|
67
|
+
fpos = rec.io_seek
|
68
|
+
@h[id] = fpos
|
69
|
+
end
|
70
|
+
|
71
|
+
def [](id)
|
72
|
+
fpos = @h[id]
|
73
|
+
fetch(@fh,fpos,@parser)
|
74
|
+
end
|
75
|
+
|
76
|
+
def each
|
77
|
+
@h.each do | id,fpos |
|
78
|
+
yield id, self[id]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# List of ids
|
84
|
+
class SeekLinkedRecs < Hash
|
85
|
+
include Helpers::Logger
|
86
|
+
def add id, rec
|
87
|
+
info "Adding #{rec.feature_type} <#{id}>"
|
88
|
+
self[id] = [] if self[id] == nil
|
89
|
+
self[id] << rec.io_seek
|
90
|
+
end
|
91
|
+
# validation is switched off for LruCache
|
92
|
+
def validate_seqname
|
93
|
+
end
|
94
|
+
# validation is switched off for LruCache
|
95
|
+
def validate_nonoverlapping
|
96
|
+
end
|
97
|
+
# validation is switched off for LruCache
|
98
|
+
def validate_shared_parent
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
class LruTracker
|
104
|
+
include Helpers::Logger
|
105
|
+
attr_accessor :hits, :misses, :calls
|
106
|
+
attr_reader :cache
|
107
|
+
|
108
|
+
def initialize
|
109
|
+
@cache = LRUHash.new 50000
|
110
|
+
@hits = 0
|
111
|
+
@misses = 0
|
112
|
+
@calls = 0
|
113
|
+
end
|
114
|
+
|
115
|
+
def [](name)
|
116
|
+
@calls += 1
|
117
|
+
item = @cache[name]
|
118
|
+
if @cache[name] == nil
|
119
|
+
@misses += 1
|
120
|
+
else
|
121
|
+
@hits += 1
|
122
|
+
end
|
123
|
+
item
|
124
|
+
end
|
125
|
+
|
126
|
+
def []=(name,item)
|
127
|
+
@cache[name] = item
|
128
|
+
end
|
129
|
+
def display msg
|
130
|
+
info "Cache calls #{msg} = #{@calls}"
|
131
|
+
info "Cache hits #{msg} = #{@hits}"
|
132
|
+
info "Cache misses #{msg} = #{@misses}"
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
class LruCache
|
137
|
+
include Parser
|
138
|
+
include LruCacheHelpers
|
139
|
+
include Gff3Sequence
|
140
|
+
include LruRec
|
141
|
+
|
142
|
+
def initialize filename, options
|
143
|
+
@filename = filename
|
144
|
+
@options = options
|
145
|
+
@iter = Bio::GFF::GFF3::FileIterator.new(@filename)
|
146
|
+
@lru = LruTracker.new
|
147
|
+
end
|
148
|
+
|
149
|
+
# parse the whole file once and store all seek locations,
|
150
|
+
# rather than the records themselves
|
151
|
+
def parse
|
152
|
+
info "---- Digest DB and store data in mRNA Hash (LruCache)"
|
153
|
+
@count_ids = Counter.new # Count ids
|
154
|
+
@count_seqnames = Counter.new # Count seqnames
|
155
|
+
@componentlist = SeekRecList.new(@iter.fh,@options[:parser],@lru) # Store containers, like genes, contigs
|
156
|
+
@orflist = SeekLinkedRecs.new # Store linked gene records
|
157
|
+
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
158
|
+
@cdslist = SeekLinkedRecs.new
|
159
|
+
@exonlist = SeekLinkedRecs.new
|
160
|
+
@sequencelist = {}
|
161
|
+
@unrecognized_features = {}
|
162
|
+
@iter.each_rec do |fpos, line|
|
163
|
+
rec = case @options[:parser]
|
164
|
+
when :bioruby
|
165
|
+
Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
|
166
|
+
when :line
|
167
|
+
Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
|
168
|
+
else
|
169
|
+
raise 'Unknown parser'
|
170
|
+
end
|
171
|
+
store_record(rec)
|
172
|
+
end
|
173
|
+
@iter.each_sequence do | id, bioseq |
|
174
|
+
@sequencelist[id] = bioseq.to_s
|
175
|
+
end
|
176
|
+
validate_mrnas
|
177
|
+
validate_cdss
|
178
|
+
show_unrecognized_features
|
179
|
+
@genelist = @count_ids.keys
|
180
|
+
read_fasta
|
181
|
+
@lru.display('After reading files')
|
182
|
+
end
|
183
|
+
|
184
|
+
def each_item list
|
185
|
+
# p list.class
|
186
|
+
fh = @iter.fh
|
187
|
+
list.each do | id, io_seeklist |
|
188
|
+
recs = []
|
189
|
+
io_seeklist.each do | fpos |
|
190
|
+
recs << fetch(fh,fpos,@options[:parser])
|
191
|
+
end
|
192
|
+
seqid = recs[0].seqname
|
193
|
+
component = find_component(recs[0])
|
194
|
+
if @options[:no_assemble]
|
195
|
+
recs.each do | rec |
|
196
|
+
yield id, [rec], component
|
197
|
+
end
|
198
|
+
else
|
199
|
+
yield id, recs, component
|
200
|
+
end
|
201
|
+
end
|
202
|
+
@lru.display('After iterating')
|
203
|
+
end
|
204
|
+
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|