bio-gff3 0.8.5 → 0.8.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,19 +21,24 @@ module Bio
21
21
  # record
22
22
  module SeekRec
23
23
  # Fetch a record using fh and file seek position
24
- def SeekRec::fetch(fh,fpos)
24
+ def SeekRec::fetch(fh,fpos,parser)
25
25
  return nil if fh==nil or fpos==nil
26
26
  fh.seek(fpos)
27
- GFF::GFF3::FileRecord.new(fpos, fh.gets)
27
+ if parser == :bioruby
28
+ GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
29
+ else
30
+ GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
31
+ end
28
32
  end
29
33
  end
30
34
 
31
35
  # Helper class which gives Hash-like access to the
32
36
  # no-cache GFF3 file
33
37
  class SeekRecList
34
- def initialize fh
38
+ def initialize fh, parser
35
39
  @fh = fh
36
40
  @h = {}
41
+ @parser = parser
37
42
  end
38
43
 
39
44
  def []= id, rec
@@ -44,7 +49,7 @@ module Bio
44
49
 
45
50
  def [](id)
46
51
  fpos = @h[id]
47
- SeekRec::fetch(@fh,fpos)
52
+ SeekRec::fetch(@fh,fpos,@parser)
48
53
  end
49
54
 
50
55
  def each
@@ -56,7 +61,7 @@ module Bio
56
61
 
57
62
  # List of ids
58
63
  class SeekLinkedRecs < Hash
59
- include Helpers::Error
64
+ include Helpers::Logger
60
65
  def add id, rec
61
66
  info "Adding #{rec.feature_type} <#{id}>"
62
67
  self[id] = [] if self[id] == nil
@@ -91,14 +96,22 @@ module Bio
91
96
  info "---- Digest DB and store data in mRNA Hash (NoCache)"
92
97
  @count_ids = Counter.new # Count ids
93
98
  @count_seqnames = Counter.new # Count seqnames
94
- @componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
99
+ @componentlist = SeekRecList.new(@iter.fh,@options[:parser]) # Store containers, like genes, contigs
95
100
  @orflist = SeekLinkedRecs.new # Store linked gene records
96
101
  @mrnalist = SeekLinkedRecs.new # Store linked mRNA records
97
102
  @cdslist = SeekLinkedRecs.new
98
103
  @exonlist = SeekLinkedRecs.new
99
104
  @sequencelist = {}
100
105
  @unrecognized_features = {}
101
- @iter.each_rec do | id, rec |
106
+ @iter.each_rec do |fpos, line|
107
+ rec = case @options[:parser]
108
+ when :bioruby
109
+ Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
110
+ when :line
111
+ Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
112
+ else
113
+ raise 'Unknown parser'
114
+ end
102
115
  store_record(rec)
103
116
  end
104
117
  @iter.each_sequence do | id, bioseq |
@@ -117,11 +130,17 @@ module Bio
117
130
  list.each do | id, io_seeklist |
118
131
  recs = []
119
132
  io_seeklist.each do | fpos |
120
- recs << SeekRec::fetch(fh,fpos)
133
+ recs << SeekRec::fetch(fh,fpos,@options[:parser])
121
134
  end
122
135
  seqid = recs[0].seqname
123
136
  component = find_component(recs[0])
124
- yield id, recs, component
137
+ if @options[:no_assemble]
138
+ recs.each do | rec |
139
+ yield id, [rec], component
140
+ end
141
+ else
142
+ yield id, recs, component
143
+ end
125
144
  end
126
145
  end
127
146
 
@@ -19,7 +19,7 @@ module Bio
19
19
 
20
20
  include Bio::GFFbrowser::Helpers
21
21
  include Bio::GFFbrowser::Helpers::Validate
22
- include Bio::GFFbrowser::Helpers::Error
22
+ include Bio::GFFbrowser::Helpers::Logger
23
23
  include Gff3Component
24
24
  include Gff3Features
25
25
 
@@ -13,7 +13,7 @@ module Bio
13
13
 
14
14
  # FileRecord inherits from the BioRuby Record, but
15
15
  # adds the file seek position.
16
- class FileRecord < Record
16
+ class BioRubyFileRecord < Record
17
17
  attr_accessor :io_seek
18
18
  def initialize io_seek, buf
19
19
  @io_seek = io_seek
@@ -21,6 +21,16 @@ module Bio
21
21
  end
22
22
  end
23
23
 
24
+ class FastParserFileRecord < GFFbrowser::FastLineRecord
25
+ attr_accessor :io_seek
26
+
27
+ include Bio::GFFbrowser::FastLineParser
28
+ def initialize io_seek, buf
29
+ @io_seek = io_seek
30
+ super(parse_line_fast(buf))
31
+ end
32
+ end
33
+
24
34
  # GFF3::FileIterator takes a file and yields GFF3 records with their
25
35
  # seek position included in the record.
26
36
  class FileIterator
@@ -31,9 +41,10 @@ module Bio
31
41
  @fh = File.open(filename)
32
42
  end
33
43
 
34
- # Iterate over every record in the file, yielding the record ID and
35
- # (File)Record, which includes the io_seek position in the file
36
- def each_rec()
44
+ # Iterate over every record in the file, yielding the seekpos
45
+ # and line containing the record
46
+ def each_rec
47
+ @fh.seek(0)
37
48
  fpos = 0
38
49
  @fh.each_line do | line |
39
50
  line = line.strip
@@ -42,10 +53,8 @@ module Bio
42
53
  break
43
54
  end
44
55
  if line.size != 0 and line !~ /^#/
45
- rec = FileRecord.new(fpos, line)
46
56
  lastpos = @fh.tell
47
- id = rec.id
48
- yield id, rec
57
+ yield fpos, line
49
58
  @fh.seek(lastpos) # reset filepos, just in case it changed
50
59
  end
51
60
  fpos = @fh.tell
@@ -9,6 +9,8 @@
9
9
 
10
10
  require 'bio/db/gff/digest/gffinmemory'
11
11
  require 'bio/db/gff/digest/gffnocache'
12
+ require 'bio/db/gff/digest/gfflrucache'
13
+ require 'bio/db/gff/block/gffblockparser'
12
14
 
13
15
  module Bio
14
16
  module GFFbrowser
@@ -16,16 +18,24 @@ module Bio
16
18
  attr_reader :assembler
17
19
 
18
20
  include Digest
21
+ include Block
19
22
 
20
23
  # Initialize a GFF parser
21
24
  def initialize filename, options = {}
25
+ options[:parser] = :line if options[:parser] == nil
22
26
  cache_recs = options[:cache_records]
23
27
  @assembler =
24
- case cache_recs
25
- when :cache_none
26
- NoCache.new(filename, options)
27
- else
28
- InMemory.new(filename, options) # default
28
+ if options[:block]
29
+ GffBlockParser.new(filename, options)
30
+ else
31
+ case cache_recs
32
+ when :cache_none
33
+ NoCache.new(filename, options)
34
+ when :cache_lru
35
+ LruCache.new(filename, options)
36
+ else
37
+ InMemory.new(filename, options) # default
38
+ end
29
39
  end
30
40
  end
31
41
 
@@ -24,7 +24,7 @@ module Bio
24
24
  #
25
25
  module FastLineParser
26
26
 
27
- include Helpers::Error
27
+ include Helpers::Logger
28
28
 
29
29
  # Returns a (partial) record, assuming it is a valid GFF3
30
30
  # format, no validation takes place, other than field counting (!)
@@ -44,11 +44,6 @@ module Bio
44
44
  return nil
45
45
  end
46
46
 
47
- fs[GFF3_START] = fs[GFF3_START].to_i
48
- fs[GFF3_END] = fs[GFF3_END].to_i
49
- fs[GFF3_SCORE] = fs[GFF3_SCORE].to_f
50
- fs[GFF3_PHASE] = fs[GFF3_PHASE].to_i
51
- fs[GFF3_ATTRIBUTES] = parse_attributes_fast(fs[GFF3_ATTRIBUTES],options)
52
47
  fs
53
48
  end
54
49
 
@@ -7,13 +7,15 @@
7
7
  #
8
8
  # Fetch information from a GFF file
9
9
 
10
+ require 'set'
11
+
10
12
  module Bio
11
13
  module GFFbrowser
12
14
 
13
15
  module Helpers
14
16
 
15
17
  module Record
16
- include Error
18
+ include Logger
17
19
  # Format a record ID by, first, getting the ID attribute. If that fails
18
20
  # the seqname is used with the start/stop positions.
19
21
  def Record::formatID rec
@@ -33,11 +35,11 @@ module Bio
33
35
 
34
36
  module Gff3Component
35
37
 
36
- include Error
38
+ include Logger
37
39
 
38
- COMPONENT_TYPES = %w{
40
+ COMPONENT_TYPES = Set.new(%w{
39
41
  gene SO:0000704 contig transcript Component region
40
- }
42
+ })
41
43
 
42
44
  # Walk the component list to find a matching component/container for a
43
45
  # record. First use the parent ID. If that is missing go by sequence
@@ -80,13 +82,13 @@ module Bio
80
82
  module Gff3Features
81
83
 
82
84
  # Ignore the following features (case sensitive?)
83
- IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
85
+ IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + Set.new(%w{
84
86
  transposon Match similarity UTR
85
87
  TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
86
88
  polyA_site SO:0000553
87
89
  five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
88
90
  exon SO:0000147
89
- }
91
+ })
90
92
  end
91
93
 
92
94
  end
@@ -1,3 +1,5 @@
1
+ require 'bio/db/gff/gff3parserec'
2
+
1
3
  module Bio
2
4
  module GFFbrowser
3
5
 
@@ -7,6 +9,9 @@ module Bio
7
9
 
8
10
  # Using the fast line parser
9
11
  class FastLineRecord < Record
12
+
13
+ include FastLineParser
14
+
10
15
  def initialize fields
11
16
  @fields = fields
12
17
  end
@@ -16,27 +21,27 @@ module Bio
16
21
  end
17
22
 
18
23
  def seqid
19
- @fields[GFF3_SEQID]
24
+ @seqid_ ||= @fields[GFF3_SEQID]
20
25
  end
21
26
 
22
27
  alias seqname :seqid
23
28
 
24
29
  def phase
25
- @fields[GFF3_PHASE]
30
+ @phase_ ||= @fields[GFF3_PHASE].to_i
26
31
  end
27
32
 
28
33
  alias frame :phase
29
34
 
30
35
  def start
31
- @fields[GFF3_START]
36
+ @start_ ||= @fields[GFF3_START].to_i
32
37
  end
33
38
 
34
39
  def end
35
- @fields[GFF3_END]
40
+ @end_ ||= @fields[GFF3_END].to_i
36
41
  end
37
42
 
38
43
  def score
39
- @fields[GFF3_SCORE]
44
+ @score_ ||= @fields[GFF3_SCORE].to_f
40
45
  end
41
46
 
42
47
  def strand
@@ -44,7 +49,7 @@ module Bio
44
49
  end
45
50
 
46
51
  def feature
47
- @fields[GFF3_TYPE]
52
+ @feature_ ||= @fields[GFF3_TYPE]
48
53
  end
49
54
 
50
55
  alias feature_type :feature
@@ -53,7 +58,7 @@ module Bio
53
58
  end
54
59
 
55
60
  def attributes
56
- @fields[GFF3_ATTRIBUTES]
61
+ @attributes_ ||= parse_attributes_fast(@fields[GFF3_ATTRIBUTES])
57
62
  end
58
63
 
59
64
  def get_attribute name
@@ -61,7 +66,7 @@ module Bio
61
66
  end
62
67
 
63
68
  def id
64
- attributes['ID']
69
+ @id_ ||= attributes['ID']
65
70
  end
66
71
 
67
72
  alias entry_id :id
@@ -1,4 +1,3 @@
1
-
2
1
  module Bio
3
2
  module GFFbrowser
4
3
 
@@ -15,7 +15,7 @@ module Bio
15
15
 
16
16
  module Gff3Sequence
17
17
 
18
- include Bio::GFFbrowser::Helpers::Error
18
+ include Bio::GFFbrowser::Helpers::Logger
19
19
 
20
20
 
21
21
  # Patch a sequence together from a Sequence string and an array
@@ -61,13 +61,13 @@ module Bio
61
61
  orf_frame = startpos - 1
62
62
  orf_frameshift = orf_frame % 3
63
63
  sectionlist = sectionlist.reverse if orf_reverse
64
- # if do_debug
64
+ if do_debug
65
65
  debug options.to_s
66
66
  debug [:reverse,do_reverse].to_s
67
67
  debug [:complement,do_complement].to_s
68
68
  debug [:trim,do_trim].to_s
69
69
  debug [:orf_reverse, orf_reverse, rec0.strand].to_s
70
- # end
70
+ end
71
71
 
72
72
  if sequence.kind_of?(Bio::FastaFormat)
73
73
  # BioRuby conversion
@@ -80,18 +80,12 @@ module Bio
80
80
  if do_reverse and orf_reverse
81
81
  s = s.reverse
82
82
  end
83
- # Correct for phase. Unfortunately the use of phase is ambiguous.
84
- # Here we check whether rec.start is in line with orf_frame. If it
85
- # is, we correct for phase. Otherwise it is ignored.
86
83
  if do_phase and rec.phase
87
84
  phase = rec.phase.to_i
88
- # if ((rec.start-startpos) % 3 == 0)
89
85
  s = s[phase..-1]
90
- # end
91
86
  end
92
87
  s
93
88
  }
94
- # p seq
95
89
  seq = seq.join
96
90
  if do_complement and do_reverse and orf_reverse
97
91
  ntseq = Bio::Sequence::NA.new(seq)
@@ -33,7 +33,7 @@ module Bio
33
33
 
34
34
  # Helper class for storing linked records based on a shared ID
35
35
  class LinkedRecs < Hash
36
- include Error
36
+ include Logger
37
37
  def add id, rec
38
38
  info "Adding #{rec.feature_type} <#{id}>"
39
39
  self[id] = [] if self[id] == nil
@@ -4,7 +4,7 @@ module Bio
4
4
 
5
5
  module Helpers
6
6
 
7
- module Error
7
+ module Logger
8
8
  include Bio::Log
9
9
 
10
10
  def debug str, id=''
@@ -27,6 +27,15 @@ module Bio
27
27
  log.error_(str+" <#{id}>",:act => FailOnError.new)
28
28
 
29
29
  end
30
+
31
+ def log_sys_info msg
32
+ log = LoggerPlus['bio-gff3']
33
+ rmem = `ps -o rss= -p #{Process.pid}`.to_i
34
+ vmem = `ps -o vsz= -p #{Process.pid}`.to_i
35
+ if rmem or vmem
36
+ log.info7 "Memory used #{msg} RAM #{rmem/1024}M, VMEM #{vmem/1024}M"
37
+ end
38
+ end
30
39
  end
31
40
  end
32
41
  end
@@ -13,10 +13,11 @@ TEST2='test/data/gff/standard.gff3'
13
13
 
14
14
  describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
15
15
 
16
+
16
17
  it "should parse a file and yield records" do
17
18
  iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
18
- iter.each_rec do | id, rec |
19
- # p [id, rec, rec.io_seek]
19
+ iter.each_rec do | fpos, line |
20
+ rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
20
21
  rec.io_seek.should == 51
21
22
  break
22
23
  end
@@ -25,8 +26,8 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
25
26
  it "should handle embedded FASTA records" do
26
27
  iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
27
28
  last = nil
28
- iter.each_rec do | id, rec |
29
- # p [id, rec]
29
+ iter.each_rec do | fpos, line |
30
+ rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
30
31
  last = rec
31
32
  end
32
33
  last.io_seek.should == 3342
@@ -54,7 +54,13 @@ describe GFF3, "GFF3 API (InMemory) with everything in memory" do
54
54
  end
55
55
 
56
56
  describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => :cache_none" do
57
- # iterators_should_be_implemented
57
+ before :all do
58
+ # initialize
59
+ gff3 = Bio::GFFbrowser::GFF3.new(TESTGFF1, :cache_components => :cache_none, :cache_records => :cache_lru)
60
+ @gff = gff3.assembler
61
+ end
62
+
63
+ iterators_should_be_implemented
58
64
  end
59
65
 
60
66
  describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => 1000" do