bio-gff3 0.8.5 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,19 +21,24 @@ module Bio
21
21
  # record
22
22
  module SeekRec
23
23
  # Fetch a record using fh and file seek position
24
- def SeekRec::fetch(fh,fpos)
24
+ def SeekRec::fetch(fh,fpos,parser)
25
25
  return nil if fh==nil or fpos==nil
26
26
  fh.seek(fpos)
27
- GFF::GFF3::FileRecord.new(fpos, fh.gets)
27
+ if parser == :bioruby
28
+ GFF::GFF3::BioRubyFileRecord.new(fpos, fh.gets)
29
+ else
30
+ GFF::GFF3::FastParserFileRecord.new(fpos, fh.gets)
31
+ end
28
32
  end
29
33
  end
30
34
 
31
35
  # Helper class which gives Hash-like access to the
32
36
  # no-cache GFF3 file
33
37
  class SeekRecList
34
- def initialize fh
38
+ def initialize fh, parser
35
39
  @fh = fh
36
40
  @h = {}
41
+ @parser = parser
37
42
  end
38
43
 
39
44
  def []= id, rec
@@ -44,7 +49,7 @@ module Bio
44
49
 
45
50
  def [](id)
46
51
  fpos = @h[id]
47
- SeekRec::fetch(@fh,fpos)
52
+ SeekRec::fetch(@fh,fpos,@parser)
48
53
  end
49
54
 
50
55
  def each
@@ -56,7 +61,7 @@ module Bio
56
61
 
57
62
  # List of ids
58
63
  class SeekLinkedRecs < Hash
59
- include Helpers::Error
64
+ include Helpers::Logger
60
65
  def add id, rec
61
66
  info "Adding #{rec.feature_type} <#{id}>"
62
67
  self[id] = [] if self[id] == nil
@@ -91,14 +96,22 @@ module Bio
91
96
  info "---- Digest DB and store data in mRNA Hash (NoCache)"
92
97
  @count_ids = Counter.new # Count ids
93
98
  @count_seqnames = Counter.new # Count seqnames
94
- @componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
99
+ @componentlist = SeekRecList.new(@iter.fh,@options[:parser]) # Store containers, like genes, contigs
95
100
  @orflist = SeekLinkedRecs.new # Store linked gene records
96
101
  @mrnalist = SeekLinkedRecs.new # Store linked mRNA records
97
102
  @cdslist = SeekLinkedRecs.new
98
103
  @exonlist = SeekLinkedRecs.new
99
104
  @sequencelist = {}
100
105
  @unrecognized_features = {}
101
- @iter.each_rec do | id, rec |
106
+ @iter.each_rec do |fpos, line|
107
+ rec = case @options[:parser]
108
+ when :bioruby
109
+ Bio::GFF::GFF3::BioRubyFileRecord.new(fpos, line)
110
+ when :line
111
+ Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
112
+ else
113
+ raise 'Unknown parser'
114
+ end
102
115
  store_record(rec)
103
116
  end
104
117
  @iter.each_sequence do | id, bioseq |
@@ -117,11 +130,17 @@ module Bio
117
130
  list.each do | id, io_seeklist |
118
131
  recs = []
119
132
  io_seeklist.each do | fpos |
120
- recs << SeekRec::fetch(fh,fpos)
133
+ recs << SeekRec::fetch(fh,fpos,@options[:parser])
121
134
  end
122
135
  seqid = recs[0].seqname
123
136
  component = find_component(recs[0])
124
- yield id, recs, component
137
+ if @options[:no_assemble]
138
+ recs.each do | rec |
139
+ yield id, [rec], component
140
+ end
141
+ else
142
+ yield id, recs, component
143
+ end
125
144
  end
126
145
  end
127
146
 
@@ -19,7 +19,7 @@ module Bio
19
19
 
20
20
  include Bio::GFFbrowser::Helpers
21
21
  include Bio::GFFbrowser::Helpers::Validate
22
- include Bio::GFFbrowser::Helpers::Error
22
+ include Bio::GFFbrowser::Helpers::Logger
23
23
  include Gff3Component
24
24
  include Gff3Features
25
25
 
@@ -13,7 +13,7 @@ module Bio
13
13
 
14
14
  # FileRecord inherits from the BioRuby Record, but
15
15
  # adds the file seek position.
16
- class FileRecord < Record
16
+ class BioRubyFileRecord < Record
17
17
  attr_accessor :io_seek
18
18
  def initialize io_seek, buf
19
19
  @io_seek = io_seek
@@ -21,6 +21,16 @@ module Bio
21
21
  end
22
22
  end
23
23
 
24
+ class FastParserFileRecord < GFFbrowser::FastLineRecord
25
+ attr_accessor :io_seek
26
+
27
+ include Bio::GFFbrowser::FastLineParser
28
+ def initialize io_seek, buf
29
+ @io_seek = io_seek
30
+ super(parse_line_fast(buf))
31
+ end
32
+ end
33
+
24
34
  # GFF3::FileIterator takes a file and yields GFF3 records with their
25
35
  # seek position included in the record.
26
36
  class FileIterator
@@ -31,9 +41,10 @@ module Bio
31
41
  @fh = File.open(filename)
32
42
  end
33
43
 
34
- # Iterate over every record in the file, yielding the record ID and
35
- # (File)Record, which includes the io_seek position in the file
36
- def each_rec()
44
+ # Iterate over every record in the file, yielding the seekpos
45
+ # and line containing the record
46
+ def each_rec
47
+ @fh.seek(0)
37
48
  fpos = 0
38
49
  @fh.each_line do | line |
39
50
  line = line.strip
@@ -42,10 +53,8 @@ module Bio
42
53
  break
43
54
  end
44
55
  if line.size != 0 and line !~ /^#/
45
- rec = FileRecord.new(fpos, line)
46
56
  lastpos = @fh.tell
47
- id = rec.id
48
- yield id, rec
57
+ yield fpos, line
49
58
  @fh.seek(lastpos) # reset filepos, just in case it changed
50
59
  end
51
60
  fpos = @fh.tell
@@ -9,6 +9,8 @@
9
9
 
10
10
  require 'bio/db/gff/digest/gffinmemory'
11
11
  require 'bio/db/gff/digest/gffnocache'
12
+ require 'bio/db/gff/digest/gfflrucache'
13
+ require 'bio/db/gff/block/gffblockparser'
12
14
 
13
15
  module Bio
14
16
  module GFFbrowser
@@ -16,16 +18,24 @@ module Bio
16
18
  attr_reader :assembler
17
19
 
18
20
  include Digest
21
+ include Block
19
22
 
20
23
  # Initialize a GFF parser
21
24
  def initialize filename, options = {}
25
+ options[:parser] = :line if options[:parser] == nil
22
26
  cache_recs = options[:cache_records]
23
27
  @assembler =
24
- case cache_recs
25
- when :cache_none
26
- NoCache.new(filename, options)
27
- else
28
- InMemory.new(filename, options) # default
28
+ if options[:block]
29
+ GffBlockParser.new(filename, options)
30
+ else
31
+ case cache_recs
32
+ when :cache_none
33
+ NoCache.new(filename, options)
34
+ when :cache_lru
35
+ LruCache.new(filename, options)
36
+ else
37
+ InMemory.new(filename, options) # default
38
+ end
29
39
  end
30
40
  end
31
41
 
@@ -24,7 +24,7 @@ module Bio
24
24
  #
25
25
  module FastLineParser
26
26
 
27
- include Helpers::Error
27
+ include Helpers::Logger
28
28
 
29
29
  # Returns a (partial) record, assuming it is a valid GFF3
30
30
  # format, no validation takes place, other than field counting (!)
@@ -44,11 +44,6 @@ module Bio
44
44
  return nil
45
45
  end
46
46
 
47
- fs[GFF3_START] = fs[GFF3_START].to_i
48
- fs[GFF3_END] = fs[GFF3_END].to_i
49
- fs[GFF3_SCORE] = fs[GFF3_SCORE].to_f
50
- fs[GFF3_PHASE] = fs[GFF3_PHASE].to_i
51
- fs[GFF3_ATTRIBUTES] = parse_attributes_fast(fs[GFF3_ATTRIBUTES],options)
52
47
  fs
53
48
  end
54
49
 
@@ -7,13 +7,15 @@
7
7
  #
8
8
  # Fetch information from a GFF file
9
9
 
10
+ require 'set'
11
+
10
12
  module Bio
11
13
  module GFFbrowser
12
14
 
13
15
  module Helpers
14
16
 
15
17
  module Record
16
- include Error
18
+ include Logger
17
19
  # Format a record ID by, first, getting the ID attribute. If that fails
18
20
  # the seqname is used with the start/stop positions.
19
21
  def Record::formatID rec
@@ -33,11 +35,11 @@ module Bio
33
35
 
34
36
  module Gff3Component
35
37
 
36
- include Error
38
+ include Logger
37
39
 
38
- COMPONENT_TYPES = %w{
40
+ COMPONENT_TYPES = Set.new(%w{
39
41
  gene SO:0000704 contig transcript Component region
40
- }
42
+ })
41
43
 
42
44
  # Walk the component list to find a matching component/container for a
43
45
  # record. First use the parent ID. If that is missing go by sequence
@@ -80,13 +82,13 @@ module Bio
80
82
  module Gff3Features
81
83
 
82
84
  # Ignore the following features (case sensitive?)
83
- IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
85
+ IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + Set.new(%w{
84
86
  transposon Match similarity UTR
85
87
  TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
86
88
  polyA_site SO:0000553
87
89
  five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
88
90
  exon SO:0000147
89
- }
91
+ })
90
92
  end
91
93
 
92
94
  end
@@ -1,3 +1,5 @@
1
+ require 'bio/db/gff/gff3parserec'
2
+
1
3
  module Bio
2
4
  module GFFbrowser
3
5
 
@@ -7,6 +9,9 @@ module Bio
7
9
 
8
10
  # Using the fast line parser
9
11
  class FastLineRecord < Record
12
+
13
+ include FastLineParser
14
+
10
15
  def initialize fields
11
16
  @fields = fields
12
17
  end
@@ -16,27 +21,27 @@ module Bio
16
21
  end
17
22
 
18
23
  def seqid
19
- @fields[GFF3_SEQID]
24
+ @seqid_ ||= @fields[GFF3_SEQID]
20
25
  end
21
26
 
22
27
  alias seqname :seqid
23
28
 
24
29
  def phase
25
- @fields[GFF3_PHASE]
30
+ @phase_ ||= @fields[GFF3_PHASE].to_i
26
31
  end
27
32
 
28
33
  alias frame :phase
29
34
 
30
35
  def start
31
- @fields[GFF3_START]
36
+ @start_ ||= @fields[GFF3_START].to_i
32
37
  end
33
38
 
34
39
  def end
35
- @fields[GFF3_END]
40
+ @end_ ||= @fields[GFF3_END].to_i
36
41
  end
37
42
 
38
43
  def score
39
- @fields[GFF3_SCORE]
44
+ @score_ ||= @fields[GFF3_SCORE].to_f
40
45
  end
41
46
 
42
47
  def strand
@@ -44,7 +49,7 @@ module Bio
44
49
  end
45
50
 
46
51
  def feature
47
- @fields[GFF3_TYPE]
52
+ @feature_ ||= @fields[GFF3_TYPE]
48
53
  end
49
54
 
50
55
  alias feature_type :feature
@@ -53,7 +58,7 @@ module Bio
53
58
  end
54
59
 
55
60
  def attributes
56
- @fields[GFF3_ATTRIBUTES]
61
+ @attributes_ ||= parse_attributes_fast(@fields[GFF3_ATTRIBUTES])
57
62
  end
58
63
 
59
64
  def get_attribute name
@@ -61,7 +66,7 @@ module Bio
61
66
  end
62
67
 
63
68
  def id
64
- attributes['ID']
69
+ @id_ ||= attributes['ID']
65
70
  end
66
71
 
67
72
  alias entry_id :id
@@ -1,4 +1,3 @@
1
-
2
1
  module Bio
3
2
  module GFFbrowser
4
3
 
@@ -15,7 +15,7 @@ module Bio
15
15
 
16
16
  module Gff3Sequence
17
17
 
18
- include Bio::GFFbrowser::Helpers::Error
18
+ include Bio::GFFbrowser::Helpers::Logger
19
19
 
20
20
 
21
21
  # Patch a sequence together from a Sequence string and an array
@@ -61,13 +61,13 @@ module Bio
61
61
  orf_frame = startpos - 1
62
62
  orf_frameshift = orf_frame % 3
63
63
  sectionlist = sectionlist.reverse if orf_reverse
64
- # if do_debug
64
+ if do_debug
65
65
  debug options.to_s
66
66
  debug [:reverse,do_reverse].to_s
67
67
  debug [:complement,do_complement].to_s
68
68
  debug [:trim,do_trim].to_s
69
69
  debug [:orf_reverse, orf_reverse, rec0.strand].to_s
70
- # end
70
+ end
71
71
 
72
72
  if sequence.kind_of?(Bio::FastaFormat)
73
73
  # BioRuby conversion
@@ -80,18 +80,12 @@ module Bio
80
80
  if do_reverse and orf_reverse
81
81
  s = s.reverse
82
82
  end
83
- # Correct for phase. Unfortunately the use of phase is ambiguous.
84
- # Here we check whether rec.start is in line with orf_frame. If it
85
- # is, we correct for phase. Otherwise it is ignored.
86
83
  if do_phase and rec.phase
87
84
  phase = rec.phase.to_i
88
- # if ((rec.start-startpos) % 3 == 0)
89
85
  s = s[phase..-1]
90
- # end
91
86
  end
92
87
  s
93
88
  }
94
- # p seq
95
89
  seq = seq.join
96
90
  if do_complement and do_reverse and orf_reverse
97
91
  ntseq = Bio::Sequence::NA.new(seq)
@@ -33,7 +33,7 @@ module Bio
33
33
 
34
34
  # Helper class for storing linked records based on a shared ID
35
35
  class LinkedRecs < Hash
36
- include Error
36
+ include Logger
37
37
  def add id, rec
38
38
  info "Adding #{rec.feature_type} <#{id}>"
39
39
  self[id] = [] if self[id] == nil
@@ -4,7 +4,7 @@ module Bio
4
4
 
5
5
  module Helpers
6
6
 
7
- module Error
7
+ module Logger
8
8
  include Bio::Log
9
9
 
10
10
  def debug str, id=''
@@ -27,6 +27,15 @@ module Bio
27
27
  log.error_(str+" <#{id}>",:act => FailOnError.new)
28
28
 
29
29
  end
30
+
31
+ def log_sys_info msg
32
+ log = LoggerPlus['bio-gff3']
33
+ rmem = `ps -o rss= -p #{Process.pid}`.to_i
34
+ vmem = `ps -o vsz= -p #{Process.pid}`.to_i
35
+ if rmem or vmem
36
+ log.info7 "Memory used #{msg} RAM #{rmem/1024}M, VMEM #{vmem/1024}M"
37
+ end
38
+ end
30
39
  end
31
40
  end
32
41
  end
@@ -13,10 +13,11 @@ TEST2='test/data/gff/standard.gff3'
13
13
 
14
14
  describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
15
15
 
16
+
16
17
  it "should parse a file and yield records" do
17
18
  iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
18
- iter.each_rec do | id, rec |
19
- # p [id, rec, rec.io_seek]
19
+ iter.each_rec do | fpos, line |
20
+ rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
20
21
  rec.io_seek.should == 51
21
22
  break
22
23
  end
@@ -25,8 +26,8 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
25
26
  it "should handle embedded FASTA records" do
26
27
  iter = Bio::GFF::GFF3::FileIterator.new(TEST1)
27
28
  last = nil
28
- iter.each_rec do | id, rec |
29
- # p [id, rec]
29
+ iter.each_rec do | fpos, line |
30
+ rec = Bio::GFF::GFF3::FastParserFileRecord.new(fpos, line)
30
31
  last = rec
31
32
  end
32
33
  last.io_seek.should == 3342
@@ -54,7 +54,13 @@ describe GFF3, "GFF3 API (InMemory) with everything in memory" do
54
54
  end
55
55
 
56
56
  describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => :cache_none" do
57
- # iterators_should_be_implemented
57
+ before :all do
58
+ # initialize
59
+ gff3 = Bio::GFFbrowser::GFF3.new(TESTGFF1, :cache_components => :cache_none, :cache_records => :cache_lru)
60
+ @gff = gff3.assembler
61
+ end
62
+
63
+ iterators_should_be_implemented
58
64
  end
59
65
 
60
66
  describe GFF3, "GFF3 API with :cache_components => 1000, :cache_records => 1000" do