bio-gff3 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ #
2
+ # = bio/db/gff/gffdb.rb - GFF database class
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+
8
+ # Create db from a GFF file
9
+
10
+ require 'bio'
11
+ require 'bio/db/gff/gfffileiterator'
12
+ require 'bio/db/gff/gfffasta'
13
+ require 'bio/db/gff/gffassemble'
14
+ require 'bio/db/gff/gffparser'
15
+ require 'bio/db/gff/gffinmemory'
16
+ require 'bio/db/gff/gffnocache'
17
+
18
+ module Bio
19
+ module GFFbrowser
20
# Entry point for working with a GFF file: chooses an assembler
# implementation from options[:cache_records] and exposes it through
# #assembler.
class GFFdb
  attr_reader :assembler

  include Digest

  # Initialize a GFF parser
  #
  # filename:: handed unchanged to the chosen assembler
  # options::  Hash; :cache_records => :cache_none selects NoCache, any
  #            other value (or no value) selects InMemory. The whole Hash
  #            is passed on to the assembler as well.
  def initialize(filename, options = {})
    backend = options[:cache_records] == :cache_none ? NoCache : InMemory # InMemory is the default
    @assembler = backend.new(filename, options)
  end
end # GFFdb
38
+ end # GFFbrowser
39
+ end # Bio
40
+
@@ -0,0 +1,68 @@
1
+ # = bio/db/gff/gfffasta.rb - Fetch records from a file in FASTA format
2
+ #
3
+ # Copyright:: Copyright (C) 2010
4
+ # Pjotr Prins <pjotr.prins@thebird.nl>
5
+ # License:: The Ruby License
6
+ #
7
+ # This requires a special implementation as it uses an open file and we
8
+ # retain file seek positions.
9
+
10
+ module Bio
11
+
12
+ class GFF
13
+
14
# Read FASTA records from an IO-like handle and keep them in a Hash keyed
# by the value Bio::FastaFormat#definition reports for each header line.
# Note, this implementation merely retains records in memory; no seek
# positions are stored (FIXME)
class FastaReader
  # fh::      object responding to each_line (File, StringIO, ...); it is
  #           consumed from its current position during construction
  # io_seek:: accepted for interface compatibility, currently unused
  def initialize fh, io_seek=nil
    @fh = fh
    @h = {}
    parse
  end

  # Consume the handle line by line. Lines starting with '#' are skipped,
  # a line starting with '>' opens a new record, and every other line is
  # collected as sequence data for the record currently open.
  def parse
    header = nil
    seqs = []
    @fh.each_line do | line |
      line = line.strip
      next if line =~ /^#/
      if line =~ /^>/ # FASTA record header
        add(header, seqs) # flush the previous record (no-op before the first header)
        header = line
        seqs = []
      else
        seqs << line
      end
    end
    add(header, seqs) # flush the last record
  end

  # Return the stored sequence string for +index+, or nil when unknown
  def [] index
    @h[index]
  end

  # Yield every (id, sequence string) pair
  def each
    @h.each do | id, sequence |
      yield id, sequence
    end
  end

  private

  # Store one record; does nothing while no header has been seen yet
  def add header, seqs
    return unless header
    id, fastarec = fasta_rec(header, seqs)
    @h[id] = fastarec.data.strip
  end

  # Build a Bio::FastaFormat from a header line and its sequence lines.
  # Returns [definition, record].
  def fasta_rec header, buf
    # Array#join concatenates the collected lines on every Ruby version.
    # Array#to_s only did so on Ruby 1.8; from 1.9 on it returns the
    # inspect form, which would put brackets and quotes into the sequence.
    fst = Bio::FastaFormat.new(header + "\n" + buf.join)
    return fst.definition, fst
  end
end
66
+
67
+ end # GFF
68
+ end # Bio
@@ -0,0 +1,77 @@
1
+ #
2
+ # = bio/db/gff/gfffileiterator.rb - Fetch records from a file
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+
8
+ module Bio
9
+
10
+ class GFF
11
+
12
+ class GFF3
13
+
14
# A Record that additionally carries the file offset it was read from.
class FileRecord < Record
  attr_accessor :io_seek

  # io_seek:: offset of the record's line in the source file
  # buf::     raw record text, passed on to the Record constructor
  def initialize(io_seek, buf)
    @io_seek = io_seek
    super(buf)
  end
end
21
+
22
# GFF3::FileIterator takes a file and yields GFF3 records with their
# seek position included in the record.
#
# The file is opened in the constructor and stays open; the handle is
# reachable through #fh for callers that seek back into the file later.
class FileIterator
  attr_accessor :fh
  attr_reader :fasta_io_seek

  # filename:: path of the GFF3 file to open
  def initialize filename
    @fh = File.open(filename)
  end

  # Iterate over every record in the file, yielding the record ID and
  # (File)Record, which includes the io_seek position in the file.
  #
  # Blank lines and lines starting with '#' are skipped. Iteration stops
  # at a "##FASTA" line, whose offset is remembered in #fasta_io_seek.
  def each_rec()
    # Offsets are counted from the start of the file, so always scan from
    # there; otherwise a repeated call (or a call after each_sequence)
    # would pair records with wrong offsets.
    @fh.rewind
    fpos = 0
    @fh.each_line do | line |
      line = line.strip
      if line == "##FASTA"
        @fasta_io_seek = fpos
        break
      end
      unless line.empty? || line =~ /^#/
        rec = FileRecord.new(fpos, line)
        lastpos = @fh.tell
        id = rec.id
        yield id, rec
        @fh.seek(lastpos) # reset filepos, just in case it changed
      end
      fpos = @fh.tell # start offset of the next line
    end
  end

  # Iterate over all contained FASTA sequences, yielding the ID
  # and sequence as a FASTA record. Normally call each_rec first and
  # you can test for existing FASTA records if fasta_io_seek != nil
  def each_sequence
    if @fasta_io_seek == nil
      # Find the FASTA location first, scanning on from the current position
      @fh.each_line do | line |
        break if line.strip == "##FASTA"
      end
    else
      @fh.seek(@fasta_io_seek)
    end
    fasta = Bio::GFF::FastaReader.new(@fh)
    fasta.each do | id, fastarec |
      yield id, fastarec
    end
  end
end
71
+ end # GFF3
72
+ end # GFF
73
+ end # Bio
74
+
75
+
76
+
77
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # = bio/db/gff/gffinmemory.rb - Assemble mRNA and CDS from GFF in RAM
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+ # Fetch information from a GFF file
9
+
10
+ module Bio
11
+ module GFFbrowser
12
+
13
+ module Digest
14
+
15
# Assembler that loads the whole GFF file through the BioRuby in-memory
# parser and keeps the records themselves in the lookup lists.
class InMemory
  include Parser
  include Gff3Sequence
  attr_reader :sequencelist

  # filename:: path of the GFF file; it is read completely into memory
  # options::  Hash kept in @options (read elsewhere through @options)
  def initialize filename, options
    @options = options
    # Invoke the BioRuby in memory parser
    @gff = Bio::GFF::GFF3.new(File.read(filename))
  end

  # Digest mRNA from the GFFdb and store in Hash
  # Next yield(id, seq) from Hash
  #
  # Resets all bookkeeping, files every parsed record through
  # store_record, copies the embedded sequences into @sequencelist, runs
  # the validations and finally calls read_fasta.
  def parse
    info "---- Digest DB and store data in mRNA Hash"
    @count_ids          = Counter.new   # Count ids
    @count_seqnames     = Counter.new   # Count seqnames
    @componentlist      = {}            # Store containers, like genes, contigs
    @mrnalist           = LinkedRecs.new   # Store linked mRNA records
    @cdslist            = LinkedRecs.new
    @exonlist           = LinkedRecs.new
    @sequencelist       = {}
    @unrecognized_features = {}
    @gff.records.each do | rec |
      store_record(rec)
    end
    @gff.sequences.each do | bioseq |
      id = bioseq.entry_id
      @sequencelist[id] = bioseq.to_s # in Bio::Sequence with contained Bio::FastaFormat
    end
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # Yield id, records and the component found for the first record of
  # every entry in +list+.
  #
  # The previously assigned but never used local `seqid` has been dropped.
  def each_item list
    list.each do | id, recs |
      component = find_component(recs[0])
      yield id, recs, component
    end
  end
end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,124 @@
1
+ #
2
+ # = bio/db/gff/gffnocache.rb - Assemble mRNA and CDS from GFF by fseek
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+ # Fetch information from a GFF file without using RAM - also check
9
+ # out the caching edition, which uses limited amounts of RAM
10
+
11
+ module Bio
12
+ module GFFbrowser
13
+
14
+ module Digest
15
+
16
+ module NoCacheHelpers
17
+
18
module SeekRec
  # Fetch a record using fh and file seek position.
  # Returns nil when either argument is nil; otherwise seeks to +fpos+,
  # reads one line and wraps it in a FileRecord carrying that offset.
  def self.fetch(fh, fpos)
    return nil if fh.nil? || fpos.nil?
    fh.seek(fpos)
    line = fh.gets
    GFF::GFF3::FileRecord.new(fpos, line)
  end
end
26
+
27
# Record list tied to an open file: stores only the seek position of each
# record and re-reads the record from the file on every lookup.
class SeekRecList
  # fh:: handle that lookups seek into
  def initialize(fh)
    @fh = fh
    @h = {}
  end

  # Remember the seek position of +rec+ under +id+.
  # Raises a RuntimeError when the id has been stored before.
  def []=(id, rec)
    raise "id #{id} occurs twice!" if @h[id]
    @h[id] = rec.io_seek
  end

  # Fetch the record stored under +id+ through SeekRec::fetch
  def [](id)
    SeekRec.fetch(@fh, @h[id])
  end

  # Yield every id together with its freshly fetched record
  def each
    @h.each_key { |id| yield id, self[id] }
  end
end
51
+
52
# Hash from id to the list of seek positions of the records sharing it.
class SeekLinkedRecs < Hash
  include Helpers::Error

  # Append the seek position of +rec+ to the list kept for +id+,
  # creating that list on first use.
  def add(id, rec)
    info "Adding #{rec.feature_type} <#{id}>"
    positions = self[id]
    if positions.nil?
      positions = []
      self[id] = positions
    end
    positions << rec.io_seek
  end

  # validation is switched off for NoCache
  def validate_seqname; end

  # validation is switched off for NoCache
  def validate_nonoverlapping; end

  # validation is switched off for NoCache
  def validate_shared_parent; end
end
69
+ end
70
+
71
# Assembler that keeps only file seek positions and re-reads records
# from the file whenever they are needed.
class NoCache
  include Parser
  include NoCacheHelpers
  include Gff3Sequence

  # filename:: path of the GFF file; opened through FileIterator
  # options::  Hash kept in @options (read elsewhere through @options)
  def initialize filename, options
    @filename = filename
    @options = options
    @iter = Bio::GFF::GFF3::FileIterator.new(@filename)
  end

  # parse the whole file once and store all seek locations,
  # rather than the records themselves
  def parse
    info "---- Digest DB and store data in mRNA Hash (NoCache)"
    @count_ids          = Counter.new   # Count ids
    @count_seqnames     = Counter.new   # Count seqnames
    @componentlist      = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
    @mrnalist           = SeekLinkedRecs.new   # Store linked mRNA records
    @cdslist            = SeekLinkedRecs.new
    @exonlist           = SeekLinkedRecs.new
    @sequencelist       = {}
    @unrecognized_features = {}
    @iter.each_rec do | id, rec |
      store_record(rec)
    end
    @iter.each_sequence do | id, bioseq |
      @sequencelist[id] = bioseq.to_s
    end
    validate_mrnas
    validate_cdss
    show_unrecognized_features
    @genelist = @count_ids.keys
    read_fasta
  end

  # Yield id, records and the component found for the first record of
  # every entry in +list+, which maps ids to lists of seek positions.
  # Records are re-read from the file for each entry.
  #
  # The previously assigned but never used local `seqid` has been dropped.
  def each_item list
    fh = @iter.fh
    list.each do | id, io_seeklist |
      recs = io_seeklist.map { | fpos | SeekRec::fetch(fh, fpos) }
      component = find_component(recs[0])
      yield id, recs, component
    end
  end
end
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,154 @@
1
+ #
2
+ # = bio/db/gff/gffparser.rb - Parsing logic for GFF3 file
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+
9
+ module Bio
10
+ module GFFbrowser
11
+ module Digest
12
+
13
+ module Parser
14
+
15
+ include Bio::GFFbrowser::Helpers
16
+ include Bio::GFFbrowser::Helpers::Error
17
+ include Gff3Component
18
+ include Gff3Features
19
+
20
# File one parsed record into the matching bookkeeping structure.
#
# Comment records are skipped. Every other record is counted by its
# formatted ID and by seqname, then stored by feature type:
# * a type listed in COMPONENT_TYPES goes into @componentlist
# * mRNA, CDS and exon (or the SO accession paired with each below) are
#   added to @mrnalist, @cdslist and @exonlist
# * any other type not listed in IGNORE_FEATURES is remembered in
#   @unrecognized_features
def store_record rec
  return if rec.comment # skip GFF comments
  id = Record.formatID(rec)
  @count_ids.add(id)
  @count_seqnames.add(rec.seqname)

  feature = rec.feature_type
  if COMPONENT_TYPES.include?(feature)
    # check for container ID
    warn("Container <#{feature}> has no ID, so using sequence name instead", id) if rec.id.nil?
    @componentlist[id] = rec
    info "Added #{feature} with component ID #{id}"
  else
    # Alternatives of a `when` are separated by commas. The former
    # `'mRNA' || 'SO:0000234'` evaluated to just 'mRNA', so the SO
    # accessions were never matched.
    case feature
    when 'mRNA', 'SO:0000234'
      @mrnalist.add(id, rec)
    when 'CDS', 'SO:0000316'
      @cdslist.add(id, rec)
    when 'exon', 'SO:0000147'
      @exonlist.add(id, rec)
    else
      @unrecognized_features[feature] = true unless IGNORE_FEATURES.include?(feature)
    end
  end
end
46
+
47
# Run the mRNA validations; does nothing unless @options[:validate] is set.
def validate_mrnas
  if @options[:validate]
    # validate gene/container/component seqname is shared
    @mrnalist.validate_seqname
    @mrnalist.validate_shared_parent
  end
end
53
+
54
# Run the CDS validations; does nothing unless @options[:validate] is set.
def validate_cdss
  if @options[:validate]
    @cdslist.validate_seqname
    # validate CDS sections do not overlap
    @cdslist.validate_nonoverlapping
    # validate sections share the parent
    @cdslist.validate_shared_parent
  end
end
63
+
64
# Emit one warning for every feature type collected in
# @unrecognized_features; nil or false keys are passed over.
def show_unrecognized_features
  feature_types = @unrecognized_features.keys
  feature_types.each do | feature_type |
    next unless feature_type
    warn "Feature has no match", feature_type
  end
end
69
+
70
# When @options[:fasta_filename] is set, read that file through
# Bio::GFF::FastaReader and store every record in @sequencelist by id.
# Without the option nothing happens.
def read_fasta
  fasta_path = @options[:fasta_filename]
  return unless fasta_path

  File.open(fasta_path) do | io |
    Bio::GFF::FastaReader.new(io).each do | id, fastarec |
      @sequencelist[id] = fastarec
    end
  end
end
82
+
83
# Yield the id, recs, and containing component of every mRNA entry.
# Triggers parse first when the mRNA list has not been built yet.
def each_mRNA
  parse unless @mrnalist
  each_item(@mrnalist) do | mrna_id, records, container |
    yield mrna_id, records, container
  end
end
88
+
89
# Yield the id, recs, and containing component of every CDS entry.
# Triggers parse first when the CDS list has not been built yet.
def each_CDS
  parse unless @cdslist
  each_item(@cdslist) do | cds_id, records, container |
    yield cds_id, records, container
  end
end
94
+
95
# Yield the id, recs, and containing component of every exon entry.
# Triggers parse first when the exon list has not been built yet.
def each_exon
  parse unless @exonlist
  each_item(@exonlist) do | exon_id, records, container |
    yield exon_id, records, container
  end
end
100
+
101
# Yield a unique description and the assembled sequence of every mRNA.
# Entries without a component are passed over silently; entries whose
# component has no sequence in @sequencelist produce a warning instead.
def each_mRNA_seq
  each_mRNA do | mrna_id, records, container |
    next unless container

    source = @sequencelist[container.seqname]
    unless source
      warn "No sequence information for", mrna_id
      next
    end

    yield description(mrna_id, container, records), assemble(source, container.start, records)
  end
end
115
+
116
# Yield a unique description and the assembled sequence of every CDS.
# The sequence is assembled with :codonize => true. A length that is not
# a multiple of 3 is reported (records printed, warning issued) but the
# sequence is yielded all the same. Entries without a component are
# passed over; a missing source sequence produces a warning.
def each_CDS_seq
  each_CDS do | cds_id, records, container |
    next unless container

    source = @sequencelist[container.seqname]
    unless source
      warn "No sequence information for", cds_id
      next
    end

    coding = assemble(source, container.start, records, :codonize=>true)
    unless (coding.size % 3).zero?
      p records # leave this in
      # raise "CDS size #{seq.size} is not a multiple of 3! <#{seq}>"
      warn "CDS size is not a multiple of 3", cds_id
    end
    yield description(cds_id, container, records), coding
  end
end
136
+
137
# Yield a unique description and the assembled sequence of every exon
# entry. Entries without a component are passed over silently; entries
# whose component has no sequence in @sequencelist produce a warning.
def each_exon_seq
  each_exon do | exon_id, records, container |
    next unless container

    source = @sequencelist[container.seqname]
    unless source
      warn "No sequence information for", exon_id
      next
    end

    spliced = assemble(source, container.start, records)
    yield description(exon_id, container, records), spliced
  end
end
151
+ end
152
+ end
153
+ end
154
+ end