bio-gff3 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,40 @@
1
+ #
2
+ # = bio/db/gff/gffdb.rb - GFF database class
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+
8
+ # Create db from a GFF file
9
+
10
+ require 'bio'
11
+ require 'bio/db/gff/gfffileiterator'
12
+ require 'bio/db/gff/gfffasta'
13
+ require 'bio/db/gff/gffassemble'
14
+ require 'bio/db/gff/gffparser'
15
+ require 'bio/db/gff/gffinmemory'
16
+ require 'bio/db/gff/gffnocache'
17
+
18
module Bio
  module GFFbrowser
    # Front door for digesting a GFF file: chooses an assembler
    # implementation and exposes it through #assembler.
    class GFFdb
      # Digest is mixed in so that its NoCache and InMemory classes can be
      # referenced without a namespace prefix inside this class.
      include Digest

      attr_reader :assembler

      # Create the assembler for +filename+.
      #
      # filename:: path of the GFF file, passed on unchanged
      # options::  Hash; options[:cache_records] == :cache_none selects
      #            NoCache, any other value (or no value) selects InMemory.
      #            The whole Hash is forwarded to the chosen assembler.
      def initialize filename, options = {}
        backend = (:cache_none == options[:cache_records]) ? NoCache : InMemory # InMemory is the default
        @assembler = backend.new(filename, options)
      end

    end # GFFdb
  end # GFFbrowser
end # Bio
40
+
@@ -0,0 +1,68 @@
1
+ # = bio/db/gff/gfffasta.rb - Fetch records from a file in FASTA format
2
+ #
3
+ # Copyright:: Copyright (C) 2010
4
+ # Pjotr Prins <pjotr.prins@thebird.nl>
5
+ # License:: The Ruby License
6
+ #
7
+ # This requires a special implementation as it uses an open file and we
8
+ # retain file seek positions.
9
+
10
module Bio

  class GFF

    # Read FASTA records from an open IO and keep the sequences in a Hash
    # keyed by the record definition (the header line as reported by
    # Bio::FastaFormat#definition). Lines starting with '#' are skipped.
    #
    # Note, this implementation merely retains all records in memory
    # (FIXME: use seek positions instead).
    class FastaReader
      # fh::      an open IO positioned where reading should start; it is
      #           consumed to the end by #parse
      # io_seek:: accepted for interface compatibility; currently unused
      def initialize fh, io_seek=nil
        @fh = fh
        @h = {}
        parse
      end

      # Scan the IO line by line, collecting the sequence lines that follow
      # each '>' header and storing every completed record.
      def parse
        header = nil
        seqs = []
        @fh.each_line do | line |
          line = line.strip
          next if line.start_with?('#')   # comments and directives
          if line.start_with?('>')        # FASTA record header
            add(header, seqs)             # flush the previous record, if any
            header = line
            seqs = []
          else
            # Lines seen before the first header are collected here too and
            # dropped when that header arrives (add ignores a nil header).
            seqs << line
          end
        end
        add(header, seqs)                 # flush the final record
      end

      # Return the sequence String stored under +index+ (a record
      # definition), or nil when there is no such record.
      def [] index
        @h[index]
      end

      # Yield each record as (definition, sequence string).
      def each
        @h.each do | k,v |
          yield k, v
        end
      end

      private

      # Store one record; a nil header means no record has been started.
      def add header, seqs
        return unless header
        id, fastarec = fasta_rec(header, seqs)
        @h[id] = fastarec.data.strip
      end

      # Build a Bio::FastaFormat from the header and the collected lines.
      # Returns [definition, Bio::FastaFormat].
      def fasta_rec header, buf
        # Array#to_s returns the inspect form (brackets, quotes, commas) on
        # Ruby 1.9 and later, which corrupted the sequence; join
        # concatenates the lines as intended.
        fst = Bio::FastaFormat.new("#{header}\n#{buf.join}")
        return fst.definition, fst
      end

    end

  end # GFF
end # Bio
@@ -0,0 +1,77 @@
1
+ #
2
+ # = bio/db/gff/gfffileiterator.rb - Fetch records from a file
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+
8
module Bio

  class GFF

    class GFF3

      # A GFF3 record that also remembers the file offset of the line it
      # was read from, so that it can be fetched again with a seek.
      class FileRecord < Record
        attr_accessor :io_seek

        # io_seek:: file offset of the start of the record's line
        # buf::     the raw line, passed on to Record#initialize
        def initialize io_seek, buf
          @io_seek = io_seek
          super(buf)
        end
      end

      # GFF3::FileIterator takes a file and yields GFF3 records with their
      # seek position included in the record.
      #
      # The file handle is opened in the constructor and is not closed by
      # this class; it is exposed through #fh.
      class FileIterator
        attr_accessor :fh
        attr_reader :fasta_io_seek

        # Open +filename+ for reading.
        def initialize filename
          @fh = File.open(filename)
        end

        # Iterate over every record in the file, yielding the record ID and
        # (File)Record, which includes the io_seek position in the file.
        #
        # Blank lines and lines starting with '#' are skipped. Iteration
        # stops at a "##FASTA" line, whose offset is stored in
        # fasta_io_seek.
        def each_rec()
          # Offsets are counted from the start of the file, so always start
          # the scan there; without this a repeated call (or a call after
          # the handle was moved) produced wrong io_seek values.
          @fh.rewind
          fpos = 0
          @fh.each_line do | line |
            line = line.strip
            if line == "##FASTA"
              @fasta_io_seek = fpos
              break
            end
            if !line.empty? && line !~ /^#/
              rec = FileRecord.new(fpos, line)
              lastpos = @fh.tell
              id = rec.id
              yield id, rec
              @fh.seek(lastpos) # reset filepos, just in case it changed
            end
            fpos = @fh.tell   # offset of the next line
          end
        end

        # Iterate over all contained FASTA sequences, yielding the ID
        # and the sequence as provided by Bio::GFF::FastaReader#each.
        # Normally call each_rec first; FASTA records were found if
        # fasta_io_seek != nil afterwards.
        def each_sequence
          if @fasta_io_seek == nil
            # Find the FASTA location first, scanning forward from the
            # current file position
            @fh.each_line do | line |
              break if line.strip == "##FASTA"
            end
          else
            @fh.seek(@fasta_io_seek)
          end
          fasta = Bio::GFF::FastaReader.new(@fh)
          fasta.each do | id, fastarec |
            yield id, fastarec
          end
        end
      end
    end # GFF3
  end # GFF
end # Bio
74
+
75
+
76
+
77
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # = bio/db/gff/gffinmemory.rb - Assemble mRNA and CDS from GFF in RAM
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+ # Fetch information from a GFF file
9
+
10
module Bio
  module GFFbrowser

    module Digest

      # Assembler that reads the complete GFF3 file into memory through the
      # BioRuby parser and digests the records from there.
      class InMemory
        include Parser
        include Gff3Sequence
        attr_reader :sequencelist

        # filename:: path of the GFF3 file; it is read in full here
        # options::  Hash stored in @options for the mixed-in methods
        def initialize filename, options
          @options = options
          # Hand the whole file content to the BioRuby in-memory parser
          @gff = Bio::GFF::GFF3.new(File.read(filename))
        end

        # Digest all records of the parsed GFF into the lookup structures
        # (components, mRNA/CDS/exon lists, sequences), run the validation
        # steps, and finally call read_fasta.
        def parse
          info "---- Digest DB and store data in mRNA Hash"
          @count_ids              = Counter.new     # Count ids
          @count_seqnames         = Counter.new     # Count seqnames
          @componentlist          = {}              # Containers, like genes, contigs
          @mrnalist               = LinkedRecs.new  # Linked mRNA records
          @cdslist                = LinkedRecs.new
          @exonlist               = LinkedRecs.new
          @sequencelist           = {}
          @unrecognized_features  = {}

          @gff.records.each { | rec | store_record(rec) }

          # Sequences embedded in the GFF, stored as strings under their entry id
          @gff.sequences.each do | bioseq |
            @sequencelist[bioseq.entry_id] = bioseq.to_s
          end

          validate_mrnas
          validate_cdss
          show_unrecognized_features
          @genelist = @count_ids.keys
          read_fasta
        end

        # Walk +list+ and yield id, recs and the component found for the
        # first record of recs.
        def each_item list
          list.each do | id, recs |
            head = recs[0]
            # NOTE(review): the seqname result is not used; the call is
            # kept so behaviour (including any error on a missing first
            # record) stays as it was.
            head.seqname
            yield id, recs, find_component(head)
          end
        end

      end
    end
  end
end
@@ -0,0 +1,124 @@
1
+ #
2
+ # = bio/db/gff/gffnocache.rb - Assemble mRNA and CDS from GFF by fseek
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+ # Fetch information from a GFF file without using RAM - also check
9
+ # out the caching edition, which uses limited amounts of RAM
10
+
11
module Bio
  module GFFbrowser

    module Digest

      module NoCacheHelpers

        module SeekRec
          # Re-read one record: seek to +fpos+ in +fh+ and build a
          # FileRecord from the line found there. Returns nil when either
          # argument is nil.
          def self.fetch(fh, fpos)
            return nil if fh == nil || fpos == nil
            fh.seek(fpos)
            GFF::GFF3::FileRecord.new(fpos, fh.gets)
          end
        end

        # Record list tied to a file handle: only the seek position of each
        # record is kept, and the record is read back on access.
        class SeekRecList
          def initialize fh
            @fh = fh
            @h = {}
          end

          # Remember the seek position of +rec+ under +id+; an id may be
          # stored only once.
          def []= id, rec
            raise "id #{id} occurs twice!" if @h[id]
            @h[id] = rec.io_seek
          end

          # Fetch the record for +id+ from the file (nil for an unknown id).
          def [](id)
            SeekRec::fetch(@fh, @h[id])
          end

          # Yield every id together with its record read back from the file.
          def each
            @h.each_key { | id | yield id, self[id] }
          end
        end

        # Hash of id => Array of seek positions for linked records.
        class SeekLinkedRecs < Hash
          include Helpers::Error

          # Append the seek position of +rec+ to the list kept for +id+.
          def add id, rec
            info "Adding #{rec.feature_type} <#{id}>"
            positions = self[id]
            positions = self[id] = [] if positions == nil
            positions << rec.io_seek
          end

          # validation is switched off for NoCache
          def validate_seqname
          end

          # validation is switched off for NoCache
          def validate_nonoverlapping
          end

          # validation is switched off for NoCache
          def validate_shared_parent
          end
        end
      end

      # Assembler that stores file seek positions instead of the records
      # themselves and reads records back from the file when needed.
      class NoCache
        include Parser
        include NoCacheHelpers
        include Gff3Sequence

        # filename:: path of the GFF3 file, opened through a FileIterator
        # options::  Hash stored in @options for the mixed-in methods
        def initialize filename, options
          @filename = filename
          @options = options
          @iter = Bio::GFF::GFF3::FileIterator.new(@filename)
        end

        # parse the whole file once and store all seek locations,
        # rather than the records themselves
        def parse
          info "---- Digest DB and store data in mRNA Hash (NoCache)"
          @count_ids              = Counter.new                 # Count ids
          @count_seqnames         = Counter.new                 # Count seqnames
          @componentlist          = SeekRecList.new(@iter.fh)   # Containers, like genes, contigs
          @mrnalist               = SeekLinkedRecs.new          # Linked mRNA records
          @cdslist                = SeekLinkedRecs.new
          @exonlist               = SeekLinkedRecs.new
          @sequencelist           = {}
          @unrecognized_features  = {}

          @iter.each_rec { | _id, rec | store_record(rec) }
          @iter.each_sequence { | id, bioseq | @sequencelist[id] = bioseq.to_s }

          validate_mrnas
          validate_cdss
          show_unrecognized_features
          @genelist = @count_ids.keys
          read_fasta
        end

        # Walk +list+ (id => seek positions), read the records back from
        # the file and yield id, recs and the component found for the first
        # record.
        def each_item list
          fh = @iter.fh
          list.each do | id, io_seeklist |
            recs = io_seeklist.map { | fpos | SeekRec::fetch(fh, fpos) }
            head = recs[0]
            # NOTE(review): the seqname result is not used; the call is
            # kept so behaviour (including any error on a missing first
            # record) stays as it was.
            head.seqname
            yield id, recs, find_component(head)
          end
        end

      end
    end
  end
end
@@ -0,0 +1,154 @@
1
+ #
2
+ # = bio/db/gff/gffparser.rb - Parsing logic for GFF3 file
3
+ #
4
+ # Copyright:: Copyright (C) 2010
5
+ # Pjotr Prins <pjotr.prins@thebird.nl>
6
+ # License:: The Ruby License
7
+ #
8
+
9
module Bio
  module GFFbrowser
    module Digest

      # Parsing logic mixed into the assembler classes. The methods here
      # rely on the including class to provide #parse and #each_item and to
      # initialise the instance variables they use (@options, @count_ids,
      # @count_seqnames, @componentlist, @mrnalist, @cdslist, @exonlist,
      # @sequencelist, @unrecognized_features). Helpers such as info, warn,
      # find_component, description and assemble are defined outside this
      # file.
      module Parser

        include Bio::GFFbrowser::Helpers
        include Bio::GFFbrowser::Helpers::Error
        include Gff3Component
        include Gff3Features

        # File a single GFF record into the matching store: the component
        # list for component types, or the mRNA/CDS/exon list for those
        # feature types. Feature types that are neither handled nor listed
        # in IGNORE_FEATURES are remembered for a later warning.
        def store_record rec
          return if rec.comment # skip GFF comments
          id = Record::formatID(rec)
          @count_ids.add(id)
          @count_seqnames.add(rec.seqname)

          if COMPONENT_TYPES.include?(rec.feature_type)
            # check for container ID
            warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
            @componentlist[id] = rec
            info "Added #{rec.feature_type} with component ID #{id}"
          else
            # Each feature is accepted under its name or its 'SO:' accession.
            # The values must be comma-separated: `when 'a' || 'b'` evaluates
            # to `when 'a'`, so the accession was never matched before.
            case rec.feature_type
            when 'mRNA', 'SO:0000234'
              @mrnalist.add(id,rec)
            when 'CDS', 'SO:0000316'
              @cdslist.add(id,rec)
            when 'exon', 'SO:0000147'
              @exonlist.add(id,rec)
            else
              if !IGNORE_FEATURES.include?(rec.feature_type)
                @unrecognized_features[rec.feature_type] = true
              end
            end
          end
        end

        # Run the mRNA validations; does nothing unless @options[:validate]
        # is set.
        def validate_mrnas
          return unless @options[:validate]
          # validate gene/container/component seqname is shared
          @mrnalist.validate_seqname
          @mrnalist.validate_shared_parent
        end

        # Run the CDS validations; does nothing unless @options[:validate]
        # is set.
        def validate_cdss
          return unless @options[:validate]
          @cdslist.validate_seqname
          # validate CDS sections do not overlap
          @cdslist.validate_nonoverlapping
          # validate sections share the parent
          @cdslist.validate_shared_parent
        end

        # Warn once for every feature type that store_record could not place.
        def show_unrecognized_features
          @unrecognized_features.keys.each do | k |
            warn "Feature has no match",k if k
          end
        end

        # When @options[:fasta_filename] is given, read that FASTA file and
        # add its sequences to @sequencelist (an entry with the same id is
        # overwritten). The file is closed when the block ends.
        def read_fasta
          if @options[:fasta_filename]
            File.open(@options[:fasta_filename]) do | f |
              fasta = Bio::GFF::FastaReader.new(f)
              fasta.each do | id, fastarec |
                @sequencelist[id] = fastarec
              end
            end
          end
        end

        # Yield the id, recs, and containing component of each mRNA.
        # Triggers #parse first when nothing has been parsed yet.
        def each_mRNA
          parse if !@mrnalist
          each_item(@mrnalist) { |id, recs, component | yield id, recs, component }
        end

        # Yield the id, recs, and containing component of each CDS.
        # Triggers #parse first when nothing has been parsed yet.
        def each_CDS
          parse if !@cdslist
          each_item(@cdslist) { |id, recs, component | yield id, recs, component }
        end

        # Yield the id, recs, and containing component of each exon.
        # Triggers #parse first when nothing has been parsed yet.
        def each_exon
          parse if !@exonlist
          each_item(@exonlist) { |id, recs, component | yield id, recs, component }
        end

        # Yield a unique description and the assembled sequence for each
        # mRNA. Items without a component are skipped; items whose
        # component has no entry in @sequencelist produce a warning.
        def each_mRNA_seq
          each_mRNA do | id, reclist, component |
            if component
              sequence = @sequencelist[component.seqname]
              if sequence
                yield description(id,component,reclist), assemble(sequence,component.start,reclist)
              else
                warn "No sequence information for",id
              end
            end
          end
        end

        # Yield a unique description and the assembled sequence for each
        # CDS (assembled with :codonize => true). A sequence whose size is
        # not a multiple of 3 is reported but still yielded. Items without
        # a component are skipped; a missing sequence produces a warning.
        def each_CDS_seq
          each_CDS do | id, reclist, component |
            if component
              sequence = @sequencelist[component.seqname]
              if sequence
                seq = assemble(sequence,component.start,reclist,:codonize=>true)
                if seq.size % 3 != 0
                  p reclist # leave this in
                  # raise "CDS size #{seq.size} is not a multiple of 3! <#{seq}>"
                  warn "CDS size is not a multiple of 3",id
                end
                yield description(id,component,reclist), seq
              else
                warn "No sequence information for",id
              end
            end
          end
        end

        # Yield a unique description and the assembled sequence for each
        # exon. Items without a component are skipped; a missing sequence
        # produces a warning.
        def each_exon_seq
          each_exon do | id, reclist, component |
            if component
              sequence = @sequencelist[component.seqname]
              if sequence
                seq = assemble(sequence,component.start,reclist)
                yield description(id,component,reclist), seq
              else
                warn "No sequence information for",id
              end
            end
          end
        end
      end
    end
  end
end