bio-gff3 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +88 -0
- data/VERSION +1 -1
- data/bin/gff3-fetch +11 -5
- data/bio-gff3.gemspec +1 -1
- data/lib/bio/db/gff/gffinmemory.rb +1 -0
- data/lib/bio/db/gff/gffnocache.rb +1 -0
- data/lib/bio/db/gff/gffparser.rb +37 -14
- data/spec/gff3_fileiterator_spec.rb +1 -1
- data/spec/gffdb_spec.rb +4 -2
- data/test/data/gff/test.gff3 +1 -0
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
|
|
58
58
|
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
|
59
59
|
|
60
60
|
|
61
|
+
|
62
|
+
Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format.
|
63
|
+
|
64
|
+
gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
|
65
|
+
|
66
|
+
Where (NYI == Not Yet Implemented):
|
67
|
+
|
68
|
+
--translate : output as amino acid sequence
|
69
|
+
--validate : validate GFF3 file by translating
|
70
|
+
--fix : check 3-frame translation and fix, if possible
|
71
|
+
--fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
|
72
|
+
--no-assemble : output each record as a sequence -- NYI
|
73
|
+
--add-phase : output records using phase (useful with --no-assemble CDS to AA) -- NYI
|
74
|
+
|
75
|
+
type is any valid type in the GFF3 definition. For example:
|
76
|
+
|
77
|
+
mRNA : assemble mRNA
|
78
|
+
CDS : assemble CDS
|
79
|
+
exon : list all exons
|
80
|
+
gene|ORF : list gene ORFs
|
81
|
+
other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
|
82
|
+
|
83
|
+
and the following performance options:
|
84
|
+
|
85
|
+
--cache full : load all in RAM (fast)
|
86
|
+
--cache none : do not load anything in memory (slow)
|
87
|
+
--low-mem : use LRU cache (limit RAM use, fast) -- NYI
|
88
|
+
--max-cpus num : use num threads -- NYI
|
89
|
+
--emboss : use EMBOSS translation (fast) -- NYI
|
90
|
+
|
91
|
+
Multiple GFF3 files can be used. Each GFF3 file is paired with the last
|
92
|
+
external FASTA file named before it on the command line.
|
93
|
+
|
94
|
+
Note that the above switches are only partially implemented at this stage. Full
|
95
|
+
feature support is projected for Feb. 2011.
|
96
|
+
|
97
|
+
Examples:
|
98
|
+
|
99
|
+
Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
|
100
|
+
|
101
|
+
gff3-fetch mRNA test/data/gff/test.gff3
|
102
|
+
gff3-fetch CDS test/data/gff/test.gff3
|
103
|
+
|
104
|
+
Find CDS records from external FASTA file, adding phase and translating to protein sequence
|
105
|
+
|
106
|
+
gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
|
107
|
+
|
108
|
+
Find mRNA from external FASTA file, without loading everything in RAM
|
109
|
+
|
110
|
+
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
111
|
+
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
112
|
+
|
113
|
+
Validate GFF3 file using EMBOSS translation and validation
|
114
|
+
|
115
|
+
gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
116
|
+
|
117
|
+
Find GENEID predicted terminal exons
|
118
|
+
|
119
|
+
gff3-fetch terminal chromosome1.fa geneid.gff3
|
120
|
+
|
121
|
+
== Performance
|
122
|
+
|
123
|
+
time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
|
124
|
+
|
125
|
+
Cache real user sys
|
126
|
+
----------------------------------------------------
|
127
|
+
full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
|
128
|
+
none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
|
129
|
+
----------------------------------------------------
|
130
|
+
|
131
|
+
where
|
132
|
+
|
133
|
+
52M m_hapla.WS217.dna.fa
|
134
|
+
456M m_hapla.WS217.gff3
|
135
|
+
|
136
|
+
ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
|
137
|
+
on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
|
138
|
+
|
139
|
+
== Cite
|
140
|
+
|
141
|
+
If you use this software, please cite
|
142
|
+
|
143
|
+
http://dx.doi.org/10.1093/bioinformatics/btq475
|
144
|
+
|
145
|
+
== Copyright
|
146
|
+
|
147
|
+
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
|
148
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.3
|
1
|
+
0.8.4
|
data/bin/gff3-fetch
CHANGED
@@ -26,7 +26,7 @@ USAGE = <<EOM
|
|
26
26
|
mRNA : assemble mRNA
|
27
27
|
CDS : assemble CDS
|
28
28
|
exon : list all exons
|
29
|
-
gene
|
29
|
+
gene|ORF : list gene ORFs
|
30
30
|
other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
|
31
31
|
|
32
32
|
and the following performance options:
|
@@ -167,8 +167,6 @@ opts.parse!(ARGV)
|
|
167
167
|
|
168
168
|
gfftype = ARGV.shift
|
169
169
|
|
170
|
-
raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
|
171
|
-
|
172
170
|
fastafn = nil
|
173
171
|
|
174
172
|
ARGV.each do | fn |
|
@@ -189,7 +187,15 @@ ARGV.each do | fn |
|
|
189
187
|
gff = gffdb.assembler
|
190
188
|
writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
|
191
189
|
case gfftype.downcase
|
192
|
-
when '
|
190
|
+
when 'gene'
|
191
|
+
gff.each_gene_seq do | id, seq |
|
192
|
+
writer.put(id,seq)
|
193
|
+
end
|
194
|
+
when 'orf'
|
195
|
+
gff.each_gene_seq do | id, seq |
|
196
|
+
writer.put(id,seq)
|
197
|
+
end
|
198
|
+
when 'mrna'
|
193
199
|
gff.each_mRNA_seq do | id, seq |
|
194
200
|
writer.put(id,seq)
|
195
201
|
end
|
@@ -202,7 +208,7 @@ ARGV.each do | fn |
|
|
202
208
|
writer.put(id,seq)
|
203
209
|
end
|
204
210
|
else
|
205
|
-
raise "Unknown action <#{gfftype}>"
|
211
|
+
raise "Unknown action on type <#{gfftype}>"
|
206
212
|
end
|
207
213
|
fastafn = nil
|
208
214
|
end
|
data/bio-gff3.gemspec
CHANGED
@@ -30,6 +30,7 @@ module Bio
|
|
30
30
|
@count_ids = Counter.new # Count ids
|
31
31
|
@count_seqnames = Counter.new # Count seqnames
|
32
32
|
@componentlist = {} # Store containers, like genes, contigs
|
33
|
+
@orflist = LinkedRecs.new
|
33
34
|
@mrnalist = LinkedRecs.new # Store linked mRNA records
|
34
35
|
@cdslist = LinkedRecs.new
|
35
36
|
@exonlist = LinkedRecs.new
|
@@ -90,6 +90,7 @@ module Bio
|
|
90
90
|
@count_ids = Counter.new # Count ids
|
91
91
|
@count_seqnames = Counter.new # Count seqnames
|
92
92
|
@componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
|
93
|
+
@orflist = SeekLinkedRecs.new # Store linked gene records
|
93
94
|
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
94
95
|
@cdslist = SeekLinkedRecs.new
|
95
96
|
@exonlist = SeekLinkedRecs.new
|
data/lib/bio/db/gff/gffparser.rb
CHANGED
@@ -23,24 +23,26 @@ module Bio
|
|
23
23
|
@count_ids.add(id)
|
24
24
|
@count_seqnames.add(rec.seqname)
|
25
25
|
|
26
|
-
|
26
|
+
is_component = COMPONENT_TYPES.include?(rec.feature_type)
|
27
|
+
if is_component
|
27
28
|
# check for container ID
|
28
29
|
warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
|
29
30
|
@componentlist[id] = rec
|
30
31
|
info "Added #{rec.feature_type} with component ID #{id}"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
32
|
+
end
|
33
|
+
case rec.feature_type
|
34
|
+
when 'gene' || 'SO:0000704'
|
35
|
+
@orflist.add(id,rec)
|
36
|
+
when 'mRNA' || 'SO:0000234'
|
37
|
+
@mrnalist.add(id,rec)
|
38
|
+
when 'CDS' || 'SO:0000316'
|
39
|
+
@cdslist.add(id,rec)
|
40
|
+
when 'exon' || 'SO:0000147'
|
41
|
+
@exonlist.add(id,rec)
|
42
|
+
else
|
43
|
+
if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
|
44
|
+
@unrecognized_features[rec.feature_type] = true
|
45
|
+
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
@@ -80,6 +82,12 @@ module Bio
|
|
80
82
|
# p :inmemory, @sequencelist
|
81
83
|
end
|
82
84
|
|
85
|
+
# Yield the id, recs, containing component and sequence of genes
|
86
|
+
def each_gene
|
87
|
+
parse if !@orflist
|
88
|
+
each_item(@orflist) { |id, recs, component | yield id, recs, component }
|
89
|
+
end
|
90
|
+
|
83
91
|
# Yield the id, recs, containing component and sequence of mRNAs
|
84
92
|
def each_mRNA
|
85
93
|
parse if !@mrnalist
|
@@ -98,6 +106,21 @@ module Bio
|
|
98
106
|
each_item(@exonlist) { |id, recs, component | yield id, recs, component }
|
99
107
|
end
|
100
108
|
|
109
|
+
# Yield a unique description and the sequence
|
110
|
+
def each_gene_seq
|
111
|
+
each_gene do | id, reclist, component |
|
112
|
+
if component
|
113
|
+
sequence = @sequencelist[component.seqname]
|
114
|
+
# p sequence
|
115
|
+
if sequence
|
116
|
+
yield description(id,component,reclist), assemble(sequence,component.start,reclist)
|
117
|
+
else
|
118
|
+
error "No sequence information for",id
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
101
124
|
# Yield a unique description and the sequence
|
102
125
|
def each_mRNA_seq
|
103
126
|
each_mRNA do | id, reclist, component |
|
data/spec/gffdb_spec.rb
CHANGED
@@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa'
|
|
22
22
|
def iterators_should_be_implemented
|
23
23
|
if TEST_NON_IMPLEMENTED
|
24
24
|
it "should implement each_gene"
|
25
|
-
it "should implement each_gene"
|
26
|
-
it "should implement each_gene_seq"
|
27
25
|
it "should implement each_mRNA"
|
28
26
|
it "should implement each_exon"
|
29
27
|
it "should implement each_exon_seq"
|
30
28
|
it "should implement each_CDS"
|
31
29
|
end
|
30
|
+
it "should implement each_gene_seq" do
|
31
|
+
h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
|
32
|
+
h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
|
33
|
+
end
|
32
34
|
it "should implement each_mRNA_seq" do
|
33
35
|
h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
|
34
36
|
h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"
|
data/test/data/gff/test.gff3
CHANGED
@@ -44,6 +44,7 @@ Contig4 confirmed CDS 32000 35000 . + . ID=Misc:thing3;mRNA=trans-9
|
|
44
44
|
##gff-version 3
|
45
45
|
##sequence-region test01 1 400
|
46
46
|
test01 RANDOM contig 1 400 . + . ID=test01;Note=this is test
|
47
|
+
test01 . gene 3 280 . + . ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
|
47
48
|
test01 . mRNA 3 14 . + . ID=mrna01short;Name=testmRNA;Note=this is test mRNA
|
48
49
|
test01 . mRNA 101 230 . + . ID=mrna01;Name=testmRNA;Note=this is test mRNA
|
49
50
|
test01 . mRNA 101 280 . + . ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 8
|
8
|
-
- 3
|
9
|
-
version: 0.8.3
|
8
|
+
- 4
|
9
|
+
version: 0.8.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
180
|
requirements:
|
181
181
|
- - ">="
|
182
182
|
- !ruby/object:Gem::Version
|
183
|
-
hash:
|
183
|
+
hash: 520620943
|
184
184
|
segments:
|
185
185
|
- 0
|
186
186
|
version: "0"
|