bio-gff3 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +88 -0
- data/VERSION +1 -1
- data/bin/gff3-fetch +11 -5
- data/bio-gff3.gemspec +1 -1
- data/lib/bio/db/gff/gffinmemory.rb +1 -0
- data/lib/bio/db/gff/gffnocache.rb +1 -0
- data/lib/bio/db/gff/gffparser.rb +37 -14
- data/spec/gff3_fileiterator_spec.rb +1 -1
- data/spec/gffdb_spec.rb +4 -2
- data/test/data/gff/test.gff3 +1 -0
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
|
|
58
58
|
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
|
59
59
|
|
60
60
|
|
61
|
+
|
62
|
+
Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format.
|
63
|
+
|
64
|
+
gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
|
65
|
+
|
66
|
+
Where (NYI == Not Yet Implemented):
|
67
|
+
|
68
|
+
--translate : output as amino acid sequence
|
69
|
+
--validate : validate GFF3 file by translating
|
70
|
+
--fix : check 3-frame translation and fix, if possible
|
71
|
+
--fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
|
72
|
+
--no-assemble : output each record as a sequence -- NYI
|
73
|
+
--add-phase : output records using phase (useful with --no-assemble for CDS to AA) -- NYI
|
74
|
+
|
75
|
+
type is any valid type in the GFF3 definition. For example:
|
76
|
+
|
77
|
+
mRNA : assemble mRNA
|
78
|
+
CDS : assemble CDS
|
79
|
+
exon : list all exons
|
80
|
+
gene|ORF : list gene ORFs
|
81
|
+
other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
|
82
|
+
|
83
|
+
and the following performance options:
|
84
|
+
|
85
|
+
--cache full : load all in RAM (fast)
|
86
|
+
--cache none : do not load anything in memory (slow)
|
87
|
+
--low-mem : use LRU cache (limit RAM use, fast) -- NYI
|
88
|
+
--max-cpus num : use num threads -- NYI
|
89
|
+
--emboss : use EMBOSS translation (fast) -- NYI
|
90
|
+
|
91
|
+
Multiple GFF3 files can be used. With external FASTA files, the last
|
92
|
+
one listed before the GFF3 filename is always the one matched.
|
93
|
+
|
94
|
+
Note that the above switches are only partially implemented at this stage. Full
|
95
|
+
feature support is projected for Feb. 2011.
|
96
|
+
|
97
|
+
Examples:
|
98
|
+
|
99
|
+
Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
|
100
|
+
|
101
|
+
gff3-fetch mRNA test/data/gff/test.gff3
|
102
|
+
gff3-fetch CDS test/data/gff/test.gff3
|
103
|
+
|
104
|
+
Find CDS records from an external FASTA file, adding phase and translating to protein sequence
|
105
|
+
|
106
|
+
gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
|
107
|
+
|
108
|
+
Find mRNA from an external FASTA file, without loading everything into RAM
|
109
|
+
|
110
|
+
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
111
|
+
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
112
|
+
|
113
|
+
Validate GFF3 file using EMBOSS translation and validation
|
114
|
+
|
115
|
+
gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
|
116
|
+
|
117
|
+
Find GENEID predicted terminal exons
|
118
|
+
|
119
|
+
gff3-fetch terminal chromosome1.fa geneid.gff3
|
120
|
+
|
121
|
+
== Performance
|
122
|
+
|
123
|
+
time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
|
124
|
+
|
125
|
+
Cache real user sys
|
126
|
+
----------------------------------------------------
|
127
|
+
full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
|
128
|
+
none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
|
129
|
+
----------------------------------------------------
|
130
|
+
|
131
|
+
where
|
132
|
+
|
133
|
+
52M m_hapla.WS217.dna.fa
|
134
|
+
456M m_hapla.WS217.gff3
|
135
|
+
|
136
|
+
ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
|
137
|
+
on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
|
138
|
+
|
139
|
+
== Cite
|
140
|
+
|
141
|
+
If you use this software, please cite
|
142
|
+
|
143
|
+
http://dx.doi.org/10.1093/bioinformatics/btq475
|
144
|
+
|
145
|
+
== Copyright
|
146
|
+
|
147
|
+
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
|
148
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.3
|
1
|
+
0.8.4
|
data/bin/gff3-fetch
CHANGED
@@ -26,7 +26,7 @@ USAGE = <<EOM
|
|
26
26
|
mRNA : assemble mRNA
|
27
27
|
CDS : assemble CDS
|
28
28
|
exon : list all exons
|
29
|
-
gene
|
29
|
+
gene|ORF : list gene ORFs
|
30
30
|
other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
|
31
31
|
|
32
32
|
and the following performance options:
|
@@ -167,8 +167,6 @@ opts.parse!(ARGV)
|
|
167
167
|
|
168
168
|
gfftype = ARGV.shift
|
169
169
|
|
170
|
-
raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
|
171
|
-
|
172
170
|
fastafn = nil
|
173
171
|
|
174
172
|
ARGV.each do | fn |
|
@@ -189,7 +187,15 @@ ARGV.each do | fn |
|
|
189
187
|
gff = gffdb.assembler
|
190
188
|
writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
|
191
189
|
case gfftype.downcase
|
192
|
-
when '
|
190
|
+
when 'gene'
|
191
|
+
gff.each_gene_seq do | id, seq |
|
192
|
+
writer.put(id,seq)
|
193
|
+
end
|
194
|
+
when 'orf'
|
195
|
+
gff.each_gene_seq do | id, seq |
|
196
|
+
writer.put(id,seq)
|
197
|
+
end
|
198
|
+
when 'mrna'
|
193
199
|
gff.each_mRNA_seq do | id, seq |
|
194
200
|
writer.put(id,seq)
|
195
201
|
end
|
@@ -202,7 +208,7 @@ ARGV.each do | fn |
|
|
202
208
|
writer.put(id,seq)
|
203
209
|
end
|
204
210
|
else
|
205
|
-
raise "Unknown action <#{gfftype}>"
|
211
|
+
raise "Unknown action on type <#{gfftype}>"
|
206
212
|
end
|
207
213
|
fastafn = nil
|
208
214
|
end
|
data/bio-gff3.gemspec
CHANGED
@@ -30,6 +30,7 @@ module Bio
|
|
30
30
|
@count_ids = Counter.new # Count ids
|
31
31
|
@count_seqnames = Counter.new # Count seqnames
|
32
32
|
@componentlist = {} # Store containers, like genes, contigs
|
33
|
+
@orflist = LinkedRecs.new
|
33
34
|
@mrnalist = LinkedRecs.new # Store linked mRNA records
|
34
35
|
@cdslist = LinkedRecs.new
|
35
36
|
@exonlist = LinkedRecs.new
|
@@ -90,6 +90,7 @@ module Bio
|
|
90
90
|
@count_ids = Counter.new # Count ids
|
91
91
|
@count_seqnames = Counter.new # Count seqnames
|
92
92
|
@componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
|
93
|
+
@orflist = SeekLinkedRecs.new # Store linked gene records
|
93
94
|
@mrnalist = SeekLinkedRecs.new # Store linked mRNA records
|
94
95
|
@cdslist = SeekLinkedRecs.new
|
95
96
|
@exonlist = SeekLinkedRecs.new
|
data/lib/bio/db/gff/gffparser.rb
CHANGED
@@ -23,24 +23,26 @@ module Bio
|
|
23
23
|
@count_ids.add(id)
|
24
24
|
@count_seqnames.add(rec.seqname)
|
25
25
|
|
26
|
-
|
26
|
+
is_component = COMPONENT_TYPES.include?(rec.feature_type)
|
27
|
+
if is_component
|
27
28
|
# check for container ID
|
28
29
|
warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
|
29
30
|
@componentlist[id] = rec
|
30
31
|
info "Added #{rec.feature_type} with component ID #{id}"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
32
|
+
end
|
33
|
+
case rec.feature_type
|
34
|
+
when 'gene' || 'SO:0000704'
|
35
|
+
@orflist.add(id,rec)
|
36
|
+
when 'mRNA' || 'SO:0000234'
|
37
|
+
@mrnalist.add(id,rec)
|
38
|
+
when 'CDS' || 'SO:0000316'
|
39
|
+
@cdslist.add(id,rec)
|
40
|
+
when 'exon' || 'SO:0000147'
|
41
|
+
@exonlist.add(id,rec)
|
42
|
+
else
|
43
|
+
if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
|
44
|
+
@unrecognized_features[rec.feature_type] = true
|
45
|
+
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
@@ -80,6 +82,12 @@ module Bio
|
|
80
82
|
# p :inmemory, @sequencelist
|
81
83
|
end
|
82
84
|
|
85
|
+
# Yield the id, recs and containing component of genes
|
86
|
+
def each_gene
|
87
|
+
parse if !@orflist
|
88
|
+
each_item(@orflist) { |id, recs, component | yield id, recs, component }
|
89
|
+
end
|
90
|
+
|
83
91
|
# Yield the id, recs, containing component and sequence of mRNAs
|
84
92
|
def each_mRNA
|
85
93
|
parse if !@mrnalist
|
@@ -98,6 +106,21 @@ module Bio
|
|
98
106
|
each_item(@exonlist) { |id, recs, component | yield id, recs, component }
|
99
107
|
end
|
100
108
|
|
109
|
+
# Yield a unique description and the sequence
|
110
|
+
def each_gene_seq
|
111
|
+
each_gene do | id, reclist, component |
|
112
|
+
if component
|
113
|
+
sequence = @sequencelist[component.seqname]
|
114
|
+
# p sequence
|
115
|
+
if sequence
|
116
|
+
yield description(id,component,reclist), assemble(sequence,component.start,reclist)
|
117
|
+
else
|
118
|
+
error "No sequence information for",id
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
101
124
|
# Yield a unique description and the sequence
|
102
125
|
def each_mRNA_seq
|
103
126
|
each_mRNA do | id, reclist, component |
|
data/spec/gffdb_spec.rb
CHANGED
@@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa'
|
|
22
22
|
def iterators_should_be_implemented
|
23
23
|
if TEST_NON_IMPLEMENTED
|
24
24
|
it "should implement each_gene"
|
25
|
-
it "should implement each_gene"
|
26
|
-
it "should implement each_gene_seq"
|
27
25
|
it "should implement each_mRNA"
|
28
26
|
it "should implement each_exon"
|
29
27
|
it "should implement each_exon_seq"
|
30
28
|
it "should implement each_CDS"
|
31
29
|
end
|
30
|
+
it "should implement each_gene_seq" do
|
31
|
+
h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
|
32
|
+
h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
|
33
|
+
end
|
32
34
|
it "should implement each_mRNA_seq" do
|
33
35
|
h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
|
34
36
|
h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"
|
data/test/data/gff/test.gff3
CHANGED
@@ -44,6 +44,7 @@ Contig4 confirmed CDS 32000 35000 . + . ID=Misc:thing3;mRNA=trans-9
|
|
44
44
|
##gff-version 3
|
45
45
|
##sequence-region test01 1 400
|
46
46
|
test01 RANDOM contig 1 400 . + . ID=test01;Note=this is test
|
47
|
+
test01 . gene 3 280 . + . ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
|
47
48
|
test01 . mRNA 3 14 . + . ID=mrna01short;Name=testmRNA;Note=this is test mRNA
|
48
49
|
test01 . mRNA 101 230 . + . ID=mrna01;Name=testmRNA;Note=this is test mRNA
|
49
50
|
test01 . mRNA 101 280 . + . ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 8
|
8
|
-
- 3
|
9
|
-
version: 0.8.3
|
8
|
+
- 4
|
9
|
+
version: 0.8.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
180
|
requirements:
|
181
181
|
- - ">="
|
182
182
|
- !ruby/object:Gem::Version
|
183
|
-
hash:
|
183
|
+
hash: 520620943
|
184
184
|
segments:
|
185
185
|
- 0
|
186
186
|
version: "0"
|