bio-gff3 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bin/gff3-fetch +12 -1
- data/bio-gff3.gemspec +1 -1
- data/lib/bio/db/gff/gffassemble.rb +42 -6
- data/lib/bio/output/gfffastawriter.rb +2 -2
- data/spec/gff3_assemble3_spec.rb +20 -1
- data/spec/gff3_assemble_spec.rb +2 -2
- data/test/data/gff/test-cds.gff3 +15 -0
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.2
|
1
|
+
0.8.3
|
data/bin/gff3-fetch
CHANGED
@@ -16,9 +16,10 @@ USAGE = <<EOM
|
|
16
16
|
|
17
17
|
--translate : output as amino acid sequence
|
18
18
|
--validate : validate GFF3 file by translating
|
19
|
+
--fix : check 3-frame translation and fix, if possible
|
20
|
+
--fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
|
19
21
|
--no-assemble : output each record as a sequence -- NYI
|
20
22
|
--add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI
|
21
|
-
--fix : check 3-frame translation and fix, if possible -- NYI
|
22
23
|
|
23
24
|
type is any valid type in the GFF3 definition. For example:
|
24
25
|
|
@@ -142,6 +143,14 @@ opts = OptionParser.new() { |opts|
|
|
142
143
|
$stop_on_error = true # replace global in near future
|
143
144
|
end
|
144
145
|
|
146
|
+
opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v|
|
147
|
+
options.fix = true
|
148
|
+
end
|
149
|
+
|
150
|
+
opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v|
|
151
|
+
options.fix_wormbase = true
|
152
|
+
end
|
153
|
+
|
145
154
|
# opts.on("-q", "--quiet", "Run quietly") do |q|
|
146
155
|
# options.quiet = q
|
147
156
|
# end
|
@@ -174,6 +183,8 @@ ARGV.each do | fn |
|
|
174
183
|
opts[:cache_components] = options.cache
|
175
184
|
opts[:cache_records] = options.cache
|
176
185
|
opts[:fasta_filename] = fastafn if fastafn
|
186
|
+
opts[:fix_wormbase] = options.fix_wormbase
|
187
|
+
opts[:fix] = options.fix
|
177
188
|
gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
|
178
189
|
gff = gffdb.assembler
|
179
190
|
writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
|
data/bio-gff3.gemspec
CHANGED
data/lib/bio/db/gff/gffassemble.rb
CHANGED
@@ -206,16 +206,20 @@ module Bio
|
|
206
206
|
# :reverse : do reverse if reverse is indicated (default true)
|
207
207
|
# :complement : do complement if reverse is indicated (default true)
|
208
208
|
# :phase : do set CDS phase (default false, normally ignore)
|
209
|
-
# :trim : make sure sequence is multiple of 3 nucleotide bps (default
|
209
|
+
# :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
|
210
210
|
#
|
211
|
-
#
|
211
|
+
# special options:
|
212
212
|
#
|
213
213
|
# :raw : raw sequence (all above false)
|
214
|
-
# :codonize : codon sequence (reverse, complement and trim are true)
|
214
|
+
# :codonize : codon sequence (reverse, complement, and trim are true)
|
215
|
+
# :fix : fix errors (default false)
|
215
216
|
#
|
216
|
-
def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>
|
217
|
+
def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
|
218
|
+
# default to nil, if not passed in
|
217
219
|
do_debug = options[:debug]
|
218
220
|
do_phase = options[:phase]
|
221
|
+
do_fix = options[:fix]
|
222
|
+
# default to true, if not passed in
|
219
223
|
do_reverse = (options[:reverse] == false ? false : true)
|
220
224
|
do_trim = (options[:trim] == false ? false : true)
|
221
225
|
do_complement = (options[:complement] == false ? false : true)
|
@@ -274,18 +278,50 @@ module Bio
|
|
274
278
|
ntseq = Bio::Sequence::NA.new(seq)
|
275
279
|
seq = ntseq.forward_complement.upcase
|
276
280
|
end
|
281
|
+
# This is the place to fix sequences (e.g. the Wormbase bug)
|
282
|
+
if do_fix or @options[:fix] or @options[:fix_wormbase]
|
283
|
+
if @options[:fix_wormbase] and rec0.id.index('gene1')==0
|
284
|
+
# Wormbase gene1 only, so ignore rest
|
285
|
+
else
|
286
|
+
test_frame = 0
|
287
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
288
|
+
aaseq = ntseq.translate
|
289
|
+
if aaseq.count('*') > 1
|
290
|
+
test_frame = 1
|
291
|
+
seq = seq[1..-1]
|
292
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
293
|
+
aaseq = ntseq.translate
|
294
|
+
if aaseq.count('*') > 1
|
295
|
+
test_frame = 2
|
296
|
+
seq = seq[1..-1]
|
297
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
298
|
+
aaseq = ntseq.translate
|
299
|
+
raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
|
300
|
+
end
|
301
|
+
end
|
302
|
+
if test_frame > 0
|
303
|
+
warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
|
304
|
+
end
|
305
|
+
end
|
306
|
+
end
|
277
307
|
if do_trim
|
278
308
|
reduce = seq.size % 3
|
279
309
|
seq = seq[0..(seq.size-1-reduce)] if reduce != 0
|
280
310
|
end
|
311
|
+
if @options[:validate]
|
312
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
313
|
+
aaseq = ntseq.translate
|
314
|
+
raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
|
315
|
+
end
|
316
|
+
|
281
317
|
retval = seq
|
282
318
|
retval
|
283
319
|
end
|
284
320
|
|
285
321
|
# Patch a sequence together from a Sequence string and an array
|
286
322
|
# of records and translate in the correct direction and frame. The options
|
287
|
-
# are the same as for +assemble
|
288
|
-
def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>
|
323
|
+
# are the same as for +assemble+, except :trim defaults to true.
|
324
|
+
def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
|
289
325
|
seq = assemble(sequence, startpos, reclist, options)
|
290
326
|
ntseq = Bio::Sequence::NA.new(seq)
|
291
327
|
ntseq.translate
|
data/lib/bio/output/gfffastawriter.rb
CHANGED
@@ -10,11 +10,11 @@ module Bio
|
|
10
10
|
|
11
11
|
def put id, seq
|
12
12
|
puts '>'+id
|
13
|
-
put_seq seq
|
13
|
+
put_seq id, seq
|
14
14
|
end
|
15
15
|
private
|
16
16
|
|
17
|
-
def put_seq seq
|
17
|
+
def put_seq id, seq
|
18
18
|
if @do_translate or @do_validate
|
19
19
|
ntseq = Bio::Sequence::NA.new(seq)
|
20
20
|
aaseq = ntseq.translate
|
data/spec/gff3_assemble3_spec.rb
CHANGED
@@ -30,7 +30,6 @@ describe GFFdb, "Assemble CDS (extra checks)" do
|
|
30
30
|
name = "cds:MhA1_Contig1040.frz3.gene"
|
31
31
|
recs = @cdslist[name]
|
32
32
|
component = @componentlist[name]
|
33
|
-
p recs
|
34
33
|
ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
|
35
34
|
ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
|
36
35
|
ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
|
@@ -56,6 +55,26 @@ describe GFFdb, "Assemble CDS (extra checks)" do
|
|
56
55
|
aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
|
57
56
|
aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
|
58
57
|
end
|
58
|
+
it "should fix Wormbase error MhA1_Contig3426.frz3.gene1" do
|
59
|
+
@contigsequence = @gff.sequencelist["MhA1_Contig3426"]
|
60
|
+
@componentlist = {}
|
61
|
+
@cdslist = {}
|
62
|
+
@gff.each_CDS do | id, reclist, component |
|
63
|
+
@componentlist[id] = component
|
64
|
+
@cdslist[id] = reclist
|
65
|
+
end
|
66
|
+
name = "cds:MhA1_Contig3426.frz3.gene1"
|
67
|
+
recs = @cdslist[name]
|
68
|
+
component = @componentlist[name]
|
69
|
+
# :raw should not fix
|
70
|
+
ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
|
71
|
+
ntseq.should == "GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
|
72
|
+
ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true,:fix=>true)
|
73
|
+
ntseq.should == "ATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
|
74
|
+
ntseq.size.should == 153
|
75
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,recs,:fix=>true)
|
76
|
+
aaseq.should == "IQQQQLEVFPSSSSAPQQQQYPAQQLQLVTPFIACIADELRELIDEMRMF*"
|
77
|
+
end
|
59
78
|
end
|
60
79
|
|
61
80
|
|
data/spec/gff3_assemble_spec.rb
CHANGED
@@ -77,7 +77,7 @@ describe GFFdb, "Assemble CDS" do
|
|
77
77
|
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
78
78
|
cds0 = recs[0]
|
79
79
|
cds0.seqname.should == 'MhA1_Contig1133'
|
80
|
-
seq = @gff.assemble(@contigsequence,component.start,[cds0])
|
80
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds0],:trim=>false)
|
81
81
|
seq.size.should == 143
|
82
82
|
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
|
83
83
|
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
|
@@ -88,7 +88,7 @@ describe GFFdb, "Assemble CDS" do
|
|
88
88
|
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
89
89
|
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
90
90
|
cds1 = recs[1]
|
91
|
-
seq = @gff.assemble(@contigsequence,component.start,[cds1])
|
91
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:trim=>false)
|
92
92
|
seq.size.should == 244
|
93
93
|
seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
94
94
|
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase => true)
|
data/test/data/gff/test-cds.gff3
CHANGED
@@ -9,6 +9,8 @@ ene29.1;Parent=transcript:MhA1_Contig1040.frz3.gene29
|
|
9
9
|
MhA1_Contig1040 WormBase CDS 1 180 . - 2 ID=cds:MhA1_Contig1040.frz3.gene
|
10
10
|
29;Parent=transcript:MhA1_Contig1040.frz3.gene29
|
11
11
|
|
12
|
+
|
13
|
+
|
12
14
|
##gff-version 3 ##sequence-regio
|
13
15
|
# Gene gene:MhA1_Contig2992.frz3.gene1
|
14
16
|
MhA1_Contig2992 WormBase gene 577 2176 . - . ID=gene:MhA1_Contig2992.frz3.gene1;Name=MhA1_Contig2992.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig2992.frz3.gene1
|
@@ -50,6 +52,13 @@ MhA1_Contig2992 RepeatMask repeat_region 2957 3019 28 + . ID=RepeatMask.324638
|
|
50
52
|
MhA1_Contig2992 Dust repeat_region 2959 3019 . - . ID=Dust.42112
|
51
53
|
MhA1_Contig2992 RepeatMask repeat_region 3194 3237 23 + . ID=RepeatMask.324639
|
52
54
|
MhA1_Contig2992 Dust repeat_region 3222 3277 . - . ID=Dust.42114
|
55
|
+
##gff-version 3
|
56
|
+
##sequence-region MhA1_Contig3426 1 2029
|
57
|
+
# Gene gene:MhA1_Contig3426.frz3.gene1
|
58
|
+
MhA1_Contig3426 WormBase gene 192 346 . + . ID=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig3426.frz3.gene1
|
59
|
+
MhA1_Contig3426 WormBase mRNA 192 346 . + . ID=transcript:MhA1_Contig3426.frz3.gene1;Parent=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;public_name=MhA1_Contig3426.frz3.gene1
|
60
|
+
MhA1_Contig3426 WormBase exon 192 346 . + . ID=exon:MhA1_Contig3426.frz3.gene1.1;Parent=transcript:MhA1_Contig3426.frz3.gene1
|
61
|
+
MhA1_Contig3426 WormBase CDS 192 346 . + 0 ID=cds:MhA1_Contig3426.frz3.gene1;Parent=transcript:MhA1_Contig3426.frz3.gene1
|
53
62
|
##FASTA
|
54
63
|
>MhA1_Contig2992
|
55
64
|
TTTTGGTGACCAAAGTTCCTATTGGTGACCAAAATTCCAGTGCCCAATATTCCGTTTTTTGACTTGGTGACCAAAATTCC
|
@@ -96,3 +105,9 @@ TATTTTTTGTTAAATAAAAGGTTTAAATTAATTATTTGTGCTTTTTCGAATTTTTCATTTAAATCCTTTATTTTTTTGAA
|
|
96
105
|
ATTATCATAAAGCTCTAATGATGCTTTTTGAATTTTTGAGACATTTTCAATATCAAAATTTGGTCCGGAAAATTTATTTA
|
97
106
|
>MhA1_Contig1040
|
98
107
|
TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAAAC
|
108
|
+
>MhA1_Contig3426
|
109
|
+
TTAATAAATTTAATTCATTAAAATTTTAAAAAGAAAGGGACATTCGAGGGGAAATGAGAGAGAACGAGAGAAAATGGACG
|
110
|
+
GGAAATTAAATTAAAAAATAAAAAATTAATTTTTATTTTTTTTTATTTAATTTAAAATTAATTTTCTACATTTATTAAAT
|
111
|
+
CTTAAATTATTAATTTTAAATTAATTTAAAG GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCC
|
112
|
+
TCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGT
|
113
|
+
TGATAGATGAAATGCGTATGTTTTAG AATATTTTTTAAAAAAAAATTAAAAAAAATTTTTTTTTGCCAAACAGGCTCTCG
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 8
|
8
|
-
- 2
|
9
|
-
version: 0.8.2
|
8
|
+
- 3
|
9
|
+
version: 0.8.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
180
|
requirements:
|
181
181
|
- - ">="
|
182
182
|
- !ruby/object:Gem::Version
|
183
|
-
hash: -
|
183
|
+
hash: -450771567
|
184
184
|
segments:
|
185
185
|
- 0
|
186
186
|
version: "0"
|