bio-gff3 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bin/gff3-fetch +12 -1
- data/bio-gff3.gemspec +1 -1
- data/lib/bio/db/gff/gffassemble.rb +42 -6
- data/lib/bio/output/gfffastawriter.rb +2 -2
- data/spec/gff3_assemble3_spec.rb +20 -1
- data/spec/gff3_assemble_spec.rb +2 -2
- data/test/data/gff/test-cds.gff3 +15 -0
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.2
|
1
|
+
0.8.3
|
data/bin/gff3-fetch
CHANGED
@@ -16,9 +16,10 @@ USAGE = <<EOM
|
|
16
16
|
|
17
17
|
--translate : output as amino acid sequence
|
18
18
|
--validate : validate GFF3 file by translating
|
19
|
+
--fix : check 3-frame translation and fix, if possible
|
20
|
+
--fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
|
19
21
|
--no-assemble : output each record as a sequence -- NYI
|
20
22
|
--add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI
|
21
|
-
--fix : check 3-frame translation and fix, if possible -- NYI
|
22
23
|
|
23
24
|
type is any valid type in the GFF3 definition. For example:
|
24
25
|
|
@@ -142,6 +143,14 @@ opts = OptionParser.new() { |opts|
|
|
142
143
|
$stop_on_error = true # replace global in near future
|
143
144
|
end
|
144
145
|
|
146
|
+
opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v|
|
147
|
+
options.fix = true
|
148
|
+
end
|
149
|
+
|
150
|
+
opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v|
|
151
|
+
options.fix_wormbase = true
|
152
|
+
end
|
153
|
+
|
145
154
|
# opts.on("-q", "--quiet", "Run quietly") do |q|
|
146
155
|
# options.quiet = q
|
147
156
|
# end
|
@@ -174,6 +183,8 @@ ARGV.each do | fn |
|
|
174
183
|
opts[:cache_components] = options.cache
|
175
184
|
opts[:cache_records] = options.cache
|
176
185
|
opts[:fasta_filename] = fastafn if fastafn
|
186
|
+
opts[:fix_wormbase] = options.fix_wormbase
|
187
|
+
opts[:fix] = options.fix
|
177
188
|
gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
|
178
189
|
gff = gffdb.assembler
|
179
190
|
writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
|
data/bio-gff3.gemspec
CHANGED
@@ -206,16 +206,20 @@ module Bio
|
|
206
206
|
# :reverse : do reverse if reverse is indicated (default true)
|
207
207
|
# :complement : do complement if reverse is indicated (default true)
|
208
208
|
# :phase : do set CDS phase (default false, normally ignore)
|
209
|
-
# :trim : make sure sequence is multiple of 3 nucleotide bps (default
|
209
|
+
# :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
|
210
210
|
#
|
211
|
-
#
|
211
|
+
# special options:
|
212
212
|
#
|
213
213
|
# :raw : raw sequence (all above false)
|
214
|
-
# :codonize : codon sequence (reverse, complement and trim are true)
|
214
|
+
# :codonize : codon sequence (reverse, complement, and trim are true)
|
215
|
+
# :fix : fix errors (default false)
|
215
216
|
#
|
216
|
-
def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>
|
217
|
+
def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
|
218
|
+
# default to nil, if not passed in
|
217
219
|
do_debug = options[:debug]
|
218
220
|
do_phase = options[:phase]
|
221
|
+
do_fix = options[:fix]
|
222
|
+
# default to true, if not passed in
|
219
223
|
do_reverse = (options[:reverse] == false ? false : true)
|
220
224
|
do_trim = (options[:trim] == false ? false : true)
|
221
225
|
do_complement = (options[:complement] == false ? false : true)
|
@@ -274,18 +278,50 @@ module Bio
|
|
274
278
|
ntseq = Bio::Sequence::NA.new(seq)
|
275
279
|
seq = ntseq.forward_complement.upcase
|
276
280
|
end
|
281
|
+
# This is the place to fix sequences (e.g. the Wormbase bug)
|
282
|
+
if do_fix or @options[:fix] or @options[:fix_wormbase]
|
283
|
+
if @options[:fix_wormbase] and rec0.id.index('gene1')==0
|
284
|
+
# Wormbase gene1 only, so ignore rest
|
285
|
+
else
|
286
|
+
test_frame = 0
|
287
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
288
|
+
aaseq = ntseq.translate
|
289
|
+
if aaseq.count('*') > 1
|
290
|
+
test_frame = 1
|
291
|
+
seq = seq[1..-1]
|
292
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
293
|
+
aaseq = ntseq.translate
|
294
|
+
if aaseq.count('*') > 1
|
295
|
+
test_frame = 2
|
296
|
+
seq = seq[1..-1]
|
297
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
298
|
+
aaseq = ntseq.translate
|
299
|
+
raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
|
300
|
+
end
|
301
|
+
end
|
302
|
+
if test_frame > 0
|
303
|
+
warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
|
304
|
+
end
|
305
|
+
end
|
306
|
+
end
|
277
307
|
if do_trim
|
278
308
|
reduce = seq.size % 3
|
279
309
|
seq = seq[0..(seq.size-1-reduce)] if reduce != 0
|
280
310
|
end
|
311
|
+
if @options[:validate]
|
312
|
+
ntseq = Bio::Sequence::NA.new(seq)
|
313
|
+
aaseq = ntseq.translate
|
314
|
+
raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
|
315
|
+
end
|
316
|
+
|
281
317
|
retval = seq
|
282
318
|
retval
|
283
319
|
end
|
284
320
|
|
285
321
|
# Patch a sequence together from a Sequence string and an array
|
286
322
|
# of records and translate in the correct direction and frame. The options
|
287
|
-
# are the same as for +assemble
|
288
|
-
def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>
|
323
|
+
# are the same as for +assemble+, except :trim defaults to true.
|
324
|
+
def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
|
289
325
|
seq = assemble(sequence, startpos, reclist, options)
|
290
326
|
ntseq = Bio::Sequence::NA.new(seq)
|
291
327
|
ntseq.translate
|
@@ -10,11 +10,11 @@ module Bio
|
|
10
10
|
|
11
11
|
def put id, seq
|
12
12
|
puts '>'+id
|
13
|
-
put_seq seq
|
13
|
+
put_seq id, seq
|
14
14
|
end
|
15
15
|
private
|
16
16
|
|
17
|
-
def put_seq seq
|
17
|
+
def put_seq id, seq
|
18
18
|
if @do_translate or @do_validate
|
19
19
|
ntseq = Bio::Sequence::NA.new(seq)
|
20
20
|
aaseq = ntseq.translate
|
data/spec/gff3_assemble3_spec.rb
CHANGED
@@ -30,7 +30,6 @@ describe GFFdb, "Assemble CDS (extra checks)" do
|
|
30
30
|
name = "cds:MhA1_Contig1040.frz3.gene"
|
31
31
|
recs = @cdslist[name]
|
32
32
|
component = @componentlist[name]
|
33
|
-
p recs
|
34
33
|
ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
|
35
34
|
ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
|
36
35
|
ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
|
@@ -56,6 +55,26 @@ describe GFFdb, "Assemble CDS (extra checks)" do
|
|
56
55
|
aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
|
57
56
|
aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
|
58
57
|
end
|
58
|
+
it "should fix Wormbase error MhA1_Contig3426.frz3.gene1" do
|
59
|
+
@contigsequence = @gff.sequencelist["MhA1_Contig3426"]
|
60
|
+
@componentlist = {}
|
61
|
+
@cdslist = {}
|
62
|
+
@gff.each_CDS do | id, reclist, component |
|
63
|
+
@componentlist[id] = component
|
64
|
+
@cdslist[id] = reclist
|
65
|
+
end
|
66
|
+
name = "cds:MhA1_Contig3426.frz3.gene1"
|
67
|
+
recs = @cdslist[name]
|
68
|
+
component = @componentlist[name]
|
69
|
+
# :raw should not fix
|
70
|
+
ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
|
71
|
+
ntseq.should == "GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
|
72
|
+
ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true,:fix=>true)
|
73
|
+
ntseq.should == "ATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
|
74
|
+
ntseq.size.should == 153
|
75
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,recs,:fix=>true)
|
76
|
+
aaseq.should == "IQQQQLEVFPSSSSAPQQQQYPAQQLQLVTPFIACIADELRELIDEMRMF*"
|
77
|
+
end
|
59
78
|
end
|
60
79
|
|
61
80
|
|
data/spec/gff3_assemble_spec.rb
CHANGED
@@ -77,7 +77,7 @@ describe GFFdb, "Assemble CDS" do
|
|
77
77
|
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
78
78
|
cds0 = recs[0]
|
79
79
|
cds0.seqname.should == 'MhA1_Contig1133'
|
80
|
-
seq = @gff.assemble(@contigsequence,component.start,[cds0])
|
80
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds0],:trim=>false)
|
81
81
|
seq.size.should == 143
|
82
82
|
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
|
83
83
|
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
|
@@ -88,7 +88,7 @@ describe GFFdb, "Assemble CDS" do
|
|
88
88
|
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
89
89
|
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
90
90
|
cds1 = recs[1]
|
91
|
-
seq = @gff.assemble(@contigsequence,component.start,[cds1])
|
91
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:trim=>false)
|
92
92
|
seq.size.should == 244
|
93
93
|
seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
94
94
|
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase => true)
|
data/test/data/gff/test-cds.gff3
CHANGED
@@ -9,6 +9,8 @@ ene29.1;Parent=transcript:MhA1_Contig1040.frz3.gene29
|
|
9
9
|
MhA1_Contig1040 WormBase CDS 1 180 . - 2 ID=cds:MhA1_Contig1040.frz3.gene
|
10
10
|
29;Parent=transcript:MhA1_Contig1040.frz3.gene29
|
11
11
|
|
12
|
+
|
13
|
+
|
12
14
|
##gff-version 3 ##sequence-regio
|
13
15
|
# Gene gene:MhA1_Contig2992.frz3.gene1
|
14
16
|
MhA1_Contig2992 WormBase gene 577 2176 . - . ID=gene:MhA1_Contig2992.frz3.gene1;Name=MhA1_Contig2992.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig2992.frz3.gene1
|
@@ -50,6 +52,13 @@ MhA1_Contig2992 RepeatMask repeat_region 2957 3019 28 + . ID=RepeatMask.324638
|
|
50
52
|
MhA1_Contig2992 Dust repeat_region 2959 3019 . - . ID=Dust.42112
|
51
53
|
MhA1_Contig2992 RepeatMask repeat_region 3194 3237 23 + . ID=RepeatMask.324639
|
52
54
|
MhA1_Contig2992 Dust repeat_region 3222 3277 . - . ID=Dust.42114
|
55
|
+
##gff-version 3
|
56
|
+
##sequence-region MhA1_Contig3426 1 2029
|
57
|
+
# Gene gene:MhA1_Contig3426.frz3.gene1
|
58
|
+
MhA1_Contig3426 WormBase gene 192 346 . + . ID=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig3426.frz3.gene1
|
59
|
+
MhA1_Contig3426 WormBase mRNA 192 346 . + . ID=transcript:MhA1_Contig3426.frz3.gene1;Parent=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;public_name=MhA1_Contig3426.frz3.gene1
|
60
|
+
MhA1_Contig3426 WormBase exon 192 346 . + . ID=exon:MhA1_Contig3426.frz3.gene1.1;Parent=transcript:MhA1_Contig3426.frz3.gene1
|
61
|
+
MhA1_Contig3426 WormBase CDS 192 346 . + 0 ID=cds:MhA1_Contig3426.frz3.gene1;Parent=transcript:MhA1_Contig3426.frz3.gene1
|
53
62
|
##FASTA
|
54
63
|
>MhA1_Contig2992
|
55
64
|
TTTTGGTGACCAAAGTTCCTATTGGTGACCAAAATTCCAGTGCCCAATATTCCGTTTTTTGACTTGGTGACCAAAATTCC
|
@@ -96,3 +105,9 @@ TATTTTTTGTTAAATAAAAGGTTTAAATTAATTATTTGTGCTTTTTCGAATTTTTCATTTAAATCCTTTATTTTTTTGAA
|
|
96
105
|
ATTATCATAAAGCTCTAATGATGCTTTTTGAATTTTTGAGACATTTTCAATATCAAAATTTGGTCCGGAAAATTTATTTA
|
97
106
|
>MhA1_Contig1040
|
98
107
|
TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAAAC
|
108
|
+
>MhA1_Contig3426
|
109
|
+
TTAATAAATTTAATTCATTAAAATTTTAAAAAGAAAGGGACATTCGAGGGGAAATGAGAGAGAACGAGAGAAAATGGACG
|
110
|
+
GGAAATTAAATTAAAAAATAAAAAATTAATTTTTATTTTTTTTTATTTAATTTAAAATTAATTTTCTACATTTATTAAAT
|
111
|
+
CTTAAATTATTAATTTTAAATTAATTTAAAG GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCC
|
112
|
+
TCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGT
|
113
|
+
TGATAGATGAAATGCGTATGTTTTAG AATATTTTTTAAAAAAAAATTAAAAAAAATTTTTTTTTGCCAAACAGGCTCTCG
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 8
|
8
|
-
- 2
|
9
|
-
version: 0.8.2
|
8
|
+
- 3
|
9
|
+
version: 0.8.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
180
|
requirements:
|
181
181
|
- - ">="
|
182
182
|
- !ruby/object:Gem::Version
|
183
|
-
hash: -
|
183
|
+
hash: -450771567
|
184
184
|
segments:
|
185
185
|
- 0
|
186
186
|
version: "0"
|