bio-gff3 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.2
1
+ 0.8.3
data/bin/gff3-fetch CHANGED
@@ -16,9 +16,10 @@ USAGE = <<EOM
16
16
 
17
17
  --translate : output as amino acid sequence
18
18
  --validate : validate GFF3 file by translating
19
+ --fix : check 3-frame translation and fix, if possible
20
+ --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
19
21
  --no-assemble : output each record as a sequence -- NYI
20
22
  --add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI
21
- --fix : check 3-frame translation and fix, if possible -- NYI
22
23
 
23
24
  type is any valid type in the GFF3 definition. For example:
24
25
 
@@ -142,6 +143,14 @@ opts = OptionParser.new() { |opts|
142
143
  $stop_on_error = true # replace global in near future
143
144
  end
144
145
 
146
+ opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v|
147
+ options.fix = true
148
+ end
149
+
150
+ opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v|
151
+ options.fix_wormbase = true
152
+ end
153
+
145
154
  # opts.on("-q", "--quiet", "Run quietly") do |q|
146
155
  # options.quiet = q
147
156
  # end
@@ -174,6 +183,8 @@ ARGV.each do | fn |
174
183
  opts[:cache_components] = options.cache
175
184
  opts[:cache_records] = options.cache
176
185
  opts[:fasta_filename] = fastafn if fastafn
186
+ opts[:fix_wormbase] = options.fix_wormbase
187
+ opts[:fix] = options.fix
177
188
  gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
178
189
  gff = gffdb.assembler
179
190
  writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
data/bio-gff3.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-gff3}
8
- s.version = "0.8.2"
8
+ s.version = "0.8.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
@@ -206,16 +206,20 @@ module Bio
206
206
  # :reverse : do reverse if reverse is indicated (default true)
207
207
  # :complement : do complement if reverse is indicated (default true)
208
208
  # :phase : do set CDS phase (default false, normally ignore)
209
- # :trim : make sure sequence is multiple of 3 nucleotide bps (default false)
209
+ # :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
210
210
  #
211
- # there are two special options:
211
+ # special options:
212
212
  #
213
213
  # :raw : raw sequence (all above false)
214
- # :codonize : codon sequence (reverse, complement and trim are true)
214
+ # :codonize : codon sequence (reverse, complement, and trim are true)
215
+ # :fix : fix errors (default false)
215
216
  #
216
- def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>false, :complement=>true, :debug=>false }
217
+ def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
218
+ # default to nil, if not passed in
217
219
  do_debug = options[:debug]
218
220
  do_phase = options[:phase]
221
+ do_fix = options[:fix]
222
+ # default to true, if not passed in
219
223
  do_reverse = (options[:reverse] == false ? false : true)
220
224
  do_trim = (options[:trim] == false ? false : true)
221
225
  do_complement = (options[:complement] == false ? false : true)
@@ -274,18 +278,50 @@ module Bio
274
278
  ntseq = Bio::Sequence::NA.new(seq)
275
279
  seq = ntseq.forward_complement.upcase
276
280
  end
281
+ # This is the place to fix sequences (e.g. the Wormbase bug)
282
+ if do_fix or @options[:fix] or @options[:fix_wormbase]
283
+ if @options[:fix_wormbase] and rec0.id.index('gene1')==0
284
+ # Wormbase gene1 only, so ignore rest
285
+ else
286
+ test_frame = 0
287
+ ntseq = Bio::Sequence::NA.new(seq)
288
+ aaseq = ntseq.translate
289
+ if aaseq.count('*') > 1
290
+ test_frame = 1
291
+ seq = seq[1..-1]
292
+ ntseq = Bio::Sequence::NA.new(seq)
293
+ aaseq = ntseq.translate
294
+ if aaseq.count('*') > 1
295
+ test_frame = 2
296
+ seq = seq[1..-1]
297
+ ntseq = Bio::Sequence::NA.new(seq)
298
+ aaseq = ntseq.translate
299
+ raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
300
+ end
301
+ end
302
+ if test_frame > 0
303
+ warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
304
+ end
305
+ end
306
+ end
277
307
  if do_trim
278
308
  reduce = seq.size % 3
279
309
  seq = seq[0..(seq.size-1-reduce)] if reduce != 0
280
310
  end
311
+ if @options[:validate]
312
+ ntseq = Bio::Sequence::NA.new(seq)
313
+ aaseq = ntseq.translate
314
+ raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
315
+ end
316
+
281
317
  retval = seq
282
318
  retval
283
319
  end
284
320
 
285
321
  # Patch a sequence together from a Sequence string and an array
286
322
  # of records and translate in the correct direction and frame. The options
287
- # are the same as for +assemble+.
288
- def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>false, :complement=>true }
323
+ # are the same as for +assemble+ (where :trim likewise defaults to true).
324
+ def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
289
325
  seq = assemble(sequence, startpos, reclist, options)
290
326
  ntseq = Bio::Sequence::NA.new(seq)
291
327
  ntseq.translate
@@ -10,11 +10,11 @@ module Bio
10
10
 
11
11
  def put id, seq
12
12
  puts '>'+id
13
- put_seq seq
13
+ put_seq id, seq
14
14
  end
15
15
  private
16
16
 
17
- def put_seq seq
17
+ def put_seq id, seq
18
18
  if @do_translate or @do_validate
19
19
  ntseq = Bio::Sequence::NA.new(seq)
20
20
  aaseq = ntseq.translate
@@ -30,7 +30,6 @@ describe GFFdb, "Assemble CDS (extra checks)" do
30
30
  name = "cds:MhA1_Contig1040.frz3.gene"
31
31
  recs = @cdslist[name]
32
32
  component = @componentlist[name]
33
- p recs
34
33
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
35
34
  ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
36
35
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
@@ -56,6 +55,26 @@ describe GFFdb, "Assemble CDS (extra checks)" do
56
55
  aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
57
56
  aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
58
57
  end
58
+ it "should fix Wormbase error MhA1_Contig3426.frz3.gene1" do
59
+ @contigsequence = @gff.sequencelist["MhA1_Contig3426"]
60
+ @componentlist = {}
61
+ @cdslist = {}
62
+ @gff.each_CDS do | id, reclist, component |
63
+ @componentlist[id] = component
64
+ @cdslist[id] = reclist
65
+ end
66
+ name = "cds:MhA1_Contig3426.frz3.gene1"
67
+ recs = @cdslist[name]
68
+ component = @componentlist[name]
69
+ # :raw should not fix
70
+ ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
71
+ ntseq.should == "GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
72
+ ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true,:fix=>true)
73
+ ntseq.should == "ATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
74
+ ntseq.size.should == 153
75
+ aaseq = @gff.assembleAA(@contigsequence,component.start,recs,:fix=>true)
76
+ aaseq.should == "IQQQQLEVFPSSSSAPQQQQYPAQQLQLVTPFIACIADELRELIDEMRMF*"
77
+ end
59
78
  end
60
79
 
61
80
 
@@ -77,7 +77,7 @@ describe GFFdb, "Assemble CDS" do
77
77
  component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
78
78
  cds0 = recs[0]
79
79
  cds0.seqname.should == 'MhA1_Contig1133'
80
- seq = @gff.assemble(@contigsequence,component.start,[cds0])
80
+ seq = @gff.assemble(@contigsequence,component.start,[cds0],:trim=>false)
81
81
  seq.size.should == 143
82
82
  seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
83
83
  aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
@@ -88,7 +88,7 @@ describe GFFdb, "Assemble CDS" do
88
88
  recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
89
89
  component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
90
90
  cds1 = recs[1]
91
- seq = @gff.assemble(@contigsequence,component.start,[cds1])
91
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:trim=>false)
92
92
  seq.size.should == 244
93
93
  seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
94
94
  seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase => true)
@@ -9,6 +9,8 @@ ene29.1;Parent=transcript:MhA1_Contig1040.frz3.gene29
9
9
  MhA1_Contig1040 WormBase CDS 1 180 . - 2 ID=cds:MhA1_Contig1040.frz3.gene
10
10
  29;Parent=transcript:MhA1_Contig1040.frz3.gene29
11
11
 
12
+
13
+
12
14
  ##gff-version 3 ##sequence-regio
13
15
  # Gene gene:MhA1_Contig2992.frz3.gene1
14
16
  MhA1_Contig2992 WormBase gene 577 2176 . - . ID=gene:MhA1_Contig2992.frz3.gene1;Name=MhA1_Contig2992.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig2992.frz3.gene1
@@ -50,6 +52,13 @@ MhA1_Contig2992 RepeatMask repeat_region 2957 3019 28 + . ID=RepeatMask.324638
50
52
  MhA1_Contig2992 Dust repeat_region 2959 3019 . - . ID=Dust.42112
51
53
  MhA1_Contig2992 RepeatMask repeat_region 3194 3237 23 + . ID=RepeatMask.324639
52
54
  MhA1_Contig2992 Dust repeat_region 3222 3277 . - . ID=Dust.42114
55
+ ##gff-version 3
56
+ ##sequence-region MhA1_Contig3426 1 2029
57
+ # Gene gene:MhA1_Contig3426.frz3.gene1
58
+ MhA1_Contig3426 WormBase gene 192 346 . + . ID=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig3426.frz3.gene1
59
+ MhA1_Contig3426 WormBase mRNA 192 346 . + . ID=transcript:MhA1_Contig3426.frz3.gene1;Parent=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;public_name=MhA1_Contig3426.frz3.gene1
60
+ MhA1_Contig3426 WormBase exon 192 346 . + . ID=exon:MhA1_Contig3426.frz3.gene1.1;Parent=transcript:MhA1_Contig3426.frz3.gene1
61
+ MhA1_Contig3426 WormBase CDS 192 346 . + 0 ID=cds:MhA1_Contig3426.frz3.gene1;Parent=transcript:MhA1_Contig3426.frz3.gene1
53
62
  ##FASTA
54
63
  >MhA1_Contig2992
55
64
  TTTTGGTGACCAAAGTTCCTATTGGTGACCAAAATTCCAGTGCCCAATATTCCGTTTTTTGACTTGGTGACCAAAATTCC
@@ -96,3 +105,9 @@ TATTTTTTGTTAAATAAAAGGTTTAAATTAATTATTTGTGCTTTTTCGAATTTTTCATTTAAATCCTTTATTTTTTTGAA
96
105
  ATTATCATAAAGCTCTAATGATGCTTTTTGAATTTTTGAGACATTTTCAATATCAAAATTTGGTCCGGAAAATTTATTTA
97
106
  >MhA1_Contig1040
98
107
  TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAAAC
108
+ >MhA1_Contig3426
109
+ TTAATAAATTTAATTCATTAAAATTTTAAAAAGAAAGGGACATTCGAGGGGAAATGAGAGAGAACGAGAGAAAATGGACG
110
+ GGAAATTAAATTAAAAAATAAAAAATTAATTTTTATTTTTTTTTATTTAATTTAAAATTAATTTTCTACATTTATTAAAT
111
+ CTTAAATTATTAATTTTAAATTAATTTAAAG GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCC
112
+ TCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGT
113
+ TGATAGATGAAATGCGTATGTTTTAG AATATTTTTTAAAAAAAAATTAAAAAAAATTTTTTTTTGCCAAACAGGCTCTCG
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 8
8
- - 2
9
- version: 0.8.2
8
+ - 3
9
+ version: 0.8.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pjotr Prins
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
180
  requirements:
181
181
  - - ">="
182
182
  - !ruby/object:Gem::Version
183
- hash: -876270257
183
+ hash: -450771567
184
184
  segments:
185
185
  - 0
186
186
  version: "0"