bio-gff3 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.2
1
+ 0.8.3
data/bin/gff3-fetch CHANGED
@@ -16,9 +16,10 @@ USAGE = <<EOM
16
16
 
17
17
  --translate : output as amino acid sequence
18
18
  --validate : validate GFF3 file by translating
19
+ --fix : check 3-frame translation and fix, if possible
20
+ --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
19
21
  --no-assemble : output each record as a sequence -- NYI
20
22
  --add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI
21
- --fix : check 3-frame translation and fix, if possible -- NYI
22
23
 
23
24
  type is any valid type in the GFF3 definition. For example:
24
25
 
@@ -142,6 +143,14 @@ opts = OptionParser.new() { |opts|
142
143
  $stop_on_error = true # replace global in near future
143
144
  end
144
145
 
146
+ opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v|
147
+ options.fix = true
148
+ end
149
+
150
+ opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v|
151
+ options.fix_wormbase = true
152
+ end
153
+
145
154
  # opts.on("-q", "--quiet", "Run quietly") do |q|
146
155
  # options.quiet = q
147
156
  # end
@@ -174,6 +183,8 @@ ARGV.each do | fn |
174
183
  opts[:cache_components] = options.cache
175
184
  opts[:cache_records] = options.cache
176
185
  opts[:fasta_filename] = fastafn if fastafn
186
+ opts[:fix_wormbase] = options.fix_wormbase
187
+ opts[:fix] = options.fix
177
188
  gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
178
189
  gff = gffdb.assembler
179
190
  writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
data/bio-gff3.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-gff3}
8
- s.version = "0.8.2"
8
+ s.version = "0.8.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
@@ -206,16 +206,20 @@ module Bio
206
206
  # :reverse : do reverse if reverse is indicated (default true)
207
207
  # :complement : do complement if reverse is indicated (default true)
208
208
  # :phase : do set CDS phase (default false, normally ignore)
209
- # :trim : make sure sequence is multiple of 3 nucleotide bps (default false)
209
+ # :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
210
210
  #
211
- # there are two special options:
211
+ # special options:
212
212
  #
213
213
  # :raw : raw sequence (all above false)
214
- # :codonize : codon sequence (reverse, complement and trim are true)
214
+ # :codonize : codon sequence (reverse, complement, and trim are true)
215
+ # :fix : fix errors (default false)
215
216
  #
216
- def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>false, :complement=>true, :debug=>false }
217
+ def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
218
+ # default to nil, if not passed in
217
219
  do_debug = options[:debug]
218
220
  do_phase = options[:phase]
221
+ do_fix = options[:fix]
222
+ # default to true, if not passed in
219
223
  do_reverse = (options[:reverse] == false ? false : true)
220
224
  do_trim = (options[:trim] == false ? false : true)
221
225
  do_complement = (options[:complement] == false ? false : true)
@@ -274,18 +278,50 @@ module Bio
274
278
  ntseq = Bio::Sequence::NA.new(seq)
275
279
  seq = ntseq.forward_complement.upcase
276
280
  end
281
+ # This is the place to fix sequences (e.g. the Wormbase bug)
282
+ if do_fix or @options[:fix] or @options[:fix_wormbase]
283
+ if @options[:fix_wormbase] and rec0.id.index('gene1')==0
284
+ # Wormbase gene1 only, so ignore rest
285
+ else
286
+ test_frame = 0
287
+ ntseq = Bio::Sequence::NA.new(seq)
288
+ aaseq = ntseq.translate
289
+ if aaseq.count('*') > 1
290
+ test_frame = 1
291
+ seq = seq[1..-1]
292
+ ntseq = Bio::Sequence::NA.new(seq)
293
+ aaseq = ntseq.translate
294
+ if aaseq.count('*') > 1
295
+ test_frame = 2
296
+ seq = seq[1..-1]
297
+ ntseq = Bio::Sequence::NA.new(seq)
298
+ aaseq = ntseq.translate
299
+ raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
300
+ end
301
+ end
302
+ if test_frame > 0
303
+ warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
304
+ end
305
+ end
306
+ end
277
307
  if do_trim
278
308
  reduce = seq.size % 3
279
309
  seq = seq[0..(seq.size-1-reduce)] if reduce != 0
280
310
  end
311
+ if @options[:validate]
312
+ ntseq = Bio::Sequence::NA.new(seq)
313
+ aaseq = ntseq.translate
314
+ raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
315
+ end
316
+
281
317
  retval = seq
282
318
  retval
283
319
  end
284
320
 
285
321
  # Patch a sequence together from a Sequence string and an array
286
322
  # of records and translate in the correct direction and frame. The options
287
- # are the same as for +assemble+.
288
- def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>false, :complement=>true }
323
+ # are the same as for +assemble+, except :trim defaults to true.
324
+ def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
289
325
  seq = assemble(sequence, startpos, reclist, options)
290
326
  ntseq = Bio::Sequence::NA.new(seq)
291
327
  ntseq.translate
@@ -10,11 +10,11 @@ module Bio
10
10
 
11
11
  def put id, seq
12
12
  puts '>'+id
13
- put_seq seq
13
+ put_seq id, seq
14
14
  end
15
15
  private
16
16
 
17
- def put_seq seq
17
+ def put_seq id, seq
18
18
  if @do_translate or @do_validate
19
19
  ntseq = Bio::Sequence::NA.new(seq)
20
20
  aaseq = ntseq.translate
@@ -30,7 +30,6 @@ describe GFFdb, "Assemble CDS (extra checks)" do
30
30
  name = "cds:MhA1_Contig1040.frz3.gene"
31
31
  recs = @cdslist[name]
32
32
  component = @componentlist[name]
33
- p recs
34
33
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
35
34
  ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
36
35
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
@@ -56,6 +55,26 @@ describe GFFdb, "Assemble CDS (extra checks)" do
56
55
  aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
57
56
  aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
58
57
  end
58
+ it "should fix Wormbase error MhA1_Contig3426.frz3.gene1" do
59
+ @contigsequence = @gff.sequencelist["MhA1_Contig3426"]
60
+ @componentlist = {}
61
+ @cdslist = {}
62
+ @gff.each_CDS do | id, reclist, component |
63
+ @componentlist[id] = component
64
+ @cdslist[id] = reclist
65
+ end
66
+ name = "cds:MhA1_Contig3426.frz3.gene1"
67
+ recs = @cdslist[name]
68
+ component = @componentlist[name]
69
+ # :raw should not fix
70
+ ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
71
+ ntseq.should == "GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
72
+ ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true,:fix=>true)
73
+ ntseq.should == "ATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCCTCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGTTGATAGATGAAATGCGTATGTTTTAG"
74
+ ntseq.size.should == 153
75
+ aaseq = @gff.assembleAA(@contigsequence,component.start,recs,:fix=>true)
76
+ aaseq.should == "IQQQQLEVFPSSSSAPQQQQYPAQQLQLVTPFIACIADELRELIDEMRMF*"
77
+ end
59
78
  end
60
79
 
61
80
 
@@ -77,7 +77,7 @@ describe GFFdb, "Assemble CDS" do
77
77
  component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
78
78
  cds0 = recs[0]
79
79
  cds0.seqname.should == 'MhA1_Contig1133'
80
- seq = @gff.assemble(@contigsequence,component.start,[cds0])
80
+ seq = @gff.assemble(@contigsequence,component.start,[cds0],:trim=>false)
81
81
  seq.size.should == 143
82
82
  seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
83
83
  aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
@@ -88,7 +88,7 @@ describe GFFdb, "Assemble CDS" do
88
88
  recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
89
89
  component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
90
90
  cds1 = recs[1]
91
- seq = @gff.assemble(@contigsequence,component.start,[cds1])
91
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:trim=>false)
92
92
  seq.size.should == 244
93
93
  seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
94
94
  seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase => true)
@@ -9,6 +9,8 @@ ene29.1;Parent=transcript:MhA1_Contig1040.frz3.gene29
9
9
  MhA1_Contig1040 WormBase CDS 1 180 . - 2 ID=cds:MhA1_Contig1040.frz3.gene
10
10
  29;Parent=transcript:MhA1_Contig1040.frz3.gene29
11
11
 
12
+
13
+
12
14
  ##gff-version 3 ##sequence-regio
13
15
  # Gene gene:MhA1_Contig2992.frz3.gene1
14
16
  MhA1_Contig2992 WormBase gene 577 2176 . - . ID=gene:MhA1_Contig2992.frz3.gene1;Name=MhA1_Contig2992.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig2992.frz3.gene1
@@ -50,6 +52,13 @@ MhA1_Contig2992 RepeatMask repeat_region 2957 3019 28 + . ID=RepeatMask.324638
50
52
  MhA1_Contig2992 Dust repeat_region 2959 3019 . - . ID=Dust.42112
51
53
  MhA1_Contig2992 RepeatMask repeat_region 3194 3237 23 + . ID=RepeatMask.324639
52
54
  MhA1_Contig2992 Dust repeat_region 3222 3277 . - . ID=Dust.42114
55
+ ##gff-version 3
56
+ ##sequence-region MhA1_Contig3426 1 2029
57
+ # Gene gene:MhA1_Contig3426.frz3.gene1
58
+ MhA1_Contig3426 WormBase gene 192 346 . + . ID=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;Note=PREDICTED protein_coding;public_name=MhA1_Contig3426.frz3.gene1
59
+ MhA1_Contig3426 WormBase mRNA 192 346 . + . ID=transcript:MhA1_Contig3426.frz3.gene1;Parent=gene:MhA1_Contig3426.frz3.gene1;Name=MhA1_Contig3426.frz3.gene1;public_name=MhA1_Contig3426.frz3.gene1
60
+ MhA1_Contig3426 WormBase exon 192 346 . + . ID=exon:MhA1_Contig3426.frz3.gene1.1;Parent=transcript:MhA1_Contig3426.frz3.gene1
61
+ MhA1_Contig3426 WormBase CDS 192 346 . + 0 ID=cds:MhA1_Contig3426.frz3.gene1;Parent=transcript:MhA1_Contig3426.frz3.gene1
53
62
  ##FASTA
54
63
  >MhA1_Contig2992
55
64
  TTTTGGTGACCAAAGTTCCTATTGGTGACCAAAATTCCAGTGCCCAATATTCCGTTTTTTGACTTGGTGACCAAAATTCC
@@ -96,3 +105,9 @@ TATTTTTTGTTAAATAAAAGGTTTAAATTAATTATTTGTGCTTTTTCGAATTTTTCATTTAAATCCTTTATTTTTTTGAA
96
105
  ATTATCATAAAGCTCTAATGATGCTTTTTGAATTTTTGAGACATTTTCAATATCAAAATTTGGTCCGGAAAATTTATTTA
97
106
  >MhA1_Contig1040
98
107
  TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAAAC
108
+ >MhA1_Contig3426
109
+ TTAATAAATTTAATTCATTAAAATTTTAAAAAGAAAGGGACATTCGAGGGGAAATGAGAGAGAACGAGAGAAAATGGACG
110
+ GGAAATTAAATTAAAAAATAAAAAATTAATTTTTATTTTTTTTTATTTAATTTAAAATTAATTTTCTACATTTATTAAAT
111
+ CTTAAATTATTAATTTTAAATTAATTTAAAG GCATCCAACAACAACAATTAGAAGTCTTTCCCAGCTCCTCCTCTGCCCC
112
+ TCAGCAACAACAATACCCAGCGCAGCAGCTTCAATTAGTTACTCCTTTTATTGCATGCATAGCAGATGAATTGAGGGAGT
113
+ TGATAGATGAAATGCGTATGTTTTAG AATATTTTTTAAAAAAAAATTAAAAAAAATTTTTTTTTGCCAAACAGGCTCTCG
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 8
8
- - 2
9
- version: 0.8.2
8
+ - 3
9
+ version: 0.8.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pjotr Prins
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
180
  requirements:
181
181
  - - ">="
182
182
  - !ruby/object:Gem::Version
183
- hash: -876270257
183
+ hash: -450771567
184
184
  segments:
185
185
  - 0
186
186
  version: "0"