RubyGems - bio-gff3 - Versions diffs - 0.8.3 → 0.8.4 - Mend

bio-gff3 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/README.rdoc +88 -0
data/VERSION +1 -1
data/bin/gff3-fetch +11 -5
data/bio-gff3.gemspec +1 -1
data/lib/bio/db/gff/gffinmemory.rb +1 -0
data/lib/bio/db/gff/gffnocache.rb +1 -0
data/lib/bio/db/gff/gffparser.rb +37 -14
data/spec/gff3_fileiterator_spec.rb +1 -1
data/spec/gffdb_spec.rb +4 -2
data/test/data/gff/test.gff3 +1 -0
metadata +3 -3

data/README.rdoc CHANGED

@@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
 Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
+  Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format.
+    gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
+  Where (NYI == Not Yet Implemented):
+    --translate     : output as amino acid sequence
+    --validate      : validate GFF3 file by translating
+    --fix           : check 3-frame translation and fix, if possible
+    --fix-wormbase  : fix 3-frame translation on ORFs named 'gene1'
+    --no-assemble   : output each record as a sequence -- NYI
+    --add-phase     : output records using phase (useful w. no-assemble CDS to AA) --NYI
+  type is any valid type in the GFF3 definition. For example:
+    mRNA            : assemble mRNA
+    CDS             : assemble CDS
+    exon            : list all exons
+    gene|ORF        : list gene ORFs
+    other           : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
+  and the following performance options:
+    --cache full    : load all in RAM (fast)
+    --cache none    : do not load anything in memory (slow)
+    --low-mem       : use LRU cache (limit RAM use, fast) -- NYI
+    --max-cpus num  : use num threads -- NYI
+    --emboss        : use EMBOSS translation (fast) -- NYI
+  Multiple GFF3 files can be used. With external FASTA files, always the last
+  one before the GFF3 filename is matched.
+  Note that above switches are only partially implemented at this stage. Full
+  feature support is projected Feb. 2011.
+  Examples:
+    Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
+      gff3-fetch mRNA test/data/gff/test.gff3
+      gff3-fetch CDS test/data/gff/test.gff3
+    Find CDS records from external FASTA file, adding phase and translate to protein sequence
+      gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
+    Find mRNA from external FASTA file, without loading everything in RAM
+      gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+      gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+    Validate GFF3 file using EMBOSS translation and validation
+      gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+    Find GENEID predicted terminal exons
+      gff3-fetch terminal chromosome1.fa geneid.gff3
+== Performance
+time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
+  Cache      real     user     sys
+  ----------------------------------------------------
+  full       12m41s   12m28s   0m09s (0.8.0 Jan. 2011)
+  none      504m39s  477m49s  26m50s (0.8.0 Jan. 2011)
+  ----------------------------------------------------
+where
+   52M m_hapla.WS217.dna.fa
+  456M m_hapla.WS217.gff3
+ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
+on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
+== Cite
+  If you use this software, please cite
+    http://dx.doi.org/10.1093/bioinformatics/btq475
+== Copyright
+Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>

data/VERSION CHANGED

@@ -1 +1 @@
-0.8.3
+0.8.4

data/bin/gff3-fetch CHANGED

@@ -26,7 +26,7 @@ USAGE = <<EOM
     mRNA            : assemble mRNA
     CDS             : assemble CDS
     exon            : list all exons
-    gene/ORF        : list gene ORFs -- NYI
+    gene|ORF        : list gene ORFs
     other           : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
   and the following performance options:
@@ -167,8 +167,6 @@ opts.parse!(ARGV)
 gfftype = ARGV.shift
-raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
 fastafn = nil
 ARGV.each do | fn |
@@ -189,7 +187,15 @@ ARGV.each do | fn |
   gff = gffdb.assembler
   writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
   case gfftype.downcase
-    when 'mrna'
+    when 'gene'
+          gff.each_gene_seq do | id, seq |
+            writer.put(id,seq)
+          end
+    when 'orf'
+          gff.each_gene_seq do | id, seq |
+            writer.put(id,seq)
+          end
+    when 'mrna'
           gff.each_mRNA_seq do | id, seq |
             writer.put(id,seq)
           end
@@ -202,7 +208,7 @@ ARGV.each do | fn |
             writer.put(id,seq)
           end
     else
-      raise "Unknown action <#{gfftype}>"
+      raise "Unknown action on type <#{gfftype}>"
   end
   fastafn = nil
 end

data/bio-gff3.gemspec CHANGED

@@ -5,7 +5,7 @@
 Gem::Specification.new do |s|
   s.name = %q{bio-gff3}
-  s.version = "0.8.3"
+  s.version = "0.8.4"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Pjotr Prins"]

data/lib/bio/db/gff/gffinmemory.rb CHANGED

@@ -30,6 +30,7 @@ module Bio
           @count_ids          = Counter.new   # Count ids
           @count_seqnames     = Counter.new   # Count seqnames
           @componentlist      = {} # Store containers, like genes, contigs
+          @orflist            = LinkedRecs.new
           @mrnalist           = LinkedRecs.new   # Store linked mRNA records
           @cdslist            = LinkedRecs.new
           @exonlist           = LinkedRecs.new

data/lib/bio/db/gff/gffnocache.rb CHANGED

@@ -90,6 +90,7 @@ module Bio
           @count_ids          = Counter.new   # Count ids
           @count_seqnames     = Counter.new   # Count seqnames
           @componentlist      = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
+          @orflist            = SeekLinkedRecs.new   # Store linked gene records
           @mrnalist           = SeekLinkedRecs.new   # Store linked mRNA records
           @cdslist            = SeekLinkedRecs.new
           @exonlist           = SeekLinkedRecs.new

data/lib/bio/db/gff/gffparser.rb CHANGED

@@ -23,24 +23,26 @@ module Bio
             @count_ids.add(id)
             @count_seqnames.add(rec.seqname)
-            if COMPONENT_TYPES.include?(rec.feature_type)
+            is_component = COMPONENT_TYPES.include?(rec.feature_type)
+            if is_component
               # check for container ID
               warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
               @componentlist[id] = rec
               info "Added #{rec.feature_type} with component ID #{id}"
-            else
-              case rec.feature_type
-                when 'mRNA' || 'SO:0000234'
-                  @mrnalist.add(id,rec)
-                when 'CDS'  || 'SO:0000316'
-                  @cdslist.add(id,rec)
-                when 'exon' || 'SO:0000147'
-                  @exonlist.add(id,rec)
-                else
-                  if !IGNORE_FEATURES.include?(rec.feature_type)
-                    @unrecognized_features[rec.feature_type] = true
-                  end
-              end
+            end
+            case rec.feature_type
+              when 'gene' || 'SO:0000704'
+                @orflist.add(id,rec)
+              when 'mRNA' || 'SO:0000234'
+                @mrnalist.add(id,rec)
+              when 'CDS'  || 'SO:0000316'
+                @cdslist.add(id,rec)
+              when 'exon' || 'SO:0000147'
+                @exonlist.add(id,rec)
+              else
+                if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
+                  @unrecognized_features[rec.feature_type] = true
+                end
             end
         end
@@ -80,6 +82,12 @@ module Bio
           # p :inmemory, @sequencelist
         end
+        # Yield the id, recs, containing component and sequence of genes
+        def each_gene
+          parse if !@orflist
+          each_item(@orflist) { |id, recs, component | yield id, recs, component }
+        end
         # Yield the id, recs, containing component and sequence of mRNAs
         def each_mRNA
           parse if !@mrnalist
@@ -98,6 +106,21 @@ module Bio
           each_item(@exonlist) { |id, recs, component | yield id, recs, component }
         end
+        # Yield a unique description and the sequence
+        def each_gene_seq
+          each_gene do | id, reclist, component |
+            if component
+              sequence = @sequencelist[component.seqname]
+              # p sequence
+              if sequence
+                yield description(id,component,reclist), assemble(sequence,component.start,reclist)
+              else
+                error "No sequence information for",id
+              end
+            end
+          end
+        end
         # Yield a unique description and the sequence
         def each_mRNA_seq
           each_mRNA do | id, reclist, component |

data/spec/gff3_fileiterator_spec.rb CHANGED

@@ -29,7 +29,7 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
       # p [id, rec]
       last = rec
     end
-    last.io_seek.should == 3256
+    last.io_seek.should == 3342
     firstid = 'unknown'
     iter.each_sequence do | id, seq |
       # p [id, seq]

data/spec/gffdb_spec.rb CHANGED

@@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa'
 def iterators_should_be_implemented
   if TEST_NON_IMPLEMENTED
     it "should implement each_gene"
-    it "should implement each_gene"
-    it "should implement each_gene_seq"
     it "should implement each_mRNA"
     it "should implement each_exon"
     it "should implement each_exon_seq"
     it "should implement each_CDS"
   end
+  it "should implement each_gene_seq" do
+    h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
+    h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
+  end
   it "should implement each_mRNA_seq" do
     h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
     h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"

data/test/data/gff/test.gff3 CHANGED

@@ -44,6 +44,7 @@ Contig4	confirmed	CDS	32000	35000	.	+	.	ID=Misc:thing3;mRNA=trans-9
 ##gff-version 3
 ##sequence-region test01 1 400
 test01	RANDOM	contig	1	400	.	+	.	ID=test01;Note=this is test
+test01	.	gene	3	280	.	+	.	ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
 test01	.	mRNA	3	14	.	+	.	ID=mrna01short;Name=testmRNA;Note=this is test mRNA
 test01	.	mRNA	101	230	.	+	.	ID=mrna01;Name=testmRNA;Note=this is test mRNA
 test01	.	mRNA	101	280	.	+	.	ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 8
-  - 3
-  version: 0.8.3
+  - 4
+  version: 0.8.4
 platform: ruby
 authors:
 - Pjotr Prins
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -450771567
+      hash: 520620943
       segments:
       - 0
       version: "0"