bio-gff3 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +88 -0
- data/VERSION +1 -1
- data/bin/gff3-fetch +11 -5
- data/bio-gff3.gemspec +1 -1
- data/lib/bio/db/gff/gffinmemory.rb +1 -0
- data/lib/bio/db/gff/gffnocache.rb +1 -0
- data/lib/bio/db/gff/gffparser.rb +37 -14
- data/spec/gff3_fileiterator_spec.rb +1 -1
- data/spec/gffdb_spec.rb +4 -2
- data/test/data/gff/test.gff3 +1 -0
- metadata +3 -3
    
        data/README.rdoc
    CHANGED
    
    | @@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html | |
| 58 58 | 
             
            Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> 
         | 
| 59 59 |  | 
| 60 60 |  | 
| 61 | 
            +
             | 
| 62 | 
            +
              Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format. 
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
         | 
| 65 | 
            +
             | 
| 66 | 
            +
              Where (NYI == Not Yet Implemented):
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                --translate     : output as amino acid sequence 
         | 
| 69 | 
            +
                --validate      : validate GFF3 file by translating
         | 
| 70 | 
            +
                --fix           : check 3-frame translation and fix, if possible 
         | 
| 71 | 
            +
                --fix-wormbase  : fix 3-frame translation on ORFs named 'gene1'
         | 
| 72 | 
            +
                --no-assemble   : output each record as a sequence -- NYI
         | 
| 73 | 
            +
                --add-phase     : output records using phase (useful w. no-assemble CDS to AA) --NYI
         | 
| 74 | 
            +
              
         | 
| 75 | 
            +
              type is any valid type in the GFF3 definition. For example:
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                mRNA            : assemble mRNA
         | 
| 78 | 
            +
                CDS             : assemble CDS 
         | 
| 79 | 
            +
                exon            : list all exons
         | 
| 80 | 
            +
                gene|ORF        : list gene ORFs 
         | 
| 81 | 
            +
                other           : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
         | 
| 82 | 
            +
             | 
| 83 | 
            +
              and the following performance options:
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                --cache full    : load all in RAM (fast)
         | 
| 86 | 
            +
                --cache none    : do not load anything in memory (slow)
         | 
| 87 | 
            +
                --low-mem       : use LRU cache (limit RAM use, fast) -- NYI
         | 
| 88 | 
            +
                --max-cpus num  : use num threads -- NYI
         | 
| 89 | 
            +
                --emboss        : use EMBOSS translation (fast) -- NYI
         | 
| 90 | 
            +
             | 
| 91 | 
            +
              Multiple GFF3 files can be used. With external FASTA files, always the last
         | 
| 92 | 
            +
              one before the GFF3 filename is matched.
         | 
| 93 | 
            +
             | 
| 94 | 
            +
              Note that the above switches are only partially implemented at this stage. Full
         | 
| 95 | 
            +
              feature support is projected Feb. 2011.
         | 
| 96 | 
            +
             | 
| 97 | 
            +
              Examples:
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                  gff3-fetch mRNA test/data/gff/test.gff3
         | 
| 102 | 
            +
                  gff3-fetch CDS test/data/gff/test.gff3
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                Find CDS records from external FASTA file, adding phase and translating to protein sequence
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                  gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                Find mRNA from external FASTA file, without loading everything in RAM
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                  gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3   
         | 
| 111 | 
            +
                  gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3   
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                Validate GFF3 file using EMBOSS translation and validation
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                  gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3   
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                Find GENEID predicted terminal exons
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                  gff3-fetch terminal chromosome1.fa geneid.gff3
         | 
| 120 | 
            +
             | 
| 121 | 
            +
            == Performance
         | 
| 122 | 
            +
             | 
| 123 | 
            +
            time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
         | 
| 124 | 
            +
             | 
| 125 | 
            +
              Cache      real     user     sys
         | 
| 126 | 
            +
              ----------------------------------------------------
         | 
| 127 | 
            +
              full       12m41s   12m28s   0m09s (0.8.0 Jan. 2011)
         | 
| 128 | 
            +
              none      504m39s  477m49s  26m50s (0.8.0 Jan. 2011)
         | 
| 129 | 
            +
              ----------------------------------------------------
         | 
| 130 | 
            +
             | 
| 131 | 
            +
            where
         | 
| 132 | 
            +
             | 
| 133 | 
            +
               52M m_hapla.WS217.dna.fa
         | 
| 134 | 
            +
              456M m_hapla.WS217.gff3
         | 
| 135 | 
            +
             | 
| 136 | 
            +
            ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
         | 
| 137 | 
            +
            on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine. 
         | 
| 138 | 
            +
             | 
| 139 | 
            +
            == Cite
         | 
| 140 | 
            +
             | 
| 141 | 
            +
              If you use this software, please cite 
         | 
| 142 | 
            +
              
         | 
| 143 | 
            +
                http://dx.doi.org/10.1093/bioinformatics/btq475
         | 
| 144 | 
            +
             | 
| 145 | 
            +
            == Copyright
         | 
| 146 | 
            +
             | 
| 147 | 
            +
            Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> 
         | 
| 148 | 
            +
             | 
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            0.8.3 | 
| 1 | 
            +
            0.8.4
         | 
    
        data/bin/gff3-fetch
    CHANGED
    
    | @@ -26,7 +26,7 @@ USAGE = <<EOM | |
| 26 26 | 
             
                mRNA            : assemble mRNA
         | 
| 27 27 | 
             
                CDS             : assemble CDS 
         | 
| 28 28 | 
             
                exon            : list all exons
         | 
| 29 | 
            -
                gene | 
| 29 | 
            +
                gene|ORF        : list gene ORFs 
         | 
| 30 30 | 
             
                other           : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
         | 
| 31 31 |  | 
| 32 32 | 
             
              and the following performance options:
         | 
| @@ -167,8 +167,6 @@ opts.parse!(ARGV) | |
| 167 167 |  | 
| 168 168 | 
             
            gfftype = ARGV.shift
         | 
| 169 169 |  | 
| 170 | 
            -
            raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
         | 
| 171 | 
            -
             | 
| 172 170 | 
             
            fastafn = nil
         | 
| 173 171 |  | 
| 174 172 | 
             
            ARGV.each do | fn |
         | 
| @@ -189,7 +187,15 @@ ARGV.each do | fn | | |
| 189 187 | 
             
              gff = gffdb.assembler
         | 
| 190 188 | 
             
              writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
         | 
| 191 189 | 
             
              case gfftype.downcase
         | 
| 192 | 
            -
                when 'mrna' | 
| 190 | 
            +
                when 'gene'
         | 
| 191 | 
            +
                      gff.each_gene_seq do | id, seq |
         | 
| 192 | 
            +
                        writer.put(id,seq)
         | 
| 193 | 
            +
                      end
         | 
| 194 | 
            +
                when 'orf'
         | 
| 195 | 
            +
                      gff.each_gene_seq do | id, seq |
         | 
| 196 | 
            +
                        writer.put(id,seq)
         | 
| 197 | 
            +
                      end
         | 
| 198 | 
            +
                when 'mrna'
         | 
| 193 199 | 
             
                      gff.each_mRNA_seq do | id, seq |
         | 
| 194 200 | 
             
                        writer.put(id,seq)
         | 
| 195 201 | 
             
                      end
         | 
| @@ -202,7 +208,7 @@ ARGV.each do | fn | | |
| 202 208 | 
             
                        writer.put(id,seq)
         | 
| 203 209 | 
             
                      end
         | 
| 204 210 | 
             
                else
         | 
| 205 | 
            -
                  raise "Unknown action <#{gfftype}>"
         | 
| 211 | 
            +
                  raise "Unknown action on type <#{gfftype}>"
         | 
| 206 212 | 
             
              end
         | 
| 207 213 | 
             
              fastafn = nil
         | 
| 208 214 | 
             
            end
         | 
    
        data/bio-gff3.gemspec
    CHANGED
    
    
        data/lib/bio/db/gff/gffinmemory.rb
    CHANGED
    
    | @@ -30,6 +30,7 @@ module Bio | |
| 30 30 | 
             
                      @count_ids          = Counter.new   # Count ids
         | 
| 31 31 | 
             
                      @count_seqnames     = Counter.new   # Count seqnames
         | 
| 32 32 | 
             
                      @componentlist      = {} # Store containers, like genes, contigs
         | 
| 33 | 
            +
                      @orflist            = LinkedRecs.new
         | 
| 33 34 | 
             
                      @mrnalist           = LinkedRecs.new   # Store linked mRNA records
         | 
| 34 35 | 
             
                      @cdslist            = LinkedRecs.new
         | 
| 35 36 | 
             
                      @exonlist           = LinkedRecs.new
         | 
    
        data/lib/bio/db/gff/gffnocache.rb
    CHANGED
    
    | @@ -90,6 +90,7 @@ module Bio | |
| 90 90 | 
             
                      @count_ids          = Counter.new   # Count ids
         | 
| 91 91 | 
             
                      @count_seqnames     = Counter.new   # Count seqnames
         | 
| 92 92 | 
             
                      @componentlist      = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
         | 
| 93 | 
            +
                      @orflist            = SeekLinkedRecs.new   # Store linked gene records
         | 
| 93 94 | 
             
                      @mrnalist           = SeekLinkedRecs.new   # Store linked mRNA records
         | 
| 94 95 | 
             
                      @cdslist            = SeekLinkedRecs.new
         | 
| 95 96 | 
             
                      @exonlist           = SeekLinkedRecs.new
         | 
    
        data/lib/bio/db/gff/gffparser.rb
    CHANGED
    
    | @@ -23,24 +23,26 @@ module Bio | |
| 23 23 | 
             
                        @count_ids.add(id)
         | 
| 24 24 | 
             
                        @count_seqnames.add(rec.seqname)
         | 
| 25 25 |  | 
| 26 | 
            -
                         | 
| 26 | 
            +
                        is_component = COMPONENT_TYPES.include?(rec.feature_type)
         | 
| 27 | 
            +
                        if is_component
         | 
| 27 28 | 
             
                          # check for container ID
         | 
| 28 29 | 
             
                          warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
         | 
| 29 30 | 
             
                          @componentlist[id] = rec
         | 
| 30 31 | 
             
                          info "Added #{rec.feature_type} with component ID #{id}"
         | 
| 31 | 
            -
                         | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 32 | 
            +
                        end 
         | 
| 33 | 
            +
                        case rec.feature_type
         | 
| 34 | 
            +
                          when 'gene' || 'SO:0000704'
         | 
| 35 | 
            +
                            @orflist.add(id,rec)
         | 
| 36 | 
            +
                          when 'mRNA' || 'SO:0000234'
         | 
| 37 | 
            +
                            @mrnalist.add(id,rec)
         | 
| 38 | 
            +
                          when 'CDS'  || 'SO:0000316'
         | 
| 39 | 
            +
                            @cdslist.add(id,rec)
         | 
| 40 | 
            +
                          when 'exon' || 'SO:0000147'
         | 
| 41 | 
            +
                            @exonlist.add(id,rec)
         | 
| 42 | 
            +
                          else
         | 
| 43 | 
            +
                            if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
         | 
| 44 | 
            +
                              @unrecognized_features[rec.feature_type] = true
         | 
| 45 | 
            +
                            end
         | 
| 44 46 | 
             
                        end
         | 
| 45 47 | 
             
                    end
         | 
| 46 48 |  | 
| @@ -80,6 +82,12 @@ module Bio | |
| 80 82 | 
             
                      # p :inmemory, @sequencelist
         | 
| 81 83 | 
             
                    end
         | 
| 82 84 |  | 
| 85 | 
            +
                    # Yield the id, recs, containing component and sequence of genes
         | 
| 86 | 
            +
                    def each_gene
         | 
| 87 | 
            +
                      parse if !@orflist
         | 
| 88 | 
            +
                      each_item(@orflist) { |id, recs, component | yield id, recs, component }
         | 
| 89 | 
            +
                    end
         | 
| 90 | 
            +
             | 
| 83 91 | 
             
                    # Yield the id, recs, containing component and sequence of mRNAs
         | 
| 84 92 | 
             
                    def each_mRNA
         | 
| 85 93 | 
             
                      parse if !@mrnalist
         | 
| @@ -98,6 +106,21 @@ module Bio | |
| 98 106 | 
             
                      each_item(@exonlist) { |id, recs, component | yield id, recs, component }
         | 
| 99 107 | 
             
                    end
         | 
| 100 108 |  | 
| 109 | 
            +
                    # Yield a unique description and the sequence
         | 
| 110 | 
            +
                    def each_gene_seq
         | 
| 111 | 
            +
                      each_gene do | id, reclist, component |
         | 
| 112 | 
            +
                        if component
         | 
| 113 | 
            +
                          sequence = @sequencelist[component.seqname]
         | 
| 114 | 
            +
                          # p sequence
         | 
| 115 | 
            +
                          if sequence
         | 
| 116 | 
            +
                            yield description(id,component,reclist), assemble(sequence,component.start,reclist)
         | 
| 117 | 
            +
                          else 
         | 
| 118 | 
            +
                            error "No sequence information for",id
         | 
| 119 | 
            +
                          end
         | 
| 120 | 
            +
                        end
         | 
| 121 | 
            +
                      end
         | 
| 122 | 
            +
                    end
         | 
| 123 | 
            +
             | 
| 101 124 | 
             
                    # Yield a unique description and the sequence
         | 
| 102 125 | 
             
                    def each_mRNA_seq
         | 
| 103 126 | 
             
                      each_mRNA do | id, reclist, component |
         | 
    
        data/spec/gffdb_spec.rb
    CHANGED
    
    | @@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa' | |
| 22 22 | 
             
            def iterators_should_be_implemented
         | 
| 23 23 | 
             
              if TEST_NON_IMPLEMENTED
         | 
| 24 24 | 
             
                it "should implement each_gene" 
         | 
| 25 | 
            -
                it "should implement each_gene" 
         | 
| 26 | 
            -
                it "should implement each_gene_seq" 
         | 
| 27 25 | 
             
                it "should implement each_mRNA" 
         | 
| 28 26 | 
             
                it "should implement each_exon" 
         | 
| 29 27 | 
             
                it "should implement each_exon_seq" 
         | 
| 30 28 | 
             
                it "should implement each_CDS" 
         | 
| 31 29 | 
             
              end
         | 
| 30 | 
            +
              it "should implement each_gene_seq" do
         | 
| 31 | 
            +
                h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
         | 
| 32 | 
            +
                h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
         | 
| 33 | 
            +
              end
         | 
| 32 34 | 
             
              it "should implement each_mRNA_seq" do
         | 
| 33 35 | 
             
                h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
         | 
| 34 36 | 
             
                h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"
         | 
    
        data/test/data/gff/test.gff3
    CHANGED
    
    | @@ -44,6 +44,7 @@ Contig4	confirmed	CDS	32000	35000	.	+	.	ID=Misc:thing3;mRNA=trans-9 | |
| 44 44 | 
             
            ##gff-version 3
         | 
| 45 45 | 
             
            ##sequence-region test01 1 400
         | 
| 46 46 | 
             
            test01	RANDOM	contig	1	400	.	+	.	ID=test01;Note=this is test
         | 
| 47 | 
            +
            test01	.	gene	3	280	.	+	.	ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
         | 
| 47 48 | 
             
            test01	.	mRNA	3	14	.	+	.	ID=mrna01short;Name=testmRNA;Note=this is test mRNA
         | 
| 48 49 | 
             
            test01	.	mRNA	101	230	.	+	.	ID=mrna01;Name=testmRNA;Note=this is test mRNA
         | 
| 49 50 | 
             
            test01	.	mRNA	101	280	.	+	.	ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant
         | 
    
        metadata
    CHANGED
    
    | @@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version | |
| 5 5 | 
             
              segments: 
         | 
| 6 6 | 
             
              - 0
         | 
| 7 7 | 
             
              - 8
         | 
| 8 | 
            -
              - 3 | 
| 9 | 
            -
              version: 0.8.3 | 
| 8 | 
            +
              - 4
         | 
| 9 | 
            +
              version: 0.8.4
         | 
| 10 10 | 
             
            platform: ruby
         | 
| 11 11 | 
             
            authors: 
         | 
| 12 12 | 
             
            - Pjotr Prins
         | 
| @@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 180 180 | 
             
              requirements: 
         | 
| 181 181 | 
             
              - - ">="
         | 
| 182 182 | 
             
                - !ruby/object:Gem::Version 
         | 
| 183 | 
            -
                  hash:  | 
| 183 | 
            +
                  hash: 520620943
         | 
| 184 184 | 
             
                  segments: 
         | 
| 185 185 | 
             
                  - 0
         | 
| 186 186 | 
             
                  version: "0"
         |