bio-gff3 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
58
58
  Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
59
59
 
60
60
 
61
+
62
+ Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) and print them in FASTA format.
63
+
64
+ gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
65
+
66
+ Where (NYI == Not Yet Implemented):
67
+
68
+ --translate : output as amino acid sequence
69
+ --validate : validate GFF3 file by translating
70
+ --fix : check 3-frame translation and fix, if possible
71
+ --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
72
+ --no-assemble : output each record as a sequence -- NYI
73
+ --add-phase : output records using phase (useful w. no-assemble CDS to AA) -- NYI
74
+
75
+ type is any valid type in the GFF3 definition. For example:
76
+
77
+ mRNA : assemble mRNA
78
+ CDS : assemble CDS
79
+ exon : list all exons
80
+ gene|ORF : list gene ORFs
81
+ other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
82
+
83
+ and the following performance options:
84
+
85
+ --cache full : load all in RAM (fast)
86
+ --cache none : do not load anything in memory (slow)
87
+ --low-mem : use LRU cache (limit RAM use, fast) -- NYI
88
+ --max-cpus num : use num threads -- NYI
89
+ --emboss : use EMBOSS translation (fast) -- NYI
90
+
91
+ Multiple GFF3 files can be used. With external FASTA files, always the last
92
+ one before the GFF3 filename is matched.
93
+
94
+ Note that the above switches are only partially implemented at this stage. Full
95
+ feature support is projected Feb. 2011.
96
+
97
+ Examples:
98
+
99
+ Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
100
+
101
+ gff3-fetch mRNA test/data/gff/test.gff3
102
+ gff3-fetch CDS test/data/gff/test.gff3
103
+
104
+ Find CDS records from external FASTA file, adding phase and translating to protein sequence
105
+
106
+ gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
107
+
108
+ Find mRNA from external FASTA file, without loading everything in RAM
109
+
110
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
111
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
112
+
113
+ Validate GFF3 file using EMBOSS translation and validation
114
+
115
+ gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
116
+
117
+ Find GENEID predicted terminal exons
118
+
119
+ gff3-fetch terminal chromosome1.fa geneid.gff3
120
+
121
+ == Performance
122
+
123
+ time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
124
+
125
+ Cache real user sys
126
+ ----------------------------------------------------
127
+ full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
128
+ none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
129
+ ----------------------------------------------------
130
+
131
+ where
132
+
133
+ 52M m_hapla.WS217.dna.fa
134
+ 456M m_hapla.WS217.gff3
135
+
136
+ ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
137
+ on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
138
+
139
+ == Cite
140
+
141
+ If you use this software, please cite
142
+
143
+ http://dx.doi.org/10.1093/bioinformatics/btq475
144
+
145
+ == Copyright
146
+
147
+ Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
148
+
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.3
1
+ 0.8.4
@@ -26,7 +26,7 @@ USAGE = <<EOM
26
26
  mRNA : assemble mRNA
27
27
  CDS : assemble CDS
28
28
  exon : list all exons
29
- gene/ORF : list gene ORFs -- NYI
29
+ gene|ORF : list gene ORFs
30
30
  other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
31
31
 
32
32
  and the following performance options:
@@ -167,8 +167,6 @@ opts.parse!(ARGV)
167
167
 
168
168
  gfftype = ARGV.shift
169
169
 
170
- raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
171
-
172
170
  fastafn = nil
173
171
 
174
172
  ARGV.each do | fn |
@@ -189,7 +187,15 @@ ARGV.each do | fn |
189
187
  gff = gffdb.assembler
190
188
  writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
191
189
  case gfftype.downcase
192
- when 'mrna'
190
+ when 'gene'
191
+ gff.each_gene_seq do | id, seq |
192
+ writer.put(id,seq)
193
+ end
194
+ when 'orf'
195
+ gff.each_gene_seq do | id, seq |
196
+ writer.put(id,seq)
197
+ end
198
+ when 'mrna'
193
199
  gff.each_mRNA_seq do | id, seq |
194
200
  writer.put(id,seq)
195
201
  end
@@ -202,7 +208,7 @@ ARGV.each do | fn |
202
208
  writer.put(id,seq)
203
209
  end
204
210
  else
205
- raise "Unknown action <#{gfftype}>"
211
+ raise "Unknown action on type <#{gfftype}>"
206
212
  end
207
213
  fastafn = nil
208
214
  end
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-gff3}
8
- s.version = "0.8.3"
8
+ s.version = "0.8.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
@@ -30,6 +30,7 @@ module Bio
30
30
  @count_ids = Counter.new # Count ids
31
31
  @count_seqnames = Counter.new # Count seqnames
32
32
  @componentlist = {} # Store containers, like genes, contigs
33
+ @orflist = LinkedRecs.new
33
34
  @mrnalist = LinkedRecs.new # Store linked mRNA records
34
35
  @cdslist = LinkedRecs.new
35
36
  @exonlist = LinkedRecs.new
@@ -90,6 +90,7 @@ module Bio
90
90
  @count_ids = Counter.new # Count ids
91
91
  @count_seqnames = Counter.new # Count seqnames
92
92
  @componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
93
+ @orflist = SeekLinkedRecs.new # Store linked gene records
93
94
  @mrnalist = SeekLinkedRecs.new # Store linked mRNA records
94
95
  @cdslist = SeekLinkedRecs.new
95
96
  @exonlist = SeekLinkedRecs.new
@@ -23,24 +23,26 @@ module Bio
23
23
  @count_ids.add(id)
24
24
  @count_seqnames.add(rec.seqname)
25
25
 
26
- if COMPONENT_TYPES.include?(rec.feature_type)
26
+ is_component = COMPONENT_TYPES.include?(rec.feature_type)
27
+ if is_component
27
28
  # check for container ID
28
29
  warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
29
30
  @componentlist[id] = rec
30
31
  info "Added #{rec.feature_type} with component ID #{id}"
31
- else
32
- case rec.feature_type
33
- when 'mRNA' || 'SO:0000234'
34
- @mrnalist.add(id,rec)
35
- when 'CDS' || 'SO:0000316'
36
- @cdslist.add(id,rec)
37
- when 'exon' || 'SO:0000147'
38
- @exonlist.add(id,rec)
39
- else
40
- if !IGNORE_FEATURES.include?(rec.feature_type)
41
- @unrecognized_features[rec.feature_type] = true
42
- end
43
- end
32
+ end
33
+ case rec.feature_type
34
+ when 'gene' || 'SO:0000704'
35
+ @orflist.add(id,rec)
36
+ when 'mRNA' || 'SO:0000234'
37
+ @mrnalist.add(id,rec)
38
+ when 'CDS' || 'SO:0000316'
39
+ @cdslist.add(id,rec)
40
+ when 'exon' || 'SO:0000147'
41
+ @exonlist.add(id,rec)
42
+ else
43
+ if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
44
+ @unrecognized_features[rec.feature_type] = true
45
+ end
44
46
  end
45
47
  end
46
48
 
@@ -80,6 +82,12 @@ module Bio
80
82
  # p :inmemory, @sequencelist
81
83
  end
82
84
 
85
+ # Yield the id, recs and containing component of genes
86
+ def each_gene
87
+ parse if !@orflist
88
+ each_item(@orflist) { |id, recs, component | yield id, recs, component }
89
+ end
90
+
83
91
  # Yield the id, recs, containing component and sequence of mRNAs
84
92
  def each_mRNA
85
93
  parse if !@mrnalist
@@ -98,6 +106,21 @@ module Bio
98
106
  each_item(@exonlist) { |id, recs, component | yield id, recs, component }
99
107
  end
100
108
 
109
+ # Yield a unique description and the assembled sequence for each gene
110
+ def each_gene_seq
111
+ each_gene do | id, reclist, component |
112
+ if component
113
+ sequence = @sequencelist[component.seqname]
114
+ # p sequence
115
+ if sequence
116
+ yield description(id,component,reclist), assemble(sequence,component.start,reclist)
117
+ else
118
+ error "No sequence information for",id
119
+ end
120
+ end
121
+ end
122
+ end
123
+
101
124
  # Yield a unique description and the sequence
102
125
  def each_mRNA_seq
103
126
  each_mRNA do | id, reclist, component |
@@ -29,7 +29,7 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
29
29
  # p [id, rec]
30
30
  last = rec
31
31
  end
32
- last.io_seek.should == 3256
32
+ last.io_seek.should == 3342
33
33
  firstid = 'unknown'
34
34
  iter.each_sequence do | id, seq |
35
35
  # p [id, seq]
@@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa'
22
22
  def iterators_should_be_implemented
23
23
  if TEST_NON_IMPLEMENTED
24
24
  it "should implement each_gene"
25
- it "should implement each_gene"
26
- it "should implement each_gene_seq"
27
25
  it "should implement each_mRNA"
28
26
  it "should implement each_exon"
29
27
  it "should implement each_exon_seq"
30
28
  it "should implement each_CDS"
31
29
  end
30
+ it "should implement each_gene_seq" do
31
+ h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
32
+ h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
33
+ end
32
34
  it "should implement each_mRNA_seq" do
33
35
  h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
34
36
  h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"
@@ -44,6 +44,7 @@ Contig4 confirmed CDS 32000 35000 . + . ID=Misc:thing3;mRNA=trans-9
44
44
  ##gff-version 3
45
45
  ##sequence-region test01 1 400
46
46
  test01 RANDOM contig 1 400 . + . ID=test01;Note=this is test
47
+ test01 . gene 3 280 . + . ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
47
48
  test01 . mRNA 3 14 . + . ID=mrna01short;Name=testmRNA;Note=this is test mRNA
48
49
  test01 . mRNA 101 230 . + . ID=mrna01;Name=testmRNA;Note=this is test mRNA
49
50
  test01 . mRNA 101 280 . + . ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 8
8
- - 3
9
- version: 0.8.3
8
+ - 4
9
+ version: 0.8.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pjotr Prins
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
180
  requirements:
181
181
  - - ">="
182
182
  - !ruby/object:Gem::Version
183
- hash: -450771567
183
+ hash: 520620943
184
184
  segments:
185
185
  - 0
186
186
  version: "0"