bio-gff3 0.8.3 → 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -58,3 +58,91 @@ For a write-up see http://thebird.nl/bioruby/BioRuby_GFF3.html
58
58
  Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
59
59
 
60
60
 
61
+
62
+ Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) and print them in FASTA format.
63
+
64
+ gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
65
+
66
+ Where (NYI == Not Yet Implemented):
67
+
68
+ --translate : output as amino acid sequence
69
+ --validate : validate GFF3 file by translating
70
+ --fix : check 3-frame translation and fix, if possible
71
+ --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
72
+ --no-assemble : output each record as a sequence -- NYI
73
+ --add-phase : output records using phase (useful with --no-assemble when translating CDS to AA) -- NYI
74
+
75
+ type is any valid type in the GFF3 definition. For example:
76
+
77
+ mRNA : assemble mRNA
78
+ CDS : assemble CDS
79
+ exon : list all exons
80
+ gene|ORF : list gene ORFs
81
+ other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
82
+
83
+ and the following performance options:
84
+
85
+ --cache full : load all in RAM (fast)
86
+ --cache none : do not load anything in memory (slow)
87
+ --low-mem : use LRU cache (limit RAM use, fast) -- NYI
88
+ --max-cpus num : use num threads -- NYI
89
+ --emboss : use EMBOSS translation (fast) -- NYI
90
+
91
+ Multiple GFF3 files can be used. When external FASTA files are given, each GFF3
92
+ file is matched with the last FASTA file listed before it.
93
+
94
+ Note that the above switches are only partially implemented at this stage. Full
95
+ feature support is projected for Feb. 2011.
96
+
97
+ Examples:
98
+
99
+ Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
100
+
101
+ gff3-fetch mRNA test/data/gff/test.gff3
102
+ gff3-fetch CDS test/data/gff/test.gff3
103
+
104
+ Find CDS records from external FASTA file, adding phase and translating to protein sequence
105
+
106
+ gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
107
+
108
+ Find mRNA from external FASTA file, without loading everything in RAM
109
+
110
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
111
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
112
+
113
+ Validate GFF3 file using EMBOSS translation and validation
114
+
115
+ gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
116
+
117
+ Find GENEID predicted terminal exons
118
+
119
+ gff3-fetch terminal chromosome1.fa geneid.gff3
120
+
121
+ == Performance
122
+
123
+ time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
124
+
125
+ Cache real user sys
126
+ ----------------------------------------------------
127
+ full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
128
+ none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
129
+ ----------------------------------------------------
130
+
131
+ where
132
+
133
+ 52M m_hapla.WS217.dna.fa
134
+ 456M m_hapla.WS217.gff3
135
+
136
+ ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
137
+ on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
138
+
139
+ == Cite
140
+
141
+ If you use this software, please cite
142
+
143
+ http://dx.doi.org/10.1093/bioinformatics/btq475
144
+
145
+ == Copyright
146
+
147
+ Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
148
+
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.3
1
+ 0.8.4
@@ -26,7 +26,7 @@ USAGE = <<EOM
26
26
  mRNA : assemble mRNA
27
27
  CDS : assemble CDS
28
28
  exon : list all exons
29
- gene/ORF : list gene ORFs -- NYI
29
+ gene|ORF : list gene ORFs
30
30
  other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
31
31
 
32
32
  and the following performance options:
@@ -167,8 +167,6 @@ opts.parse!(ARGV)
167
167
 
168
168
  gfftype = ARGV.shift
169
169
 
170
- raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
171
-
172
170
  fastafn = nil
173
171
 
174
172
  ARGV.each do | fn |
@@ -189,7 +187,15 @@ ARGV.each do | fn |
189
187
  gff = gffdb.assembler
190
188
  writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
191
189
  case gfftype.downcase
192
- when 'mrna'
190
+ when 'gene'
191
+ gff.each_gene_seq do | id, seq |
192
+ writer.put(id,seq)
193
+ end
194
+ when 'orf'
195
+ gff.each_gene_seq do | id, seq |
196
+ writer.put(id,seq)
197
+ end
198
+ when 'mrna'
193
199
  gff.each_mRNA_seq do | id, seq |
194
200
  writer.put(id,seq)
195
201
  end
@@ -202,7 +208,7 @@ ARGV.each do | fn |
202
208
  writer.put(id,seq)
203
209
  end
204
210
  else
205
- raise "Unknown action <#{gfftype}>"
211
+ raise "Unknown action on type <#{gfftype}>"
206
212
  end
207
213
  fastafn = nil
208
214
  end
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-gff3}
8
- s.version = "0.8.3"
8
+ s.version = "0.8.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
@@ -30,6 +30,7 @@ module Bio
30
30
  @count_ids = Counter.new # Count ids
31
31
  @count_seqnames = Counter.new # Count seqnames
32
32
  @componentlist = {} # Store containers, like genes, contigs
33
+ @orflist = LinkedRecs.new
33
34
  @mrnalist = LinkedRecs.new # Store linked mRNA records
34
35
  @cdslist = LinkedRecs.new
35
36
  @exonlist = LinkedRecs.new
@@ -90,6 +90,7 @@ module Bio
90
90
  @count_ids = Counter.new # Count ids
91
91
  @count_seqnames = Counter.new # Count seqnames
92
92
  @componentlist = SeekRecList.new(@iter.fh) # Store containers, like genes, contigs
93
+ @orflist = SeekLinkedRecs.new # Store linked gene records
93
94
  @mrnalist = SeekLinkedRecs.new # Store linked mRNA records
94
95
  @cdslist = SeekLinkedRecs.new
95
96
  @exonlist = SeekLinkedRecs.new
@@ -23,24 +23,26 @@ module Bio
23
23
  @count_ids.add(id)
24
24
  @count_seqnames.add(rec.seqname)
25
25
 
26
- if COMPONENT_TYPES.include?(rec.feature_type)
26
+ is_component = COMPONENT_TYPES.include?(rec.feature_type)
27
+ if is_component
27
28
  # check for container ID
28
29
  warn("Container <#{rec.feature_type}> has no ID, so using sequence name instead",id) if rec.id == nil
29
30
  @componentlist[id] = rec
30
31
  info "Added #{rec.feature_type} with component ID #{id}"
31
- else
32
- case rec.feature_type
33
- when 'mRNA' || 'SO:0000234'
34
- @mrnalist.add(id,rec)
35
- when 'CDS' || 'SO:0000316'
36
- @cdslist.add(id,rec)
37
- when 'exon' || 'SO:0000147'
38
- @exonlist.add(id,rec)
39
- else
40
- if !IGNORE_FEATURES.include?(rec.feature_type)
41
- @unrecognized_features[rec.feature_type] = true
42
- end
43
- end
32
+ end
33
+ case rec.feature_type
34
+ when 'gene' || 'SO:0000704'
35
+ @orflist.add(id,rec)
36
+ when 'mRNA' || 'SO:0000234'
37
+ @mrnalist.add(id,rec)
38
+ when 'CDS' || 'SO:0000316'
39
+ @cdslist.add(id,rec)
40
+ when 'exon' || 'SO:0000147'
41
+ @exonlist.add(id,rec)
42
+ else
43
+ if !is_component and !IGNORE_FEATURES.include?(rec.feature_type)
44
+ @unrecognized_features[rec.feature_type] = true
45
+ end
44
46
  end
45
47
  end
46
48
 
@@ -80,6 +82,12 @@ module Bio
80
82
  # p :inmemory, @sequencelist
81
83
  end
82
84
 
85
+ # Yield the id, recs and containing component of genes
86
+ def each_gene
87
+ parse if !@orflist
88
+ each_item(@orflist) { |id, recs, component | yield id, recs, component }
89
+ end
90
+
83
91
  # Yield the id, recs, containing component and sequence of mRNAs
84
92
  def each_mRNA
85
93
  parse if !@mrnalist
@@ -98,6 +106,21 @@ module Bio
98
106
  each_item(@exonlist) { |id, recs, component | yield id, recs, component }
99
107
  end
100
108
 
109
+ # Yield a unique description and the sequence
110
+ def each_gene_seq
111
+ each_gene do | id, reclist, component |
112
+ if component
113
+ sequence = @sequencelist[component.seqname]
114
+ # p sequence
115
+ if sequence
116
+ yield description(id,component,reclist), assemble(sequence,component.start,reclist)
117
+ else
118
+ error "No sequence information for",id
119
+ end
120
+ end
121
+ end
122
+ end
123
+
101
124
  # Yield a unique description and the sequence
102
125
  def each_mRNA_seq
103
126
  each_mRNA do | id, reclist, component |
@@ -29,7 +29,7 @@ describe Bio::GFF::GFF3::FileIterator, "iterates a GFF3 file" do
29
29
  # p [id, rec]
30
30
  last = rec
31
31
  end
32
- last.io_seek.should == 3256
32
+ last.io_seek.should == 3342
33
33
  firstid = 'unknown'
34
34
  iter.each_sequence do | id, seq |
35
35
  # p [id, seq]
@@ -22,13 +22,15 @@ TESTGFF1FASTA='test/data/gff/test-ext-fasta.fa'
22
22
  def iterators_should_be_implemented
23
23
  if TEST_NON_IMPLEMENTED
24
24
  it "should implement each_gene"
25
- it "should implement each_gene"
26
- it "should implement each_gene_seq"
27
25
  it "should implement each_mRNA"
28
26
  it "should implement each_exon"
29
27
  it "should implement each_exon_seq"
30
28
  it "should implement each_CDS"
31
29
  end
30
+ it "should implement each_gene_seq" do
31
+ h = {} ; @gff.each_gene_seq { | id, seq | h[id] = seq }
32
+ h["gene01 Sequence:test01_1:400 (3:280)"].should == "GAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTAGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACACCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGATAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTA" if h["gene01 Sequence:test01_1:400 (3:280)"]
33
+ end
32
34
  it "should implement each_mRNA_seq" do
33
35
  h = {} ; @gff.each_mRNA_seq { | id, seq | h[id] = seq }
34
36
  h["mrna01short Sequence:test01_1:400 (3:14)"].should == "GAAGATTTGTAT"
@@ -44,6 +44,7 @@ Contig4 confirmed CDS 32000 35000 . + . ID=Misc:thing3;mRNA=trans-9
44
44
  ##gff-version 3
45
45
  ##sequence-region test01 1 400
46
46
  test01 RANDOM contig 1 400 . + . ID=test01;Note=this is test
47
+ test01 . gene 3 280 . + . ID=gene01;Name=tesGene;Parent=test01;Note=this is test gene
47
48
  test01 . mRNA 3 14 . + . ID=mrna01short;Name=testmRNA;Note=this is test mRNA
48
49
  test01 . mRNA 101 230 . + . ID=mrna01;Name=testmRNA;Note=this is test mRNA
49
50
  test01 . mRNA 101 280 . + . ID=mrna01a;Name=testmRNAalterative;Note=test of alternative splicing variant
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 8
8
- - 3
9
- version: 0.8.3
8
+ - 4
9
+ version: 0.8.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pjotr Prins
@@ -180,7 +180,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
180
  requirements:
181
181
  - - ">="
182
182
  - !ruby/object:Gem::Version
183
- hash: -450771567
183
+ hash: 520620943
184
184
  segments:
185
185
  - 0
186
186
  version: "0"