germ 0.1 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/hash_table_aux/HashTableAux.c +19 -6
- data/lib/fasta.rb +122 -24
- data/lib/fastq.rb +45 -0
- data/lib/genetic_code.rb +141 -0
- data/lib/genomic_locus.rb +50 -0
- data/lib/germ/config.rb +64 -4
- data/lib/germ/flagstat.rb +4 -0
- data/lib/germ.rb +3 -0
- data/lib/go.rb +164 -0
- data/lib/gtf/gene.rb +293 -0
- data/lib/gtf.rb +34 -202
- data/lib/hash_table.rb +190 -54
- data/lib/intervals.rb +225 -250
- data/lib/maf.rb +42 -58
- data/lib/mutation.rb +41 -0
- data/lib/mutation_set.rb +60 -239
- data/lib/mutect.rb +22 -17
- data/lib/oncotator.rb +43 -1
- data/lib/sdrf.rb +14 -0
- data/lib/tcga.rb +41 -0
- data/lib/vcf.rb +77 -73
- metadata +33 -33
data/lib/go.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
require 'hash_table'
|
2
|
+
require 'germ/config'
|
3
|
+
|
4
|
+
module GO
|
5
|
+
# A collection of "tag: value" pairs as read from the header or from one
# stanza of an ontology file. Every tag maps to the Set of all values seen
# for it, so a repeated tag (e.g. several is_a lines) keeps every value.
#
# Tags can also be read as methods: tags.name returns the single value when
# exactly one was recorded, or the whole Set when there are several.
class TagSet
  # "tag: value", optionally followed by " ! trailing comment" which is dropped.
  TAG_LINE = /^(.*?): (.*?)(?: !.*)?$/

  attr_reader :tags

  def initialize
    @tags = {}
  end

  # Parses one line and records its value under its tag.
  #
  # line - a String, already stripped of its line terminator.
  #
  # Returns the Set of values for the tag, or nil when the line is not of
  # the form "tag: value" (such lines are ignored rather than raising).
  def add_tag line
    parsed = TAG_LINE.match(line)
    return nil unless parsed

    tag, value = parsed.captures
    (@tags[tag.to_sym] ||= Set.new) << value
  end

  def respond_to_missing? sym, include_all = false
    @tags.key?(sym) || super
  end

  # Returns the Set of values recorded for sym, or an empty Array when the
  # tag was never seen.
  def tag sym
    @tags[sym] || []
  end

  # Exposes every recorded tag as a reader method. Unknown names fall
  # through to the default behaviour (NoMethodError), whatever arguments
  # they were called with.
  def method_missing sym, *args, &block
    set = @tags[sym]
    return super unless set

    # A tag reader takes no arguments, like any other attribute reader.
    unless args.empty?
      raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0)"
    end

    set.size == 1 ? set.first : set
  end
end
|
39
|
+
# One ontology term built from the tags of a single stanza.
#
# id, name, def and namespace are copied from the tags at construction
# time. Each is the single recorded value, the Set of values when the tag
# was repeated, or nil when the stanza did not carry that tag.
class Term
  attr_reader :id, :name, :def, :namespace

  # tags - the stanza's tags; must respond to tag(sym) the way TagSet does.
  # ont  - the owning ontology; must respond to term(id).
  def initialize tags, ont
    @tags = tags
    @ontology = ont
    @id = tag_value :id
    @name = tag_value :name
    @namespace = tag_value :namespace
    @def = tag_value :def
  end

  # Terms named by this term's is_a tags that the ontology can resolve
  # (memoized).
  def parent_terms
    @parent_terms ||= get_parent_terms
  end

  # This term followed by all of its ancestors, without duplicates
  # (memoized).
  def collected_terms
    @collected_terms ||= ([self] + parent_terms + parent_terms.map(&:collected_terms)).flatten.uniq
  end

  # Number of terms on the shortest is_a path from this term up to a term
  # without parents; a term without parents has depth 1 (memoized).
  def depth
    @depth ||= (parent_terms.map(&:depth).min || 0) + 1
  end

  private

  # Reads a tag without going through method_missing, so that an absent
  # optional tag yields nil instead of raising NoMethodError.
  def tag_value sym
    values = @tags.tag(sym)
    return nil if values.empty?

    values.size == 1 ? values.first : values
  end

  # Looks up every is_a value in the ontology, dropping unknown ids.
  def get_parent_terms
    @tags.tag(:is_a).map do |isa|
      @ontology.term isa
    end.compact
  end
end
|
70
|
+
# An ontology read from a stanza-structured text file: header tags first,
# then "[Term]" and "[Typedef]" stanzas separated by blank lines.
class Ontology
  extend GermDefault

  BLANK_LINE = /^\s*$/
  TERM = /^\[Term\]$/
  TYPEDEF = /^\[Typedef\]$/

  attr_reader :header

  # file - path of the ontology file. A path that does not exist is not an
  #        error: the ontology is simply left empty.
  def initialize file
    @header = TagSet.new
    @terms = {}
    @types = []
    # File.exists? is deprecated and was removed in Ruby 3.2.
    parse_file(file) if File.exist?(file)
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @terms=#{@terms.count}>"
  end

  # Returns the term stored under id, or nil.
  def term id
    @terms[id]
  end

  # Returns an Array of every term whose name matches pattern (via =~).
  def find pattern
    @terms.select do |id,term|
      term.name =~ pattern
    end.values
  end

  private

  # Reads the whole file. Lines outside any stanza go to the header; a
  # "[Term]" or "[Typedef]" marker hands the following lines to read_stanza.
  def parse_file file
    # Block form so the handle is closed even if parsing raises.
    File.open(file) do |io|
      @io = io
      while line = @io.gets
        line.chomp!
        next if line =~ BLANK_LINE
        if line =~ TERM
          read_term
        elsif line =~ TYPEDEF
          read_typedef
        else
          @header.add_tag line
        end
      end
    end
  ensure
    # Do not keep a reference to the closed handle.
    @io = nil
  end

  # Builds a term from the next stanza and indexes it by its id.
  def read_term
    term = GO::Term.new read_stanza, self
    @terms[term.id] = term
  end

  # Stores the tags of the next stanza as a type definition.
  def read_typedef
    @types.push read_stanza
  end

  # Collects tag lines from the current position of @io up to the next
  # blank line or the end of the file.
  def read_stanza
    stanza = TagSet.new
    while line = @io.gets
      line.chomp!
      break if line =~ BLANK_LINE
      stanza.add_tag line
    end
    stanza
  end
end
|
134
|
+
# A table of annotation lines that link database objects to ontology terms
# through their go_id column.
class Annotation < HashTable
  extend GermDefault

  # file - path handed to HashTable (optional).
  # opts - options handed to HashTable. :header, :skip_header and :comment
  #        are always overridden with the values below. :ontology is taken
  #        from the caller when given; otherwise GO::Ontology.default is
  #        used (and only then evaluated).
  def initialize file=nil, opts={}
    opts = opts.merge :header => [ :db, :db_object_id, :db_object_symbol,
                                   :qualifier, :go_id, :db_reference,
                                   :evidence_code, :with, :aspect, :db_object_name,
                                   :synonym, :db_object_type, :taxon, :date,
                                   :assigned_by, :annotation_extension,
                                   :gene_product_form_id ],
                      :skip_header => true,
                      :comment => "!",
                      # Previously the default always replaced the caller's
                      # ontology because the merged-in hash wins.
                      :ontology => (opts[:ontology] || GO::Ontology.default)
    super file, opts
    @annos = {}
  end

  # The ontology used to resolve go_id values (memoized from the options).
  def ontology
    @ontology ||= @opts[:ontology]
  end

  # One line of the table.
  class AnnotationLine < HashTable::HashLine
    # The ontology term named by this line's go_id.
    def term
      @table.ontology.term go_id
    end
  end

  # Lines whose db_object_symbol equals gene, cached per gene.
  def gene_annotations gene
    @annos[gene] ||= select do |g|
      g.db_object_symbol == gene
    end
  end

  line_class AnnotationLine
end
|
164
|
+
end
|
data/lib/gtf/gene.rb
ADDED
@@ -0,0 +1,293 @@
|
|
1
|
+
class GTF < HashTable
|
2
|
+
# Returns the GTF::Gene built from the lines indexed under gene_name `name`,
# caching it in @genes. Returns nil when the index lookup yields nothing.
def gene name
  gene_lines = idx(:gene_name, name)
  return nil unless gene_lines

  @genes[name] ||= GTF::Gene.new(gene_lines)
end
|
6
|
+
|
7
|
+
# Collects the transcript_start of every transcript of every indexed gene
# and passes the combined list to wrap. The result is memoized.
def promoters
  return @promoters if @promoters

  starts = []
  idx_keys(:gene_name).each do |gene_name|
    transcript_starts = gene(gene_name).transcripts.map(&:transcript_start)
    starts.concat transcript_starts
  end
  @promoters = wrap(starts)
end
|
16
|
+
|
17
|
+
# A gene assembled from a list of GTF lines: the line whose feature is
# "gene" plus the transcripts built from the remaining lines. Messages the
# class does not understand are forwarded to the "gene" line.
class Gene
  include Enumerable
  include IntervalList
  attr_reader :name, :strand, :transcripts, :intervals

  # Priority of site types when several transcripts cover a position;
  # the lowest number wins.
  SITE_RANK = { :cds => 1, :exon => 2, :utr => 3, :intron => 4, :transcript => 5, :igr => 6 }.freeze

  # intervals - the lines belonging to this gene; one of them must have the
  #             feature "gene".
  def initialize intervals
    @intervals = intervals.sort_by(&:start)
    @gene = @intervals.find { |l| l.feature == "gene" }
    @name = @gene.attribute[:gene_name]
    @strand = @gene.strand
    @transcripts = build_transcripts
  end

  # Yields every line of the gene, ordered by start.
  def each
    @intervals.each do |int|
      yield int
    end
  end

  # Describes pos relative to this gene: the best-ranked site reported by
  # the transcripts containing pos, or { :type => :igr } when none does.
  # Among equally ranked sites the first transcript's answer is returned.
  def site pos
    sites = @transcripts.map do |t|
      { :gene => name }.update(t.site pos) if t.contains? pos
    end.compact
    sites.push(:type => :igr)
    sites.min_by { |s| SITE_RANK[s[:type]] }
  end

  # compute unified intervals from the list of intervals
  # NOTE(review): exons comes from Array#select; plain Array#flatten does
  # not call a block, so the feature rename below presumably relies on an
  # extension defined elsewhere - confirm.
  def unified
    @unified ||= exons.flatten do |unif|
      unif.feature = "unified"
    end
  end

  # Lines whose feature is "exon" (memoized).
  def exons
    @exons ||= @intervals.select { |e| e.feature == "exon" }
  end

  # The transcript with the highest canonical_transcript_score, or nil
  # when the gene has no transcripts.
  def canonical
    # find out which transcript has the longest cds
    canon = @transcripts.max_by(&:canonical_transcript_score)
    # NOTE(review): Transcript#cds_size returns an Integer, which is always
    # truthy in Ruby; if "no CDS" was meant to yield nil this needs "> 0".
    canon if canon && canon.cds_size
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @transcripts=#{@transcripts.count}>"
  end

  def respond_to_missing? sym, include_all = false
    @gene.respond_to?(sym) || super
  end

  # Forwards anything else to the "gene" line.
  def method_missing sym, *args, &block
    @gene.send sym, *args, &block
  end

  private

  # Builds one GTF::Transcript per "transcript" line, each from the lines
  # sharing its transcript_name and seqname.
  def build_transcripts
    (@intervals.select { |l| l.feature == "transcript" }).map do |t|
      name = t.transcript_name
      transcript_ints = @intervals.select do |l|
        l.attribute[:transcript_name] == name && l.seqname == t.seqname
      end
      # NOTE(review): @gtf is never assigned in this class, so nil is passed.
      GTF::Transcript.new(transcript_ints, name, @gtf)
    end
  end
end
|
84
|
+
|
85
|
+
# One transcript: the lines sharing a transcript_name. Messages the class
# does not understand (strand, start, stop, seqname, ...) are forwarded to
# the line whose feature is "transcript".
class Transcript
  attr_reader :name, :intervals, :introns, :transcript

  # array - the lines of this transcript.
  # name  - the transcript name.
  # gtf   - stored in @gtf; not used by any method of this class.
  def initialize array, name, gtf
    @intervals = array
    @name = name
    @gtf = gtf

    @transcript = @intervals.find { |t| t.feature == "transcript" }
  end

  # Classifies pos against the lines of this transcript and returns a Hash
  # with a :type key.
  def site pos
    point = @transcript.clone do |c|
      c.start = c.stop = pos
    end
    overlaps = @intervals.select { |f| f.contains? point }
    # NOTE(review): the cds method below selects the feature "CDS" while this
    # test uses "cds", and cds_pos is defined without parameters, so this
    # branch would raise ArgumentError if it were ever taken - confirm intent.
    return cds_pos point if overlaps.find { |f| f.feature == "cds" }
    intron = overlaps.find { |f| f.feature == "intron" }
    return intron_pos intron if intron
    return utr_pos if overlaps.find { |f| f.feature =~ /UTR/ }
    { :type => :transcript }
  end

  def utr_pos
    { :type => :utr }
  end

  # find the terminal frame of the leading exon
  def intron_frame intron
    if strand == "+"
      leading = intron.prev_exon
      (leading.frame + leading.size) % 3
    else
      intron.post_exon.frame
    end
  end

  # cds_size, plus a bonus of 100000 for transcripts carrying a ccds_id.
  def canonical_transcript_score
    bonus = is_ccds? ? 100000 : 0
    bonus + cds_size
  end

  def is_ccds?
    respond_to?(:ccds_id) && !ccds_id.nil?
  end

  # Sum of the sizes of all CDS lines.
  def cds_size
    cds.inject(0) { |sum, reg| sum + reg.size }
  end

  # Concatenated CDS sequence, reverse-complemented unless strand is "+"
  # (memoized).
  def cds_seq
    @cds_seq ||= get_cds_seq
  end

  # One GenomicLocus::Position per CDS base, reversed unless strand is "+"
  # (memoized).
  def cds_pos
    @cds_pos ||= get_cds_pos
  end

  # Amino-acid letters of all trinucleotides, joined into one String.
  def protein_seq
    trinucs.map { |t| t.codon.aa.letter }.join ''
  end

  # Amino-acid letters of the trinucleotides having a position that
  # overlaps locus.
  def protein_seq_at locus
    trinucs.map do |t|
      # Just include it if it overlaps the locus
      t.codon.aa.letter if t.pos.any? { |p| p.overlaps? locus }
    end.compact.join ''
  end

  # Describes the amino-acid change caused by mutation as
  # "<before><1-based index of the first affected codon><after>".
  # Returns nil when no trinucleotide overlaps the mutation.
  def protein_change mutation
    # replace the positions that overlap the mutation
    tnucs = trinucs.select do |tn|
      tn.pos.any? { |p| p.overlaps? mutation }
    end
    return nil if tnucs.empty?

    muts = tnucs.map do |tn|
      # String#to_s returns the receiver itself, so copy before editing;
      # otherwise the memoized trinucleotide could be changed in place.
      seq = tn.seq.to_s.dup
      3.times do |i|
        next unless mutation.overlaps? tn.pos[i]
        seq[i] = mutation.alt_at(tn.pos[i])
        # the stored sequence is strand-corrected, the alternate base is not
        seq[i] = seq[i].tr('ATGC', 'TACG') if strand == "-"
      end
      TriNuc.new seq, tn.pos, strand
    end
    pre = tnucs.map { |tn| tn.codon.aa.letter }.join ''
    post = muts.map { |tn| tn.codon.aa.letter }.join ''
    "#{pre}#{tnucs.first.index+1}#{post}"
  end

  # The trinucleotides of this transcript's CDS (memoized).
  def trinucs
    @trinucs ||= trinucs_for cds_seq, cds_pos
  end

  # Cuts cds_seq and cds_pos into consecutive groups of three and wraps
  # each group in a TriNuc; an incomplete trailing group is dropped.
  def trinucs_for cds_seq, cds_pos
    aa_count = cds_seq.size / 3
    aa_count.times.map do |i|
      range = 3 * i .. 3*i + 2
      TriNuc.new cds_seq[range], cds_pos[range], strand, i
    end
  end

  # NOTE(review): cds_pos is defined without parameters, so the call below
  # raises ArgumentError as written - confirm the intended helper.
  def intron_pos intron
    { :type => :intron, :pos => cds_pos(intron.start-1), :frame => intron_frame(intron) }
  end

  # Exon remainders labelled "3' UTR" (memoized).
  # NOTE(review): which side of the CDS this selects depends on the
  # semantics of above?/below?/strict_diff, defined elsewhere - confirm that
  # utr3 and utr5 are not swapped.
  def utr3
    return @utr3 if @utr3
    cs = strand == "+" ? cds.first : cds.last
    @utr3 = exons.select { |e| strand == "+" ? !e.above?(cs) : !e.below?(cs) }
      .map { |e| e.strict_diff(cs) }
      .compact.map(&:to_a)
    @utr3.each do |u|
      u.feature = "3' UTR"
    end
  end

  # Exon remainders labelled "5' UTR" (memoized); see the note on utr3.
  def utr5
    return @utr5 if @utr5
    cs = strand == "+" ? cds.last : cds.first
    @utr5 = exons.select { |e| strand == "+" ? !e.below?(cs) : !e.above?(cs) }
      .map { |e| e.strict_diff(cs) }
      .compact.map(&:to_a)
    @utr5.each do |u|
      u.feature = "5' UTR"
    end
  end

  # Creates an "intron" line for the gap between each pair of neighbouring
  # exons, remembers its flanking exons, and appends the introns to
  # @intervals.
  def build_introns
    return unless exons
    @introns = exons.each_cons(2).map do |e1, e2|
      intron = e1.clone do |c|
        c.start = e1.stop+1
        c.stop = e2.start-1
      end
      intron.feature = "intron"
      intron.prev_exon = e1
      intron.post_exon = e2
      intron
    end
    @intervals.concat @introns
  end

  # Appends UTRs to @intervals, but only those already computed through
  # utr3 / utr5.
  def build_utrs
    @intervals.concat @utr3 if @utr3
    @intervals.concat @utr5 if @utr5
  end

  def respond_to_missing? sym, include_all=false
    @transcript.respond_to?(sym) || super
  end

  # Forwards anything else to the "transcript" line.
  def method_missing sym, *args, &block
    @transcript.send(sym, *args, &block)
  end

  # True when pos lies within start..stop of the transcript line.
  def contains? pos
    start <= pos && stop >= pos
  end

  # A copy of the transcript line collapsed onto its first base in
  # transcription direction, with feature "transcript_start" (memoized).
  def transcript_start
    @transcript_start ||= @transcript.clone do |c|
      c.start = c.stop = (strand == "+" ? start : stop)
      c.feature = "transcript_start"
    end
  end

  # Lines whose feature is "exon" (memoized).
  def exons
    @exons ||= @intervals.select { |e| e.feature == "exon" }
  end

  # Lines whose feature is "CDS" (memoized).
  def cds
    @cds ||= @intervals.select { |e| e.feature == "CDS" }
  end

  def inspect
    "#<#{self.class}:0x#{'%x' % (object_id << 1)} @name=#{@name} @intervals=#{@intervals.count}>"
  end

  # output this transcript in the odious 'refFlat' format, demanded by Picard and others
  def to_refflat
    [ gene_name, name, seqname, strand, start, stop, cds.map(&:start).min, cds.map(&:stop).max, exons.count,
      exons.map(&:start).sort.join(','),
      exons.map(&:stop).sort.join(',')
    ].join "\t"
  end

  private

  def get_cds_pos
    pos = cds.map do |c|
      c.size.times.map do |i|
        GenomicLocus::Position.new c.seqname, i + c.start
      end
    end.flatten

    strand == "+" ? pos : pos.reverse
  end

  def get_cds_seq
    seq = cds.map(&:seq).join ''
    strand == "+" ? seq : seq.reverse.tr('ATGC','TACG')
  end
end
|
293
|
+
end
|