germ 0.1 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/go.rb ADDED
@@ -0,0 +1,164 @@
1
+ require 'hash_table'
2
+ require 'germ/config'
3
+
4
+ module GO
5
# A bag of "tag: value" pairs read from one stanza (or the header) of an
# ontology file. Each tag maps to a Set of every value seen for that tag.
#
# Tags are also exposed as dynamic readers: +tag_set.some_tag+ returns the
# sole value when exactly one was recorded, otherwise the whole Set.
class TagSet
  # "tag: value", with an optional trailing " ! comment" that is discarded.
  TAG_LINE = /^(.*?): (.*?)(?: !.*)?$/

  # Hash of Symbol tag => Set of String values.
  attr_reader :tags

  def initialize
    @tags = {}
  end

  # Parse one line and record its value under its tag.
  #
  # line - a String such as "is_a: GO:0048308 ! organelle inheritance".
  #
  # Returns the Set the value was added to, or nil when the line is not of
  # the form "tag: value" (such lines are skipped instead of raising
  # NoMethodError on nil).
  def add_tag(line)
    parsed = TAG_LINE.match(line)
    return nil unless parsed

    (@tags[parsed[1].to_sym] ||= Set.new) << parsed[2]
  end

  # Advertise the dynamic readers served by method_missing.
  def respond_to_missing?(sym, include_all = false)
    @tags.key?(sym) || super
  end

  # Returns the Set of values recorded for +sym+, or an empty Array when the
  # tag was never seen.
  def tag(sym)
    @tags[sym] || []
  end

  # Dynamic reader for recorded tags. Anything that is not a zero-argument
  # call for a known tag goes to super, so callers get the usual
  # NoMethodError rather than an ArgumentError from this method's arity.
  def method_missing(sym, *args, &block)
    values = @tags[sym]
    return super unless values && args.empty?

    values.size == 1 ? values.first : values
  end
end
39
# One ontology term, built from the tag set of its stanza plus a reference
# to the ontology that owns it (used to resolve is_a parents by id).
class Term
  attr_reader :id, :name, :def, :namespace

  # tags - object answering #id, #name, #namespace, #def and #tag(:is_a).
  # ont  - object answering #term(id).
  def initialize(tags, ont)
    @tags = tags
    @ontology = ont
    @id = tags.id
    @name = tags.name
    @namespace = tags.namespace
    @def = tags.def
  end

  # Direct is_a parents that the ontology could resolve (memoized).
  def parent_terms
    @parent_terms ||= get_parent_terms
  end

  # This term, its parents, and everything its parents collect, with
  # duplicates removed (memoized).
  def collected_terms
    @collected_terms ||= begin
      inherited = parent_terms.map(&:collected_terms)
      [self, parent_terms, inherited].flatten.uniq
    end
  end

  # One more than the smallest depth among the parents; a term without
  # resolvable parents has depth 1 (memoized).
  def depth
    @depth ||= begin
      shallowest = parent_terms.map(&:depth).min
      shallowest ? shallowest + 1 : 1
    end
  end

  private

  # Look up each is_a value in the ontology, dropping ids it does not know.
  def get_parent_terms
    resolved = @tags.tag(:is_a).map { |parent_id| @ontology.term(parent_id) }
    resolved.compact
  end
end
70
# An ontology read from a stanza-structured text file: header tag lines,
# followed by "[Term]" and "[Typedef]" stanzas that each end at a blank line.
class Ontology
  extend GermDefault

  BLANK_LINE = /^\s*$/
  TERM = /^\[Term\]$/
  TYPEDEF = /^\[Typedef\]$/

  # TagSet holding every tag line found outside of a stanza.
  attr_reader :header

  # file - path of the ontology file. A path that does not exist is not an
  #        error; it simply yields an ontology without terms.
  def initialize(file)
    @header = TagSet.new
    @terms = {}
    @types = []
    # File.exists? is deprecated and was removed in Ruby 3.2.
    parse_file(file) if File.exist?(file)
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @terms=#{@terms.count}>"
  end

  # Returns the term stored under +id+, or nil.
  def term(id)
    @terms[id]
  end

  # Returns an Array of the terms whose name matches +pattern+.
  def find(pattern)
    @terms.values.select { |term| term.name =~ pattern }
  end

  private

  # Read the whole file, dispatching on stanza headers. The handle is kept in
  # @io only while parsing so read_stanza can pull the stanza's lines from
  # it; the block form of File.open guarantees the file is closed.
  def parse_file(file)
    File.open(file) do |io|
      @io = io
      while (line = io.gets)
        line.chomp!
        next if line =~ BLANK_LINE

        if line =~ TERM
          read_term
        elsif line =~ TYPEDEF
          read_typedef
        else
          @header.add_tag line
        end
      end
    end
  ensure
    # Never keep a reference to a closed handle.
    @io = nil
  end

  # Consume one stanza and index the resulting term by its id.
  def read_term
    term = GO::Term.new read_stanza, self
    @terms[term.id] = term
  end

  # Consume one stanza and keep its raw TagSet.
  def read_typedef
    @types.push read_stanza
  end

  # Collect tag lines from @io until a blank line or end of file.
  def read_stanza
    stanza = TagSet.new
    while (line = @io.gets)
      line.chomp!
      break if line =~ BLANK_LINE

      stanza.add_tag line
    end
    stanza
  end
end
134
# A HashTable over an annotation file with a fixed 17-column layout, whose
# lines can resolve their go_id against an ontology.
class Annotation < HashTable
  extend GermDefault

  # file - path handed straight to HashTable (may be nil).
  # opts - HashTable options. :header, :skip_header and :comment are always
  #        replaced by the values this format requires. :ontology is honoured
  #        when the caller supplies one; GO::Ontology.default is only
  #        consulted otherwise (previously the merge always overwrote a
  #        caller-supplied ontology and always loaded the default).
  def initialize(file = nil, opts = {})
    required = {
      :header => [ :db, :db_object_id, :db_object_symbol,
                   :qualifier, :go_id, :db_reference,
                   :evidence_code, :with, :aspect, :db_object_name,
                   :synonym, :db_object_type, :taxon, :date,
                   :assigned_by, :annotation_extension,
                   :gene_product_form_id ],
      :skip_header => true,
      :comment => "!",
      :ontology => (opts[:ontology] || GO::Ontology.default)
    }
    super file, opts.merge(required)
    @annos = {}
  end

  # The ontology used to resolve go_ids; read from @opts, which is
  # presumably populated by HashTable#initialize -- confirm.
  def ontology
    @ontology ||= @opts[:ontology]
  end

  # A table line that can look up its own term.
  class AnnotationLine < HashTable::HashLine
    # Resolves this line's go_id through the owning table's ontology.
    def term
      @table.ontology.term go_id
    end
  end

  # Lines whose db_object_symbol equals +gene+, memoized per gene.
  def gene_annotations(gene)
    @annos[gene] ||= select { |line| line.db_object_symbol == gene }
  end

  # Must follow the AnnotationLine definition above.
  line_class AnnotationLine
end
164
+ end
data/lib/gtf/gene.rb ADDED
@@ -0,0 +1,293 @@
1
+ class GTF < HashTable
2
# Returns the GTF::Gene built from the lines indexed under +name+, caching it
# in @genes; returns nil when the index lookup yields nothing.
def gene(name)
  gene_lines = idx(:gene_name, name)
  return unless gene_lines

  @genes[name] ||= GTF::Gene.new(gene_lines)
end
6
+
7
# Returns the wrapped list of transcript_start intervals of every transcript
# of every indexed gene name (memoized).
def promoters
  @promoters ||= begin
    starts = idx_keys(:gene_name).flat_map do |gene_name|
      gene(gene_name).transcripts.map(&:transcript_start)
    end
    wrap starts
  end
end
16
+
17
# All lines that share one gene name: the "gene" line itself, its
# transcripts, and their sub-features. Unknown messages are forwarded to the
# "gene" line.
class Gene
  include Enumerable
  include IntervalList
  attr_reader :name, :strand, :transcripts, :intervals

  # Ranking of site types; the lowest score wins in #site.
  SITE_PRIORITY = { :cds => 1, :exon => 2, :utr => 3, :intron => 4, :transcript => 5, :igr => 6 }

  # intervals - the lines of this gene. They are sorted by start, and must
  #             include one whose feature is "gene".
  def initialize(intervals)
    @intervals = intervals.sort_by(&:start)
    @gene = @intervals.find { |l| l.feature == "gene" }
    @name = @gene.attribute[:gene_name]
    @strand = @gene.strand
    @transcripts = build_transcripts
  end

  def each
    @intervals.each do |int|
      yield int
    end
  end

  # Classify +pos+ against every transcript that contains it and return the
  # highest-priority description, tagged with :gene. Falls back to
  # { :type => :igr } when no transcript contains the position.
  def site(pos)
    sites = @transcripts.map do |t|
      { :gene => name }.update(t.site(pos)) if t.contains? pos
    end.compact
    sites.push(:type => :igr)
    # min_by returns the first best candidate; sort_by{}.first gave no
    # ordering guarantee between equally ranked candidates.
    sites.min_by { |s| SITE_PRIORITY[s[:type]] }
  end

  # compute unified intervals from the list of intervals
  # NOTE(review): core Array#flatten ignores a block, so this presumably
  # relies on a project-specific flatten -- confirm.
  def unified
    @unified ||= exons.flatten do |unif|
      unif.feature = "unified"
    end
  end

  # The lines whose feature is "exon" (memoized).
  def exons
    @exons ||= @intervals.select { |e| e.feature == "exon" }
  end

  # The transcript with the highest canonical_transcript_score, or nil when
  # the gene has no transcripts (previously a NoMethodError on nil).
  def canonical
    # find out which transcript has the longest cds
    canon = @transcripts.max_by(&:canonical_transcript_score)
    canon if canon && canon.cds_size
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @transcripts=#{@transcripts.count}>"
  end

  def respond_to_missing?(sym, include_all = false)
    @gene.respond_to?(sym) || super
  end

  # Forward anything unknown to the "gene" line.
  def method_missing(sym, *args, &block)
    @gene.send sym, *args, &block
  end

  private

  # One GTF::Transcript per "transcript" line, built from the lines that
  # share its transcript_name and seqname.
  def build_transcripts
    @intervals.select { |l| l.feature == "transcript" }.map do |t|
      transcript_name = t.transcript_name
      members = @intervals.select do |l|
        l.attribute[:transcript_name] == transcript_name && l.seqname == t.seqname
      end
      # NOTE(review): @gtf is never assigned in this class as shown, so nil
      # is passed here -- confirm whether the owning GTF should be given.
      GTF::Transcript.new(members, transcript_name, @gtf)
    end
  end
end
84
+
85
# One transcript: its "transcript" line plus the sub-feature lines (exons,
# CDS, and optionally generated introns / UTRs) that belong to it. Unknown
# messages (start, stop, strand, seqname, ...) are forwarded to the
# "transcript" line.
class Transcript
  attr_reader :name, :intervals, :introns, :transcript

  # array - the lines of this transcript, including the "transcript" line.
  # name  - the transcript name.
  # gtf   - stored in @gtf; not used by the methods of this class.
  def initialize(array, name, gtf)
    @intervals = array
    @name = name
    @gtf = gtf

    @transcript = @intervals.find { |t| t.feature == "transcript" }
  end

  # Describe what +pos+ hits in this transcript, checked in the order
  # CDS, intron, UTR, and finally plain transcript.
  #
  # Returns a Hash with at least :type. Coding hits also carry :pos, the
  # index from #cds_pos. The feature name matches the one used by #cds
  # ("CDS"); the former lowercase comparison never matched those lines and
  # its branch called cds_pos with an argument it did not accept.
  def site(pos)
    probe = @transcript.clone do |c|
      c.start = c.stop = pos
    end
    overlaps = @intervals.select { |f| f.contains? probe }
    return { :type => :cds, :pos => cds_pos(pos) } if overlaps.any? { |f| f.feature == "CDS" }

    intron = overlaps.find { |f| f.feature == "intron" }
    return intron_pos(intron) if intron
    return utr_pos if overlaps.any? { |f| f.feature =~ /UTR/ }

    { :type => :transcript }
  end

  def utr_pos
    { :type => :utr }
  end

  def intron_frame(intron)
    # find the terminal frame of the leading exon
    if strand == "+"
      (intron.prev_exon.frame + intron.prev_exon.size) % 3
    else
      intron.post_exon.frame
    end
  end

  # CDS length, plus a large bonus when the transcript has a ccds_id.
  def canonical_transcript_score
    (is_ccds? ? 100000 : 0) + cds_size
  end

  def is_ccds?
    respond_to?(:ccds_id) && !ccds_id.nil?
  end

  # Total size of all CDS lines (0 when there are none).
  def cds_size
    cds.inject(0) { |total, region| total + region.size }
  end

  # Coding sequence in transcript orientation (memoized).
  def cds_seq
    @cds_seq ||= get_cds_seq
  end

  # Without an argument: the memoized list of coding positions in transcript
  # orientation.
  #
  # With +locus+ (an Integer coordinate, or anything answering #start): the
  # 0-based index of that coordinate within the list above, or nil when the
  # coordinate is not inside a CDS line. The optional argument keeps the
  # zero-argument form unchanged while giving #site and #intron_pos, which
  # always passed an argument, something valid to call.
  def cds_pos(locus = nil)
    return @cds_pos ||= get_cds_pos if locus.nil?

    cds_index_of(locus.respond_to?(:start) ? locus.start : locus)
  end

  def protein_seq
    trinucs.map do |t|
      t.codon.aa.letter
    end.join ''
  end

  def protein_seq_at(locus)
    trinucs.map do |t|
      # Just include it if it overlaps the locus
      t.codon.aa.letter if t.pos.any? { |p| p.overlaps? locus }
    end.compact.join ''
  end

  # Returns a String "<reference residues><1-based index of the first
  # affected codon><mutated residues>", or nil when the mutation overlaps no
  # codon.
  def protein_change(mutation)
    # replace the positions that overlap the mutation
    tnucs = trinucs.select do |tn|
      tn.pos.any? do |p|
        p.overlaps? mutation
      end
    end
    return nil if tnucs.empty?

    muts = tnucs.map do |tn|
      # Copy before editing: String#to_s returns the receiver itself, so
      # editing it in place would also alter the memoized reference codon.
      seq = tn.seq.to_s.dup
      3.times do |i|
        next unless mutation.overlaps? tn.pos[i]

        seq[i] = mutation.alt_at(tn.pos[i])
        seq[i] = seq[i].tr('ATGC', 'TACG') if strand == "-"
      end
      TriNuc.new seq, tn.pos, strand
    end
    pre = tnucs.map { |tn| tn.codon.aa.letter }.join ''
    post = muts.map { |tn| tn.codon.aa.letter }.join ''
    "#{pre}#{tnucs.first.index+1}#{post}"
  end

  # Codons of the coding sequence (memoized).
  def trinucs
    @trinucs ||= trinucs_for cds_seq, cds_pos
  end

  # Cut +cds_seq+ / +cds_pos+ into consecutive triples; a trailing partial
  # codon is dropped.
  def trinucs_for(cds_seq, cds_pos)
    aa_count = cds_seq.size / 3
    aa_count.times.map do |i|
      range = 3 * i .. 3 * i + 2
      TriNuc.new cds_seq[range], cds_pos[range], strand, i
    end
  end

  # Site description for a position inside +intron+; :pos is the coding
  # index of the base just before the intron's start (nil when that base is
  # not coding).
  def intron_pos(intron)
    { :type => :intron, :pos => cds_pos(intron.start - 1), :frame => intron_frame(intron) }
  end

  # NOTE(review): whether this selects the 3' or the 5' side depends on the
  # semantics of above?/below?/strict_diff, which are defined elsewhere.
  # After map(&:to_a) the elements receive feature=, which plain Arrays do
  # not support -- confirm the element types.
  def utr3
    return @utr3 if @utr3

    cs = strand == "+" ? cds.first : cds.last
    @utr3 = exons.select { |e| strand == "+" ? !e.above?(cs) : !e.below?(cs) }
      .map { |e| e.strict_diff(cs) }
      .compact.map(&:to_a)
    @utr3.each do |u|
      u.feature = "3' UTR"
    end
  end

  # NOTE(review): same caveats as utr3.
  def utr5
    return @utr5 if @utr5

    cs = strand == "+" ? cds.last : cds.first
    @utr5 = exons.select { |e| strand == "+" ? !e.below?(cs) : !e.above?(cs) }
      .map { |e| e.strict_diff(cs) }
      .compact.map(&:to_a)
    @utr5.each do |u|
      u.feature = "5' UTR"
    end
  end

  # Create an "intron" line for every gap between consecutive exons (in the
  # order #exons returns them), remember the flanking exons on it, and append
  # the introns to @intervals. Returns @intervals.
  def build_introns
    @introns = exons.each_cons(2).map do |e1, e2|
      intron = e1.clone do |c|
        c.start = e1.stop + 1
        c.stop = e2.start - 1
      end
      intron.feature = "intron"
      intron.prev_exon = e1
      intron.post_exon = e2
      intron
    end
    @intervals.concat @introns
  end

  # Append already-computed UTRs to @intervals.
  # NOTE(review): only acts when utr3/utr5 were called beforehand, because it
  # reads the instance variables rather than the methods -- confirm intent.
  def build_utrs
    @intervals.concat @utr3 if @utr3
    @intervals.concat @utr5 if @utr5
  end

  def respond_to_missing?(sym, include_all = false)
    @transcript.respond_to?(sym) || super
  end

  # Forward anything unknown to the "transcript" line.
  def method_missing(sym, *args, &block)
    @transcript.send(sym, *args, &block)
  end

  # True when the coordinate +pos+ lies within start..stop.
  def contains?(pos)
    start <= pos && stop >= pos
  end

  # A copy of the transcript line collapsed onto its first transcribed base
  # (start on "+", stop otherwise), with feature "transcript_start".
  def transcript_start
    @transcript_start ||= @transcript.clone do |c|
      c.start = c.stop = (strand == "+" ? start : stop)
      c.feature = "transcript_start"
    end
  end

  def exons
    @exons ||= @intervals.select { |e| e.feature == "exon" }
  end

  def cds
    @cds ||= @intervals.select { |e| e.feature == "CDS" }
  end

  def inspect
    "#<#{self.class}:0x#{'%x' % (object_id << 1)} @name=#{@name} @intervals=#{@intervals.count}>"
  end

  # output this transcript in the odious 'refFlat' format, demanded by Picard and others
  def to_refflat
    [ gene_name, name, seqname, strand, start, stop, cds.map(&:start).min, cds.map(&:stop).max, exons.count,
      exons.map(&:start).sort.join(','),
      exons.map(&:stop).sort.join(',')
    ].join "\t"
  end

  private

  # Every coding position, CDS line by CDS line, reversed for non-"+" strands.
  def get_cds_pos
    pos = cds.map do |c|
      c.size.times.map do |i|
        GenomicLocus::Position.new c.seqname, i + c.start
      end
    end.flatten

    strand == "+" ? pos : pos.reverse
  end

  # Joined CDS sequence; reverse-complemented for non-"+" strands.
  def get_cds_seq
    seq = cds.map(&:seq).join ''
    strand == "+" ? seq : seq.reverse.tr('ATGC', 'TACG')
  end

  # 0-based index of the coordinate +coord+ in the list built by get_cds_pos,
  # computed arithmetically from the CDS lines; nil when no CDS line covers it.
  def cds_index_of(coord)
    offset = 0
    cds.each do |region|
      if region.start <= coord && coord <= region.stop
        forward = offset + (coord - region.start)
        return strand == "+" ? forward : cds_size - 1 - forward
      end
      offset += region.size
    end
    nil
  end
end
293
+ end