germ 0.1 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/gtf.rb CHANGED
@@ -1,209 +1,48 @@
1
1
  require 'hash_table'
2
2
  require 'intervals'
3
+ require 'genomic_locus'
4
+ require 'genetic_code'
5
+ require 'gtf/gene'
6
+ require 'fasta'
7
+ require 'germ/config'
3
8
 
4
9
  class GTF < HashTable
5
- header_off
6
-
7
- class GTFLine < HashTable::HashLine
8
- include IntervalList::Interval
9
- def chrom; seqname; end
10
- def chrom= nc; seqname = nc; end
11
- def copy
12
- c = self.class.new @hash.clone
10
+ extend GermDefault
11
+ include IntervalList
12
+
13
+ def self.default_create file, idx=nil
14
+ if idx
15
+ new file, :idx => idx
16
+ else
17
+ new file
13
18
  end
14
19
  end
15
- line_class GTFLine
16
-
17
- class Gene
18
- class Transcript
19
- attr_reader :name, :intervals, :introns
20
- def initialize array, name
21
- @intervals = array
22
- @name = name
23
-
24
- @transcript = @intervals.find{|t| t.feature == "transcript"}
25
-
26
- build_introns
27
- end
28
-
29
- def site pos
30
- i = @transcript.clone :pos => pos
31
- intron = nil
32
- overlaps = @intervals.select{|f| f.contains? i }
33
- return cds_pos i if overlaps.find{|f| f.feature == "cds" }
34
- return intron_pos intron if intron = overlaps.find{|f| f.feature == "intron" }
35
- return utr_pos if overlaps.find{|f| f.feature =~ /UTR/ }
36
- { :type => :transcript }
37
- end
38
-
39
-
40
- def utr_pos
41
- { :type => :utr }
42
- end
43
-
44
- def intron_frame intron
45
- # find the terminal frame of the leading exon
46
- if strand == "+"
47
- (intron.prev_exon.frame + intron.prev_exon.size)%3
48
- else
49
- intron.post_exon.frame
50
- end
51
- end
52
-
53
- def cds_pos pos
54
- bases = 0
55
- if @strand == "+"
56
- cds.each do |c|
57
- if c.contains? pos
58
- bases += pos - c.start + 1
59
- break
60
- else
61
- bases += c.size
62
- end
63
- end
64
- else
65
- cds.reverse.each do |c|
66
- if c.contains? pos
67
- bases += c.stop - pos + 1
68
- break
69
- else
70
- bases += c.size
71
- end
72
- end
73
- end
74
- { :type => :cds, :pos => bases/3 }
75
- end
76
-
77
- def intron_pos intron
78
- { :type => :intron, :pos => cds_pos(intron.start-1), :frame => intron_frame(intron) }
79
- end
80
-
81
- def utr3
82
- return @utr3 if @utr3
83
- cs = strand == "+" ? cds.first : cds.last
84
- @utr3 = exons.select{ |e| strand == "+" ? !e.above?(cs) : !e.below?(cs) }
85
- .map{|e| e.strict_diff(cs) }
86
- .compact.map(&:to_a)
87
- @utr3.each do |u|
88
- u.feature = "3' UTR"
89
- end
90
- end
91
-
92
- def utr5
93
- return @utr5 if @utr5
94
- cs = strand == "+" ? cds.last : cds.first
95
- @utr5 = exons.select{|e| strand == "+" ? !e.below?(cs) : !e.above?(cs) }
96
- .map{|e| e.strict_diff(cs)}
97
- .compact.map(&:to_a)
98
- @utr5.each do |u|
99
- u.feature = "5' UTR"
100
- end
101
- end
102
-
103
- def build_introns
104
- return if !exons
105
- @introns = exons.map.with_index do |e1,i|
106
- e2 = @exons[i+1]
107
- next if !e2
108
- intron = e1.clone(:start => e1.stop+1, :stop => e2.start-1)
109
- intron.feature = "intron"
110
- intron.prev_exon = e1
111
- intron.post_exon = e2
112
- intron
113
- end.compact
114
- @intervals.concat @introns
115
- end
116
-
117
- def build_utrs
118
- @intervals.concat @utr3 if @utr3
119
- @intervals.concat @utr5 if @utr5
120
- end
121
20
 
122
- def start
123
- @transcript.start
124
- end
125
- def stop
126
- @transcript.stop
127
- end
128
- def strand
129
- @transcript.strand
130
- end
131
- def contains? pos
132
- start <= pos && stop >= pos
133
- end
134
- def exons
135
- @exons ||= @intervals.select{|e| e.feature == "exon"}.sort_by &:start
136
- end
137
- def cds
138
- @cds ||= @intervals.select{|e| e.feature == "CDS"}.sort_by &:start
139
- end
140
- end
141
-
142
- attr_reader :name, :strand, :transcripts, :intervals
143
- def initialize array
144
- @intervals = array
145
- @gene = @intervals.find{|l| l.feature == "gene"}
146
- @name = @gene.attribute[:gene_name]
147
- @strand = @gene.strand
148
- @transcripts = build_transcripts
149
- end
150
-
151
- def start
152
- @gene.start
153
- end
154
-
155
- def stop
156
- @gene.stop
157
- end
158
-
159
- def site pos
160
- score = { :cds => 1, :exon => 2, :utr => 3, :intron => 4, :transcript => 5, :igr => 6 }
161
- sites = @transcripts.map do |t|
162
- { :gene => name }.update(t.site pos) if t.contains? pos
163
- end.compact
164
- sites.push(:type => :igr)
165
- sites.sort_by{|s| score[s[:type]] }.first
166
- end
21
+ header_off
167
22
 
168
- # compute unified intervals from the list of intervals
169
- def unified
170
- ints = @intervals
171
- if block_given?
172
- ints = ints.select do |i|
173
- yield i
174
- end
175
- end
176
- list = IntervalList.new ints, :type => :flat
177
- list.collapse!
178
- list.to_a
23
+ class Feature < HashTable::HashLine
24
+ include GenomicLocus
25
+ def copy
26
+ self.class.new @hash.clone, @table
179
27
  end
180
28
 
181
- def canonical
182
- # find out which transcript has the longest cds
183
- @transcripts.max_by do |t|
184
- t.cds.inject(0) do |sum,cds|
185
- sum += cds.size
186
- end
187
- end
29
+ def seq
30
+ @seq ||= @table.fasta.locus_seq self
188
31
  end
189
32
 
190
- def inspect
191
- "#<#{self.class.name}:#{object_id} @transcripts=#{@transcripts.count}>"
33
+ def respond_to_missing? sym, include_all = false
34
+ self[:attribute].has_key?(sym) || super
192
35
  end
193
36
 
194
- private
195
- def build_transcripts
196
- (@intervals.select{|l| l.feature == "transcript"} || []).map do |t|
197
- name = t.attribute[:transcript_name]
198
- Transcript.new @intervals.select{|l| l.attribute[:transcript_name] == name}, name
37
+ def method_missing sym, *args, &block
38
+ if self[:attribute].has_key?(sym)
39
+ self[:attribute][sym]
40
+ else
41
+ super
199
42
  end
200
43
  end
201
44
  end
202
-
203
- def gene name
204
- intervals = gene_name[name]
205
- @genes[name] ||= GTF::Gene.new intervals if intervals
206
- end
45
+ line_class GTF::Feature
207
46
 
208
47
  def initialize file, opts=nil
209
48
  opts = { :comment => "#", :sep => " "}.merge(opts || {})
@@ -214,27 +53,20 @@ class GTF < HashTable
214
53
 
215
54
  super file, :comment => opts[:comment], :idx => opts[:idx],
216
55
  :header => [ :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :attribute ],
217
- :types => [ :str, :str, :str, :int, :int, :int, :str, :int, [ ";", @sep ] ]
56
+ :types => { :start => :int, :stop => :int, :score => :int, :frame =>
57
+ :int, :attribute => [ ";", @sep ] }
218
58
  end
219
59
 
220
60
  def inspect
221
61
  "#<#{self.class}:0x#{'%x' % (object_id << 1)} @lines=#{@lines.count}>"
222
62
  end
223
63
 
224
- def to_interval_list
225
- IntervalList.new self
64
+ def fasta
65
+ @opts[:fasta] || Fasta.default
226
66
  end
227
67
 
228
- def format_line g
229
- [ :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :attribute ].map do |h|
230
- if h == :attribute
231
- g[:attribute].map do |k,v|
232
- "#{k}#{@sep}#{v}"
233
- end.join("; ")
234
- else
235
- g[h]
236
- end
237
- end.join("\t")
68
+ def add_line hash
69
+ add_interval(super)
238
70
  end
239
71
 
240
72
  protected
data/lib/hash_table.rb CHANGED
@@ -9,18 +9,28 @@ class HashTable
9
9
  include HashTableAux
10
10
 
11
11
  class HashLine
12
- def initialize h
13
- if h.is_a? Array
14
- @hash = Hash[h]
15
- elsif h.is_a? Hash
16
- @hash = h
12
+ def self.alias_key sym1, sym2
13
+ define_method sym1 do
14
+ send sym2
17
15
  end
16
+ define_method "#{sym1}=" do |v|
17
+ send "#{sym2}=", v
18
+ end
19
+ end
20
+
21
+ def initialize h, table
22
+ @hash = Hash[h]
23
+ @table = table
18
24
  end
19
25
 
20
26
  def update hash
21
27
  @hash.update hash
22
28
  end
23
29
 
30
+ def set_table t
31
+ @table = t
32
+ end
33
+
24
34
  def [] ind
25
35
  @hash[ind]
26
36
  end
@@ -41,18 +51,48 @@ class HashTable
41
51
  @invalid
42
52
  end
43
53
 
54
+ def respond_to_missing? sym, include_all = false
55
+ if sym.to_s =~ /^(.*)=$/
56
+ true
57
+ else
58
+ @hash.has_key?(sym) || super
59
+ end
60
+ end
61
+
44
62
  def method_missing sym, *args, &block
45
- if @hash[sym]
63
+ if @hash.has_key? sym
46
64
  @hash[sym]
47
65
  elsif sym.to_s =~ /(.*)=/
48
66
  @hash[$1.to_sym] = args.first
49
67
  else
50
- nil
68
+ super
69
+ end
70
+ end
71
+
72
+ def to_s
73
+ @table.header.map do |h|
74
+ format_column h
75
+ end.join("\t")
76
+ end
77
+
78
+ def format_column column
79
+ if send(column).is_a?(Hash) && @table.types[column].is_a?(Array)
80
+ send(column).map do |key,value|
81
+ if value == true
82
+ # just print the key
83
+ key
84
+ else
85
+ "#{key}#{@table.types[column][1]}#{value}"
86
+ end
87
+ end.join @table.types[column][0]
88
+ else
89
+ send(column)
51
90
  end
52
91
  end
53
92
  end
54
93
 
55
94
  class << self
95
+ attr_reader :comment
56
96
  def line_type
57
97
  @line_type || HashLine
58
98
  end
@@ -73,44 +113,78 @@ class HashTable
73
113
  end
74
114
  header_on
75
115
 
76
- attr_accessor :header
116
+ attr_accessor :header, :types
77
117
  def [](ind)
78
- @lines[ind]
79
- end
80
-
81
- def method_missing sym, *args, &block
82
- if @index[sym]
83
- @index[sym]
118
+ if ind.is_a? Range
119
+ wrap @lines[ind]
84
120
  else
85
- super sym, *args, &block
121
+ @lines[ind]
86
122
  end
87
123
  end
88
124
 
125
+ def idx key, value=nil
126
+ @wrapped_index[ [key, value] ] ||= get_wrapped_table key, value
127
+ end
128
+
129
+ def idx_keys(key)
130
+ @bare_index[key].keys
131
+ end
132
+
89
133
  def sum(col)
90
134
  inject(0) do |sum,line|
91
135
  sum += line[col].to_f
92
136
  end
93
137
  end
94
138
 
139
+ [ :select, :reject, :sort, :sort_by ].each do |meth|
140
+ define_method(meth) do |&block|
141
+ wrap @lines.send(meth, &block)
142
+ end
143
+ end
144
+
145
+ def sample *args
146
+ samp = @lines.sample *args
147
+ if samp.is_a? Array
148
+ wrap samp
149
+ else
150
+ samp
151
+ end
152
+ end
153
+
154
+
95
155
  def select! &block
96
156
  @lines.select! &block
157
+ self
97
158
  end
98
159
 
99
160
  def sort_by! &block
100
161
  @lines.sort_by! &block
162
+ self
101
163
  end
102
164
 
103
165
  def use_header?
104
166
  self.class.use_header?
105
167
  end
106
168
 
169
+ def formatted_header
170
+ @header.map do |h|
171
+ @sleeve[h] || h
172
+ end.join("\t")
173
+ end
174
+
175
+ def preamble
176
+ @preamble
177
+ end
178
+
107
179
  def output f
108
- f.puts @header.join("\t") if use_header?
180
+ f.puts preamble
181
+ f.puts formatted_header if use_header?
109
182
  @lines.each do |l|
110
183
  l = yield l if block_given?
111
184
  next if !l || l.invalid?
112
- f.puts format_line(l)
185
+ f.puts l.to_s
113
186
  end
187
+ true
114
188
  end
115
189
 
116
190
  def inspect
@@ -123,73 +197,135 @@ class HashTable
123
197
  end
124
198
  end
125
199
 
126
- def initialize(file,opts={})
127
- @header = opts[:header]
128
- @skip_header = opts[:skip_header] && opts[:header]
129
- if @header.is_a? Hash
130
- @types = @header.values
131
- @header = @header.keys
132
- end
133
- create_index opts[:idx]
200
+ def initialize(obj=nil,opts={})
201
+ fix_opts(opts)
202
+ create_header
203
+ create_index
134
204
  @lines = []
135
- @comment = opts[:comment]
136
- @types ||= opts[:types]
205
+ @preamble = []
206
+ @sleeve = {}
207
+ @comment = @opts[:comment] || self.class.comment
208
+
209
+ if obj && obj.is_a?(String) && File.exists?(obj)
210
+ parse_file obj
211
+ elsif obj && obj.is_a?(Array)
212
+ # it's a stack of lines. Go with it.
213
+ @lines = obj
214
+ end
215
+ end
216
+
217
+ def << hash
218
+ add_line hash
219
+ end
220
+
221
+ def concat other_table
222
+ raise TypeError unless other_table.is_a? self.class
223
+ other_table.each do |line|
224
+ add_line line
225
+ end
226
+ self
227
+ end
137
228
 
138
- parse_file(file) if file && File.exists?(file)
229
+ def wrap lines
230
+ self.class.new lines, @opts.merge( :header => @header.clone, :types => @types.clone )
139
231
  end
140
232
 
233
+ def update_index key
234
+ create_index_for key
235
+ @lines.each do |line|
236
+ index_line_to_key line, key
237
+ end
238
+ end
239
+
240
+ protected
141
241
  def add_line hash
142
242
  if hash.is_a? HashLine
143
243
  @lines.push hash
244
+ hash.set_table self
144
245
  else
145
246
  @lines.push create_line(hash)
146
247
  end
248
+ index_line @lines.last
249
+ end
250
+
251
+ def create_header
252
+ validate_header
253
+
254
+ validate_types
255
+ end
256
+
257
+ def validate_header
258
+ @header = @opts[:header]
259
+ if @header.is_a? Hash
260
+ @opts[:types] = @header
261
+ @header = @header.keys
262
+ end
263
+ @skip_header = @opts[:skip_header] && @header
264
+ end
265
+
266
+ def enforce_header
267
+ end
268
+
269
+ def validate_types
270
+ @types = @opts[:types] || {}
271
+
272
+ raise TypeError, "Types must be a Hash!" unless @types.is_a?(Hash)
273
+
274
+ @types.each do |key,type|
275
+ case type
276
+ when Array
277
+ raise ArgumentError unless type.length == 2 && type.all?{|n| n.is_a? String}
278
+ end
279
+ end
147
280
  end
148
281
 
149
- private
150
282
  def parse_file file
151
283
  load_file file
284
+
285
+ fix_lines
286
+ end
287
+
288
+ def fix_lines
152
289
  @lines.each_index do |i|
153
290
  @lines[i] = create_line @lines[i]
154
- add_index @lines[i] unless @index.empty?
291
+ index_line @lines[i]
155
292
  end
156
293
  end
157
294
 
158
- def create_index idx
159
- if !idx
160
- @index = {}
161
- return
162
- end
163
- idx = [ idx ] if !idx.is_a? Array
164
- @index = Hash[idx.map{|i| [ i, {} ] }]
295
+ def fix_opts opts
296
+ @opts = opts
297
+ @opts[:idx] = [ @opts[:idx] ].flatten.compact
165
298
  end
166
299
 
167
- def set_header s, downcase=nil
168
- return nil if @header
169
- @header = s.chomp.split(/\t/).map{|s| downcase ? s.downcase.to_sym : s.to_sym }
300
+ def create_index
301
+ @bare_index = {}
302
+ @wrapped_index = {}
303
+ @opts[:idx].each do |key|
304
+ create_index_for key
305
+ end
170
306
  end
171
307
 
172
- def format_line l
173
- @header.map{|h| l[h]}.join("\t")
308
+ def create_index_for key
309
+ @bare_index[key] ||= Hash.new do |h,k| h[k] = []; end
174
310
  end
175
311
 
176
- def line_hash s
177
- @header.zip(s.split(/\t/))
312
+ def create_line s
313
+ self.class.line_type.new s, self
178
314
  end
179
315
 
180
- def is_comment? s
181
- @comment && s =~ @comment
316
+ def index_line line
317
+ @bare_index.each do |key,table|
318
+ index_line_to_key line, key
319
+ end
182
320
  end
183
321
 
184
- protected
185
- def create_line s
186
- self.class.line_type.new s
322
+ def index_line_to_key line, key
323
+ if line.respond_to?(key)
324
+ @bare_index[key][ line.send(key) ] << line
325
+ end
187
326
  end
188
327
 
189
- def add_index line
190
- @index.each do |key,ind|
191
- next if !line[key]
192
- (ind[ line[key] ] ||= []) << line
193
- end
328
+ def get_wrapped_table key, value
329
+ wrap @bare_index[key][value] if @bare_index[key] && @bare_index[key][value]
194
330
  end
195
331
  end