germ 0.1 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/hash_table_aux/HashTableAux.c +19 -6
- data/lib/fasta.rb +122 -24
- data/lib/fastq.rb +45 -0
- data/lib/genetic_code.rb +141 -0
- data/lib/genomic_locus.rb +50 -0
- data/lib/germ/config.rb +64 -4
- data/lib/germ/flagstat.rb +4 -0
- data/lib/germ.rb +3 -0
- data/lib/go.rb +164 -0
- data/lib/gtf/gene.rb +293 -0
- data/lib/gtf.rb +34 -202
- data/lib/hash_table.rb +190 -54
- data/lib/intervals.rb +225 -250
- data/lib/maf.rb +42 -58
- data/lib/mutation.rb +41 -0
- data/lib/mutation_set.rb +60 -239
- data/lib/mutect.rb +22 -17
- data/lib/oncotator.rb +43 -1
- data/lib/sdrf.rb +14 -0
- data/lib/tcga.rb +41 -0
- data/lib/vcf.rb +77 -73
- metadata +33 -33
data/lib/gtf.rb
CHANGED
@@ -1,209 +1,48 @@
|
|
1
1
|
require 'hash_table'
|
2
2
|
require 'intervals'
|
3
|
+
require 'genomic_locus'
|
4
|
+
require 'genetic_code'
|
5
|
+
require 'gtf/gene'
|
6
|
+
require 'fasta'
|
7
|
+
require 'germ/config'
|
3
8
|
|
4
9
|
class GTF < HashTable
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
extend GermDefault
|
11
|
+
include IntervalList
|
12
|
+
|
13
|
+
def self.default_create file, idx=nil
|
14
|
+
if idx
|
15
|
+
new file, :idx => idx
|
16
|
+
else
|
17
|
+
new file
|
13
18
|
end
|
14
19
|
end
|
15
|
-
line_class GTFLine
|
16
|
-
|
17
|
-
class Gene
|
18
|
-
class Transcript
|
19
|
-
attr_reader :name, :intervals, :introns
|
20
|
-
def initialize array, name
|
21
|
-
@intervals = array
|
22
|
-
@name = name
|
23
|
-
|
24
|
-
@transcript = @intervals.find{|t| t.feature == "transcript"}
|
25
|
-
|
26
|
-
build_introns
|
27
|
-
end
|
28
|
-
|
29
|
-
def site pos
|
30
|
-
i = @transcript.clone :pos => pos
|
31
|
-
intron = nil
|
32
|
-
overlaps = @intervals.select{|f| f.contains? i }
|
33
|
-
return cds_pos i if overlaps.find{|f| f.feature == "cds" }
|
34
|
-
return intron_pos intron if intron = overlaps.find{|f| f.feature == "intron" }
|
35
|
-
return utr_pos if overlaps.find{|f| f.feature =~ /UTR/ }
|
36
|
-
{ :type => :transcript }
|
37
|
-
end
|
38
|
-
|
39
|
-
|
40
|
-
def utr_pos
|
41
|
-
{ :type => :utr }
|
42
|
-
end
|
43
|
-
|
44
|
-
def intron_frame intron
|
45
|
-
# find the terminal frame of the leading exon
|
46
|
-
if strand == "+"
|
47
|
-
(intron.prev_exon.frame + intron.prev_exon.size)%3
|
48
|
-
else
|
49
|
-
intron.post_exon.frame
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def cds_pos pos
|
54
|
-
bases = 0
|
55
|
-
if @strand == "+"
|
56
|
-
cds.each do |c|
|
57
|
-
if c.contains? pos
|
58
|
-
bases += pos - c.start + 1
|
59
|
-
break
|
60
|
-
else
|
61
|
-
bases += c.size
|
62
|
-
end
|
63
|
-
end
|
64
|
-
else
|
65
|
-
cds.reverse.each do |c|
|
66
|
-
if c.contains? pos
|
67
|
-
bases += c.stop - pos + 1
|
68
|
-
break
|
69
|
-
else
|
70
|
-
bases += c.size
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
{ :type => :cds, :pos => bases/3 }
|
75
|
-
end
|
76
|
-
|
77
|
-
def intron_pos intron
|
78
|
-
{ :type => :intron, :pos => cds_pos(intron.start-1), :frame => intron_frame(intron) }
|
79
|
-
end
|
80
|
-
|
81
|
-
def utr3
|
82
|
-
return @utr3 if @utr3
|
83
|
-
cs = strand == "+" ? cds.first : cds.last
|
84
|
-
@utr3 = exons.select{ |e| strand == "+" ? !e.above?(cs) : !e.below?(cs) }
|
85
|
-
.map{|e| e.strict_diff(cs) }
|
86
|
-
.compact.map(&:to_a)
|
87
|
-
@utr3.each do |u|
|
88
|
-
u.feature = "3' UTR"
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def utr5
|
93
|
-
return @utr5 if @utr5
|
94
|
-
cs = strand == "+" ? cds.last : cds.first
|
95
|
-
@utr5 = exons.select{|e| strand == "+" ? !e.below?(cs) : !e.above?(cs) }
|
96
|
-
.map{|e| e.strict_diff(cs)}
|
97
|
-
.compact.map(&:to_a)
|
98
|
-
@utr5.each do |u|
|
99
|
-
u.feature = "5' UTR"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
def build_introns
|
104
|
-
return if !exons
|
105
|
-
@introns = exons.map.with_index do |e1,i|
|
106
|
-
e2 = @exons[i+1]
|
107
|
-
next if !e2
|
108
|
-
intron = e1.clone(:start => e1.stop+1, :stop => e2.start-1)
|
109
|
-
intron.feature = "intron"
|
110
|
-
intron.prev_exon = e1
|
111
|
-
intron.post_exon = e2
|
112
|
-
intron
|
113
|
-
end.compact
|
114
|
-
@intervals.concat @introns
|
115
|
-
end
|
116
|
-
|
117
|
-
def build_utrs
|
118
|
-
@intervals.concat @utr3 if @utr3
|
119
|
-
@intervals.concat @utr5 if @utr5
|
120
|
-
end
|
121
20
|
|
122
|
-
|
123
|
-
@transcript.start
|
124
|
-
end
|
125
|
-
def stop
|
126
|
-
@transcript.stop
|
127
|
-
end
|
128
|
-
def strand
|
129
|
-
@transcript.strand
|
130
|
-
end
|
131
|
-
def contains? pos
|
132
|
-
start <= pos && stop >= pos
|
133
|
-
end
|
134
|
-
def exons
|
135
|
-
@exons ||= @intervals.select{|e| e.feature == "exon"}.sort_by &:start
|
136
|
-
end
|
137
|
-
def cds
|
138
|
-
@cds ||= @intervals.select{|e| e.feature == "CDS"}.sort_by &:start
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
attr_reader :name, :strand, :transcripts, :intervals
|
143
|
-
def initialize array
|
144
|
-
@intervals = array
|
145
|
-
@gene = @intervals.find{|l| l.feature == "gene"}
|
146
|
-
@name = @gene.attribute[:gene_name]
|
147
|
-
@strand = @gene.strand
|
148
|
-
@transcripts = build_transcripts
|
149
|
-
end
|
150
|
-
|
151
|
-
def start
|
152
|
-
@gene.start
|
153
|
-
end
|
154
|
-
|
155
|
-
def stop
|
156
|
-
@gene.stop
|
157
|
-
end
|
158
|
-
|
159
|
-
def site pos
|
160
|
-
score = { :cds => 1, :exon => 2, :utr => 3, :intron => 4, :transcript => 5, :igr => 6 }
|
161
|
-
sites = @transcripts.map do |t|
|
162
|
-
{ :gene => name }.update(t.site pos) if t.contains? pos
|
163
|
-
end.compact
|
164
|
-
sites.push(:type => :igr)
|
165
|
-
sites.sort_by{|s| score[s[:type]] }.first
|
166
|
-
end
|
21
|
+
header_off
|
167
22
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
ints = ints.select do |i|
|
173
|
-
yield i
|
174
|
-
end
|
175
|
-
end
|
176
|
-
list = IntervalList.new ints, :type => :flat
|
177
|
-
list.collapse!
|
178
|
-
list.to_a
|
23
|
+
class Feature < HashTable::HashLine
|
24
|
+
include GenomicLocus
|
25
|
+
def copy
|
26
|
+
self.class.new @hash.clone, @table
|
179
27
|
end
|
180
28
|
|
181
|
-
def
|
182
|
-
|
183
|
-
@transcripts.max_by do |t|
|
184
|
-
t.cds.inject(0) do |sum,cds|
|
185
|
-
sum += cds.size
|
186
|
-
end
|
187
|
-
end
|
29
|
+
def seq
|
30
|
+
@seq ||= @table.fasta.locus_seq self
|
188
31
|
end
|
189
32
|
|
190
|
-
def
|
191
|
-
|
33
|
+
def respond_to_missing? sym, include_all = false
|
34
|
+
self[:attribute].has_key?(sym) || super
|
192
35
|
end
|
193
36
|
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
37
|
+
def method_missing sym, *args, &block
|
38
|
+
if self[:attribute].has_key?(sym)
|
39
|
+
self[:attribute][sym]
|
40
|
+
else
|
41
|
+
super
|
199
42
|
end
|
200
43
|
end
|
201
44
|
end
|
202
|
-
|
203
|
-
def gene name
|
204
|
-
intervals = gene_name[name]
|
205
|
-
@genes[name] ||= GTF::Gene.new intervals if intervals
|
206
|
-
end
|
45
|
+
line_class GTF::Feature
|
207
46
|
|
208
47
|
def initialize file, opts=nil
|
209
48
|
opts = { :comment => "#", :sep => " "}.merge(opts || {})
|
@@ -214,27 +53,20 @@ class GTF < HashTable
|
|
214
53
|
|
215
54
|
super file, :comment => opts[:comment], :idx => opts[:idx],
|
216
55
|
:header => [ :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :attribute ],
|
217
|
-
:types =>
|
56
|
+
:types => { :start => :int, :stop => :int, :score => :int, :frame =>
|
57
|
+
:int, :attribute => [ ";", @sep ] }
|
218
58
|
end
|
219
59
|
|
220
60
|
def inspect
|
221
61
|
"#<#{self.class}:0x#{'%x' % (object_id << 1)} @lines=#{@lines.count}>"
|
222
62
|
end
|
223
63
|
|
224
|
-
def
|
225
|
-
|
64
|
+
def fasta
|
65
|
+
@opts[:fasta] || Fasta.default
|
226
66
|
end
|
227
67
|
|
228
|
-
def
|
229
|
-
|
230
|
-
if h == :attribute
|
231
|
-
g[:attribute].map do |k,v|
|
232
|
-
"#{k}#{@sep}#{v}"
|
233
|
-
end.join("; ")
|
234
|
-
else
|
235
|
-
g[h]
|
236
|
-
end
|
237
|
-
end.join("\t")
|
68
|
+
def add_line hash
|
69
|
+
add_interval(super)
|
238
70
|
end
|
239
71
|
|
240
72
|
protected
|
data/lib/hash_table.rb
CHANGED
@@ -9,18 +9,28 @@ class HashTable
|
|
9
9
|
include HashTableAux
|
10
10
|
|
11
11
|
class HashLine
|
12
|
-
def
|
13
|
-
|
14
|
-
|
15
|
-
elsif h.is_a? Hash
|
16
|
-
@hash = h
|
12
|
+
def self.alias_key sym1, sym2
|
13
|
+
define_method sym1 do
|
14
|
+
send sym2
|
17
15
|
end
|
16
|
+
define_method "#{sym1}=" do |v|
|
17
|
+
send "#{sym2}=", v
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def initialize h, table
|
22
|
+
@hash = Hash[h]
|
23
|
+
@table = table
|
18
24
|
end
|
19
25
|
|
20
26
|
def update hash
|
21
27
|
@hash.update hash
|
22
28
|
end
|
23
29
|
|
30
|
+
def set_table t
|
31
|
+
@table = t
|
32
|
+
end
|
33
|
+
|
24
34
|
def [] ind
|
25
35
|
@hash[ind]
|
26
36
|
end
|
@@ -41,18 +51,48 @@ class HashTable
|
|
41
51
|
@invalid
|
42
52
|
end
|
43
53
|
|
54
|
+
def respond_to_missing? sym, include_all = false
|
55
|
+
if sym.to_s =~ /^(.*)=$/
|
56
|
+
true
|
57
|
+
else
|
58
|
+
@hash.has_key?(sym) || super
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
44
62
|
def method_missing sym, *args, &block
|
45
|
-
if @hash
|
63
|
+
if @hash.has_key? sym
|
46
64
|
@hash[sym]
|
47
65
|
elsif sym.to_s =~ /(.*)=/
|
48
66
|
@hash[$1.to_sym] = args.first
|
49
67
|
else
|
50
|
-
|
68
|
+
super
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def to_s
|
73
|
+
@table.header.map do |h|
|
74
|
+
format_column h
|
75
|
+
end.join("\t")
|
76
|
+
end
|
77
|
+
|
78
|
+
def format_column column
|
79
|
+
if send(column).is_a?(Hash) && @table.types[column].is_a?(Array)
|
80
|
+
send(column).map do |key,value|
|
81
|
+
if value == true
|
82
|
+
# just print the key
|
83
|
+
key
|
84
|
+
else
|
85
|
+
"#{key}#{@table.types[column][1]}#{value}"
|
86
|
+
end
|
87
|
+
end.join @table.types[column][0]
|
88
|
+
else
|
89
|
+
send(column)
|
51
90
|
end
|
52
91
|
end
|
53
92
|
end
|
54
93
|
|
55
94
|
class << self
|
95
|
+
attr_reader :comment
|
56
96
|
def line_type
|
57
97
|
@line_type || HashLine
|
58
98
|
end
|
@@ -73,44 +113,78 @@ class HashTable
|
|
73
113
|
end
|
74
114
|
header_on
|
75
115
|
|
76
|
-
attr_accessor :header
|
116
|
+
attr_accessor :header, :types
|
77
117
|
def [](ind)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
def method_missing sym, *args, &block
|
82
|
-
if @index[sym]
|
83
|
-
@index[sym]
|
118
|
+
if ind.is_a? Range
|
119
|
+
wrap @lines[ind]
|
84
120
|
else
|
85
|
-
|
121
|
+
@lines[ind]
|
86
122
|
end
|
87
123
|
end
|
88
124
|
|
125
|
+
def idx key, value=nil
|
126
|
+
@wrapped_index[ [key, value] ] ||= get_wrapped_table key, value
|
127
|
+
end
|
128
|
+
|
129
|
+
def idx_keys(key)
|
130
|
+
@bare_index[key].keys
|
131
|
+
end
|
132
|
+
|
89
133
|
def sum(col)
|
90
134
|
inject(0) do |sum,line|
|
91
135
|
sum += line[col].to_f
|
92
136
|
end
|
93
137
|
end
|
94
138
|
|
139
|
+
[ :select, :reject, :sort, :sort_by ].each do |meth|
|
140
|
+
define_method(meth) do |&block|
|
141
|
+
wrap @lines.send(meth, &block)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
def sample *args
|
146
|
+
samp = @lines.sample *args
|
147
|
+
if samp.is_a? Array
|
148
|
+
wrap samp
|
149
|
+
else
|
150
|
+
samp
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
|
95
155
|
def select! &block
|
96
156
|
@lines.select! &block
|
157
|
+
self
|
97
158
|
end
|
98
159
|
|
99
160
|
def sort_by! &block
|
100
161
|
@lines.sort_by! &block
|
162
|
+
self
|
101
163
|
end
|
102
164
|
|
103
165
|
def use_header?
|
104
166
|
self.class.use_header?
|
105
167
|
end
|
106
168
|
|
169
|
+
def formatted_header
|
170
|
+
@header.map do |h|
|
171
|
+
@sleeve[h] || h
|
172
|
+
end.join("\t")
|
173
|
+
end
|
174
|
+
|
175
|
+
def preamble
|
176
|
+
@preamble
|
177
|
+
end
|
178
|
+
|
107
179
|
def output f
|
108
|
-
f.puts
|
180
|
+
f.puts preamble
|
181
|
+
f.puts formatted_header if use_header?
|
109
182
|
@lines.each do |l|
|
110
183
|
l = yield l if block_given?
|
111
184
|
next if !l || l.invalid?
|
112
|
-
f.puts
|
185
|
+
f.puts l.to_s
|
113
186
|
end
|
187
|
+
true
|
114
188
|
end
|
115
189
|
|
116
190
|
def inspect
|
@@ -123,73 +197,135 @@ class HashTable
|
|
123
197
|
end
|
124
198
|
end
|
125
199
|
|
126
|
-
def initialize(
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
@types = @header.values
|
131
|
-
@header = @header.keys
|
132
|
-
end
|
133
|
-
create_index opts[:idx]
|
200
|
+
def initialize(obj=nil,opts={})
|
201
|
+
fix_opts(opts)
|
202
|
+
create_header
|
203
|
+
create_index
|
134
204
|
@lines = []
|
135
|
-
@
|
136
|
-
@
|
205
|
+
@preamble = []
|
206
|
+
@sleeve = {}
|
207
|
+
@comment = @opts[:comment] || self.class.comment
|
208
|
+
|
209
|
+
if obj && obj.is_a?(String) && File.exists?(obj)
|
210
|
+
parse_file obj
|
211
|
+
elsif obj && obj.is_a?(Array)
|
212
|
+
# it's a stack of lines. Go with it.
|
213
|
+
@lines = obj
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
def << hash
|
218
|
+
add_line hash
|
219
|
+
end
|
220
|
+
|
221
|
+
def concat other_table
|
222
|
+
raise TypeError unless other_table.is_a? self.class
|
223
|
+
other_table.each do |line|
|
224
|
+
add_line line
|
225
|
+
end
|
226
|
+
self
|
227
|
+
end
|
137
228
|
|
138
|
-
|
229
|
+
def wrap lines
|
230
|
+
self.class.new lines, @opts.merge( :header => @header.clone, :types => @types.clone )
|
139
231
|
end
|
140
232
|
|
233
|
+
def update_index key
|
234
|
+
create_index_for key
|
235
|
+
@lines.each do |line|
|
236
|
+
index_line_to_key line, key
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
protected
|
141
241
|
def add_line hash
|
142
242
|
if hash.is_a? HashLine
|
143
243
|
@lines.push hash
|
244
|
+
hash.set_table self
|
144
245
|
else
|
145
246
|
@lines.push create_line(hash)
|
146
247
|
end
|
248
|
+
index_line @lines.last
|
249
|
+
end
|
250
|
+
|
251
|
+
def create_header
|
252
|
+
validate_header
|
253
|
+
|
254
|
+
validate_types
|
255
|
+
end
|
256
|
+
|
257
|
+
def validate_header
|
258
|
+
@header = @opts[:header]
|
259
|
+
if @header.is_a? Hash
|
260
|
+
@opts[:types] = @header
|
261
|
+
@header = @header.keys
|
262
|
+
end
|
263
|
+
@skip_header = @opts[:skip_header] && @header
|
264
|
+
end
|
265
|
+
|
266
|
+
def enforce_header
|
267
|
+
end
|
268
|
+
|
269
|
+
def validate_types
|
270
|
+
@types = @opts[:types] || {}
|
271
|
+
|
272
|
+
raise TypeError, "Types must be a Hash!" unless @types.is_a?(Hash)
|
273
|
+
|
274
|
+
@types.each do |key,type|
|
275
|
+
case type
|
276
|
+
when Array
|
277
|
+
raise ArgumentError unless type.length == 2 && type.all?{|n| n.is_a? String}
|
278
|
+
end
|
279
|
+
end
|
147
280
|
end
|
148
281
|
|
149
|
-
private
|
150
282
|
def parse_file file
|
151
283
|
load_file file
|
284
|
+
|
285
|
+
fix_lines
|
286
|
+
end
|
287
|
+
|
288
|
+
def fix_lines
|
152
289
|
@lines.each_index do |i|
|
153
290
|
@lines[i] = create_line @lines[i]
|
154
|
-
|
291
|
+
index_line @lines[i]
|
155
292
|
end
|
156
293
|
end
|
157
294
|
|
158
|
-
def
|
159
|
-
|
160
|
-
|
161
|
-
return
|
162
|
-
end
|
163
|
-
idx = [ idx ] if !idx.is_a? Array
|
164
|
-
@index = Hash[idx.map{|i| [ i, {} ] }]
|
295
|
+
def fix_opts opts
|
296
|
+
@opts = opts
|
297
|
+
@opts[:idx] = [ @opts[:idx] ].flatten.compact
|
165
298
|
end
|
166
299
|
|
167
|
-
def
|
168
|
-
|
169
|
-
@
|
300
|
+
def create_index
|
301
|
+
@bare_index = {}
|
302
|
+
@wrapped_index = {}
|
303
|
+
@opts[:idx].each do |key|
|
304
|
+
create_index_for key
|
305
|
+
end
|
170
306
|
end
|
171
307
|
|
172
|
-
def
|
173
|
-
@
|
308
|
+
def create_index_for key
|
309
|
+
@bare_index[key] ||= Hash.new do |h,k| h[k] = []; end
|
174
310
|
end
|
175
311
|
|
176
|
-
def
|
177
|
-
|
312
|
+
def create_line s
|
313
|
+
self.class.line_type.new s, self
|
178
314
|
end
|
179
315
|
|
180
|
-
def
|
181
|
-
@
|
316
|
+
def index_line line
|
317
|
+
@bare_index.each do |key,table|
|
318
|
+
index_line_to_key line, key
|
319
|
+
end
|
182
320
|
end
|
183
321
|
|
184
|
-
|
185
|
-
|
186
|
-
|
322
|
+
def index_line_to_key line, key
|
323
|
+
if line.respond_to?(key)
|
324
|
+
@bare_index[key][ line.send(key) ] << line
|
325
|
+
end
|
187
326
|
end
|
188
327
|
|
189
|
-
def
|
190
|
-
@
|
191
|
-
next if !line[key]
|
192
|
-
(ind[ line[key] ] ||= []) << line
|
193
|
-
end
|
328
|
+
def get_wrapped_table key, value
|
329
|
+
wrap @bare_index[key][value] if @bare_index[key] && @bare_index[key][value]
|
194
330
|
end
|
195
331
|
end
|