germ 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/fasta_aux/FastaAux.c +137 -0
- data/ext/fasta_aux/extconf.rb +7 -0
- data/ext/hash_table_aux/HashTableAux.c +246 -0
- data/ext/hash_table_aux/extconf.rb +7 -0
- data/lib/fasta.rb +79 -0
- data/lib/germ.rb +11 -0
- data/lib/germ/config.rb +34 -0
- data/lib/germ/data_types.rb +47 -0
- data/lib/germ/flagstat.rb +23 -0
- data/lib/germ/printer.rb +15 -0
- data/lib/gtf.rb +248 -0
- data/lib/hash_table.rb +195 -0
- data/lib/indelocator.rb +46 -0
- data/lib/intervals.rb +337 -0
- data/lib/maf.rb +92 -0
- data/lib/mutation_set.rb +351 -0
- data/lib/mutect.rb +43 -0
- data/lib/oncotator.rb +144 -0
- data/lib/sam.rb +196 -0
- data/lib/vcf.rb +162 -0
- metadata +115 -0
data/lib/germ.rb
ADDED
data/lib/germ/config.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
# Reads settings from the YAML file named by the TAYLORLIB_CONF environment
# variable. When the variable is unset or the file does not exist, nothing is
# loaded and lookups return nil.
class TaylorlibConfig
  # Looks up a (possibly nested) setting in a freshly loaded configuration.
  #
  # keys - successive keys, walked from the top of the YAML document down.
  #
  # Returns the value found, or nil when no configuration was loaded or the
  # key path does not exist.
  def self.get_conf *keys
    config = TaylorlibConfig.new
    config.get_key(*keys) if config.loaded?
  end

  # Loads the configuration file if TAYLORLIB_CONF points at an existing file.
  def initialize
    load_file if file_exists?
  end

  # True once a configuration document has been read.
  def loaded?
    !@config.nil?
  end

  # Walks +keys+ through the loaded document.
  #
  # Returns the value at the end of the path, or nil as soon as an
  # intermediate value is missing (instead of raising NoMethodError on nil).
  def get_key *keys
    keys.inject(@config) do |node, key|
      break nil unless node
      node[key]
    end
  end

  private
  # Path of the configuration file, or nil when the variable is unset.
  def config_file
    ENV["TAYLORLIB_CONF"]
  end

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  def file_exists?
    config_file && File.exist?(config_file)
  end

  # NOTE(review): YAML.load can instantiate arbitrary objects on older Psych
  # versions; this assumes the file is trusted, operator-supplied
  # configuration - confirm before pointing it at external input.
  def load_file
    @config = YAML.load File.read(config_file)
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Class-level macros that define attribute accessors whose writers coerce the
# assigned value. Intended to be mixed in with `extend`, since the macros call
# define_method on self.
module DataTypes
  # Defines a reader and a coercing writer for +name+.
  #
  # name - attribute name; the value is stored in the instance variable
  #        "@<name>".
  # type - name of a conversion method (e.g. :to_i) sent to the assigned
  #        value. Ignored when a block is given.
  # converter - optional block; when given, its result for the assigned value
  #        is stored instead.
  #
  # With neither a type nor a block the value is stored unchanged.
  #
  # The writer raises ArgumentError when the value does not respond to +type+.
  def attr_accessor_of_type name, type=nil, &converter
    ivar = "@#{name}"

    define_method(name) do
      instance_variable_get(ivar)
    end

    define_method("#{name}=") do |v|
      value =
        if converter
          converter.call(v)
        elsif type.nil?
          v
        elsif v.respond_to?(type)
          v.send(type)
        else
          raise ArgumentError, "cannot convert #{v.inspect} using #{type}"
        end
      instance_variable_set(ivar, value)
    end
  end

  # Accessors whose writers store value.to_s.
  def attr_string *names
    names.each { |name| attr_accessor_of_type name, :to_s }
  end

  # Accessors whose writers store value.to_i.
  def attr_integer *names
    names.each { |name| attr_accessor_of_type name, :to_i }
  end

  # Accessors whose writers store value.to_sym.
  def attr_sym *names
    names.each { |name| attr_accessor_of_type name, :to_sym }
  end

  # Accessors whose writers store an array. Without a block the assigned
  # value is split into single characters; with a block, the block's result
  # for the assigned value is stored.
  def attr_array *names, &splitter
    names.each do |name|
      attr_accessor_of_type(name) do |v|
        splitter ? splitter.call(v) : v.split(//)
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Parses a flagstat report in which each line carries a pair of counts
# written as "<n> + <m>". Lines are matched to the names in @headers by
# position.
#
# Each name is available as a method returning the first count of its line;
# "chastity_<name>" returns the second count.
class Flagstat
  # file - path of the flagstat report to read.
  def initialize file
    @headers = [ :total, :duplicates, :mapped, :paired_in_sequence, :read1, :read2,
      :properly_paired, :both_mapped, :singletons, :mate_mapped_chr, :mate_mapped_chr_highq ]
    # One [first, second] pair of count strings per line; a line without a
    # "<n> + <m>" pattern yields an empty array.
    counts = File.foreach(file).map do |line|
      line.scan(/([0-9]+) \+ ([0-9]+)/).flatten
    end
    @flags = Hash[@headers.zip(counts)]
  end

  # Yields each header name with its array of count strings. Returns an
  # enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?
    @flags.each do |flag, pair|
      yield flag, pair
    end
  end

  # Serves "<name>" (first count) and "chastity_<name>" (second count) as
  # integers; anything else falls through to the normal NoMethodError.
  def method_missing(method, *args, &block)
    return @flags[method].first.to_i if @flags[method]
    second = chastity_counts(method)
    return second.last.to_i if second
    super
  end

  # Keeps respond_to? in agreement with method_missing.
  def respond_to_missing?(method, include_private = false)
    return true if @flags[method] || chastity_counts(method)
    super
  end

  private
  # Counts for the header named after the "chastity_" prefix, or nil when the
  # method name has no such prefix or the header has no counts.
  def chastity_counts method
    m = method.to_s.match(/^chastity_(.*)/)
    m && @flags[m[1].to_sym]
  end
end
|
data/lib/germ/printer.rb
ADDED
data/lib/gtf.rb
ADDED
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'hash_table'
|
2
|
+
require 'intervals'
|
3
|
+
|
4
|
+
# A GTF annotation file loaded as a HashTable. Rows are GTFLine records with
# the nine columns listed in #initialize; no header line is written on output.
class GTF < HashTable
  header_off

  # One GTF record. `chrom` is an alias for the `seqname` column.
  class GTFLine < HashTable::HashLine
    include IntervalList::Interval

    def chrom
      seqname
    end

    # Goes through the setter explicitly: a bare `seqname = nc` would only
    # assign a local variable and leave the record untouched.
    def chrom= nc
      self.seqname = nc
    end

    # Returns a new line of the same class built on a shallow copy of the
    # underlying hash.
    def copy
      self.class.new @hash.clone
    end
  end
  line_class GTFLine

  # The records of one gene, grouped into transcripts.
  class Gene
    # The records of one transcript, plus intron intervals derived from the
    # gaps between consecutive exons.
    class Transcript
      attr_reader :name, :intervals, :introns

      # array - records belonging to this transcript; the one whose feature
      #         is "transcript" supplies start, stop and strand.
      # name  - transcript name.
      def initialize array, name
        @intervals = array
        @name = name

        @transcript = @intervals.find{|t| t.feature == "transcript"}

        build_introns
      end

      # Classifies +pos+ against this transcript's features.
      #
      # Returns a hash with a :type of :cds, :intron, :utr or :transcript.
      def site pos
        i = @transcript.clone :pos => pos
        overlaps = @intervals.select{|f| f.contains? i }
        # NOTE(review): #cds selects features named "CDS", so this lowercase
        # comparison looks like it can never match. Left unchanged because
        # cds_pos does arithmetic on its argument and would need an integer
        # position here rather than the interval `i` - confirm against
        # Interval#contains? before fixing.
        return cds_pos i if overlaps.find{|f| f.feature == "cds" }
        intron = overlaps.find{|f| f.feature == "intron" }
        return intron_pos intron if intron
        return utr_pos if overlaps.find{|f| f.feature =~ /UTR/ }
        { :type => :transcript }
      end


      def utr_pos
        { :type => :utr }
      end

      def intron_frame intron
        # find the terminal frame of the leading exon
        if strand == "+"
          (intron.prev_exon.frame + intron.prev_exon.size)%3
        else
          intron.post_exon.frame
        end
      end

      # Counts coding bases from the start of the coding sequence (in
      # transcription order) up to and including +pos+.
      #
      # Returns { :type => :cds, :pos => bases/3 } using integer division.
      def cds_pos pos
        bases = 0
        # Uses the #strand reader; @strand is never assigned in this class,
        # so testing it sent every transcript down the reverse-strand branch.
        forward = strand == "+"
        (forward ? cds : cds.reverse).each do |c|
          if c.contains? pos
            bases += forward ? pos - c.start + 1 : c.stop - pos + 1
            break
          end
          bases += c.size
        end
        { :type => :cds, :pos => bases/3 }
      end

      def intron_pos intron
        { :type => :intron, :pos => cds_pos(intron.start-1), :frame => intron_frame(intron) }
      end

      # NOTE(review): utr3 and utr5 rely on Interval#above?, #below? and
      # #strict_diff, which are defined outside this section. After
      # `map(&:to_a)` the elements receive `feature=`; whether they are
      # intervals or arrays at that point could not be confirmed here.
      def utr3
        return @utr3 if @utr3
        cs = strand == "+" ? cds.first : cds.last
        @utr3 = exons.select{ |e| strand == "+" ? !e.above?(cs) : !e.below?(cs) }
          .map{|e| e.strict_diff(cs) }
          .compact.map(&:to_a)
        @utr3.each do |u|
          u.feature = "3' UTR"
        end
      end

      def utr5
        return @utr5 if @utr5
        cs = strand == "+" ? cds.last : cds.first
        @utr5 = exons.select{|e| strand == "+" ? !e.below?(cs) : !e.above?(cs) }
          .map{|e| e.strict_diff(cs)}
          .compact.map(&:to_a)
        @utr5.each do |u|
          u.feature = "5' UTR"
        end
      end

      # Creates one "intron" record for each gap between consecutive exons
      # (sorted by start), remembers its flanking exons, and appends the
      # introns to @intervals.
      def build_introns
        return if !exons
        @introns = exons.map.with_index do |e1,i|
          e2 = @exons[i+1]
          next if !e2
          intron = e1.clone(:start => e1.stop+1, :stop => e2.start-1)
          intron.feature = "intron"
          intron.prev_exon = e1
          intron.post_exon = e2
          intron
        end.compact
        @intervals.concat @introns
      end

      # Appends any UTR intervals already computed by #utr3 / #utr5.
      def build_utrs
        @intervals.concat @utr3 if @utr3
        @intervals.concat @utr5 if @utr5
      end

      def start
        @transcript.start
      end

      def stop
        @transcript.stop
      end

      def strand
        @transcript.strand
      end

      # True when the integer +pos+ lies within [start, stop].
      def contains? pos
        start <= pos && stop >= pos
      end

      # Exon records sorted by start (memoized).
      def exons
        @exons ||= @intervals.select{|e| e.feature == "exon"}.sort_by(&:start)
      end

      # CDS records sorted by start (memoized).
      def cds
        @cds ||= @intervals.select{|e| e.feature == "CDS"}.sort_by(&:start)
      end
    end

    attr_reader :name, :strand, :transcripts, :intervals

    # array - records of this gene; the one whose feature is "gene" supplies
    #         name, strand, start and stop.
    def initialize array
      @intervals = array
      @gene = @intervals.find{|l| l.feature == "gene"}
      @name = @gene.attribute[:gene_name]
      @strand = @gene.strand
      @transcripts = build_transcripts
    end

    def start
      @gene.start
    end

    def stop
      @gene.stop
    end

    # Classifies +pos+ across every transcript containing it and returns the
    # best-ranked result (cds first, then exon, utr, intron, transcript);
    # { :type => :igr } when no transcript contains the position.
    def site pos
      score = { :cds => 1, :exon => 2, :utr => 3, :intron => 4, :transcript => 5, :igr => 6 }
      sites = @transcripts.map do |t|
        { :gene => name }.update(t.site pos) if t.contains? pos
      end.compact
      sites.push(:type => :igr)
      sites.sort_by{|s| score[s[:type]] }.first
    end

    # compute unified intervals from the list of intervals
    #
    # When a block is given, only intervals for which it returns true are
    # included.
    def unified
      ints = @intervals
      ints = ints.select { |i| yield i } if block_given?
      list = IntervalList.new ints, :type => :flat
      list.collapse!
      list.to_a
    end

    def canonical
      # find out which transcript has the longest cds
      @transcripts.max_by do |t|
        t.cds.inject(0) { |total, c| total + c.size }
      end
    end

    def inspect
      "#<#{self.class.name}:#{object_id} @transcripts=#{@transcripts.count}>"
    end

    private
    # Builds one Transcript per "transcript" record from all records sharing
    # its transcript_name attribute.
    def build_transcripts
      @intervals.select{|l| l.feature == "transcript"}.map do |t|
        name = t.attribute[:transcript_name]
        Transcript.new @intervals.select{|l| l.attribute[:transcript_name] == name}, name
      end
    end
  end

  # Returns the (cached) Gene built from the records indexed under +name+ in
  # the gene_name index, or nil when the index has no entry for it.
  def gene name
    intervals = gene_name[name]
    @genes[name] ||= GTF::Gene.new intervals if intervals
  end

  # file - path of the GTF file.
  # opts - optional hash: :comment (default "#"), :sep (default " ", also
  #        used between attribute key and value by #format_line) and :idx
  #        (index keys, passed through to HashTable).
  def initialize file, opts=nil
    opts = { :comment => "#", :sep => " "}.merge(opts || {})

    @sep = opts[:sep]

    @genes = {}

    super file, :comment => opts[:comment], :idx => opts[:idx],
      :header => [ :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :attribute ],
      :types => [ :str, :str, :str, :int, :int, :int, :str, :int, [ ";", @sep ] ]
  end

  def inspect
    "#<#{self.class}:0x#{'%x' % (object_id << 1)} @lines=#{@lines.count}>"
  end

  def to_interval_list
    IntervalList.new self
  end

  # Renders one record as a tab-separated line; attributes are written as
  # "key<sep>value" pairs joined by "; ".
  def format_line g
    [ :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :attribute ].map do |h|
      if h == :attribute
        g[:attribute].map do |k,v|
          "#{k}#{@sep}#{v}"
        end.join("; ")
      else
        g[h]
      end
    end.join("\t")
  end

  protected
  # Indexes +line+ under each configured key, reading the key from the
  # record's columns first and from its attributes otherwise.
  def add_index line
    @index.each do |key,ind|
      ikey = line[key] || line[:attribute][key]
      next if !ikey
      (ind[ ikey ] ||= []) << line
    end
  end
end
|
data/lib/hash_table.rb
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'extlib'
|
3
|
+
require 'germ/printer'
|
4
|
+
require 'hash_table_aux/hash_table_aux'
|
5
|
+
|
6
|
+
# An in-memory table of records. Each record is a HashLine (or the subclass
# registered with line_class); columns are named by @header, and optional
# indexes map a column value to the lines that carry it.
class HashTable
  include Enumerable
  include Printer
  include HashTableAux

  # One record: a thin wrapper around a hash that exposes its keys as
  # reader/writer methods.
  class HashLine
    # h - a Hash, used as-is, or an Array of [key, value] pairs.
    #     Anything else leaves the record without a backing hash.
    def initialize h
      @hash = case h
              when Array then Hash[h]
              when Hash then h
              end
    end

    def update hash
      @hash.update hash
    end

    def [] ind
      @hash[ind]
    end

    def []= ind,v
      @hash[ind] = v
    end

    def invalidate!
      @invalid = true
    end

    def approve!
      @invalid = nil
    end

    def invalid?
      @invalid
    end

    # `line.foo` returns the value stored under :foo; `line.foo = v` stores
    # v under :foo; any other unknown message returns nil.
    #
    # A message counts as a setter only when its name ends in "=", so
    # operator-like names such as `=~` are no longer treated as writes.
    def method_missing sym, *args, &block
      return @hash[sym] if @hash[sym]
      name = sym.to_s
      return nil unless name.end_with?("=")
      @hash[name.chomp("=").to_sym] = args.first
    end

    # Reports stored keys and setter-style names to respond_to?.
    def respond_to_missing? sym, include_private = false
      return true if @hash && @hash.key?(sym)
      return true if sym.to_s.end_with?("=")
      super
    end
  end

  class << self
    # Class used to build records; HashLine unless line_class was called.
    def line_type
      @line_type || HashLine
    end

    def line_class klass
      @line_type = const_get klass.to_s.camel_case
    end

    # NOTE(review): @use_header is stored on the class the macro is called
    # on; whether subclasses are meant to inherit HashTable's header_on
    # could not be confirmed from this file.
    def use_header?
      @use_header
    end

    def header_on
      @use_header = true
    end

    def header_off
      @use_header = nil
    end
  end
  header_on

  attr_accessor :header

  def [](ind)
    @lines[ind]
  end

  # Exposes each index as a method named after its key.
  def method_missing sym, *args, &block
    @index[sym] || super
  end

  # Keeps respond_to? in agreement with method_missing.
  def respond_to_missing? sym, include_private = false
    return true if @index && @index.key?(sym)
    super
  end

  # Sum of column +col+ over all lines, with each value converted by to_f.
  def sum(col)
    inject(0) { |total, line| total + line[col].to_f }
  end

  def select! &block
    @lines.select!(&block)
  end

  def sort_by! &block
    @lines.sort_by!(&block)
  end

  def use_header?
    self.class.use_header?
  end

  # Writes the table to +f+ (anything responding to puts). When a block is
  # given each line is passed through it first; lines for which the block
  # returns nil/false, and lines marked invalid, are skipped.
  def output f
    f.puts @header.join("\t") if use_header?
    @lines.each do |l|
      l = yield l if block_given?
      next if !l || l.invalid?
      f.puts format_line(l)
    end
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @lines=#{@lines.count}>"
  end

  # Yields every line; returns an enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?
    @lines.each do |l|
      yield l
    end
  end

  # file - path to load; skipped when nil or when the file does not exist.
  # opts - :header (Array of column names, or Hash of name => type),
  #        :skip_header, :idx (index key or keys), :comment, :types.
  def initialize(file,opts={})
    @header = opts[:header]
    @skip_header = opts[:skip_header] && opts[:header]
    if @header.is_a? Hash
      @types = @header.values
      @header = @header.keys
    end
    create_index opts[:idx]
    @lines = []
    @comment = opts[:comment]
    @types ||= opts[:types]

    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    parse_file(file) if file && File.exist?(file)
  end

  # Appends a record, wrapping it in the line class unless it already is one.
  # NOTE(review): unlike parse_file, lines added here are not passed to
  # add_index - confirm whether that is intended.
  def add_line hash
    if hash.is_a? HashLine
      @lines.push hash
    else
      @lines.push create_line(hash)
    end
  end

  private
  # Loads the raw rows with load_file (not defined in this file), then
  # wraps each one in the line class and indexes it.
  def parse_file file
    load_file file
    @lines.each_index do |i|
      @lines[i] = create_line @lines[i]
      add_index @lines[i] unless @index.empty?
    end
  end

  # Builds one empty index per requested key; no keys means no indexes.
  def create_index idx
    if !idx
      @index = {}
      return
    end
    idx = [ idx ] if !idx.is_a? Array
    @index = Hash[idx.map{|i| [ i, {} ] }]
  end

  # Takes the header from the tab-separated line +s+ unless one is already
  # set; column names become symbols, downcased first when asked.
  def set_header s, downcase=nil
    return nil if @header
    @header = s.chomp.split(/\t/).map{|col| downcase ? col.downcase.to_sym : col.to_sym }
  end

  def format_line l
    @header.map{|h| l[h]}.join("\t")
  end

  # Pairs each header name with the matching tab-separated field of +s+.
  def line_hash s
    @header.zip(s.split(/\t/))
  end

  # NOTE(review): String#=~ needs a Regexp on the right-hand side, yet GTF
  # passes the string "#" as :comment - confirm how comments are expected
  # to be matched before relying on this helper.
  def is_comment? s
    @comment && s =~ @comment
  end

  protected
  def create_line s
    self.class.line_type.new s
  end

  # Files +line+ under each index key for which it has a value.
  def add_index line
    @index.each do |key,ind|
      next if !line[key]
      (ind[ line[key] ] ||= []) << line
    end
  end
end
|