germ 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/fasta_aux/FastaAux.c +137 -0
- data/ext/fasta_aux/extconf.rb +7 -0
- data/ext/hash_table_aux/HashTableAux.c +246 -0
- data/ext/hash_table_aux/extconf.rb +7 -0
- data/lib/fasta.rb +79 -0
- data/lib/germ.rb +11 -0
- data/lib/germ/config.rb +34 -0
- data/lib/germ/data_types.rb +47 -0
- data/lib/germ/flagstat.rb +23 -0
- data/lib/germ/printer.rb +15 -0
- data/lib/gtf.rb +248 -0
- data/lib/hash_table.rb +195 -0
- data/lib/indelocator.rb +46 -0
- data/lib/intervals.rb +337 -0
- data/lib/maf.rb +92 -0
- data/lib/mutation_set.rb +351 -0
- data/lib/mutect.rb +43 -0
- data/lib/oncotator.rb +144 -0
- data/lib/sam.rb +196 -0
- data/lib/vcf.rb +162 -0
- metadata +115 -0
data/lib/sam.rb
ADDED
@@ -0,0 +1,196 @@
|
|
1
|
+
require 'germ/data_types'
|
2
|
+
require 'germ/printer'
|
3
|
+
class Sam
|
4
|
+
include Enumerable
|
5
|
+
include Printer
|
6
|
+
|
7
|
+
# Parsed form of the "@"-prefixed header lines of a SAM file.
class Header
  # A single header line. A line is either an "@CO" free-text comment or a
  # two-letter record type followed by tab-separated TAG:VALUE fields.
  class Record
    COMMENT_LINE = /^@CO\t(.*)/
    TAGGED_LINE = /^@([A-Za-z][A-Za-z])\t((?:[A-Za-z][A-Za-z0-9]:[ -~]+(?:\t|$))+)/

    attr_reader :type
    attr_reader :tags
    attr_reader :comment
    attr_reader :line

    # line - the raw header line (a trailing newline is tolerated).
    def initialize line
      @line = line
      if line =~ COMMENT_LINE
        @type = :CO
        @comment = Regexp.last_match(1)
      end
      line.match(TAGGED_LINE) do |m|
        @type = m[1].to_sym
        # Split on the FIRST colon only: values such as timestamps or command
        # lines contain colons themselves and must stay intact.
        @tags = Hash[m[2].split(/\t/).map { |field| field.split(':', 2) }]
      end
    end

    # Re-serialises the record. Lines that matched neither pattern are
    # emitted verbatim (minus the line terminator) instead of raising.
    def to_s
      if @type == :CO
        "@#{@type}\t#{@comment}"
      elsif tags
        "@#{@type}\t#{tags.map { |t, v| "#{t}:#{v}" }.join("\t")}"
      else
        @line.to_s.chomp
      end
    end
  end

  attr_reader :records

  # sam   - the owning Sam object (stored, not otherwise used here).
  # lines - enumerable of raw header lines, one Record is built per line.
  def initialize sam, lines
    @sam = sam
    @records = lines.map { |l| Record.new(l) }
  end

  # Writes every record to f (anything responding to #puts), one per line.
  def output f
    @records.each do |r|
      f.puts r
    end
  end
end
|
50
|
+
|
51
|
+
class Read
|
52
|
+
extend DataTypes
|
53
|
+
attr_sym :qname
|
54
|
+
attr_integer :flag, :pos, :mapq, :pnext, :tlen
|
55
|
+
attr_string :rname, :rnext, :other, :sam, :seq, :qual, :cigar_string
|
56
|
+
alias :chrom :rname
|
57
|
+
alias :start_pos :pos
|
58
|
+
alias :mate_pos :pnext
|
59
|
+
|
60
|
+
# Memoised list of [length, op] string pairs scanned from @cigar_string.
# The op class includes "=" explicitly: it is not a word character, so the
# previous /\w/ pattern silently dropped sequence-match operations (and could
# pair two bare digits as a bogus operation). A "*" CIGAR yields [].
def cigar
  @cigar ||= @cigar_string.scan(/(\d+)([A-Za-z=])/)
end
|
63
|
+
|
64
|
+
# FLAG bit predicates. Each returns a real boolean: the bare `flag & mask`
# used before returned an Integer, and 0 is truthy in Ruby, so every
# predicate was effectively always true.
# Bit meanings follow the SAM specification's FLAG table.
def paired?; (flag & 0x1) != 0; end
# NOTE: bit 0x2 is "each segment properly aligned"; the name is kept for
# compatibility and proper_pair? is offered as the descriptive alias.
def mapped?; (flag & 0x2) != 0; end
def proper_pair?; mapped?; end
def unmapped?; (flag & 0x4) != 0; end
def mate_unmapped?; (flag & 0x8) != 0; end
def reversed?; (flag & 0x10) != 0; end
def mate_reversed?; (flag & 0x20) != 0; end
def first?; (flag & 0x40) != 0; end
def second?; (flag & 0x80) != 0; end
# Was masked with 258 (0x102), which also tested the proper-pair bit.
def secondary?; (flag & 0x100) != 0; end
def unchaste?; (flag & 0x200) != 0; end
def duplicate?; (flag & 0x400) != 0; end
# Was masked with 1024, which is the duplicate bit; supplementary is 0x800.
def supplementary?; (flag & 0x800) != 0; end
|
75
|
+
|
76
|
+
# Starts at pos and adds the length of every M, D, N, = and X operation in
# the CIGAR; other operations leave the running coordinate unchanged.
def end_pos
  cigar.inject(pos) do |cursor, (len, op)|
    %w[M D N = X].include?(op) ? cursor + len.to_i : cursor
  end
end
|
86
|
+
|
87
|
+
# Returns the read base aligned to reference coordinate p, or nil when p is
# not covered by an aligned (M/=/X) segment, e.g. inside a deletion or intron.
#
# Walks the CIGAR with two cursors: one over the reference, one over seq.
#   M, =, X  consume both reference and read
#   D, N     consume reference only (previously they also advanced the read
#            index, shifting every base looked up after a deletion/intron)
#   I, S     consume read only (a soft clip previously aborted the lookup,
#            so clipped reads always returned nil)
#   H, P     consume neither
def seq_at p
  ref_cursor = pos
  seq_cursor = 0
  cigar.each do |len, op|
    span = len.to_i
    case op
    when 'M', '=', 'X'
      if (ref_cursor...(ref_cursor + span)).include? p
        return seq[seq_cursor + p - ref_cursor]
      end
      seq_cursor += span
      ref_cursor += span
    when 'D', 'N'
      return nil if (ref_cursor...(ref_cursor + span)).include? p
      ref_cursor += span
    when 'I', 'S'
      seq_cursor += span
    end
  end
  nil
end
|
114
|
+
|
115
|
+
# Collects one [left, right] pair per N operation in the CIGAR, where left is
# the coordinate just before the N segment and right is the coordinate just
# after it. M, D, = and X advance the running coordinate; other ops do not.
def junctions
  cursor = pos
  found = []
  cigar.each do |len, op|
    span = len.to_i
    if op == 'N'
      found << [cursor - 1, cursor + span]
      cursor += span
    elsif %w[M D = X].include?(op)
      cursor += span
    end
  end
  found
end
|
127
|
+
|
128
|
+
# Reference name of the mate: rnext, except that the "=" shorthand is
# resolved to this read's own chrom.
def mate_chrom
  rnext == "=" ? chrom : rnext
end
|
135
|
+
|
136
|
+
# Finds this read's mate among the reads registered under the same qname:
# the first other read whose pos equals this read's mate_pos. Returns nil if
# none is registered.
# The read itself is excluded; otherwise two mates aligned at the same
# position would each resolve to whichever was registered first (possibly
# themselves).
def mate
  candidates = sam.mates[qname] || []
  candidates.find { |l| !l.equal?(self) && l.pos == mate_pos }
end
|
139
|
+
|
140
|
+
# Builds a read from one split alignment line and registers it with the
# owning Sam under its qname.
#
# s      - the owning Sam object.
# fields - column values in the order listed in @header; missing trailing
#          columns are assigned nil.
def initialize s, fields
  @sam = s
  @header = [:qname, :flag, :rname, :pos, :mapq, :cigar_string, :rnext, :pnext, :tlen, :seq, :qual, :other]
  # Each column goes through its writer so the typed setters declared on the
  # class apply. The block variable no longer shadows the `s` parameter.
  @header.zip(fields) do |column, value|
    send "#{column}=".to_sym, value
  end
  sam.add_mate qname, self
end
|
148
|
+
|
149
|
+
# Writes the twelve stored column values to f as one tab-separated line.
def output f
  columns = [
    @qname, @flag, @rname, @pos, @mapq, @cigar_string,
    @rnext, @pnext, @tlen, @seq, @qual, @other
  ]
  f.puts columns.join("\t")
end
|
152
|
+
end
|
153
|
+
|
154
|
+
attr_reader :reads, :mates
|
155
|
+
attr_accessor :header
|
156
|
+
# Starts with no reads and an empty qname => reads lookup table.
def initialize
  @mates = {}
  @reads = []
end
|
160
|
+
|
161
|
+
# Appends record to the list stored under key mate, creating the list on
# first use. Returns that list.
def add_mate mate, record
  (@mates[mate] ||= []) << record
end
|
165
|
+
|
166
|
+
# Yields every read in order. Without a block an Enumerator is returned, as
# Enumerable-style #each methods conventionally do; previously this raised
# LocalJumpError.
def each(&block)
  return to_enum(:each) unless block
  @reads.each(&block)
end
|
171
|
+
|
172
|
+
# Loads a SAM file from path `file`.
# Lines beginning with "@" are collected (unmodified) for the header; every
# other line is chomped, split on tabs into at most 12 fields and turned into
# a Sam::Read appended to the new object's reads. The header is built after
# the whole file has been read. Returns the populated object.
def self.read file
  header_lines = []
  new.tap do |sam|
    File.foreach(file) do |raw|
      if raw.start_with?('@')
        header_lines << raw
      else
        sam.reads << Sam::Read.new(sam, raw.chomp.split(/\t/, 12))
      end
    end
    sam.header = Sam::Header.new(sam, header_lines)
  end
end
|
185
|
+
|
186
|
+
# Writes the header first, then each read in order, by delegating to their
# own #output methods with the same destination f.
def output f
  @header.output(f)
  @reads.each { |read| read.output(f) }
end
|
192
|
+
|
193
|
+
# Compact summary: class name, object id and the number of reads, rather
# than a dump of every read.
def inspect
  format("#<%s:%s @reads=%s>", self.class.name, object_id, reads.size)
end
|
196
|
+
end
|
data/lib/vcf.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'mutation_set'
|
2
|
+
require 'oncotator'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
class VCF < MutationSet::Sample
|
6
|
+
requires "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"
|
7
|
+
comments "##"
|
8
|
+
|
9
|
+
# Collects the "##key=value" meta-information lines of a VCF file.
# Values are grouped per key (as a Symbol) in @items, in file order.
class Preamble
  # One KEY=VALUE pair inside a "<...>" structured value. A double-quoted
  # VALUE may contain commas, "=" and backslash-escaped characters.
  STRUCTURED_FIELD = /([^,=]+)=("(?:[^"\\]|\\.)*"|[^,]*)/

  # lines - enumerable of raw "##" lines (trailing newlines are stripped).
  def initialize lines
    @items = {}
    lines.each do |line|
      add_key line.chomp
    end
  end

  protected

  # Registers the line if it has the "##key=value" shape; other lines are
  # ignored.
  def add_key line
    line.match(/^##(?<key>\w+)=(?<value>.*)$/) do |m|
      add_key_item m[:key].to_sym, m[:value]
    end
  end

  def add_key_item key, value
    @items[key] ||= []
    @items[key].push new_item(value)
  end

  # "<A=1,B=2>" values become a Hash of string keys to string values (quotes
  # are kept as written); anything else is returned unchanged.
  # Fields are scanned rather than split on "," and "=", because a quoted
  # Description containing either character made Hash[] receive malformed
  # pairs.
  def new_item value
    case value
    when /^<(.*)>$/
      Hash[Regexp.last_match(1).scan(STRUCTURED_FIELD)]
    else
      value
    end
  end
end
|
38
|
+
|
39
|
+
# Validates the column header row and records it.
# Comparison against `required` is case-insensitive. The only tolerated
# omission is a lone missing "format" column (kludge for an empty VCF with no
# format line). Columns beyond the required ones are stored as @samples;
# @headers holds every column name as a Symbol and is the return value.
def enforce_headers(array)
  absent = required.map(&:downcase) - array.map(&:downcase)
  only_format_absent = absent.size == 1 && absent.first == "format"
  raise "VCF lacks required headers" unless absent.empty? || only_format_absent

  @samples = array - required if array.size > required.size

  @headers = array.map(&:to_sym)
end
|
50
|
+
|
51
|
+
class Line < MutationSet::Line
|
52
|
+
attr_reader :format, :mutation
|
53
|
+
alias_key :start, :pos
|
54
|
+
alias_key :ref_allele, :ref
|
55
|
+
# Alternate allele reported for this line; delegates to pick_alt.
def alt_allele
  pick_alt
end

# Stop coordinate: an explicitly assigned value wins, otherwise end_pos.
def stop
  @stop || end_pos
end

# Overrides the computed stop coordinate.
def stop= nc
  @stop = nc
end

# Required column names, as declared on the owning sample.
def required
  sample.required
end
|
62
|
+
|
63
|
+
def initialize(fields, s)
|
64
|
+
@sample = s
|
65
|
+
@mutation = Hash[clean_required.zip(fields[0...required.size])]
|
66
|
+
@mutation[:info] = Hash[@mutation[:info].split(/;/).map do |s|
|
67
|
+
key, value = s.split(/=/)
|
68
|
+
value ||= true
|
69
|
+
[ key.to_sym, value ]
|
70
|
+
end]
|
71
|
+
@format = @mutation[:format] = @mutation[:format].split(/:/).map(&:to_sym)
|
72
|
+
|
73
|
+
if @sample.samples
|
74
|
+
sample_fields = fields[required.size..-1]
|
75
|
+
@genotypes = {}
|
76
|
+
@sample.samples.each_with_index do |s,i|
|
77
|
+
next if !sample_fields[i]
|
78
|
+
@genotypes[s] = VCF::Genotype.new self, sample_fields[i].split(/:/)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# Decides whether a genotype should be skipped.
# g.first is destructured into a label and a sample key (presumably g is a
# one-pair Hash of label => sample name; confirm against callers). The
# genotype is skipped when it is missing, reports itself empty, or fails the
# criteria checked by criteria_failed?.
def skip_genotype? g
  label, sample_key = g.first
  call = genotype(sample_key)
  return true unless call

  call.empty? || criteria_failed?(call, label)
end
|
89
|
+
|
90
|
+
# First entry of the comma-separated ALT column.
def pick_alt
  alternates = alt.split(",")
  alternates.first
end

# Last coordinate covered by the reference allele: pos plus the REF length,
# minus one.
def end_pos
  ref.length - 1 + pos.to_i
end
|
97
|
+
|
98
|
+
# Serialises the line back to tab-separated VCF text: the required columns
# followed by one column per sample (empty when a sample has no genotype).
#
# Fixes relative to the previous version:
#   * INFO flags stored as true are written as the bare key, not "KEY=true".
#   * A nil sample list no longer raises, and an empty one no longer leaves a
#     trailing tab.
def to_s
  columns = clean_required.map do |h|
    case h
    when :info
      mutation[h].map { |k, v| v == true ? k.to_s : "#{k}=#{v}" }.join(";")
    when :format
      mutation[h].join(":")
    else
      mutation[h]
    end
  end
  columns.concat((sample.samples || []).map { |s| genotype(s).to_s })
  columns.join("\t")
end
|
110
|
+
|
111
|
+
# Genotype stored for sample s, or nil when no genotypes were parsed.
def genotype(s)
  return nil unless @genotypes

  @genotypes[s]
end

# The leading entries of the sample's clean_headers, one per required column.
def clean_required
  sample.clean_headers.slice(0, required.size)
end
|
118
|
+
end
|
119
|
+
|
120
|
+
# Per-sample call on one VCF line. The sample column's ":"-separated values
# are keyed by the line's FORMAT symbols in #info.
class Genotype
  attr_reader :info

  # line  - the owning line (must respond to #format, #ref and #alt).
  # field - Array of the sample column's values, in FORMAT order.
  def initialize(line,field)
    @line = line
    @info = Hash[line.format.zip(field)]
  end

  # True when at least two alleles are called and all are identical.
  # Alleles are compared as whole tokens: the previous unanchored /0.0/ and
  # /1.1/ patterns missed calls such as "2/2" and matched inside "10/10".
  def homozygous?
    calls = gt_alleles
    calls.size > 1 && !calls.include?(".") && calls.uniq.size == 1
  end

  # True when the called alleles are not all the same and none is missing.
  def heterozygous?
    calls = gt_alleles
    !calls.include?(".") && calls.uniq.size > 1
  end

  # Truthy when GT is a fully missing diploid call such as "./.".
  def empty?
    @info[:GT] =~ /\..\./
  end

  def callable?
    @info[:GT] !~ /\..\./
  end

  def gt; @info[:GT]; end
  def approx_depth; @info[:DP].to_i; end

  # Sum of the first two AD counts, or nil when AD is absent (previously this
  # raised NoMethodError on nil).
  def depth
    return nil unless @info[:AD]
    alt_count + ref_count
  end

  def alt_count; @info[:AD] ? @info[:AD].split(/,/)[1].to_i : nil; end
  def ref_count; @info[:AD] ? @info[:AD].split(/,/)[0].to_i : nil; end

  # Allele fractions; nil when AD is absent. A depth of zero still yields
  # NaN, as before.
  def alt_freq
    total = depth
    total && alt_count / total.to_f
  end

  def ref_freq
    total = depth
    total && ref_count / total.to_f
  end

  def ref_length; @line.ref.length; end
  def alt_length; @line.alt.length; end
  def alt_base_quality; @info[:NQSBQ] ? @info[:NQSBQ].split(/,/)[0].to_f : nil; end
  def alt_map_quality; @info[:MQS] ? @info[:MQS].split(/,/)[0].to_f : nil; end
  def alt_mismatch_rate; @info[:NQSMM] ? @info[:NQSMM].split(/,/)[0].to_f : nil; end
  def alt_mismatch_count; @info[:MM] ? @info[:MM].split(/,/)[0].to_f : nil; end
  def quality; @info[:GQ].to_i; end

  # Values re-joined with ":" in the line's FORMAT order.
  def to_s
    @line.format.map{|f| @info[f]}.join(":")
  end

  private

  # GT split into its allele tokens on "/" (unphased) or "|" (phased).
  def gt_alleles
    @info[:GT].to_s.split(%r{[/|]})
  end
end
|
162
|
+
end
|
metadata
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: germ
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Saurabh Asthana
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: extlib
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: net-http-persistent
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: sequel
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: See summary
|
63
|
+
email: Saurabh.Asthana@ucsf.edu
|
64
|
+
executables: []
|
65
|
+
extensions:
|
66
|
+
- ext/fasta_aux/extconf.rb
|
67
|
+
- ext/hash_table_aux/extconf.rb
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- lib/gtf.rb
|
71
|
+
- lib/mutation_set.rb
|
72
|
+
- lib/indelocator.rb
|
73
|
+
- lib/maf.rb
|
74
|
+
- lib/hash_table.rb
|
75
|
+
- lib/sam.rb
|
76
|
+
- lib/germ/config.rb
|
77
|
+
- lib/germ/printer.rb
|
78
|
+
- lib/germ/data_types.rb
|
79
|
+
- lib/germ/flagstat.rb
|
80
|
+
- lib/oncotator.rb
|
81
|
+
- lib/mutect.rb
|
82
|
+
- lib/vcf.rb
|
83
|
+
- lib/germ.rb
|
84
|
+
- lib/intervals.rb
|
85
|
+
- lib/fasta.rb
|
86
|
+
- ext/fasta_aux/FastaAux.c
|
87
|
+
- ext/hash_table_aux/HashTableAux.c
|
88
|
+
- ext/fasta_aux/extconf.rb
|
89
|
+
- ext/hash_table_aux/extconf.rb
|
90
|
+
homepage: http://github.com/mountetna/germ
|
91
|
+
licenses: []
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
- ext
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
|
+
none: false
|
105
|
+
requirements:
|
106
|
+
- - ! '>='
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
requirements: []
|
110
|
+
rubyforge_project:
|
111
|
+
rubygems_version: 1.8.23
|
112
|
+
signing_key:
|
113
|
+
specification_version: 3
|
114
|
+
summary: Collection of utilities for use in computational genomics
|
115
|
+
test_files: []
|