germ 0.1 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 64855f9c2948d62bdb9852a39efdb04ed70c9628
4
+ data.tar.gz: 9573d639fa846e9f6581221621b9adf4978895a7
5
+ SHA512:
6
+ metadata.gz: d12a898d9e70a0554284e46d31ca51f0573b7a98abf1b4443f5b546ec45b3a790d4d35dc77bed97a3efd04365f3ddbc4e69698f31f3a71b16b57dc69b5aa4121
7
+ data.tar.gz: 0bc1bceb1363b210e7b3190693571d16c4ece1bc749440ab629f20c6a9bd02dcafaf955b749ee2c5b2056cde22814abdaa9a59d80c7c701659f7390868d8eba2
@@ -120,11 +120,11 @@ void make_hash_entry( VALUE h, VALUE s, char sep )
120
120
  key = ID2SYM( rb_intern(kf) );
121
121
 
122
122
  if (!vs || !*vs) {
123
- rb_hash_aset( h, key, Qnil );
123
+ rb_hash_aset( h, key, Qtrue );
124
124
  } else {
125
125
  char *vf = strip_space_quotes(vs, RSTRING_LEN(s));
126
126
  if (!vf)
127
- rb_hash_aset( h, key, Qnil );
127
+ rb_hash_aset( h, key, Qtrue );
128
128
  else {
129
129
  rb_hash_aset( h, key, rb_str_new2(vf) );
130
130
  xfree(vf);
@@ -179,10 +179,15 @@ void add_hash_line(VALUE lines, VALUE header, VALUE types, VALUE ary) {
179
179
  for (i=0;i<RARRAY_LEN(header);i++) {
180
180
  if (types == Qnil)
181
181
  rb_hash_aset( hash, rb_ary_entry(header,i), rb_ary_entry(ary,i) );
182
- else
183
- rb_hash_aset( hash, rb_ary_entry(header,i),
184
- convert_to_type( rb_ary_entry(ary,i), rb_ary_entry( types, i ) )
185
- );
182
+ else {
183
+ VALUE col_type = rb_hash_aref( types, rb_ary_entry(header,i) );
184
+ if (col_type == Qnil)
185
+ rb_hash_aset( hash, rb_ary_entry(header,i), rb_ary_entry(ary,i) );
186
+ else
187
+ rb_hash_aset( hash, rb_ary_entry(header,i),
188
+ convert_to_type( rb_ary_entry(ary,i), col_type )
189
+ );
190
+ }
186
191
  }
187
192
  rb_ary_push(lines, hash);
188
193
  }
@@ -190,6 +195,7 @@ void add_hash_line(VALUE lines, VALUE header, VALUE types, VALUE ary) {
190
195
 
191
196
  VALUE method_load_file(VALUE self, VALUE file) {
192
197
  VALUE cmmt = rb_iv_get(self,"@comment");
198
+ VALUE preamble = rb_iv_get(self,"@preamble");
193
199
  char *comment = (cmmt == Qnil) ? 0 : RSTRING_PTR(cmmt);
194
200
  int commentsize = (cmmt == Qnil) ? 0 : (RSTRING_LEN(cmmt));
195
201
 
@@ -221,6 +227,8 @@ VALUE method_load_file(VALUE self, VALUE file) {
221
227
  i = n - contents + 1;
222
228
  }
223
229
  if (comment && !strncmp(buf,comment,commentsize)) {
230
+ // if you haven't found the header yet, stick it on the preamble
231
+ if (!foundheader) rb_ary_push(preamble, rb_str_new2(buf));
224
232
  continue;
225
233
  }
226
234
  // okay, now you can split your string into tokens and push it
@@ -229,6 +237,11 @@ VALUE method_load_file(VALUE self, VALUE file) {
229
237
  if (header == Qnil) {
230
238
  header = convert_to_symbols(ary);
231
239
  rb_iv_set(self,"@header",header);
240
+ // enforce header quality
241
+ rb_funcall(self, rb_intern("enforce_header"),0);
242
+ // enforce_header might reset your header, so get it again
243
+ header = rb_iv_get(self,"@header");
244
+ foundheader = 1;
232
245
  continue;
233
246
  }
234
247
  if (skip_header != Qnil && !foundheader) {
data/lib/fasta.rb CHANGED
@@ -1,28 +1,65 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'fasta_aux/fasta_aux'
3
+ require 'germ/config'
4
+ require 'genomic_locus'
3
5
 
4
6
  class Fasta
5
7
  private
6
8
  include FastaAux
9
+ extend GermDefault
10
+
11
+ class << self
12
# Guess the number of bases per sequence line of a FASTA file.
#
# Only the first 50 lines of the file are sampled. For each sequence
# (a ">" header line followed by its sequence lines) the first two
# sequence lines are compared; if they hold the same number of bases,
# that number is returned.
#
# file - path of the FASTA file to inspect.
#
# Returns the line size (Integer, line terminator excluded).
# Raises RuntimeError when no sampled sequence has two equal-length lines.
def guess_line_size file
  # Block form closes the handle as soon as the sample has been read.
  lines = File.open(file) do |io|
    50.times.map { io.gets }.compact
  end

  # indices of the header lines within the sample
  headers = lines.each_index.select { |i| lines[i] =~ /^>/ }

  headers.each_with_index do |header, i|
    seq_lines = lines[ header + 1 ... (headers[i+1] || lines.size) ]
    # you need at least 2 lines to be sure.
    next if seq_lines.size < 2
    # Compare without the line terminator so that a final line lacking a
    # trailing newline is still recognised as a full-width line.
    first, second = seq_lines[0].chomp, seq_lines[1].chomp
    return first.size if first.size == second.size
  end

  # You can't guess, raise an exception.
  raise "Could not guess file line size, please specify"
end
30
+ end
7
31
 
8
32
  class Chrom
9
- attr_reader :name, :size, :start
10
- def initialize n, fasta, sz, st
11
- @name, @fasta, @size, @start = n, fasta, sz, st
33
+ include GenomicLocus
34
+ attr_reader :seqname, :size, :start, :total, :centromere
35
+ alias_method :pos, :start
36
+ alias_method :stop, :size
37
+ def initialize n, fasta, sz, st, t
38
+ @seqname, @fasta, @size, @byte_start, @total = n, fasta, sz, st, t
39
+ # truncate the name
40
+ @seqname = short_chrom
41
+ @start = 1
12
42
  end
13
43
 
14
- def include? pos
15
- if pos.is_a? Array
16
- start,stop = pos.to_a
17
- include?(start) && include?(stop)
18
- else
19
- pos.is_a?(Fixnum) && pos >= 1 && pos <= size
44
+ def file_pos pos
45
+ return nil if !contains? pos
46
+ @byte_start + pos/line_size*(line_size+1) + (pos % line_size) - 1 - ((pos % line_size == 0) ? 1 : 0)
47
+ end
48
+
49
+ def set_cen_pos pos
50
+ @centromere = pos
51
+ end
52
+
53
+ def p
54
+ if @centromere
55
+ @p ||= GenomicLocus::Region.new @seqname, 1, @centromere
20
56
  end
21
57
  end
22
58
 
23
- def file_pos pos
24
- return nil if !include? pos
25
- start + pos/line_size*(line_size+1) + (pos % line_size) - 1 - ((pos % line_size == 0) ? 1 : 0)
59
+ def q
60
+ if @centromere
61
+ @q ||= GenomicLocus::Region.new @seqname, @centromere+1, stop
62
+ end
26
63
  end
27
64
 
28
65
  private
@@ -37,43 +74,104 @@ class Fasta
37
74
 
38
75
  def compute_chrom_stats
39
76
  @chroms = {}
77
+ @chrom_alias = {}
78
+ total = 0
40
79
  @seq_names.each_with_index do |name, i|
41
80
  if i < @seq_names.size-1
42
- @chroms[name] = Fasta::Chrom.new name, self, seq_size_from_byte_size(@seq_starts[i+1] - @seq_starts[i] - @seq_names[i+1].size - 3), @seq_starts[i]
81
+ size = seq_size_from_byte_size(@seq_starts[i+1] - @seq_starts[i] - @seq_names[i+1].size - 3)
43
82
  else
44
- @chroms[name] = Fasta::Chrom.new name, self, seq_size_from_byte_size(@io.size - @seq_starts[i]), @seq_starts[i]
83
+ size = seq_size_from_byte_size(@io.size - @seq_starts[i])
45
84
  end
85
+ total += size
86
+ chrom = Fasta::Chrom.new name, self, size, @seq_starts[i], total
87
+ name = chrom.short_chrom
88
+ @chroms[name] = chrom
89
+ # just save it as both
90
+ @chrom_alias[ chrom.short_chrom ] = chrom
91
+ @chrom_alias[ chrom.long_chrom ] = chrom
92
+ end
93
+ end
94
+
95
+ def shorten_seq_names
96
+ @seq_names = @seq_names.map do |name|
97
+ chrom(name).short_chrom
46
98
  end
47
99
  end
48
100
 
101
# Translate a 1-based offset into the concatenated genome into a locus.
#
# Each chromosome records the running total of bases up to and including
# itself (Chrom#total), so the owning chromosome is the first one, in
# @seq_names order, whose total reaches p.
#
# p          - offset into the whole genome.
# chrom_only - when truthy, return just the sequence name.
#
# Returns a GenomicLocus::Position (or the sequence name when chrom_only
# is set), or nil when p lies beyond the last sequence.
def position_from_total p, chrom_only=nil
  # find-minimum binary search: the block is false for every sequence that
  # ends before p and true from the owning sequence onwards.
  p_chrom = @seq_names.bsearch { |name| p <= chrom(name).total }

  return p_chrom if chrom_only
  # Past the end of the genome: answer nil, as the chrom_only form does,
  # rather than failing on a nil chromosome.
  return nil unless p_chrom

  owner = chrom(p_chrom)
  GenomicLocus::Position.new(p_chrom, p - owner.total + owner.size)
end
49
109
 
50
110
  public
51
- attr_reader :line_size, :chroms
52
- def initialize file, size=nil
111
+ attr_reader :line_size, :chroms, :seq_names
112
+ def initialize file, size=nil, cen_file=nil
53
113
  @io = File.open(file)
54
114
 
55
- @line_size = size || 50
115
+ @line_size = size || Fasta.guess_line_size(file)
56
116
 
57
117
  get_seq_starts
58
118
 
59
119
  compute_chrom_stats
120
+
121
+ shorten_seq_names
122
+
123
+ if cen_file
124
+ load_centromeres(cen_file)
125
+ end
60
126
  end
61
127
 
62
- def size
63
- @chroms.inject(0) { |s,v| s += v.last.size }
128
+ def chrom name
129
+ @chrom_alias[name]
130
+ end
131
+
132
# Total number of bases over every sequence listed in @seq_names.
# The sum is computed once and memoized in @genome_size.
def genome_size
  @genome_size ||= @seq_names.map { |seqname| chrom(seqname).size }.inject(0) { |sum, len| sum + len }
end
135
+
136
+ # find a random chromosome, weighted by size
137
+ def random_chrom
138
+ position_from_total 1+rand(genome_size),true
139
+ end
140
+
141
+ # pick a random base in the genome
142
+ def random_pos
143
+ position_from_total 1+rand(genome_size)
64
144
  end
65
145
 
66
146
  def inspect
67
147
  "#<#{self.class.name}:#{object_id} @chroms=#{@seq_names.count}>"
68
148
  end
69
149
 
70
- def get_seq chrom, start, stop
71
- seq = get_masked_seq chrom, start, stop
150
+ def locus_seq locus
151
+ raise TypeError, "not a GenomicLocus!" unless locus.is_a? GenomicLocus
152
+ get_seq locus.short_chrom, locus.start, locus.stop
153
+ end
154
+
155
+ def get_seq seqname, start, stop
156
+ seq = get_masked_seq seqname, start, stop
72
157
  seq && seq.upcase
73
158
  end
74
159
 
75
- def get_masked_seq chrom, start, stop
76
- raise ArgumentError.new("Improper interval") if !@chroms[chrom] || !@chroms[chrom].include?([start,stop])
77
- get_seq_chunk(@chroms[chrom].file_pos(start), @chroms[chrom].file_pos(stop)).gsub(/\n/,'')
160
+ def interval_missing?(locus)
161
+ !chrom(locus.seqname) || !chrom(locus.seqname).contains?(locus)
162
+ end
163
+
164
+ def get_masked_seq seqname, start, stop
165
+ raise ArgumentError, "Improper interval #{seqname}:#{start}-#{stop}" if interval_missing?(GenomicLocus::Region.new(seqname,start,stop))
166
+
167
+ get_seq_chunk(chrom(seqname).file_pos(start), chrom(seqname).file_pos(stop)).gsub(/\n/,'')
168
+ end
169
+
170
# Read centromere positions from a tab-separated file and attach them to
# the matching chromosomes.
#
# Each data line holds a sequence name in the first column and the
# centromere position in the second; any further columns are ignored.
# Blank lines and lines starting with "#" are skipped.
#
# file - path of the centromere file.
#
# Raises ArgumentError when a line has no position column or names a
# sequence this Fasta does not know.
def load_centromeres file
  File.foreach(file) do |line|
    line = line.chomp
    next if line.strip.empty? || line.start_with?('#')

    seqname, cen_pos = line.split(/\t/)[0..1]
    # A missing second column would otherwise be recorded as position 0.
    raise ArgumentError, "Malformed centromere line: #{line.inspect}" unless cen_pos

    target = chrom(seqname)
    raise ArgumentError, "Unknown sequence #{seqname.inspect} in centromere file" unless target

    target.set_cen_pos(cen_pos.to_i)
  end
end
79
177
  end
data/lib/fastq.rb ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+ require 'zlib'
3
+
4
# Sequential reader for FASTQ files, plain or gzip-compressed.
class Fastq
  # One FASTQ record: identifier line, sequence and quality string,
  # each stored without its line terminator.
  class Read
    attr_reader :id, :seq, :qual
    def initialize id, seq, qual
      @id = id
      @seq = seq
      @qual = qual
    end
  end

  # file - path of the FASTQ file. Files that start with a gzip header are
  #        streamed through the external `zcat` program.
  def initialize file
    @io = if is_gzipped?(file)
      # Array form runs zcat directly, without a shell, so the filename is
      # never interpreted as shell syntax.
      IO.popen(["zcat", file.to_s])
    else
      File.open file
    end
  end

  # Yield every remaining record as a Fastq::Read.
  # Returns an Enumerator when no block is given.
  def each_read
    return enum_for(:each_read) unless block_given?
    while read = get_read
      yield read
    end
  end

  # Read the next four-line record.
  #
  # Returns a Fastq::Read, or nil at end of input.
  # Raises EOFError when the input ends in the middle of a record.
  def get_read
    id = @io.gets
    return nil unless id
    seq = @io.gets
    @io.gets # the "+" separator line carries no data
    qual = @io.gets
    raise EOFError, "Truncated FASTQ record: #{id.chomp}" unless seq && qual
    Fastq::Read.new(*[ id, seq, qual ].map(&:chomp))
  end

  private
  # Returns true when the file is readable and starts with a valid gzip
  # header, nil otherwise.
  def is_gzipped? file
    return nil unless file && File.readable?(file)
    # Block form closes the probe handle on both outcomes.
    File.open(file, "rb") do |raw|
      begin
        # Constructing the reader parses the gzip header; finish releases
        # the reader without touching the underlying file.
        Zlib::GzipReader.new(raw).finish
        true
      rescue Zlib::GzipFile::Error
        nil
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env ruby
2
+
3
# The standard genetic code: amino acid names mapped to their DNA codons.
class GeneticCode
  # Amino acid name => codons encoding it.
  # NOTE: :phenylalinine and :gylcine are misspelled, but these keys are
  # public and must match the keys of AminoAcid::BASE, so they are kept.
  MAP = {
    :alanine => [ :GCT, :GCC, :GCA, :GCG ],
    :leucine => [ :TTA, :TTG, :CTT, :CTC, :CTA, :CTG ],
    :arginine => [ :CGT, :CGC, :CGA, :CGG, :AGA, :AGG ],
    :lysine => [ :AAA, :AAG ],
    :asparagine => [ :AAT, :AAC ],
    :methionine => [ :ATG ],
    :aspartic_acid => [ :GAT, :GAC ],
    :phenylalinine => [ :TTT, :TTC ],
    :cysteine => [ :TGT, :TGC ],
    :proline => [ :CCT, :CCC, :CCA, :CCG ],
    :glutamine => [ :CAA, :CAG ],
    :serine => [ :TCT, :TCC, :TCA, :TCG, :AGT, :AGC ],
    :glutamic_acid => [ :GAA, :GAG ],
    :threonine => [ :ACT, :ACC, :ACA, :ACG ],
    :gylcine => [ :GGT, :GGC, :GGA, :GGG ],
    :tryptophan => [ :TGG ],
    :histidine => [ :CAT, :CAC ],
    :tyrosine => [ :TAT, :TAC ],
    :isoleucine => [ :ATT, :ATC, :ATA ],
    :valine => [ :GTT, :GTC, :GTA, :GTG ],
    :stop => [ :TAA, :TGA, :TAG ]
  }.each_value(&:freeze).freeze

  # Reverse index, codon => amino acid name, built once so that aa_for is
  # a hash lookup instead of a scan over every entry of MAP.
  CODON_TO_AA = MAP.each_with_object({}) do |(aa_name, codons), index|
    codons.each { |codon| index[codon] = aa_name }
  end.freeze

  class << self
    # Returns the (frozen) Array of codon Symbols for the amino acid name,
    # or nil when the name is unknown.
    def codons_for aa
      MAP[aa]
    end

    # Returns the AminoAcid encoded by the codon Symbol, or nil when the
    # codon is not in the table.
    def aa_for codon
      aa_name = CODON_TO_AA[codon]
      AminoAcid[aa_name] if aa_name
    end
  end
end
40
+
41
# An amino acid (or the stop signal) with its one-letter code, its
# three-letter abbreviation and the codons that encode it.
# Instances are shared: AminoAcid[name] always returns the same object.
class AminoAcid
  # Amino acid name => one-letter code and short name.
  # The keys mirror GeneticCode::MAP, including its spelling of
  # :phenylalinine and :gylcine.
  BASE = {
    :alanine => { :letter => :A, :short => :Ala },
    :leucine => { :letter => :L, :short => :Leu },
    :arginine => { :letter => :R, :short => :Arg },
    :lysine => { :letter => :K, :short => :Lys },
    :asparagine => { :letter => :N, :short => :Asn },
    :methionine => { :letter => :M, :short => :Met },
    :aspartic_acid => { :letter => :D, :short => :Asp },
    :phenylalinine => { :letter => :F, :short => :Phe },
    :cysteine => { :letter => :C, :short => :Cys },
    :proline => { :letter => :P, :short => :Pro },
    :glutamine => { :letter => :Q, :short => :Gln },
    :serine => { :letter => :S, :short => :Ser },
    :glutamic_acid => { :letter => :E, :short => :Glu },
    :threonine => { :letter => :T, :short => :Thr },
    :gylcine => { :letter => :G, :short => :Gly },
    :tryptophan => { :letter => :W, :short => :Trp },
    :histidine => { :letter => :H, :short => :His },
    :tyrosine => { :letter => :Y, :short => :Tyr },
    :isoleucine => { :letter => :I, :short => :Ile },
    :valine => { :letter => :V, :short => :Val },
    :stop => { :letter => :*, :short => :Stop }
  }

  class << self
    # Returns the shared AminoAcid for the name, building it on first use.
    # Raises ArgumentError (from AminoAcid.new) for an unknown name.
    def [] aa_name
      @aa ||= {}
      @aa[aa_name] ||= build_aa aa_name
    end

    def build_aa aa_name
      new aa_name
    end
  end

  attr_reader :letter, :short, :codons, :name

  # aa_name - one of the keys of BASE.
  #
  # Raises ArgumentError when the name is not a key of BASE.
  def initialize aa_name
    aa_info = BASE[aa_name]
    raise ArgumentError, "No such amino acid." unless aa_info
    @name = aa_name
    @letter = aa_info[:letter]
    @short = aa_info[:short]
    # BASE carries no :codons entry; the codon list lives in GeneticCode.
    @codons = GeneticCode.codons_for aa_name
  end
end
86
+
87
# A DNA codon (three bases from A, T, G, C) with the amino acid it encodes.
# Instances are shared: Codon[:ATG] always returns the same object.
class Codon
  class << self
    # Returns the shared Codon for the Symbol, or nil when seq is not a
    # Symbol made of exactly three of A, T, G, C.
    def [] seq
      @codons ||= {}
      @codons[seq] ||= build_codon seq
    end

    def build_codon seq
      # \A and \z anchor the whole string; ^ and $ would accept any symbol
      # whose first line happens to look like a codon.
      return nil unless seq.is_a?(Symbol) && seq.to_s =~ /\A[ATGC]{3}\z/
      new seq
    end
  end

  # seq     - the codon as a Symbol
  # aa      - the AminoAcid returned by GeneticCode.aa_for
  # aliases - every codon encoding the same amino acid (this one included)
  attr_reader :seq, :aliases, :aa

  def initialize seq
    @seq = seq
    @aa = GeneticCode.aa_for seq
    # Taken from the genetic code table by amino acid name, so the list is
    # populated regardless of what the AminoAcid object itself carries.
    @aliases = GeneticCode.codons_for @aa.name
  end

  # Returns, for each of the three positions, how many of the four bases
  # placed there yield a codon for the same amino acid. Memoized.
  def degeneracy
    @degeneracy ||= compute_degeneracy
  end

  # Returns the number of positions (0..3) at which this codon and the
  # given Codon differ.
  def distance_to codon
    s1 = seq.to_s
    s2 = codon.seq.to_s
    3.times.count do |i|
      s1[i] != s2[i]
    end
  end

  def compute_degeneracy
    bases = seq.to_s
    3.times.map do |i|
      [ "A", "T", "G", "C" ].count do |n|
        # work on a copy: the string handed out by Symbol#to_s is not ours
        # to modify
        mut = bases.dup
        mut[i] = n
        @aliases.include? mut.to_sym
      end
    end
  end
end
129
+
130
# A trinucleotide placed on the genome: three bases, the genomic
# coordinate of each base, the strand, and the Codon they spell.
class TriNuc
  attr_reader :codon, :pos, :seq, :index, :strand

  # seq    - three bases from A, T, G, C (String or Symbol)
  # pos    - Array with the three genomic coordinates
  # strand - stored as given
  # ind    - optional index, stored as given
  #
  # Raises ArgumentError when seq is not exactly three valid bases or pos
  # is not a three-element Array.
  def initialize seq, pos, strand, ind=nil
    # to_s lets nil, Strings and Symbols share one check; \A and \z anchor
    # the whole value, where ^ and $ would accept multi-line input.
    raise ArgumentError, "Sequence is malformed" unless seq.to_s =~ /\A[ATGC]{3}\z/
    raise ArgumentError, "Three genomic coordinates are required" unless pos.is_a?(Array) && pos.length == 3
    @seq = seq.to_sym
    @pos = pos
    @strand = strand
    @index = ind
    @codon = Codon[@seq]
  end
end
@@ -0,0 +1,50 @@
1
+ require 'intervals'
2
# Behaviour shared by objects that sit on a named sequence between a start
# and a stop coordinate. The including class provides seqname, start and
# stop (and ref, for default_stop).
module GenomicLocus
  include IntervalList::Interval

  # A single coordinate on a sequence; start and stop both alias pos.
  class Position
    include GenomicLocus
    attr_accessor :seqname, :pos
    def initialize seqname, pos
      @seqname, @pos = seqname, pos
    end

    alias_method :start, :pos
    alias_method :stop, :pos

    # Returns a new Position with the same seqname and pos.
    def copy
      self.class.new seqname, pos
    end
  end

  # A start..stop span on a sequence.
  class Region
    include GenomicLocus
    attr_accessor :seqname, :start, :stop
    def initialize seqname, start, stop
      @seqname, @start, @stop = seqname, start, stop
    end

    # Returns a new Region with the same seqname, start and stop.
    def copy
      self.class.new seqname, start, stop
    end
  end

  # The derived labels below are computed on every call rather than
  # memoized: Position and Region expose writers for seqname and their
  # coordinates, and a cached label would keep describing the old values.

  # Returns "<short_chrom>:<start>" as a Symbol.
  def loc
    "#{short_chrom}:#{start}".to_sym
  end

  def default_stop
    # this should always be correct! Even if there is a dash.
    start + ref.size - 1
  end

  # Returns "<short_chrom>:<start>-<stop>" as a Symbol.
  def range
    "#{short_chrom}:#{start}-#{stop}".to_sym
  end

  # Returns the sequence name with a leading "chr".
  def long_chrom
    "chr#{short_chrom}"
  end

  # Returns the sequence name with one leading "chr" removed.
  def short_chrom
    seqname.sub(/\Achr/,'')
  end
end
data/lib/germ/config.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  require 'yaml'
2
- class TaylorlibConfig
2
+ require 'extlib'
3
+
4
+ class GermConfig
3
5
  def self.get_conf *keys
4
- config = TaylorlibConfig.new
5
- config.get_key *keys if config.loaded?
6
+ @config ||= GermConfig.new
7
+ @config.get_key *keys if @config.loaded?
6
8
  end
7
9
 
8
10
  def initialize
@@ -15,13 +17,14 @@ class TaylorlibConfig
15
17
 
16
18
  def get_key *keys
17
19
  keys.inject(@config) do |obj,key|
20
+ raise GermConfig::KeyError, "Broken key chain: #{keys}" unless obj
18
21
  obj = obj[key]
19
22
  end
20
23
  end
21
24
 
22
25
  private
23
26
  def config_file
24
- ENV["TAYLORLIB_CONF"]
27
+ ENV["GERM_CONF"]
25
28
  end
26
29
 
27
30
  def file_exists?
@@ -31,4 +34,61 @@ class TaylorlibConfig
31
34
  def load_file
32
35
  @config = YAML.load File.read(config_file)
33
36
  end
37
+
38
+ class KeyError < StandardError
39
+ end
40
+ end
41
+
42
+
43
# Class-level mixin (use with `extend`) that builds instances of the
# extending class from GermConfig entries and caches them.
#
# Entries are looked up under a key chain which defaults to the class
# name, split on "::" and snake-cased, and can be overridden with
# has_default. `Klass.some_key` builds (once) the instance described by
# the config entry <key chain> -> some_key; `Klass.default` builds the
# instance named by the <key chain> -> default entry.
module GermDefault
  # extending class => { config key => instance }
  CACHE = {}

  # Override the config key chain used by this class.
  def has_default *key_chain
    @key_chain = key_chain
  end

  # Returns the instance cache belonging to this class.
  def cache
    CACHE[self] ||= {}
  end

  # Returns the instance named by the :default config entry.
  # Raises GermConfig::KeyError when no default is configured.
  def default
    # get the default key
    cache[:default] ||= load_default
  end

  # Returns the cached instance for key, building it on first use.
  # Raises GermConfig::KeyError when the key is not configured.
  def cache_load key
    cache[key] ||= load_key(key)
  end

  # Keep respond_to? in step with method_missing: a name is answered when
  # the config holds an entry for it under this class's key chain.
  def respond_to_missing? sym, include_all = false
    # without a key chain or a class name there is nothing to look up
    return super unless @key_chain || name
    begin
      return true if GermConfig.get_conf(*key_chain, sym)
    rescue GermConfig::KeyError
      # broken key chain: not one of ours, fall through to super
    end
    super
  end

  def method_missing sym, *args, &block
    begin
      cache_load sym
    rescue GermConfig::KeyError
      # the key does not exist, try to pass it on
      super
    end
  end

  protected
  # Build an instance from the configured arguments; an extending class
  # may override this when `new` is not the right constructor.
  def default_create *args
    new(*args)
  end

  private
  def key_chain
    @key_chain ||= name.split(/::/).map do |name| name.snake_case.to_sym; end
  end

  def load_default
    key = GermConfig.get_conf(*key_chain, :default)
    raise GermConfig::KeyError, "No default key defined!" unless key
    cache_load key
  end

  def load_key key
    args = GermConfig.get_conf(*key_chain, key)
    raise GermConfig::KeyError, "No such key defined!" unless args
    default_create(*args)
  end
end
data/lib/germ/flagstat.rb CHANGED
@@ -13,6 +13,10 @@ class Flagstat
13
13
  end
14
14
  end
15
15
 
16
# Companion to method_missing: answers for names that are keys of @flags
# and for names beginning with "chastity_"; anything else is left to super.
def respond_to_missing? sym, include_all = false
  known_flag = @flags[sym]
  return known_flag if known_flag

  chastity_at = sym.to_s =~ /^chastity_/
  chastity_at || super
end
19
+
16
20
  def method_missing(method, *args, &block)
17
21
  return @flags[method].first.to_i if @flags[method]
18
22
  method.to_s.match(/^chastity_(.*)/) do |m|
data/lib/germ.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'fasta'
2
+ require 'fastq'
2
3
  require 'gtf'
3
4
  require 'hash_table'
4
5
  require 'indelocator'
@@ -9,3 +10,5 @@ require 'mutect'
9
10
  require 'oncotator'
10
11
  require 'sam'
11
12
  require 'vcf'
13
+ require 'genetic_code'
14
+ require 'genomic_locus'