germ 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/fasta_aux/FastaAux.c +137 -0
- data/ext/fasta_aux/extconf.rb +7 -0
- data/ext/hash_table_aux/HashTableAux.c +246 -0
- data/ext/hash_table_aux/extconf.rb +7 -0
- data/lib/fasta.rb +79 -0
- data/lib/germ.rb +11 -0
- data/lib/germ/config.rb +34 -0
- data/lib/germ/data_types.rb +47 -0
- data/lib/germ/flagstat.rb +23 -0
- data/lib/germ/printer.rb +15 -0
- data/lib/gtf.rb +248 -0
- data/lib/hash_table.rb +195 -0
- data/lib/indelocator.rb +46 -0
- data/lib/intervals.rb +337 -0
- data/lib/maf.rb +92 -0
- data/lib/mutation_set.rb +351 -0
- data/lib/mutect.rb +43 -0
- data/lib/oncotator.rb +144 -0
- data/lib/sam.rb +196 -0
- data/lib/vcf.rb +162 -0
- metadata +115 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "ruby/io.h"
|
3
|
+
#include "stdio.h"
|
4
|
+
#include <string.h>
|
5
|
+
|
6
|
+
VALUE FastaAux = Qnil;
|
7
|
+
|
8
|
+
void Init_fasta_aux();
|
9
|
+
VALUE method_get_seq_starts(VALUE self);
|
10
|
+
VALUE method_get_nmer_freq(VALUE self, VALUE seq, VALUE n);
|
11
|
+
VALUE method_get_seq_chunk(VALUE self, VALUE pos1, VALUE pos2);
|
12
|
+
FILE *get_io_ptr(VALUE self);
|
13
|
+
|
14
|
+
// Return a stdio FILE* for the Ruby IO object stored in self's @io
// instance variable.
//
// Raises TypeError when @io is not a File/IO object and IOError when the
// stream has been closed, instead of dereferencing an invalid pointer.
FILE *get_io_ptr(VALUE self) {
  VALUE io = rb_iv_get(self, "@io");
  rb_io_t *fptr;

  Check_Type(io, T_FILE);
  // GetOpenFile fetches the rb_io_t and raises if the stream is closed.
  GetOpenFile(io, fptr);
  return rb_io_stdio_file(fptr);
}
|
19
|
+
|
20
|
+
void Init_fasta_aux() {
|
21
|
+
FastaAux = rb_define_module("FastaAux");
|
22
|
+
rb_define_method(FastaAux, "get_seq_starts", method_get_seq_starts, 0);
|
23
|
+
rb_define_method(FastaAux, "get_seq_chunk", method_get_seq_chunk, 2);
|
24
|
+
rb_define_method(FastaAux, "get_nmer_freq", method_get_nmer_freq, 2);
|
25
|
+
}
|
26
|
+
|
27
|
+
#define BUF_SIZE 1200 // should be at least large enough to hold a >chr line
|
28
|
+
|
29
|
+
// Read the raw bytes between file offsets pos1 and pos2 (both inclusive)
// from the file behind @io and return them as a Ruby String.
//
// Raises ArgumentError for a negative or reversed range and a system error
// if the seek fails. If the file ends before pos2 the returned string is
// shortened to the bytes that were actually read.
VALUE method_get_seq_chunk(VALUE self, VALUE pos1, VALUE pos2) {
  long p1 = NUM2LONG(pos1);
  long p2 = NUM2LONG(pos2);
  long len, got;
  FILE *fd;
  VALUE s;

  // Reject reversed ranges up front; with unsigned arithmetic the length
  // would wrap around to a huge allocation.
  if (p1 < 0 || p2 < p1) {
    rb_raise(rb_eArgError, "invalid byte range %ld..%ld", p1, p2);
  }
  len = p2 - p1 + 1;

  fd = get_io_ptr(self);
  if (fseek(fd, p1, SEEK_SET) != 0) rb_sys_fail("fseek");

  // Read straight into the Ruby string so no temporary C buffer can leak
  // if an exception is raised.
  s = rb_str_new(NULL, len);
  got = (long)fread(RSTRING_PTR(s), 1, (size_t)len, fd);
  rb_str_set_len(s, got);
  return s;
}
|
45
|
+
|
46
|
+
// Scan the file behind @io from its current position and record every
// header line (a run starting at '>' and ending at the next newline).
//
// Sets @seq_names to an Array of header texts (without the leading '>')
// and @seq_starts to an Array of byte offsets of the first byte after each
// header's newline. Returns nil. A header that is not terminated by a
// newline before end of file is not recorded.
VALUE method_get_seq_starts(VALUE self) {
  VALUE names = rb_ary_new();
  VALUE starts = rb_ary_new();
  VALUE name = Qnil;            // header text collected so far; nil when outside a header
  FILE *fd = get_io_ptr(self);
  char buf[BUF_SIZE];
  size_t size;
  unsigned long bytepos = 0;    // bytes consumed before the current chunk

  rb_iv_set(self, "@seq_names", names);
  rb_iv_set(self, "@seq_starts", starts);

  while ((size = fread(buf, 1, BUF_SIZE, fd)) > 0) {
    size_t i = 0;
    while (i < size) {
      char *nl;
      if (NIL_P(name)) {
        // Outside a header: jump to the next '>' in this chunk, if any.
        char *gt = memchr(buf + i, '>', size - i);
        if (!gt) break;
        name = rb_str_new("", 0);
        i = (size_t)(gt - buf) + 1;
        continue;
      }
      // Inside a header: append up to the newline. The header text lives in
      // a growable Ruby string, so long headers or headers that straddle a
      // chunk boundary cannot overrun a fixed buffer.
      nl = memchr(buf + i, '\n', size - i);
      if (!nl) {
        rb_str_cat(name, buf + i, (long)(size - i));
        break;
      }
      rb_str_cat(name, buf + i, (long)(nl - (buf + i)));
      rb_ary_push(names, name);
      rb_ary_push(starts, ULONG2NUM(bytepos + (unsigned long)(nl - buf) + 1));
      name = Qnil;
      i = (size_t)(nl - buf) + 1;
    }
    bytepos += size;
  }
  return Qnil;
}
|
85
|
+
|
86
|
+
// Pack the first `size` bases of `seq` into an integer, two bits per base
// (A=0, T=1, G=2, C=3, case-insensitive), first base in the highest bits.
//
// Returns -1 if any character is not one of A/T/G/C. This covers 'N', 'n',
// newlines and any other symbol, so a window containing them is never
// counted as a shorter n-mer.
int get_nmer_code(char *seq,int size)
{
  unsigned int code = 0;
  int i;

  for (i = 0; i < size; i++) {
    unsigned int base;
    switch (seq[i]) {
      case 'a': case 'A': base = 0; break;
      case 't': case 'T': base = 1; break;
      case 'g': case 'G': base = 2; break;
      case 'c': case 'C': base = 3; break;
      default: return -1;
    }
    code = (code << 2) | base;
  }
  return (int)code;
}
|
99
|
+
|
100
|
+
VALUE code_to_nmer(int code,int size)
|
101
|
+
{
|
102
|
+
char buf[512];
|
103
|
+
int i;
|
104
|
+
const char *c = "ATGC";
|
105
|
+
for (i=size-1;i>=0;i--) {
|
106
|
+
buf[i] = c[code & 3];
|
107
|
+
code = code >> 2;
|
108
|
+
}
|
109
|
+
return rb_str_new(buf,size);
|
110
|
+
}
|
111
|
+
|
112
|
+
// Count every n-mer in the String `sq` using a sliding window of width `nm`
// and return a Hash mapping n-mer String => Integer count. Windows for
// which get_nmer_code returns -1 are skipped.
//
// Raises ArgumentError unless 0 <= n <= 15 (the code must fit in an int and
// the count table has 4**n entries) and TypeError if `sq` is not
// convertible to a String.
VALUE method_get_nmer_freq(VALUE self, VALUE sq, VALUE nm) {
  int n = NUM2INT(nm);
  long size, i, bsize;
  char *seq;
  int *counts;
  VALUE h;

  if (n < 0 || n > 15) {
    rb_raise(rb_eArgError, "n must be between 0 and 15 (got %d)", n);
  }
  StringValue(sq);
  seq = RSTRING_PTR(sq);
  size = RSTRING_LEN(sq);

  bsize = 1L << (n * 2);
  counts = ALLOC_N(int, bsize);
  MEMZERO(counts, int, bsize);

  for (i = 0; i + n <= size; i++) {
    int code = get_nmer_code(seq + i, n);
    if (code >= 0) counts[code]++;
  }

  h = rb_hash_new();
  for (i = 0; i < bsize; i++) {
    if (counts[i]) {
      rb_hash_aset(h, code_to_nmer((int)i, n), INT2NUM(counts[i]));
    }
  }
  xfree(counts);
  return h;
}
|
@@ -0,0 +1,246 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "ruby/io.h"
|
3
|
+
#include "stdio.h"
|
4
|
+
#include <string.h>
|
5
|
+
#include <ctype.h>
|
6
|
+
|
7
|
+
VALUE HashTableAux = Qnil;
|
8
|
+
|
9
|
+
VALUE method_load_file(VALUE self, VALUE file);
|
10
|
+
|
11
|
+
void Init_hash_table_aux();
|
12
|
+
|
13
|
+
void Init_hash_table_aux() {
|
14
|
+
HashTableAux = rb_define_module("HashTableAux");
|
15
|
+
rb_define_method(HashTableAux, "load_file", method_load_file, 1);
|
16
|
+
}
|
17
|
+
|
18
|
+
#define BUF_SIZE 1200
|
19
|
+
|
20
|
+
// Return the value ftell reports at the end of the stream, then rewind
// the stream to its beginning.
long get_file_size(FILE *fp)
{
  long total;

  fseek(fp, 0L, SEEK_END);
  total = ftell(fp);
  rewind(fp);
  return total;
}
|
28
|
+
// Read up to fp_size bytes from fp into a newly allocated buffer.
//
// The buffer holds fp_size + 1 bytes; everything after the bytes actually
// read (including the final byte) is zeroed, so the result is always
// NUL-terminated and safe to scan with the str* functions. The caller
// releases it with xfree.
char *get_file_contents(FILE *fp,long fp_size)
{
  char *contents;
  size_t want, got;

  if (fp_size < 0) fp_size = 0;
  want = (size_t)fp_size;
  contents = ALLOC_N(char, want + 1);
  got = fread(contents, sizeof(char), want, fp);
  memset(contents + got, 0, want + 1 - got);
  return contents;
}
|
35
|
+
|
36
|
+
VALUE get_token_array(char *buf, char sep) {
|
37
|
+
char *token, *head;
|
38
|
+
VALUE ary = rb_ary_new();
|
39
|
+
|
40
|
+
head = buf;
|
41
|
+
while( *head && (token = strchr(head,sep) ) ) {
|
42
|
+
rb_ary_push(ary, rb_str_new(head, token - head));
|
43
|
+
head = token + 1;
|
44
|
+
}
|
45
|
+
if (*head) {
|
46
|
+
rb_ary_push(ary, rb_str_new2(head));
|
47
|
+
}
|
48
|
+
return ary;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Replace every element of ary, in place, with the Symbol interned from
// that element, and return ary.
VALUE convert_to_symbols(VALUE ary) {
  int idx = 0;

  while (idx < RARRAY_LEN(ary)) {
    VALUE name = rb_ary_entry(ary, idx);
    VALUE sym = ID2SYM(rb_intern_str(name));
    rb_ary_store(ary, idx, sym);
    idx++;
  }
  return ary;
}
|
58
|
+
#define TYPE_INT 0
|
59
|
+
#define TYPE_FLOAT 1
|
60
|
+
#define TYPE_SYM 2
|
61
|
+
#define TYPE_HASH 3
|
62
|
+
unsigned int convert_types[10];
|
63
|
+
void set_convert_types()
|
64
|
+
{
|
65
|
+
convert_types[TYPE_INT] = rb_intern("int");
|
66
|
+
convert_types[TYPE_FLOAT] = rb_intern("float");
|
67
|
+
convert_types[TYPE_SYM] = rb_intern("sym");
|
68
|
+
}
|
69
|
+
|
70
|
+
// Return a freshly allocated, NUL-terminated copy of the bytes of the Ruby
// String s. The caller releases it with xfree.
char *make_cstr(VALUE s) {
  long n = RSTRING_LEN(s);
  char *copy = ALLOC_N(char, n + 1);

  memcpy(copy, RSTRING_PTR(s), n);
  copy[n] = '\0';
  return copy;
}
|
76
|
+
|
77
|
+
char *strip_space_quotes( char *o, int len )
|
78
|
+
{
|
79
|
+
char *vf;
|
80
|
+
char *c = o;
|
81
|
+
char *d;
|
82
|
+
int iq = 0;
|
83
|
+
if (!c) return 0;
|
84
|
+
vf = ALLOC_N(char,len+1);
|
85
|
+
d = vf;
|
86
|
+
while(isspace(*c)) c++;
|
87
|
+
if (*c == '"' || *c == '\'') { c++; iq = 1; }
|
88
|
+
while(*c) *d++ = *c++;
|
89
|
+
// you hit the end, rewind spaces and quotes
|
90
|
+
if (d == vf) {
|
91
|
+
xfree(vf);
|
92
|
+
return 0;
|
93
|
+
}
|
94
|
+
while(isspace(*(d-1))) d--;
|
95
|
+
if (iq && *(d-1) == '"' || *(d-1) == '\'') d--;
|
96
|
+
*d = 0;
|
97
|
+
return vf;
|
98
|
+
}
|
99
|
+
|
100
|
+
// Parse the String s as "key<sep>value" and store it in the Hash h.
//
// The key is passed through strip_space_quotes and interned as a Symbol;
// the value is passed through strip_space_quotes and stored as a String,
// or nil when it is missing or strip_space_quotes returns NULL. Nothing is
// stored when strip_space_quotes returns NULL for the key.
void make_hash_entry( VALUE h, VALUE s, char sep )
{
  char *p = make_cstr(s);
  char *head = p;
  char *split, *kf;
  VALUE key;
  VALUE value = Qnil;

  // isspace requires an unsigned char value; plain char may be negative.
  while (isspace((unsigned char)*head)) head++;

  // Cut the copy in two at the first separator, if there is one.
  split = strchr(head, sep);
  if (split) *split = 0;

  kf = strip_space_quotes(head, RSTRING_LEN(s));
  if (!kf) {
    xfree(p);
    return;
  }
  key = ID2SYM( rb_intern(kf) );

  if (split && split[1]) {
    char *vf = strip_space_quotes(split + 1, RSTRING_LEN(s));
    if (vf) {
      value = rb_str_new2(vf);
      xfree(vf);
    }
  }

  // Release the C buffers before storing into the hash.
  xfree(kf);
  xfree(p);
  rb_hash_aset( h, key, value );
}
|
136
|
+
|
137
|
+
// Convert the field value v according to the column type descriptor.
//
// type == nil or v == nil : v is returned unchanged.
// type is an Array         : [entry_separator, key_value_separator]; v is
//                            split on the first character of each and
//                            returned as a Hash (see make_hash_entry).
// type == :int             : Integer parsed from the leading digits of v.
// type == :float           : Float parsed from v.
// type == :sym             : v interned as a Symbol.
// any other Symbol         : v is returned unchanged.
//
// Raises TypeError when v (or a separator) is not convertible to a String
// and ArgumentError when a separator is an empty string.
VALUE convert_to_type( VALUE v, VALUE type )
{
  ID tid;

  if (type == Qnil || v == Qnil) return v;

  if (TYPE(type) == T_ARRAY) // it's a hash array
  {
    VALUE entry_sep = rb_ary_entry( type, 0 );
    VALUE kv_sep = rb_ary_entry( type, 1 );
    VALUE h, ary;
    char esep, ksep;
    char *p;
    long i;

    // Validate before touching raw string memory: a missing or empty
    // separator would otherwise crash or scan past the buffer.
    StringValue(entry_sep);
    StringValue(kv_sep);
    if (RSTRING_LEN(entry_sep) == 0 || RSTRING_LEN(kv_sep) == 0) {
      rb_raise(rb_eArgError, "hash column type needs two non-empty separator strings");
    }
    esep = RSTRING_PTR(entry_sep)[0];
    ksep = RSTRING_PTR(kv_sep)[0];

    StringValue(v);
    p = make_cstr(v);
    ary = get_token_array(p, esep);
    xfree(p); // the tokens are independent copies

    h = rb_hash_new();
    for (i = 0; i < RARRAY_LEN(ary); i++) {
      make_hash_entry( h, rb_ary_entry( ary, i ), ksep );
    }
    return h;
  }

  tid = SYM2ID(type);
  if (tid == convert_types[TYPE_INT]) {
    char *p;
    long n;
    StringValue(v);
    p = make_cstr(v);
    // strtol has defined behaviour on overflow, unlike atoi.
    n = strtol(p, NULL, 10);
    xfree(p);
    return LONG2NUM(n);
  }
  if (tid == convert_types[TYPE_FLOAT]) {
    char *p;
    double f;
    StringValue(v);
    p = make_cstr(v);
    f = strtod(p, NULL);
    xfree(p);
    return DBL2NUM(f);
  }
  if (tid == convert_types[TYPE_SYM]) {
    StringValue(v);
    return ID2SYM( rb_intern_str(v) );
  }
  return v;
}
|
175
|
+
|
176
|
+
// Build one row Hash from the field Array ary, keyed by the entries of
// header, and append it to lines. When types is not nil each field is
// passed through convert_to_type with the matching entry of types.
void add_hash_line(VALUE lines, VALUE header, VALUE types, VALUE ary) {
  VALUE row = rb_hash_new();
  int col;

  for (col = 0; col < RARRAY_LEN(header); col++) {
    VALUE field = rb_ary_entry(ary, col);
    if (types != Qnil) {
      field = convert_to_type( field, rb_ary_entry( types, col ) );
    }
    rb_hash_aset( row, rb_ary_entry(header, col), field );
  }
  rb_ary_push(lines, row);
}
|
189
|
+
|
190
|
+
|
191
|
+
// Load the tab-separated file named by `file` and store its rows in @lines
// as an Array of Hashes keyed by the header symbols. Returns nil.
//
// Instance variables read:
//   @comment     - lines starting with this String are skipped (nil: none)
//   @header      - Array of column keys; when nil, the first non-comment
//                  line is converted to symbols and stored back in @header
//   @skip_header - when not nil, the first data line is discarded
//   @types       - per-column type descriptors for convert_to_type, or nil
//
// Raises a system error (Errno::*) when the file cannot be opened and
// IOError when its size cannot be determined.
VALUE method_load_file(VALUE self, VALUE file) {
  VALUE cmmt = rb_iv_get(self,"@comment");
  VALUE header = rb_iv_get(self,"@header");
  VALUE skip_header = rb_iv_get(self,"@skip_header");
  VALUE types = rb_iv_get(self,"@types");
  VALUE lines = rb_ary_new();
  VALUE ary;
  const char *path = StringValueCStr(file);
  const char *comment = 0;
  long commentsize = 0;
  FILE *fp;
  long fp_size, len, i = 0;
  char *contents, *buf, *n;
  int foundheader = 0;

  if (cmmt != Qnil) {
    StringValue(cmmt);
    comment = RSTRING_PTR(cmmt);
    commentsize = RSTRING_LEN(cmmt);
  }

  fp = fopen(path,"r");
  if (!fp) rb_sys_fail(path);
  fp_size = get_file_size(fp);
  if (fp_size < 0) {
    fclose(fp);
    rb_raise(rb_eIOError, "cannot determine size of %s", path);
  }
  contents = get_file_contents( fp, fp_size );
  // Everything is in memory now; release the handle.
  fclose(fp);

  // One extra byte so a file consisting of a single unterminated line
  // still has room for the terminating NUL.
  buf = ALLOC_N(char,fp_size + 1);

  set_convert_types();
  while (i < fp_size) {
    // Bounded search: does not rely on contents being NUL-terminated.
    n = memchr(contents+i, '\n', fp_size - i);
    // Without a newline before eof, the line is the remainder of the buffer.
    len = n ? (long)(n - (contents+i)) : fp_size - i;
    memcpy(buf, contents+i, len);
    buf[len] = 0;
    i += len + (n ? 1 : 0);

    if (comment && !strncmp(buf,comment,commentsize)) {
      continue;
    }
    // okay, now you can split your string into tokens and push it
    // onto an array.
    ary = get_token_array(buf,'\t');
    if (header == Qnil) {
      header = convert_to_symbols(ary);
      rb_iv_set(self,"@header",header);
      continue;
    }
    // NOTE(review): any non-nil @skip_header, including false, enables
    // skipping here; confirm whether a truthiness test was intended.
    if (skip_header != Qnil && !foundheader) {
      // it expects there to be a header to be ignored
      foundheader = 1;
      continue;
    }
    add_hash_line( lines, header, types, ary );
  }

  rb_iv_set(self,"@lines",lines);
  xfree(buf);
  xfree(contents);
  return Qnil;
}
|
data/lib/fasta.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'fasta_aux/fasta_aux'
|
3
|
+
|
4
|
+
# Random-access reader for a FASTA file whose sequence lines all have the
# same width (+line_size+).
#
# The byte-level work is done by the FastaAux extension mixed in here:
# +get_seq_starts+ fills @seq_names and @seq_starts, and +get_seq_chunk+
# reads a raw byte range from @io.
class Fasta
  include FastaAux

  # One named sequence inside the file: its name, length in bases and the
  # byte offset at which its sequence data starts.
  class Chrom
    attr_reader :name, :size, :start

    def initialize(n, fasta, sz, st)
      @name, @fasta, @size, @start = n, fasta, sz, st
    end

    # True when +pos+ is a 1-based Integer position inside this sequence,
    # or a two-element Array whose entries both are.
    def include?(pos)
      if pos.is_a? Array
        first, last = pos.to_a
        include?(first) && include?(last)
      else
        # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
        pos.is_a?(Integer) && pos >= 1 && pos <= size
      end
    end

    # Byte offset in the file of the 1-based position +pos+, or nil when
    # +pos+ is outside the sequence. Every full line occupies
    # line_size + 1 bytes (bases plus newline).
    def file_pos(pos)
      return nil unless include? pos

      offset = pos - 1
      start + offset / line_size * (line_size + 1) + offset % line_size
    end

    private

    def line_size
      @fasta.line_size
    end
  end

  attr_reader :line_size, :chroms

  # file - path of the FASTA file; it is opened and kept open in @io.
  # size - number of bases per sequence line (defaults to 50).
  def initialize(file, size = nil)
    @io = File.open(file)
    @line_size = size || 50

    get_seq_starts
    compute_chrom_stats
  end

  # Total number of bases over all sequences.
  def size
    @chroms.values.inject(0) { |total, chrom| total + chrom.size }
  end

  def inspect
    "#<#{self.class.name}:#{object_id} @chroms=#{@seq_names.count}>"
  end

  # Upper-cased bases of +chrom+ between the 1-based, inclusive positions
  # +start+ and +stop+.
  def get_seq(chrom, start, stop)
    seq = get_masked_seq chrom, start, stop
    seq && seq.upcase
  end

  # Same as get_seq but the letters are returned as stored in the file.
  # Raises ArgumentError for an unknown sequence name, a position outside
  # the sequence, or start > stop.
  def get_masked_seq(chrom, start, stop)
    info = @chroms[chrom]
    if !info || !info.include?([start, stop]) || start > stop
      raise ArgumentError, "Improper interval"
    end

    get_seq_chunk(info.file_pos(start), info.file_pos(stop)).delete("\n")
  end

  private

  # Number of bases held in +bytes+ bytes of sequence data, where +bytes+
  # does not count a newline after the final line.
  def seq_size_from_byte_size(bytes)
    (bytes / (@line_size + 1)) * @line_size + (bytes % (@line_size + 1))
  end

  # Build @chroms (name => Chrom) from @seq_names / @seq_starts.
  def compute_chrom_stats
    @chroms = {}
    last_index = @seq_names.size - 1
    @seq_names.each_with_index do |name, i|
      bytes =
        if i < last_index
          # Up to the next header: drop that header line ('>' + name + "\n")
          # and the newline that ends this sequence's final line.
          @seq_starts[i + 1] - @seq_starts[i] - @seq_names[i + 1].size - 3
        else
          # Up to end of file; a final newline is not sequence data.
          @io.size - @seq_starts[i] - trailing_newline_bytes
        end
      bytes = 0 if bytes < 0
      @chroms[name] = Fasta::Chrom.new name, self, seq_size_from_byte_size(bytes), @seq_starts[i]
    end
  end

  # 1 when the file ends with a newline, otherwise 0. Uses a separate read
  # of the path so the position of @io is left untouched.
  def trailing_newline_bytes
    total = @io.size
    return 0 if total.zero?

    File.binread(@io.path, 1, total - 1) == "\n" ? 1 : 0
  end
end
|