vardetect-vc 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/vardetect_vc +46 -0
- data/bin/vardetect_vc_batch +9 -0
- data/lib/vardetect-vc/call_combine.rb +367 -0
- data/lib/vardetect-vc/call_prepare.rb +122 -0
- data/lib/vardetect-vc/call_snv.rb +493 -0
- data/lib/vardetect-vc/lib.rb +28 -0
- data/lib/vardetect-vc/version.rb +1 -1
- metadata +10 -2
data/bin/vardetect_vc
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Launcher for the vardetect-vc SNV caller.
#
# Without arguments it runs the built-in example job: an `snv` call for sample
# G3418 using files below the hard-coded data directory.  When arguments are
# given they are passed to Vardetect::Vc.exec instead, in the same
# "<command> -key value ..." token form the default job uses, e.g.
#
#   vardetect_vc snv -ref hg19.fa -sam sample.bam -vc_out raw_snv.tsv -core 8
require 'rubygems'
require 'vardetect-vc'
require "vardetect-vc/lib"

# Defaults for the example job.
sample_id = 'G3418'
working = 'vcqa'
path = '/Users/soup/Desktop/chula/data'

ref_file = File.join(path, 'ref', 'hg19.fa')
sam_file = File.join(path, working, 'bam', "#{sample_id}.remdup.uniqMap.TS.bam")
vc_raw_file = File.join(path, working, 'snv-vc', "#{sample_id}_vc_raw_snv.tsv")

# 1. call vc
# Range-restricted alternative, kept for reference:
#   snv-range -ref <ref> -sam <bam> -vc_out <out> -core 8 -chr 1 -start 168665645 -stop 170706648 -debug true
default_params = ['snv', '-ref', ref_file, '-sam', sam_file, '-vc_out', vc_raw_file, '-core', '8']

# Command-line tokens, when present, replace the example job.
params = ARGV.empty? ? default_params : ARGV.dup

Vardetect::Vc.hi
Vardetect::Vc.exec params
|
@@ -0,0 +1,367 @@
|
|
1
|
+
module Vardetect
|
2
|
+
module Vc
|
3
|
+
|
4
|
+
|
5
|
+
# Joins the prepared VC, SamTools and SNP-array SNV tables on their first
# column (the "<chr>-<pos>" key) and writes one comparison row for every VC
# call whose key is also present in the SNP-array table.
#
# params - Hash with String keys:
#   'vc_out'  - prepared VC calls (CSV, read)
#   'st_out'  - prepared SamTools calls (CSV, read)
#   'snp_out' - prepared SNP-array genotypes (CSV, read)
#   'com_out' - comparison table (CSV, written)
#
# Output row: snp_check, vc_check, st_check, key, VC columns (ref, major
# allele, genotype, het/hom, depth, alt count, alt ratio), SNP-array columns
# (id, genotype, het/hom, GC score, cluster) and, when the key is also in the
# SamTools table, that table's columns from index 4 onwards.
#
# Progress and summary counters are printed to stdout.  Returns nil.
def self.call_combine_all params
  vc_file = params['vc_out']
  st_file = params['st_out']
  snp_file = params['snp_out']
  com_out = params['com_out']

  # Reads a CSV file into a Hash keyed by the first column.
  index_by_key = lambda do |file_name|
    table = {}
    File.foreach(file_name) do |line|
      fields = line.strip.split(',')
      table[fields[0]] = fields
    end
    table
  end

  puts "Indexing SNP Array SNV"
  map_snp = index_by_key.call(snp_file)
  puts map_snp.keys[0..10]

  puts "Indexing SamTools SNV"
  map_st = index_by_key.call(st_file)
  puts map_st.keys[0..10]

  puts "Indexing VC SNV"
  # VC lines are split without stripping; only the last column keeps its
  # newline and that column is not used below.
  map_vc = File.foreach(vc_file).map { |line| line.split(',') }
  puts map_vc.size

  all = 0
  vc_het_rate = 0
  st_het_rate = 0

  File.open(com_out, 'w') do |com_output|
    map_vc.each do |vc|
      pos = vc[0]
      snp = map_snp[pos]
      next unless snp

      vc_alt = vc[5].to_i
      vc_dep = vc[4].to_i + vc_alt

      snp_rs = snp[1]
      snp_a1 = snp[2]
      snp_a2 = snp[3]
      snp_gc = snp[4]
      snp_clust = snp[5]
      snp_a = "#{snp_a1}/#{snp_a2}"
      snp_is_het = snp_a1 != snp_a2
      snp_het_t = snp_is_het ? 'het' : 'hom'

      # "-/-" marks a position without an array genotype; it is still written
      # but never counts as a concordant call.
      snp_check = snp_a.strip != '-/-'
      all += 1 if snp_check

      # The VC call is heterozygous when the alt allele reaches 15% of depth.
      # A depth of 0 yields NaN, which compares false and is written as "NaN".
      vc_het_ratio = vc_alt.to_f / vc_dep
      vc_is_het = vc_het_ratio >= 0.15
      vc_alt_a = vc_is_het ? vc[3] : '-'
      vc_het_t = vc_is_het ? 'het' : 'hom'

      str = "#{vc[0]},#{vc[1]},#{vc[2]},#{vc[2]}/#{vc_alt_a},#{vc_het_t},#{vc_dep},#{vc_alt},#{vc_het_ratio},"
      str << "#{snp_rs},#{snp_a},#{snp_het_t},#{snp_gc},#{snp_clust},"

      st_check = false
      st = map_st[pos]
      if st
        str << st[4..-1].join(",")
        if (st[5] == 'het') == snp_is_het && snp_check
          st_het_rate += 1
          st_check = true
        end
      end

      vc_check = (vc_is_het == snp_is_het) && snp_check
      vc_het_rate += 1 if vc_check

      com_output.puts [snp_check, vc_check, st_check, str].join(',')
    end
  end

  puts "All #{all} is not -/-"
  puts "VC Hetero Rate : #{vc_het_rate}"
  puts "ST Hetero Rate : #{st_het_rate}"
  nil
end
|
167
|
+
|
168
|
+
|
169
|
+
# Summarises a comparison CSV (the rows written by call_combine_all).
#
# params - Hash with String keys:
#   'com_out' - comparison CSV to read
#   'out'     - summary CSV to write; an HTML report is rendered from the
#               'index.html.erb' template in the current directory, written
#               to the same path plus ".html" and handed to the `open` command
#
# Rows whose SNP-array GC score exceeds 0.75 are kept and split by the array's
# het/hom call; each group is split again by whether the VC alt ratio reaches
# 0.1.  Returns nil.
#
# NOTE: the template is rendered with this method's binding, so the local
# variable names below are part of the template's interface.  Keep them when
# editing.  The template itself is not shown here; presumably it reads the
# list variables, so verify before renaming anything.
def self.analyse_combine params
  require 'erb'

  # index columns
  idx = {}
  %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}.each_with_index do |name, index|
    idx[name.to_sym] = index
  end

  # constant
  snp_filter = 0.75
  alt_ratio = 0.1

  gcid = idx[:snp_gc]
  hid = idx[:snp_het]
  depid = idx[:vc_dep]    # not used here; kept for the template binding
  vchid = getidx :vc_het  # not used here; kept for the template binding
  vcrid = getidx :vc_r

  combine_file = params['com_out']
  analyse_file = params['out']
  html_file = analyse_file + ".html"

  # filter all by gc score snp array
  all_list = File.foreach(combine_file).map { |line| line.strip.split(',') }
  list = all_list.select { |row| row[gcid].to_f > snp_filter }

  # filter set of snp array 's het hom
  het_list, hom_list = list.partition { |row| row[hid] == 'het' }

  # First word is the array call, second word is the VC call.
  vc_het_het, vc_het_hom = het_list.partition { |row| row[vcrid].to_f >= alt_ratio }
  vc_hom_het, vc_hom_hom = hom_list.partition { |row| row[vcrid].to_f >= alt_ratio }

  File.open(analyse_file, 'w') do |out|
    out.puts "SNP Array overlap data "
    out.puts "Filter GC score at,#{snp_filter}"
    out.puts "Total snv,#{all_list.size}"
    out.puts "Total snv not pass filter,#{all_list.size-list.size}"
    out.puts "Total hetero snv,#{het_list.size}"
    out.puts "Total homo snv,#{hom_list.size}"

    template = ERB.new(File.read('index.html.erb'))
    File.open(html_file, 'w') do |outtmp|
      outtmp.puts template.result(binding)
    end

    # Argument-vector form: the path is never interpreted by a shell.
    system('open', html_file)
  end
  nil
end
|
277
|
+
|
278
|
+
|
279
|
+
# Returns the zero-based column position of +id+ (a Symbol such as :vc_het or
# :snp_gc) in a comparison CSV row, or nil when the name is unknown.
#
# The lookup table is built once and reused on later calls.
def self.getidx id
  @getidx_columns ||= begin
    names = %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}
    table = {}
    names.each_with_index { |name, position| table[name.to_sym] = position }
    table
  end
  @getidx_columns[id]
end
|
288
|
+
|
289
|
+
|
290
|
+
# Builds histogram and scatter data from comparison rows.
#
# list - Array of comparison rows (Arrays of Strings, indexed via getidx).
#
# Returns a Hash:
#   :gc  - counts of the SNP-array GC score in bins of 0.01
#   :dep - counts of the VC depth in bins of 5 (integer arithmetic)
#   :alt - counts of the VC alt ratio in bins of 0.005
#   :dot - one "[depth,alt_ratio]" String per row
# Bins that were never hit are nil; the arrays grow to the highest bin seen.
def self.sampling list
  gcid = getidx :snp_gc
  depid = getidx :vc_dep
  vcrid = getidx :vc_r

  gc_range = 100
  gc_max = 1
  d_range = 100
  d_max = 500
  alt_range = 100
  alt_max = 0.5

  gc_bin = []
  d_bin = []
  alt_bin = []
  dot = []

  list.each do |row|
    gc = row[gcid].to_f
    depth = row[depid].to_i
    ratio = row[vcrid].to_f

    # sampling gc score value
    k = (gc * gc_range / gc_max).to_i
    gc_bin[k] = (gc_bin[k] || 0) + 1

    # sampling depth value (depth is an Integer, so the division truncates)
    k = (depth * d_range / d_max).to_i
    d_bin[k] = (d_bin[k] || 0) + 1

    # sampling alt ratio value
    k = (ratio * alt_range / alt_max).to_i
    alt_bin[k] = (alt_bin[k] || 0) + 1

    # The scatter point uses the depth as a Float, e.g. "[10.0,0.4]".
    dot << "[#{row[depid].to_f},#{ratio}]"
  end

  {:gc=>gc_bin,:dep=>d_bin,:alt=>alt_bin,:dot=>dot}
end
|
364
|
+
|
365
|
+
|
366
|
+
end
|
367
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
|
2
|
+
module Vardetect
|
3
|
+
module Vc
|
4
|
+
|
5
|
+
|
6
|
+
# 2. ST preparing filtered chromosome
#
# Rewrites a SamTools annotation CSV so that every data row is prefixed with
# a "<chr>-<pos>" key.
#
# params - Hash with String keys:
#   'st'     - input CSV; the first row is treated as a header and dropped
#   'st_out' - output CSV
#
# The chromosome is column 0 with its first three characters removed (e.g.
# "chr"); numeric names are normalised through to_i ("02" becomes 2), other
# names such as "X" are kept as text.  The position is column 1.
# Rows without a first column (blank lines) are skipped.  Returns nil.
def self.call_prepare_st params
  sam_tool_file = params['st']
  st_out_file = params['st_out']

  File.open(st_out_file, 'w') do |st_output|
    header_skipped = false
    CSV.foreach(sam_tool_file) do |row|
      unless header_skipped
        header_skipped = true
        next
      end
      next if row[0].nil?

      chrom = row[0][3..-1]
      id = chrom.to_i
      id = chrom if id == 0
      st_output.puts "#{id}-#{row[1]}," + row.join(",")
    end
  end
  nil
end
|
34
|
+
|
35
|
+
# 3. SNP preparing filtered chromosome
#
# Extracts the data section of a SNP-array genotype report into a CSV keyed
# by "<chr>-<pos>".
#
# params - Hash with String keys:
#   'snp'     - input report
#   'snp_out' - output CSV
#
# Everything up to and including the "[Data]" line is ignored, as is the
# first following row whose first field is "Sample" (the column header).
# Remaining rows are split on whitespace; fields 1 and 2 form the key and
# fields 3 onwards are written after it.  A later row with the same key
# replaces the earlier one.  Rows with fewer than three fields are skipped.
# Output rows are sorted by key as Strings.  Returns nil.
def self.call_prepare_snp params
  snp_array_file = params['snp']
  snp_file = params['snp_out']

  in_data = false
  header_seen = false
  map = {}

  File.foreach(snp_array_file) do |line|
    line = line.strip
    unless in_data
      in_data = true if line == '[Data]'
      next
    end

    s = line.split
    if s[0] == 'Sample' && !header_seen
      header_seen = true
      next
    end
    next if s.size < 3

    map["#{s[1]}-#{s[2]}"] = s
  end

  File.open(snp_file, 'w') do |output|
    map.keys.sort.each do |k|
      output.puts "#{k}," + map[k][3..-1].join(",")
    end
  end
  nil
end
|
80
|
+
|
81
|
+
# Filters the raw VC output (whitespace-separated rows as written by
# call_variance) into a CSV keyed by "<chr>-<pos>".
#
# params - Hash with String keys:
#   'vc_raw' - raw VC rows to read
#   'vc_out' - output CSV
#
# A row is kept when its alt count (field 6) exceeds 10% of major + alt count
# (fields 5 and 6), or when the reference (field 2) differs from the major
# allele (field 3).  The key is field 0 without its first three characters
# (e.g. "chr") joined to field 1; fields 2 onwards follow the key.  A later
# row with the same key replaces the earlier one.  Output rows are sorted by
# key as Strings.  Returns nil.
def self.call_prepare_vc params
  vc_raw_file = params['vc_raw']
  vc_file = params['vc_out']

  map = {}
  File.foreach(vc_raw_file) do |line|
    line = line.strip
    s = line.split
    begin
      alt = s[6].to_i
      dep = s[5].to_i + alt
      # 0/0 is NaN for blank rows; NaN > 0.1 is false, so they drop out.
      if alt.to_f / dep > 0.1 || s[2] != s[3]
        map["#{s[0][3..-1]}-#{s[1]}"] = s
      end
    rescue StandardError
      # Best effort: report the offending row and carry on.
      puts line
    end
  end

  File.open(vc_file, 'w') do |output|
    map.keys.sort.each do |k|
      output.puts "#{k}," + map[k][2..-1].join(",")
    end
  end
  nil
end
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,493 @@
|
|
1
|
+
module Vardetect
|
2
|
+
module Vc
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
# Lays a read string out against the reference according to its CIGAR.
#
# seq   - per-base String of the read (bases or quality characters)
# cigar - CIGAR String, e.g. "2S30M1I10M3D20M"
#
# Returns [str, indel]:
#   str   - seq with soft clips and insertions removed and 'N' padding
#           inserted for deleted or skipped reference bases
#   indel - Array of Hashes {:pos, :type ('ins'/'del'), :size, :cigar};
#           :pos is the read offset of the event minus the size of the most
#           recent soft clip
#
# Operations follow the SAM specification: M, = and X copy read bases; I and
# S consume read bases only; D and N consume reference only; H and P consume
# neither.  Text that is not a "<count><op>" pair (such as "*") is ignored.
def self.form seq, cigar
  str = ''
  indel = []
  id = 0 # cursor into seq
  st = 0 # size of the most recent soft clip

  cigar.scan(/(\d+)([MIDNSHP=X])/) do |count, op|
    n = count.to_i
    case op
    when 'S'
      st = n
      id += n
    when 'I'
      indel << {:pos=>id-st,:type=>'ins',:size=>n,:cigar=>cigar}
      id += n
    when 'D'
      str << 'N'*n
      indel << {:pos=>id-st,:type=>'del',:size=>n,:cigar=>cigar}
    when 'N'
      # Reference skip: pad to keep coordinates aligned, but it is not an indel.
      str << 'N'*n
    when 'M', '=', 'X'
      piece = seq[id, n]
      str << piece if piece
      id += n
    end
    # 'H' and 'P' consume neither read nor reference.
  end

  return str, indel
end
|
41
|
+
|
42
|
+
# Scans the alignments of one region and collects candidate variant rows.
#
# sam       - object responding to fetch_reference(chr, start, stop),
#             load_index and fetch(chr, start, stop); fetched alignments must
#             respond to pos, cigar, seq, qual, seq= and qual=
# chr       - reference sequence name
# chr_start - first position of the region
# chr_stop  - last position of the region
# debug     - optional String of '|'-separated flags: 'show' (print every
#             pileup), 'indel' (print indel reads, dropped reads and the indel
#             table), 'var' (print every reported position)
#
# Alignments are consumed in the order fetch returns them.  Each is laid out
# with form and kept when more than half of it remains and at least 95% of
# its bases match the reference.  Whenever the next alignment starts further
# right, the positions in between are piled up from the kept alignments.
#
# A position is reported when the two most frequent bases together exceed a
# depth of 2 and one of these holds: an indel was recorded there more than
# once; the second base exceeds 10% of depth with mean quality above 10; or
# the most frequent base differs from the reference.
#
# Returns an Array of rows
#   [chr, pos, ref, major, alt, major_count, alt_count, major_q, alt_q, tag]
# with the qualities formatted to two decimals and tag 'a' (alt above 10%),
# 'A' (otherwise), '+' (insertion) or '-' (deletion).
#
# NOTE(review): positions at or after the start of the last alignment are
# never piled up, because the pileup only runs when a later alignment
# arrives.  Confirm whether that is intended.
def self.call_sam_snv sam,chr,chr_start,chr_stop, debug=nil
  flags = debug ? debug.split('|') : []
  show = flags.include?('show')
  indel_show = flags.include?('indel')
  var_show = flags.include?('var')

  snps = []
  indels_map = {}

  puts "#{chr} #{chr_start} - #{chr_stop}"
  seq = sam.fetch_reference(chr,chr_start,chr_stop+1000)
  sam.load_index

  upseq = seq.upcase

  list = []   # alignments covering the current pileup position
  start = nil # next position to pile up
  alleles = {'A'=>0,'C'=>1,'G'=>2,'T'=>3}
  inv_alleles = {0=>'A',1=>'C',2=>'G',3=>'T'}

  sam.fetch(chr, chr_start, chr_stop).each do |i|
    start ||= i.pos

    if i.pos != start
      while start < i.pos
        expired = []
        pile = ''
        ref = seq[start-chr_start]
        print "#{start}\t#{ref}\t#{list.size}\t" if show
        ref = ref.upcase

        counts = [0,0,0,0] # observations per base, A/C/G/T
        quals = [0,0,0,0]  # summed phred scores per base

        list.each do |read|
          offset = start - read.pos
          base = read.seq[offset]
          base_q = read.qual[offset]

          if base
            slot = alleles[base]
            if slot
              counts[slot] += 1
              quals[slot] += base_q-33 if base_q
            end
            pile += base
            print "#{base}" if show
          else
            # The read no longer reaches this position.
            expired << read
          end
        end

        # Most frequent base, then the runner-up once the winner is zeroed.
        pmax = counts.index(counts.max)
        max = counts[pmax]
        counts[pmax] = 0
        palt = counts.index(counts.max)
        alt = counts[palt]
        dep = max+alt
        max_q = quals[pmax].to_f/max
        alt_q = alt > 0 ? quals[palt].to_f/alt : 0.0

        indel = indels_map[start]
        if dep > 2 && ((indel && indel[:count] > 1) || (alt > 0 && alt/dep.to_f > 0.1 && alt_q > 10) || alleles[ref] != pmax)
          puts "#{start}\t#{max}/#{alt.to_i}\t#{ref}\t#{pile}" if var_show

          aalt = alt != 0 ? inv_alleles[palt] : '-'
          tag = (alt > 0 && alt/dep.to_f > 0.1) ? 'a' : 'A'

          if indel
            # An indel recorded here overrides the substitution columns.
            parts = indel[:alleles].split('/')
            ref = parts[0]
            aalt = parts[1]
            alt = indel[:count]
            tag = indel[:type] == 'ins' ? '+' : '-'
          end
          snps << [chr,start,ref,inv_alleles[pmax],aalt,max,alt,format('%.2f',max_q),format('%.2f',alt_q),tag]
        end

        list -= expired
        if list.size == 0
          # Nothing covers the gap; jump straight to the next alignment.
          start = i.pos
        else
          start += 1
        end

        puts if show
      end

      start = i.pos
    end

    seq_size = i.seq.size

    tseq = form(i.seq,i.cigar)
    tqual = form(i.qual,i.cigar)

    if i.cigar.index('I') || i.cigar.index('D')
      if indel_show
        puts "-------------------------INDEL "
        puts i.pos
        puts i.cigar
        puts i.seq
        puts i.qual
        puts seq[i.pos-chr_start..i.pos-chr_start+i.seq.size]
        puts tseq[0]
        puts tqual[0]
        puts tseq[1]
      end

      tseq[1].each do |y|
        if y[:type] == 'ins'
          iref = '-'
          iseq = i.seq[y[:pos]..y[:pos]+y[:size]-1]
        else
          iref = seq[i.pos-chr_start+y[:pos]..i.pos-chr_start+y[:pos]+y[:size]-1]
          iseq = '-'
        end
        y[:alleles] = "#{iref}/#{iseq}"
        y[:pos] += i.pos

        known = indels_map[y[:pos]]
        if known
          known[:sm] += ",#{y[:alleles]}"
          known[:count] += 1
        else
          y[:count] = 1
          y[:sm] = y[:alleles]
          indels_map[y[:pos]] = y
        end
      end
    end

    # From here on the alignment carries its laid-out sequence and numeric
    # quality bytes.
    i.seq = tseq[0]
    i.qual = tqual[0].bytes.to_a

    if i.seq.size > seq_size/2
      si = 0
      i.seq.size.times do |k|
        si += 1 if i.seq[k] == upseq[start-chr_start+k]
      end

      if (si.to_f/i.seq.size) >= 0.95
        list << i
      elsif indel_show
        puts "Drop with low similarity #{si.to_f/i.seq.size} #{i.cigar}"
        puts seq[start-chr_start..start-chr_start+100]
        puts i.seq
      end
    end
  end

  if indel_show
    indels_map.keys.sort.each do |key|
      puts indels_map[key].inspect
    end
  end

  return snps
end
|
303
|
+
|
304
|
+
# Calls variants for one region and appends them to the output file.
#
# p - Hash with Symbol keys:
#   :sam    - BAM file path
#   :ref    - reference FASTA path
#   :chr, :start, :stop - region to scan
#   :output - file the tab-separated rows are appended to
#   :debug  - optional debug flags, passed through to call_sam_snv
#
# Returns nil.
def self.call_variance p
  sam = Bio::DB::Sam.new({:bam=>p[:sam], :fasta=>p[:ref]})
  sam.open
  begin
    snps = call_sam_snv sam, p[:chr], p[:start], p[:stop], p[:debug]
  ensure
    sam.close
  end

  # IO#puts on an empty Array writes a bare newline, so skip empty regions.
  return nil if snps.empty?

  File.open(p[:output], 'a') do |f|
    # process_vc_multicore runs this method in several forked processes that
    # all append to the same file; hold an exclusive lock while writing so
    # rows from different regions cannot interleave.
    f.flock(File::LOCK_EX)
    f.puts snps.collect { |row| row.join("\t") }
  end
  nil
end
|
324
|
+
|
325
|
+
|
326
|
+
|
327
|
+
# Measures the sequences of a FASTA file.
#
# testReference - path of the FASTA file
#
# Returns a Hash mapping each sequence name (the first whitespace-delimited
# word of its ">" header, without the ">") to the number of sequence
# characters below it, excluding surrounding whitespace on each line.
# Text before the first header is ignored; a file without any header yields
# an empty Hash.  Progress and the final table are printed to stdout.
def self.inspect_reference testReference
  map = {}
  chr = nil
  size = 0

  File.foreach(testReference) do |line|
    if line[0..0] == '>'
      if chr
        puts "#{chr}\t#{size}"
        map[chr] = size
      end
      chr = line.strip.split[0][1..-1]
      size = 0
    else
      size += line.strip.size
    end
  end

  # Record the final sequence, if the file had any header at all.
  map[chr] = size if chr
  puts map.inspect
  map
end
|
355
|
+
|
356
|
+
# Returns a fixed table of chromosome names ("chr1".."chr22", "chrX", "chrY")
# and their lengths.  The argument is accepted for signature compatibility
# with inspect_reference but is not read; no file is opened.
#
# NOTE(review): the lengths look like the hg19 assembly -- confirm before
# using another reference.
def self.inspect_reference_human testReference
  {
    "chr1"  => 249250621,
    "chr10" => 135534747,
    "chr11" => 135006516,
    "chr12" => 133851895,
    "chr13" => 115169878,
    "chr14" => 107349540,
    "chr15" => 102531392,
    "chr16" => 90354753,
    "chr17" => 81195210,
    "chr18" => 78077248,
    "chr19" => 59128983,
    "chr2"  => 243199373,
    "chr20" => 63025520,
    "chr21" => 48129895,
    "chr22" => 51304566,
    "chr3"  => 198022430,
    "chr4"  => 191154276,
    "chr5"  => 180915260,
    "chr6"  => 171115067,
    "chr7"  => 159138663,
    "chr8"  => 146364022,
    "chr9"  => 141213431,
    "chrX"  => 155270560,
    "chrY"  => 59373566
  }
end
|
384
|
+
|
385
|
+
|
386
|
+
# Runs call_variance over the genome in 1,000,000-base regions using forked
# worker processes.
#
# params     - Hash with Symbol keys :sam, :ref and :output (see
#              call_variance); it is copied per region, never modified
# core       - maximum number of simultaneous worker processes (at least 1)
# chr_filter - optional chromosome name without the "chr" prefix (e.g. 1 or
#              "X"); when given, only that chromosome is processed
#
# The output file is truncated first; every worker then appends to it.
# Chromosome sizes come from inspect_reference_human.
# Raises ArgumentError when chr_filter names an unknown chromosome.
# Returns the value of Process.waitall.
def self.process_vc_multicore params, core=1,chr_filter=nil
  output = params[:output]
  genome = inspect_reference_human(params[:ref])

  if chr_filter # 1 as text
    chr = "chr#{chr_filter}"
    raise ArgumentError, "unknown chromosome: #{chr}" unless genome.key?(chr)
    genome = {chr => genome[chr]}
  end

  puts genome.inspect

  # reset output
  File.open(output, 'w') { }

  # job generator
  chunk = 1_000_000
  jobs = []
  genome.each_pair do |name, size|
    puts "#{name}\t#{size}"
    full = size / chunk

    full.times do |n|
      jobs << params.merge(:chr => name, :start => 1 + n * chunk, :stop => (n + 1) * chunk)
    end

    if size % chunk != 0
      # The remainder starts right after the last full region, not on its
      # final base.
      jobs << params.merge(:chr => name, :start => full * chunk + 1, :stop => size)
    end
  end

  sim = [[core, 1].max, jobs.size].min

  # starting first N processes
  sim.times do
    job = jobs.pop
    Process.fork { call_variance(job) }
  end

  # start one by one as the previous finish
  jobs.each do |job|
    Process.wait
    Process.fork { call_variance(job) }
  end

  # wait for all to finish
  Process.waitall
end
|
452
|
+
|
453
|
+
|
454
|
+
|
455
|
+
# 1. VC calling
#
# Whole-genome entry point: translates parsed command-line options into a
# job description and hands it to process_vc_multicore.
#
# params - Hash with String keys:
#   'sam'    - BAM file path
#   'ref'    - reference FASTA path
#   'vc_out' - raw VC output file
#   'core'   - optional number of simultaneous processes (default 4)
def self.call_snv params
  core = params['core'] ? params['core'].to_i : 4
  job = {:sam => params['sam'], :ref => params['ref'], :output => params['vc_out']}

  process_vc_multicore job, core
end
|
474
|
+
|
475
|
+
|
476
|
+
# 1. VC calling (single region)
#
# Runs call_variance once, in the current process, for one region and with
# the 'indel' debug output switched on.
#
# params - Hash with String keys:
#   'sam'    - BAM file path
#   'ref'    - reference FASTA path
#   'vc_out' - file the rows are appended to
#   'chr'    - chromosome name without the "chr" prefix
#   'start', 'stop' - region bounds (converted with to_i)
# A 'core' entry is accepted but has no effect here.
def self.call_snv_range params
  job = {
    :sam => params['sam'],
    :ref => params['ref'],
    :output => params['vc_out'],
    :chr => "chr#{params['chr']}",
    :start => params['start'].to_i,
    :stop => params['stop'].to_i,
    :debug => 'indel'
  }
  puts job.inspect

  call_variance job
end
|
491
|
+
|
492
|
+
end
|
493
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'vardetect-vc/call_snv'
|
2
|
+
require 'vardetect-vc/call_prepare'
|
3
|
+
require 'vardetect-vc/call_combine'
|
4
|
+
|
5
|
+
module Vardetect
|
6
|
+
module Vc
|
7
|
+
|
8
|
+
|
9
|
+
# Turns a command line of the form "<cmd> -key value -key value ..." into an
# options Hash.
#
# params - either the whole command line as one String (split on whitespace)
#          or an already tokenised Array such as ARGV
#
# Returns a Hash holding each value under its key String with the first
# character (the "-") removed, plus the command under the Symbol key :cmd.
# A trailing key without a value is ignored.  The input is echoed to stdout.
def self.parse_params params
  puts params

  tokens = params.respond_to?(:split) ? params.split : Array(params).map { |t| t.to_s }

  parsed = {}
  tokens.drop(1).each_slice(2) do |pair|
    next if pair.size < 2
    parsed[pair[0][1..-1]] = pair[1]
  end
  parsed[:cmd] = tokens[0]
  parsed
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
|
data/lib/vardetect-vc/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vardetect-vc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -78,7 +78,9 @@ dependencies:
|
|
78
78
|
description: A variance calling tool for NGS
|
79
79
|
email:
|
80
80
|
- supasak.kul@biotec.or.th
|
81
|
-
executables:
|
81
|
+
executables:
|
82
|
+
- vardetect_vc
|
83
|
+
- vardetect_vc_batch
|
82
84
|
extensions: []
|
83
85
|
extra_rdoc_files: []
|
84
86
|
files:
|
@@ -87,7 +89,13 @@ files:
|
|
87
89
|
- LICENSE.txt
|
88
90
|
- README.md
|
89
91
|
- Rakefile
|
92
|
+
- bin/vardetect_vc
|
93
|
+
- bin/vardetect_vc_batch
|
90
94
|
- lib/vardetect-vc.rb
|
95
|
+
- lib/vardetect-vc/call_combine.rb
|
96
|
+
- lib/vardetect-vc/call_prepare.rb
|
97
|
+
- lib/vardetect-vc/call_snv.rb
|
98
|
+
- lib/vardetect-vc/lib.rb
|
91
99
|
- lib/vardetect-vc/version.rb
|
92
100
|
- vardetect-vc.gemspec
|
93
101
|
homepage: ''
|