vardetect-vc 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby
# Batch driver: runs the VarDetect SNV caller for one hard-coded sample.
# Only step 1 (variant calling) is executed; the remaining paths describe the
# files the later pipeline stages are expected to read and write.
require 'rubygems'
require 'vardetect-vc'
require "vardetect-vc/lib"

sample_id = 'G3418'
working   = 'vcqa'
path      = '/Users/soup/Desktop/chula/data'

work_dir   = File.join(path, working)
sample_dir = File.join(work_dir, sample_id)

# Inputs
ref_file       = File.join(path, 'ref', 'hg19.fa')
snp_array_file = File.join(work_dir, "#{sample_id}_gt_report.txt")
sam_tool_file  = File.join(work_dir, "#{sample_id}_SNP_Indel_ANNO.csv")
sam_file       = File.join(work_dir, 'bam', "#{sample_id}.remdup.uniqMap.TS.bam")
vc_raw_file    = File.join(work_dir, 'snv-vc', "#{sample_id}_vc_raw_snv.tsv")

# Prepared call sets
st_file  = File.join(sample_dir, 'st_snv.tsv')
vc_file  = File.join(sample_dir, 'vc_snv.csv')
snp_file = File.join(sample_dir, 'snpar_snv.csv')

# Filtered call sets
st_ft_file  = File.join(sample_dir, 'st_filtered_snv.csv')
vc_ft_file  = File.join(sample_dir, 'vc_filtered_snv.csv')
snp_ft_file = File.join(sample_dir, 'snpar_filtered_snv.csv')

# Comparison outputs
com_vc_file  = File.join(sample_dir, 'vc_compared.csv')
com_st_file  = File.join(sample_dir, 'st_compared.csv')
com_all_file = File.join(sample_dir, 'all_compared.csv')

combine_all_file = File.join(sample_dir, 'combine_all.csv')
analysed_file    = File.join(sample_dir, 'summary.csv')

# 1. call vc
# A single region can be called instead with, for example:
#   snv-range -ref REF -sam BAM -vc_out OUT -core 8 -chr 1 -start 168665645 -stop 170706648 -debug true
# (other coordinates noted by the author: 156553003, 170706648)
params = %W[snv -ref #{ref_file} -sam #{sam_file} -vc_out #{vc_raw_file} -core 8]

Vardetect::Vc.hi
Vardetect::Vc.exec(params)
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Command-line entry point: hands the raw argument vector to Vardetect::Vc.exec.
# NOTE(review): `hi` and `exec` are defined outside this file; `hi` presumably
# prints a greeting/banner -- confirm in lib/vardetect-vc.rb.
require 'rubygems'
require 'vardetect-vc'
require "vardetect-vc/lib"

Vardetect::Vc.hi
Vardetect::Vc.exec(ARGV)
@@ -0,0 +1,367 @@
1
+ module Vardetect
2
+ module Vc
3
+
4
+
5
# 4. All comparison
#
# Joins the prepared VC, SAMtools (ST) and SNP-array call sets on their first
# CSV column (the "<chr>-<pos>" key) and writes one comparison row for every
# VC call whose key is also present in the SNP-array file.
#
# params - Hash with String keys:
#          'vc_out'  - prepared VC calls; columns used: 0 key, 1-3 copied to
#                      the output (3 is the alternative allele), 4 major-allele
#                      depth, 5 alternative-allele depth
#          'st_out'  - prepared SAMtools calls; column 5 == 'het' marks a
#                      heterozygous call, columns 4.. are copied to the output
#          'snp_out' - prepared SNP-array calls; columns 1 id, 2/3 alleles,
#                      4 GC score, 5 cluster value
#          'com_out' - comparison CSV to (over)write
#
# Output row layout:
#   snp_check,vc_check,st_check,key,vc1,vc2,vc2/alt,het|hom,depth,alt_depth,
#   alt_ratio,snp_id,a1/a2,het|hom,gc,cluster,<ST columns 4..>
# where snp_check is true when the array genotype is not "-/-" and
# vc_check/st_check are true when the caller's het/hom state agrees with it.
#
# Prints progress and agreement counts to stdout. Returns nil.
def self.call_combine_all params
  vc_file  = params['vc_out']
  st_file  = params['st_out']
  snp_file = params['snp_out']
  com_out  = params['com_out']

  # Reads a prepared CSV into { key => fields }. Blank lines are skipped and a
  # later duplicate of a key replaces the earlier one. The block form of
  # File.foreach closes the file even when parsing raises.
  index_by_key = lambda do |path|
    index = {}
    File.foreach(path) do |line|
      fields = line.strip.split(',')
      index[fields[0]] = fields unless fields.empty?
    end
    index
  end

  puts "Indexing SNP Array SNV"
  map_snp = index_by_key.call(snp_file)
  puts map_snp.keys[0..10]

  puts "Indexing SamTools SNV"
  map_st = index_by_key.call(st_file)
  puts map_st.keys[0..10]

  puts "Indexing VC SNV"
  map_vc = []
  File.foreach(vc_file) do |line|
    # Stripped like the other two inputs so the last field carries no newline.
    fields = line.strip.split(',')
    map_vc << fields unless fields.empty?
  end
  puts map_vc.size

  all = 0
  vc_het_rate = 0
  st_het_rate = 0

  File.open(com_out, 'w') do |com_output|
    map_vc.each do |vc|
      pos = vc[0]
      snp = map_snp[pos]
      next unless snp

      vc_alt = vc[5].to_i
      vc_dep = vc[4].to_i + vc_alt
      # A zero depth used to produce NaN here and leak "NaN" into the CSV.
      vc_het_ratio = vc_dep > 0 ? vc_alt.to_f / vc_dep : 0.0
      vc_is_het = vc_het_ratio >= 0.15
      vc_alt_a  = vc_is_het ? vc[3] : '-'
      vc_het_t  = vc_is_het ? 'het' : 'hom'

      snp_a1 = snp[2]
      snp_a2 = snp[3]
      snp_a  = "#{snp_a1}/#{snp_a2}"
      snp_is_het = snp_a1 != snp_a2
      snp_het_t  = snp_is_het ? 'het' : 'hom'

      # "-/-" is the array's no-call genotype; such rows never count as agreeing.
      snp_check = snp_a.strip != '-/-'
      all += 1 if snp_check

      str = "#{vc[0]},#{vc[1]},#{vc[2]},#{vc[2]}/#{vc_alt_a},#{vc_het_t},#{vc_dep},#{vc_alt},#{vc_het_ratio},"
      str += "#{snp[1]},#{snp_a},#{snp_het_t},#{snp[4]},#{snp[5]},"

      st_check = false
      st = map_st[pos]
      if st
        # A row shorter than five columns simply contributes nothing.
        str += (st[4..-1] || []).join(",")
        if (st[5] == 'het') == snp_is_het && snp_check
          st_het_rate += 1
          st_check = true
        end
      end

      vc_check = snp_check && vc_is_het == snp_is_het
      vc_het_rate += 1 if vc_check

      com_output.puts [snp_check, vc_check, st_check, str].join(',')
    end
  end

  puts "All #{all} is not -/-"
  puts "VC Hetero Rate : #{vc_het_rate}"
  puts "ST Hetero Rate : #{st_het_rate}"
end
167
+
168
+
169
# Summarises a combined comparison CSV against the SNP array.
#
# Rows whose SNP-array GC score is above 0.75 are kept, split into array-het
# and array-hom, and cross-classified against the VC alternative-allele ratio
# (>= 0.1 counts as a het call). Plain counts are written to params['out'];
# an HTML report is rendered from the ERB template 'index.html.erb' (looked up
# relative to the current working directory) into "<out>.html" and then handed
# to the `open` command.
#
# params - Hash with String keys:
#          'com_out' - combined comparison CSV to read
#          'out'     - summary CSV to (over)write
#
# The template is evaluated with this method's binding, so the local variable
# names below are part of the template's interface -- do not rename them
# without checking the template.
#
# Returns nil.
def self.analyse_combine params
  require 'erb' # only this reporting step needs ERB, so it is loaded lazily

  # index columns
  idx = {}
  %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}.each_with_index do |i,index|
    idx[i.to_sym] = index
  end

  # constant
  snp_filter = 0.75
  alt_ratio = 0.1

  gcid  = idx[:snp_gc]
  hid   = idx[:snp_het]
  depid = idx[:vc_dep]
  vchid = idx[:vc_het]
  vcrid = idx[:vc_r]

  combine_file = params['com_out']
  analyse_file = params['out']

  all_list = []
  list = []
  het_list = []
  hom_list = []

  # filter all by gc score snp array
  File.foreach(combine_file) do |str|
    v = str.strip.split(',')
    all_list << v
    list << v if v[gcid].to_f > snp_filter
  end

  # filter set of snp array 's het hom
  list.each { |row| (row[hid] == 'het' ? het_list : hom_list) << row }

  # Names read <array genotype>_<VC genotype>.
  vc_het_het = []
  vc_het_hom = []
  vc_hom_hom = []
  vc_hom_het = []

  list.each do |row|
    called_het = row[vcrid].to_f >= alt_ratio
    if row[hid] == 'het'
      (called_het ? vc_het_het : vc_het_hom) << row
    else
      (called_het ? vc_hom_het : vc_hom_hom) << row
    end
  end

  out = File.open(analyse_file, 'w')
  begin
    out.puts "SNP Array overlap data "
    out.puts "Filter GC score at,#{snp_filter}"
    out.puts "Total snv,#{all_list.size}"
    out.puts "Total snv not pass filter,#{all_list.size-list.size}"
    out.puts "Total hetero snv,#{het_list.size}"
    out.puts "Total homo snv,#{hom_list.size}"

    html_file = analyse_file + ".html"
    template = ERB.new(File.read('index.html.erb'))
    html = template.result(binding)
    File.open(html_file, 'w') { |outtmp| outtmp.puts html }

    # Argument-vector form: no shell is involved, so paths containing spaces or
    # metacharacters are safe, and a missing `open` command (it is the macOS
    # launcher) makes this a no-op instead of raising.
    system('open', html_file)
  ensure
    out.close
  end
  nil
end
277
+
278
+
279
# Looks up the zero-based column position of a field in the combined
# comparison CSV.
#
# id - Symbol column name, e.g. :vc_r or :snp_gc.
#
# Returns the Integer position, or nil when id is not one of the column
# symbols (a String name is not recognised either).
def self.getidx id
  columns = %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}
  columns.map { |name| name.to_sym }.index(id)
end
288
+
289
+
290
# Builds histogram data from rows of the combined comparison CSV.
#
# list - Array of rows (Arrays of Strings) indexed by the positions that
#        getidx reports.
#
# Returns a Hash:
#   :gc  - counts of SNP-array GC score, bin = (gc * 100 / 1).to_i
#   :dep - counts of VC depth, bin = depth * 100 / 500 (integer arithmetic)
#   :alt - counts of VC alternative-allele ratio, bin = (ratio * 100 / 0.5).to_i
#   :dot - "[depth,ratio]" strings, one per row
# Bins that received no value are nil, not 0.
def self.sampling list
  gc_col  = getidx :snp_gc
  dep_col = getidx :vc_dep
  r_col   = getidx :vc_r

  gc_range  = 100
  gc_max    = 1
  d_range   = 100
  d_max     = 500
  alt_range = 100
  alt_max   = 0.5

  # Turns a list of bin numbers into a sparse count array.
  histogram = lambda do |slots|
    counts = []
    slots.each { |slot| counts[slot] = (counts[slot] || 0) + 1 }
    counts
  end

  gc_bin  = histogram.call(list.map { |row| (row[gc_col].to_f * gc_range / gc_max).to_i })
  d_bin   = histogram.call(list.map { |row| (row[dep_col].to_i * d_range / d_max).to_i })
  alt_bin = histogram.call(list.map { |row| (row[r_col].to_f * alt_range / alt_max).to_i })

  dot = list.map { |row| "[#{row[dep_col].to_f},#{row[r_col].to_f}]" }

  {:gc=>gc_bin, :dep=>d_bin, :alt=>alt_bin, :dot=>dot}
end
364
+
365
+
366
+ end
367
+ end
@@ -0,0 +1,122 @@
1
+
2
+ module Vardetect
3
+ module Vc
4
+
5
+
6
# 2. ST preparing filtered chromosome
#
# Re-keys a SAMtools annotation CSV so it can be joined with the other call
# sets: every data row is written back prefixed with "<chr>-<pos>", where
# <chr> is column 0 without its first three characters (presumably the "chr"
# prefix -- confirm against the input) and <pos> is column 1. Numeric
# chromosome names are normalised through to_i ("01" -> 1); non-numeric ones
# such as "X" are kept as text.
#
# params - Hash with String keys:
#          'st'     - input CSV; its first row is treated as a header and dropped
#          'st_out' - output CSV to (over)write
#
# Relies on CSV already being loaded by the caller. Returns nil.
def self.call_prepare_st params
  sam_tool_file = params['st']
  snp_file      = params['st_out']

  # Block form closes the output even if reading the CSV raises.
  File.open(snp_file, 'w') do |st_output|
    header_seen = false
    CSV.foreach(sam_tool_file) do |row|
      unless header_seen
        header_seen = true
        next
      end
      # A blank line parses to an empty row; it used to crash on nil[3..-1].
      next if row[0].nil?

      chrom = row[0][3..-1]
      id = chrom.to_i
      id = chrom if id == 0
      st_output.puts "#{id}-#{row[1]},#{row.join(',')}"
    end
  end
  nil
end
34
+
35
# 3. SNP preparing filtered chromosome
#
# Converts a SNP-array genotype report into a keyed CSV. Everything up to and
# including the "[Data]" line is ignored. After it, the first line whose first
# whitespace-separated token is "Sample" is taken as the column header and
# dropped. Every other line is stored under "<token 1>-<token 2>" (chromosome
# and position -- presumably; confirm against the report layout) and written
# as "<key>,<tokens 3..>" in ascending String order of the key. A repeated key
# keeps its last occurrence.
#
# params - Hash with String keys:
#          'snp'     - input report
#          'snp_out' - output CSV to (over)write
#
# Returns nil.
def self.call_prepare_snp params
  snp_array_file = params['snp']
  snp_file       = params['snp_out']

  in_data = false
  header_skipped = false
  map = {}

  # File.foreach closes the input; the original handle was never closed.
  File.foreach(snp_array_file) do |line|
    str = line.strip
    unless in_data
      in_data = true if str == '[Data]'
      next
    end

    s = str.split
    if s[0] == 'Sample' && !header_skipped
      header_skipped = true
      next
    end

    # Lines too short to form a key (for instance blank lines) used to end in
    # a nil.join crash when the output was written.
    next if s.size < 3

    map["#{s[1]}-#{s[2]}"] = s
  end

  File.open(snp_file, 'w') do |output|
    map.keys.sort.each do |k|
      output.puts "#{k}," + map[k][3..-1].join(",")
    end
  end
  nil
end
80
+
81
# Filters the raw whitespace-separated VC output and converts it to a keyed CSV.
#
# A row is kept when the alternative allele makes up more than 10% of the
# depth (token 6 / (token 5 + token 6)) or when token 2 differs from token 3.
# Kept rows are stored under "<token 0 without its first three characters>-
# <token 1>" and written as "<key>,<tokens 2..>" in ascending String order of
# the key; a repeated key keeps its last occurrence.
#
# params - Hash with String keys:
#          'vc_raw' - raw VC output to read
#          'vc_out' - output CSV to (over)write
#
# Returns nil.
def self.call_prepare_vc params
  vc_raw_file = params['vc_raw']
  vc_file     = params['vc_out']

  map = {}

  # File.foreach closes the input; the original handle was never closed.
  File.foreach(vc_raw_file) do |line|
    s = line.strip.split
    next if s.empty?

    alt = s[6].to_i
    dep = s[5].to_i + alt
    # dep == 0 would give NaN, which never exceeds the threshold; stated
    # explicitly here instead of relying on NaN comparison semantics.
    alt_supported = dep > 0 && alt.to_f / dep > 0.1
    next unless alt_supported || s[2] != s[3]

    map["#{s[0][3..-1]}-#{s[1]}"] = s
  end

  File.open(vc_file, 'w') do |output|
    map.keys.sort.each do |k|
      output.puts "#{k}," + map[k][2..-1].join(",")
    end
  end
  nil
end
118
+
119
+
120
+
121
+ end
122
+ end
@@ -0,0 +1,493 @@
1
+ module Vardetect
2
+ module Vc
3
+
4
+
5
+
6
# Projects a per-base read string (bases or quality characters) onto reference
# coordinates according to its CIGAR string.
#
# seq   - String with one character per read base, soft-clipped bases included.
# cigar - String CIGAR, e.g. "3S40M2I10M1D20M".
#
# Handling per operation:
#   M, =, X - copy the bases
#   I       - skip the inserted bases and record an 'ins' event
#   D       - emit one 'N' per deleted reference base and record a 'del' event
#   N       - emit one 'N' per skipped reference base (no event recorded)
#   S       - skip the clipped bases
#   H, P    - ignored; they consume neither read nor reference bases
# The original recognised only S/M/I/D, so a CIGAR such as "2H4M" was read as
# "2M".
#
# Returns a two-element Array [projected, indels]. indels is an Array of
# Hashes {:pos, :type ('ins'|'del'), :size, :cigar}; :pos is the read offset of
# the event minus the size of the most recently seen soft clip.
def self.form seq, cigar
  str = ''.dup
  indel = []
  id = 0 # cursor into seq
  st = 0 # size of the most recent soft clip

  cigar.scan(/(\d+)([MIDNSHP=X])/) do |count, cmd|
    i = count.to_i
    case cmd
    when 'S'
      st = i
      id += i
    when 'I'
      indel << {:pos=>id-st,:type=>'ins',:size=>i,:cigar=>cigar}
      id += i
    when 'D'
      str << 'N' * i
      indel << {:pos=>id-st,:type=>'del',:size=>i,:cigar=>cigar}
    when 'N'
      # Padding keeps later bases on their reference offsets.
      str << 'N' * i
    when 'M', '=', 'X'
      chunk = seq[id..id+i-1]
      str << chunk if chunk # nil once the CIGAR runs past the end of seq
      id += i
    end
  end

  return str, indel
end
41
+
42
# Walks the alignments that sam.fetch returns for chr:chr_start-chr_stop,
# builds a base pileup position by position and returns the variant rows.
#
# sam       - opened alignment reader; fetch_reference, load_index and fetch
#             are called on it (call_variance passes a Bio::DB::Sam).
# chr       - reference sequence name.
# chr_start - first position of the region.
# chr_stop  - last position of the region (the reference is fetched 1000
#             bases further).
# debug     - nil, or a '|'-separated list of the words 'show' (print the
#             pileup), 'indel' (print indel details) and 'var' (print each
#             reported position).
#
# Returns an Array of rows
#   [chr, pos, ref, major allele, alt allele, major count, alt count,
#    mean major quality, mean alt quality, flag]
# where flag is 'a' when the alt fraction is above 0.1, 'A' otherwise, and
# '+' / '-' when an insertion / deletion recorded at that position is reported.
#
# NOTE(review): the pileup is only advanced when an alignment with a new start
# position arrives, so positions from the last alignment's start onwards are
# never evaluated.
+ def self.call_sam_snv sam,chr,chr_start,chr_stop, debug=nil
43
+
44
# Decode the debug switches; s.index returns nil (falsy) for absent words.
+ unless debug
45
+ show = false
46
+ indel_show = false
47
+ var_show = false
48
+ else
49
+ s = debug.split('|')
50
+ show = s.index('show')
51
+ indel_show = s.index('indel')
52
+ var_show = s.index('var')
53
+ end
54
+
55
+
56
+ l = nil
57
+ snps = []
58
+ indels = []
59
# position => first indel hash seen there, with a running :count
+ indels_map = {}
60
+
61
+ puts "#{chr} #{chr_start} - #{chr_stop}"
62
+ seq = sam.fetch_reference(chr,chr_start,chr_stop+1000)
63
+ sam.load_index
64
+
65
+ upseq = seq.upcase
66
+
67
+
68
+
69
+ als = []
70
+
71
+
72
+ als = sam.fetch(chr, chr_start, chr_stop)
73
+
74
+ profile = []
75
# reads currently overlapping the position being piled up
+ list = []
76
# next position to pile up; nil until the first alignment is seen
+ start = nil
77
+ alleles = {'A'=>0,'C'=>1,'G'=>2,'T'=>3}
78
+ inv_alleles = {0=>'A',1=>'C',2=>'G',3=>'T'}
79
+
80
+
81
+ als.each{|i|
82
+
83
+ # puts "#{i.pos}"
84
+
85
+ unless start
86
+ start = i.pos
87
+ end
88
+
89
+ if i.pos!=start
90
+
91
# Pile up every position between the previous alignment start and this one.
+ while start<i.pos
92
+ del = []
93
+
94
+ s = ''
95
+ ref = seq[start-chr_start]
96
+ print "#{start}\t#{ref}\t#{list.size}\t" if show
97
+ ref = ref.upcase
98
+
99
+
100
# p counts reads per base (A,C,G,T); q sums their quality values.
+ p = [0,0,0,0]
101
+ q = [0,0,0,0]
102
+
103
+ list.each_with_index do|l,index|
104
+
105
+
106
+ c = l.seq[start-l.pos]
107
+ cq = l.qual[start-l.pos]
108
+
109
+ # a = seq[start-1..start-1-5]
110
+ # b =seq[start-l.pos..start-l.pos-5]
111
+ # puts "#{a} - #{b}"
112
+
113
+ if c
114
+
115
# cq-33: assumes Phred+33 encoded qualities -- TODO confirm
+ case c
116
+ when 'A'
117
+ p[0]+=1
118
+ q[0]+=cq-33 if cq
119
+ when 'C'
120
+ p[1]+=1
121
+ q[1]+=cq-33 if cq
122
+ when 'G'
123
+ p[2]+=1
124
+ q[2]+=cq-33 if cq
125
+ when 'T'
126
+ p[3]+=1
127
+ q[3]+=cq-33 if cq
128
+ end
129
+
130
+ s+=c
131
+
132
+ print "#{c}" if show
133
+
134
+ else
135
# the read has no base at this position: retire it after this column
+ del<<l
136
+ end
137
+
138
+
139
+ end
140
+
141
+
142
+
143
# Most frequent base, then (after zeroing it) the second most frequent one.
+ pmax = p.index(p.max)
144
+ max = p[pmax]
145
+ p[pmax] = 0 if pmax
146
+ palt = p.index(p.max)
147
+ alt = p[palt]
148
+ dep = max+alt
149
+ max_q = q[pmax].to_f/max
150
+ alt_q = '0'
151
+ alt_q = q[palt].to_f/alt if alt > 0
152
+
153
+
154
# Report when depth > 2 and either an indel was recorded here more than once,
# the alt allele exceeds 10% with mean quality above 10, or the reference base
# is not the most frequent base.
+ if dep > 2 and ((indel = indels_map[start] and indel[:count]>1 ) or (alt>0 and alt/dep.to_f > 0.1 and alt_q >10) or alleles[ref]!=pmax )
155
+
156
+
157
+
158
+ text = "#{start}\t#{max}/#{alt.to_i}\t#{ref}\t#{s}"
159
+ puts text if var_show
160
+ sum = (max+alt).to_f
161
+ aalt = '-'
162
+
163
+ aalt = inv_alleles[palt] if alt!=0
164
+ if (alt>0 and alt/dep.to_f > 0.1)
165
+ c = 'a'
166
+ else
167
+ c = 'A'
168
+ end
169
+
170
+
171
# An indel recorded at this position replaces ref/alt/count in the row.
+ if indel
172
+ s = indel[:alleles].split('/')
173
+ ref = s[0]
174
+ aalt = s[1]
175
+ alt = indel[:count]
176
+ if indel[:type]=='ins'
177
+ c='+'
178
+ else
179
+ c='-'
180
+ end
181
+ end
182
+ snps << [chr,start,ref,inv_alleles[pmax],aalt,max,alt,format('%.2f',max_q),format('%.2f',alt_q),c]
183
+
184
+ # snps << [chr,start,ref,inv_alleles[pmax],inv_alleles[palt],max,alt,max/sum,alt/sum]
185
+
186
+ end
187
+ list-=del
188
# With no reads left, jump straight to the new alignment's start.
+ if list.size==0
189
+ start = i.pos
190
+ else
191
+ start+=1
192
+ end
193
+
194
+
195
+ puts if show
196
+ end
197
+
198
+ start = i.pos
199
+ end
200
+
201
+
202
+
203
+ seq_size = i.seq.size
204
+
205
# form returns [projected string, indel list] for bases and qualities alike.
+ tseq = form(i.seq,i.cigar)
206
+ tqual = form(i.qual,i.cigar)
207
+
208
+
209
+ if (i.cigar.index('I') or i.cigar.index('D') )
210
+
211
+
212
+ if indel_show
213
+ puts "-------------------------INDEL "
214
+ puts i.pos
215
+ puts i.cigar
216
+ puts i.seq
217
+ puts i.qual
218
+ # puts i.inspect
219
+
220
+ # :qname, :flag, :rname,:pos,:mapq,:cigar, :mrnm, :mpos, :isize, :seq, :qual, :tags, :al, :samstr
221
+
222
+ puts seq[i.pos-chr_start..i.pos-chr_start+i.seq.size]
223
+ puts tseq[0]
224
+ puts tqual[0]
225
+ puts tseq[1]
226
+ end
227
+
228
# Register each indel of this read: build its "ref/alt" allele string, shift
# :pos by the read position and count repeats per position in indels_map.
+ if tseq[1].size>0
229
+ for y in tseq[1]
230
+ if y[:type]=='ins'
231
+ iref = '-'
232
+ iseq = i.seq[y[:pos]..y[:pos]+y[:size]-1]
233
+ y[:alleles] = "#{iref}/#{iseq}"
234
+ else
235
+ iref = seq[i.pos-chr_start+y[:pos]..i.pos-chr_start+y[:pos]+y[:size]-1]
236
+ iseq = '-'
237
+ y[:alleles] = "#{iref}/#{iseq}"
238
+ end
239
+ y[:pos]+=i.pos
240
+ indels<<y
241
+ unless indels_map[y[:pos]]
242
+ y[:count]=1
243
+ y[:sm]=y[:alleles]
244
+ indels_map[y[:pos]] = y
245
+ else
246
+ indels_map[y[:pos]][:sm]+=",#{y[:alleles]}"
247
+
248
+ indels_map[y[:pos]][:count] +=1
249
+ end
250
+
251
+ end
252
+ end
253
+
254
+
255
+ end
256
+
257
+
258
+
259
+
260
+
261
# From here on the alignment carries the projected bases and the qualities as
# an array of byte values.
+ i.seq = tseq[0]
262
+ i.qual = tqual[0].bytes.to_a
263
+
264
+
265
# Keep the read only if more than half of it survived the projection and at
# least 95% of its bases equal the upper-cased reference.
+ if i.seq.size > seq_size/2
266
+
267
+ si = 0
268
+
269
+ i.seq.size.times do |k|
270
+ # puts "#{i.seq[k]} #{seq[start-chr_start+k]}"
271
+ si+=1 if i.seq[k]==upseq[start-chr_start+k]
272
+ end
273
+ # puts "#{si} #{i.seq.size}"
274
+ if (si.to_f/i.seq.size)>=0.95
275
+ list << i
276
+ else
277
+ if indel_show
278
+ puts "Drop with low similarity #{si.to_f/i.seq.size} #{i.cigar}"
279
+ puts seq[start-chr_start..start-chr_start+100]
280
+ puts i.seq
281
+ end
282
+ end
283
+ end
284
+
285
+ l = i
286
+
287
+
288
+ }
289
+
290
+
291
+ if indel_show
292
+
293
+ for i in indels_map.keys.sort
294
+ puts indels_map[i].inspect
295
+ end
296
+ end
297
+ # puts l.cigar
298
+ # puts l.seq
299
+ # puts snps.size
300
+ return snps
301
+
302
+ end
303
+
304
# Calls variants for one region and appends them, tab-separated and one row
# per line, to the shared output file.
#
# p - Hash with Symbol keys:
#     :sam    - BAM file path
#     :ref    - reference FASTA path
#     :chr, :start, :stop - region handed to call_sam_snv
#     :output - file to append to
#     :debug  - optional debug switches for call_sam_snv
#
# process_vc_multicore runs this in several forked processes that append to
# the same file, so the rows of a region are written with a single write
# under an exclusive lock to keep them from interleaving.
#
# Returns nil.
def self.call_variance p
  sam = Bio::DB::Sam.new({:bam=>p[:sam], :fasta=>p[:ref]})
  sam.open
  begin
    snps = call_sam_snv sam, p[:chr], p[:start], p[:stop], p[:debug]
  ensure
    # Release the reader even when calling fails.
    sam.close
  end

  # `puts []` writes a bare newline, which used to add one blank line to the
  # output for every region without variants.
  return if snps.empty?

  text = snps.collect{|j| j.join("\t")}.join("\n") + "\n"
  File.open(p[:output], 'a') do |f|
    f.flock(File::LOCK_EX)
    f.write(text)
  end
  nil
end
324
+
325
+
326
+
327
# Measures the sequences of a FASTA file.
#
# testReference - path of the FASTA file.
#
# A line starting with '>' opens a new sequence named after the first
# whitespace-separated word of the header, without the '>'. The stripped
# lengths of all following lines are added to that sequence. Each completed
# sequence is printed as "<name>\t<size>" and the final map is printed with
# inspect.
#
# Returns a Hash { name => length }. Residue lines before the first header are
# not counted, and a file without any header yields an empty Hash (it used to
# yield a nil key).
def self.inspect_reference testReference
  map = {}
  chr = nil
  size = 0

  # File.foreach closes the file even if a line cannot be processed.
  File.foreach(testReference) do |str|
    if str[0..0] == '>'
      if chr
        puts "#{chr}\t#{size}"
        map[chr] = size
      end
      chr = str.strip.split[0][1..-1]
      size = 0
    else
      size += str.strip.size
    end
  end

  map[chr] = size if chr
  puts map.inspect
  map
end
355
+
356
# Returns a fixed table of human chromosome lengths without reading any file.
# NOTE(review): the values look like hg19 / GRCh37 -- confirm.
#
# testReference - ignored; kept so the method can stand in for
#                 inspect_reference.
#
# Returns a new Hash { "chrN" => length } on every call. The key order is the
# order in which process_vc_multicore generates its jobs.
def self.inspect_reference_human testReference
  sizes = [
    ["chr1", 249250621],
    ["chr10", 135534747],
    ["chr11", 135006516],
    ["chr12", 133851895],
    ["chr13", 115169878],
    ["chr14", 107349540],
    ["chr15", 102531392],
    ["chr16", 90354753],
    ["chr17", 81195210],
    ["chr18", 78077248],
    ["chr19", 59128983],
    ["chr2", 243199373],
    ["chr20", 63025520],
    ["chr21", 48129895],
    ["chr22", 51304566],
    ["chr3", 198022430],
    ["chr4", 191154276],
    ["chr5", 180915260],
    ["chr6", 171115067],
    ["chr7", 159138663],
    ["chr8", 146364022],
    ["chr9", 141213431],
    ["chrX", 155270560],
    ["chrY", 59373566]
  ]
  Hash[sizes]
end
384
+
385
+
386
# Calls variants over the whole genome (or one chromosome) in 1,000,000-base
# chunks, running up to `core` forked call_variance workers at a time. All
# workers append to params[:output], which is truncated first.
#
# params     - Hash with Symbol keys :sam, :ref and :output; it is passed on
#              to every worker with :chr, :start and :stop added. The caller's
#              hash is not modified.
# core       - maximum number of simultaneous worker processes (values below 1
#              are treated as 1).
# chr_filter - optional chromosome name without the "chr" prefix (e.g. "1" or
#              "X"); restricts the run to that chromosome.
#
# Chunks are 1-based and do not overlap: [1, 1000000], [1000001, 2000000], ...
# with a final shorter chunk ending at the chromosome length. (The final chunk
# used to start one base early, so its first position was called twice.)
#
# Raises ArgumentError when chr_filter names an unknown chromosome.
# Returns the result of Process.waitall.
def self.process_vc_multicore params, core=1,chr_filter=nil
  output = params[:output]
  genome = inspect_reference_human(params[:ref])

  if chr_filter # 1 as text
    chr = "chr#{chr_filter}"
    raise ArgumentError, "unknown chromosome #{chr}" unless genome.key?(chr)
    genome = {chr => genome[chr]}
  end

  puts genome.inspect

  # reset output
  File.open(output, 'w') {}

  # job generator
  chunk = 1000000
  jobs = []
  genome.each_pair do |name, size|
    puts "#{name}\t#{size}"
    start = 1
    while start <= size
      stop = [start + chunk - 1, size].min
      jobs << params.merge(:chr => name, :start => start, :stop => stop)
      start = stop + 1
    end
  end

  # At least one worker is needed, otherwise the wait below has no child to
  # wait for.
  sim = [[core, 1].max, jobs.size].min

  # starting first N processes
  sim.times do
    job = jobs.pop
    Process.fork{call_variance(job)}
  end
  # start one by one as the previous finish
  jobs.each do |job|
    Process.wait(0)
    Process.fork{call_variance(job)}
  end
  # wait for all to finish
  Process.waitall
end
452
+
453
+
454
+
455
# 1. VC calling
#
# Entry point of the "snv" command: runs the caller over the whole genome.
#
# params - Hash with String keys:
#          'sam'    - BAM file path
#          'ref'    - reference FASTA path
#          'vc_out' - raw variant output path
#          'core'   - optional number of simultaneous processes (default 4)
#
# Returns whatever process_vc_multicore returns.
def self.call_snv params
  core = params['core'] ? params['core'].to_i : 4
  job = {:sam=>params['sam'], :ref=>params['ref'], :output=>params['vc_out']}
  process_vc_multicore job, core
end
474
+
475
+
476
# 1. VC calling (single region)
#
# Entry point of the "snv-range" command: calls one region in the current
# process with indel debugging switched on, after printing the job description.
#
# params - Hash with String keys:
#          'sam', 'ref', 'vc_out' - BAM, reference FASTA and raw output paths
#          'chr'                  - chromosome name without the "chr" prefix
#          'start', 'stop'        - region bounds (converted with to_i)
#
# Returns whatever call_variance returns.
def self.call_snv_range params
  region = {
    :sam    => params['sam'],
    :ref    => params['ref'],
    :output => params['vc_out'],
    :chr    => "chr#{params['chr']}",
    :start  => params['start'].to_i,
    :stop   => params['stop'].to_i,
    :debug  => 'indel'
  }
  puts region.inspect

  call_variance region
end
491
+
492
+ end
493
+ end
@@ -0,0 +1,28 @@
1
+ require 'vardetect-vc/call_snv'
2
+ require 'vardetect-vc/call_prepare'
3
+ require 'vardetect-vc/call_combine'
4
+
5
+ module Vardetect
6
+ module Vc
7
+
8
+
9
# Parses a command line of the form "<cmd> -key value -key value ...".
#
# params - either the command line as one String (split on whitespace) or an
#          already tokenised Array such as ARGV. Passing the Array keeps
#          values that contain spaces intact.
#
# The input is echoed to stdout. Each flag loses its first character (the
# leading '-') and becomes a String key; a trailing flag without a value is
# ignored.
#
# Returns a Hash of String keys to String values plus the command under the
# Symbol key :cmd (nil when the input is empty).
def self.parse_params params
  puts params
  tokens = params.is_a?(Array) ? params : params.split

  parsed = {}
  (tokens[1..-1] || []).each_slice(2) do |flag, value|
    next if value.nil?
    parsed[flag[1..-1]] = value
  end
  parsed[:cmd] = tokens[0]
  parsed
end
23
+
24
+ end
25
+
26
+ end
27
+
28
+
@@ -1,5 +1,5 @@
1
1
  module Vardetect
2
2
  module Vc
3
- VERSION = "0.0.4"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vardetect-vc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -78,7 +78,9 @@ dependencies:
78
78
  description: A variance calling tool for NGS
79
79
  email:
80
80
  - supasak.kul@biotec.or.th
81
- executables: []
81
+ executables:
82
+ - vardetect_vc
83
+ - vardetect_vc_batch
82
84
  extensions: []
83
85
  extra_rdoc_files: []
84
86
  files:
@@ -87,7 +89,13 @@ files:
87
89
  - LICENSE.txt
88
90
  - README.md
89
91
  - Rakefile
92
+ - bin/vardetect_vc
93
+ - bin/vardetect_vc_batch
90
94
  - lib/vardetect-vc.rb
95
+ - lib/vardetect-vc/call_combine.rb
96
+ - lib/vardetect-vc/call_prepare.rb
97
+ - lib/vardetect-vc/call_snv.rb
98
+ - lib/vardetect-vc/lib.rb
91
99
  - lib/vardetect-vc/version.rb
92
100
  - vardetect-vc.gemspec
93
101
  homepage: ''