vardetect-vc 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'vardetect-vc'
4
+ require "vardetect-vc/lib"
5
+
6
+
7
# Batch driver for one sample: derives every input/output location from the
# sample id and the data root, then launches the whole-genome SNV calling step.
sample_id = 'G3418'
working = 'vcqa'
path = '/Users/soup/Desktop/chula/data'

work_dir = File.join(path, working)
sample_dir = File.join(work_dir, sample_id)

# Reference genome and raw inputs.
ref_file = File.join(path, 'ref', 'hg19.fa')
snp_array_file = File.join(work_dir, "#{sample_id}_gt_report.txt")
sam_tool_file = File.join(work_dir, "#{sample_id}_SNP_Indel_ANNO.csv")
sam_file = File.join(work_dir, 'bam', "#{sample_id}.remdup.uniqMap.TS.bam")
vc_raw_file = File.join(work_dir, 'snv-vc', "#{sample_id}_vc_raw_snv.tsv")

# Prepared per-source tables.
st_file = File.join(sample_dir, 'st_snv.tsv')
vc_file = File.join(sample_dir, 'vc_snv.csv')
snp_file = File.join(sample_dir, 'snpar_snv.csv')

# Filtered per-source tables.
st_ft_file = File.join(sample_dir, 'st_filtered_snv.csv')
vc_ft_file = File.join(sample_dir, 'vc_filtered_snv.csv')
snp_ft_file = File.join(sample_dir, 'snpar_filtered_snv.csv')

# Comparison tables and summaries.
com_vc_file = File.join(sample_dir, 'vc_compared.csv')
com_st_file = File.join(sample_dir, 'st_compared.csv')
com_all_file = File.join(sample_dir, 'all_compared.csv')
combine_all_file = File.join(sample_dir, 'combine_all.csv')
analysed_file = File.join(sample_dir, 'summary.csv')

# 1. call vc
# Alternative (single region, with debugging):
#   snv-range -ref <ref> -sam <bam> -vc_out <out> -core 8 -chr 1 -start 168665645 -stop 170706648 -debug true
# Other coordinates kept from the original notes: 156553003, 170706648
params = ['snv', '-ref', ref_file, '-sam', sam_file, '-vc_out', vc_raw_file, '-core', '8']

Vardetect::Vc.hi
Vardetect::Vc.exec(params)
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'vardetect-vc'
4
+ require "vardetect-vc/lib"
5
+
6
+
7
+
8
# Command-line entry point: call Vardetect::Vc.hi, then hand the raw
# command-line arguments to the library dispatcher.
Vardetect::Vc.hi
Vardetect::Vc.exec(ARGV)
@@ -0,0 +1,367 @@
1
+ module Vardetect
2
+ module Vc
3
+
4
+
5
# 4. All comparison
#
# Joins the prepared VC, SamTools and SNP-array tables on their first column
# (the position key) and writes one CSV row for every VC call whose key is
# also present in the SNP-array table.
#
# params - Hash with String keys:
#   'vc_out'  - prepared VC table (read)
#   'st_out'  - prepared SamTools table (read)
#   'snp_out' - prepared SNP-array table (read)
#   'com_out' - comparison CSV (written; truncated first)
#
# Every output row is
#   snp_check,vc_check,st_check,<VC columns>,<SNP-array columns>,<SamTools columns 4..>
# where
#   snp_check - the array genotype is not "-/-"
#   vc_check  - snp_check and the VC het/hom call equals the array het/hom call
#   st_check  - snp_check and the SamTools het flag (column 5 == 'het') equals
#               the array het/hom call
# A VC call is treated as heterozygous when alt / (ref + alt) >= 0.15; a call
# with zero depth gets ratio 0.0 instead of NaN.
#
# Progress lines and the final counters are printed to stdout. Returns nil.
def self.call_combine_all params
  vc_file = params['vc_out']
  st_file = params['st_out']
  snp_file = params['snp_out']
  com_out = params['com_out']

  # Loads a comma separated table into a Hash keyed by its first column.
  # File.foreach closes the handle even when a line raises.
  index_by_key = lambda do |path|
    table = {}
    File.foreach(path) do |line|
      fields = line.strip.split(',')
      table[fields[0]] = fields
    end
    table
  end

  puts "Indexing SNP Array SNV"
  map_snp = index_by_key.call(snp_file)
  puts map_snp.keys[0..10]

  puts "Indexing SamTools SNV"
  map_st = index_by_key.call(st_file)
  puts map_st.keys[0..10]

  puts "Indexing VC SNV"
  # Strip before splitting so the last column never carries the line ending.
  map_vc = File.foreach(vc_file).map { |line| line.strip.split(',') }
  puts map_vc.size

  all = 0
  vc_het_rate = 0
  st_het_rate = 0

  File.open(com_out, 'w') do |com_output|
    map_vc.each do |row|
      pos = row[0]
      next if pos.nil? || pos.empty? # blank input line
      snp = map_snp[pos]
      next unless snp

      vc_alt = row[5].to_i
      vc_dep = row[4].to_i + vc_alt
      vc_het_ratio = vc_dep.zero? ? 0.0 : vc_alt.to_f / vc_dep
      vc_is_het = vc_het_ratio >= 0.15
      vc_alt_a = vc_is_het ? row[3] : '-'

      snp_a = "#{snp[2]}/#{snp[3]}"
      snp_is_het = snp[2] != snp[3]
      snp_check = snp_a.strip != '-/-'
      all += 1 if snp_check

      str = "#{row[0]},#{row[1]},#{row[2]},#{row[2]}/#{vc_alt_a},#{vc_is_het ? 'het' : 'hom'},#{vc_dep},#{vc_alt},#{vc_het_ratio},"
      str += "#{snp[1]},#{snp_a},#{snp_is_het ? 'het' : 'hom'},#{snp[4]},#{snp[5]},"

      st_check = false
      if (st = map_st[pos])
        # Array() guards against SamTools rows with fewer than four columns.
        str += Array(st[4..-1]).join(',')
        st_check = snp_check && ((st[5] == 'het') == snp_is_het)
        st_het_rate += 1 if st_check
      end

      vc_check = snp_check && (vc_is_het == snp_is_het)
      vc_het_rate += 1 if vc_check

      com_output.puts [snp_check, vc_check, st_check, str].join(',')
    end
  end

  puts "All #{all} is not -/-"
  puts "VC Hetero Rate : #{vc_het_rate}"
  puts "ST Hetero Rate : #{st_het_rate}"
end
167
+
168
+
169
# Summarises a combined comparison table (as written by call_combine_all) and
# renders an HTML report from the ERB template 'index.html.erb'.
#
# params - Hash with String keys:
#   'com_out' - combined comparison CSV (read)
#   'out'     - summary CSV (written); the HTML report goes to "<out>.html"
#
# Steps:
#   1. keep rows whose SNP-array GC score is above 0.75,
#   2. split them into array-het / array-hom,
#   3. cross them with the VC alt ratio (>= 0.1 counts as a VC het call),
#   4. write the counters to the summary file,
#   5. render the template against this method's binding and try to open the
#      result with the `open` command.
#
# NOTE: the template is looked up relative to the current working directory and
# is evaluated with `binding`, so the local variable names below (list,
# all_list, het_list, hom_list, vc_het_het, vc_het_hom, vc_hom_hom, vc_hom_het,
# idx, ...) are part of the template contract and must not be renamed.
#
# Returns nil.
def self.analyse_combine params
  require 'erb'

  # index columns
  idx = {}
  %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}.each_with_index do |name, index|
    idx[name.to_sym] = index
  end

  # constant
  snp_filter = 0.75
  alt_ratio = 0.1

  gcid = idx[:snp_gc]
  hid = idx[:snp_het]
  depid = idx[:vc_dep]
  vchid = idx[:vc_het]
  vcrid = idx[:vc_r]

  combine_file = params['com_out']
  analyse_file = params['out']
  html_file = analyse_file + ".html"

  all_list = []
  list = []
  het_list = []
  hom_list = []

  vc_het_het = []
  vc_het_hom = []
  vc_hom_hom = []
  vc_hom_het = []

  f_list = out = outtmp = nil
  begin
    f_list = File.open(combine_file, 'r')
    out = File.open(analyse_file, 'w')

    # filter all by gc score snp array
    while str = f_list.gets
      v = str.strip.split(',')
      all_list << v
      list << v if v[gcid].to_f > snp_filter
    end

    # split the filtered set by the SNP-array het/hom call
    list.each do |row|
      if row[hid] == 'het'
        het_list << row
      else
        hom_list << row
      end
    end

    out.puts "SNP Array overlap data "
    out.puts "Filter GC score at,#{snp_filter}"
    out.puts "Total snv,#{all_list.size}"
    out.puts "Total snv not pass filter,#{all_list.size-list.size}"
    out.puts "Total hetero snv,#{het_list.size}"
    out.puts "Total homo snv,#{hom_list.size}"

    # array call (first word) versus VC call (second word)
    list.each do |row|
      vc_is_het = row[vcrid].to_f >= alt_ratio
      if row[hid] == 'het'
        (vc_is_het ? vc_het_het : vc_het_hom) << row
      else
        (vc_is_het ? vc_hom_het : vc_hom_hom) << row
      end
    end

    outtmp = File.open(html_file, 'w')
    template = ERB.new(File.read('index.html.erb'))
    outtmp.puts template.result(binding)
  ensure
    # Close whatever was opened, also when parsing or rendering raised.
    [outtmp, out, f_list].each { |io| io.close if io && !io.closed? }
  end

  # Argument-vector form: no shell is involved, so the path cannot be
  # interpreted as shell syntax, and a missing `open` binary yields nil
  # instead of raising.
  system('open', html_file)
  nil
end
277
+
278
+
279
# Returns the zero-based column position of +id+ (a Symbol) in a combined
# comparison row, or nil when the name is not a known column.
def self.getidx id
  columns = %w{snp vc st pos ref vc_a1 vc_a vc_het vc_dep vc_alt vc_r snp_ann snp_a snp_het snp_gc snp_clus st_ref st_a2 st_qua st_dep st_alt st_pos}
  columns.map { |name| name.to_sym }.index(id)
end
288
+
289
+
290
# Builds histogram bins and scatter points from combined comparison rows.
#
# list - Array of rows (Arrays of Strings) laid out as described by getidx.
#
# Returns a Hash:
#   :gc  - counts per GC-score bin   (index = (score * 100 / 1).to_i)
#   :dep - counts per VC depth bin   (index = depth * 100 / 500, integer division)
#   :alt - counts per alt-ratio bin  (index = (ratio * 100 / 0.5).to_i)
#   :dot - "[depth,ratio]" strings, one per row
# Bins that were never hit stay nil.
def self.sampling list
  gc_col = getidx :snp_gc
  depth_col = getidx :vc_dep
  ratio_col = getidx :vc_r

  # Increments bins[slot], creating the counter on first use.
  bump = lambda do |bins, slot|
    bins[slot] = 0 unless bins[slot]
    bins[slot] += 1
  end

  # GC score histogram
  gc_bin = []
  gc_range = 100
  gc_max = 1
  list.each { |row| bump.call(gc_bin, (row[gc_col].to_f * gc_range / gc_max).to_i) }

  # depth histogram
  d_bin = []
  d_range = 100
  d_max = 500
  list.each { |row| bump.call(d_bin, (row[depth_col].to_i * d_range / d_max).to_i) }

  # alt-ratio histogram
  alt_bin = []
  alt_range = 100
  alt_max = 0.5
  list.each { |row| bump.call(alt_bin, (row[ratio_col].to_f * alt_range / alt_max).to_i) }

  # depth / ratio scatter points
  dot = list.map { |row| "[#{row[depth_col].to_f},#{row[ratio_col].to_f}]" }

  {:gc=>gc_bin, :dep=>d_bin, :alt=>alt_bin, :dot=>dot}
end
364
+
365
+
366
+ end
367
+ end
@@ -0,0 +1,122 @@
1
+
2
+ module Vardetect
3
+ module Vc
4
+
5
+
6
# 2. ST preparing filtered chromosome
#
# Re-keys a SamTools annotation CSV so it can be joined with the other tables.
#
# params - Hash with String keys:
#   'st'     - SamTools CSV (read); its first row is a header and is skipped
#   'st_out' - prepared table (written; truncated first)
#
# Every data row is written as "<key>,<original columns>", where the key is
# "<chromosome>-<column 1>". The chromosome is column 0 without its first
# three characters; a numeric remainder is normalised through to_i
# (e.g. "chr01" -> 1), anything else is kept as text (e.g. "chrX" -> "X").
# Rows without a first column (blank lines) are skipped.
#
# Returns nil.
def self.call_prepare_st params
  # Loaded here because nothing visible in this gem requires it elsewhere.
  require 'csv'

  sam_tool_file = params['st']
  snp_file = params['st_out']

  # Block form closes the output even when CSV parsing raises.
  File.open(snp_file, 'w') do |st_output|
    header_skipped = false
    CSV.foreach(sam_tool_file) do |row|
      unless header_skipped
        header_skipped = true
        next
      end
      next if row.empty? || row[0].nil?

      name = row[0][3..-1]
      id = name.to_i
      id = name if id == 0
      st_output.puts "#{id}-#{row[1]},#{row.join(',')}"
    end
  end
  nil
end
34
+
35
# 3. SNP preparing filtered chromosome
#
# Extracts the data section of a SNP-array genotype report into a keyed CSV.
#
# params - Hash with String keys:
#   'snp'     - genotype report (read)
#   'snp_out' - prepared table (written; truncated first)
#
# Everything up to and including the first "[Data]" line is ignored. After
# that, the first line whose first whitespace-separated token is "Sample" is
# treated as the column header and skipped. Every other non-blank line is
# split on whitespace and stored under "<token 1>-<token 2>"; a repeated key
# keeps the last line seen. The output holds one row per key, sorted by key
# as text: "<key>,<tokens 3..>".
#
# Returns nil.
def self.call_prepare_snp params
  snp_array_file = params['snp']
  snp_file = params['snp_out']

  in_data = false
  header_seen = false
  map = {}

  # File.foreach closes the report once the last line has been consumed.
  File.foreach(snp_array_file) do |line|
    line = line.strip
    unless in_data
      in_data = true if line == '[Data]'
      next
    end

    s = line.split
    next if s.empty? # blank line inside the data section

    if s[0] == 'Sample' && !header_seen
      header_seen = true
    else
      map["#{s[1]}-#{s[2]}"] = s
    end
  end

  File.open(snp_file, 'w') do |output|
    map.keys.sort.each do |k|
      # Array() tolerates rows with fewer than four tokens.
      output.puts "#{k}," + Array(map[k][3..-1]).join(',')
    end
  end
  nil
end
80
+
81
# Filters the raw VC calls and re-keys them for joining.
#
# params - Hash with String keys:
#   'vc_raw' - raw whitespace-separated VC table (read)
#   'vc_out' - prepared table (written; truncated first)
#
# A raw row is kept when its alt ratio, column 6 / (column 5 + column 6), is
# above 0.1, or when columns 2 and 3 differ. Kept rows are stored under
# "<column 0 without its first three characters>-<column 1>" (a repeated key
# keeps the last row) and written sorted by key as text:
# "<key>,<columns 2..>".
#
# Returns nil.
def self.call_prepare_vc params
  vc_raw_file = params['vc_raw']
  vc_file = params['vc_out']

  map = {}
  # File.foreach closes the input once the last line has been consumed.
  File.foreach(vc_raw_file) do |line|
    line = line.strip
    s = line.split
    next if s.empty?

    begin
      alt = s[6].to_i
      dep = s[5].to_i + alt
      # dep > 0 keeps a zero-depth row from producing a NaN comparison.
      if (dep > 0 && alt.to_f / dep > 0.1) || s[2] != s[3]
        map["#{s[0][3..-1]}-#{s[1]}"] = s
      end
    rescue StandardError
      # Best effort: echo the offending line and carry on with the next one.
      puts line
    end
  end

  File.open(vc_file, 'w') do |output|
    map.keys.sort.each do |k|
      output.puts "#{k}," + map[k][2..-1].join(',')
    end
  end
  nil
end
118
+
119
+
120
+
121
+ end
122
+ end
@@ -0,0 +1,493 @@
1
+ module Vardetect
2
+ module Vc
3
+
4
+
5
+
6
# Projects a read string (bases or qualities) onto the reference by walking a
# CIGAR string.
#
# seq   - String of per-base characters of the read.
# cigar - CIGAR String, e.g. "5S20M2I10M1D8M".
#
# Handling per operation (length n):
#   S        - n read characters are skipped (soft clip); n is remembered as
#              the clip length
#   M, =, X  - n read characters are copied
#   I        - n read characters are skipped and an 'ins' record is emitted
#   D        - n 'N' placeholders are appended and a 'del' record is emitted
#   N        - n 'N' placeholders are appended (skipped reference region)
#   H, P     - ignored; they consume neither read nor reference
#
# Returns [projected, indels]. Each indel record is a Hash with
#   :pos   - read index at the event minus the last soft-clip length seen
#   :type  - 'ins' or 'del'
#   :size  - length of the event
#   :cigar - the full CIGAR string
#
# NOTE(review): :pos is derived from the read index, so after an earlier
# insertion or deletion in the same read it no longer equals the reference
# offset. Callers treat it as both - confirm before relying on it for reads
# with several events.
def self.form seq, cigar
  projected = ''
  indel = []
  read_index = 0
  clip = 0

  # Scanning <length><op> pairs keeps lengths and operations aligned for
  # every operation in the SAM alphabet, not only S/M/I/D.
  cigar.scan(/(\d+)([MIDNSHP=X])/) do |length, op|
    n = length.to_i
    case op
    when 'S'
      clip = n
      read_index += n
    when 'I'
      indel << {:pos=>read_index-clip, :type=>'ins', :size=>n, :cigar=>cigar}
      read_index += n
    when 'D'
      projected += 'N' * n
      indel << {:pos=>read_index-clip, :type=>'del', :size=>n, :cigar=>cigar}
    when 'N'
      projected += 'N' * n
    when 'M', '=', 'X'
      piece = seq[read_index, n]
      projected += piece if piece
      read_index += n
    end
  end

  return projected, indel
end
41
+
42
# Walks the reads of one region in the order returned by sam.fetch and emits
# a row for every position that looks variant.
#
# sam       - object responding to fetch_reference(chr, start, stop),
#             load_index and fetch(chr, start, stop); fetched reads respond
#             to pos, seq, qual, cigar, seq= and qual=
# chr       - reference sequence name
# chr_start - first position of the region
# chr_stop  - last position of the region
# debug     - nil, or a '|' separated list of 'show', 'indel' and 'var' that
#             switches on the corresponding trace output
#
# Returns an Array of rows:
#   [chr, position, ref, major allele, alt allele, major count, alt count,
#    mean major quality, mean alt quality, tag]
# where tag is 'a' (alt fraction above 0.1), 'A' (otherwise), '+' (insertion)
# or '-' (deletion).
#
# NOTE(review): positions are only evaluated while a later-starting read
# arrives, so positions from the start of the last read onwards are never
# evaluated - confirm this is intended.
# NOTE(review): the reference is indexed with (position - chr_start); for a
# read that starts before chr_start this index is negative and wraps to the
# end of the fetched string - confirm fetch never returns such reads.
def self.call_sam_snv sam,chr,chr_start,chr_stop, debug=nil
  flags = debug ? debug.split('|') : []
  show = debug ? flags.index('show') : false
  indel_show = debug ? flags.index('indel') : false
  var_show = debug ? flags.index('var') : false

  snps = []
  indels_map = {}

  puts "#{chr} #{chr_start} - #{chr_stop}"
  seq = sam.fetch_reference(chr,chr_start,chr_stop+1000)
  sam.load_index
  upseq = seq.upcase

  alleles = {'A'=>0,'C'=>1,'G'=>2,'T'=>3}
  inv_alleles = {0=>'A',1=>'C',2=>'G',3=>'T'}

  list = []    # reads currently covering the cursor
  start = nil  # cursor: next position to evaluate

  sam.fetch(chr, chr_start, chr_stop).each do |read|
    start = read.pos unless start

    if read.pos != start
      # Evaluate every position between the cursor and the new read's start.
      while start < read.pos
        exhausted = []
        column = ''
        ref = seq[start-chr_start]
        print "#{start}\t#{ref}\t#{list.size}\t" if show
        ref = ref.upcase

        counts = [0,0,0,0]
        quals = [0,0,0,0]

        list.each do |aln|
          offset = start - aln.pos
          base = aln.seq[offset]
          base_q = aln.qual[offset]
          if base
            slot = alleles[base]
            if slot
              counts[slot] += 1
              quals[slot] += base_q - 33 if base_q
            end
            column += base
            print "#{base}" if show
          else
            # the read ends before this position
            exhausted << aln
          end
        end

        # Most frequent allele first, then the runner-up.
        pmax = counts.index(counts.max)
        max = counts[pmax]
        counts[pmax] = 0
        palt = counts.index(counts.max)
        alt = counts[palt]
        dep = max + alt
        max_q = quals[pmax].to_f/max
        alt_q = alt > 0 ? quals[palt].to_f/alt : '0'

        if dep > 2
          indel = indels_map[start]
          indel_hit = indel && indel[:count] > 1
          snv_hit = alt > 0 && alt/dep.to_f > 0.1 && alt_q > 10
          if indel_hit || snv_hit || alleles[ref] != pmax
            puts "#{start}\t#{max}/#{alt.to_i}\t#{ref}\t#{column}" if var_show

            aalt = alt != 0 ? inv_alleles[palt] : '-'
            tag = (alt > 0 && alt/dep.to_f > 0.1) ? 'a' : 'A'

            # Any indel record at this position overrides the SNV fields.
            if indel
              parts = indel[:alleles].split('/')
              ref = parts[0]
              aalt = parts[1]
              alt = indel[:count]
              tag = indel[:type]=='ins' ? '+' : '-'
            end

            snps << [chr,start,ref,inv_alleles[pmax],aalt,max,alt,format('%.2f',max_q),format('%.2f',alt_q),tag]
          end
        end

        list -= exhausted
        if list.size==0
          # nothing covers the gap: jump straight to the new read
          start = read.pos
        else
          start += 1
        end

        puts if show
      end

      start = read.pos
    end

    seq_size = read.seq.size
    tseq = form(read.seq,read.cigar)
    tqual = form(read.qual,read.cigar)

    if read.cigar.index('I') || read.cigar.index('D')
      if indel_show
        puts "-------------------------INDEL "
        puts read.pos
        puts read.cigar
        puts read.seq
        puts read.qual
        puts seq[read.pos-chr_start..read.pos-chr_start+read.seq.size]
        puts tseq[0]
        puts tqual[0]
        puts tseq[1]
      end

      tseq[1].each do |event|
        if event[:type]=='ins'
          inserted = read.seq[event[:pos]..event[:pos]+event[:size]-1]
          event[:alleles] = "-/#{inserted}"
        else
          from = read.pos-chr_start+event[:pos]
          deleted = seq[from..from+event[:size]-1]
          event[:alleles] = "#{deleted}/-"
        end
        event[:pos] += read.pos

        known = indels_map[event[:pos]]
        if known
          known[:sm] += ",#{event[:alleles]}"
          known[:count] += 1
        else
          event[:count] = 1
          event[:sm] = event[:alleles]
          indels_map[event[:pos]] = event
        end
      end
    end

    # From here on the read carries its projected bases and numeric qualities.
    read.seq = tseq[0]
    read.qual = tqual[0].bytes.to_a

    if read.seq.size > seq_size/2
      matches = 0
      read.seq.size.times do |k|
        matches += 1 if read.seq[k]==upseq[start-chr_start+k]
      end

      if (matches.to_f/read.seq.size)>=0.95
        list << read
      elsif indel_show
        puts "Drop with low similarity #{matches.to_f/read.seq.size} #{read.cigar}"
        puts seq[start-chr_start..start-chr_start+100]
        puts read.seq
      end
    end
  end

  if indel_show
    indels_map.keys.sort.each do |position|
      puts indels_map[position].inspect
    end
  end

  return snps
end
303
+
304
# Calls variants for one region and appends the rows to the shared output.
#
# p - Hash with Symbol keys:
#   :sam    - BAM file path
#   :ref    - reference FASTA path
#   :chr    - reference sequence name
#   :start  - first position of the region
#   :stop   - last position of the region
#   :output - tab separated result file (opened in append mode, created when
#             missing)
#   :debug  - optional trace switches, passed through to call_sam_snv
#
# Rows are appended with a single write. Nothing is written for a region
# without calls (puts with an empty Array would append a blank line).
def self.call_variance p
  sam = Bio::DB::Sam.new({:bam=>p[:sam], :fasta=>p[:ref]})
  sam.open
  begin
    snps = call_sam_snv sam, p[:chr], p[:start], p[:stop], p[:debug]
  ensure
    # Close the BAM handle even when calling raised.
    sam.close
  end

  File.open(p[:output], 'a') do |f|
    f.write(snps.map { |row| row.join("\t") + "\n" }.join) unless snps.empty?
  end
end
324
+
325
+
326
+
327
# Measures every sequence of a FASTA file.
#
# testReference - path of the FASTA file.
#
# A line starting with '>' opens a new sequence named after its first
# whitespace-separated token (without the '>'); every other line adds its
# stripped length to the current sequence. Each finished sequence except the
# last is printed as "<name>\t<size>", and the whole map is printed at the end.
#
# Returns a Hash of name => length in file order; it is empty when the file
# contains no header line.
def self.inspect_reference testReference
  map = {}
  chr = nil
  size = 0

  # File.foreach closes the file, also when a line raises.
  File.foreach(testReference) do |line|
    if line[0..0] == '>'
      if chr
        puts "#{chr}\t#{size}"
        map[chr] = size
      end
      chr = line.strip.split[0][1..-1]
      size = 0
    else
      size += line.strip.size
    end
  end

  # Without any header there is no sequence to record.
  map[chr] = size if chr
  puts map.inspect
  map
end
355
+
356
# Returns a fixed table of sequence lengths for chr1-chr22, chrX and chrY
# without reading anything; the argument is accepted but ignored.
# NOTE(review): the values look like the hg19 assembly - confirm.
#
# The entries are listed in text order of their names; callers iterate the
# Hash in this insertion order.
def self.inspect_reference_human testReference
  lengths = [
    ["chr1", 249250621],
    ["chr10", 135534747],
    ["chr11", 135006516],
    ["chr12", 133851895],
    ["chr13", 115169878],
    ["chr14", 107349540],
    ["chr15", 102531392],
    ["chr16", 90354753],
    ["chr17", 81195210],
    ["chr18", 78077248],
    ["chr19", 59128983],
    ["chr2", 243199373],
    ["chr20", 63025520],
    ["chr21", 48129895],
    ["chr22", 51304566],
    ["chr3", 198022430],
    ["chr4", 191154276],
    ["chr5", 180915260],
    ["chr6", 171115067],
    ["chr7", 159138663],
    ["chr8", 146364022],
    ["chr9", 141213431],
    ["chrX", 155270560],
    ["chrY", 59373566]
  ]

  genome = {}
  lengths.each { |name, length| genome[name] = length }
  genome
end
384
+
385
+
386
# Splits the genome into windows of 1,000,000 positions and runs
# call_variance for every window in forked worker processes.
#
# params     - Hash with Symbol keys; :output (result file, truncated first)
#              and :ref are read here, and the whole Hash is copied into every
#              job together with :chr, :start and :stop. The caller's Hash is
#              not modified.
# core       - maximum number of simultaneous worker processes (values below
#              1 are treated as 1)
# chr_filter - optional chromosome suffix, e.g. "1" restricts the run to "chr1"
#
# Windows are [1, 1_000_000], [1_000_001, 2_000_000], ... with the last one
# cut at the chromosome length, so no position belongs to two windows.
#
# Raises ArgumentError when chr_filter names an unknown chromosome.
# Returns the result of Process.waitall.
def self.process_vc_multicore params, core=1,chr_filter=nil
  output = params[:output]
  genome = inspect_reference_human(params[:ref])

  if chr_filter # 1 as text
    chr = "chr#{chr_filter}"
    raise ArgumentError, "unknown chromosome #{chr}" unless genome.key?(chr)
    genome = {chr => genome[chr]}
  end

  puts genome.inspect

  # reset output
  File.open(output, 'w') {}

  # job generator
  window = 1_000_000
  jobs = []
  genome.each_pair do |name, size|
    puts "#{name}\t#{size}"
    first = 1
    while first <= size
      last = [first + window - 1, size].min
      jobs << params.merge(:chr=>name, :start=>first, :stop=>last)
      first = last + 1
    end
  end

  sim = [core.to_i, 1].max
  sim = jobs.size if sim > jobs.size

  # starting first N processes
  sim.times do
    job = jobs.pop
    Process.fork { call_variance(job) }
  end

  # start one by one as the previous finish
  jobs.each do |job|
    Process.wait(0)
    Process.fork { call_variance(job) }
  end

  # wait for all to finish
  Process.waitall
end
452
+
453
+
454
+
455
# 1. VC calling (whole genome)
#
# params - Hash with String keys:
#   'sam'    - BAM file path
#   'ref'    - reference FASTA path
#   'vc_out' - raw VC result file
#   'core'   - optional number of simultaneous processes (default 4)
#
# Hands the job description to process_vc_multicore and returns its result.
def self.call_snv params
  workers = params['core'] ? params['core'].to_i : 4
  job = {:sam=>params['sam'], :ref=>params['ref'], :output=>params['vc_out']}
  process_vc_multicore job, workers
end
474
+
475
+
476
# 1. VC calling (single region)
#
# params - Hash with String keys:
#   'sam'    - BAM file path
#   'ref'    - reference FASTA path
#   'vc_out' - raw VC result file (appended to by call_variance)
#   'chr'    - chromosome suffix, e.g. "1" for "chr1"
#   'start'  - first position (converted with to_i)
#   'stop'   - last position (converted with to_i)
#
# The job always runs with the 'indel' trace switch; the job description is
# printed before call_variance is invoked in the current process.
def self.call_snv_range params
  job = {
    :sam=>params['sam'],
    :ref=>params['ref'],
    :output=>params['vc_out'],
    :chr=>"chr#{params['chr']}",
    :start=>params['start'].to_i,
    :stop=>params['stop'].to_i,
    :debug=>'indel'
  }
  puts job.inspect

  call_variance job
end
491
+
492
+ end
493
+ end
@@ -0,0 +1,28 @@
1
+ require 'vardetect-vc/call_snv'
2
+ require 'vardetect-vc/call_prepare'
3
+ require 'vardetect-vc/call_combine'
4
+
5
+ module Vardetect
6
+ module Vc
7
+
8
+
9
# Turns a command line into an options Hash.
#
# params - either one String ("snv -ref hg19.fa -core 8") or an already
#          tokenised Array such as ARGV.
#
# The first token is the command; the remaining tokens are read as
# "-name value" pairs. The leading character of each name is dropped and the
# name is used as a String key. A trailing name without a value is ignored.
# The input is echoed to stdout.
#
# Returns a Hash of "name" => "value" pairs plus :cmd => the command
# (nil for empty input).
def self.parse_params params
  puts params

  tokens = params.respond_to?(:split) ? params.split : Array(params).map { |t| t.to_s }

  options = {}
  tokens.drop(1).each_slice(2) do |pair|
    next unless pair.size == 2 # dangling flag without a value
    options[pair[0][1..-1]] = pair[1]
  end
  options[:cmd] = tokens[0]
  options
end
23
+
24
+ end
25
+
26
+ end
27
+
28
+
@@ -1,5 +1,5 @@
1
1
  module Vardetect
2
2
  module Vc
3
- VERSION = "0.0.4"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vardetect-vc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -78,7 +78,9 @@ dependencies:
78
78
  description: A variance calling tool for NGS
79
79
  email:
80
80
  - supasak.kul@biotec.or.th
81
- executables: []
81
+ executables:
82
+ - vardetect_vc
83
+ - vardetect_vc_batch
82
84
  extensions: []
83
85
  extra_rdoc_files: []
84
86
  files:
@@ -87,7 +89,13 @@ files:
87
89
  - LICENSE.txt
88
90
  - README.md
89
91
  - Rakefile
92
+ - bin/vardetect_vc
93
+ - bin/vardetect_vc_batch
90
94
  - lib/vardetect-vc.rb
95
+ - lib/vardetect-vc/call_combine.rb
96
+ - lib/vardetect-vc/call_prepare.rb
97
+ - lib/vardetect-vc/call_snv.rb
98
+ - lib/vardetect-vc/lib.rb
91
99
  - lib/vardetect-vc/version.rb
92
100
  - vardetect-vc.gemspec
93
101
  homepage: ''