bio-vcf 0.8.0 → 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -0,0 +1,303 @@
1
+ # Parallel copy-on-write streaming (PCOWS)
2
+
3
+ require 'tempfile'
4
+
5
require 'fileutils'
require 'timeout'
require 'tmpdir'

# Parallel copy-on-write streaming (PCOWS).
#
# Workers are forked from the calling process; each worker redirects its
# STDOUT into a numbered file inside a private temporary directory. While a
# worker is writing, the file carries the extension RUNNINGEXT; once it is
# done the file is renamed to its final name. The parent then emits the
# finished files in submission order and renames each emitted file to
# "<name>.keep" before it is finally removed.
#
# When num_threads == 1 nothing is forked and no temporary directory is
# created: workers run inline and print immediately.
class PCOWS

  RUNNINGEXT = 'part' # file extension of a chunk that is still being written

  # num_threads:: maximum number of concurrent workers; nil/false means
  #               "use the detected CPU count"
  # chunk_size::  stored as-is (not used inside this class)
  # name::        used in the tmpdir prefix and in chunk file names
  # timeout::     seconds to wait for a slot / for a worker before giving up
  # quiet::       suppress progress messages on $stderr
  # debug::       keep .keep files and the tmpdir, print extra diagnostics
  def initialize(num_threads, chunk_size, name = File.basename(__FILE__), timeout = 180, quiet = false, debug = false)
    # The reporting flags are set first because cpu_count consults @quiet.
    @quiet = quiet
    @debug = debug
    @num_threads = num_threads || cpu_count
    @chunk_size = chunk_size
    @pid_list = [] # one [pid, count, filename] triple per forked worker
    @name = name
    @timeout = timeout
    $stderr.print "PCOWS running in DEBUG MODE\n" if @debug
    @tmpdir = Dir.mktmpdir(@name + '_') if multi_threaded
    @last_output = 0 # index into @pid_list of the next chunk to emit
    @output_locked = false # false, or the triple currently being emitted
  end

  # Feed the worker 'func and state' to PCOWS. Note that func is a
  # lambda closure so it can pick up surrounding scope at invocation
  # in addition to the data captured in 'state'. func.call(state) must
  # return something that responds to #each and yields printable lines.
  #
  # Always returns true.
  def submit_worker(func, state)
    unless multi_threaded
      # ---- Single threaded: call in main process and output immediately.
      # Nothing is recorded in @pid_list: there is no pid and no file, and
      # a placeholder entry would make cleanup() fail on nil arguments.
      func.call(state).each { |line| print line }
      return true
    end

    count = @pid_list.size + 1
    fn = mktmpfilename(count)
    pid = fork do
      # ---- This is running in a new (forked) process
      tempfn = fn + '.' + RUNNINGEXT
      STDOUT.reopen(File.open(tempfn, 'w+'))
      func.call(state).each { |line| print line }
      STDOUT.flush
      STDOUT.close
      # The rename signals completion to the parent.
      FileUtils.mv(tempfn, fn)
      exit(0)
    end
    Process.detach(pid)
    @pid_list << [pid, count, fn]
    true
  end

  # Same as submit_worker, but flags that the last worker was submitted.
  def submit_final_worker(func, state)
    @final_worker = true
    submit_worker(func, state)
  end

  # Make sure no more than num_threads are running at the same time -
  # this is achieved by checking the PID table and the running files
  # in the tmpdir. Raises Timeout::Error when no slot frees up within
  # @timeout seconds.
  def wait_for_worker_slot
    return if single_threaded
    Timeout.timeout(@timeout) do
      announced = false
      loop do
        running = @pid_list.count { |pid, _count, fn| pid_or_file_running?(pid, fn) }
        return if running < @num_threads
        unless announced
          $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
          announced = true
        end
        sleep 0.1
      end
    end
  end

  # ---- In this section the output gets collected and emitted in
  #      submission order, one chunk at a time. The emitter should be
  #      doing as little processing as possible.
  #
  #      With type == :by_line every line of the chunk file is printed
  #      (func is not consulted). Otherwise func is called once with the
  #      chunk's filename. Unless 'blocking' is set the emitting happens
  #      in a forked process and this method returns immediately; the
  #      next call checks whether that process has finished.
  def process_output(func = nil, type = :by_line, blocking = false)
    return if single_threaded
    output = lambda { |fn|
      if type == :by_line
        # File.foreach closes the file when done
        File.foreach(fn) { |buf| print buf }
      else
        func.call(fn)
      end
    }
    if @output_locked
      # ---- is the previous emitter still running? We wait until it
      #      is finished to start the next one
      (_pid, _count, fn) = @output_locked
      $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
      return if File.exist?(fn) # chunk not yet renamed to .keep: still emitting
      # Now we should remove the .keep file
      cleanup_keep_file(fn)
      @last_output += 1 # get next one in line
      @output_locked = false
    end
    # ---- process the next output chunk. After completion it
    #      gets renamed to chunk.keep. This to avoid missing
    #      output (if we unlink the file prematurely)
    info = @pid_list[@last_output]
    return unless info

    (_pid, _count, fn) = info
    $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
    unless File.exist?(fn)
      sleep 0.2 # next chunk is not complete yet
      return
    end

    # Yes! We have the next output, create outputter
    @output_locked = info
    $stderr.print "Set lock on ",[info],"\n" if not @quiet
    if blocking
      $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
      output.call(fn)
      FileUtils.mv(fn, fn + '.keep')
    else
      $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
      emitter = fork do
        output.call(fn)
        # after finishing output move it to .keep
        FileUtils.mv(fn, fn + '.keep')
        exit(0)
      end
      Process.detach(emitter)
    end
  end

  # Wait for a single worker to complete. When working the pid is writing
  # a file with extension .part(ial). After completion the file is
  # renamed without .part and a slot is free.
  #
  # Raises RuntimeError when the worker vanished without producing its
  # file, and re-raises Timeout::Error (after killing the worker) when it
  # did not finish within @timeout seconds.
  def wait_for_worker(info)
    (pid, count, fn) = info
    return unless pid_or_file_running?(pid, fn)

    $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
    begin
      Timeout.timeout(@timeout) do
        while not File.exist?(fn) # wait for the result to appear
          sleep 0.2
          return if not pid_or_file_running?(pid, fn) # worker is gone
        end
      end
      # Partial file should have been renamed:
      raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
      $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
    rescue Timeout::Error
      # Kill it to speed up exit
      kill_child(pid)
      $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
      $stderr.print "Bailing out\n"
      raise
    end
  end

  # This is the final cleanup after the reader thread is done. All workers
  # need to complete.
  def wait_for_workers
    return if single_threaded
    @pid_list.each { |info| wait_for_worker(info) }
  end

  # Emit every chunk that has not been emitted yet (blocking), then
  # remove the temporary directory.
  def process_remaining_output
    return if single_threaded
    $stderr.print "Processing remaining output...\n" if not @quiet
    while @output_locked
      sleep 0.2
      process_output() # keep trying
    end
    @pid_list.each do |info|
      (pid, _count, fn) = info
      while pid_or_file_running?(pid, fn) or File.exist?(fn)
        $stderr.print "Trying: ",[info],"\n" if not @quiet
        process_output(nil, :by_line, true)
        sleep 0.2
      end
    end
    while @output_locked
      sleep 0.1
      process_output(nil, :by_line, true)
    end
    cleanup_tmpdir
  end

  # Emergency cleanup: kill workers that are still alive and remove all
  # chunk files (final, .keep and .part) plus the temporary directory.
  def cleanup
    @pid_list.each do |info|
      (pid, _count, fn) = info
      if pid_running?(pid)
        $stderr.print "Killing child ",[info],"\n"
        kill_child(pid)
      end
      File.unlink(fn) if File.exist?(fn)
      cleanup_keep_file(fn, wait: false)
      tempfn = fn + '.' + RUNNINGEXT
      File.unlink(tempfn) if File.exist?(tempfn)
    end
    cleanup_tmpdir
  end

  private

  # Name of chunk number 'num' inside the tmpdir, zero padded to 6 digits.
  def mktmpfilename(num, ext = nil)
    @tmpdir + sprintf("/%0.6d-", num) + @name + (ext ? '.' + ext : '')
  end

  # A worker counts as running while its process is alive or while its
  # partial file still exists.
  def pid_or_file_running?(pid, fn)
    (pid && pid_running?(pid)) or File.exist?(fn + '.' + RUNNINGEXT)
  end

  # Non-blocking liveness check. A pid that is no child of ours (any more)
  # is reported as not running.
  def pid_running?(pid)
    begin
      fpid, status = Process.waitpid2(pid, Process::WNOHANG)
    rescue Errno::ECHILD, Errno::ESRCH
      return false
    end
    # waitpid2 yields nil under WNOHANG while the child has not changed state
    return true if fpid.nil? && status.nil?
    !(status.exited? || status.signaled?)
  end

  # Kill and reap a child. The child may already be gone, or may already
  # have been reaped by the Process.detach watcher, which is reported on
  # $stderr (never on $stdout, which carries the data stream).
  # Returns true when the child was killed by this call.
  def kill_child(pid)
    Process.kill 9, pid
    Process.wait pid
    true
  rescue Errno::ENOENT, Errno::ESRCH, Errno::ECHILD => e
    $stderr.print "INFO: The process #{pid} did not exist: #{e.class}\n" if not @quiet
    false
  end

  def single_threaded
    @num_threads == 1
  end

  def multi_threaded
    @num_threads > 1
  end

  # Best-effort CPU count: /proc/cpuinfo, then the JVM, then sysctl.
  # Falls back to 1 (with a warning unless quiet) and never returns less
  # than 1.
  def cpu_count
    if File.exist?('/proc/cpuinfo')
      found = File.read('/proc/cpuinfo').scan(/^processor\s*:/).size
      return found if found > 0 # some platforms use another cpuinfo layout
    end
    # Actually, the JVM does not allow fork...
    return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
    begin
      # Count on MAC
      found = Integer(`sysctl -n hw.ncpu 2>/dev/null`.strip)
      return found if found > 0
    rescue ArgumentError, SystemCallError
      # sysctl is unavailable or printed nothing usable
    end
    $stderr.print "Could not determine number of CPUs\n" if not @quiet
    1
  end

  # Remove the .keep file that belongs to chunk 'fn'. With wait: true
  # (the default) this polls until the .keep file shows up, i.e. until
  # the emitter has renamed the chunk. Does nothing in debug mode.
  def cleanup_keep_file(fn, opts = { wait: true })
    return if @debug
    keep = fn + '.keep'
    return if not opts[:wait] and !File.exist?(keep)
    $stderr.print "Trying to remove #{keep}\n" if not @quiet
    loop do
      if File.exist?(keep)
        $stderr.print "Removing #{keep}\n" if not @quiet
        File.unlink(keep)
        break
      end
      sleep 0.1
    end
  end

  # Remove the (empty) temporary directory. A no-op in debug mode, when
  # no directory was created, or when it has been removed already.
  def cleanup_tmpdir
    return if @debug
    return unless @tmpdir && Dir.exist?(@tmpdir)
    $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
    Dir.unlink(@tmpdir)
  end

end
@@ -0,0 +1,75 @@
1
+ require 'erb'
2
+
3
require 'erb'

module Bio

  # ERB template that may be split into up to three sections.
  #
  # A line matching =HEADER is dropped, a line matching =BODY ends the
  # header section and a line matching =FOOTER ends the body section.
  # A template without a body section is treated as body-only, so plain
  # ERB files without any markers keep working.
  class Template

    # fn:: path of the template file. Raises RuntimeError when missing.
    def initialize fn
      raise "Can not find template #{fn}!" if not File.exist?(fn)
      parse(File.read(fn))
    end

    # Split buf into sections and compile one ERB object per non-empty
    # section. Sections that are empty are left as nil, for which the
    # render methods return "".
    def parse buf
      sections = { header: [], body: [], footer: [] }
      where = :header
      buf.split("\n").each do |line|
        case where
        when :header
          next if line =~ /=HEADER/
          if line =~ /=BODY/
            where = :body
            next
          end
        when :body
          if line =~ /=FOOTER/
            where = :footer
            next
          end
        end
        sections[where] << line
      end
      header, body, footer = sections.values_at(:header, :body, :footer)
      if body.empty?
        # No body section: everything collected so far is the body
        body = header
        header = []
      end
      # Note: Array#size is never falsy in Ruby (0 is truthy), so
      # emptiness has to be tested explicitly.
      @erb_header = compile(header)
      @erb_body   = compile(body)
      @erb_footer = compile(footer)
    end

    # Render the body section. For a template without section markers
    # this is the whole template.
    # NOTE(review): the original read an @erb that was never assigned and
    # so always raised; delegating to the body is the presumed intent.
    def result env
      body(env)
    end

    # Render the header section against binding env ("" when absent).
    def header env
      render(@erb_header, env)
    end

    # Render the body section against binding env ("" when absent).
    def body env
      render(@erb_body, env)
    end

    # Render the footer section against binding env ("" when absent).
    def footer env
      render(@erb_footer, env)
    end

    private

    # Compile the collected lines, or return nil when there are none.
    def compile lines
      lines.empty? ? nil : ERB.new(lines.join("\n"))
    end

    # Evaluate a compiled section; a missing section renders as "".
    def render erb, env
      erb ? erb.result(env) : ""
    end
  end
end
@@ -0,0 +1,46 @@
1
require 'zlib'

module BioVcf
  # This class abstracts a VCF file that can be iterated.
  # The VCF can be plain text or compressed with gzip.
  # Note that files compressed with bgzip will not work, as the Ruby
  # implementation of Zlib does not allow concatenated files.
  class VCFfile

    # file::  path of the VCF file
    # is_gz:: true (the default) when the file is gzip compressed
    def initialize(file: "", is_gz: true)
      @file = file
      @is_gz = is_gz
    end

    # Parse a "##INFO=<ID=..,Number=..,Type=..,Description="..">" header
    # line into a Hash with the keys :id, :number, :type and :desc (all
    # String values). Returns nil when the line is not such an INFO line.
    def parseVCFheader(head_line="")
      m = /##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
      return nil unless m
      { :id => m[1], :number => m[2], :type => m[3], :desc => m[4] }
    end

    # Yields one BioVcf::VcfRecord per data line; every line starting
    # with '#' is added to the BioVcf::VcfHeader that is handed to each
    # record. Without a block an Enumerator is returned.
    def each
      return enum_for(:each) unless block_given?
      # The path is opened strictly as a file (Kernel#open would run a
      # "|command" path as a subprocess). Closing a GzipReader also closes
      # the underlying file.
      io = @is_gz ? Zlib::GzipReader.open(@file) : File.open(@file)
      begin
        header = BioVcf::VcfHeader.new
        io.each_line do |line|
          line.chomp!
          if line =~ /^#/
            # covers the ##fileformat line as well as all other meta lines
            header.add(line)
            next
          end
          fields = BioVcf::VcfLine.parse(line)
          yield BioVcf::VcfRecord.new(fields, header)
        end
      ensure
        # release the handle also when the consumer breaks out or raises
        io.close
      end
    end
  end
end
@@ -11,7 +11,7 @@ module BioVcf
11
11
  end
12
12
  end
13
13
 
14
- # Helper class for a list of (variant) values, such as A,G.
14
+ # Helper class for a list of (variant) values, such as A,G.
15
15
  # The [] function does the hard work. You can pass in an index (integer)
16
16
  # or nucleotide which translates to an index.
17
17
  # (see ./features for examples)
@@ -20,7 +20,7 @@ module BioVcf
20
20
  @alt = alt
21
21
  @list = list.split(/,/).map{|i| i.to_i}
22
22
  end
23
-
23
+
24
24
  def [] idx
25
25
  if idx.kind_of?(Integer)
26
26
  # return a value
@@ -67,7 +67,7 @@ module BioVcf
67
67
  @alt = alt
68
68
  @list = list.split(/,/).map{|i| i.to_i}
69
69
  end
70
-
70
+
71
71
  def [] idx
72
72
  if idx.kind_of?(Integer)
73
73
  @list[idx].to_i
@@ -87,15 +87,15 @@ module BioVcf
87
87
  end
88
88
 
89
89
  # Return the max value on the nucleotides in the list (typically rec.alt)
90
- def max
90
+ def max
91
91
  @list.reduce(0){ |memo,v| (v>memo ? v : memo) }
92
92
  end
93
93
 
94
- def min
94
+ def min
95
95
  @list.reduce(MAXINT){ |memo,v| (v<memo ? v : memo) }
96
96
  end
97
97
 
98
- def sum
98
+ def sum
99
99
  @list.reduce(0){ |memo,v| v+memo }
100
100
  end
101
101
  end
@@ -129,14 +129,14 @@ module BioVcf
129
129
  !empty?
130
130
  end
131
131
 
132
- def dp4
133
- ilist('DP4')
132
+ def dp4
133
+ ilist('DP4')
134
134
  end
135
- def ad
136
- ilist('AD')
135
+ def ad
136
+ ilist('AD')
137
137
  end
138
- def pl
139
- ilist('PL')
138
+ def pl
139
+ ilist('PL')
140
140
  end
141
141
 
142
142
  def bcount
@@ -156,7 +156,7 @@ module BioVcf
156
156
  end
157
157
 
158
158
  def gti
159
- gt.split('/').map { |g| g.to_i }
159
+ gt.split(/[\/\|]/).map { |g| g.to_i }
160
160
  end
161
161
 
162
162
  def gts?
@@ -178,11 +178,11 @@ module BioVcf
178
178
  else
179
179
  v = values[fetch(m.to_s.upcase)]
180
180
  return nil if VcfValue::empty?(v)
181
- v = v.to_i if v =~ /^\d+$/
182
- v = v.to_f if v =~ /^\d+\.\d+$/
181
+ return v.to_i if v =~ /^\d+$/
182
+ return v.to_f if v =~ /^\d+\.\d+$/
183
183
  v
184
184
  end
185
- end
185
+ end
186
186
 
187
187
  private
188
188
 
@@ -200,7 +200,7 @@ module BioVcf
200
200
  def ilist name
201
201
  v = fetch_value(name)
202
202
  return nil if not v
203
- v.split(',').map{|i| i.to_i}
203
+ v.split(',').map{|i| i.to_i}
204
204
  end
205
205
 
206
206
  end
@@ -218,10 +218,15 @@ module BioVcf
218
218
  end
219
219
 
220
220
  def [] name
221
- @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
221
+ begin
222
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
223
+ rescue TypeError
224
+ $stderr.print "Unknown field name <#{name}> in record, did you mean r.info.#{name}?\n"
225
+ raise
226
+ end
222
227
  end
223
228
 
224
- def method_missing(m, *args, &block)
229
+ def method_missing(m, *args, &block)
225
230
  name = m.to_s
226
231
  if name =~ /\?$/
227
232
  # test for valid sample
@@ -229,7 +234,7 @@ module BioVcf
229
234
  else
230
235
  @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
231
236
  end
232
- end
237
+ end
233
238
 
234
239
  end
235
240
  end