bio-vcf 0.8.0 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -0,0 +1,303 @@
1
+ # Parallel copy-on-write streaming (PCOWS)
2
+
3
require 'fileutils'
require 'tempfile'
require 'timeout'
require 'tmpdir'
4
+
5
# Parallel copy-on-write streaming (PCOWS).
#
# Workers are submitted as lambdas. When more than one thread is
# configured every worker runs in a forked child process whose STDOUT
# is redirected to a numbered chunk file inside a temporary directory:
#
#   <tmpdir>/<NNNNNN>-<name>.part   while the worker is still writing
#   <tmpdir>/<NNNNNN>-<name>        once the worker has finished
#   <tmpdir>/<NNNNNN>-<name>.keep   once the chunk has been printed
#
# The parent prints finished chunks in submission order. With exactly
# one thread no process is forked and output is printed immediately.
class PCOWS

  RUNNINGEXT = 'part' # file extension of a chunk that is still being written

  # num_threads:: number of concurrent workers; nil means "use cpu_count"
  # chunk_size::  stored in @chunk_size (not used inside this class)
  # name::        used in the tmpdir and chunk file names
  # timeout::     seconds to wait for a slot or for a worker to finish
  # quiet::       suppress progress messages on stderr
  # debug::       keep the .keep files and the tmpdir for inspection
  def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
    # Set the flags first: cpu_count() consults @quiet
    @quiet = quiet
    @debug = debug
    num_threads = cpu_count() if not num_threads
    # $stderr.print "Using ",num_threads,"threads \n"
    @num_threads = num_threads
    @chunk_size = chunk_size
    @pid_list = []       # one [pid,count,fn] entry per submitted worker
    @name = name
    @timeout = timeout
    if @debug
      $stderr.print "PCOWS running in DEBUG MODE\n"
    end
    if multi_threaded
      @tmpdir = Dir::mktmpdir(@name+'_')
    end
    @last_output = 0       # index into @pid_list of the next chunk to print
    @output_locked = false # holds the [pid,count,fn] entry being printed
  end

  # Feed the worker 'func and state' to COWS. Note that func is a
  # lambda closure so it can pick up surrounding scope at invocation
  # in addition to the data captured in 'state'. func.call(state) has
  # to return something that responds to each; every element is
  # printed. Always returns true.

  def submit_worker(func,state)
    pid = nil
    if multi_threaded
      count = @pid_list.size+1
      fn = mktmpfilename(count)
      pid = fork do
        # ---- This is running a new copy-on-write process
        tempfn = fn+'.'+RUNNINGEXT
        STDOUT.reopen(File.open(tempfn, 'w+'))
        func.call(state).each { | line | print line }
        STDOUT.flush
        STDOUT.close
        # The rename signals the parent that the chunk is complete
        FileUtils::mv(tempfn,fn)
        exit(0)
      end
      Process.detach(pid)
    else
      # ---- Single threaded: call in main process and output immediately
      func.call(state).each { | line | print line }
    end
    # In single threaded mode this records [nil,nil,nil]
    @pid_list << [ pid,count,fn ]
    return true
  end

  # Same as submit_worker, but flags that the last worker was submitted.
  def submit_final_worker(func,state)
    @final_worker = true
    submit_worker(func,state)
  end

  # Make sure no more than num_threads are running at the same time -
  # this is achieved by checking the PID table and the running files
  # in the tmpdir. Raises Timeout::Error when no slot frees up within
  # @timeout seconds.

  def wait_for_worker_slot()
    return if single_threaded
    Timeout.timeout(@timeout) do
      printed_timeout_message = false
      while true
        # ---- count running pids
        running = @pid_list.count { | pid,_count,fn | pid_or_file_running?(pid,fn) }
        return if running < @num_threads
        if not printed_timeout_message
          $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
          printed_timeout_message = true
        end
        sleep 0.1
      end
    end
  end

  # ---- In this section the output gets collected and passed on to a
  #      printer thread. This function makes sure the printing is
  #      ordered and that no printers are running at the same
  #      time. The printer thread should be doing as little processing
  #      as possible.
  #
  #      With type==:by_line every line of the chunk file is printed
  #      to stdout (func is not consulted). Otherwise func is called
  #      once with the filename. When blocking is false the printing
  #      happens in a forked child process.
  def process_output(func=nil,type=:by_line, blocking=false)
    return if single_threaded
    output = lambda { |fn|
      if type == :by_line
        # File.foreach closes the file when done
        File.foreach(fn) { |buf|
          print buf
        }
      else
        func.call(fn)
      end
    }
    if @output_locked
      # ---- is the other thread still running? We wait until it
      #      is finished to start the next one
      (pid,count,fn) = @output_locked
      $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
      return if File.exist?(fn)  # continue because thread still processing
      # Now we should remove the .keep file
      cleanup_keep_file(fn)
      @last_output += 1          # get next one in line
      @output_locked = false
    end
    # ---- process the next output chunk. After completion it
    #      gets renamed to chunk.keep. This to avoid missing
    #      output (if we unlink the file prematurely)
    if info = @pid_list[@last_output]
      (pid,count,fn) = info
      $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
      if File.exist?(fn)
        # Yes! We have the next output, create outputter
        @output_locked = info
        $stderr.print "Set lock on ",[info],"\n" if not @quiet
        if not blocking
          $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
          pid = fork do
            output.call(fn)
            # after finishing output move it to .keep
            FileUtils::mv(fn,fn+'.keep')
            exit(0)
          end
          Process.detach(pid)
        else
          $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
          output.call(fn)
          FileUtils::mv(fn,fn+'.keep')
        end
      else
        sleep 0.2
      end
    end
  end

  # Wait for a worker slot to appear. When working the pid is writing
  # a file with extension .part(ial). After completion the file is
  # renamed without .part and a slot is free. On timeout the child is
  # killed and Timeout::Error is re-raised.
  def wait_for_worker(info)
    (pid,count,fn) = info
    if pid_or_file_running?(pid,fn)
      $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
      begin
        Timeout.timeout(@timeout) do
          while not File.exist?(fn)  # wait for the result to appear
            sleep 0.2
            # NOTE(review): this leaves the method as soon as the worker
            # is gone, so the crash check below only runs when fn exists
            return if not pid_or_file_running?(pid,fn) # worker is gone
          end
        end
        # Partial file should have been renamed:
        raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
        $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
      rescue Timeout::Error
        # Kill it to speed up exit
        begin
          Process.kill 9, pid
          Process.wait pid
        rescue Errno::ESRCH, Errno::ECHILD
          # The child is already gone or was reaped by the thread
          # started by Process.detach - do not mask the timeout
        end
        $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
        $stderr.print "Bailing out"
        raise
      end
    end
  end

  # This is the final cleanup after the reader thread is done. All workers
  # need to complete.

  def wait_for_workers()
    return if single_threaded
    @pid_list.each do |info|
      wait_for_worker(info)
    end
  end

  # Print every chunk that has not been printed yet, in order, and
  # remove the tmpdir afterwards.
  def process_remaining_output()
    return if single_threaded
    $stderr.print "Processing remaining output...\n" if not @quiet
    while @output_locked
      sleep 0.2
      process_output() # keep trying
    end
    @pid_list.each do |info|
      (pid,count,fn) = info
      while pid_or_file_running?(pid,fn) or File.exist?(fn)
        $stderr.print "Trying: ",[info],"\n" if not @quiet
        process_output(nil,:by_line,true)
        sleep 0.2
      end
    end
    while @output_locked
      sleep 0.1
      process_output(nil,:by_line,true)
    end
    cleanup_tmpdir()
  end

  # Kill all children that are still running and remove their chunk
  # files and the tmpdir. Safe to call in single threaded mode and
  # safe to call more than once.
  def cleanup()
    @pid_list.each do |info|
      (pid,count,fn) = info
      # pid and fn are nil for workers that ran in the main process
      if pid and pid_running?(pid)
        $stderr.print "Killing child ",[info],"\n"
        begin
          Process.kill 9, pid
          Process.wait pid
        rescue Errno::ESRCH
          $stderr.puts "INFO: The process #{pid} did not exist: Errno::ESRCH" if not @quiet
        rescue Errno::ECHILD
          # Already reaped by the thread started by Process.detach
          $stderr.puts "INFO: The process #{pid} was already reaped: Errno::ECHILD" if not @quiet
        end
      end
      next if not fn
      File.unlink(fn) if File.exist?(fn)
      cleanup_keep_file(fn,wait: false)
      tempfn = fn+'.'+RUNNINGEXT
      File.unlink(tempfn) if File.exist?(tempfn)
    end
    cleanup_tmpdir()
  end

  private

  # Name of the chunk file for worker number num
  def mktmpfilename(num,ext=nil)
    @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
  end

  # A worker counts as running while its process is alive or while
  # its .part file still exists
  def pid_or_file_running?(pid,fn)
    (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
  end

  # Non-blocking check whether child pid is still alive. A pid that is
  # unknown or already reaped counts as not running.
  def pid_running?(pid)
    begin
      fpid,status=Process.waitpid2(pid,Process::WNOHANG)
    rescue Errno::ECHILD, Errno::ESRCH
      return false
    end
    return true if nil == fpid && nil == status
    return ! (status.exited? || status.signaled?)
  end

  def single_threaded
    @num_threads == 1
  end

  def multi_threaded
    @num_threads > 1
  end

  # Best effort CPU count: /proc/cpuinfo, then the JVM, then sysctl.
  # Returns 1 when none of them yields a positive number.
  def cpu_count
    begin
      if File.exist? '/proc/cpuinfo'
        n = File.read('/proc/cpuinfo').scan(/^processor\s*:/).size
        return n if n > 0
      end
      # Actually, the JVM does not allow fork...
      return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
      # Count on MAC; to_i yields 0 when sysctl is missing or fails
      n = `sysctl -n hw.ncpu 2>/dev/null`.to_i
      return n if n > 0
    rescue StandardError
      # fall through to the default below
    end
    $stderr.print "Could not determine number of CPUs" if not @quiet
    1
  end

  # Remove the .keep file that belongs to chunk fn. With wait: true
  # (the default) this polls until the file shows up; with wait: false
  # it returns immediately when there is no such file. Does nothing in
  # debug mode.
  def cleanup_keep_file(fn, opts = { wait: true })
    if not @debug
      keep = fn+'.keep'
      return if not opts[:wait] and !File.exist?(keep)
      $stderr.print "Trying to remove #{keep}\n" if not @quiet
      while true
        if File.exist?(keep)
          $stderr.print "Removing #{keep}\n" if not @quiet
          File.unlink(keep)
          break # forever loop
        end
        sleep 0.1
      end #forever
    end
  end

  # Remove the (empty) tmpdir, unless running in debug mode. There is
  # no tmpdir in single threaded mode.
  def cleanup_tmpdir
    return if @debug or not @tmpdir
    return if not File.directory?(@tmpdir) # already removed
    $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
    Dir.unlink(@tmpdir)
  end

end
@@ -0,0 +1,75 @@
1
+ require 'erb'
2
+
3
module Bio

  # An ERB template that can be split into a header, a body and a
  # footer section. Sections are introduced by marker lines:
  #
  #   =HEADER   (optional, the marker line itself is dropped)
  #   ...
  #   =BODY
  #   ...
  #   =FOOTER
  #   ...
  #
  # A line counts as a marker when it contains the marker text. A
  # template without a body section is treated as body only.
  class Template

    # Read and parse the template file fn. Raises a RuntimeError when
    # the file does not exist.
    def initialize fn
      raise "Can not find template #{fn}!" if not File.exist?(fn)
      parse(File.read(fn))
    end

    # Split buf into its sections and compile each non-empty section
    # into its own ERB object.
    def parse buf
      header = []
      body   = []
      footer = []
      where = :header
      buf.split("\n").each do | line |
        case where
          when :header
            next if line =~ /=HEADER/
            if line =~ /=BODY/
              where = :body
              next
            end
            header << line
          when :body
            if line =~ /=FOOTER/
              where = :footer
              next
            end
            body << line
          else
            footer << line
        end
      end
      # No body section: everything collected so far is the body
      if body.empty?
        body = header
        header = []
      end
      # Note that Array#size is always truthy in Ruby (0 included), so
      # test with empty? instead. Assign nil explicitly so that parsing
      # again does not leave a section from the previous template.
      @erb_header = header.empty? ? nil : ERB.new(header.join("\n"))
      @erb_body   = body.empty?   ? nil : ERB.new(body.join("\n"))
      @erb_footer = footer.empty? ? nil : ERB.new(footer.join("\n"))
    end

    # Render the template in the binding env. There is no single
    # combined ERB object, so this renders the body section - which is
    # the complete template when no section markers are used.
    # NOTE(review): presumably kept for callers that predate sections.
    def result env
      body(env)
    end

    # Render the header section in the binding env ("" when absent)
    def header env
      if @erb_header
        @erb_header.result(env)
      else
        ""
      end
    end

    # Render the body section in the binding env ("" when absent)
    def body env
      if @erb_body
        @erb_body.result(env)
      else
        ""
      end
    end

    # Render the footer section in the binding env ("" when absent)
    def footer env
      if @erb_footer
        @erb_footer.result(env)
      else
        ""
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module BioVcf
  # This class abstracts a VCF file that can be iterated.
  # The VCF can be plain text or compressed with gzip.
  # Note that files compressed with bgzip will not work, as the Ruby
  # implementation of Zlib does not allow concatenated files.
  class VCFfile

    # file::  path of the VCF file
    # is_gz:: true when the file is gzip compressed (the default)
    def initialize(file: "", is_gz: true)
      @file = file
      @is_gz = is_gz
    end

    # Parse a ##INFO header line into a hash with the keys :id,
    # :number, :type and :desc. Returns nil when head_line is not an
    # INFO line of that shape.
    def parseVCFheader(head_line="")
      m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
      return nil if m.nil?
      {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
    end


    # Yields a BioVcf::VcfRecord for every non-header line of the file.
    # Lines starting with '#' are added to the header that is passed
    # to every record. Returns an enum that can be used as an iterator
    # when no block is given.
    def each
      return enum_for(:each) unless block_given?
      # File.open (rather than Kernel#open) never treats a leading '|'
      # in the file name as a command
      raw = File.open(@file, @is_gz ? 'rb' : 'r')
      io = nil
      begin
        io = @is_gz ? Zlib::GzipReader.new(raw) : raw

        header = BioVcf::VcfHeader.new
        io.each_line do |line|
          line.chomp!
          # Covers ##fileformat, all other meta lines and the #CHROM line
          if line =~ /^#/
            header.add(line)
            next
          end
          fields = BioVcf::VcfLine.parse(line)
          rec = BioVcf::VcfRecord.new(fields,header)
          yield rec
        end
      ensure
        # Closing the gzip reader also closes the file underneath;
        # raw is closed here when the reader could not be created
        io.close if io and not io.closed?
        raw.close unless raw.closed?
      end
    end
  end
end
@@ -11,7 +11,7 @@ module BioVcf
11
11
  end
12
12
  end
13
13
 
14
- # Helper class for a list of (variant) values, such as A,G.
14
+ # Helper class for a list of (variant) values, such as A,G.
15
15
  # The [] function does the hard work. You can pass in an index (integer)
16
16
  # or nucleotide which translates to an index.
17
17
  # (see ./features for examples)
@@ -20,7 +20,7 @@ module BioVcf
20
20
  @alt = alt
21
21
  @list = list.split(/,/).map{|i| i.to_i}
22
22
  end
23
-
23
+
24
24
  def [] idx
25
25
  if idx.kind_of?(Integer)
26
26
  # return a value
@@ -67,7 +67,7 @@ module BioVcf
67
67
  @alt = alt
68
68
  @list = list.split(/,/).map{|i| i.to_i}
69
69
  end
70
-
70
+
71
71
  def [] idx
72
72
  if idx.kind_of?(Integer)
73
73
  @list[idx].to_i
@@ -87,15 +87,15 @@ module BioVcf
87
87
  end
88
88
 
89
89
  # Return the max value on the nucleotides in the list (typically rec.alt)
90
- def max
90
+ def max
91
91
  @list.reduce(0){ |memo,v| (v>memo ? v : memo) }
92
92
  end
93
93
 
94
- def min
94
+ def min
95
95
  @list.reduce(MAXINT){ |memo,v| (v<memo ? v : memo) }
96
96
  end
97
97
 
98
- def sum
98
+ def sum
99
99
  @list.reduce(0){ |memo,v| v+memo }
100
100
  end
101
101
  end
@@ -129,14 +129,14 @@ module BioVcf
129
129
  !empty?
130
130
  end
131
131
 
132
- def dp4
133
- ilist('DP4')
132
+ def dp4
133
+ ilist('DP4')
134
134
  end
135
- def ad
136
- ilist('AD')
135
+ def ad
136
+ ilist('AD')
137
137
  end
138
- def pl
139
- ilist('PL')
138
+ def pl
139
+ ilist('PL')
140
140
  end
141
141
 
142
142
  def bcount
@@ -156,7 +156,7 @@ module BioVcf
156
156
  end
157
157
 
158
158
  def gti
159
- gt.split('/').map { |g| g.to_i }
159
+ gt.split(/[\/\|]/).map { |g| g.to_i }
160
160
  end
161
161
 
162
162
  def gts?
@@ -178,11 +178,11 @@ module BioVcf
178
178
  else
179
179
  v = values[fetch(m.to_s.upcase)]
180
180
  return nil if VcfValue::empty?(v)
181
- v = v.to_i if v =~ /^\d+$/
182
- v = v.to_f if v =~ /^\d+\.\d+$/
181
+ return v.to_i if v =~ /^\d+$/
182
+ return v.to_f if v =~ /^\d+\.\d+$/
183
183
  v
184
184
  end
185
- end
185
+ end
186
186
 
187
187
  private
188
188
 
@@ -200,7 +200,7 @@ module BioVcf
200
200
  def ilist name
201
201
  v = fetch_value(name)
202
202
  return nil if not v
203
- v.split(',').map{|i| i.to_i}
203
+ v.split(',').map{|i| i.to_i}
204
204
  end
205
205
 
206
206
  end
@@ -218,10 +218,15 @@ module BioVcf
218
218
  end
219
219
 
220
220
  def [] name
221
- @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
221
+ begin
222
+ @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
223
+ rescue TypeError
224
+ $stderr.print "Unknown field name <#{name}> in record, did you mean r.info.#{name}?\n"
225
+ raise
226
+ end
222
227
  end
223
228
 
224
- def method_missing(m, *args, &block)
229
+ def method_missing(m, *args, &block)
225
230
  name = m.to_s
226
231
  if name =~ /\?$/
227
232
  # test for valid sample
@@ -229,7 +234,7 @@ module BioVcf
229
234
  else
230
235
  @samples[name] ||= VcfGenotypeField.new(@fields[@sample_index[name]],@format,@header,@ref,@alt)
231
236
  end
232
- end
237
+ end
233
238
 
234
239
  end
235
240
  end