bio-vcf 0.9.2 → 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -21
  3. data/LICENSE.txt +1 -1
  4. data/README.md +107 -73
  5. data/RELEASE_NOTES.md +20 -0
  6. data/RELEASE_NOTES.md~ +11 -0
  7. data/VERSION +1 -1
  8. data/bin/bio-vcf +49 -30
  9. data/bio-vcf.gemspec +1 -1
  10. data/features/cli.feature +4 -1
  11. data/features/diff_count.feature +0 -1
  12. data/features/step_definitions/cli-feature.rb +13 -9
  13. data/features/step_definitions/diff_count.rb +1 -1
  14. data/features/step_definitions/somaticsniper.rb +1 -1
  15. data/lib/bio-vcf/pcows.rb +31 -25
  16. data/lib/bio-vcf/vcffile.rb +46 -0
  17. data/lib/bio-vcf/vcfgenotypefield.rb +20 -20
  18. data/lib/bio-vcf/vcfheader.rb +29 -0
  19. data/lib/bio-vcf/vcfrecord.rb +5 -3
  20. data/lib/bio-vcf/vcfsample.rb +3 -1
  21. data/test/data/input/empty.vcf +2 -0
  22. data/test/data/regression/empty-stderr.new +12 -0
  23. data/test/data/regression/empty.new +2 -0
  24. data/test/data/regression/empty.ref +2 -0
  25. data/test/data/regression/eval_once-stderr.new +2 -2
  26. data/test/data/regression/eval_r.info.dp-stderr.new +9 -7
  27. data/test/data/regression/ifilter_s.dp-stderr.new +9 -7
  28. data/test/data/regression/pass1-stderr.new +9 -7
  29. data/test/data/regression/r.info.dp-stderr.new +4 -8
  30. data/test/data/regression/r.info.dp.new +0 -33
  31. data/test/data/regression/rewrite.info.sample-stderr.new +9 -7
  32. data/test/data/regression/s.dp-stderr.new +9 -7
  33. data/test/data/regression/seval_s.dp-stderr.new +9 -7
  34. data/test/data/regression/sfilter_seval_s.dp-stderr.new +9 -7
  35. data/test/data/regression/thread4-stderr.new +9 -7
  36. data/test/data/regression/thread4_4-stderr.new +25 -44
  37. data/test/data/regression/thread4_4.new +0 -20
  38. data/test/data/regression/thread4_4_failed_filter-stderr.new +1 -1
  39. data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -1
  40. data/test/data/regression/vcf2json_full_header-stderr.new +9 -7
  41. data/test/data/regression/vcf2json_use_meta-stderr.new +9 -7
  42. metadata +11 -7
  43. data/features/#cli.feature# +0 -71
  44. data/features/filter.feature~ +0 -35
  45. data/test/stress/stress_test.sh~ +0 -8
@@ -0,0 +1,20 @@
1
+ ## ChangeLog v0.9.4 (2020????)
2
+
3
+ This is an important maintenance release of bio-vcf:
4
+
5
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf).
6
+
7
+ ## Older release notes
8
+
9
+ + Getting ready for a 1.0 release
10
+ + Released 0.9.2 as a gem
11
+ + 0.9.1 removed a rare threading bug and cleanup on error
12
+ + Added support for soft filters (request by Brad Chapman)
13
+ + The outputter now writes (properly) in parallel with the parser
14
+ + bio-vcf turns any VCF into JSON with header information, and
15
+ allows you to pipe that JSON directly into any JSON-supporting
16
+ language, including Python and JavaScript!
17
+
18
+ ## Older changes
19
+
20
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 removed a rare threading bug and cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON supporting
11
+ language, including Python and Javascript!
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.2
1
+ 0.9.4
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014-2015 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -20,10 +20,10 @@ require 'optparse'
20
20
  require 'timeout'
21
21
  require 'fileutils'
22
22
 
23
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
24
24
  # require 'bio-logger'
25
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
26
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
27
27
  # Bio::Log::CLI.logger('stderr')
28
28
  # Bio::Log::CLI.trace('info')
29
29
 
@@ -31,7 +31,7 @@ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', v
31
31
  opts = OptionParser.new do |o|
32
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
33
33
 
34
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
35
35
  options[:ignore_missing] = true
36
36
  end
37
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -103,8 +103,8 @@ opts = OptionParser.new do |o|
103
103
  o.on_tail("--tags list", String, "Add tags") do |s|
104
104
  options[:tags] = s
105
105
  end
106
-
107
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
108
108
  options[:skip_header] = true
109
109
  end
110
110
 
@@ -127,8 +127,8 @@ opts = OptionParser.new do |o|
127
127
  o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
128
  options[:timeout] = i
129
129
  end
130
-
131
- # Uncomment the following when using the bio-logger
130
+
131
+ # Uncomment the following when using the bio-logger
132
132
  # o.separator ""
133
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
134
134
  # Bio::Log::CLI.logger(name)
@@ -137,7 +137,16 @@ opts = OptionParser.new do |o|
137
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
138
138
  # Bio::Log::CLI.trace(s)
139
139
  # end
140
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
141
150
  o.on("--statistics", "Output statistics") do |q|
142
151
  options[:statistics] = true
143
152
  options[:num_threads] = nil
@@ -146,11 +155,11 @@ opts = OptionParser.new do |o|
146
155
  # Bio::Log::CLI.trace('error')
147
156
  options[:quiet] = true
148
157
  end
149
-
158
+
150
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
151
160
  options[:verbose] = true
152
161
  end
153
-
162
+
154
163
  o.on("--debug", "Show debug messages and keep intermediate output") do |v|
155
164
  # Bio::Log::CLI.trace('debug')
156
165
  options[:debug] = true
@@ -165,11 +174,11 @@ end
165
174
  opts.parse!(ARGV)
166
175
 
167
176
  BIOVCF_VERSION=version
168
- BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015\n"
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
169
178
  $stderr.print BIOVCF_BANNER if !options[:quiet]
170
179
 
171
- if options[:show_help]
172
- print opts
180
+ if options[:show_help]
181
+ print opts
173
182
  print USAGE
174
183
  exit 1
175
184
  end
@@ -215,6 +224,7 @@ def parse_header line, samples, options
215
224
  print line if not options[:skip_header]
216
225
  STDIN.each_line do | headerline |
217
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
218
228
  line = headerline
219
229
  break # end of header
220
230
  end
@@ -229,12 +239,12 @@ def parse_header line, samples, options
229
239
  if add_filter
230
240
  print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
231
241
  end
232
-
242
+
233
243
  selected = header.column_names
234
244
  if samples
235
245
  newfields = selected[0..8]
236
246
  samples.each do |s|
237
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
238
248
  end
239
249
  selected = newfields
240
250
  end
@@ -246,6 +256,10 @@ def parse_header line, samples, options
246
256
  end
247
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
248
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
249
263
  return header,line
250
264
  end
251
265
 
@@ -291,10 +305,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
291
305
  if filter
292
306
  return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
293
307
  end
294
-
308
+
295
309
  if sfilter # sample 'or' filter
296
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
297
- # return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
298
311
  return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
299
312
  end
300
313
  end
@@ -321,21 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
321
334
 
322
335
  # -----------------------------
323
336
  # From here on decide on output
324
-
337
+
325
338
  rec.add_to_filter_field(add_filter) if set_filter_field
326
339
 
327
340
  if samples
328
341
  # Select certain samples for output
329
342
  newfields = fields[0..8]
330
343
  samples.each do |s|
331
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
332
345
  end
333
346
  fields = newfields
334
347
  end
335
348
  if options[:eval] or seval
336
349
  begin
337
350
  results = nil # result string
338
- if options[:eval]
351
+ if options[:eval]
339
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
340
353
  results = res if res
341
354
  end
@@ -364,11 +377,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
364
377
  rescue Exception => e
365
378
  $stderr.print e,": ",fields,"\n"
366
379
  $stderr.print e.backtrace.inspect if options[:verbose]
367
- raise
380
+ raise
368
381
  end
369
382
  elsif options[:rewrite]
370
383
  # Default behaviour prints VCF line, but rewrite info
371
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
372
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
373
386
  elsif stats
374
387
  # do nothing
@@ -390,7 +403,7 @@ line_number=0
390
403
 
391
404
  if options[:bed]
392
405
  bedfilter = BedFilter.new(options[:bed])
393
- end
406
+ end
394
407
 
395
408
  begin
396
409
  # Define linear parser function (going through one chunk)
@@ -415,7 +428,11 @@ begin
415
428
  # ---- Parse the header lines (chomps from STDIN)
416
429
  # and returns header info and the current line
417
430
  if line =~ /^#/
418
- header,line = parse_header(line,samples,options)
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, so there are no records to process
434
+ break
435
+ end
419
436
  end
420
437
  # p [line_number,line]
421
438
  # ---- After the header continue processing
@@ -428,7 +445,7 @@ begin
428
445
  print template.header(binding) if template
429
446
  header_output_completed = true
430
447
  end
431
-
448
+
432
449
  if options[:eval_once]
433
450
  # this happens if we only want one line evaluated - say to get
434
451
  # the number of samples
@@ -442,7 +459,7 @@ begin
442
459
 
443
460
  # ---- In the following section the VCF lines are parsed by chunks
444
461
  # The chunks may go into different threads
445
-
462
+
446
463
  if chunk_lines.size >= CHUNK_SIZE
447
464
  # ---- process one chunk
448
465
  $stderr.print '.' if not options[:quiet]
@@ -456,14 +473,16 @@ begin
456
473
  pcows.submit_final_worker(process,chunk_lines)
457
474
  pcows.wait_for_workers()
458
475
  pcows.process_remaining_output()
459
-
476
+
460
477
  print template.footer(binding) if template
461
478
  stats.print if stats
462
479
 
463
480
  rescue Exception => e
464
- $stderr.print e.message,"\n" if e.message != 'exit'
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
465
485
  pcows.cleanup()
466
486
  raise if options[:verbose]
467
487
  exit 1
468
488
  end
469
-
@@ -31,7 +31,7 @@ Gem::Specification.new do |s|
31
31
  s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
32
32
  Dir['template/**/*']
33
33
 
34
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
34
+ s.homepage = "http://github.com/vcflib/bio-vcf"
35
35
  s.licenses = ["MIT"]
36
36
  s.require_paths = ["lib"]
37
37
  s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
@@ -68,4 +68,7 @@ Feature: Command-line interface (CLI)
68
68
  When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
69
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
70
70
 
71
-
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -7,15 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
17
 
18
18
 
19
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
- end
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -124,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
124
124
  end
125
125
 
126
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
127
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
128
128
  end
129
129
 
130
130
 
@@ -114,26 +114,22 @@ class PCOWS
114
114
  end
115
115
  }
116
116
  if @output_locked
117
- # ---- is the other thread still running?
117
+ # ---- is the other thread still running? We wait until it
118
+ # is finished to start the next one
118
119
  (pid,count,fn) = @output_locked
119
120
  $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
120
121
  return if File.exist?(fn) # continue because thread still processing
121
122
  # Now we should remove the .keep file
122
- if not @debug
123
- sleep 0.1 # give it a little time
124
- keep = fn+'.keep'
125
- if File.exist?(keep)
126
- $stderr.print "Removing #{keep}\n" if not @quiet
127
- File.unlink(keep)
128
- end
129
- end
123
+ cleanup_keep_file(fn)
130
124
  @last_output += 1 # get next one in line
131
125
  @output_locked = false
132
126
  end
133
- # Still processing
127
+ # ---- process the next output chunk. After completion it
128
+ # gets renamed to chunk.keep. This is to avoid missing
129
+ # output (if we unlink the file prematurely)
134
130
  if info = @pid_list[@last_output]
135
131
  (pid,count,fn) = info
136
- $stderr.print "Testing for output file ",[info],"\n" if @debug
132
+ $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
137
133
  if File.exist?(fn)
138
134
  # Yes! We have the next output, create outputter
139
135
  @output_locked = info
@@ -142,26 +138,15 @@ class PCOWS
142
138
  $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
143
139
  pid = fork do
144
140
  output.call(fn)
141
+ # after finishing output move it to .keep
145
142
  FileUtils::mv(fn,fn+'.keep')
146
- # if not @debug
147
- # $stderr.print "Removing #{fn}\n" if not @quiet
148
- # File.unlink(fn)
149
- # else
150
- # FileUtils::mv(fn,fn+'.keep')
151
- # end
152
-
153
143
  exit(0)
154
144
  end
155
145
  Process.detach(pid)
156
146
  else
157
147
  $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
158
148
  output.call(fn)
159
- if not @debug
160
- $stderr.print "Removing #{fn}\n" if not @quiet
161
- File.unlink(fn)
162
- else
163
- FileUtils::mv(fn,fn+'.keep')
164
- end
149
+ FileUtils::mv(fn,fn+'.keep')
165
150
  end
166
151
  else
167
152
  sleep 0.2
@@ -185,7 +170,7 @@ class PCOWS
185
170
  end
186
171
  # Partial file should have been renamed:
187
172
  raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
188
- $stderr.print "OK pid=#{pid}, processing output of #{fn}\n" if not @quiet
173
+ $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
189
174
  rescue Timeout::Error
190
175
  # Kill it to speed up exit
191
176
  Process.kill 9, pid
@@ -222,6 +207,10 @@ class PCOWS
222
207
  sleep 0.2
223
208
  end
224
209
  end
210
+ while @output_locked
211
+ sleep 0.1
212
+ process_output(nil,:by_line,true)
213
+ end
225
214
  cleanup_tmpdir()
226
215
  end
227
216
 
@@ -240,6 +229,7 @@ class PCOWS
240
229
  end
241
230
  end
242
231
  File.unlink(fn) if File.exist?(fn)
232
+ cleanup_keep_file(fn,wait: false)
243
233
  tempfn = fn+'.'+RUNNINGEXT
244
234
  File.unlink(tempfn) if File.exist?(tempfn)
245
235
  end
@@ -287,6 +277,22 @@ class PCOWS
287
277
  1
288
278
  end
289
279
 
280
+ def cleanup_keep_file(fn, opts = { wait: true })
281
+ if not @debug
282
+ keep = fn+'.keep'
283
+ return if not opts[:wait] and !File.exist?(keep)
284
+ $stderr.print "Trying to remove #{keep}\n" if not @quiet
285
+ while true
286
+ if File.exist?(keep)
287
+ $stderr.print "Removing #{keep}\n" if not @quiet
288
+ File.unlink(keep)
289
+ break # forever loop
290
+ end
291
+ sleep 0.1
292
+ end #forever
293
+ end
294
+ end
295
+
290
296
  def cleanup_tmpdir
291
297
  if not @debug
292
298
  $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet