bio-vcf 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -21
  3. data/LICENSE.txt +1 -1
  4. data/README.md +107 -73
  5. data/RELEASE_NOTES.md +20 -0
  6. data/RELEASE_NOTES.md~ +11 -0
  7. data/VERSION +1 -1
  8. data/bin/bio-vcf +49 -30
  9. data/bio-vcf.gemspec +1 -1
  10. data/features/cli.feature +4 -1
  11. data/features/diff_count.feature +0 -1
  12. data/features/step_definitions/cli-feature.rb +13 -9
  13. data/features/step_definitions/diff_count.rb +1 -1
  14. data/features/step_definitions/somaticsniper.rb +1 -1
  15. data/lib/bio-vcf/pcows.rb +31 -25
  16. data/lib/bio-vcf/vcffile.rb +46 -0
  17. data/lib/bio-vcf/vcfgenotypefield.rb +20 -20
  18. data/lib/bio-vcf/vcfheader.rb +29 -0
  19. data/lib/bio-vcf/vcfrecord.rb +5 -3
  20. data/lib/bio-vcf/vcfsample.rb +3 -1
  21. data/test/data/input/empty.vcf +2 -0
  22. data/test/data/regression/empty-stderr.new +12 -0
  23. data/test/data/regression/empty.new +2 -0
  24. data/test/data/regression/empty.ref +2 -0
  25. data/test/data/regression/eval_once-stderr.new +2 -2
  26. data/test/data/regression/eval_r.info.dp-stderr.new +9 -7
  27. data/test/data/regression/ifilter_s.dp-stderr.new +9 -7
  28. data/test/data/regression/pass1-stderr.new +9 -7
  29. data/test/data/regression/r.info.dp-stderr.new +4 -8
  30. data/test/data/regression/r.info.dp.new +0 -33
  31. data/test/data/regression/rewrite.info.sample-stderr.new +9 -7
  32. data/test/data/regression/s.dp-stderr.new +9 -7
  33. data/test/data/regression/seval_s.dp-stderr.new +9 -7
  34. data/test/data/regression/sfilter_seval_s.dp-stderr.new +9 -7
  35. data/test/data/regression/thread4-stderr.new +9 -7
  36. data/test/data/regression/thread4_4-stderr.new +25 -44
  37. data/test/data/regression/thread4_4.new +0 -20
  38. data/test/data/regression/thread4_4_failed_filter-stderr.new +1 -1
  39. data/test/data/regression/thread4_4_failed_filter-stderr.ref +1 -1
  40. data/test/data/regression/vcf2json_full_header-stderr.new +9 -7
  41. data/test/data/regression/vcf2json_use_meta-stderr.new +9 -7
  42. metadata +11 -7
  43. data/features/#cli.feature# +0 -71
  44. data/features/filter.feature~ +0 -35
  45. data/test/stress/stress_test.sh~ +0 -8
@@ -0,0 +1,20 @@
1
+ ## ChangeLog v0.9.4 (2020????)
2
+
3
+ This is an important maintenance release of bio-vcf:
4
+
5
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf).
6
+
7
+ ## Older release notes
8
+
9
+ + Getting ready for a 1.0 release
10
+ + Released 0.9.2 as a gem
11
+ + 0.9.1 removed a rare threading bug and cleanup on error
12
+ + Added support for soft filters (request by Brad Chapman)
13
+ + The outputter now writes (properly) in parallel with the parser
14
+ + bio-vcf turns any VCF into JSON with header information, and
15
+ allows you to pipe that JSON directly into any JSON supporting
16
+ language, including Python and Javascript!
17
+
18
+ ## Older changes
19
+
20
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 removed a rare threading bug and cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON supporting
11
+ language, including Python and Javascript!
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.2
1
+ 0.9.4
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014-2015 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -20,10 +20,10 @@ require 'optparse'
20
20
  require 'timeout'
21
21
  require 'fileutils'
22
22
 
23
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
24
24
  # require 'bio-logger'
25
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
26
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
27
27
  # Bio::Log::CLI.logger('stderr')
28
28
  # Bio::Log::CLI.trace('info')
29
29
 
@@ -31,7 +31,7 @@ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', v
31
31
  opts = OptionParser.new do |o|
32
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
33
33
 
34
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
35
35
  options[:ignore_missing] = true
36
36
  end
37
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -103,8 +103,8 @@ opts = OptionParser.new do |o|
103
103
  o.on_tail("--tags list", String, "Add tags") do |s|
104
104
  options[:tags] = s
105
105
  end
106
-
107
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
108
108
  options[:skip_header] = true
109
109
  end
110
110
 
@@ -127,8 +127,8 @@ opts = OptionParser.new do |o|
127
127
  o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
128
  options[:timeout] = i
129
129
  end
130
-
131
- # Uncomment the following when using the bio-logger
130
+
131
+ # Uncomment the following when using the bio-logger
132
132
  # o.separator ""
133
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
134
134
  # Bio::Log::CLI.logger(name)
@@ -137,7 +137,16 @@ opts = OptionParser.new do |o|
137
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
138
138
  # Bio::Log::CLI.trace(s)
139
139
  # end
140
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
141
150
  o.on("--statistics", "Output statistics") do |q|
142
151
  options[:statistics] = true
143
152
  options[:num_threads] = nil
@@ -146,11 +155,11 @@ opts = OptionParser.new do |o|
146
155
  # Bio::Log::CLI.trace('error')
147
156
  options[:quiet] = true
148
157
  end
149
-
158
+
150
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
151
160
  options[:verbose] = true
152
161
  end
153
-
162
+
154
163
  o.on("--debug", "Show debug messages and keep intermediate output") do |v|
155
164
  # Bio::Log::CLI.trace('debug')
156
165
  options[:debug] = true
@@ -165,11 +174,11 @@ end
165
174
  opts.parse!(ARGV)
166
175
 
167
176
  BIOVCF_VERSION=version
168
- BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015\n"
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
169
178
  $stderr.print BIOVCF_BANNER if !options[:quiet]
170
179
 
171
- if options[:show_help]
172
- print opts
180
+ if options[:show_help]
181
+ print opts
173
182
  print USAGE
174
183
  exit 1
175
184
  end
@@ -215,6 +224,7 @@ def parse_header line, samples, options
215
224
  print line if not options[:skip_header]
216
225
  STDIN.each_line do | headerline |
217
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
218
228
  line = headerline
219
229
  break # end of header
220
230
  end
@@ -229,12 +239,12 @@ def parse_header line, samples, options
229
239
  if add_filter
230
240
  print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
231
241
  end
232
-
242
+
233
243
  selected = header.column_names
234
244
  if samples
235
245
  newfields = selected[0..8]
236
246
  samples.each do |s|
237
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
238
248
  end
239
249
  selected = newfields
240
250
  end
@@ -246,6 +256,10 @@ def parse_header line, samples, options
246
256
  end
247
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
248
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
249
263
  return header,line
250
264
  end
251
265
 
@@ -291,10 +305,9 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
291
305
  if filter
292
306
  return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
293
307
  end
294
-
308
+
295
309
  if sfilter # sample 'or' filter
296
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
297
- # return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
298
311
  return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
299
312
  end
300
313
  end
@@ -321,21 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
321
334
 
322
335
  # -----------------------------
323
336
  # From here on decide on output
324
-
337
+
325
338
  rec.add_to_filter_field(add_filter) if set_filter_field
326
339
 
327
340
  if samples
328
341
  # Select certain samples for output
329
342
  newfields = fields[0..8]
330
343
  samples.each do |s|
331
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
332
345
  end
333
346
  fields = newfields
334
347
  end
335
348
  if options[:eval] or seval
336
349
  begin
337
350
  results = nil # result string
338
- if options[:eval]
351
+ if options[:eval]
339
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
340
353
  results = res if res
341
354
  end
@@ -364,11 +377,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
364
377
  rescue Exception => e
365
378
  $stderr.print e,": ",fields,"\n"
366
379
  $stderr.print e.backtrace.inspect if options[:verbose]
367
- raise
380
+ raise
368
381
  end
369
382
  elsif options[:rewrite]
370
383
  # Default behaviour prints VCF line, but rewrite info
371
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
372
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
373
386
  elsif stats
374
387
  # do nothing
@@ -390,7 +403,7 @@ line_number=0
390
403
 
391
404
  if options[:bed]
392
405
  bedfilter = BedFilter.new(options[:bed])
393
- end
406
+ end
394
407
 
395
408
  begin
396
409
  # Define linear parser function (going through one chunk)
@@ -415,7 +428,11 @@ begin
415
428
  # ---- Parse the header lines (chomps from STDIN)
416
429
  # and returns header info and the current line
417
430
  if line =~ /^#/
418
- header,line = parse_header(line,samples,options)
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, to there are no records to process
434
+ break
435
+ end
419
436
  end
420
437
  # p [line_number,line]
421
438
  # ---- After the header continue processing
@@ -428,7 +445,7 @@ begin
428
445
  print template.header(binding) if template
429
446
  header_output_completed = true
430
447
  end
431
-
448
+
432
449
  if options[:eval_once]
433
450
  # this happens if we only want one line evaluated - say to get
434
451
  # the number of samples
@@ -442,7 +459,7 @@ begin
442
459
 
443
460
  # ---- In the following section the VCF lines are parsed by chunks
444
461
  # The chunks may go into different threads
445
-
462
+
446
463
  if chunk_lines.size >= CHUNK_SIZE
447
464
  # ---- process one chunk
448
465
  $stderr.print '.' if not options[:quiet]
@@ -456,14 +473,16 @@ begin
456
473
  pcows.submit_final_worker(process,chunk_lines)
457
474
  pcows.wait_for_workers()
458
475
  pcows.process_remaining_output()
459
-
476
+
460
477
  print template.footer(binding) if template
461
478
  stats.print if stats
462
479
 
463
480
  rescue Exception => e
464
- $stderr.print e.message,"\n" if e.message != 'exit'
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
465
485
  pcows.cleanup()
466
486
  raise if options[:verbose]
467
487
  exit 1
468
488
  end
469
-
@@ -31,7 +31,7 @@ Gem::Specification.new do |s|
31
31
  s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
32
32
  Dir['template/**/*']
33
33
 
34
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
34
+ s.homepage = "http://github.com/vcflib/bio-vcf"
35
35
  s.licenses = ["MIT"]
36
36
  s.require_paths = ["lib"]
37
37
  s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
@@ -68,4 +68,7 @@ Feature: Command-line interface (CLI)
68
68
  When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
69
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
70
70
 
71
-
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -7,15 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
17
 
18
18
 
19
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
- end
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -124,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
124
124
  end
125
125
 
126
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
127
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
128
128
  end
129
129
 
130
130
 
@@ -114,26 +114,22 @@ class PCOWS
114
114
  end
115
115
  }
116
116
  if @output_locked
117
- # ---- is the other thread still running?
117
+ # ---- is the other thread still running? We wait until it
118
+ # is finished to start the next one
118
119
  (pid,count,fn) = @output_locked
119
120
  $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
120
121
  return if File.exist?(fn) # continue because thread still processing
121
122
  # Now we should remove the .keep file
122
- if not @debug
123
- sleep 0.1 # give it a little time
124
- keep = fn+'.keep'
125
- if File.exist?(keep)
126
- $stderr.print "Removing #{keep}\n" if not @quiet
127
- File.unlink(keep)
128
- end
129
- end
123
+ cleanup_keep_file(fn)
130
124
  @last_output += 1 # get next one in line
131
125
  @output_locked = false
132
126
  end
133
- # Still processing
127
+ # ---- process the next output chunk. After completion it
128
+ # gets renamed to chunk.keep. This is to avoid missing
129
+ # output (if we unlink the file prematurely)
134
130
  if info = @pid_list[@last_output]
135
131
  (pid,count,fn) = info
136
- $stderr.print "Testing for output file ",[info],"\n" if @debug
132
+ $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
137
133
  if File.exist?(fn)
138
134
  # Yes! We have the next output, create outputter
139
135
  @output_locked = info
@@ -142,26 +138,15 @@ class PCOWS
142
138
  $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
143
139
  pid = fork do
144
140
  output.call(fn)
141
+ # after finishing output move it to .keep
145
142
  FileUtils::mv(fn,fn+'.keep')
146
- # if not @debug
147
- # $stderr.print "Removing #{fn}\n" if not @quiet
148
- # File.unlink(fn)
149
- # else
150
- # FileUtils::mv(fn,fn+'.keep')
151
- # end
152
-
153
143
  exit(0)
154
144
  end
155
145
  Process.detach(pid)
156
146
  else
157
147
  $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
158
148
  output.call(fn)
159
- if not @debug
160
- $stderr.print "Removing #{fn}\n" if not @quiet
161
- File.unlink(fn)
162
- else
163
- FileUtils::mv(fn,fn+'.keep')
164
- end
149
+ FileUtils::mv(fn,fn+'.keep')
165
150
  end
166
151
  else
167
152
  sleep 0.2
@@ -185,7 +170,7 @@ class PCOWS
185
170
  end
186
171
  # Partial file should have been renamed:
187
172
  raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
188
- $stderr.print "OK pid=#{pid}, processing output of #{fn}\n" if not @quiet
173
+ $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
189
174
  rescue Timeout::Error
190
175
  # Kill it to speed up exit
191
176
  Process.kill 9, pid
@@ -222,6 +207,10 @@ class PCOWS
222
207
  sleep 0.2
223
208
  end
224
209
  end
210
+ while @output_locked
211
+ sleep 0.1
212
+ process_output(nil,:by_line,true)
213
+ end
225
214
  cleanup_tmpdir()
226
215
  end
227
216
 
@@ -240,6 +229,7 @@ class PCOWS
240
229
  end
241
230
  end
242
231
  File.unlink(fn) if File.exist?(fn)
232
+ cleanup_keep_file(fn,wait: false)
243
233
  tempfn = fn+'.'+RUNNINGEXT
244
234
  File.unlink(tempfn) if File.exist?(tempfn)
245
235
  end
@@ -287,6 +277,22 @@ class PCOWS
287
277
  1
288
278
  end
289
279
 
280
+ def cleanup_keep_file(fn, opts = { wait: true })
281
+ if not @debug
282
+ keep = fn+'.keep'
283
+ return if not opts[:wait] and !File.exist?(keep)
284
+ $stderr.print "Trying to remove #{keep}\n" if not @quiet
285
+ while true
286
+ if File.exist?(keep)
287
+ $stderr.print "Removing #{keep}\n" if not @quiet
288
+ File.unlink(keep)
289
+ break # forever loop
290
+ end
291
+ sleep 0.1
292
+ end #forever
293
+ end
294
+ end
295
+
290
296
  def cleanup_tmpdir
291
297
  if not @debug
292
298
  $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet