bio-vcf 0.8.1 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +2 -8
  4. data/LICENSE.txt +1 -1
  5. data/README.md +467 -129
  6. data/RELEASE_NOTES.md +27 -0
  7. data/RELEASE_NOTES.md~ +11 -0
  8. data/Rakefile +9 -42
  9. data/TAGS +115 -0
  10. data/VERSION +1 -1
  11. data/bin/bio-vcf +156 -108
  12. data/bio-vcf.gemspec +13 -75
  13. data/features/cli.feature +22 -4
  14. data/features/diff_count.feature +0 -1
  15. data/features/filter.feature +12 -0
  16. data/features/multisample.feature +12 -0
  17. data/features/somaticsniper.feature +2 -0
  18. data/features/step_definitions/cli-feature.rb +15 -6
  19. data/features/step_definitions/diff_count.rb +1 -1
  20. data/features/step_definitions/multisample.rb +19 -0
  21. data/features/step_definitions/somaticsniper.rb +9 -1
  22. data/features/step_definitions/vcf_header.rb +48 -0
  23. data/features/support/env.rb +1 -11
  24. data/features/vcf_header.feature +35 -0
  25. data/lib/bio-vcf.rb +1 -0
  26. data/lib/bio-vcf/pcows.rb +303 -0
  27. data/lib/bio-vcf/vcffile.rb +46 -0
  28. data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
  29. data/lib/bio-vcf/vcfheader.rb +137 -5
  30. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  31. data/lib/bio-vcf/vcfrecord.rb +56 -18
  32. data/lib/bio-vcf/vcfsample.rb +26 -2
  33. data/lib/regressiontest.rb +11 -0
  34. data/lib/regressiontest/cli_exec.rb +101 -0
  35. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  36. data/ragel/generate.sh +8 -0
  37. data/template/vcf2json.erb +16 -16
  38. data/template/vcf2json_full_header.erb +22 -0
  39. data/template/vcf2json_use_meta.erb +41 -0
  40. data/test/data/input/empty.vcf +2 -0
  41. data/test/data/input/gatk_exome.vcf +237 -0
  42. data/test/data/input/gatk_wgs.vcf +1000 -0
  43. data/test/data/input/test.bed +632 -0
  44. data/test/data/regression/empty-stderr.new +12 -0
  45. data/test/data/regression/empty.new +2 -0
  46. data/test/data/regression/empty.ref +2 -0
  47. data/test/data/regression/eval_once-stderr.new +2 -0
  48. data/test/data/regression/eval_once.new +1 -0
  49. data/test/data/regression/eval_once.ref +1 -0
  50. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  51. data/test/data/regression/eval_r.info.dp.new +150 -0
  52. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  53. data/test/data/regression/ifilter_s.dp.new +31 -0
  54. data/test/data/regression/pass1-stderr.new +10 -0
  55. data/test/data/regression/pass1.new +88 -0
  56. data/test/data/regression/pass1.ref +88 -0
  57. data/test/data/regression/r.info.dp-stderr.new +4 -0
  58. data/test/data/regression/r.info.dp.new +114 -0
  59. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  60. data/test/data/regression/rewrite.info.sample.new +150 -0
  61. data/test/data/regression/s.dp-stderr.new +18 -0
  62. data/test/data/regression/s.dp.new +145 -0
  63. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  64. data/test/data/regression/seval_s.dp.new +36 -0
  65. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  66. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  67. data/test/data/regression/thread4-stderr.new +10 -0
  68. data/test/data/regression/thread4.new +150 -0
  69. data/test/data/regression/thread4_4-stderr.new +25 -0
  70. data/test/data/regression/thread4_4.new +130 -0
  71. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  72. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
  73. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  74. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  75. data/test/data/regression/vcf2json_full_header.new +225 -0
  76. data/test/data/regression/vcf2json_full_header.ref +225 -0
  77. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  78. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  79. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  80. data/test/performance/metrics.md +18 -1
  81. data/test/stress/stress_test.sh +15 -0
  82. data/test/tmp/test.vcf +12469 -0
  83. metadata +63 -64
  84. data/Gemfile.lock +0 -81
@@ -1,18 +1,13 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
1
+ # No longer generated by jeweler
4
2
  # -*- encoding: utf-8 -*-
5
- # stub: bio-vcf 0.8.1 ruby lib
6
3
 
7
4
  Gem::Specification.new do |s|
8
5
  s.name = "bio-vcf"
9
- s.version = "0.8.1"
6
+ s.version = File.read("VERSION")
10
7
 
11
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
- s.require_paths = ["lib"]
13
9
  s.authors = ["Pjotr Prins"]
14
- s.date = "2014-11-26"
15
- s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
10
+ s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
16
11
  s.email = "pjotr.public01@thebird.nl"
17
12
  s.executables = ["bio-vcf"]
18
13
  s.extra_rdoc_files = [
@@ -22,82 +17,25 @@ Gem::Specification.new do |s|
22
17
  s.files = [
23
18
  ".travis.yml",
24
19
  "Gemfile",
25
- "Gemfile.lock",
26
20
  "LICENSE.txt",
27
21
  "README.md",
28
22
  "Rakefile",
29
23
  "VERSION",
30
24
  "bin/bio-vcf",
31
25
  "bio-vcf.gemspec",
32
- "features/cli.feature",
33
- "features/diff_count.feature",
34
- "features/multisample.feature",
35
- "features/sfilter.feature",
36
- "features/somaticsniper.feature",
37
- "features/step_definitions/bio-vcf_steps.rb",
38
- "features/step_definitions/cli-feature.rb",
39
- "features/step_definitions/diff_count.rb",
40
- "features/step_definitions/multisample.rb",
41
- "features/step_definitions/sfilter.rb",
42
- "features/step_definitions/somaticsniper.rb",
43
- "features/support/env.rb",
44
- "lib/bio-vcf.rb",
45
- "lib/bio-vcf/bedfilter.rb",
46
- "lib/bio-vcf/template.rb",
47
- "lib/bio-vcf/utils.rb",
48
- "lib/bio-vcf/variant.rb",
49
- "lib/bio-vcf/vcf.rb",
50
- "lib/bio-vcf/vcfgenotypefield.rb",
51
- "lib/bio-vcf/vcfheader.rb",
52
- "lib/bio-vcf/vcfline.rb",
53
- "lib/bio-vcf/vcfrdf.rb",
54
- "lib/bio-vcf/vcfrecord.rb",
55
- "lib/bio-vcf/vcfsample.rb",
56
- "lib/bio-vcf/vcfstatistics.rb",
57
- "template/gatk_vcf2rdf.erb",
58
- "template/vcf2json.erb",
59
- "template/vcf2rdf.erb",
60
- "template/vcf2rdf_header.erb",
61
- "test/data/input/dbsnp.vcf",
62
- "test/data/input/multisample.vcf",
63
- "test/data/input/somaticsniper.vcf",
64
- "test/data/regression/eval_r.info.dp.ref",
65
- "test/data/regression/ifilter_s.dp.ref",
66
- "test/data/regression/r.info.dp.ref",
67
- "test/data/regression/rewrite.info.sample.ref",
68
- "test/data/regression/s.dp.ref",
69
- "test/data/regression/seval_s.dp.ref",
70
- "test/data/regression/sfilter_seval_s.dp.ref",
71
- "test/data/regression/thread4.ref",
72
- "test/data/regression/thread4_4.ref",
73
- "test/data/regression/thread4_4_failed_filter-stderr.ref",
74
- "test/performance/metrics.md"
26
+ "ragel/gen_vcfheaderline_parser.rl",
27
+ "ragel/generate.sh",
75
28
  ]
76
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
29
+ s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
30
+ s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
31
+ Dir['template/**/*']
32
+
33
+ s.homepage = "http://github.com/vcflib/bio-vcf"
77
34
  s.licenses = ["MIT"]
35
+ s.require_paths = ["lib"]
78
36
  s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
79
- s.rubygems_version = "2.2.2"
80
- s.summary = "Fast multi-threaded VCF parser"
81
-
82
- if s.respond_to? :specification_version then
83
- s.specification_version = 4
37
+ # s.rubygems_version = "2.0.3"
38
+ s.summary = "Fast multi-purpose multi-threaded VCF parser"
84
39
 
85
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
86
- s.add_development_dependency(%q<rspec>, [">= 0"])
87
- s.add_development_dependency(%q<cucumber>, [">= 0"])
88
- s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
89
- s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
90
- else
91
- s.add_dependency(%q<rspec>, [">= 0"])
92
- s.add_dependency(%q<cucumber>, [">= 0"])
93
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
94
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
95
- end
96
- else
97
- s.add_dependency(%q<rspec>, [">= 0"])
98
- s.add_dependency(%q<cucumber>, [">= 0"])
99
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
100
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
101
- end
102
40
  end
103
41
 
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
11
11
  Scenario: Test the info filter using dp and threads
12
12
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
13
  When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
- Then I expect the named output to match the named output "thread4"
14
+ Then I expect the named output to match the named output "thread4" in under 30 seconds
15
15
 
16
16
  Scenario: Test the info filter using dp and threads with lines
17
17
  Given I have input file(s) named "test/data/input/multisample.vcf"
18
18
  When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
- Then I expect the named output to match the named output "thread4_4"
19
+ Then I expect the named output to match the named output "thread4_4" in under 30 seconds
20
20
 
21
21
  Scenario: Test the sample filter using dp
22
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
43
43
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
44
44
  Then I expect the named output to match the named output "sfilter_seval_s.dp"
45
45
 
46
-
47
46
  Scenario: Rewrite an info field
48
47
  Given I have input file(s) named "test/data/input/multisample.vcf"
49
48
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
50
49
  Then I expect the named output to match the named output "rewrite.info.sample"
51
50
 
51
+ Scenario: Test eval-once
52
+ Given I have input file(s) named "test/data/input/multisample.vcf"
53
+ When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
54
+ Then I expect the named output to match the named output "eval_once"
55
+
56
+ Scenario: Test JSON output with header meta data
57
+ Given I have input file(s) named "test/data/input/multisample.vcf"
58
+ When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
59
+ Then I expect the named output to match the named output "vcf2json_full_header"
60
+
61
+ Scenario: Test JSON output with header meta data and query samples
62
+ Given I have input file(s) named "test/data/input/multisample.vcf"
63
+ When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
64
+ Then I expect the named output to match the named output "vcf2json_use_meta"
65
+
52
66
  Scenario: Test deadlock on failed filter with threads
53
67
  Given I have input file(s) named "test/data/input/multisample.vcf"
54
- When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
68
+ When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
55
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
56
70
 
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -0,0 +1,12 @@
1
+ @filter
2
+ Feature: Adding filters
3
+
4
+ bio-vcf can add soft filters. Rather than removing failing items we can
5
+ inject filter state into the FILTER field. To add state such as PASS or
6
+ LowDepth simply use a filter and the --add-filter switch. If a filter already
7
+ has state the new one is appended with a semi-colon.
8
+
9
+ Scenario: Test adding a PASS soft filter using normal and tumor dp
10
+ Given I have input file(s) named "test/data/input/somaticsniper.vcf"
11
+ When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
12
+ Then I expect the named output to match the named output "pass1"
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
25
25
  And I expect rec.info.ac to be 5
26
26
  And I expect rec.info.af to be 0.357
27
27
  And I expect rec.info.dp to be 1537
28
+ And I expect rec.info['dp'] to be 1537
28
29
  And I expect rec.info.readposranksum to be 0.815
30
+ And I expect rec.info['ReadPosRankSum'] to be 0.815
31
+ And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
29
32
  And I expect rec.sample['Original'].ad to be [189,25]
30
33
  And I expect rec.sample['Original'].gt to be "0/1"
31
34
  And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -76,3 +79,12 @@ Feature: Multi-sample VCF
76
79
  And I expect r.original.gts to be ["C","G"]
77
80
  And I expect r.original.gts[0] to be "C"
78
81
  And I expect r.original.gts[1] to be "G"
82
+
83
+ # INFO fields with matching tails
84
+ Given multisample vcf line
85
+ """
86
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
87
+ """
88
+ When I parse the record
89
+ Then I expect r.info.end to be 111
90
+ And I expect r.info.ciend to be 999
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
46
46
  And I expect rec.tumor.amq.to_ary to be [37,37]
47
47
  And I expect rec.tumor.mq to be 37
48
48
  And I expect rec.tumor.ss to be 2
49
+ And I expect rec.tumor.ssc to be 33
50
+ And I expect rec.normal.ssc to be nil
49
51
  # The following are additional functions
50
52
  And I expect rec.call_diff to be [-4,2,-2,0]
51
53
  And I expect rec.call_nuc to be "C"
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
+
18
+
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
37
37
  expect(@rec1.info.readposranksum).to eq 0.815
38
38
  end
39
39
 
40
+ Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
41
+ expect(@rec1.info['dp']).to eq 1537
42
+ end
43
+
44
+ Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
45
+ expect(@rec1.info['ReadPosRankSum']).to eq 0.815
46
+ end
47
+
48
+ Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
49
+ expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
50
+ end
51
+
40
52
  Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
41
53
  expect(@rec1.sample['Original'].gt).to eq "0/1"
42
54
  end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
161
173
  expect(@rec1.original.gts[arg1.to_i]).to eq arg2
162
174
  end
163
175
 
176
+ Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
177
+ expect(@rec1.info.end).to eq arg1.to_i
178
+ end
179
+
180
+ Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
181
+ expect(@rec1.info.ciend).to eq arg1.to_i
182
+ end
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
99
99
  end
100
100
 
101
101
 
102
+ Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
103
+ expect(@rec.tumor.ssc).to be 33
104
+ end
105
+
106
+ Then(/^I expect rec\.normal\.ssc to be nil$/) do
107
+ expect(@rec.normal.ssc).to be nil
108
+ end
109
+
102
110
  Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
103
111
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
104
112
  end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
116
124
  end
117
125
 
118
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
119
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
120
128
  end
121
129
 
122
130
 
@@ -0,0 +1,48 @@
1
+ Given(/^the VCF header lines$/) do |string|
2
+ header = VcfHeader.new
3
+ header.add string
4
+ @vcf = header
5
+ end
6
+
7
+ When(/^I parse the VCF header$/) do
8
+ end
9
+
10
+ Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
11
+ expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
12
+ end
13
+
14
+ Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
15
+ expect(@vcf.fileformat).to eq arg1
16
+ end
17
+
18
+ Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
19
+ expect(@vcf.fileDate).to eq arg1
20
+ end
21
+
22
+ Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
23
+ expect(@vcf.field['fileDate']).to eq arg1
24
+ end
25
+
26
+ Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
27
+ expect(@vcf.phasing).to eq arg1
28
+ end
29
+
30
+ Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
31
+ expect(@vcf.reference).to eq arg1
32
+ end
33
+
34
+ Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
35
+ expect(@vcf.format[arg1].to_s).to eq arg2
36
+ end
37
+
38
+ Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
39
+ expect(@vcf.info[arg1].to_s).to eq arg2
40
+ end
41
+
42
+ Then(/^I expect vcf\.meta to contain all header meta information$/) do
43
+ m = @vcf.meta
44
+ expect(m['fileformat']).to eq "VCFv4.1"
45
+ expect(m['FORMAT']['DP']['Number']).to eq "1"
46
+ expect(m.size).to be 9
47
+ end
48
+
@@ -1,13 +1,3 @@
1
- require 'bundler'
2
- begin
3
- Bundler.setup(:default, :development)
4
- rescue Bundler::BundlerError => e
5
- $stderr.puts e.message
6
- $stderr.puts "Run `bundle install` to install missing gems"
7
- exit e.status_code
8
- end
9
-
10
- # require 'mini/test'
11
1
 
12
2
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
13
3
  require 'bio-vcf'
@@ -16,7 +6,7 @@ require 'rspec/expectations'
16
6
 
17
7
  # Add the regression module if in the path (it can also be a gem)
18
8
  rootdir = File.dirname(__FILE__) + '/../..'
19
- $LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
9
+ $LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib')
20
10
  require 'regressiontest'
21
11
 
22
12
  include BioVcf
@@ -0,0 +1,35 @@
1
+ @meta
2
+ Feature: Parsing VCF meta information from the header
3
+
4
+ Take a header and parse that information as defined by the VCF standard.
5
+
6
+ Scenario: When parsing a header line
7
+
8
+ Given the VCF header lines
9
+ """
10
+ ##fileformat=VCFv4.1
11
+ ##fileDate=20140121
12
+ ##phasing=none
13
+ ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
14
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
16
+ ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
17
+ ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
18
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
19
+ """
20
+ When I parse the VCF header
21
+ Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
22
+ And I expect vcf.fileformat to be "VCFv4.1"
23
+ And I expect vcf.fileDate to be "20140121"
24
+ And I expect vcf.field['fileDate'] to be "20140121"
25
+ And I expect vcf.phasing to be "none"
26
+ And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
27
+ And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
28
+ And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
29
+ And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
30
+ And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
31
+ And I expect vcf.meta to contain all header meta information
32
+
33
+ Scenario: When parsing the header of somatic_sniper.vcf
34
+
35
+ Do something
@@ -11,6 +11,7 @@
11
11
  require 'bio-vcf/utils'
12
12
  require 'bio-vcf/vcf'
13
13
  require 'bio-vcf/vcfsample'
14
+ require 'bio-vcf/vcfheader_line'
14
15
  require 'bio-vcf/vcfheader'
15
16
  require 'bio-vcf/vcfline'
16
17
  require 'bio-vcf/vcfgenotypefield'
@@ -0,0 +1,303 @@
1
+ # Parallel copy-on-write streaming (PCOWS)
2
+
3
+ require 'tempfile'
4
+
5
+ class PCOWS
6
+
7
+ RUNNINGEXT = 'part' # file extension
8
+
9
+ def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
10
+ num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default
11
+ # $stderr.print "Using ",num_threads,"threads \n"
12
+ @num_threads = num_threads
13
+ @chunk_size = chunk_size
14
+ @pid_list = []
15
+ @name = name
16
+ @timeout = timeout
17
+ @quiet = quiet
18
+ @debug = debug
19
+ if @debug
20
+ $stderr.print "PCOWS running in DEBUG MODE\n"
21
+ end
22
+ if multi_threaded
23
+ @tmpdir = Dir::mktmpdir(@name+'_')
24
+ end
25
+ @last_output = 0 # counter
26
+ @output_locked = false
27
+ end
28
+
29
+ # Feed the worker 'func and state' to PCOWS. Note that func is a
30
+ # lambda closure so it can pick up surrounding scope at invocation
31
+ # in addition to the data captured in 'state'.
32
+
33
+ def submit_worker(func,state)
34
+ pid = nil
35
+ if multi_threaded
36
+ count = @pid_list.size+1
37
+ fn = mktmpfilename(count)
38
+ pid = fork do
39
+ # ---- This is running a new copy-on-write process
40
+ tempfn = fn+'.'+RUNNINGEXT
41
+ STDOUT.reopen(File.open(tempfn, 'w+'))
42
+ func.call(state).each { | line | print line }
43
+ STDOUT.flush
44
+ STDOUT.close
45
+ # sleep 0.1
46
+ # f.flush
47
+ # f.close
48
+ # sleep 0.2 # interval to make sure we are done writing,
49
+ # otherwise there may be misses at the end of a
50
+ # block (maybe the f.close fixed it)
51
+
52
+ FileUtils::mv(tempfn,fn)
53
+ exit(0)
54
+ end
55
+ Process.detach(pid)
56
+ else
57
+ # ---- Single threaded: call in main process and output immediately
58
+ func.call(state).each { | line | print line }
59
+ end
60
+ @pid_list << [ pid,count,fn ]
61
+ return true
62
+ end
63
+
64
+ def submit_final_worker(func,state)
65
+ @final_worker = true
66
+ submit_worker(func,state)
67
+ end
68
+
69
+ # Make sure no more than num_threads are running at the same time -
70
+ # this is achieved by checking the PID table and the running files
71
+ # in the tmpdir
72
+
73
+ def wait_for_worker_slot()
74
+ return if single_threaded
75
+ Timeout.timeout(@timeout) do
76
+ printed_timeout_message = false
77
+ while true
78
+ # ---- count running pids
79
+ running = @pid_list.reduce(0) do | sum, info |
80
+ (pid,count,fn) = info
81
+ if pid_or_file_running?(pid,fn)
82
+ sum+1
83
+ else
84
+ sum
85
+ end
86
+ end
87
+ return if running < @num_threads
88
+ if not printed_timeout_message
89
+ $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
90
+ printed_timeout_message = true
91
+ end
92
+ sleep 0.1
93
+ end
94
+ end
95
+ end
96
+
97
+ # ---- In this section the output gets collected and passed on to a
98
+ # printer thread. This function makes sure the printing is
99
+ # ordered and that no printers are running at the same
100
+ # time. The printer thread should be doing as little processing
101
+ # as possible.
102
+ #
103
+ # In this implementation type==:by_line will call func for
104
+ # each line. Otherwise it is called once with the filename.
105
+ def process_output(func=nil,type=:by_line, blocking=false)
106
+ return if single_threaded
107
+ output = lambda { |fn|
108
+ if type == :by_line
109
+ File.new(fn).each_line { |buf|
110
+ print buf
111
+ }
112
+ else
113
+ func.call(fn)
114
+ end
115
+ }
116
+ if @output_locked
117
+ # ---- is the other thread still running? We wait until it
118
+ # is finished to start the next one
119
+ (pid,count,fn) = @output_locked
120
+ $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
121
+ return if File.exist?(fn) # continue because thread still processing
122
+ # Now we should remove the .keep file
123
+ cleanup_keep_file(fn)
124
+ @last_output += 1 # get next one in line
125
+ @output_locked = false
126
+ end
127
+ # ---- process the next output chunk. After completion it
128
+ # gets renamed to chunk.keep. This is to avoid missing
129
+ # output (if we unlink the file prematurely)
130
+ if info = @pid_list[@last_output]
131
+ (pid,count,fn) = info
132
+ $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
133
+ if File.exist?(fn)
134
+ # Yes! We have the next output, create outputter
135
+ @output_locked = info
136
+ $stderr.print "Set lock on ",[info],"\n" if not @quiet
137
+ if not blocking
138
+ $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
139
+ pid = fork do
140
+ output.call(fn)
141
+ # after finishing output move it to .keep
142
+ FileUtils::mv(fn,fn+'.keep')
143
+ exit(0)
144
+ end
145
+ Process.detach(pid)
146
+ else
147
+ $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
148
+ output.call(fn)
149
+ FileUtils::mv(fn,fn+'.keep')
150
+ end
151
+ else
152
+ sleep 0.2
153
+ end
154
+ end
155
+ end
156
+
157
+ # Wait for a single worker to complete. When working the pid is writing
158
+ # a file with extension .part(ial). After completion the file is
159
+ # renamed without .part and a slot is free.
160
+ def wait_for_worker(info)
161
+ (pid,count,fn) = info
162
+ if pid_or_file_running?(pid,fn)
163
+ $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
164
+ begin
165
+ Timeout.timeout(@timeout) do
166
+ while not File.exist?(fn) # wait for the result to appear
167
+ sleep 0.2
168
+ return if not pid_or_file_running?(pid,fn) # worker is gone
169
+ end
170
+ end
171
+ # Partial file should have been renamed:
172
+ raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
173
+ $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
174
+ rescue Timeout::Error
175
+ # Kill it to speed up exit
176
+ Process.kill 9, pid
177
+ Process.wait pid
178
+ $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
179
+ $stderr.print "Bailing out"
180
+ raise
181
+ end
182
+ end
183
+ end
184
+
185
+ # This is the final cleanup after the reader thread is done. All workers
186
+ # need to complete.
187
+
188
+ def wait_for_workers()
189
+ return if single_threaded
190
+ @pid_list.each do |info|
191
+ wait_for_worker(info)
192
+ end
193
+ end
194
+
195
+ def process_remaining_output()
196
+ return if single_threaded
197
+ $stderr.print "Processing remaining output...\n" if not @quiet
198
+ while @output_locked
199
+ sleep 0.2
200
+ process_output() # keep trying
201
+ end
202
+ @pid_list.each do |info|
203
+ (pid,count,fn) = info
204
+ while pid_or_file_running?(pid,fn) or File.exist?(fn)
205
+ $stderr.print "Trying: ",[info],"\n" if not @quiet
206
+ process_output(nil,:by_line,true)
207
+ sleep 0.2
208
+ end
209
+ end
210
+ while @output_locked
211
+ sleep 0.1
212
+ process_output(nil,:by_line,true)
213
+ end
214
+ cleanup_tmpdir()
215
+ end
216
+
217
+ def cleanup()
218
+ @pid_list.each do |info|
219
+ (pid,count,fn) = info
220
+ if pid_running?(pid)
221
+ $stderr.print "Killing child ",[info],"\n"
222
+ begin
223
+ Process.kill 9, pid
224
+ Process.wait pid
225
+ rescue Errno::ENOENT
226
+ $stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet
227
+ rescue Errno::ESRCH
228
+ $stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet
229
+ end
230
+ end
231
+ File.unlink(fn) if File.exist?(fn)
232
+ cleanup_keep_file(fn,wait: false)
233
+ tempfn = fn+'.'+RUNNINGEXT
234
+ File.unlink(tempfn) if File.exist?(tempfn)
235
+ end
236
+ cleanup_tmpdir()
237
+ end
238
+
239
+ private
240
+
241
+ def mktmpfilename(num,ext=nil)
242
+ @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
243
+ end
244
+
245
+ def pid_or_file_running?(pid,fn)
246
+ (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
247
+ end
248
+
249
+ def pid_running?(pid)
250
+ begin
251
+ fpid,status=Process.waitpid2(pid,Process::WNOHANG)
252
+ rescue Errno::ECHILD, Errno::ESRCH
253
+ return false
254
+ end
255
+ return true if nil == fpid && nil == status
256
+ return ! (status.exited? || status.signaled?)
257
+ end
258
+
259
+ def single_threaded
260
+ @num_threads == 1
261
+ end
262
+
263
+ def multi_threaded
264
+ @num_threads > 1
265
+ end
266
+
267
+ def cpu_count
268
+ begin
269
+ return File.read('/proc/cpuinfo').scan(/^processor\s*:/).size if File.exist? '/proc/cpuinfo'
270
+ # Actually, the JVM does not allow fork...
271
+ return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
272
+ rescue LoadError
273
+ # Count on MAC
274
+ return Integer `sysctl -n hw.ncpu 2>/dev/null`
275
+ end
276
+ $stderr.print "Could not determine number of CPUs" if not @quiet
277
+ 1
278
+ end
279
+
280
+ def cleanup_keep_file(fn, opts = { wait: true })
281
+ if not @debug
282
+ keep = fn+'.keep'
283
+ return if not opts[:wait] and !File.exist?(keep)
284
+ $stderr.print "Trying to remove #{keep}\n" if not @quiet
285
+ while true
286
+ if File.exist?(keep)
287
+ $stderr.print "Removing #{keep}\n" if not @quiet
288
+ File.unlink(keep)
289
+ break # forever loop
290
+ end
291
+ sleep 0.1
292
+ end #forever
293
+ end
294
+ end
295
+
296
+ def cleanup_tmpdir
297
+ if not @debug
298
+ $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
299
+ Dir.unlink(@tmpdir) if @tmpdir
300
+ end
301
+ end
302
+
303
+ end