bio-vcf 0.8.1 → 0.9.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +2 -8
  4. data/LICENSE.txt +1 -1
  5. data/README.md +467 -129
  6. data/RELEASE_NOTES.md +27 -0
  7. data/RELEASE_NOTES.md~ +11 -0
  8. data/Rakefile +9 -42
  9. data/TAGS +115 -0
  10. data/VERSION +1 -1
  11. data/bin/bio-vcf +156 -108
  12. data/bio-vcf.gemspec +13 -75
  13. data/features/cli.feature +22 -4
  14. data/features/diff_count.feature +0 -1
  15. data/features/filter.feature +12 -0
  16. data/features/multisample.feature +12 -0
  17. data/features/somaticsniper.feature +2 -0
  18. data/features/step_definitions/cli-feature.rb +15 -6
  19. data/features/step_definitions/diff_count.rb +1 -1
  20. data/features/step_definitions/multisample.rb +19 -0
  21. data/features/step_definitions/somaticsniper.rb +9 -1
  22. data/features/step_definitions/vcf_header.rb +48 -0
  23. data/features/support/env.rb +1 -11
  24. data/features/vcf_header.feature +35 -0
  25. data/lib/bio-vcf.rb +1 -0
  26. data/lib/bio-vcf/pcows.rb +303 -0
  27. data/lib/bio-vcf/vcffile.rb +46 -0
  28. data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
  29. data/lib/bio-vcf/vcfheader.rb +137 -5
  30. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  31. data/lib/bio-vcf/vcfrecord.rb +56 -18
  32. data/lib/bio-vcf/vcfsample.rb +26 -2
  33. data/lib/regressiontest.rb +11 -0
  34. data/lib/regressiontest/cli_exec.rb +101 -0
  35. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  36. data/ragel/generate.sh +8 -0
  37. data/template/vcf2json.erb +16 -16
  38. data/template/vcf2json_full_header.erb +22 -0
  39. data/template/vcf2json_use_meta.erb +41 -0
  40. data/test/data/input/empty.vcf +2 -0
  41. data/test/data/input/gatk_exome.vcf +237 -0
  42. data/test/data/input/gatk_wgs.vcf +1000 -0
  43. data/test/data/input/test.bed +632 -0
  44. data/test/data/regression/empty-stderr.new +12 -0
  45. data/test/data/regression/empty.new +2 -0
  46. data/test/data/regression/empty.ref +2 -0
  47. data/test/data/regression/eval_once-stderr.new +2 -0
  48. data/test/data/regression/eval_once.new +1 -0
  49. data/test/data/regression/eval_once.ref +1 -0
  50. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  51. data/test/data/regression/eval_r.info.dp.new +150 -0
  52. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  53. data/test/data/regression/ifilter_s.dp.new +31 -0
  54. data/test/data/regression/pass1-stderr.new +10 -0
  55. data/test/data/regression/pass1.new +88 -0
  56. data/test/data/regression/pass1.ref +88 -0
  57. data/test/data/regression/r.info.dp-stderr.new +4 -0
  58. data/test/data/regression/r.info.dp.new +114 -0
  59. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  60. data/test/data/regression/rewrite.info.sample.new +150 -0
  61. data/test/data/regression/s.dp-stderr.new +18 -0
  62. data/test/data/regression/s.dp.new +145 -0
  63. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  64. data/test/data/regression/seval_s.dp.new +36 -0
  65. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  66. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  67. data/test/data/regression/thread4-stderr.new +10 -0
  68. data/test/data/regression/thread4.new +150 -0
  69. data/test/data/regression/thread4_4-stderr.new +25 -0
  70. data/test/data/regression/thread4_4.new +130 -0
  71. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  72. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
  73. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  74. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  75. data/test/data/regression/vcf2json_full_header.new +225 -0
  76. data/test/data/regression/vcf2json_full_header.ref +225 -0
  77. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  78. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  79. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  80. data/test/performance/metrics.md +18 -1
  81. data/test/stress/stress_test.sh +15 -0
  82. data/test/tmp/test.vcf +12469 -0
  83. metadata +63 -64
  84. data/Gemfile.lock +0 -81
@@ -1,18 +1,13 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
1
+ # No longer generated by jeweler
4
2
  # -*- encoding: utf-8 -*-
5
- # stub: bio-vcf 0.8.1 ruby lib
6
3
 
7
4
  Gem::Specification.new do |s|
8
5
  s.name = "bio-vcf"
9
- s.version = "0.8.1"
6
+ s.version = File.read("VERSION")
10
7
 
11
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
- s.require_paths = ["lib"]
13
9
  s.authors = ["Pjotr Prins"]
14
- s.date = "2014-11-26"
15
- s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
10
+ s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
16
11
  s.email = "pjotr.public01@thebird.nl"
17
12
  s.executables = ["bio-vcf"]
18
13
  s.extra_rdoc_files = [
@@ -22,82 +17,25 @@ Gem::Specification.new do |s|
22
17
  s.files = [
23
18
  ".travis.yml",
24
19
  "Gemfile",
25
- "Gemfile.lock",
26
20
  "LICENSE.txt",
27
21
  "README.md",
28
22
  "Rakefile",
29
23
  "VERSION",
30
24
  "bin/bio-vcf",
31
25
  "bio-vcf.gemspec",
32
- "features/cli.feature",
33
- "features/diff_count.feature",
34
- "features/multisample.feature",
35
- "features/sfilter.feature",
36
- "features/somaticsniper.feature",
37
- "features/step_definitions/bio-vcf_steps.rb",
38
- "features/step_definitions/cli-feature.rb",
39
- "features/step_definitions/diff_count.rb",
40
- "features/step_definitions/multisample.rb",
41
- "features/step_definitions/sfilter.rb",
42
- "features/step_definitions/somaticsniper.rb",
43
- "features/support/env.rb",
44
- "lib/bio-vcf.rb",
45
- "lib/bio-vcf/bedfilter.rb",
46
- "lib/bio-vcf/template.rb",
47
- "lib/bio-vcf/utils.rb",
48
- "lib/bio-vcf/variant.rb",
49
- "lib/bio-vcf/vcf.rb",
50
- "lib/bio-vcf/vcfgenotypefield.rb",
51
- "lib/bio-vcf/vcfheader.rb",
52
- "lib/bio-vcf/vcfline.rb",
53
- "lib/bio-vcf/vcfrdf.rb",
54
- "lib/bio-vcf/vcfrecord.rb",
55
- "lib/bio-vcf/vcfsample.rb",
56
- "lib/bio-vcf/vcfstatistics.rb",
57
- "template/gatk_vcf2rdf.erb",
58
- "template/vcf2json.erb",
59
- "template/vcf2rdf.erb",
60
- "template/vcf2rdf_header.erb",
61
- "test/data/input/dbsnp.vcf",
62
- "test/data/input/multisample.vcf",
63
- "test/data/input/somaticsniper.vcf",
64
- "test/data/regression/eval_r.info.dp.ref",
65
- "test/data/regression/ifilter_s.dp.ref",
66
- "test/data/regression/r.info.dp.ref",
67
- "test/data/regression/rewrite.info.sample.ref",
68
- "test/data/regression/s.dp.ref",
69
- "test/data/regression/seval_s.dp.ref",
70
- "test/data/regression/sfilter_seval_s.dp.ref",
71
- "test/data/regression/thread4.ref",
72
- "test/data/regression/thread4_4.ref",
73
- "test/data/regression/thread4_4_failed_filter-stderr.ref",
74
- "test/performance/metrics.md"
26
+ "ragel/gen_vcfheaderline_parser.rl",
27
+ "ragel/generate.sh",
75
28
  ]
76
- s.homepage = "http://github.com/pjotrp/bioruby-vcf"
29
+ s.files += Dir['lib/**/*.rb'] + Dir['bin/*']
30
+ s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
31
+ Dir['template/**/*']
32
+
33
+ s.homepage = "http://github.com/vcflib/bio-vcf"
77
34
  s.licenses = ["MIT"]
35
+ s.require_paths = ["lib"]
78
36
  s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
79
- s.rubygems_version = "2.2.2"
80
- s.summary = "Fast multi-threaded VCF parser"
81
-
82
- if s.respond_to? :specification_version then
83
- s.specification_version = 4
37
+ # s.rubygems_version = "2.0.3"
38
+ s.summary = "Fast multi-purpose multi-threaded VCF parser"
84
39
 
85
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
86
- s.add_development_dependency(%q<rspec>, [">= 0"])
87
- s.add_development_dependency(%q<cucumber>, [">= 0"])
88
- s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
89
- s.add_development_dependency(%q<regressiontest>, ["~> 0.0.3"])
90
- else
91
- s.add_dependency(%q<rspec>, [">= 0"])
92
- s.add_dependency(%q<cucumber>, [">= 0"])
93
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
94
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
95
- end
96
- else
97
- s.add_dependency(%q<rspec>, [">= 0"])
98
- s.add_dependency(%q<cucumber>, [">= 0"])
99
- s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
100
- s.add_dependency(%q<regressiontest>, ["~> 0.0.3"])
101
- end
102
40
  end
103
41
 
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
11
11
  Scenario: Test the info filter using dp and threads
12
12
  Given I have input file(s) named "test/data/input/multisample.vcf"
13
13
  When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14
- Then I expect the named output to match the named output "thread4"
14
+ Then I expect the named output to match the named output "thread4" in under 30 seconds
15
15
 
16
16
  Scenario: Test the info filter using dp and threads with lines
17
17
  Given I have input file(s) named "test/data/input/multisample.vcf"
18
18
  When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19
- Then I expect the named output to match the named output "thread4_4"
19
+ Then I expect the named output to match the named output "thread4_4" in under 30 seconds
20
20
 
21
21
  Scenario: Test the sample filter using dp
22
22
  Given I have input file(s) named "test/data/input/multisample.vcf"
@@ -43,14 +43,32 @@ Feature: Command-line interface (CLI)
43
43
  When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
44
44
  Then I expect the named output to match the named output "sfilter_seval_s.dp"
45
45
 
46
-
47
46
  Scenario: Rewrite an info field
48
47
  Given I have input file(s) named "test/data/input/multisample.vcf"
49
48
  When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
50
49
  Then I expect the named output to match the named output "rewrite.info.sample"
51
50
 
51
+ Scenario: Test eval-once
52
+ Given I have input file(s) named "test/data/input/multisample.vcf"
53
+ When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
54
+ Then I expect the named output to match the named output "eval_once"
55
+
56
+ Scenario: Test JSON output with header meta data
57
+ Given I have input file(s) named "test/data/input/multisample.vcf"
58
+ When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
59
+ Then I expect the named output to match the named output "vcf2json_full_header"
60
+
61
+ Scenario: Test JSON output with header meta data and query samples
62
+ Given I have input file(s) named "test/data/input/multisample.vcf"
63
+ When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
64
+ Then I expect the named output to match the named output "vcf2json_use_meta"
65
+
52
66
  Scenario: Test deadlock on failed filter with threads
53
67
  Given I have input file(s) named "test/data/input/multisample.vcf"
54
- When I execute "./bin/bio-vcf --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
68
+ When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
55
69
  Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
56
70
 
71
+ Scenario: Test VCF with no records
72
+ Given I have input file(s) named "test/data/input/empty.vcf"
73
+ When I execute "./bin/bio-vcf --timeout=5"
74
+ Then I expect no errors
@@ -21,7 +21,6 @@ Feature: Variant calling (filters) - diffing nucleotide counts
21
21
  Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22
22
  When I look for the difference
23
23
  Then I expect the diff to be [0,15,0,11]
24
- And the relative diff to be [0,0.23,0,0.85]
25
24
  And I expect the defining tumor nucleotide to be "T"
26
25
  And I expect the tumor count to be 12
27
26
  When I set an inclusion threshold for the reference
@@ -0,0 +1,12 @@
1
+ @filter
2
+ Feature: Adding filters
3
+
4
+ bio-vcf can add soft filters. Rather than removing failing items we can
5
+ inject filter state into the FILTER field. To add state such as PASS or
6
+ LowDepth simply use a filter and the --add-filter switch. If a filter already
7
+ has state the new one is appended with a semi-colon.
8
+
9
+ Scenario: Test the info filter using dp and threads
10
+ Given I have input file(s) named "test/data/input/somaticsniper.vcf"
11
+ When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
12
+ Then I expect the named output to match the named output "pass1"
@@ -25,7 +25,10 @@ Feature: Multi-sample VCF
25
25
  And I expect rec.info.ac to be 5
26
26
  And I expect rec.info.af to be 0.357
27
27
  And I expect rec.info.dp to be 1537
28
+ And I expect rec.info['dp'] to be 1537
28
29
  And I expect rec.info.readposranksum to be 0.815
30
+ And I expect rec.info['ReadPosRankSum'] to be 0.815
31
+ And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
29
32
  And I expect rec.sample['Original'].ad to be [189,25]
30
33
  And I expect rec.sample['Original'].gt to be "0/1"
31
34
  And I expect rec.sample['s3t2'].ad to be [167,26]
@@ -76,3 +79,12 @@ Feature: Multi-sample VCF
76
79
  And I expect r.original.gts to be ["C","G"]
77
80
  And I expect r.original.gts[0] to be "C"
78
81
  And I expect r.original.gts[1] to be "G"
82
+
83
+ # INFO fields with matching tails
84
+ Given multisample vcf line
85
+ """
86
+ 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33
87
+ """
88
+ When I parse the record
89
+ Then I expect r.info.end to be 111
90
+ And I expect r.info.ciend to be 999
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
46
46
  And I expect rec.tumor.amq.to_ary to be [37,37]
47
47
  And I expect rec.tumor.mq to be 37
48
48
  And I expect rec.tumor.ss to be 2
49
+ And I expect rec.tumor.ssc to be 33
50
+ And I expect rec.normal.ssc to be nil
49
51
  # The following are additional functions
50
52
  And I expect rec.call_diff to be [-4,2,-2,0]
51
53
  And I expect rec.call_nuc to be "C"
@@ -7,10 +7,19 @@ When /^I execute "(.*?)"$/ do |arg1|
7
7
  @cmd = arg1 + ' < ' + @filenames[0]
8
8
  end
9
9
 
10
- Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '##BioVcf=').should be_true
12
- end
10
+ # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12
+ # end
13
13
 
14
- Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
15
- RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_true
16
- end
14
+ # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16
+ # end
17
+
18
+
19
+ # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20
+ # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21
+ # end
22
+
23
+ # Then(/^I expect no errors$/) do
24
+ # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25
+ # end
@@ -34,7 +34,7 @@ Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/
34
34
  end
35
35
 
36
36
  Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37
- res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4+'.'+arg5).to_f]
37
+ res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38
38
  expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39
39
  end
40
40
 
@@ -37,6 +37,18 @@ Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
37
37
  expect(@rec1.info.readposranksum).to eq 0.815
38
38
  end
39
39
 
40
+ Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
41
+ expect(@rec1.info['dp']).to eq 1537
42
+ end
43
+
44
+ Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
45
+ expect(@rec1.info['ReadPosRankSum']).to eq 0.815
46
+ end
47
+
48
+ Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
49
+ expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
50
+ end
51
+
40
52
  Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
41
53
  expect(@rec1.sample['Original'].gt).to eq "0/1"
42
54
  end
@@ -161,3 +173,10 @@ Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
161
173
  expect(@rec1.original.gts[arg1.to_i]).to eq arg2
162
174
  end
163
175
 
176
+ Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
177
+ expect(@rec1.info.end).to eq arg1.to_i
178
+ end
179
+
180
+ Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
181
+ expect(@rec1.info.ciend).to eq arg1.to_i
182
+ end
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
99
99
  end
100
100
 
101
101
 
102
+ Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
103
+ expect(@rec.tumor.ssc).to be 33
104
+ end
105
+
106
+ Then(/^I expect rec\.normal\.ssc to be nil$/) do
107
+ expect(@rec.normal.ssc).to be nil
108
+ end
109
+
102
110
  Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
103
111
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
104
112
  end
@@ -116,7 +124,7 @@ Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1|
116
124
  end
117
125
 
118
126
  Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
119
- expect(@rec.call_tumor_relative_count).to eq (arg1+'.'+arg2).to_f
127
+ expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f
120
128
  end
121
129
 
122
130
 
@@ -0,0 +1,48 @@
1
+ Given(/^the VCF header lines$/) do |string|
2
+ header = VcfHeader.new
3
+ header.add string
4
+ @vcf = header
5
+ end
6
+
7
+ When(/^I parse the VCF header$/) do
8
+ end
9
+
10
+ Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
11
+ expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
12
+ end
13
+
14
+ Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
15
+ expect(@vcf.fileformat).to eq arg1
16
+ end
17
+
18
+ Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
19
+ expect(@vcf.fileDate).to eq arg1
20
+ end
21
+
22
+ Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
23
+ expect(@vcf.field['fileDate']).to eq arg1
24
+ end
25
+
26
+ Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
27
+ expect(@vcf.phasing).to eq arg1
28
+ end
29
+
30
+ Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
31
+ expect(@vcf.reference).to eq arg1
32
+ end
33
+
34
+ Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
35
+ expect(@vcf.format[arg1].to_s).to eq arg2
36
+ end
37
+
38
+ Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
39
+ expect(@vcf.info[arg1].to_s).to eq arg2
40
+ end
41
+
42
+ Then(/^I expect vcf\.meta to contain all header meta information$/) do
43
+ m = @vcf.meta
44
+ expect(m['fileformat']).to eq "VCFv4.1"
45
+ expect(m['FORMAT']['DP']['Number']).to eq "1"
46
+ expect(m.size).to be 9
47
+ end
48
+
@@ -1,13 +1,3 @@
1
- require 'bundler'
2
- begin
3
- Bundler.setup(:default, :development)
4
- rescue Bundler::BundlerError => e
5
- $stderr.puts e.message
6
- $stderr.puts "Run `bundle install` to install missing gems"
7
- exit e.status_code
8
- end
9
-
10
- # require 'mini/test'
11
1
 
12
2
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
13
3
  require 'bio-vcf'
@@ -16,7 +6,7 @@ require 'rspec/expectations'
16
6
 
17
7
  # Add the regression module if in the path (it can also be a gem)
18
8
  rootdir = File.dirname(__FILE__) + '/../..'
19
- $LOAD_PATH.unshift(rootdir+'/lib',rootdir+'/../regressiontest/lib')
9
+ $LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib')
20
10
  require 'regressiontest'
21
11
 
22
12
  include BioVcf
@@ -0,0 +1,35 @@
1
+ @meta
2
+ Feature: Parsing VCF meta information from the header
3
+
4
+ Take a header and parse that information as defined by the VCF standard.
5
+
6
+ Scenario: When parsing a header line
7
+
8
+ Given the VCF header lines
9
+ """
10
+ ##fileformat=VCFv4.1
11
+ ##fileDate=20140121
12
+ ##phasing=none
13
+ ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
14
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
16
+ ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
17
+ ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
18
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
19
+ """
20
+ When I parse the VCF header
21
+ Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
22
+ And I expect vcf.fileformat to be "VCFv4.1"
23
+ And I expect vcf.fileDate to be "20140121"
24
+ And I expect vcf.field['fileDate'] to be "20140121"
25
+ And I expect vcf.phasing to be "none"
26
+ And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
27
+ And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
28
+ And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
29
+ And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
30
+ And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}'
31
+ And I expect vcf.meta to contain all header meta information
32
+
33
+ Scenario: When parsing the header of somatic_sniper.vcf
34
+
35
+ Do something
@@ -11,6 +11,7 @@
11
11
  require 'bio-vcf/utils'
12
12
  require 'bio-vcf/vcf'
13
13
  require 'bio-vcf/vcfsample'
14
+ require 'bio-vcf/vcfheader_line'
14
15
  require 'bio-vcf/vcfheader'
15
16
  require 'bio-vcf/vcfline'
16
17
  require 'bio-vcf/vcfgenotypefield'
@@ -0,0 +1,303 @@
1
+ # Parallel copy-on-write streaming (PCOWS)
2
+
3
+ require 'tempfile'
4
+
5
+ class PCOWS
6
+
7
+ RUNNINGEXT = 'part' # file extension
8
+
9
+ def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
10
+ num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default
11
+ # $stderr.print "Using ",num_threads,"threads \n"
12
+ @num_threads = num_threads
13
+ @chunk_size = chunk_size
14
+ @pid_list = []
15
+ @name = name
16
+ @timeout = timeout
17
+ @quiet = quiet
18
+ @debug = debug
19
+ if @debug
20
+ $stderr.print "PCOWS running in DEBUG MODE\n"
21
+ end
22
+ if multi_threaded
23
+ @tmpdir = Dir::mktmpdir(@name+'_')
24
+ end
25
+ @last_output = 0 # counter
26
+ @output_locked = false
27
+ end
28
+
29
+ # Feed the worker 'func and state' to PCOWS. Note that func is a
30
+ # lambda closure so it can pick up surrounding scope at invocation
31
+ # in addition to the data captured in 'state'.
32
+
33
+ def submit_worker(func,state)
34
+ pid = nil
35
+ if multi_threaded
36
+ count = @pid_list.size+1
37
+ fn = mktmpfilename(count)
38
+ pid = fork do
39
+ # ---- This is running a new copy-on-write process
40
+ tempfn = fn+'.'+RUNNINGEXT
41
+ STDOUT.reopen(File.open(tempfn, 'w+'))
42
+ func.call(state).each { | line | print line }
43
+ STDOUT.flush
44
+ STDOUT.close
45
+ # sleep 0.1
46
+ # f.flush
47
+ # f.close
48
+ # sleep 0.2 # interval to make sure we are done writing,
49
+ # otherwise there may be misses at the end of a
50
+ # block (maybe the f.close fixed it)
51
+
52
+ FileUtils::mv(tempfn,fn)
53
+ exit(0)
54
+ end
55
+ Process.detach(pid)
56
+ else
57
+ # ---- Single threaded: call in main process and output immediately
58
+ func.call(state).each { | line | print line }
59
+ end
60
+ @pid_list << [ pid,count,fn ]
61
+ return true
62
+ end
63
+
64
+ def submit_final_worker(func,state)
65
+ @final_worker = true
66
+ submit_worker(func,state)
67
+ end
68
+
69
+ # Make sure no more than num_threads are running at the same time -
70
+ # this is achieved by checking the PID table and the running files
71
+ # in the tmpdir
72
+
73
+ def wait_for_worker_slot()
74
+ return if single_threaded
75
+ Timeout.timeout(@timeout) do
76
+ printed_timeout_message = false
77
+ while true
78
+ # ---- count running pids
79
+ running = @pid_list.reduce(0) do | sum, info |
80
+ (pid,count,fn) = info
81
+ if pid_or_file_running?(pid,fn)
82
+ sum+1
83
+ else
84
+ sum
85
+ end
86
+ end
87
+ return if running < @num_threads
88
+ if not printed_timeout_message
89
+ $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
90
+ printed_timeout_message = true
91
+ end
92
+ sleep 0.1
93
+ end
94
+ end
95
+ end
96
+
97
+ # ---- In this section the output gets collected and passed on to a
98
+ # printer thread. This function makes sure the printing is
99
+ # ordered and that no printers are running at the same
100
+ # time. The printer thread should be doing as little processing
101
+ # as possible.
102
+ #
103
+ # In this implementation type==:by_line prints the output file
104
+ # line by line. Otherwise func is called once with the filename.
105
+ def process_output(func=nil,type=:by_line, blocking=false)
106
+ return if single_threaded
107
+ output = lambda { |fn|
108
+ if type == :by_line
109
+ File.new(fn).each_line { |buf|
110
+ print buf
111
+ }
112
+ else
113
+ func.call(fn)
114
+ end
115
+ }
116
+ if @output_locked
117
+ # ---- is the other thread still running? We wait until it
118
+ # is finished to start the next one
119
+ (pid,count,fn) = @output_locked
120
+ $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
121
+ return if File.exist?(fn) # continue because thread still processing
122
+ # Now we should remove the .keep file
123
+ cleanup_keep_file(fn)
124
+ @last_output += 1 # get next one in line
125
+ @output_locked = false
126
+ end
127
+ # ---- process the next output chunk. After completion it
128
+ # gets renamed to chunk.keep. This is to avoid missing
129
+ # output (if we unlink the file prematurely)
130
+ if info = @pid_list[@last_output]
131
+ (pid,count,fn) = info
132
+ $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
133
+ if File.exist?(fn)
134
+ # Yes! We have the next output, create outputter
135
+ @output_locked = info
136
+ $stderr.print "Set lock on ",[info],"\n" if not @quiet
137
+ if not blocking
138
+ $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
139
+ pid = fork do
140
+ output.call(fn)
141
+ # after finishing output move it to .keep
142
+ FileUtils::mv(fn,fn+'.keep')
143
+ exit(0)
144
+ end
145
+ Process.detach(pid)
146
+ else
147
+ $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
148
+ output.call(fn)
149
+ FileUtils::mv(fn,fn+'.keep')
150
+ end
151
+ else
152
+ sleep 0.2
153
+ end
154
+ end
155
+ end
156
+
157
+ # Wait for the given worker to complete. While working, the pid is writing
158
+ # a file with extension .part(ial). After completion the file is
159
+ # renamed without .part and a slot is free.
160
+ def wait_for_worker(info)
161
+ (pid,count,fn) = info
162
+ if pid_or_file_running?(pid,fn)
163
+ $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
164
+ begin
165
+ Timeout.timeout(@timeout) do
166
+ while not File.exist?(fn) # wait for the result to appear
167
+ sleep 0.2
168
+ return if not pid_or_file_running?(pid,fn) # worker is gone
169
+ end
170
+ end
171
+ # Partial file should have been renamed:
172
+ raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
173
+ $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
174
+ rescue Timeout::Error
175
+ # Kill it to speed up exit
176
+ Process.kill 9, pid
177
+ Process.wait pid
178
+ $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
179
+ $stderr.print "Bailing out"
180
+ raise
181
+ end
182
+ end
183
+ end
184
+
185
+ # This is the final cleanup after the reader thread is done. All workers
186
+ # need to complete.
187
+
188
+ def wait_for_workers()
189
+ return if single_threaded
190
+ @pid_list.each do |info|
191
+ wait_for_worker(info)
192
+ end
193
+ end
194
+
195
+ def process_remaining_output()
196
+ return if single_threaded
197
+ $stderr.print "Processing remaining output...\n" if not @quiet
198
+ while @output_locked
199
+ sleep 0.2
200
+ process_output() # keep trying
201
+ end
202
+ @pid_list.each do |info|
203
+ (pid,count,fn) = info
204
+ while pid_or_file_running?(pid,fn) or File.exist?(fn)
205
+ $stderr.print "Trying: ",[info],"\n" if not @quiet
206
+ process_output(nil,:by_line,true)
207
+ sleep 0.2
208
+ end
209
+ end
210
+ while @output_locked
211
+ sleep 0.1
212
+ process_output(nil,:by_line,true)
213
+ end
214
+ cleanup_tmpdir()
215
+ end
216
+
217
+ def cleanup()
218
+ @pid_list.each do |info|
219
+ (pid,count,fn) = info
220
+ if pid_running?(pid)
221
+ $stderr.print "Killing child ",[info],"\n"
222
+ begin
223
+ Process.kill 9, pid
224
+ Process.wait pid
225
+ rescue Errno::ENOENT
226
+ $stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet
227
+ rescue Errno::ESRCH
228
+ $stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet
229
+ end
230
+ end
231
+ File.unlink(fn) if File.exist?(fn)
232
+ cleanup_keep_file(fn,wait: false)
233
+ tempfn = fn+'.'+RUNNINGEXT
234
+ File.unlink(tempfn) if File.exist?(tempfn)
235
+ end
236
+ cleanup_tmpdir()
237
+ end
238
+
239
+ private
240
+
241
+ def mktmpfilename(num,ext=nil)
242
+ @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
243
+ end
244
+
245
+ def pid_or_file_running?(pid,fn)
246
+ (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
247
+ end
248
+
249
+ def pid_running?(pid)
250
+ begin
251
+ fpid,status=Process.waitpid2(pid,Process::WNOHANG)
252
+ rescue Errno::ECHILD, Errno::ESRCH
253
+ return false
254
+ end
255
+ return true if nil == fpid && nil == status
256
+ return ! (status.exited? || status.signaled?)
257
+ end
258
+
259
+ def single_threaded
260
+ @num_threads == 1
261
+ end
262
+
263
+ def multi_threaded
264
+ @num_threads > 1
265
+ end
266
+
267
+ def cpu_count
268
+ begin
269
+ return File.read('/proc/cpuinfo').scan(/^processor\s*:/).size if File.exist? '/proc/cpuinfo'
270
+ # Actually, the JVM does not allow fork...
271
+ return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
272
+ rescue LoadError
273
+ # Count on MAC
274
+ return Integer `sysctl -n hw.ncpu 2>/dev/null`
275
+ end
276
+ $stderr.print "Could not determine number of CPUs" if not @quiet
277
+ 1
278
+ end
279
+
280
+ def cleanup_keep_file(fn, opts = { wait: true })
281
+ if not @debug
282
+ keep = fn+'.keep'
283
+ return if not opts[:wait] and !File.exist?(keep)
284
+ $stderr.print "Trying to remove #{keep}\n" if not @quiet
285
+ while true
286
+ if File.exist?(keep)
287
+ $stderr.print "Removing #{keep}\n" if not @quiet
288
+ File.unlink(keep)
289
+ break # forever loop
290
+ end
291
+ sleep 0.1
292
+ end #forever
293
+ end
294
+ end
295
+
296
+ def cleanup_tmpdir
297
+ if not @debug
298
+ $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
299
+ Dir.unlink(@tmpdir) if @tmpdir
300
+ end
301
+ end
302
+
303
+ end