bio-vcf 0.8.1 → 0.9.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +2 -8
  4. data/LICENSE.txt +1 -1
  5. data/README.md +467 -129
  6. data/RELEASE_NOTES.md +27 -0
  7. data/RELEASE_NOTES.md~ +11 -0
  8. data/Rakefile +9 -42
  9. data/TAGS +115 -0
  10. data/VERSION +1 -1
  11. data/bin/bio-vcf +156 -108
  12. data/bio-vcf.gemspec +13 -75
  13. data/features/cli.feature +22 -4
  14. data/features/diff_count.feature +0 -1
  15. data/features/filter.feature +12 -0
  16. data/features/multisample.feature +12 -0
  17. data/features/somaticsniper.feature +2 -0
  18. data/features/step_definitions/cli-feature.rb +15 -6
  19. data/features/step_definitions/diff_count.rb +1 -1
  20. data/features/step_definitions/multisample.rb +19 -0
  21. data/features/step_definitions/somaticsniper.rb +9 -1
  22. data/features/step_definitions/vcf_header.rb +48 -0
  23. data/features/support/env.rb +1 -11
  24. data/features/vcf_header.feature +35 -0
  25. data/lib/bio-vcf.rb +1 -0
  26. data/lib/bio-vcf/pcows.rb +303 -0
  27. data/lib/bio-vcf/vcffile.rb +46 -0
  28. data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
  29. data/lib/bio-vcf/vcfheader.rb +137 -5
  30. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  31. data/lib/bio-vcf/vcfrecord.rb +56 -18
  32. data/lib/bio-vcf/vcfsample.rb +26 -2
  33. data/lib/regressiontest.rb +11 -0
  34. data/lib/regressiontest/cli_exec.rb +101 -0
  35. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  36. data/ragel/generate.sh +8 -0
  37. data/template/vcf2json.erb +16 -16
  38. data/template/vcf2json_full_header.erb +22 -0
  39. data/template/vcf2json_use_meta.erb +41 -0
  40. data/test/data/input/empty.vcf +2 -0
  41. data/test/data/input/gatk_exome.vcf +237 -0
  42. data/test/data/input/gatk_wgs.vcf +1000 -0
  43. data/test/data/input/test.bed +632 -0
  44. data/test/data/regression/empty-stderr.new +12 -0
  45. data/test/data/regression/empty.new +2 -0
  46. data/test/data/regression/empty.ref +2 -0
  47. data/test/data/regression/eval_once-stderr.new +2 -0
  48. data/test/data/regression/eval_once.new +1 -0
  49. data/test/data/regression/eval_once.ref +1 -0
  50. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  51. data/test/data/regression/eval_r.info.dp.new +150 -0
  52. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  53. data/test/data/regression/ifilter_s.dp.new +31 -0
  54. data/test/data/regression/pass1-stderr.new +10 -0
  55. data/test/data/regression/pass1.new +88 -0
  56. data/test/data/regression/pass1.ref +88 -0
  57. data/test/data/regression/r.info.dp-stderr.new +4 -0
  58. data/test/data/regression/r.info.dp.new +114 -0
  59. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  60. data/test/data/regression/rewrite.info.sample.new +150 -0
  61. data/test/data/regression/s.dp-stderr.new +18 -0
  62. data/test/data/regression/s.dp.new +145 -0
  63. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  64. data/test/data/regression/seval_s.dp.new +36 -0
  65. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  66. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  67. data/test/data/regression/thread4-stderr.new +10 -0
  68. data/test/data/regression/thread4.new +150 -0
  69. data/test/data/regression/thread4_4-stderr.new +25 -0
  70. data/test/data/regression/thread4_4.new +130 -0
  71. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  72. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
  73. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  74. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  75. data/test/data/regression/vcf2json_full_header.new +225 -0
  76. data/test/data/regression/vcf2json_full_header.ref +225 -0
  77. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  78. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  79. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  80. data/test/performance/metrics.md +18 -1
  81. data/test/stress/stress_test.sh +15 -0
  82. data/test/tmp/test.vcf +12469 -0
  83. metadata +63 -64
  84. data/Gemfile.lock +0 -81
@@ -0,0 +1,27 @@
1
+ ## ChangeLog v0.9.5 (20210118)
2
+
3
+ + Improved README and installation instructions
4
+ + Added guix.scm build and instructions (no need for bundler)
5
+ + Moved regressiontest into tree
6
+
7
+ ## ChangeLog v0.9.4 (20201222)
8
+
9
+ This is an important maintenance release of bio-vcf:
10
+
11
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf)
12
+ + Fixed tests to match recent Ruby updates
13
+
14
+ ## Older release notes
15
+
16
+ + Getting ready for a 1.0 release
17
+ + Released 0.9.2 as a gem
18
+ + 0.9.1 removed a rare threading bug and cleanup on error
19
+ + Added support for soft filters (request by Brad Chapman)
20
+ + The outputter now writes (properly) in parallel with the parser
21
+ + bio-vcf turns any VCF into JSON with header information, and
22
+ allows you to pipe that JSON directly into any JSON-supporting
23
+ language, including Python and JavaScript!
24
+
25
+ ## Older changes
26
+
27
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 removed a rare threading bug and cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON supporting
11
+ language, including Python and Javascript!
data/Rakefile CHANGED
@@ -1,54 +1,21 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
3
+ # require 'rubygems'
12
4
  require 'rake'
5
+ # require 'cucumber/rake/task'
13
6
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "bio-vcf"
18
- gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Fast multi-threaded VCF parser}
21
- gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
22
- gem.email = "pjotr.public01@thebird.nl"
23
- gem.authors = ["Pjotr Prins"]
24
- gem.required_ruby_version = '>=2.0.0'
25
- # dependencies defined in Gemfile
26
- end
27
- Jeweler::RubygemsDotOrgTasks.new
28
-
29
- # require 'rspec/core'
30
- # require 'rspec/core/rake_task'
31
- # RSpec::Core::RakeTask.new(:spec) do |spec|
32
- # spec.pattern = FileList['spec/**/*_spec.rb']
33
- # end
34
-
35
- # RSpec::Core::RakeTask.new(:rcov) do |spec|
36
- # spec.pattern = 'spec/**/*_spec.rb'
37
- # spec.rcov = true
7
+ # Cucumber::Rake::Task.new(:features) do |t|
8
+ # t.cucumber_opts = "--bundler false"
38
9
  # end
39
10
 
40
- # require 'rake/testtask'
41
-
42
- # Rake::TestTask.new do |t|
43
- # t.pattern = "spec/*_spec.rb"
44
- # end
45
-
46
- require 'cucumber/rake/task'
47
- Cucumber::Rake::Task.new(:features)
11
+ desc 'Run cucumber' # without bundler
12
+ task :features do
13
+ sh 'cucumber features'
14
+ end
48
15
 
49
16
  task :default => :features
50
17
 
51
- task :test => [ :features ]
18
+ task :test => [ :features ]
52
19
 
53
20
  require 'rdoc/task'
54
21
  Rake::RDocTask.new do |rdoc|
data/TAGS ADDED
@@ -0,0 +1,115 @@
1
+
2
+ ./bin/bio-vcf,0
3
+
4
+ ./lib/bio-vcf.rb,0
5
+
6
+ ./lib/bio-vcf/vcfgenotypefield.rb,1553
7
+ module BioVcf::BioVcf1,0
8
+ class VcfNucleotides::BioVcf::VcfNucleotides7,167
9
+ def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
10
+ def []::BioVcf::VcfNucleotides#[]13,284
11
+ def to_ary::BioVcf::VcfNucleotides#to_ary27,628
12
+ def max::BioVcf::VcfNucleotides#max32,742
13
+ def min::BioVcf::VcfNucleotides#min37,856
14
+ def sum::BioVcf::VcfNucleotides#sum42,975
15
+ class VcfAltInfo::BioVcf::VcfAltInfo50,1082
16
+ def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
17
+ def []::BioVcf::VcfAltInfo#[]56,1194
18
+ def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
19
+ def max::BioVcf::VcfAltInfo#max75,1626
20
+ def min::BioVcf::VcfAltInfo#min79,1702
21
+ def sum::BioVcf::VcfAltInfo#sum83,1783
22
+ class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
23
+ def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
24
+ def dp4::BioVcf::VcfGenotypeField#dp496,2020
25
+ def ad::BioVcf::VcfGenotypeField#ad100,2098
26
+ def pl::BioVcf::VcfGenotypeField#pl104,2174
27
+ def bcount::BioVcf::VcfGenotypeField#bcount108,2250
28
+ def bq::BioVcf::VcfGenotypeField#bq112,2343
29
+ def amq::BioVcf::VcfGenotypeField#amq116,2424
30
+ def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
31
+ class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
32
+ def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
33
+ def []::BioVcf::VcfGenotypeFields#[]141,3021
34
+ def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
35
+
36
+ ./lib/bio-vcf/vcfrdf.rb,156
37
+ module BioVcf::BioVcf1,0
38
+ module VcfRdf::BioVcf::VcfRdf5,93
39
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
40
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
41
+
42
+ ./lib/bio-vcf/vcf.rb,27
43
+ module BioVcf::BioVcf2,1
44
+
45
+ ./lib/bio-vcf/vcfline.rb,118
46
+ module BioVcf::BioVcf1,0
47
+ module VcfLine::BioVcf::VcfLine2,16
48
+ def VcfLine.parse::BioVcf::VcfLine.parse5,82
49
+
50
+ ./lib/bio-vcf/vcfrecord.rb,1831
51
+ module BioVcf::BioVcf1,0
52
+ class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
53
+ def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
54
+ def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
55
+ module VcfRecordParser::BioVcf::VcfRecordParser18,329
56
+ def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
57
+ def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
58
+ module VcfRecordCall::BioVcf::VcfRecordCall30,592
59
+ def call_diff::BioVcf::VcfRecordCall#call_diff31,617
60
+ def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
61
+ def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
62
+ def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
63
+ def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
64
+ def index::BioVcf::VcfRecordCall#index51,1026
65
+ class VcfRecord::BioVcf::VcfRecord56,1125
66
+ attr_reader :header::BioVcf::VcfRecord#header60,1173
67
+ def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
68
+ def chrom::BioVcf::VcfRecord#chrom67,1292
69
+ def pos::BioVcf::VcfRecord#pos71,1332
70
+ def ids::BioVcf::VcfRecord#ids75,1384
71
+ def id::BioVcf::VcfRecord#id79,1443
72
+ def ref::BioVcf::VcfRecord#ref83,1476
73
+ def alt::BioVcf::VcfRecord#alt87,1524
74
+ def qual::BioVcf::VcfRecord#qual91,1582
75
+ def info::BioVcf::VcfRecord#info95,1636
76
+ def format::BioVcf::VcfRecord#format99,1711
77
+ def normal::BioVcf::VcfRecord#normal104,1848
78
+ def tumor::BioVcf::VcfRecord#tumor109,1997
79
+ def sample::BioVcf::VcfRecord#sample114,2134
80
+ def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
81
+ def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
82
+ def method_missing::BioVcf::VcfRecord#method_missing126,2341
83
+
84
+ ./lib/bio-vcf/variant.rb,470
85
+ module BioVcf::BioVcf1,0
86
+ module Variant::BioVcf::Variant3,17
87
+ def Variant.diff::BioVcf::Variant.diff5,37
88
+ def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
89
+ def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
90
+ def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
91
+ def Variant.index::BioVcf::Variant.index25,652
92
+ def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
93
+
94
+ ./lib/bio-vcf/vcfheader.rb,598
95
+ module BioVcf::BioVcf2,1
96
+ module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
97
+ def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
98
+ class VcfHeader::BioVcf::VcfHeader18,339
99
+ attr_reader :lines::BioVcf::VcfHeader#lines20,360
100
+ def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
101
+ def add::BioVcf::VcfHeader#add26,430
102
+ def version::BioVcf::VcfHeader#version30,483
103
+ def column_names::BioVcf::VcfHeader#column_names34,578
104
+ def columns::BioVcf::VcfHeader#columns38,674
105
+ def samples::BioVcf::VcfHeader#samples42,735
106
+
107
+ ./features/step_definitions/diff_count.rb,0
108
+
109
+ ./features/step_definitions/bio-vcf_steps.rb,0
110
+
111
+ ./features/step_definitions/somaticsniper.rb,0
112
+
113
+ ./features/step_definitions/multisample.rb,0
114
+
115
+ ./features/support/env.rb,0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.1
1
+ 0.9.5
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
15
15
  version = File.new(VERSION_FILENAME).read.chomp
16
16
 
17
17
  require 'bio-vcf'
18
+ require 'bio-vcf/pcows'
18
19
  require 'optparse'
19
20
  require 'timeout'
20
21
  require 'fileutils'
21
22
 
22
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
23
24
  # require 'bio-logger'
24
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
25
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
26
27
  # Bio::Log::CLI.logger('stderr')
27
28
  # Bio::Log::CLI.trace('info')
28
29
 
29
- options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
30
+ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
30
31
  opts = OptionParser.new do |o|
31
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
32
33
 
33
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
34
35
  options[:ignore_missing] = true
35
36
  end
36
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -57,6 +58,9 @@ opts = OptionParser.new do |o|
57
58
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
58
59
  options[:efilter_samples] = l
59
60
  end
61
+ o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
62
+ options[:add_filter] = name
63
+ end
60
64
 
61
65
  o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
62
66
  options[:bed] = bed
@@ -68,6 +72,9 @@ opts = OptionParser.new do |o|
68
72
  o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
69
73
  options[:eval_once] = true
70
74
  options[:eval] = cmd
75
+ # options[:num_threads] = 1
76
+ # options[:thread_lines] = 1
77
+ options[:skip_header] = true
71
78
  end
72
79
  o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
73
80
  options[:seval] = cmd
@@ -84,7 +91,7 @@ opts = OptionParser.new do |o|
84
91
  options[:rdf] = true
85
92
  options[:skip_header] = true
86
93
  end
87
- o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
94
+ o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
88
95
  options[:num_threads] = i
89
96
  end
90
97
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -96,8 +103,8 @@ opts = OptionParser.new do |o|
96
103
  o.on_tail("--tags list", String, "Add tags") do |s|
97
104
  options[:tags] = s
98
105
  end
99
-
100
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
101
108
  options[:skip_header] = true
102
109
  end
103
110
 
@@ -112,9 +119,16 @@ opts = OptionParser.new do |o|
112
119
  options[:template] = s
113
120
  options[:skip_header] = true
114
121
  end
115
-
116
-
117
- # Uncomment the following when using the bio-logger
122
+
123
+ o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
124
+ options[:tag] = true
125
+ end
126
+
127
+ o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
+ options[:timeout] = i
129
+ end
130
+
131
+ # Uncomment the following when using the bio-logger
118
132
  # o.separator ""
119
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
120
134
  # Bio::Log::CLI.logger(name)
@@ -123,7 +137,16 @@ opts = OptionParser.new do |o|
123
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
124
138
  # Bio::Log::CLI.trace(s)
125
139
  # end
126
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
127
150
  o.on("--statistics", "Output statistics") do |q|
128
151
  options[:statistics] = true
129
152
  options[:num_threads] = nil
@@ -132,14 +155,15 @@ opts = OptionParser.new do |o|
132
155
  # Bio::Log::CLI.trace('error')
133
156
  options[:quiet] = true
134
157
  end
135
-
158
+
136
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
137
160
  options[:verbose] = true
138
161
  end
139
-
140
- # o.on("--debug", "Show debug messages") do |v|
141
- # Bio::Log::CLI.trace('debug')
142
- # end
162
+
163
+ o.on("--debug", "Show debug messages and keep intermediate output") do |v|
164
+ # Bio::Log::CLI.trace('debug')
165
+ options[:debug] = true
166
+ end
143
167
 
144
168
  o.separator ""
145
169
  o.on_tail('-h', '--help', 'display this help and exit') do
@@ -150,11 +174,11 @@ end
150
174
  opts.parse!(ARGV)
151
175
 
152
176
  BIOVCF_VERSION=version
153
- BIOVCF_BANNER = "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
154
- $stderr.print BIOVCF_BANNER
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
178
+ $stderr.print BIOVCF_BANNER if !options[:quiet]
155
179
 
156
- if options[:show_help]
157
- print opts
180
+ if options[:show_help]
181
+ print opts
158
182
  print USAGE
159
183
  exit 1
160
184
  end
@@ -174,15 +198,6 @@ if options[:template]
174
198
  template = Bio::Template.new(fn)
175
199
  end
176
200
 
177
- if options[:num_threads] != 1
178
- begin
179
- require 'parallel'
180
- rescue LoadError
181
- $stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
182
- options[:num_threads] = 1
183
- end
184
- end
185
-
186
201
  stats = nil
187
202
  if options[:statistics]
188
203
  options[:num_threads] = nil
@@ -193,6 +208,8 @@ end
193
208
  raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
194
209
  raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
195
210
  raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
211
+ # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
212
+ # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
196
213
 
197
214
  if options[:samples]
198
215
  samples = options[:samples].map { |s| s.to_i }
@@ -200,13 +217,14 @@ end
200
217
 
201
218
  include BioVcf
202
219
 
203
- # Parse the header section of a VCF file
220
+ # Parse the header section of a VCF file (chomping STDIN)
204
221
  def parse_header line, samples, options
205
- header = VcfHeader.new
222
+ header = VcfHeader.new(options[:debug])
206
223
  header.add(line)
207
224
  print line if not options[:skip_header]
208
225
  STDIN.each_line do | headerline |
209
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
210
228
  line = headerline
211
229
  break # end of header
212
230
  end
@@ -214,12 +232,19 @@ def parse_header line, samples, options
214
232
  if not options[:skip_header]
215
233
  if headerline =~ /^#CHR/
216
234
  # The header before actual data contains the sample names, first inject the BioVcf meta information
217
- print header.tag(options),"\n" if not options[:skip_header]
235
+ print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
236
+ # Then the additional filter(s)
237
+ # ##FILTER=<ID=LowQual,Description="Low quality">
238
+ add_filter = options[:add_filter]
239
+ if add_filter
240
+ print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
241
+ end
242
+
218
243
  selected = header.column_names
219
244
  if samples
220
245
  newfields = selected[0..8]
221
246
  samples.each do |s|
222
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
223
248
  end
224
249
  selected = newfields
225
250
  end
@@ -231,10 +256,14 @@ def parse_header line, samples, options
231
256
  end
232
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
233
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
234
263
  return header,line
235
264
  end
236
265
 
237
- # Parse a VCF line and return the result as a string
266
+ # Parse a VCF line and return the (template) result as a string buffer
238
267
  def parse_line line,header,options,bedfilter,samples,template,stats=nil
239
268
  fields = VcfLine.parse(line)
240
269
  rec = VcfRecord.new(fields,header)
@@ -244,9 +273,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
244
273
  sfilter = options[:sfilter]
245
274
  efilter = options[:efilter]
246
275
  ifilter = options[:ifilter]
276
+ add_filter = options[:add_filter] # contains a filter name (soft filter)
247
277
  seval = options[:seval]
248
278
  ignore_missing = options[:ignore_missing]
249
279
  quiet = options[:quiet]
280
+ set_filter_field = nil
250
281
 
251
282
  if sfilter or efilter or ifilter or seval
252
283
  # check for samples
@@ -261,15 +292,27 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
261
292
  return if not bed
262
293
  end
263
294
 
264
- return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
265
-
266
- if sfilter
295
+ skip = lambda { |&m|
296
+ matched = m.call
297
+ if add_filter
298
+ set_filter_field = true if matched
299
+ false # always continue processing with an add-filter
300
+ else
301
+ not matched
302
+ end
303
+ }
304
+
305
+ if filter
306
+ return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
307
+ end
308
+
309
+ if sfilter # sample 'or' filter
267
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
268
- return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
311
+ return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
269
312
  end
270
313
  end
271
314
 
272
- if ifilter
315
+ if ifilter # include sample filter
273
316
  found = false
274
317
  rec.each_sample(options[:ifilter_samples]) do | sample |
275
318
  if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -278,12 +321,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
278
321
  end
279
322
  end
280
323
  # Skip if there are no matches
281
- return if not found
324
+ return if skip.call {found}
282
325
  end
283
326
 
284
- if efilter
327
+ if efilter # exclude sample filter
285
328
  rec.each_sample(options[:efilter_samples]) do | sample |
286
- return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
329
+ return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
287
330
  end
288
331
  end
289
332
 
@@ -291,18 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
291
334
 
292
335
  # -----------------------------
293
336
  # From here on decide on output
337
+
338
+ rec.add_to_filter_field(add_filter) if set_filter_field
339
+
294
340
  if samples
295
341
  # Select certain samples for output
296
342
  newfields = fields[0..8]
297
343
  samples.each do |s|
298
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
299
345
  end
300
346
  fields = newfields
301
347
  end
302
348
  if options[:eval] or seval
303
349
  begin
304
350
  results = nil # result string
305
- if options[:eval]
351
+ if options[:eval]
306
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
307
353
  results = res if res
308
354
  end
@@ -320,23 +366,22 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
320
366
  exit 1
321
367
  end
322
368
  return results.to_s+"\n" if results
323
- exit(1) if options[:eval_once] # <--- can this be reached?
324
369
  else
325
370
  if options[:rdf]
326
371
  # Output Turtle RDF
327
372
  VcfRdf::record(options[:id],rec,options[:tags])
328
373
  elsif options[:template]
329
- # Ruby ERB template
374
+ # Use ERB template
330
375
  begin
331
376
  template.body(binding)
332
377
  rescue Exception => e
333
378
  $stderr.print e,": ",fields,"\n"
334
379
  $stderr.print e.backtrace.inspect if options[:verbose]
335
- raise
380
+ raise
336
381
  end
337
382
  elsif options[:rewrite]
338
383
  # Default behaviour prints VCF line, but rewrite info
339
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
340
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
341
386
  elsif stats
342
387
  # do nothing
@@ -347,20 +392,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
347
392
  end
348
393
  end
349
394
 
395
+ CHUNK_SIZE = options[:thread_lines]
396
+
397
+ pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
398
+ options[:quiet],options[:debug])
350
399
  header = nil
351
400
  header_output_completed = false
352
- NUM_THREADS = options[:num_threads]
353
- CHUNK_SIZE = options[:thread_lines]
354
- CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
355
- chunks = []
356
- lines = []
401
+ chunk_lines = []
357
402
  line_number=0
358
403
 
359
404
  if options[:bed]
360
405
  bedfilter = BedFilter.new(options[:bed])
361
- end
406
+ end
362
407
 
363
408
  begin
409
+ # Define linear parser function (going through one chunk)
364
410
  process = lambda { | lines |
365
411
  res = []
366
412
  lines.each do | line |
@@ -368,73 +414,75 @@ begin
368
414
  end
369
415
  res
370
416
  }
371
- output = lambda { |collection|
372
- collection.each do | result |
373
- result.each { |line| print line }
374
- end
375
- } # end output
376
417
 
377
- print template.header(binding) if template
378
418
  # ---- Main loop
379
419
  STDIN.each_line do | line |
380
420
  line_number += 1
381
- # ---- In this section header information is handled
421
+
422
+ # ---- Skip embedded headers down the line...
382
423
  next if header_output_completed and line =~ /^#/
383
- if line =~ /^##fileformat=/ or line =~ /^#CHR/
384
- header,line = parse_header(line,samples,options)
385
- end
386
- next if line =~ /^##/ # empty file
387
- header_output_completed = true
388
- if not options[:efilter_samples] and options[:ifilter_samples]
389
- # Create exclude set as a complement of include set
390
- options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
391
- end
392
424
 
393
- # ---- In this section the VCF variant lines are parsed
394
- lines << line
395
- if NUM_THREADS == 1
396
- $stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
397
- if lines.size > CHUNK_SIZE
398
- process.call(lines).each { | l | print l }
399
- lines = []
425
+ # ---- In the following section header information is handled -
426
+ # this only happens once.
427
+
428
+ # ---- Parse the header lines (chomps from STDIN)
429
+ # and returns header info and the current line
430
+ if line =~ /^#/
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, to there are no records to process
434
+ break
400
435
  end
401
- else
402
- if lines.size > CHUNK_SIZE
403
- chunks << lines
404
- if chunks.size > CHUNK_NUM
405
- $stderr.print '.' if not options[:quiet]
406
- out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
407
- process.call(chunk)
408
- }
409
- chunks = []
410
- # Output is forked to a separate process too
411
- fork do
412
- output.call out
413
- STDOUT.flush
414
- STDOUT.close
415
- exit 0
416
- end
417
- end
418
- lines = []
436
+ end
437
+ # p [line_number,line]
438
+ # ---- After the header continue processing
439
+ if not header_output_completed
440
+ # one-time post-header processing
441
+ if not options[:efilter_samples] and options[:ifilter_samples]
442
+ # Create exclude set as a complement of include set
443
+ options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
419
444
  end
445
+ print template.header(binding) if template
446
+ header_output_completed = true
447
+ end
448
+
449
+ if options[:eval_once]
450
+ # this happens if we only want one line evaluated - say to get
451
+ # the number of samples
452
+ print parse_line(line,header,options,bedfilter,samples,template,stats)
453
+ exit 0
454
+ end
455
+
456
+ # ---- Lines are collected in one buffer and the lines buffer
457
+ # is added to the chunks list (for the threads)
458
+ chunk_lines << line
459
+
460
+ # ---- In the following section the VCF lines are parsed by chunks
461
+ # The chunks may go into different threads
462
+
463
+ if chunk_lines.size >= CHUNK_SIZE
464
+ # ---- process one chunk
465
+ $stderr.print '.' if not options[:quiet]
466
+ pcows.wait_for_worker_slot()
467
+ pcows.submit_worker(process,chunk_lines)
468
+ pcows.process_output()
469
+
470
+ chunk_lines = []
420
471
  end
421
472
  end
422
- $stderr.print '.' if not options[:quiet]
423
- if NUM_THREADS == 1
424
- process.call(lines).each { |l| print l}
425
- else
426
- chunks << lines
427
- output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
428
- process.call(chunk)
429
- }
430
- end
473
+ pcows.submit_final_worker(process,chunk_lines)
474
+ pcows.wait_for_workers()
475
+ pcows.process_remaining_output()
476
+
431
477
  print template.footer(binding) if template
432
478
  stats.print if stats
433
479
 
434
480
  rescue Exception => e
435
- # $stderr.print line
436
- $stderr.print e.message,"\n"
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
485
+ pcows.cleanup()
437
486
  raise if options[:verbose]
438
487
  exit 1
439
488
  end
440
-