bio-vcf 0.8.0 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -0,0 +1,20 @@
1
+ ## ChangeLog v0.9.4 (2020????)
2
+
3
+ This is an important maintenance release of bio-vcf:
4
+
5
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf).
6
+
7
+ ## Older release notes
8
+
9
+ + Getting ready for a 1.0 release
10
+ + Released 0.9.2 as a gem
11
+ + 0.9.1 removed a rare threading bug and cleanup on error
12
+ + Added support for soft filters (request by Brad Chapman)
13
+ + The outputter now writes (properly) in parallel with the parser
14
+ + bio-vcf turns any VCF into JSON with header information, and
15
+ allows you to pipe that JSON directly into any JSON-supporting
16
+ language, including Python and JavaScript!
17
+
18
+ ## Older changes
19
+
20
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 removed a rare threading bug and cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON-supporting
11
+ language, including Python and JavaScript!
data/Rakefile CHANGED
@@ -1,49 +1,12 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
12
4
  require 'rake'
13
5
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "bio-vcf"
18
- gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Fast multi-threaded VCF parser}
21
- gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
22
- gem.email = "pjotr.public01@thebird.nl"
23
- gem.authors = ["Pjotr Prins"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
- # require 'rspec/core'
29
- # require 'rspec/core/rake_task'
30
- # RSpec::Core::RakeTask.new(:spec) do |spec|
31
- # spec.pattern = FileList['spec/**/*_spec.rb']
32
- # end
33
-
34
- # RSpec::Core::RakeTask.new(:rcov) do |spec|
35
- # spec.pattern = 'spec/**/*_spec.rb'
36
- # spec.rcov = true
37
- # end
38
-
39
- # require 'rake/testtask'
40
-
41
- # Rake::TestTask.new do |t|
42
- # t.pattern = "spec/*_spec.rb"
43
- # end
44
-
45
6
  require 'cucumber/rake/task'
46
- Cucumber::Rake::Task.new(:features)
7
+ Cucumber::Rake::Task.new(:features) do |t|
8
+ # t.cucumber_opts = "--bundler false"
9
+ end
47
10
 
48
11
  task :default => :features
49
12
 
data/TAGS ADDED
@@ -0,0 +1,115 @@
1
+
2
+ ./bin/bio-vcf,0
3
+
4
+ ./lib/bio-vcf.rb,0
5
+
6
+ ./lib/bio-vcf/vcfgenotypefield.rb,1553
7
+ module BioVcf::BioVcf1,0
8
+ class VcfNucleotides::BioVcf::VcfNucleotides7,167
9
+ def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
10
+ def []::BioVcf::VcfNucleotides#[]13,284
11
+ def to_ary::BioVcf::VcfNucleotides#to_ary27,628
12
+ def max::BioVcf::VcfNucleotides#max32,742
13
+ def min::BioVcf::VcfNucleotides#min37,856
14
+ def sum::BioVcf::VcfNucleotides#sum42,975
15
+ class VcfAltInfo::BioVcf::VcfAltInfo50,1082
16
+ def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
17
+ def []::BioVcf::VcfAltInfo#[]56,1194
18
+ def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
19
+ def max::BioVcf::VcfAltInfo#max75,1626
20
+ def min::BioVcf::VcfAltInfo#min79,1702
21
+ def sum::BioVcf::VcfAltInfo#sum83,1783
22
+ class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
23
+ def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
24
+ def dp4::BioVcf::VcfGenotypeField#dp496,2020
25
+ def ad::BioVcf::VcfGenotypeField#ad100,2098
26
+ def pl::BioVcf::VcfGenotypeField#pl104,2174
27
+ def bcount::BioVcf::VcfGenotypeField#bcount108,2250
28
+ def bq::BioVcf::VcfGenotypeField#bq112,2343
29
+ def amq::BioVcf::VcfGenotypeField#amq116,2424
30
+ def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
31
+ class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
32
+ def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
33
+ def []::BioVcf::VcfGenotypeFields#[]141,3021
34
+ def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
35
+
36
+ ./lib/bio-vcf/vcfrdf.rb,156
37
+ module BioVcf::BioVcf1,0
38
+ module VcfRdf::BioVcf::VcfRdf5,93
39
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
40
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
41
+
42
+ ./lib/bio-vcf/vcf.rb,27
43
+ module BioVcf::BioVcf2,1
44
+
45
+ ./lib/bio-vcf/vcfline.rb,118
46
+ module BioVcf::BioVcf1,0
47
+ module VcfLine::BioVcf::VcfLine2,16
48
+ def VcfLine.parse::BioVcf::VcfLine.parse5,82
49
+
50
+ ./lib/bio-vcf/vcfrecord.rb,1831
51
+ module BioVcf::BioVcf1,0
52
+ class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
53
+ def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
54
+ def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
55
+ module VcfRecordParser::BioVcf::VcfRecordParser18,329
56
+ def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
57
+ def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
58
+ module VcfRecordCall::BioVcf::VcfRecordCall30,592
59
+ def call_diff::BioVcf::VcfRecordCall#call_diff31,617
60
+ def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
61
+ def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
62
+ def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
63
+ def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
64
+ def index::BioVcf::VcfRecordCall#index51,1026
65
+ class VcfRecord::BioVcf::VcfRecord56,1125
66
+ attr_reader :header::BioVcf::VcfRecord#header60,1173
67
+ def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
68
+ def chrom::BioVcf::VcfRecord#chrom67,1292
69
+ def pos::BioVcf::VcfRecord#pos71,1332
70
+ def ids::BioVcf::VcfRecord#ids75,1384
71
+ def id::BioVcf::VcfRecord#id79,1443
72
+ def ref::BioVcf::VcfRecord#ref83,1476
73
+ def alt::BioVcf::VcfRecord#alt87,1524
74
+ def qual::BioVcf::VcfRecord#qual91,1582
75
+ def info::BioVcf::VcfRecord#info95,1636
76
+ def format::BioVcf::VcfRecord#format99,1711
77
+ def normal::BioVcf::VcfRecord#normal104,1848
78
+ def tumor::BioVcf::VcfRecord#tumor109,1997
79
+ def sample::BioVcf::VcfRecord#sample114,2134
80
+ def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
81
+ def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
82
+ def method_missing::BioVcf::VcfRecord#method_missing126,2341
83
+
84
+ ./lib/bio-vcf/variant.rb,470
85
+ module BioVcf::BioVcf1,0
86
+ module Variant::BioVcf::Variant3,17
87
+ def Variant.diff::BioVcf::Variant.diff5,37
88
+ def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
89
+ def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
90
+ def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
91
+ def Variant.index::BioVcf::Variant.index25,652
92
+ def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
93
+
94
+ ./lib/bio-vcf/vcfheader.rb,598
95
+ module BioVcf::BioVcf2,1
96
+ module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
97
+ def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
98
+ class VcfHeader::BioVcf::VcfHeader18,339
99
+ attr_reader :lines::BioVcf::VcfHeader#lines20,360
100
+ def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
101
+ def add::BioVcf::VcfHeader#add26,430
102
+ def version::BioVcf::VcfHeader#version30,483
103
+ def column_names::BioVcf::VcfHeader#column_names34,578
104
+ def columns::BioVcf::VcfHeader#columns38,674
105
+ def samples::BioVcf::VcfHeader#samples42,735
106
+
107
+ ./features/step_definitions/diff_count.rb,0
108
+
109
+ ./features/step_definitions/bio-vcf_steps.rb,0
110
+
111
+ ./features/step_definitions/somaticsniper.rb,0
112
+
113
+ ./features/step_definitions/multisample.rb,0
114
+
115
+ ./features/support/env.rb,0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.0
1
+ 0.9.4
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
15
15
  version = File.new(VERSION_FILENAME).read.chomp
16
16
 
17
17
  require 'bio-vcf'
18
+ require 'bio-vcf/pcows'
18
19
  require 'optparse'
19
20
  require 'timeout'
20
21
  require 'fileutils'
21
22
 
22
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
23
24
  # require 'bio-logger'
24
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
25
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
26
27
  # Bio::Log::CLI.logger('stderr')
27
28
  # Bio::Log::CLI.trace('info')
28
29
 
29
- options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
30
+ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
30
31
  opts = OptionParser.new do |o|
31
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
32
33
 
33
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
34
35
  options[:ignore_missing] = true
35
36
  end
36
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -57,6 +58,13 @@ opts = OptionParser.new do |o|
57
58
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
58
59
  options[:efilter_samples] = l
59
60
  end
61
+ o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
62
+ options[:add_filter] = name
63
+ end
64
+
65
+ o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
66
+ options[:bed] = bed
67
+ end
60
68
 
61
69
  o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
62
70
  options[:eval] = cmd
@@ -64,6 +72,9 @@ opts = OptionParser.new do |o|
64
72
  o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
65
73
  options[:eval_once] = true
66
74
  options[:eval] = cmd
75
+ # options[:num_threads] = 1
76
+ # options[:thread_lines] = 1
77
+ options[:skip_header] = true
67
78
  end
68
79
  o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
69
80
  options[:seval] = cmd
@@ -80,7 +91,7 @@ opts = OptionParser.new do |o|
80
91
  options[:rdf] = true
81
92
  options[:skip_header] = true
82
93
  end
83
- o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
94
+ o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
84
95
  options[:num_threads] = i
85
96
  end
86
97
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -92,8 +103,8 @@ opts = OptionParser.new do |o|
92
103
  o.on_tail("--tags list", String, "Add tags") do |s|
93
104
  options[:tags] = s
94
105
  end
95
-
96
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
97
108
  options[:skip_header] = true
98
109
  end
99
110
 
@@ -108,9 +119,16 @@ opts = OptionParser.new do |o|
108
119
  options[:template] = s
109
120
  options[:skip_header] = true
110
121
  end
111
-
112
-
113
- # Uncomment the following when using the bio-logger
122
+
123
+ o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
124
+ options[:tag] = true
125
+ end
126
+
127
+ o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
+ options[:timeout] = i
129
+ end
130
+
131
+ # Uncomment the following when using the bio-logger
114
132
  # o.separator ""
115
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
116
134
  # Bio::Log::CLI.logger(name)
@@ -119,7 +137,16 @@ opts = OptionParser.new do |o|
119
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
120
138
  # Bio::Log::CLI.trace(s)
121
139
  # end
122
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
123
150
  o.on("--statistics", "Output statistics") do |q|
124
151
  options[:statistics] = true
125
152
  options[:num_threads] = nil
@@ -128,14 +155,15 @@ opts = OptionParser.new do |o|
128
155
  # Bio::Log::CLI.trace('error')
129
156
  options[:quiet] = true
130
157
  end
131
-
158
+
132
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
133
160
  options[:verbose] = true
134
161
  end
135
-
136
- # o.on("--debug", "Show debug messages") do |v|
137
- # Bio::Log::CLI.trace('debug')
138
- # end
162
+
163
+ o.on("--debug", "Show debug messages and keep intermediate output") do |v|
164
+ # Bio::Log::CLI.trace('debug')
165
+ options[:debug] = true
166
+ end
139
167
 
140
168
  o.separator ""
141
169
  o.on_tail('-h', '--help', 'display this help and exit') do
@@ -145,10 +173,12 @@ end
145
173
 
146
174
  opts.parse!(ARGV)
147
175
 
148
- $stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
176
+ BIOVCF_VERSION=version
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
178
+ $stderr.print BIOVCF_BANNER if !options[:quiet]
149
179
 
150
- if options[:show_help]
151
- print opts
180
+ if options[:show_help]
181
+ print opts
152
182
  print USAGE
153
183
  exit 1
154
184
  end
@@ -161,18 +191,11 @@ $stderr.print "Options: ",options,"\n" if !options[:quiet]
161
191
 
162
192
  if options[:template]
163
193
  include BioVcf::RDF
194
+ require 'bio-vcf/template'
164
195
  fn = options[:template]
165
196
  raise "No template #{fn}!" if not File.exist?(fn)
166
- template = ERB.new(File.read(fn))
167
- end
168
-
169
- if options[:num_threads] != 1
170
- begin
171
- require 'parallel'
172
- rescue LoadError
173
- $stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
174
- options[:num_threads] = 1
175
- end
197
+ # template = ERB.new(File.read(fn))
198
+ template = Bio::Template.new(fn)
176
199
  end
177
200
 
178
201
  stats = nil
@@ -185,6 +208,8 @@ end
185
208
  raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
186
209
  raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
187
210
  raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
211
+ # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
212
+ # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
188
213
 
189
214
  if options[:samples]
190
215
  samples = options[:samples].map { |s| s.to_i }
@@ -192,13 +217,14 @@ end
192
217
 
193
218
  include BioVcf
194
219
 
195
- # Parse the header section of a VCF file
220
+ # Parse the header section of a VCF file (chomping STDIN)
196
221
  def parse_header line, samples, options
197
- header = VcfHeader.new
222
+ header = VcfHeader.new(options[:debug])
198
223
  header.add(line)
199
224
  print line if not options[:skip_header]
200
225
  STDIN.each_line do | headerline |
201
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
202
228
  line = headerline
203
229
  break # end of header
204
230
  end
@@ -206,12 +232,19 @@ def parse_header line, samples, options
206
232
  if not options[:skip_header]
207
233
  if headerline =~ /^#CHR/
208
234
  # The header before actual data contains the sample names, first inject the BioVcf meta information
209
- print header.tag(options),"\n" if not options[:skip_header]
235
+ print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
236
+ # Then the additional filter(s)
237
+ # ##FILTER=<ID=LowQual,Description="Low quality">
238
+ add_filter = options[:add_filter]
239
+ if add_filter
240
+ print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
241
+ end
242
+
210
243
  selected = header.column_names
211
244
  if samples
212
245
  newfields = selected[0..8]
213
246
  samples.each do |s|
214
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
215
248
  end
216
249
  selected = newfields
217
250
  end
@@ -223,11 +256,15 @@ def parse_header line, samples, options
223
256
  end
224
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
225
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
226
263
  return header,line
227
264
  end
228
265
 
229
- # Parse a VCF line and return the result as a string
230
- def parse_line line,header,options,samples,template,stats=nil
266
+ # Parse a VCF line and return the (template) result as a string buffer
267
+ def parse_line line,header,options,bedfilter,samples,template,stats=nil
231
268
  fields = VcfLine.parse(line)
232
269
  rec = VcfRecord.new(fields,header)
233
270
  r = rec # alias
@@ -236,9 +273,11 @@ def parse_line line,header,options,samples,template,stats=nil
236
273
  sfilter = options[:sfilter]
237
274
  efilter = options[:efilter]
238
275
  ifilter = options[:ifilter]
276
+ add_filter = options[:add_filter] # contains a filter name (soft filter)
239
277
  seval = options[:seval]
240
278
  ignore_missing = options[:ignore_missing]
241
279
  quiet = options[:quiet]
280
+ set_filter_field = nil
242
281
 
243
282
  if sfilter or efilter or ifilter or seval
244
283
  # check for samples
@@ -248,15 +287,32 @@ def parse_line line,header,options,samples,template,stats=nil
248
287
 
249
288
  # --------------------------
250
289
  # Filtering and set analysis
251
- return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
252
-
253
- if sfilter
290
+ if bedfilter
291
+ bed = bedfilter.contains(rec)
292
+ return if not bed
293
+ end
294
+
295
+ skip = lambda { |&m|
296
+ matched = m.call
297
+ if add_filter
298
+ set_filter_field = true if matched
299
+ false # always continue processing with an add-filter
300
+ else
301
+ not matched
302
+ end
303
+ }
304
+
305
+ if filter
306
+ return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
307
+ end
308
+
309
+ if sfilter # sample 'or' filter
254
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
255
- return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
311
+ return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
256
312
  end
257
313
  end
258
314
 
259
- if ifilter
315
+ if ifilter # include sample filter
260
316
  found = false
261
317
  rec.each_sample(options[:ifilter_samples]) do | sample |
262
318
  if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -265,12 +321,12 @@ def parse_line line,header,options,samples,template,stats=nil
265
321
  end
266
322
  end
267
323
  # Skip if there are no matches
268
- return if not found
324
+ return if skip.call {found}
269
325
  end
270
326
 
271
- if efilter
327
+ if efilter # exclude sample filter
272
328
  rec.each_sample(options[:efilter_samples]) do | sample |
273
- return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
329
+ return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
274
330
  end
275
331
  end
276
332
 
@@ -278,18 +334,21 @@ def parse_line line,header,options,samples,template,stats=nil
278
334
 
279
335
  # -----------------------------
280
336
  # From here on decide on output
337
+
338
+ rec.add_to_filter_field(add_filter) if set_filter_field
339
+
281
340
  if samples
282
341
  # Select certain samples for output
283
342
  newfields = fields[0..8]
284
343
  samples.each do |s|
285
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
286
345
  end
287
346
  fields = newfields
288
347
  end
289
348
  if options[:eval] or seval
290
349
  begin
291
350
  results = nil # result string
292
- if options[:eval]
351
+ if options[:eval]
293
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
294
353
  results = res if res
295
354
  end
@@ -307,23 +366,22 @@ def parse_line line,header,options,samples,template,stats=nil
307
366
  exit 1
308
367
  end
309
368
  return results.to_s+"\n" if results
310
- exit(1) if options[:eval_once] # <--- can this be reached?
311
369
  else
312
370
  if options[:rdf]
313
371
  # Output Turtle RDF
314
372
  VcfRdf::record(options[:id],rec,options[:tags])
315
373
  elsif options[:template]
316
- # Ruby ERB template
374
+ # Use ERB template
317
375
  begin
318
- template.result(binding)
376
+ template.body(binding)
319
377
  rescue Exception => e
320
378
  $stderr.print e,": ",fields,"\n"
321
379
  $stderr.print e.backtrace.inspect if options[:verbose]
322
- raise
380
+ raise
323
381
  end
324
382
  elsif options[:rewrite]
325
383
  # Default behaviour prints VCF line, but rewrite info
326
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
327
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
328
386
  elsif stats
329
387
  # do nothing
@@ -334,88 +392,97 @@ def parse_line line,header,options,samples,template,stats=nil
334
392
  end
335
393
  end
336
394
 
395
+ CHUNK_SIZE = options[:thread_lines]
396
+
397
+ pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
398
+ options[:quiet],options[:debug])
337
399
  header = nil
338
400
  header_output_completed = false
339
- NUM_THREADS = options[:num_threads]
340
- CHUNK_SIZE = options[:thread_lines]
341
- CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
342
- chunks = []
343
- lines = []
401
+ chunk_lines = []
344
402
  line_number=0
345
403
 
404
+ if options[:bed]
405
+ bedfilter = BedFilter.new(options[:bed])
406
+ end
407
+
346
408
  begin
409
+ # Define linear parser function (going through one chunk)
347
410
  process = lambda { | lines |
348
411
  res = []
349
412
  lines.each do | line |
350
- res << parse_line(line,header,options,samples,template,stats)
413
+ res << parse_line(line,header,options,bedfilter,samples,template,stats)
351
414
  end
352
415
  res
353
416
  }
354
- output = lambda { |collection|
355
- collection.each do | result |
356
- result.each { |line| print line }
357
- end
358
- } # end output
359
417
 
360
418
  # ---- Main loop
361
419
  STDIN.each_line do | line |
362
420
  line_number += 1
363
- # ---- In this section header information is handled
421
+
422
+ # ---- Skip embedded headers down the line...
364
423
  next if header_output_completed and line =~ /^#/
365
- if line =~ /^##fileformat=/ or line =~ /^#CHR/
366
- header,line = parse_header(line,samples,options)
367
- end
368
- next if line =~ /^##/ # empty file
369
- header_output_completed = true
370
- if not options[:efilter_samples] and options[:ifilter_samples]
371
- # Create exclude set as a complement of include set
372
- options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
373
- end
374
424
 
375
- # ---- In this section the VCF variant lines are parsed
376
- lines << line
377
- if NUM_THREADS == 1
378
- $stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
379
- if lines.size > CHUNK_SIZE
380
- process.call(lines).each { | l | print l }
381
- lines = []
425
+ # ---- In the following section header information is handled -
426
+ # this only happens once.
427
+
428
+ # ---- Parse the header lines (chomps from STDIN)
429
+ # and returns header info and the current line
430
+ if line =~ /^#/
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, so there are no records to process
434
+ break
382
435
  end
383
- else
384
- if lines.size > CHUNK_SIZE
385
- chunks << lines
386
- if chunks.size > CHUNK_NUM
387
- $stderr.print '.' if not options[:quiet]
388
- out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
389
- process.call(chunk)
390
- }
391
- chunks = []
392
- # Output is forked to a separate process too
393
- fork do
394
- output.call out
395
- STDOUT.flush
396
- STDOUT.close
397
- exit 0
398
- end
399
- end
400
- lines = []
436
+ end
437
+ # p [line_number,line]
438
+ # ---- After the header continue processing
439
+ if not header_output_completed
440
+ # one-time post-header processing
441
+ if not options[:efilter_samples] and options[:ifilter_samples]
442
+ # Create exclude set as a complement of include set
443
+ options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
401
444
  end
445
+ print template.header(binding) if template
446
+ header_output_completed = true
447
+ end
448
+
449
+ if options[:eval_once]
450
+ # this happens if we only want one line evaluated - say to get
451
+ # the number of samples
452
+ print parse_line(line,header,options,bedfilter,samples,template,stats)
453
+ exit 0
454
+ end
455
+
456
+ # ---- Lines are collected in one buffer and the lines buffer
457
+ # is added to the chunks list (for the threads)
458
+ chunk_lines << line
459
+
460
+ # ---- In the following section the VCF lines are parsed by chunks
461
+ # The chunks may go into different threads
462
+
463
+ if chunk_lines.size >= CHUNK_SIZE
464
+ # ---- process one chunk
465
+ $stderr.print '.' if not options[:quiet]
466
+ pcows.wait_for_worker_slot()
467
+ pcows.submit_worker(process,chunk_lines)
468
+ pcows.process_output()
469
+
470
+ chunk_lines = []
402
471
  end
403
472
  end
404
- $stderr.print '.' if not options[:quiet]
405
- if NUM_THREADS == 1
406
- process.call(lines).each { |l| print l}
407
- else
408
- chunks << lines
409
- output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
410
- process.call(chunk)
411
- }
412
- end
473
+ pcows.submit_final_worker(process,chunk_lines)
474
+ pcows.wait_for_workers()
475
+ pcows.process_remaining_output()
476
+
477
+ print template.footer(binding) if template
413
478
  stats.print if stats
414
479
 
415
480
  rescue Exception => e
416
- # $stderr.print line
417
- $stderr.print e.message,"\n"
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
485
+ pcows.cleanup()
418
486
  raise if options[:verbose]
419
487
  exit 1
420
488
  end
421
-