bio-vcf 0.8.0 → 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +4 -5
  4. data/Gemfile.lock +28 -65
  5. data/LICENSE.txt +1 -1
  6. data/README.md +387 -107
  7. data/RELEASE_NOTES.md +20 -0
  8. data/RELEASE_NOTES.md~ +11 -0
  9. data/Rakefile +3 -40
  10. data/TAGS +115 -0
  11. data/VERSION +1 -1
  12. data/bin/bio-vcf +176 -109
  13. data/bio-vcf.gemspec +14 -70
  14. data/features/cli.feature +22 -4
  15. data/features/diff_count.feature +0 -1
  16. data/features/filter.feature +12 -0
  17. data/features/multisample.feature +25 -0
  18. data/features/somaticsniper.feature +2 -0
  19. data/features/step_definitions/cli-feature.rb +15 -6
  20. data/features/step_definitions/diff_count.rb +1 -1
  21. data/features/step_definitions/multisample.rb +19 -0
  22. data/features/step_definitions/somaticsniper.rb +9 -1
  23. data/features/step_definitions/vcf_header.rb +48 -0
  24. data/features/support/env.rb +0 -9
  25. data/features/vcf_header.feature +35 -0
  26. data/lib/bio-vcf.rb +2 -0
  27. data/lib/bio-vcf/bedfilter.rb +43 -0
  28. data/lib/bio-vcf/pcows.rb +303 -0
  29. data/lib/bio-vcf/template.rb +75 -0
  30. data/lib/bio-vcf/vcffile.rb +46 -0
  31. data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
  32. data/lib/bio-vcf/vcfheader.rb +146 -6
  33. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  34. data/lib/bio-vcf/vcfrecord.rb +56 -18
  35. data/lib/bio-vcf/vcfsample.rb +27 -3
  36. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  37. data/ragel/generate.sh +8 -0
  38. data/template/vcf2json.erb +19 -7
  39. data/template/vcf2json_full_header.erb +22 -0
  40. data/template/vcf2json_use_meta.erb +41 -0
  41. data/template/vcf2rdf_header.erb +24 -0
  42. data/test/data/input/empty.vcf +2 -0
  43. data/test/data/input/gatk_exome.vcf +237 -0
  44. data/test/data/input/gatk_wgs.vcf +1000 -0
  45. data/test/data/input/test.bed +632 -0
  46. data/test/data/regression/empty-stderr.new +12 -0
  47. data/test/data/regression/empty.new +2 -0
  48. data/test/data/regression/empty.ref +2 -0
  49. data/test/data/regression/eval_once-stderr.new +2 -0
  50. data/test/data/regression/eval_once.new +1 -0
  51. data/test/data/regression/eval_once.ref +1 -0
  52. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  53. data/test/data/regression/eval_r.info.dp.new +150 -0
  54. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  55. data/test/data/regression/ifilter_s.dp.new +31 -0
  56. data/test/data/regression/pass1-stderr.new +10 -0
  57. data/test/data/regression/pass1.new +88 -0
  58. data/test/data/regression/pass1.ref +88 -0
  59. data/test/data/regression/r.info.dp-stderr.new +4 -0
  60. data/test/data/regression/r.info.dp.new +114 -0
  61. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  62. data/test/data/regression/rewrite.info.sample.new +150 -0
  63. data/test/data/regression/s.dp-stderr.new +18 -0
  64. data/test/data/regression/s.dp.new +145 -0
  65. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  66. data/test/data/regression/seval_s.dp.new +36 -0
  67. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  68. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  69. data/test/data/regression/thread4-stderr.new +10 -0
  70. data/test/data/regression/thread4.new +150 -0
  71. data/test/data/regression/thread4_4-stderr.new +25 -0
  72. data/test/data/regression/thread4_4.new +130 -0
  73. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  74. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
  75. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  76. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  77. data/test/data/regression/vcf2json_full_header.new +225 -0
  78. data/test/data/regression/vcf2json_full_header.ref +225 -0
  79. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  80. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  81. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  82. data/test/performance/metrics.md +18 -1
  83. data/test/stress/stress_test.sh +15 -0
  84. data/test/tmp/test.vcf +12469 -0
  85. metadata +65 -64
@@ -0,0 +1,20 @@
1
+ ## ChangeLog v0.9.4 (2020????)
2
+
3
+ This is an important maintenance release of bio-vcf:
4
+
5
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf).
6
+
7
+ ## Older release notes
8
+
9
+ + Getting ready for a 1.0 release
10
+ + Released 0.9.2 as a gem
11
+ + 0.9.1 removed a rare threading bug and cleanup on error
12
+ + Added support for soft filters (request by Brad Chapman)
13
+ + The outputter now writes (properly) in parallel with the parser
14
+ + bio-vcf turns any VCF into JSON with header information, and
15
+ allows you to pipe that JSON directly into any JSON-supporting
16
+ language, including Python and JavaScript!
17
+
18
+ ## Older changes
19
+
20
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 removed a rare threading bug and cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON-supporting
11
+ language, including Python and JavaScript!
data/Rakefile CHANGED
@@ -1,49 +1,12 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
12
4
  require 'rake'
13
5
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "bio-vcf"
18
- gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Fast multi-threaded VCF parser}
21
- gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
22
- gem.email = "pjotr.public01@thebird.nl"
23
- gem.authors = ["Pjotr Prins"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
- # require 'rspec/core'
29
- # require 'rspec/core/rake_task'
30
- # RSpec::Core::RakeTask.new(:spec) do |spec|
31
- # spec.pattern = FileList['spec/**/*_spec.rb']
32
- # end
33
-
34
- # RSpec::Core::RakeTask.new(:rcov) do |spec|
35
- # spec.pattern = 'spec/**/*_spec.rb'
36
- # spec.rcov = true
37
- # end
38
-
39
- # require 'rake/testtask'
40
-
41
- # Rake::TestTask.new do |t|
42
- # t.pattern = "spec/*_spec.rb"
43
- # end
44
-
45
6
  require 'cucumber/rake/task'
46
- Cucumber::Rake::Task.new(:features)
7
+ Cucumber::Rake::Task.new(:features) do |t|
8
+ # t.cucumber_opts = "--bundler false"
9
+ end
47
10
 
48
11
  task :default => :features
49
12
 
data/TAGS ADDED
@@ -0,0 +1,115 @@
1
+
2
+ ./bin/bio-vcf,0
3
+
4
+ ./lib/bio-vcf.rb,0
5
+
6
+ ./lib/bio-vcf/vcfgenotypefield.rb,1553
7
+ module BioVcf::BioVcf1,0
8
+ class VcfNucleotides::BioVcf::VcfNucleotides7,167
9
+ def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
10
+ def []::BioVcf::VcfNucleotides#[]13,284
11
+ def to_ary::BioVcf::VcfNucleotides#to_ary27,628
12
+ def max::BioVcf::VcfNucleotides#max32,742
13
+ def min::BioVcf::VcfNucleotides#min37,856
14
+ def sum::BioVcf::VcfNucleotides#sum42,975
15
+ class VcfAltInfo::BioVcf::VcfAltInfo50,1082
16
+ def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
17
+ def []::BioVcf::VcfAltInfo#[]56,1194
18
+ def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
19
+ def max::BioVcf::VcfAltInfo#max75,1626
20
+ def min::BioVcf::VcfAltInfo#min79,1702
21
+ def sum::BioVcf::VcfAltInfo#sum83,1783
22
+ class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
23
+ def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
24
+ def dp4::BioVcf::VcfGenotypeField#dp496,2020
25
+ def ad::BioVcf::VcfGenotypeField#ad100,2098
26
+ def pl::BioVcf::VcfGenotypeField#pl104,2174
27
+ def bcount::BioVcf::VcfGenotypeField#bcount108,2250
28
+ def bq::BioVcf::VcfGenotypeField#bq112,2343
29
+ def amq::BioVcf::VcfGenotypeField#amq116,2424
30
+ def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
31
+ class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
32
+ def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
33
+ def []::BioVcf::VcfGenotypeFields#[]141,3021
34
+ def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
35
+
36
+ ./lib/bio-vcf/vcfrdf.rb,156
37
+ module BioVcf::BioVcf1,0
38
+ module VcfRdf::BioVcf::VcfRdf5,93
39
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
40
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
41
+
42
+ ./lib/bio-vcf/vcf.rb,27
43
+ module BioVcf::BioVcf2,1
44
+
45
+ ./lib/bio-vcf/vcfline.rb,118
46
+ module BioVcf::BioVcf1,0
47
+ module VcfLine::BioVcf::VcfLine2,16
48
+ def VcfLine.parse::BioVcf::VcfLine.parse5,82
49
+
50
+ ./lib/bio-vcf/vcfrecord.rb,1831
51
+ module BioVcf::BioVcf1,0
52
+ class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
53
+ def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
54
+ def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
55
+ module VcfRecordParser::BioVcf::VcfRecordParser18,329
56
+ def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
57
+ def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
58
+ module VcfRecordCall::BioVcf::VcfRecordCall30,592
59
+ def call_diff::BioVcf::VcfRecordCall#call_diff31,617
60
+ def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
61
+ def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
62
+ def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
63
+ def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
64
+ def index::BioVcf::VcfRecordCall#index51,1026
65
+ class VcfRecord::BioVcf::VcfRecord56,1125
66
+ attr_reader :header::BioVcf::VcfRecord#header60,1173
67
+ def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
68
+ def chrom::BioVcf::VcfRecord#chrom67,1292
69
+ def pos::BioVcf::VcfRecord#pos71,1332
70
+ def ids::BioVcf::VcfRecord#ids75,1384
71
+ def id::BioVcf::VcfRecord#id79,1443
72
+ def ref::BioVcf::VcfRecord#ref83,1476
73
+ def alt::BioVcf::VcfRecord#alt87,1524
74
+ def qual::BioVcf::VcfRecord#qual91,1582
75
+ def info::BioVcf::VcfRecord#info95,1636
76
+ def format::BioVcf::VcfRecord#format99,1711
77
+ def normal::BioVcf::VcfRecord#normal104,1848
78
+ def tumor::BioVcf::VcfRecord#tumor109,1997
79
+ def sample::BioVcf::VcfRecord#sample114,2134
80
+ def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
81
+ def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
82
+ def method_missing::BioVcf::VcfRecord#method_missing126,2341
83
+
84
+ ./lib/bio-vcf/variant.rb,470
85
+ module BioVcf::BioVcf1,0
86
+ module Variant::BioVcf::Variant3,17
87
+ def Variant.diff::BioVcf::Variant.diff5,37
88
+ def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
89
+ def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
90
+ def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
91
+ def Variant.index::BioVcf::Variant.index25,652
92
+ def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
93
+
94
+ ./lib/bio-vcf/vcfheader.rb,598
95
+ module BioVcf::BioVcf2,1
96
+ module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
97
+ def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
98
+ class VcfHeader::BioVcf::VcfHeader18,339
99
+ attr_reader :lines::BioVcf::VcfHeader#lines20,360
100
+ def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
101
+ def add::BioVcf::VcfHeader#add26,430
102
+ def version::BioVcf::VcfHeader#version30,483
103
+ def column_names::BioVcf::VcfHeader#column_names34,578
104
+ def columns::BioVcf::VcfHeader#columns38,674
105
+ def samples::BioVcf::VcfHeader#samples42,735
106
+
107
+ ./features/step_definitions/diff_count.rb,0
108
+
109
+ ./features/step_definitions/bio-vcf_steps.rb,0
110
+
111
+ ./features/step_definitions/somaticsniper.rb,0
112
+
113
+ ./features/step_definitions/multisample.rb,0
114
+
115
+ ./features/support/env.rb,0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.0
1
+ 0.9.4
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
15
15
  version = File.new(VERSION_FILENAME).read.chomp
16
16
 
17
17
  require 'bio-vcf'
18
+ require 'bio-vcf/pcows'
18
19
  require 'optparse'
19
20
  require 'timeout'
20
21
  require 'fileutils'
21
22
 
22
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
23
24
  # require 'bio-logger'
24
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
25
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
26
27
  # Bio::Log::CLI.logger('stderr')
27
28
  # Bio::Log::CLI.trace('info')
28
29
 
29
- options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
30
+ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
30
31
  opts = OptionParser.new do |o|
31
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
32
33
 
33
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
34
35
  options[:ignore_missing] = true
35
36
  end
36
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -57,6 +58,13 @@ opts = OptionParser.new do |o|
57
58
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
58
59
  options[:efilter_samples] = l
59
60
  end
61
+ o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
62
+ options[:add_filter] = name
63
+ end
64
+
65
+ o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
66
+ options[:bed] = bed
67
+ end
60
68
 
61
69
  o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
62
70
  options[:eval] = cmd
@@ -64,6 +72,9 @@ opts = OptionParser.new do |o|
64
72
  o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
65
73
  options[:eval_once] = true
66
74
  options[:eval] = cmd
75
+ # options[:num_threads] = 1
76
+ # options[:thread_lines] = 1
77
+ options[:skip_header] = true
67
78
  end
68
79
  o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
69
80
  options[:seval] = cmd
@@ -80,7 +91,7 @@ opts = OptionParser.new do |o|
80
91
  options[:rdf] = true
81
92
  options[:skip_header] = true
82
93
  end
83
- o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
94
+ o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
84
95
  options[:num_threads] = i
85
96
  end
86
97
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -92,8 +103,8 @@ opts = OptionParser.new do |o|
92
103
  o.on_tail("--tags list", String, "Add tags") do |s|
93
104
  options[:tags] = s
94
105
  end
95
-
96
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
97
108
  options[:skip_header] = true
98
109
  end
99
110
 
@@ -108,9 +119,16 @@ opts = OptionParser.new do |o|
108
119
  options[:template] = s
109
120
  options[:skip_header] = true
110
121
  end
111
-
112
-
113
- # Uncomment the following when using the bio-logger
122
+
123
+ o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
124
+ options[:tag] = true
125
+ end
126
+
127
+ o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
+ options[:timeout] = i
129
+ end
130
+
131
+ # Uncomment the following when using the bio-logger
114
132
  # o.separator ""
115
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
116
134
  # Bio::Log::CLI.logger(name)
@@ -119,7 +137,16 @@ opts = OptionParser.new do |o|
119
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
120
138
  # Bio::Log::CLI.trace(s)
121
139
  # end
122
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
123
150
  o.on("--statistics", "Output statistics") do |q|
124
151
  options[:statistics] = true
125
152
  options[:num_threads] = nil
@@ -128,14 +155,15 @@ opts = OptionParser.new do |o|
128
155
  # Bio::Log::CLI.trace('error')
129
156
  options[:quiet] = true
130
157
  end
131
-
158
+
132
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
133
160
  options[:verbose] = true
134
161
  end
135
-
136
- # o.on("--debug", "Show debug messages") do |v|
137
- # Bio::Log::CLI.trace('debug')
138
- # end
162
+
163
+ o.on("--debug", "Show debug messages and keep intermediate output") do |v|
164
+ # Bio::Log::CLI.trace('debug')
165
+ options[:debug] = true
166
+ end
139
167
 
140
168
  o.separator ""
141
169
  o.on_tail('-h', '--help', 'display this help and exit') do
@@ -145,10 +173,12 @@ end
145
173
 
146
174
  opts.parse!(ARGV)
147
175
 
148
- $stderr.print "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
176
+ BIOVCF_VERSION=version
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
178
+ $stderr.print BIOVCF_BANNER if !options[:quiet]
149
179
 
150
- if options[:show_help]
151
- print opts
180
+ if options[:show_help]
181
+ print opts
152
182
  print USAGE
153
183
  exit 1
154
184
  end
@@ -161,18 +191,11 @@ $stderr.print "Options: ",options,"\n" if !options[:quiet]
161
191
 
162
192
  if options[:template]
163
193
  include BioVcf::RDF
194
+ require 'bio-vcf/template'
164
195
  fn = options[:template]
165
196
  raise "No template #{fn}!" if not File.exist?(fn)
166
- template = ERB.new(File.read(fn))
167
- end
168
-
169
- if options[:num_threads] != 1
170
- begin
171
- require 'parallel'
172
- rescue LoadError
173
- $stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
174
- options[:num_threads] = 1
175
- end
197
+ # template = ERB.new(File.read(fn))
198
+ template = Bio::Template.new(fn)
176
199
  end
177
200
 
178
201
  stats = nil
@@ -185,6 +208,8 @@ end
185
208
  raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
186
209
  raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
187
210
  raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
211
+ # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
212
+ # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
188
213
 
189
214
  if options[:samples]
190
215
  samples = options[:samples].map { |s| s.to_i }
@@ -192,13 +217,14 @@ end
192
217
 
193
218
  include BioVcf
194
219
 
195
- # Parse the header section of a VCF file
220
+ # Parse the header section of a VCF file (chomping STDIN)
196
221
  def parse_header line, samples, options
197
- header = VcfHeader.new
222
+ header = VcfHeader.new(options[:debug])
198
223
  header.add(line)
199
224
  print line if not options[:skip_header]
200
225
  STDIN.each_line do | headerline |
201
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
202
228
  line = headerline
203
229
  break # end of header
204
230
  end
@@ -206,12 +232,19 @@ def parse_header line, samples, options
206
232
  if not options[:skip_header]
207
233
  if headerline =~ /^#CHR/
208
234
  # The header before actual data contains the sample names, first inject the BioVcf meta information
209
- print header.tag(options),"\n" if not options[:skip_header]
235
+ print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
236
+ # Then the additional filter(s)
237
+ # ##FILTER=<ID=LowQual,Description="Low quality">
238
+ add_filter = options[:add_filter]
239
+ if add_filter
240
+ print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
241
+ end
242
+
210
243
  selected = header.column_names
211
244
  if samples
212
245
  newfields = selected[0..8]
213
246
  samples.each do |s|
214
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
215
248
  end
216
249
  selected = newfields
217
250
  end
@@ -223,11 +256,15 @@ def parse_header line, samples, options
223
256
  end
224
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
225
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
226
263
  return header,line
227
264
  end
228
265
 
229
- # Parse a VCF line and return the result as a string
230
- def parse_line line,header,options,samples,template,stats=nil
266
+ # Parse a VCF line and return the (template) result as a string buffer
267
+ def parse_line line,header,options,bedfilter,samples,template,stats=nil
231
268
  fields = VcfLine.parse(line)
232
269
  rec = VcfRecord.new(fields,header)
233
270
  r = rec # alias
@@ -236,9 +273,11 @@ def parse_line line,header,options,samples,template,stats=nil
236
273
  sfilter = options[:sfilter]
237
274
  efilter = options[:efilter]
238
275
  ifilter = options[:ifilter]
276
+ add_filter = options[:add_filter] # contains a filter name (soft filter)
239
277
  seval = options[:seval]
240
278
  ignore_missing = options[:ignore_missing]
241
279
  quiet = options[:quiet]
280
+ set_filter_field = nil
242
281
 
243
282
  if sfilter or efilter or ifilter or seval
244
283
  # check for samples
@@ -248,15 +287,32 @@ def parse_line line,header,options,samples,template,stats=nil
248
287
 
249
288
  # --------------------------
250
289
  # Filtering and set analysis
251
- return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
252
-
253
- if sfilter
290
+ if bedfilter
291
+ bed = bedfilter.contains(rec)
292
+ return if not bed
293
+ end
294
+
295
+ skip = lambda { |&m|
296
+ matched = m.call
297
+ if add_filter
298
+ set_filter_field = true if matched
299
+ false # always continue processing with an add-filter
300
+ else
301
+ not matched
302
+ end
303
+ }
304
+
305
+ if filter
306
+ return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
307
+ end
308
+
309
+ if sfilter # sample 'or' filter
254
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
255
- return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
311
+ return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
256
312
  end
257
313
  end
258
314
 
259
- if ifilter
315
+ if ifilter # include sample filter
260
316
  found = false
261
317
  rec.each_sample(options[:ifilter_samples]) do | sample |
262
318
  if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -265,12 +321,12 @@ def parse_line line,header,options,samples,template,stats=nil
265
321
  end
266
322
  end
267
323
  # Skip if there are no matches
268
- return if not found
324
+ return if skip.call {found}
269
325
  end
270
326
 
271
- if efilter
327
+ if efilter # exclude sample filter
272
328
  rec.each_sample(options[:efilter_samples]) do | sample |
273
- return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
329
+ return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
274
330
  end
275
331
  end
276
332
 
@@ -278,18 +334,21 @@ def parse_line line,header,options,samples,template,stats=nil
278
334
 
279
335
  # -----------------------------
280
336
  # From here on decide on output
337
+
338
+ rec.add_to_filter_field(add_filter) if set_filter_field
339
+
281
340
  if samples
282
341
  # Select certain samples for output
283
342
  newfields = fields[0..8]
284
343
  samples.each do |s|
285
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
286
345
  end
287
346
  fields = newfields
288
347
  end
289
348
  if options[:eval] or seval
290
349
  begin
291
350
  results = nil # result string
292
- if options[:eval]
351
+ if options[:eval]
293
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
294
353
  results = res if res
295
354
  end
@@ -307,23 +366,22 @@ def parse_line line,header,options,samples,template,stats=nil
307
366
  exit 1
308
367
  end
309
368
  return results.to_s+"\n" if results
310
- exit(1) if options[:eval_once] # <--- can this be reached?
311
369
  else
312
370
  if options[:rdf]
313
371
  # Output Turtle RDF
314
372
  VcfRdf::record(options[:id],rec,options[:tags])
315
373
  elsif options[:template]
316
- # Ruby ERB template
374
+ # Use ERB template
317
375
  begin
318
- template.result(binding)
376
+ template.body(binding)
319
377
  rescue Exception => e
320
378
  $stderr.print e,": ",fields,"\n"
321
379
  $stderr.print e.backtrace.inspect if options[:verbose]
322
- raise
380
+ raise
323
381
  end
324
382
  elsif options[:rewrite]
325
383
  # Default behaviour prints VCF line, but rewrite info
326
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
327
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
328
386
  elsif stats
329
387
  # do nothing
@@ -334,88 +392,97 @@ def parse_line line,header,options,samples,template,stats=nil
334
392
  end
335
393
  end
336
394
 
395
+ CHUNK_SIZE = options[:thread_lines]
396
+
397
+ pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
398
+ options[:quiet],options[:debug])
337
399
  header = nil
338
400
  header_output_completed = false
339
- NUM_THREADS = options[:num_threads]
340
- CHUNK_SIZE = options[:thread_lines]
341
- CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
342
- chunks = []
343
- lines = []
401
+ chunk_lines = []
344
402
  line_number=0
345
403
 
404
+ if options[:bed]
405
+ bedfilter = BedFilter.new(options[:bed])
406
+ end
407
+
346
408
  begin
409
+ # Define linear parser function (going through one chunk)
347
410
  process = lambda { | lines |
348
411
  res = []
349
412
  lines.each do | line |
350
- res << parse_line(line,header,options,samples,template,stats)
413
+ res << parse_line(line,header,options,bedfilter,samples,template,stats)
351
414
  end
352
415
  res
353
416
  }
354
- output = lambda { |collection|
355
- collection.each do | result |
356
- result.each { |line| print line }
357
- end
358
- } # end output
359
417
 
360
418
  # ---- Main loop
361
419
  STDIN.each_line do | line |
362
420
  line_number += 1
363
- # ---- In this section header information is handled
421
+
422
+ # ---- Skip embedded headers down the line...
364
423
  next if header_output_completed and line =~ /^#/
365
- if line =~ /^##fileformat=/ or line =~ /^#CHR/
366
- header,line = parse_header(line,samples,options)
367
- end
368
- next if line =~ /^##/ # empty file
369
- header_output_completed = true
370
- if not options[:efilter_samples] and options[:ifilter_samples]
371
- # Create exclude set as a complement of include set
372
- options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
373
- end
374
424
 
375
- # ---- In this section the VCF variant lines are parsed
376
- lines << line
377
- if NUM_THREADS == 1
378
- $stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
379
- if lines.size > CHUNK_SIZE
380
- process.call(lines).each { | l | print l }
381
- lines = []
425
+ # ---- In the following section header information is handled -
426
+ # this only happens once.
427
+
428
+ # ---- Parse the header lines (chomps from STDIN)
429
+ # and returns header info and the current line
430
+ if line =~ /^#/
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, so there are no records to process
434
+ break
382
435
  end
383
- else
384
- if lines.size > CHUNK_SIZE
385
- chunks << lines
386
- if chunks.size > CHUNK_NUM
387
- $stderr.print '.' if not options[:quiet]
388
- out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
389
- process.call(chunk)
390
- }
391
- chunks = []
392
- # Output is forked to a separate process too
393
- fork do
394
- output.call out
395
- STDOUT.flush
396
- STDOUT.close
397
- exit 0
398
- end
399
- end
400
- lines = []
436
+ end
437
+ # p [line_number,line]
438
+ # ---- After the header continue processing
439
+ if not header_output_completed
440
+ # one-time post-header processing
441
+ if not options[:efilter_samples] and options[:ifilter_samples]
442
+ # Create exclude set as a complement of include set
443
+ options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
401
444
  end
445
+ print template.header(binding) if template
446
+ header_output_completed = true
447
+ end
448
+
449
+ if options[:eval_once]
450
+ # this happens if we only want one line evaluated - say to get
451
+ # the number of samples
452
+ print parse_line(line,header,options,bedfilter,samples,template,stats)
453
+ exit 0
454
+ end
455
+
456
+ # ---- Lines are collected in one buffer and the lines buffer
457
+ # is added to the chunks list (for the threads)
458
+ chunk_lines << line
459
+
460
+ # ---- In the following section the VCF lines are parsed by chunks
461
+ # The chunks may go into different threads
462
+
463
+ if chunk_lines.size >= CHUNK_SIZE
464
+ # ---- process one chunk
465
+ $stderr.print '.' if not options[:quiet]
466
+ pcows.wait_for_worker_slot()
467
+ pcows.submit_worker(process,chunk_lines)
468
+ pcows.process_output()
469
+
470
+ chunk_lines = []
402
471
  end
403
472
  end
404
- $stderr.print '.' if not options[:quiet]
405
- if NUM_THREADS == 1
406
- process.call(lines).each { |l| print l}
407
- else
408
- chunks << lines
409
- output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
410
- process.call(chunk)
411
- }
412
- end
473
+ pcows.submit_final_worker(process,chunk_lines)
474
+ pcows.wait_for_workers()
475
+ pcows.process_remaining_output()
476
+
477
+ print template.footer(binding) if template
413
478
  stats.print if stats
414
479
 
415
480
  rescue Exception => e
416
- # $stderr.print line
417
- $stderr.print e.message,"\n"
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
485
+ pcows.cleanup()
418
486
  raise if options[:verbose]
419
487
  exit 1
420
488
  end
421
-