bio-vcf 0.8.1 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +1 -11
  3. data/Gemfile +2 -8
  4. data/LICENSE.txt +1 -1
  5. data/README.md +467 -129
  6. data/RELEASE_NOTES.md +27 -0
  7. data/RELEASE_NOTES.md~ +11 -0
  8. data/Rakefile +9 -42
  9. data/TAGS +115 -0
  10. data/VERSION +1 -1
  11. data/bin/bio-vcf +156 -108
  12. data/bio-vcf.gemspec +13 -75
  13. data/features/cli.feature +22 -4
  14. data/features/diff_count.feature +0 -1
  15. data/features/filter.feature +12 -0
  16. data/features/multisample.feature +12 -0
  17. data/features/somaticsniper.feature +2 -0
  18. data/features/step_definitions/cli-feature.rb +15 -6
  19. data/features/step_definitions/diff_count.rb +1 -1
  20. data/features/step_definitions/multisample.rb +19 -0
  21. data/features/step_definitions/somaticsniper.rb +9 -1
  22. data/features/step_definitions/vcf_header.rb +48 -0
  23. data/features/support/env.rb +1 -11
  24. data/features/vcf_header.feature +35 -0
  25. data/lib/bio-vcf.rb +1 -0
  26. data/lib/bio-vcf/pcows.rb +303 -0
  27. data/lib/bio-vcf/vcffile.rb +46 -0
  28. data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
  29. data/lib/bio-vcf/vcfheader.rb +137 -5
  30. data/lib/bio-vcf/vcfheader_line.rb +778 -0
  31. data/lib/bio-vcf/vcfrecord.rb +56 -18
  32. data/lib/bio-vcf/vcfsample.rb +26 -2
  33. data/lib/regressiontest.rb +11 -0
  34. data/lib/regressiontest/cli_exec.rb +101 -0
  35. data/ragel/gen_vcfheaderline_parser.rl +165 -0
  36. data/ragel/generate.sh +8 -0
  37. data/template/vcf2json.erb +16 -16
  38. data/template/vcf2json_full_header.erb +22 -0
  39. data/template/vcf2json_use_meta.erb +41 -0
  40. data/test/data/input/empty.vcf +2 -0
  41. data/test/data/input/gatk_exome.vcf +237 -0
  42. data/test/data/input/gatk_wgs.vcf +1000 -0
  43. data/test/data/input/test.bed +632 -0
  44. data/test/data/regression/empty-stderr.new +12 -0
  45. data/test/data/regression/empty.new +2 -0
  46. data/test/data/regression/empty.ref +2 -0
  47. data/test/data/regression/eval_once-stderr.new +2 -0
  48. data/test/data/regression/eval_once.new +1 -0
  49. data/test/data/regression/eval_once.ref +1 -0
  50. data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
  51. data/test/data/regression/eval_r.info.dp.new +150 -0
  52. data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
  53. data/test/data/regression/ifilter_s.dp.new +31 -0
  54. data/test/data/regression/pass1-stderr.new +10 -0
  55. data/test/data/regression/pass1.new +88 -0
  56. data/test/data/regression/pass1.ref +88 -0
  57. data/test/data/regression/r.info.dp-stderr.new +4 -0
  58. data/test/data/regression/r.info.dp.new +114 -0
  59. data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
  60. data/test/data/regression/rewrite.info.sample.new +150 -0
  61. data/test/data/regression/s.dp-stderr.new +18 -0
  62. data/test/data/regression/s.dp.new +145 -0
  63. data/test/data/regression/seval_s.dp-stderr.new +10 -0
  64. data/test/data/regression/seval_s.dp.new +36 -0
  65. data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
  66. data/test/data/regression/sfilter_seval_s.dp.new +31 -0
  67. data/test/data/regression/thread4-stderr.new +10 -0
  68. data/test/data/regression/thread4.new +150 -0
  69. data/test/data/regression/thread4_4-stderr.new +25 -0
  70. data/test/data/regression/thread4_4.new +130 -0
  71. data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
  72. data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
  73. data/test/data/regression/thread4_4_failed_filter.new +110 -0
  74. data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
  75. data/test/data/regression/vcf2json_full_header.new +225 -0
  76. data/test/data/regression/vcf2json_full_header.ref +225 -0
  77. data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
  78. data/test/data/regression/vcf2json_use_meta.new +4697 -0
  79. data/test/data/regression/vcf2json_use_meta.ref +4697 -0
  80. data/test/performance/metrics.md +18 -1
  81. data/test/stress/stress_test.sh +15 -0
  82. data/test/tmp/test.vcf +12469 -0
  83. metadata +63 -64
  84. data/Gemfile.lock +0 -81
@@ -0,0 +1,27 @@
1
+ ## ChangeLog v0.9.5 (20210118)
2
+
3
+ + Improved README and installation instructions
4
+ + Added guix.scm build and instructions (no need for bundler)
5
+ + Moved regressiontest into tree
6
+
7
+ ## ChangeLog v0.9.4 (20201222)
8
+
9
+ This is an important maintenance release of bio-vcf:
10
+
11
+ + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf)
12
+ + Fixed tests to match recent Ruby updates
13
+
14
+ ## Older release notes
15
+
16
+ + Getting ready for a 1.0 release
17
+ + Released 0.9.2 as a gem
18
+ + 0.9.1 fixed a rare threading bug and added cleanup on error
19
+ + Added support for soft filters (request by Brad Chapman)
20
+ + The outputter now writes (properly) in parallel with the parser
21
+ + bio-vcf turns any VCF into JSON with header information, and
22
+ allows you to pipe that JSON directly into any JSON-supporting
23
+ language, including Python and JavaScript!
24
+
25
+ ## Older changes
26
+
27
+ For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
@@ -0,0 +1,11 @@
1
+ ## RELEASE NOTES
2
+
3
+
4
+ * Getting ready for a 1.0 release
5
+ * Released 0.9.2 as a gem
6
+ * 0.9.1 fixed a rare threading bug and added cleanup on error
7
+ * Added support for soft filters (request by Brad Chapman)
8
+ * The outputter now writes (properly) in parallel with the parser
9
+ * bio-vcf turns any VCF into JSON with header information, and
10
+ allows you to pipe that JSON directly into any JSON-supporting
11
+ language, including Python and JavaScript!
data/Rakefile CHANGED
@@ -1,54 +1,21 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
3
+ # require 'rubygems'
12
4
  require 'rake'
5
+ # require 'cucumber/rake/task'
13
6
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "bio-vcf"
18
- gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Fast multi-threaded VCF parser}
21
- gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
22
- gem.email = "pjotr.public01@thebird.nl"
23
- gem.authors = ["Pjotr Prins"]
24
- gem.required_ruby_version = '>=2.0.0'
25
- # dependencies defined in Gemfile
26
- end
27
- Jeweler::RubygemsDotOrgTasks.new
28
-
29
- # require 'rspec/core'
30
- # require 'rspec/core/rake_task'
31
- # RSpec::Core::RakeTask.new(:spec) do |spec|
32
- # spec.pattern = FileList['spec/**/*_spec.rb']
33
- # end
34
-
35
- # RSpec::Core::RakeTask.new(:rcov) do |spec|
36
- # spec.pattern = 'spec/**/*_spec.rb'
37
- # spec.rcov = true
7
+ # Cucumber::Rake::Task.new(:features) do |t|
8
+ # t.cucumber_opts = "--bundler false"
38
9
  # end
39
10
 
40
- # require 'rake/testtask'
41
-
42
- # Rake::TestTask.new do |t|
43
- # t.pattern = "spec/*_spec.rb"
44
- # end
45
-
46
- require 'cucumber/rake/task'
47
- Cucumber::Rake::Task.new(:features)
11
+ desc 'Run cucumber' # without bundler
12
+ task :features do
13
+ sh 'cucumber features'
14
+ end
48
15
 
49
16
  task :default => :features
50
17
 
51
- task :test => [ :features ]
18
+ task :test => [ :features ]
52
19
 
53
20
  require 'rdoc/task'
54
21
  Rake::RDocTask.new do |rdoc|
data/TAGS ADDED
@@ -0,0 +1,115 @@
1
+
2
+ ./bin/bio-vcf,0
3
+
4
+ ./lib/bio-vcf.rb,0
5
+
6
+ ./lib/bio-vcf/vcfgenotypefield.rb,1553
7
+ module BioVcf::BioVcf1,0
8
+ class VcfNucleotides::BioVcf::VcfNucleotides7,167
9
+ def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
10
+ def []::BioVcf::VcfNucleotides#[]13,284
11
+ def to_ary::BioVcf::VcfNucleotides#to_ary27,628
12
+ def max::BioVcf::VcfNucleotides#max32,742
13
+ def min::BioVcf::VcfNucleotides#min37,856
14
+ def sum::BioVcf::VcfNucleotides#sum42,975
15
+ class VcfAltInfo::BioVcf::VcfAltInfo50,1082
16
+ def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
17
+ def []::BioVcf::VcfAltInfo#[]56,1194
18
+ def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
19
+ def max::BioVcf::VcfAltInfo#max75,1626
20
+ def min::BioVcf::VcfAltInfo#min79,1702
21
+ def sum::BioVcf::VcfAltInfo#sum83,1783
22
+ class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
23
+ def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
24
+ def dp4::BioVcf::VcfGenotypeField#dp496,2020
25
+ def ad::BioVcf::VcfGenotypeField#ad100,2098
26
+ def pl::BioVcf::VcfGenotypeField#pl104,2174
27
+ def bcount::BioVcf::VcfGenotypeField#bcount108,2250
28
+ def bq::BioVcf::VcfGenotypeField#bq112,2343
29
+ def amq::BioVcf::VcfGenotypeField#amq116,2424
30
+ def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
31
+ class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
32
+ def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
33
+ def []::BioVcf::VcfGenotypeFields#[]141,3021
34
+ def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
35
+
36
+ ./lib/bio-vcf/vcfrdf.rb,156
37
+ module BioVcf::BioVcf1,0
38
+ module VcfRdf::BioVcf::VcfRdf5,93
39
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
40
+ def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
41
+
42
+ ./lib/bio-vcf/vcf.rb,27
43
+ module BioVcf::BioVcf2,1
44
+
45
+ ./lib/bio-vcf/vcfline.rb,118
46
+ module BioVcf::BioVcf1,0
47
+ module VcfLine::BioVcf::VcfLine2,16
48
+ def VcfLine.parse::BioVcf::VcfLine.parse5,82
49
+
50
+ ./lib/bio-vcf/vcfrecord.rb,1831
51
+ module BioVcf::BioVcf1,0
52
+ class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
53
+ def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
54
+ def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
55
+ module VcfRecordParser::BioVcf::VcfRecordParser18,329
56
+ def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
57
+ def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
58
+ module VcfRecordCall::BioVcf::VcfRecordCall30,592
59
+ def call_diff::BioVcf::VcfRecordCall#call_diff31,617
60
+ def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
61
+ def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
62
+ def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
63
+ def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
64
+ def index::BioVcf::VcfRecordCall#index51,1026
65
+ class VcfRecord::BioVcf::VcfRecord56,1125
66
+ attr_reader :header::BioVcf::VcfRecord#header60,1173
67
+ def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
68
+ def chrom::BioVcf::VcfRecord#chrom67,1292
69
+ def pos::BioVcf::VcfRecord#pos71,1332
70
+ def ids::BioVcf::VcfRecord#ids75,1384
71
+ def id::BioVcf::VcfRecord#id79,1443
72
+ def ref::BioVcf::VcfRecord#ref83,1476
73
+ def alt::BioVcf::VcfRecord#alt87,1524
74
+ def qual::BioVcf::VcfRecord#qual91,1582
75
+ def info::BioVcf::VcfRecord#info95,1636
76
+ def format::BioVcf::VcfRecord#format99,1711
77
+ def normal::BioVcf::VcfRecord#normal104,1848
78
+ def tumor::BioVcf::VcfRecord#tumor109,1997
79
+ def sample::BioVcf::VcfRecord#sample114,2134
80
+ def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
81
+ def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
82
+ def method_missing::BioVcf::VcfRecord#method_missing126,2341
83
+
84
+ ./lib/bio-vcf/variant.rb,470
85
+ module BioVcf::BioVcf1,0
86
+ module Variant::BioVcf::Variant3,17
87
+ def Variant.diff::BioVcf::Variant.diff5,37
88
+ def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
89
+ def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
90
+ def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
91
+ def Variant.index::BioVcf::Variant.index25,652
92
+ def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
93
+
94
+ ./lib/bio-vcf/vcfheader.rb,598
95
+ module BioVcf::BioVcf2,1
96
+ module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
97
+ def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
98
+ class VcfHeader::BioVcf::VcfHeader18,339
99
+ attr_reader :lines::BioVcf::VcfHeader#lines20,360
100
+ def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
101
+ def add::BioVcf::VcfHeader#add26,430
102
+ def version::BioVcf::VcfHeader#version30,483
103
+ def column_names::BioVcf::VcfHeader#column_names34,578
104
+ def columns::BioVcf::VcfHeader#columns38,674
105
+ def samples::BioVcf::VcfHeader#samples42,735
106
+
107
+ ./features/step_definitions/diff_count.rb,0
108
+
109
+ ./features/step_definitions/bio-vcf_steps.rb,0
110
+
111
+ ./features/step_definitions/somaticsniper.rb,0
112
+
113
+ ./features/step_definitions/multisample.rb,0
114
+
115
+ ./features/support/env.rb,0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.1
1
+ 0.9.5
@@ -4,7 +4,7 @@
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT
6
6
  #
7
- # Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ # Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
8
8
 
9
9
  USAGE = "Vcf parser"
10
10
 
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
15
15
  version = File.new(VERSION_FILENAME).read.chomp
16
16
 
17
17
  require 'bio-vcf'
18
+ require 'bio-vcf/pcows'
18
19
  require 'optparse'
19
20
  require 'timeout'
20
21
  require 'fileutils'
21
22
 
22
- # Uncomment when using the bio-logger
23
+ # Uncomment when using the bio-logger
23
24
  # require 'bio-logger'
24
25
  # log = Bio::Log::LoggerPlus.new 'vcf'
25
- # log.outputters = Bio::Log::Outputter.stderr
26
+ # log.outputters = Bio::Log::Outputter.stderr
26
27
  # Bio::Log::CLI.logger('stderr')
27
28
  # Bio::Log::CLI.trace('info')
28
29
 
29
- options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000 }
30
+ options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
30
31
  opts = OptionParser.new do |o|
31
32
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
32
33
 
33
- o.on('-i','--ignore-missing', 'Ignore missing data') do
34
+ o.on('-i','--ignore-missing', 'Ignore missing data') do
34
35
  options[:ignore_missing] = true
35
36
  end
36
37
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
@@ -57,6 +58,9 @@ opts = OptionParser.new do |o|
57
58
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
58
59
  options[:efilter_samples] = l
59
60
  end
61
+ o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
62
+ options[:add_filter] = name
63
+ end
60
64
 
61
65
  o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
62
66
  options[:bed] = bed
@@ -68,6 +72,9 @@ opts = OptionParser.new do |o|
68
72
  o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
69
73
  options[:eval_once] = true
70
74
  options[:eval] = cmd
75
+ # options[:num_threads] = 1
76
+ # options[:thread_lines] = 1
77
+ options[:skip_header] = true
71
78
  end
72
79
  o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
73
80
  options[:seval] = cmd
@@ -84,7 +91,7 @@ opts = OptionParser.new do |o|
84
91
  options[:rdf] = true
85
92
  options[:skip_header] = true
86
93
  end
87
- o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i|
94
+ o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
88
95
  options[:num_threads] = i
89
96
  end
90
97
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
@@ -96,8 +103,8 @@ opts = OptionParser.new do |o|
96
103
  o.on_tail("--tags list", String, "Add tags") do |s|
97
104
  options[:tags] = s
98
105
  end
99
-
100
- o.on("--skip-header", "Do not output VCF header info") do
106
+
107
+ o.on("--skip-header", "Do not output VCF header info") do
101
108
  options[:skip_header] = true
102
109
  end
103
110
 
@@ -112,9 +119,16 @@ opts = OptionParser.new do |o|
112
119
  options[:template] = s
113
120
  options[:skip_header] = true
114
121
  end
115
-
116
-
117
- # Uncomment the following when using the bio-logger
122
+
123
+ o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
124
+ options[:tag] = true
125
+ end
126
+
127
+ o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
128
+ options[:timeout] = i
129
+ end
130
+
131
+ # Uncomment the following when using the bio-logger
118
132
  # o.separator ""
119
133
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
120
134
  # Bio::Log::CLI.logger(name)
@@ -123,7 +137,16 @@ opts = OptionParser.new do |o|
123
137
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
124
138
  # Bio::Log::CLI.trace(s)
125
139
  # end
126
- #
140
+ #
141
+ o.on("--names", "Output sample names") do |q|
142
+ options[:quiet] = true
143
+ options[:num_threads] = nil
144
+ options[:eval_once] = true
145
+ options[:eval] = 'header.samples.join("\t")'
146
+ # options[:num_threads] = 1
147
+ # options[:thread_lines] = 1
148
+ options[:skip_header] = true
149
+ end
127
150
  o.on("--statistics", "Output statistics") do |q|
128
151
  options[:statistics] = true
129
152
  options[:num_threads] = nil
@@ -132,14 +155,15 @@ opts = OptionParser.new do |o|
132
155
  # Bio::Log::CLI.trace('error')
133
156
  options[:quiet] = true
134
157
  end
135
-
158
+
136
159
  o.on("-v", "--verbose", "Run verbosely") do |v|
137
160
  options[:verbose] = true
138
161
  end
139
-
140
- # o.on("--debug", "Show debug messages") do |v|
141
- # Bio::Log::CLI.trace('debug')
142
- # end
162
+
163
+ o.on("--debug", "Show debug messages and keep intermediate output") do |v|
164
+ # Bio::Log::CLI.trace('debug')
165
+ options[:debug] = true
166
+ end
143
167
 
144
168
  o.separator ""
145
169
  o.on_tail('-h', '--help', 'display this help and exit') do
@@ -150,11 +174,11 @@ end
150
174
  opts.parse!(ARGV)
151
175
 
152
176
  BIOVCF_VERSION=version
153
- BIOVCF_BANNER = "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
154
- $stderr.print BIOVCF_BANNER
177
+ BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
178
+ $stderr.print BIOVCF_BANNER if !options[:quiet]
155
179
 
156
- if options[:show_help]
157
- print opts
180
+ if options[:show_help]
181
+ print opts
158
182
  print USAGE
159
183
  exit 1
160
184
  end
@@ -174,15 +198,6 @@ if options[:template]
174
198
  template = Bio::Template.new(fn)
175
199
  end
176
200
 
177
- if options[:num_threads] != 1
178
- begin
179
- require 'parallel'
180
- rescue LoadError
181
- $stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
182
- options[:num_threads] = 1
183
- end
184
- end
185
-
186
201
  stats = nil
187
202
  if options[:statistics]
188
203
  options[:num_threads] = nil
@@ -193,6 +208,8 @@ end
193
208
  raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
194
209
  raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
195
210
  raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
211
+ # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
212
+ # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
196
213
 
197
214
  if options[:samples]
198
215
  samples = options[:samples].map { |s| s.to_i }
@@ -200,13 +217,14 @@ end
200
217
 
201
218
  include BioVcf
202
219
 
203
- # Parse the header section of a VCF file
220
+ # Parse the header section of a VCF file (chomping STDIN)
204
221
  def parse_header line, samples, options
205
- header = VcfHeader.new
222
+ header = VcfHeader.new(options[:debug])
206
223
  header.add(line)
207
224
  print line if not options[:skip_header]
208
225
  STDIN.each_line do | headerline |
209
226
  if headerline !~ /^#/
227
+ # If no records in VCF, we never get here
210
228
  line = headerline
211
229
  break # end of header
212
230
  end
@@ -214,12 +232,19 @@ def parse_header line, samples, options
214
232
  if not options[:skip_header]
215
233
  if headerline =~ /^#CHR/
216
234
  # The header before actual data contains the sample names, first inject the BioVcf meta information
217
- print header.tag(options),"\n" if not options[:skip_header]
235
+ print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
236
+ # Then the additional filter(s)
237
+ # ##FILTER=<ID=LowQual,Description="Low quality">
238
+ add_filter = options[:add_filter]
239
+ if add_filter
240
+ print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
241
+ end
242
+
218
243
  selected = header.column_names
219
244
  if samples
220
245
  newfields = selected[0..8]
221
246
  samples.each do |s|
222
- newfields << selected[s+9]
247
+ newfields << selected[s+9]
223
248
  end
224
249
  selected = newfields
225
250
  end
@@ -231,10 +256,14 @@ def parse_header line, samples, options
231
256
  end
232
257
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
233
258
  VcfRdf::header if options[:rdf]
259
+ if line =~ /^#/
260
+ # We did not read a record
261
+ line = nil
262
+ end
234
263
  return header,line
235
264
  end
236
265
 
237
- # Parse a VCF line and return the result as a string
266
+ # Parse a VCF line and return the (template) result as a string buffer
238
267
  def parse_line line,header,options,bedfilter,samples,template,stats=nil
239
268
  fields = VcfLine.parse(line)
240
269
  rec = VcfRecord.new(fields,header)
@@ -244,9 +273,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
244
273
  sfilter = options[:sfilter]
245
274
  efilter = options[:efilter]
246
275
  ifilter = options[:ifilter]
276
+ add_filter = options[:add_filter] # contains a filter name (soft filter)
247
277
  seval = options[:seval]
248
278
  ignore_missing = options[:ignore_missing]
249
279
  quiet = options[:quiet]
280
+ set_filter_field = nil
250
281
 
251
282
  if sfilter or efilter or ifilter or seval
252
283
  # check for samples
@@ -261,15 +292,27 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
261
292
  return if not bed
262
293
  end
263
294
 
264
- return if filter and not rec.filter(filter,ignore_missing_data: ignore_missing,quiet: quiet)
265
-
266
- if sfilter
295
+ skip = lambda { |&m|
296
+ matched = m.call
297
+ if add_filter
298
+ set_filter_field = true if matched
299
+ false # always continue processing with an add-filter
300
+ else
301
+ not matched
302
+ end
303
+ }
304
+
305
+ if filter
306
+ return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
307
+ end
308
+
309
+ if sfilter # sample 'or' filter
267
310
  rec.each_sample(options[:sfilter_samples]) do | sample |
268
- return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet)
311
+ return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
269
312
  end
270
313
  end
271
314
 
272
- if ifilter
315
+ if ifilter # include sample filter
273
316
  found = false
274
317
  rec.each_sample(options[:ifilter_samples]) do | sample |
275
318
  if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
@@ -278,12 +321,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
278
321
  end
279
322
  end
280
323
  # Skip if there are no matches
281
- return if not found
324
+ return if skip.call {found}
282
325
  end
283
326
 
284
- if efilter
327
+ if efilter # exclude sample filter
285
328
  rec.each_sample(options[:efilter_samples]) do | sample |
286
- return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet)
329
+ return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
287
330
  end
288
331
  end
289
332
 
@@ -291,18 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
291
334
 
292
335
  # -----------------------------
293
336
  # From here on decide on output
337
+
338
+ rec.add_to_filter_field(add_filter) if set_filter_field
339
+
294
340
  if samples
295
341
  # Select certain samples for output
296
342
  newfields = fields[0..8]
297
343
  samples.each do |s|
298
- newfields << fields[s+9]
344
+ newfields << fields[s+9]
299
345
  end
300
346
  fields = newfields
301
347
  end
302
348
  if options[:eval] or seval
303
349
  begin
304
350
  results = nil # result string
305
- if options[:eval]
351
+ if options[:eval]
306
352
  res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
307
353
  results = res if res
308
354
  end
@@ -320,23 +366,22 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
320
366
  exit 1
321
367
  end
322
368
  return results.to_s+"\n" if results
323
- exit(1) if options[:eval_once] # <--- can this be reached?
324
369
  else
325
370
  if options[:rdf]
326
371
  # Output Turtle RDF
327
372
  VcfRdf::record(options[:id],rec,options[:tags])
328
373
  elsif options[:template]
329
- # Ruby ERB template
374
+ # Use ERB template
330
375
  begin
331
376
  template.body(binding)
332
377
  rescue Exception => e
333
378
  $stderr.print e,": ",fields,"\n"
334
379
  $stderr.print e.backtrace.inspect if options[:verbose]
335
- raise
380
+ raise
336
381
  end
337
382
  elsif options[:rewrite]
338
383
  # Default behaviour prints VCF line, but rewrite info
339
- eval(options[:rewrite])
384
+ eval(options[:rewrite])
340
385
  (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
341
386
  elsif stats
342
387
  # do nothing
@@ -347,20 +392,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
347
392
  end
348
393
  end
349
394
 
395
+ CHUNK_SIZE = options[:thread_lines]
396
+
397
+ pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
398
+ options[:quiet],options[:debug])
350
399
  header = nil
351
400
  header_output_completed = false
352
- NUM_THREADS = options[:num_threads]
353
- CHUNK_SIZE = options[:thread_lines]
354
- CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
355
- chunks = []
356
- lines = []
401
+ chunk_lines = []
357
402
  line_number=0
358
403
 
359
404
  if options[:bed]
360
405
  bedfilter = BedFilter.new(options[:bed])
361
- end
406
+ end
362
407
 
363
408
  begin
409
+ # Define linear parser function (going through one chunk)
364
410
  process = lambda { | lines |
365
411
  res = []
366
412
  lines.each do | line |
@@ -368,73 +414,75 @@ begin
368
414
  end
369
415
  res
370
416
  }
371
- output = lambda { |collection|
372
- collection.each do | result |
373
- result.each { |line| print line }
374
- end
375
- } # end output
376
417
 
377
- print template.header(binding) if template
378
418
  # ---- Main loop
379
419
  STDIN.each_line do | line |
380
420
  line_number += 1
381
- # ---- In this section header information is handled
421
+
422
+ # ---- Skip embedded headers down the line...
382
423
  next if header_output_completed and line =~ /^#/
383
- if line =~ /^##fileformat=/ or line =~ /^#CHR/
384
- header,line = parse_header(line,samples,options)
385
- end
386
- next if line =~ /^##/ # empty file
387
- header_output_completed = true
388
- if not options[:efilter_samples] and options[:ifilter_samples]
389
- # Create exclude set as a complement of include set
390
- options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
391
- end
392
424
 
393
- # ---- In this section the VCF variant lines are parsed
394
- lines << line
395
- if NUM_THREADS == 1
396
- $stderr.print '.' if line_number % CHUNK_SIZE == 0 and not options[:quiet]
397
- if lines.size > CHUNK_SIZE
398
- process.call(lines).each { | l | print l }
399
- lines = []
425
+ # ---- In the following section header information is handled -
426
+ # this only happens once.
427
+
428
+ # ---- Parse the header lines (chomps from STDIN)
429
+ # and return header info and the current line
430
+ if line =~ /^#/
431
+ header, line = parse_header(line,samples,options)
432
+ if line.nil?
433
+ # No line after header, so there are no records to process
434
+ break
400
435
  end
401
- else
402
- if lines.size > CHUNK_SIZE
403
- chunks << lines
404
- if chunks.size > CHUNK_NUM
405
- $stderr.print '.' if not options[:quiet]
406
- out = Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
407
- process.call(chunk)
408
- }
409
- chunks = []
410
- # Output is forked to a separate process too
411
- fork do
412
- output.call out
413
- STDOUT.flush
414
- STDOUT.close
415
- exit 0
416
- end
417
- end
418
- lines = []
436
+ end
437
+ # p [line_number,line]
438
+ # ---- After the header continue processing
439
+ if not header_output_completed
440
+ # one-time post-header processing
441
+ if not options[:efilter_samples] and options[:ifilter_samples]
442
+ # Create exclude set as a complement of include set
443
+ options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
419
444
  end
445
+ print template.header(binding) if template
446
+ header_output_completed = true
447
+ end
448
+
449
+ if options[:eval_once]
450
+ # this happens if we only want one line evaluated - say to get
451
+ # the number of samples
452
+ print parse_line(line,header,options,bedfilter,samples,template,stats)
453
+ exit 0
454
+ end
455
+
456
+ # ---- Lines are collected in one buffer and the lines buffer
457
+ # is added to the chunks list (for the threads)
458
+ chunk_lines << line
459
+
460
+ # ---- In the following section the VCF lines are parsed by chunks
461
+ # The chunks may go into different threads
462
+
463
+ if chunk_lines.size >= CHUNK_SIZE
464
+ # ---- process one chunk
465
+ $stderr.print '.' if not options[:quiet]
466
+ pcows.wait_for_worker_slot()
467
+ pcows.submit_worker(process,chunk_lines)
468
+ pcows.process_output()
469
+
470
+ chunk_lines = []
420
471
  end
421
472
  end
422
- $stderr.print '.' if not options[:quiet]
423
- if NUM_THREADS == 1
424
- process.call(lines).each { |l| print l}
425
- else
426
- chunks << lines
427
- output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
428
- process.call(chunk)
429
- }
430
- end
473
+ pcows.submit_final_worker(process,chunk_lines)
474
+ pcows.wait_for_workers()
475
+ pcows.process_remaining_output()
476
+
431
477
  print template.footer(binding) if template
432
478
  stats.print if stats
433
479
 
434
480
  rescue Exception => e
435
- # $stderr.print line
436
- $stderr.print e.message,"\n"
481
+ if e.message != 'exit'
482
+ $stderr.print "ERROR: "
483
+ $stderr.print e.message,"\n"
484
+ end
485
+ pcows.cleanup()
437
486
  raise if options[:verbose]
438
487
  exit 1
439
488
  end
440
-