bio-vcf 0.8.1 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
data/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
## ChangeLog v0.9.5 (20210118)
|
2
|
+
|
3
|
+
+ Improved README and installation instructions
|
4
|
+
+ Added guix.scm build and instructions (no need for bundler)
|
5
|
+
+ Moved regressiontest into tree
|
6
|
+
|
7
|
+
## ChangeLog v0.9.4 (20201222)
|
8
|
+
|
9
|
+
This is an important maintenance release of bio-vcf:
|
10
|
+
|
11
|
+
+ Renamed bioruby-vcf to bio-vcf and migrated project to [vcflib](https://github.com/vcflib/bio-vcf)
|
12
|
+
+ Fixed tests to match recent Ruby updates
|
13
|
+
|
14
|
+
## Older release notes
|
15
|
+
|
16
|
+
+ Getting ready for a 1.0 release
|
17
|
+
+ Released 0.9.2 as a gem
|
18
|
+
+ 0.9.1 removed a rare threading bug and cleanup on error
|
19
|
+
+ Added support for soft filters (request by Brad Chapman)
|
20
|
+
+ The outputter now writes (properly) in parallel with the parser
|
21
|
+
+ bio-vcf turns any VCF into JSON with header information, and
|
22
|
+
allows you to pipe that JSON directly into any JSON-supporting
|
23
|
+
language, including Python and JavaScript!
|
24
|
+
|
25
|
+
## Older changes
|
26
|
+
|
27
|
+
For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
|
data/RELEASE_NOTES.md~
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
## RELEASE NOTES
|
2
|
+
|
3
|
+
|
4
|
+
* Getting ready for a 1.0 release
|
5
|
+
* Released 0.9.2 as a gem
|
6
|
+
* 0.9.1 removed a rare threading bug and cleanup on error
|
7
|
+
* Added support for soft filters (request by Brad Chapman)
|
8
|
+
* The outputter now writes (properly) in parallel with the parser
|
9
|
+
* bio-vcf turns any VCF into JSON with header information, and
|
10
|
+
allows you to pipe that JSON directly into any JSON-supporting
|
11
|
+
language, including Python and JavaScript!
|
data/Rakefile
CHANGED
@@ -1,54 +1,21 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
3
|
+
# require 'rubygems'
|
12
4
|
require 'rake'
|
5
|
+
# require 'cucumber/rake/task'
|
13
6
|
|
14
|
-
|
15
|
-
|
16
|
-
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.name = "bio-vcf"
|
18
|
-
gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
19
|
-
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{Fast multi-threaded VCF parser}
|
21
|
-
gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
|
22
|
-
gem.email = "pjotr.public01@thebird.nl"
|
23
|
-
gem.authors = ["Pjotr Prins"]
|
24
|
-
gem.required_ruby_version = '>=2.0.0'
|
25
|
-
# dependencies defined in Gemfile
|
26
|
-
end
|
27
|
-
Jeweler::RubygemsDotOrgTasks.new
|
28
|
-
|
29
|
-
# require 'rspec/core'
|
30
|
-
# require 'rspec/core/rake_task'
|
31
|
-
# RSpec::Core::RakeTask.new(:spec) do |spec|
|
32
|
-
# spec.pattern = FileList['spec/**/*_spec.rb']
|
33
|
-
# end
|
34
|
-
|
35
|
-
# RSpec::Core::RakeTask.new(:rcov) do |spec|
|
36
|
-
# spec.pattern = 'spec/**/*_spec.rb'
|
37
|
-
# spec.rcov = true
|
7
|
+
# Cucumber::Rake::Task.new(:features) do |t|
|
8
|
+
# t.cucumber_opts = "--bundler false"
|
38
9
|
# end
|
39
10
|
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
# end
|
45
|
-
|
46
|
-
require 'cucumber/rake/task'
|
47
|
-
Cucumber::Rake::Task.new(:features)
|
11
|
+
desc 'Run cucumber' # without bundler
|
12
|
+
task :features do
|
13
|
+
sh 'cucumber features'
|
14
|
+
end
|
48
15
|
|
49
16
|
task :default => :features
|
50
17
|
|
51
|
-
task :test => [ :features ]
|
18
|
+
task :test => [ :features ]
|
52
19
|
|
53
20
|
require 'rdoc/task'
|
54
21
|
Rake::RDocTask.new do |rdoc|
|
data/TAGS
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
|
2
|
+
./bin/bio-vcf,0
|
3
|
+
|
4
|
+
./lib/bio-vcf.rb,0
|
5
|
+
|
6
|
+
./lib/bio-vcf/vcfgenotypefield.rb,1553
|
7
|
+
module BioVcf::BioVcf1,0
|
8
|
+
class VcfNucleotides::BioVcf::VcfNucleotides7,167
|
9
|
+
def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
|
10
|
+
def []::BioVcf::VcfNucleotides#[]13,284
|
11
|
+
def to_ary::BioVcf::VcfNucleotides#to_ary27,628
|
12
|
+
def max::BioVcf::VcfNucleotides#max32,742
|
13
|
+
def min::BioVcf::VcfNucleotides#min37,856
|
14
|
+
def sum::BioVcf::VcfNucleotides#sum42,975
|
15
|
+
class VcfAltInfo::BioVcf::VcfAltInfo50,1082
|
16
|
+
def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
|
17
|
+
def []::BioVcf::VcfAltInfo#[]56,1194
|
18
|
+
def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
|
19
|
+
def max::BioVcf::VcfAltInfo#max75,1626
|
20
|
+
def min::BioVcf::VcfAltInfo#min79,1702
|
21
|
+
def sum::BioVcf::VcfAltInfo#sum83,1783
|
22
|
+
class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
|
23
|
+
def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
|
24
|
+
def dp4::BioVcf::VcfGenotypeField#dp496,2020
|
25
|
+
def ad::BioVcf::VcfGenotypeField#ad100,2098
|
26
|
+
def pl::BioVcf::VcfGenotypeField#pl104,2174
|
27
|
+
def bcount::BioVcf::VcfGenotypeField#bcount108,2250
|
28
|
+
def bq::BioVcf::VcfGenotypeField#bq112,2343
|
29
|
+
def amq::BioVcf::VcfGenotypeField#amq116,2424
|
30
|
+
def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
|
31
|
+
class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
|
32
|
+
def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
|
33
|
+
def []::BioVcf::VcfGenotypeFields#[]141,3021
|
34
|
+
def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
|
35
|
+
|
36
|
+
./lib/bio-vcf/vcfrdf.rb,156
|
37
|
+
module BioVcf::BioVcf1,0
|
38
|
+
module VcfRdf::BioVcf::VcfRdf5,93
|
39
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
|
40
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
|
41
|
+
|
42
|
+
./lib/bio-vcf/vcf.rb,27
|
43
|
+
module BioVcf::BioVcf2,1
|
44
|
+
|
45
|
+
./lib/bio-vcf/vcfline.rb,118
|
46
|
+
module BioVcf::BioVcf1,0
|
47
|
+
module VcfLine::BioVcf::VcfLine2,16
|
48
|
+
def VcfLine.parse::BioVcf::VcfLine.parse5,82
|
49
|
+
|
50
|
+
./lib/bio-vcf/vcfrecord.rb,1831
|
51
|
+
module BioVcf::BioVcf1,0
|
52
|
+
class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
|
53
|
+
def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
|
54
|
+
def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
|
55
|
+
module VcfRecordParser::BioVcf::VcfRecordParser18,329
|
56
|
+
def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
|
57
|
+
def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
|
58
|
+
module VcfRecordCall::BioVcf::VcfRecordCall30,592
|
59
|
+
def call_diff::BioVcf::VcfRecordCall#call_diff31,617
|
60
|
+
def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
|
61
|
+
def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
|
62
|
+
def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
|
63
|
+
def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
|
64
|
+
def index::BioVcf::VcfRecordCall#index51,1026
|
65
|
+
class VcfRecord::BioVcf::VcfRecord56,1125
|
66
|
+
attr_reader :header::BioVcf::VcfRecord#header60,1173
|
67
|
+
def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
|
68
|
+
def chrom::BioVcf::VcfRecord#chrom67,1292
|
69
|
+
def pos::BioVcf::VcfRecord#pos71,1332
|
70
|
+
def ids::BioVcf::VcfRecord#ids75,1384
|
71
|
+
def id::BioVcf::VcfRecord#id79,1443
|
72
|
+
def ref::BioVcf::VcfRecord#ref83,1476
|
73
|
+
def alt::BioVcf::VcfRecord#alt87,1524
|
74
|
+
def qual::BioVcf::VcfRecord#qual91,1582
|
75
|
+
def info::BioVcf::VcfRecord#info95,1636
|
76
|
+
def format::BioVcf::VcfRecord#format99,1711
|
77
|
+
def normal::BioVcf::VcfRecord#normal104,1848
|
78
|
+
def tumor::BioVcf::VcfRecord#tumor109,1997
|
79
|
+
def sample::BioVcf::VcfRecord#sample114,2134
|
80
|
+
def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
|
81
|
+
def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
|
82
|
+
def method_missing::BioVcf::VcfRecord#method_missing126,2341
|
83
|
+
|
84
|
+
./lib/bio-vcf/variant.rb,470
|
85
|
+
module BioVcf::BioVcf1,0
|
86
|
+
module Variant::BioVcf::Variant3,17
|
87
|
+
def Variant.diff::BioVcf::Variant.diff5,37
|
88
|
+
def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
|
89
|
+
def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
|
90
|
+
def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
|
91
|
+
def Variant.index::BioVcf::Variant.index25,652
|
92
|
+
def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
|
93
|
+
|
94
|
+
./lib/bio-vcf/vcfheader.rb,598
|
95
|
+
module BioVcf::BioVcf2,1
|
96
|
+
module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
|
97
|
+
def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
|
98
|
+
class VcfHeader::BioVcf::VcfHeader18,339
|
99
|
+
attr_reader :lines::BioVcf::VcfHeader#lines20,360
|
100
|
+
def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
|
101
|
+
def add::BioVcf::VcfHeader#add26,430
|
102
|
+
def version::BioVcf::VcfHeader#version30,483
|
103
|
+
def column_names::BioVcf::VcfHeader#column_names34,578
|
104
|
+
def columns::BioVcf::VcfHeader#columns38,674
|
105
|
+
def samples::BioVcf::VcfHeader#samples42,735
|
106
|
+
|
107
|
+
./features/step_definitions/diff_count.rb,0
|
108
|
+
|
109
|
+
./features/step_definitions/bio-vcf_steps.rb,0
|
110
|
+
|
111
|
+
./features/step_definitions/somaticsniper.rb,0
|
112
|
+
|
113
|
+
./features/step_definitions/multisample.rb,0
|
114
|
+
|
115
|
+
./features/support/env.rb,0
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.9.5
|
data/bin/bio-vcf
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Author:: Pjotr Prins
|
5
5
|
# License:: MIT
|
6
6
|
#
|
7
|
-
# Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
|
7
|
+
# Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
|
8
8
|
|
9
9
|
USAGE = "Vcf parser"
|
10
10
|
|
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
|
|
15
15
|
version = File.new(VERSION_FILENAME).read.chomp
|
16
16
|
|
17
17
|
require 'bio-vcf'
|
18
|
+
require 'bio-vcf/pcows'
|
18
19
|
require 'optparse'
|
19
20
|
require 'timeout'
|
20
21
|
require 'fileutils'
|
21
22
|
|
22
|
-
# Uncomment when using the bio-logger
|
23
|
+
# Uncomment when using the bio-logger
|
23
24
|
# require 'bio-logger'
|
24
25
|
# log = Bio::Log::LoggerPlus.new 'vcf'
|
25
|
-
# log.outputters = Bio::Log::Outputter.stderr
|
26
|
+
# log.outputters = Bio::Log::Outputter.stderr
|
26
27
|
# Bio::Log::CLI.logger('stderr')
|
27
28
|
# Bio::Log::CLI.trace('info')
|
28
29
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/
|
30
|
+
options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
|
30
31
|
opts = OptionParser.new do |o|
|
31
32
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
33
|
|
33
|
-
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
|
+
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
35
|
options[:ignore_missing] = true
|
35
36
|
end
|
36
37
|
o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
|
@@ -57,6 +58,9 @@ opts = OptionParser.new do |o|
|
|
57
58
|
o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
|
58
59
|
options[:efilter_samples] = l
|
59
60
|
end
|
61
|
+
o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
|
62
|
+
options[:add_filter] = name
|
63
|
+
end
|
60
64
|
|
61
65
|
o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
|
62
66
|
options[:bed] = bed
|
@@ -68,6 +72,9 @@ opts = OptionParser.new do |o|
|
|
68
72
|
o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
|
69
73
|
options[:eval_once] = true
|
70
74
|
options[:eval] = cmd
|
75
|
+
# options[:num_threads] = 1
|
76
|
+
# options[:thread_lines] = 1
|
77
|
+
options[:skip_header] = true
|
71
78
|
end
|
72
79
|
o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
|
73
80
|
options[:seval] = cmd
|
@@ -84,7 +91,7 @@ opts = OptionParser.new do |o|
|
|
84
91
|
options[:rdf] = true
|
85
92
|
options[:skip_header] = true
|
86
93
|
end
|
87
|
-
o.on("--num-threads [num]", Integer, "Multi-core version (default
|
94
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
|
88
95
|
options[:num_threads] = i
|
89
96
|
end
|
90
97
|
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
@@ -96,8 +103,8 @@ opts = OptionParser.new do |o|
|
|
96
103
|
o.on_tail("--tags list", String, "Add tags") do |s|
|
97
104
|
options[:tags] = s
|
98
105
|
end
|
99
|
-
|
100
|
-
o.on("--skip-header", "Do not output VCF header info") do
|
106
|
+
|
107
|
+
o.on("--skip-header", "Do not output VCF header info") do
|
101
108
|
options[:skip_header] = true
|
102
109
|
end
|
103
110
|
|
@@ -112,9 +119,16 @@ opts = OptionParser.new do |o|
|
|
112
119
|
options[:template] = s
|
113
120
|
options[:skip_header] = true
|
114
121
|
end
|
115
|
-
|
116
|
-
|
117
|
-
|
122
|
+
|
123
|
+
o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
|
124
|
+
options[:tag] = true
|
125
|
+
end
|
126
|
+
|
127
|
+
o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
|
128
|
+
options[:timeout] = i
|
129
|
+
end
|
130
|
+
|
131
|
+
# Uncomment the following when using the bio-logger
|
118
132
|
# o.separator ""
|
119
133
|
# o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
120
134
|
# Bio::Log::CLI.logger(name)
|
@@ -123,7 +137,16 @@ opts = OptionParser.new do |o|
|
|
123
137
|
# o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
|
124
138
|
# Bio::Log::CLI.trace(s)
|
125
139
|
# end
|
126
|
-
#
|
140
|
+
#
|
141
|
+
o.on("--names", "Output sample names") do |q|
|
142
|
+
options[:quiet] = true
|
143
|
+
options[:num_threads] = nil
|
144
|
+
options[:eval_once] = true
|
145
|
+
options[:eval] = 'header.samples.join("\t")'
|
146
|
+
# options[:num_threads] = 1
|
147
|
+
# options[:thread_lines] = 1
|
148
|
+
options[:skip_header] = true
|
149
|
+
end
|
127
150
|
o.on("--statistics", "Output statistics") do |q|
|
128
151
|
options[:statistics] = true
|
129
152
|
options[:num_threads] = nil
|
@@ -132,14 +155,15 @@ opts = OptionParser.new do |o|
|
|
132
155
|
# Bio::Log::CLI.trace('error')
|
133
156
|
options[:quiet] = true
|
134
157
|
end
|
135
|
-
|
158
|
+
|
136
159
|
o.on("-v", "--verbose", "Run verbosely") do |v|
|
137
160
|
options[:verbose] = true
|
138
161
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
162
|
+
|
163
|
+
o.on("--debug", "Show debug messages and keep intermediate output") do |v|
|
164
|
+
# Bio::Log::CLI.trace('debug')
|
165
|
+
options[:debug] = true
|
166
|
+
end
|
143
167
|
|
144
168
|
o.separator ""
|
145
169
|
o.on_tail('-h', '--help', 'display this help and exit') do
|
@@ -150,11 +174,11 @@ end
|
|
150
174
|
opts.parse!(ARGV)
|
151
175
|
|
152
176
|
BIOVCF_VERSION=version
|
153
|
-
BIOVCF_BANNER = "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins
|
154
|
-
$stderr.print BIOVCF_BANNER
|
177
|
+
BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
|
178
|
+
$stderr.print BIOVCF_BANNER if !options[:quiet]
|
155
179
|
|
156
|
-
if options[:show_help]
|
157
|
-
print opts
|
180
|
+
if options[:show_help]
|
181
|
+
print opts
|
158
182
|
print USAGE
|
159
183
|
exit 1
|
160
184
|
end
|
@@ -174,15 +198,6 @@ if options[:template]
|
|
174
198
|
template = Bio::Template.new(fn)
|
175
199
|
end
|
176
200
|
|
177
|
-
if options[:num_threads] != 1
|
178
|
-
begin
|
179
|
-
require 'parallel'
|
180
|
-
rescue LoadError
|
181
|
-
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
182
|
-
options[:num_threads] = 1
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
201
|
stats = nil
|
187
202
|
if options[:statistics]
|
188
203
|
options[:num_threads] = nil
|
@@ -193,6 +208,8 @@ end
|
|
193
208
|
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
194
209
|
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
195
210
|
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
211
|
+
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
|
212
|
+
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
|
196
213
|
|
197
214
|
if options[:samples]
|
198
215
|
samples = options[:samples].map { |s| s.to_i }
|
@@ -200,13 +217,14 @@ end
|
|
200
217
|
|
201
218
|
include BioVcf
|
202
219
|
|
203
|
-
# Parse the header section of a VCF file
|
220
|
+
# Parse the header section of a VCF file (chomping STDIN)
|
204
221
|
def parse_header line, samples, options
|
205
|
-
header = VcfHeader.new
|
222
|
+
header = VcfHeader.new(options[:debug])
|
206
223
|
header.add(line)
|
207
224
|
print line if not options[:skip_header]
|
208
225
|
STDIN.each_line do | headerline |
|
209
226
|
if headerline !~ /^#/
|
227
|
+
# If no records in VCF, we never get here
|
210
228
|
line = headerline
|
211
229
|
break # end of header
|
212
230
|
end
|
@@ -214,12 +232,19 @@ def parse_header line, samples, options
|
|
214
232
|
if not options[:skip_header]
|
215
233
|
if headerline =~ /^#CHR/
|
216
234
|
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
217
|
-
print header.tag(options),"\n" if not options[:skip_header]
|
235
|
+
print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
|
236
|
+
# Then the additional filter(s)
|
237
|
+
# ##FILTER=<ID=LowQual,Description="Low quality">
|
238
|
+
add_filter = options[:add_filter]
|
239
|
+
if add_filter
|
240
|
+
print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
|
241
|
+
end
|
242
|
+
|
218
243
|
selected = header.column_names
|
219
244
|
if samples
|
220
245
|
newfields = selected[0..8]
|
221
246
|
samples.each do |s|
|
222
|
-
newfields << selected[s+9]
|
247
|
+
newfields << selected[s+9]
|
223
248
|
end
|
224
249
|
selected = newfields
|
225
250
|
end
|
@@ -231,10 +256,14 @@ def parse_header line, samples, options
|
|
231
256
|
end
|
232
257
|
print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
|
233
258
|
VcfRdf::header if options[:rdf]
|
259
|
+
if line =~ /^#/
|
260
|
+
# We did not read a record
|
261
|
+
line = nil
|
262
|
+
end
|
234
263
|
return header,line
|
235
264
|
end
|
236
265
|
|
237
|
-
# Parse a VCF line and return the result as a string
|
266
|
+
# Parse a VCF line and return the (template) result as a string buffer
|
238
267
|
def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
239
268
|
fields = VcfLine.parse(line)
|
240
269
|
rec = VcfRecord.new(fields,header)
|
@@ -244,9 +273,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
244
273
|
sfilter = options[:sfilter]
|
245
274
|
efilter = options[:efilter]
|
246
275
|
ifilter = options[:ifilter]
|
276
|
+
add_filter = options[:add_filter] # contains a filter name (soft filter)
|
247
277
|
seval = options[:seval]
|
248
278
|
ignore_missing = options[:ignore_missing]
|
249
279
|
quiet = options[:quiet]
|
280
|
+
set_filter_field = nil
|
250
281
|
|
251
282
|
if sfilter or efilter or ifilter or seval
|
252
283
|
# check for samples
|
@@ -261,15 +292,27 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
261
292
|
return if not bed
|
262
293
|
end
|
263
294
|
|
264
|
-
|
265
|
-
|
266
|
-
|
295
|
+
skip = lambda { |&m|
|
296
|
+
matched = m.call
|
297
|
+
if add_filter
|
298
|
+
set_filter_field = true if matched
|
299
|
+
false # always continue processing with an add-filter
|
300
|
+
else
|
301
|
+
not matched
|
302
|
+
end
|
303
|
+
}
|
304
|
+
|
305
|
+
if filter
|
306
|
+
return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
307
|
+
end
|
308
|
+
|
309
|
+
if sfilter # sample 'or' filter
|
267
310
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
268
|
-
return if
|
311
|
+
return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
269
312
|
end
|
270
313
|
end
|
271
314
|
|
272
|
-
if ifilter
|
315
|
+
if ifilter # include sample filter
|
273
316
|
found = false
|
274
317
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
275
318
|
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
@@ -278,12 +321,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
278
321
|
end
|
279
322
|
end
|
280
323
|
# Skip if there are no matches
|
281
|
-
return if
|
324
|
+
return if skip.call {found}
|
282
325
|
end
|
283
326
|
|
284
|
-
if efilter
|
327
|
+
if efilter # exclude sample filter
|
285
328
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
286
|
-
return if
|
329
|
+
return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
287
330
|
end
|
288
331
|
end
|
289
332
|
|
@@ -291,18 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
291
334
|
|
292
335
|
# -----------------------------
|
293
336
|
# From here on decide on output
|
337
|
+
|
338
|
+
rec.add_to_filter_field(add_filter) if set_filter_field
|
339
|
+
|
294
340
|
if samples
|
295
341
|
# Select certain samples for output
|
296
342
|
newfields = fields[0..8]
|
297
343
|
samples.each do |s|
|
298
|
-
newfields << fields[s+9]
|
344
|
+
newfields << fields[s+9]
|
299
345
|
end
|
300
346
|
fields = newfields
|
301
347
|
end
|
302
348
|
if options[:eval] or seval
|
303
349
|
begin
|
304
350
|
results = nil # result string
|
305
|
-
if options[:eval]
|
351
|
+
if options[:eval]
|
306
352
|
res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
|
307
353
|
results = res if res
|
308
354
|
end
|
@@ -320,23 +366,22 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
320
366
|
exit 1
|
321
367
|
end
|
322
368
|
return results.to_s+"\n" if results
|
323
|
-
exit(1) if options[:eval_once] # <--- can this be reached?
|
324
369
|
else
|
325
370
|
if options[:rdf]
|
326
371
|
# Output Turtle RDF
|
327
372
|
VcfRdf::record(options[:id],rec,options[:tags])
|
328
373
|
elsif options[:template]
|
329
|
-
#
|
374
|
+
# Use ERB template
|
330
375
|
begin
|
331
376
|
template.body(binding)
|
332
377
|
rescue Exception => e
|
333
378
|
$stderr.print e,": ",fields,"\n"
|
334
379
|
$stderr.print e.backtrace.inspect if options[:verbose]
|
335
|
-
raise
|
380
|
+
raise
|
336
381
|
end
|
337
382
|
elsif options[:rewrite]
|
338
383
|
# Default behaviour prints VCF line, but rewrite info
|
339
|
-
eval(options[:rewrite])
|
384
|
+
eval(options[:rewrite])
|
340
385
|
(fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
341
386
|
elsif stats
|
342
387
|
# do nothing
|
@@ -347,20 +392,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
347
392
|
end
|
348
393
|
end
|
349
394
|
|
395
|
+
CHUNK_SIZE = options[:thread_lines]
|
396
|
+
|
397
|
+
pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
|
398
|
+
options[:quiet],options[:debug])
|
350
399
|
header = nil
|
351
400
|
header_output_completed = false
|
352
|
-
|
353
|
-
CHUNK_SIZE = options[:thread_lines]
|
354
|
-
CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
|
355
|
-
chunks = []
|
356
|
-
lines = []
|
401
|
+
chunk_lines = []
|
357
402
|
line_number=0
|
358
403
|
|
359
404
|
if options[:bed]
|
360
405
|
bedfilter = BedFilter.new(options[:bed])
|
361
|
-
end
|
406
|
+
end
|
362
407
|
|
363
408
|
begin
|
409
|
+
# Define linear parser function (going through one chunk)
|
364
410
|
process = lambda { | lines |
|
365
411
|
res = []
|
366
412
|
lines.each do | line |
|
@@ -368,73 +414,75 @@ begin
|
|
368
414
|
end
|
369
415
|
res
|
370
416
|
}
|
371
|
-
output = lambda { |collection|
|
372
|
-
collection.each do | result |
|
373
|
-
result.each { |line| print line }
|
374
|
-
end
|
375
|
-
} # end output
|
376
417
|
|
377
|
-
print template.header(binding) if template
|
378
418
|
# ---- Main loop
|
379
419
|
STDIN.each_line do | line |
|
380
420
|
line_number += 1
|
381
|
-
|
421
|
+
|
422
|
+
# ---- Skip embedded headers down the line...
|
382
423
|
next if header_output_completed and line =~ /^#/
|
383
|
-
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
384
|
-
header,line = parse_header(line,samples,options)
|
385
|
-
end
|
386
|
-
next if line =~ /^##/ # empty file
|
387
|
-
header_output_completed = true
|
388
|
-
if not options[:efilter_samples] and options[:ifilter_samples]
|
389
|
-
# Create exclude set as a complement of include set
|
390
|
-
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
391
|
-
end
|
392
424
|
|
393
|
-
# ---- In
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
425
|
+
# ---- In the following section header information is handled -
|
426
|
+
# this only happens once.
|
427
|
+
|
428
|
+
# ---- Parse the header lines (chomps from STDIN)
|
429
|
+
# and returns header info and the current line
|
430
|
+
if line =~ /^#/
|
431
|
+
header, line = parse_header(line,samples,options)
|
432
|
+
if line.nil?
|
433
|
+
# No line after header, so there are no records to process
|
434
|
+
break
|
400
435
|
end
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
chunks = []
|
410
|
-
# Output is forked to a separate process too
|
411
|
-
fork do
|
412
|
-
output.call out
|
413
|
-
STDOUT.flush
|
414
|
-
STDOUT.close
|
415
|
-
exit 0
|
416
|
-
end
|
417
|
-
end
|
418
|
-
lines = []
|
436
|
+
end
|
437
|
+
# p [line_number,line]
|
438
|
+
# ---- After the header continue processing
|
439
|
+
if not header_output_completed
|
440
|
+
# one-time post-header processing
|
441
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
442
|
+
# Create exclude set as a complement of include set
|
443
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
419
444
|
end
|
445
|
+
print template.header(binding) if template
|
446
|
+
header_output_completed = true
|
447
|
+
end
|
448
|
+
|
449
|
+
if options[:eval_once]
|
450
|
+
# this happens if we only want one line evaluated - say to get
|
451
|
+
# the number of samples
|
452
|
+
print parse_line(line,header,options,bedfilter,samples,template,stats)
|
453
|
+
exit 0
|
454
|
+
end
|
455
|
+
|
456
|
+
# ---- Lines are collected in one buffer and the lines buffer
|
457
|
+
# is added to the chunks list (for the threads)
|
458
|
+
chunk_lines << line
|
459
|
+
|
460
|
+
# ---- In the following section the VCF lines are parsed by chunks
|
461
|
+
# The chunks may go into different threads
|
462
|
+
|
463
|
+
if chunk_lines.size >= CHUNK_SIZE
|
464
|
+
# ---- process one chunk
|
465
|
+
$stderr.print '.' if not options[:quiet]
|
466
|
+
pcows.wait_for_worker_slot()
|
467
|
+
pcows.submit_worker(process,chunk_lines)
|
468
|
+
pcows.process_output()
|
469
|
+
|
470
|
+
chunk_lines = []
|
420
471
|
end
|
421
472
|
end
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
chunks << lines
|
427
|
-
output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
|
428
|
-
process.call(chunk)
|
429
|
-
}
|
430
|
-
end
|
473
|
+
pcows.submit_final_worker(process,chunk_lines)
|
474
|
+
pcows.wait_for_workers()
|
475
|
+
pcows.process_remaining_output()
|
476
|
+
|
431
477
|
print template.footer(binding) if template
|
432
478
|
stats.print if stats
|
433
479
|
|
434
480
|
rescue Exception => e
|
435
|
-
|
436
|
-
|
481
|
+
if e.message != 'exit'
|
482
|
+
$stderr.print "ERROR: "
|
483
|
+
$stderr.print e.message,"\n"
|
484
|
+
end
|
485
|
+
pcows.cleanup()
|
437
486
|
raise if options[:verbose]
|
438
487
|
exit 1
|
439
488
|
end
|
440
|
-
|