bio-vcf 0.8.1 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +2 -8
- data/LICENSE.txt +1 -1
- data/README.md +467 -129
- data/RELEASE_NOTES.md +27 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +9 -42
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +156 -108
- data/bio-vcf.gemspec +13 -75
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +12 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +1 -11
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +1 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +19 -19
- data/lib/bio-vcf/vcfheader.rb +137 -5
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +26 -2
- data/lib/regressiontest.rb +11 -0
- data/lib/regressiontest/cli_exec.rb +101 -0
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +16 -16
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -2
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +63 -64
- data/Gemfile.lock +0 -81
data/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
## ChangeLog v0.9.5 (20210118)
|
2
|
+
|
3
|
+
+ Improved README and installation instructions
|
4
|
+
+ Added guix.scm build and instructions (no need for bundler)
|
5
|
+
+ Moved regressiontest into tree
|
6
|
+
|
7
|
+
## ChangeLog v0.9.4 (20201222)
|
8
|
+
|
9
|
+
This is an important maintenance release of bio-vcf:
|
10
|
+
|
11
|
+
+ Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf)
|
12
|
+
+ Fixed tests to match recent Ruby updates
|
13
|
+
|
14
|
+
## Older release notes
|
15
|
+
|
16
|
+
+ Getting ready for a 1.0 release
|
17
|
+
+ Released 0.9.2 as a gem
|
18
|
+
+ 0.9.1 removed a rare threading bug and cleanup on error
|
19
|
+
+ Added support for soft filters (request by Brad Chapman)
|
20
|
+
+ The outputter now writes (properly) in parallel with the parser
|
21
|
+
+ bio-vcf turns any VCF into JSON with header information, and
|
22
|
+
allows you to pipe that JSON directly into any JSON supporting
|
23
|
+
language, including Python and Javascript!
|
24
|
+
|
25
|
+
## Older changes
|
26
|
+
|
27
|
+
For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
|
data/RELEASE_NOTES.md~
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
## RELEASE NOTES
|
2
|
+
|
3
|
+
|
4
|
+
* Getting ready for a 1.0 release
|
5
|
+
* Released 0.9.2 as a gem
|
6
|
+
* 0.9.1 removed a rare threading bug and cleanup on error
|
7
|
+
* Added support for soft filters (request by Brad Chapman)
|
8
|
+
* The outputter now writes (properly) in parallel with the parser
|
9
|
+
* bio-vcf turns any VCF into JSON with header information, and
|
10
|
+
allows you to pipe that JSON directly into any JSON supporting
|
11
|
+
language, including Python and Javascript!
|
data/Rakefile
CHANGED
@@ -1,54 +1,21 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
3
|
+
# require 'rubygems'
|
12
4
|
require 'rake'
|
5
|
+
# require 'cucumber/rake/task'
|
13
6
|
|
14
|
-
|
15
|
-
|
16
|
-
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.name = "bio-vcf"
|
18
|
-
gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
19
|
-
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{Fast multi-threaded VCF parser}
|
21
|
-
gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
|
22
|
-
gem.email = "pjotr.public01@thebird.nl"
|
23
|
-
gem.authors = ["Pjotr Prins"]
|
24
|
-
gem.required_ruby_version = '>=2.0.0'
|
25
|
-
# dependencies defined in Gemfile
|
26
|
-
end
|
27
|
-
Jeweler::RubygemsDotOrgTasks.new
|
28
|
-
|
29
|
-
# require 'rspec/core'
|
30
|
-
# require 'rspec/core/rake_task'
|
31
|
-
# RSpec::Core::RakeTask.new(:spec) do |spec|
|
32
|
-
# spec.pattern = FileList['spec/**/*_spec.rb']
|
33
|
-
# end
|
34
|
-
|
35
|
-
# RSpec::Core::RakeTask.new(:rcov) do |spec|
|
36
|
-
# spec.pattern = 'spec/**/*_spec.rb'
|
37
|
-
# spec.rcov = true
|
7
|
+
# Cucumber::Rake::Task.new(:features) do |t|
|
8
|
+
# t.cucumber_opts = "--bundler false"
|
38
9
|
# end
|
39
10
|
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
# end
|
45
|
-
|
46
|
-
require 'cucumber/rake/task'
|
47
|
-
Cucumber::Rake::Task.new(:features)
|
11
|
+
desc 'Run cucumber' # without bundler
|
12
|
+
task :features do
|
13
|
+
sh 'cucumber features'
|
14
|
+
end
|
48
15
|
|
49
16
|
task :default => :features
|
50
17
|
|
51
|
-
task :test => [ :features ]
|
18
|
+
task :test => [ :features ]
|
52
19
|
|
53
20
|
require 'rdoc/task'
|
54
21
|
Rake::RDocTask.new do |rdoc|
|
data/TAGS
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
|
2
|
+
./bin/bio-vcf,0
|
3
|
+
|
4
|
+
./lib/bio-vcf.rb,0
|
5
|
+
|
6
|
+
./lib/bio-vcf/vcfgenotypefield.rb,1553
|
7
|
+
module BioVcf::BioVcf1,0
|
8
|
+
class VcfNucleotides::BioVcf::VcfNucleotides7,167
|
9
|
+
def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
|
10
|
+
def []::BioVcf::VcfNucleotides#[]13,284
|
11
|
+
def to_ary::BioVcf::VcfNucleotides#to_ary27,628
|
12
|
+
def max::BioVcf::VcfNucleotides#max32,742
|
13
|
+
def min::BioVcf::VcfNucleotides#min37,856
|
14
|
+
def sum::BioVcf::VcfNucleotides#sum42,975
|
15
|
+
class VcfAltInfo::BioVcf::VcfAltInfo50,1082
|
16
|
+
def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
|
17
|
+
def []::BioVcf::VcfAltInfo#[]56,1194
|
18
|
+
def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
|
19
|
+
def max::BioVcf::VcfAltInfo#max75,1626
|
20
|
+
def min::BioVcf::VcfAltInfo#min79,1702
|
21
|
+
def sum::BioVcf::VcfAltInfo#sum83,1783
|
22
|
+
class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
|
23
|
+
def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
|
24
|
+
def dp4::BioVcf::VcfGenotypeField#dp496,2020
|
25
|
+
def ad::BioVcf::VcfGenotypeField#ad100,2098
|
26
|
+
def pl::BioVcf::VcfGenotypeField#pl104,2174
|
27
|
+
def bcount::BioVcf::VcfGenotypeField#bcount108,2250
|
28
|
+
def bq::BioVcf::VcfGenotypeField#bq112,2343
|
29
|
+
def amq::BioVcf::VcfGenotypeField#amq116,2424
|
30
|
+
def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
|
31
|
+
class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
|
32
|
+
def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
|
33
|
+
def []::BioVcf::VcfGenotypeFields#[]141,3021
|
34
|
+
def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
|
35
|
+
|
36
|
+
./lib/bio-vcf/vcfrdf.rb,156
|
37
|
+
module BioVcf::BioVcf1,0
|
38
|
+
module VcfRdf::BioVcf::VcfRdf5,93
|
39
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
|
40
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
|
41
|
+
|
42
|
+
./lib/bio-vcf/vcf.rb,27
|
43
|
+
module BioVcf::BioVcf2,1
|
44
|
+
|
45
|
+
./lib/bio-vcf/vcfline.rb,118
|
46
|
+
module BioVcf::BioVcf1,0
|
47
|
+
module VcfLine::BioVcf::VcfLine2,16
|
48
|
+
def VcfLine.parse::BioVcf::VcfLine.parse5,82
|
49
|
+
|
50
|
+
./lib/bio-vcf/vcfrecord.rb,1831
|
51
|
+
module BioVcf::BioVcf1,0
|
52
|
+
class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
|
53
|
+
def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
|
54
|
+
def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
|
55
|
+
module VcfRecordParser::BioVcf::VcfRecordParser18,329
|
56
|
+
def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
|
57
|
+
def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
|
58
|
+
module VcfRecordCall::BioVcf::VcfRecordCall30,592
|
59
|
+
def call_diff::BioVcf::VcfRecordCall#call_diff31,617
|
60
|
+
def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
|
61
|
+
def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
|
62
|
+
def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
|
63
|
+
def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
|
64
|
+
def index::BioVcf::VcfRecordCall#index51,1026
|
65
|
+
class VcfRecord::BioVcf::VcfRecord56,1125
|
66
|
+
attr_reader :header::BioVcf::VcfRecord#header60,1173
|
67
|
+
def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
|
68
|
+
def chrom::BioVcf::VcfRecord#chrom67,1292
|
69
|
+
def pos::BioVcf::VcfRecord#pos71,1332
|
70
|
+
def ids::BioVcf::VcfRecord#ids75,1384
|
71
|
+
def id::BioVcf::VcfRecord#id79,1443
|
72
|
+
def ref::BioVcf::VcfRecord#ref83,1476
|
73
|
+
def alt::BioVcf::VcfRecord#alt87,1524
|
74
|
+
def qual::BioVcf::VcfRecord#qual91,1582
|
75
|
+
def info::BioVcf::VcfRecord#info95,1636
|
76
|
+
def format::BioVcf::VcfRecord#format99,1711
|
77
|
+
def normal::BioVcf::VcfRecord#normal104,1848
|
78
|
+
def tumor::BioVcf::VcfRecord#tumor109,1997
|
79
|
+
def sample::BioVcf::VcfRecord#sample114,2134
|
80
|
+
def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
|
81
|
+
def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
|
82
|
+
def method_missing::BioVcf::VcfRecord#method_missing126,2341
|
83
|
+
|
84
|
+
./lib/bio-vcf/variant.rb,470
|
85
|
+
module BioVcf::BioVcf1,0
|
86
|
+
module Variant::BioVcf::Variant3,17
|
87
|
+
def Variant.diff::BioVcf::Variant.diff5,37
|
88
|
+
def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
|
89
|
+
def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
|
90
|
+
def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
|
91
|
+
def Variant.index::BioVcf::Variant.index25,652
|
92
|
+
def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
|
93
|
+
|
94
|
+
./lib/bio-vcf/vcfheader.rb,598
|
95
|
+
module BioVcf::BioVcf2,1
|
96
|
+
module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
|
97
|
+
def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
|
98
|
+
class VcfHeader::BioVcf::VcfHeader18,339
|
99
|
+
attr_reader :lines::BioVcf::VcfHeader#lines20,360
|
100
|
+
def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
|
101
|
+
def add::BioVcf::VcfHeader#add26,430
|
102
|
+
def version::BioVcf::VcfHeader#version30,483
|
103
|
+
def column_names::BioVcf::VcfHeader#column_names34,578
|
104
|
+
def columns::BioVcf::VcfHeader#columns38,674
|
105
|
+
def samples::BioVcf::VcfHeader#samples42,735
|
106
|
+
|
107
|
+
./features/step_definitions/diff_count.rb,0
|
108
|
+
|
109
|
+
./features/step_definitions/bio-vcf_steps.rb,0
|
110
|
+
|
111
|
+
./features/step_definitions/somaticsniper.rb,0
|
112
|
+
|
113
|
+
./features/step_definitions/multisample.rb,0
|
114
|
+
|
115
|
+
./features/support/env.rb,0
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.1
|
1
|
+
0.9.5
|
data/bin/bio-vcf
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Author:: Pjotr Prins
|
5
5
|
# License:: MIT
|
6
6
|
#
|
7
|
-
# Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
|
7
|
+
# Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
|
8
8
|
|
9
9
|
USAGE = "Vcf parser"
|
10
10
|
|
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
|
|
15
15
|
version = File.new(VERSION_FILENAME).read.chomp
|
16
16
|
|
17
17
|
require 'bio-vcf'
|
18
|
+
require 'bio-vcf/pcows'
|
18
19
|
require 'optparse'
|
19
20
|
require 'timeout'
|
20
21
|
require 'fileutils'
|
21
22
|
|
22
|
-
# Uncomment when using the bio-logger
|
23
|
+
# Uncomment when using the bio-logger
|
23
24
|
# require 'bio-logger'
|
24
25
|
# log = Bio::Log::LoggerPlus.new 'vcf'
|
25
|
-
# log.outputters = Bio::Log::Outputter.stderr
|
26
|
+
# log.outputters = Bio::Log::Outputter.stderr
|
26
27
|
# Bio::Log::CLI.logger('stderr')
|
27
28
|
# Bio::Log::CLI.trace('info')
|
28
29
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/
|
30
|
+
options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
|
30
31
|
opts = OptionParser.new do |o|
|
31
32
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
33
|
|
33
|
-
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
|
+
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
35
|
options[:ignore_missing] = true
|
35
36
|
end
|
36
37
|
o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
|
@@ -57,6 +58,9 @@ opts = OptionParser.new do |o|
|
|
57
58
|
o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
|
58
59
|
options[:efilter_samples] = l
|
59
60
|
end
|
61
|
+
o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
|
62
|
+
options[:add_filter] = name
|
63
|
+
end
|
60
64
|
|
61
65
|
o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
|
62
66
|
options[:bed] = bed
|
@@ -68,6 +72,9 @@ opts = OptionParser.new do |o|
|
|
68
72
|
o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
|
69
73
|
options[:eval_once] = true
|
70
74
|
options[:eval] = cmd
|
75
|
+
# options[:num_threads] = 1
|
76
|
+
# options[:thread_lines] = 1
|
77
|
+
options[:skip_header] = true
|
71
78
|
end
|
72
79
|
o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
|
73
80
|
options[:seval] = cmd
|
@@ -84,7 +91,7 @@ opts = OptionParser.new do |o|
|
|
84
91
|
options[:rdf] = true
|
85
92
|
options[:skip_header] = true
|
86
93
|
end
|
87
|
-
o.on("--num-threads [num]", Integer, "Multi-core version (default
|
94
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
|
88
95
|
options[:num_threads] = i
|
89
96
|
end
|
90
97
|
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
@@ -96,8 +103,8 @@ opts = OptionParser.new do |o|
|
|
96
103
|
o.on_tail("--tags list", String, "Add tags") do |s|
|
97
104
|
options[:tags] = s
|
98
105
|
end
|
99
|
-
|
100
|
-
o.on("--skip-header", "Do not output VCF header info") do
|
106
|
+
|
107
|
+
o.on("--skip-header", "Do not output VCF header info") do
|
101
108
|
options[:skip_header] = true
|
102
109
|
end
|
103
110
|
|
@@ -112,9 +119,16 @@ opts = OptionParser.new do |o|
|
|
112
119
|
options[:template] = s
|
113
120
|
options[:skip_header] = true
|
114
121
|
end
|
115
|
-
|
116
|
-
|
117
|
-
|
122
|
+
|
123
|
+
o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
|
124
|
+
options[:tag] = true
|
125
|
+
end
|
126
|
+
|
127
|
+
o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
|
128
|
+
options[:timeout] = i
|
129
|
+
end
|
130
|
+
|
131
|
+
# Uncomment the following when using the bio-logger
|
118
132
|
# o.separator ""
|
119
133
|
# o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
120
134
|
# Bio::Log::CLI.logger(name)
|
@@ -123,7 +137,16 @@ opts = OptionParser.new do |o|
|
|
123
137
|
# o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
|
124
138
|
# Bio::Log::CLI.trace(s)
|
125
139
|
# end
|
126
|
-
#
|
140
|
+
#
|
141
|
+
o.on("--names", "Output sample names") do |q|
|
142
|
+
options[:quiet] = true
|
143
|
+
options[:num_threads] = nil
|
144
|
+
options[:eval_once] = true
|
145
|
+
options[:eval] = 'header.samples.join("\t")'
|
146
|
+
# options[:num_threads] = 1
|
147
|
+
# options[:thread_lines] = 1
|
148
|
+
options[:skip_header] = true
|
149
|
+
end
|
127
150
|
o.on("--statistics", "Output statistics") do |q|
|
128
151
|
options[:statistics] = true
|
129
152
|
options[:num_threads] = nil
|
@@ -132,14 +155,15 @@ opts = OptionParser.new do |o|
|
|
132
155
|
# Bio::Log::CLI.trace('error')
|
133
156
|
options[:quiet] = true
|
134
157
|
end
|
135
|
-
|
158
|
+
|
136
159
|
o.on("-v", "--verbose", "Run verbosely") do |v|
|
137
160
|
options[:verbose] = true
|
138
161
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
162
|
+
|
163
|
+
o.on("--debug", "Show debug messages and keep intermediate output") do |v|
|
164
|
+
# Bio::Log::CLI.trace('debug')
|
165
|
+
options[:debug] = true
|
166
|
+
end
|
143
167
|
|
144
168
|
o.separator ""
|
145
169
|
o.on_tail('-h', '--help', 'display this help and exit') do
|
@@ -150,11 +174,11 @@ end
|
|
150
174
|
opts.parse!(ARGV)
|
151
175
|
|
152
176
|
BIOVCF_VERSION=version
|
153
|
-
BIOVCF_BANNER = "vcf #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins
|
154
|
-
$stderr.print BIOVCF_BANNER
|
177
|
+
BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
|
178
|
+
$stderr.print BIOVCF_BANNER if !options[:quiet]
|
155
179
|
|
156
|
-
if options[:show_help]
|
157
|
-
print opts
|
180
|
+
if options[:show_help]
|
181
|
+
print opts
|
158
182
|
print USAGE
|
159
183
|
exit 1
|
160
184
|
end
|
@@ -174,15 +198,6 @@ if options[:template]
|
|
174
198
|
template = Bio::Template.new(fn)
|
175
199
|
end
|
176
200
|
|
177
|
-
if options[:num_threads] != 1
|
178
|
-
begin
|
179
|
-
require 'parallel'
|
180
|
-
rescue LoadError
|
181
|
-
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
182
|
-
options[:num_threads] = 1
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
201
|
stats = nil
|
187
202
|
if options[:statistics]
|
188
203
|
options[:num_threads] = nil
|
@@ -193,6 +208,8 @@ end
|
|
193
208
|
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
194
209
|
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
195
210
|
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
211
|
+
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
|
212
|
+
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
|
196
213
|
|
197
214
|
if options[:samples]
|
198
215
|
samples = options[:samples].map { |s| s.to_i }
|
@@ -200,13 +217,14 @@ end
|
|
200
217
|
|
201
218
|
include BioVcf
|
202
219
|
|
203
|
-
# Parse the header section of a VCF file
|
220
|
+
# Parse the header section of a VCF file (chomping STDIN)
|
204
221
|
def parse_header line, samples, options
|
205
|
-
header = VcfHeader.new
|
222
|
+
header = VcfHeader.new(options[:debug])
|
206
223
|
header.add(line)
|
207
224
|
print line if not options[:skip_header]
|
208
225
|
STDIN.each_line do | headerline |
|
209
226
|
if headerline !~ /^#/
|
227
|
+
# If no records in VCF, we never get here
|
210
228
|
line = headerline
|
211
229
|
break # end of header
|
212
230
|
end
|
@@ -214,12 +232,19 @@ def parse_header line, samples, options
|
|
214
232
|
if not options[:skip_header]
|
215
233
|
if headerline =~ /^#CHR/
|
216
234
|
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
217
|
-
print header.tag(options),"\n" if not options[:skip_header]
|
235
|
+
print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
|
236
|
+
# Then the additional filter(s)
|
237
|
+
# ##FILTER=<ID=LowQual,Description="Low quality">
|
238
|
+
add_filter = options[:add_filter]
|
239
|
+
if add_filter
|
240
|
+
print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
|
241
|
+
end
|
242
|
+
|
218
243
|
selected = header.column_names
|
219
244
|
if samples
|
220
245
|
newfields = selected[0..8]
|
221
246
|
samples.each do |s|
|
222
|
-
newfields << selected[s+9]
|
247
|
+
newfields << selected[s+9]
|
223
248
|
end
|
224
249
|
selected = newfields
|
225
250
|
end
|
@@ -231,10 +256,14 @@ def parse_header line, samples, options
|
|
231
256
|
end
|
232
257
|
print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
|
233
258
|
VcfRdf::header if options[:rdf]
|
259
|
+
if line =~ /^#/
|
260
|
+
# We did not read a record
|
261
|
+
line = nil
|
262
|
+
end
|
234
263
|
return header,line
|
235
264
|
end
|
236
265
|
|
237
|
-
# Parse a VCF line and return the result as a string
|
266
|
+
# Parse a VCF line and return the (template) result as a string buffer
|
238
267
|
def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
239
268
|
fields = VcfLine.parse(line)
|
240
269
|
rec = VcfRecord.new(fields,header)
|
@@ -244,9 +273,11 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
244
273
|
sfilter = options[:sfilter]
|
245
274
|
efilter = options[:efilter]
|
246
275
|
ifilter = options[:ifilter]
|
276
|
+
add_filter = options[:add_filter] # contains a filter name (soft filter)
|
247
277
|
seval = options[:seval]
|
248
278
|
ignore_missing = options[:ignore_missing]
|
249
279
|
quiet = options[:quiet]
|
280
|
+
set_filter_field = nil
|
250
281
|
|
251
282
|
if sfilter or efilter or ifilter or seval
|
252
283
|
# check for samples
|
@@ -261,15 +292,27 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
261
292
|
return if not bed
|
262
293
|
end
|
263
294
|
|
264
|
-
|
265
|
-
|
266
|
-
|
295
|
+
skip = lambda { |&m|
|
296
|
+
matched = m.call
|
297
|
+
if add_filter
|
298
|
+
set_filter_field = true if matched
|
299
|
+
false # always continue processing with an add-filter
|
300
|
+
else
|
301
|
+
not matched
|
302
|
+
end
|
303
|
+
}
|
304
|
+
|
305
|
+
if filter
|
306
|
+
return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
307
|
+
end
|
308
|
+
|
309
|
+
if sfilter # sample 'or' filter
|
267
310
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
268
|
-
return if
|
311
|
+
return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
269
312
|
end
|
270
313
|
end
|
271
314
|
|
272
|
-
if ifilter
|
315
|
+
if ifilter # include sample filter
|
273
316
|
found = false
|
274
317
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
275
318
|
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
@@ -278,12 +321,12 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
278
321
|
end
|
279
322
|
end
|
280
323
|
# Skip if there are no matches
|
281
|
-
return if
|
324
|
+
return if skip.call {found}
|
282
325
|
end
|
283
326
|
|
284
|
-
if efilter
|
327
|
+
if efilter # exclude sample filter
|
285
328
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
286
|
-
return if
|
329
|
+
return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
287
330
|
end
|
288
331
|
end
|
289
332
|
|
@@ -291,18 +334,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
291
334
|
|
292
335
|
# -----------------------------
|
293
336
|
# From here on decide on output
|
337
|
+
|
338
|
+
rec.add_to_filter_field(add_filter) if set_filter_field
|
339
|
+
|
294
340
|
if samples
|
295
341
|
# Select certain samples for output
|
296
342
|
newfields = fields[0..8]
|
297
343
|
samples.each do |s|
|
298
|
-
newfields << fields[s+9]
|
344
|
+
newfields << fields[s+9]
|
299
345
|
end
|
300
346
|
fields = newfields
|
301
347
|
end
|
302
348
|
if options[:eval] or seval
|
303
349
|
begin
|
304
350
|
results = nil # result string
|
305
|
-
if options[:eval]
|
351
|
+
if options[:eval]
|
306
352
|
res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
|
307
353
|
results = res if res
|
308
354
|
end
|
@@ -320,23 +366,22 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
320
366
|
exit 1
|
321
367
|
end
|
322
368
|
return results.to_s+"\n" if results
|
323
|
-
exit(1) if options[:eval_once] # <--- can this be reached?
|
324
369
|
else
|
325
370
|
if options[:rdf]
|
326
371
|
# Output Turtle RDF
|
327
372
|
VcfRdf::record(options[:id],rec,options[:tags])
|
328
373
|
elsif options[:template]
|
329
|
-
#
|
374
|
+
# Use ERB template
|
330
375
|
begin
|
331
376
|
template.body(binding)
|
332
377
|
rescue Exception => e
|
333
378
|
$stderr.print e,": ",fields,"\n"
|
334
379
|
$stderr.print e.backtrace.inspect if options[:verbose]
|
335
|
-
raise
|
380
|
+
raise
|
336
381
|
end
|
337
382
|
elsif options[:rewrite]
|
338
383
|
# Default behaviour prints VCF line, but rewrite info
|
339
|
-
eval(options[:rewrite])
|
384
|
+
eval(options[:rewrite])
|
340
385
|
(fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
341
386
|
elsif stats
|
342
387
|
# do nothing
|
@@ -347,20 +392,21 @@ def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
|
347
392
|
end
|
348
393
|
end
|
349
394
|
|
395
|
+
CHUNK_SIZE = options[:thread_lines]
|
396
|
+
|
397
|
+
pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
|
398
|
+
options[:quiet],options[:debug])
|
350
399
|
header = nil
|
351
400
|
header_output_completed = false
|
352
|
-
|
353
|
-
CHUNK_SIZE = options[:thread_lines]
|
354
|
-
CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
|
355
|
-
chunks = []
|
356
|
-
lines = []
|
401
|
+
chunk_lines = []
|
357
402
|
line_number=0
|
358
403
|
|
359
404
|
if options[:bed]
|
360
405
|
bedfilter = BedFilter.new(options[:bed])
|
361
|
-
end
|
406
|
+
end
|
362
407
|
|
363
408
|
begin
|
409
|
+
# Define linear parser function (going through one chunk)
|
364
410
|
process = lambda { | lines |
|
365
411
|
res = []
|
366
412
|
lines.each do | line |
|
@@ -368,73 +414,75 @@ begin
|
|
368
414
|
end
|
369
415
|
res
|
370
416
|
}
|
371
|
-
output = lambda { |collection|
|
372
|
-
collection.each do | result |
|
373
|
-
result.each { |line| print line }
|
374
|
-
end
|
375
|
-
} # end output
|
376
417
|
|
377
|
-
print template.header(binding) if template
|
378
418
|
# ---- Main loop
|
379
419
|
STDIN.each_line do | line |
|
380
420
|
line_number += 1
|
381
|
-
|
421
|
+
|
422
|
+
# ---- Skip embedded headers down the line...
|
382
423
|
next if header_output_completed and line =~ /^#/
|
383
|
-
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
384
|
-
header,line = parse_header(line,samples,options)
|
385
|
-
end
|
386
|
-
next if line =~ /^##/ # empty file
|
387
|
-
header_output_completed = true
|
388
|
-
if not options[:efilter_samples] and options[:ifilter_samples]
|
389
|
-
# Create exclude set as a complement of include set
|
390
|
-
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
391
|
-
end
|
392
424
|
|
393
|
-
# ---- In
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
425
|
+
# ---- In the following section header information is handled -
|
426
|
+
# this only happens once.
|
427
|
+
|
428
|
+
# ---- Parse the header lines (chomps from STDIN)
|
429
|
+
# and returns header info and the current line
|
430
|
+
if line =~ /^#/
|
431
|
+
header, line = parse_header(line,samples,options)
|
432
|
+
if line.nil?
|
433
|
+
# No line after header, so there are no records to process
|
434
|
+
break
|
400
435
|
end
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
chunks = []
|
410
|
-
# Output is forked to a separate process too
|
411
|
-
fork do
|
412
|
-
output.call out
|
413
|
-
STDOUT.flush
|
414
|
-
STDOUT.close
|
415
|
-
exit 0
|
416
|
-
end
|
417
|
-
end
|
418
|
-
lines = []
|
436
|
+
end
|
437
|
+
# p [line_number,line]
|
438
|
+
# ---- After the header continue processing
|
439
|
+
if not header_output_completed
|
440
|
+
# one-time post-header processing
|
441
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
442
|
+
# Create exclude set as a complement of include set
|
443
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
419
444
|
end
|
445
|
+
print template.header(binding) if template
|
446
|
+
header_output_completed = true
|
447
|
+
end
|
448
|
+
|
449
|
+
if options[:eval_once]
|
450
|
+
# this happens if we only want one line evaluated - say to get
|
451
|
+
# the number of samples
|
452
|
+
print parse_line(line,header,options,bedfilter,samples,template,stats)
|
453
|
+
exit 0
|
454
|
+
end
|
455
|
+
|
456
|
+
# ---- Lines are collected in one buffer and the lines buffer
|
457
|
+
# is added to the chunks list (for the threads)
|
458
|
+
chunk_lines << line
|
459
|
+
|
460
|
+
# ---- In the following section the VCF lines are parsed by chunks
|
461
|
+
# The chunks may go into different threads
|
462
|
+
|
463
|
+
if chunk_lines.size >= CHUNK_SIZE
|
464
|
+
# ---- process one chunk
|
465
|
+
$stderr.print '.' if not options[:quiet]
|
466
|
+
pcows.wait_for_worker_slot()
|
467
|
+
pcows.submit_worker(process,chunk_lines)
|
468
|
+
pcows.process_output()
|
469
|
+
|
470
|
+
chunk_lines = []
|
420
471
|
end
|
421
472
|
end
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
chunks << lines
|
427
|
-
output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
|
428
|
-
process.call(chunk)
|
429
|
-
}
|
430
|
-
end
|
473
|
+
pcows.submit_final_worker(process,chunk_lines)
|
474
|
+
pcows.wait_for_workers()
|
475
|
+
pcows.process_remaining_output()
|
476
|
+
|
431
477
|
print template.footer(binding) if template
|
432
478
|
stats.print if stats
|
433
479
|
|
434
480
|
rescue Exception => e
|
435
|
-
|
436
|
-
|
481
|
+
if e.message != 'exit'
|
482
|
+
$stderr.print "ERROR: "
|
483
|
+
$stderr.print e.message,"\n"
|
484
|
+
end
|
485
|
+
pcows.cleanup()
|
437
486
|
raise if options[:verbose]
|
438
487
|
exit 1
|
439
488
|
end
|
440
|
-
|