bio-vcf 0.8.0 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +1 -11
- data/Gemfile +4 -5
- data/Gemfile.lock +28 -65
- data/LICENSE.txt +1 -1
- data/README.md +387 -107
- data/RELEASE_NOTES.md +20 -0
- data/RELEASE_NOTES.md~ +11 -0
- data/Rakefile +3 -40
- data/TAGS +115 -0
- data/VERSION +1 -1
- data/bin/bio-vcf +176 -109
- data/bio-vcf.gemspec +14 -70
- data/features/cli.feature +22 -4
- data/features/diff_count.feature +0 -1
- data/features/filter.feature +12 -0
- data/features/multisample.feature +25 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +15 -6
- data/features/step_definitions/diff_count.rb +1 -1
- data/features/step_definitions/multisample.rb +19 -0
- data/features/step_definitions/somaticsniper.rb +9 -1
- data/features/step_definitions/vcf_header.rb +48 -0
- data/features/support/env.rb +0 -9
- data/features/vcf_header.feature +35 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/bedfilter.rb +43 -0
- data/lib/bio-vcf/pcows.rb +303 -0
- data/lib/bio-vcf/template.rb +75 -0
- data/lib/bio-vcf/vcffile.rb +46 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +25 -20
- data/lib/bio-vcf/vcfheader.rb +146 -6
- data/lib/bio-vcf/vcfheader_line.rb +778 -0
- data/lib/bio-vcf/vcfrecord.rb +56 -18
- data/lib/bio-vcf/vcfsample.rb +27 -3
- data/ragel/gen_vcfheaderline_parser.rl +165 -0
- data/ragel/generate.sh +8 -0
- data/template/vcf2json.erb +19 -7
- data/template/vcf2json_full_header.erb +22 -0
- data/template/vcf2json_use_meta.erb +41 -0
- data/template/vcf2rdf_header.erb +24 -0
- data/test/data/input/empty.vcf +2 -0
- data/test/data/input/gatk_exome.vcf +237 -0
- data/test/data/input/gatk_wgs.vcf +1000 -0
- data/test/data/input/test.bed +632 -0
- data/test/data/regression/empty-stderr.new +12 -0
- data/test/data/regression/empty.new +2 -0
- data/test/data/regression/empty.ref +2 -0
- data/test/data/regression/eval_once-stderr.new +2 -0
- data/test/data/regression/eval_once.new +1 -0
- data/test/data/regression/eval_once.ref +1 -0
- data/test/data/regression/eval_r.info.dp-stderr.new +10 -0
- data/test/data/regression/eval_r.info.dp.new +150 -0
- data/test/data/regression/ifilter_s.dp-stderr.new +34 -0
- data/test/data/regression/ifilter_s.dp.new +31 -0
- data/test/data/regression/pass1-stderr.new +10 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +4 -0
- data/test/data/regression/r.info.dp.new +114 -0
- data/test/data/regression/rewrite.info.sample-stderr.new +10 -0
- data/test/data/regression/rewrite.info.sample.new +150 -0
- data/test/data/regression/s.dp-stderr.new +18 -0
- data/test/data/regression/s.dp.new +145 -0
- data/test/data/regression/seval_s.dp-stderr.new +10 -0
- data/test/data/regression/seval_s.dp.new +36 -0
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +18 -0
- data/test/data/regression/sfilter_seval_s.dp.new +31 -0
- data/test/data/regression/thread4-stderr.new +10 -0
- data/test/data/regression/thread4.new +150 -0
- data/test/data/regression/thread4_4-stderr.new +25 -0
- data/test/data/regression/thread4_4.new +130 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.new +5 -0
- data/test/data/regression/thread4_4_failed_filter-stderr.ref +5 -1
- data/test/data/regression/thread4_4_failed_filter.new +110 -0
- data/test/data/regression/vcf2json_full_header-stderr.new +10 -0
- data/test/data/regression/vcf2json_full_header.new +225 -0
- data/test/data/regression/vcf2json_full_header.ref +225 -0
- data/test/data/regression/vcf2json_use_meta-stderr.new +10 -0
- data/test/data/regression/vcf2json_use_meta.new +4697 -0
- data/test/data/regression/vcf2json_use_meta.ref +4697 -0
- data/test/performance/metrics.md +18 -1
- data/test/stress/stress_test.sh +15 -0
- data/test/tmp/test.vcf +12469 -0
- metadata +65 -64
data/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
## ChangeLog v0.9.4 (2020????)
|
2
|
+
|
3
|
+
This is an important maintenance release of bio-vcf:
|
4
|
+
|
5
|
+
+ Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf).
|
6
|
+
|
7
|
+
## Older release notes
|
8
|
+
|
9
|
+
+ Getting ready for a 1.0 release
|
10
|
+
+ Released 0.9.2 as a gem
|
11
|
+
+ 0.9.1 removed a rare threading bug and cleanup on error
|
12
|
+
+ Added support for soft filters (request by Brad Chapman)
|
13
|
+
+ The outputter now writes (properly) in parallel with the parser
|
14
|
+
+ bio-vcf turns any VCF into JSON with header information, and
|
15
|
+
allows you to pipe that JSON directly into any JSON-supporting
|
16
|
+
language, including Python and JavaScript!
|
17
|
+
|
18
|
+
## Older changes
|
19
|
+
|
20
|
+
For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
|
data/RELEASE_NOTES.md~
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
## RELEASE NOTES
|
2
|
+
|
3
|
+
|
4
|
+
* Getting ready for a 1.0 release
|
5
|
+
* Released 0.9.2 as a gem
|
6
|
+
* 0.9.1 removed a rare threading bug and cleanup on error
|
7
|
+
* Added support for soft filters (request by Brad Chapman)
|
8
|
+
* The outputter now writes (properly) in parallel with the parser
|
9
|
+
* bio-vcf turns any VCF into JSON with header information, and
|
10
|
+
allows you to pipe that JSON directly into any JSON supporting
|
11
|
+
language, including Python and Javascript!
|
data/Rakefile
CHANGED
@@ -1,49 +1,12 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
4
|
require 'rake'
|
13
5
|
|
14
|
-
require 'jeweler'
|
15
|
-
Jeweler::Tasks.new do |gem|
|
16
|
-
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.name = "bio-vcf"
|
18
|
-
gem.homepage = "http://github.com/pjotrp/bioruby-vcf"
|
19
|
-
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{Fast multi-threaded VCF parser}
|
21
|
-
gem.description = %Q{Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting}
|
22
|
-
gem.email = "pjotr.public01@thebird.nl"
|
23
|
-
gem.authors = ["Pjotr Prins"]
|
24
|
-
# dependencies defined in Gemfile
|
25
|
-
end
|
26
|
-
Jeweler::RubygemsDotOrgTasks.new
|
27
|
-
|
28
|
-
# require 'rspec/core'
|
29
|
-
# require 'rspec/core/rake_task'
|
30
|
-
# RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
-
# spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
-
# end
|
33
|
-
|
34
|
-
# RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
-
# spec.pattern = 'spec/**/*_spec.rb'
|
36
|
-
# spec.rcov = true
|
37
|
-
# end
|
38
|
-
|
39
|
-
# require 'rake/testtask'
|
40
|
-
|
41
|
-
# Rake::TestTask.new do |t|
|
42
|
-
# t.pattern = "spec/*_spec.rb"
|
43
|
-
# end
|
44
|
-
|
45
6
|
require 'cucumber/rake/task'
|
46
|
-
Cucumber::Rake::Task.new(:features)
|
7
|
+
Cucumber::Rake::Task.new(:features) do |t|
|
8
|
+
# t.cucumber_opts = "--bundler false"
|
9
|
+
end
|
47
10
|
|
48
11
|
task :default => :features
|
49
12
|
|
data/TAGS
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
|
2
|
+
./bin/bio-vcf,0
|
3
|
+
|
4
|
+
./lib/bio-vcf.rb,0
|
5
|
+
|
6
|
+
./lib/bio-vcf/vcfgenotypefield.rb,1553
|
7
|
+
module BioVcf::BioVcf1,0
|
8
|
+
class VcfNucleotides::BioVcf::VcfNucleotides7,167
|
9
|
+
def initialize::BioVcf::VcfNucleotides#BioVcf::VcfNucleotides.new8,193
|
10
|
+
def []::BioVcf::VcfNucleotides#[]13,284
|
11
|
+
def to_ary::BioVcf::VcfNucleotides#to_ary27,628
|
12
|
+
def max::BioVcf::VcfNucleotides#max32,742
|
13
|
+
def min::BioVcf::VcfNucleotides#min37,856
|
14
|
+
def sum::BioVcf::VcfNucleotides#sum42,975
|
15
|
+
class VcfAltInfo::BioVcf::VcfAltInfo50,1082
|
16
|
+
def initialize::BioVcf::VcfAltInfo#BioVcf::VcfAltInfo.new51,1103
|
17
|
+
def []::BioVcf::VcfAltInfo#[]56,1194
|
18
|
+
def to_ary::BioVcf::VcfAltInfo#to_ary70,1512
|
19
|
+
def max::BioVcf::VcfAltInfo#max75,1626
|
20
|
+
def min::BioVcf::VcfAltInfo#min79,1702
|
21
|
+
def sum::BioVcf::VcfAltInfo#sum83,1783
|
22
|
+
class VcfGenotypeField::BioVcf::VcfGenotypeField88,1850
|
23
|
+
def initialize::BioVcf::VcfGenotypeField#BioVcf::VcfGenotypeField.new89,1877
|
24
|
+
def dp4::BioVcf::VcfGenotypeField#dp496,2020
|
25
|
+
def ad::BioVcf::VcfGenotypeField#ad100,2098
|
26
|
+
def pl::BioVcf::VcfGenotypeField#pl104,2174
|
27
|
+
def bcount::BioVcf::VcfGenotypeField#bcount108,2250
|
28
|
+
def bq::BioVcf::VcfGenotypeField#bq112,2343
|
29
|
+
def amq::BioVcf::VcfGenotypeField#amq116,2424
|
30
|
+
def method_missing::BioVcf::VcfGenotypeField#method_missing120,2507
|
31
|
+
class VcfGenotypeFields::BioVcf::VcfGenotypeFields130,2709
|
32
|
+
def initialize::BioVcf::VcfGenotypeFields#BioVcf::VcfGenotypeFields.new131,2737
|
33
|
+
def []::BioVcf::VcfGenotypeFields#[]141,3021
|
34
|
+
def method_missing::BioVcf::VcfGenotypeFields#method_missing145,3136
|
35
|
+
|
36
|
+
./lib/bio-vcf/vcfrdf.rb,156
|
37
|
+
module BioVcf::BioVcf1,0
|
38
|
+
module VcfRdf::BioVcf::VcfRdf5,93
|
39
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf7,112
|
40
|
+
def VcfRdf::BioVcf::VcfRdf#VcfRdf18,463
|
41
|
+
|
42
|
+
./lib/bio-vcf/vcf.rb,27
|
43
|
+
module BioVcf::BioVcf2,1
|
44
|
+
|
45
|
+
./lib/bio-vcf/vcfline.rb,118
|
46
|
+
module BioVcf::BioVcf1,0
|
47
|
+
module VcfLine::BioVcf::VcfLine2,16
|
48
|
+
def VcfLine.parse::BioVcf::VcfLine.parse5,82
|
49
|
+
|
50
|
+
./lib/bio-vcf/vcfrecord.rb,1831
|
51
|
+
module BioVcf::BioVcf1,0
|
52
|
+
class VcfRecordInfo::BioVcf::VcfRecordInfo3,17
|
53
|
+
def initialize::BioVcf::VcfRecordInfo#BioVcf::VcfRecordInfo.new4,41
|
54
|
+
def method_missing::BioVcf::VcfRecordInfo#method_missing9,163
|
55
|
+
module VcfRecordParser::BioVcf::VcfRecordParser18,329
|
56
|
+
def VcfRecordParser.get_format::BioVcf::VcfRecordParser.get_format20,397
|
57
|
+
def VcfRecordParser.get_info::BioVcf::VcfRecordParser.get_info25,517
|
58
|
+
module VcfRecordCall::BioVcf::VcfRecordCall30,592
|
59
|
+
def call_diff::BioVcf::VcfRecordCall#call_diff31,617
|
60
|
+
def call_nuc::BioVcf::VcfRecordCall#call_nuc35,705
|
61
|
+
def call_tumor_count::BioVcf::VcfRecordCall#call_tumor_count39,764
|
62
|
+
def call_tumor_relative_count::BioVcf::VcfRecordCall#call_tumor_relative_count43,833
|
63
|
+
def call_normal_count::BioVcf::VcfRecordCall#call_normal_count47,955
|
64
|
+
def index::BioVcf::VcfRecordCall#index51,1026
|
65
|
+
class VcfRecord::BioVcf::VcfRecord56,1125
|
66
|
+
attr_reader :header::BioVcf::VcfRecord#header60,1173
|
67
|
+
def initialize::BioVcf::VcfRecord#BioVcf::VcfRecord.new62,1198
|
68
|
+
def chrom::BioVcf::VcfRecord#chrom67,1292
|
69
|
+
def pos::BioVcf::VcfRecord#pos71,1332
|
70
|
+
def ids::BioVcf::VcfRecord#ids75,1384
|
71
|
+
def id::BioVcf::VcfRecord#id79,1443
|
72
|
+
def ref::BioVcf::VcfRecord#ref83,1476
|
73
|
+
def alt::BioVcf::VcfRecord#alt87,1524
|
74
|
+
def qual::BioVcf::VcfRecord#qual91,1582
|
75
|
+
def info::BioVcf::VcfRecord#info95,1636
|
76
|
+
def format::BioVcf::VcfRecord#format99,1711
|
77
|
+
def normal::BioVcf::VcfRecord#normal104,1848
|
78
|
+
def tumor::BioVcf::VcfRecord#tumor109,1997
|
79
|
+
def sample::BioVcf::VcfRecord#sample114,2134
|
80
|
+
def sample_by_name::BioVcf::VcfRecord#sample_by_name118,2227
|
81
|
+
def missing_samples?::BioVcf::VcfRecord#missing_samples?122,2283
|
82
|
+
def method_missing::BioVcf::VcfRecord#method_missing126,2341
|
83
|
+
|
84
|
+
./lib/bio-vcf/variant.rb,470
|
85
|
+
module BioVcf::BioVcf1,0
|
86
|
+
module Variant::BioVcf::Variant3,17
|
87
|
+
def Variant.diff::BioVcf::Variant.diff5,37
|
88
|
+
def Variant.threshold_diff::BioVcf::Variant.threshold_diff9,132
|
89
|
+
def Variant.relative_diff::BioVcf::Variant.relative_diff14,269
|
90
|
+
def Variant.relative_threshold_diff::BioVcf::Variant.relative_threshold_diff20,497
|
91
|
+
def Variant.index::BioVcf::Variant.index25,652
|
92
|
+
def Variant.apply_threshold::BioVcf::Variant.apply_threshold31,809
|
93
|
+
|
94
|
+
./lib/bio-vcf/vcfheader.rb,598
|
95
|
+
module BioVcf::BioVcf2,1
|
96
|
+
module VcfHeaderParser::BioVcf::VcfHeaderParser4,18
|
97
|
+
def VcfHeaderParser.get_column_names::BioVcf::VcfHeaderParser.get_column_names5,45
|
98
|
+
class VcfHeader::BioVcf::VcfHeader18,339
|
99
|
+
attr_reader :lines::BioVcf::VcfHeader#lines20,360
|
100
|
+
def initialize::BioVcf::VcfHeader#BioVcf::VcfHeader.new22,384
|
101
|
+
def add::BioVcf::VcfHeader#add26,430
|
102
|
+
def version::BioVcf::VcfHeader#version30,483
|
103
|
+
def column_names::BioVcf::VcfHeader#column_names34,578
|
104
|
+
def columns::BioVcf::VcfHeader#columns38,674
|
105
|
+
def samples::BioVcf::VcfHeader#samples42,735
|
106
|
+
|
107
|
+
./features/step_definitions/diff_count.rb,0
|
108
|
+
|
109
|
+
./features/step_definitions/bio-vcf_steps.rb,0
|
110
|
+
|
111
|
+
./features/step_definitions/somaticsniper.rb,0
|
112
|
+
|
113
|
+
./features/step_definitions/multisample.rb,0
|
114
|
+
|
115
|
+
./features/support/env.rb,0
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.0
|
1
|
+
0.9.4
|
data/bin/bio-vcf
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Author:: Pjotr Prins
|
5
5
|
# License:: MIT
|
6
6
|
#
|
7
|
-
# Copyright (C) 2014 Pjotr Prins <pjotr.prins@thebird.nl>
|
7
|
+
# Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>
|
8
8
|
|
9
9
|
USAGE = "Vcf parser"
|
10
10
|
|
@@ -15,22 +15,23 @@ VERSION_FILENAME=File.join(gempath,'VERSION')
|
|
15
15
|
version = File.new(VERSION_FILENAME).read.chomp
|
16
16
|
|
17
17
|
require 'bio-vcf'
|
18
|
+
require 'bio-vcf/pcows'
|
18
19
|
require 'optparse'
|
19
20
|
require 'timeout'
|
20
21
|
require 'fileutils'
|
21
22
|
|
22
|
-
# Uncomment when using the bio-logger
|
23
|
+
# Uncomment when using the bio-logger
|
23
24
|
# require 'bio-logger'
|
24
25
|
# log = Bio::Log::LoggerPlus.new 'vcf'
|
25
|
-
# log.outputters = Bio::Log::Outputter.stderr
|
26
|
+
# log.outputters = Bio::Log::Outputter.stderr
|
26
27
|
# Bio::Log::CLI.logger('stderr')
|
27
28
|
# Bio::Log::CLI.trace('info')
|
28
29
|
|
29
|
-
options = { show_help: false, source: 'https://github.com/
|
30
|
+
options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
|
30
31
|
opts = OptionParser.new do |o|
|
31
32
|
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf"
|
32
33
|
|
33
|
-
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
|
+
o.on('-i','--ignore-missing', 'Ignore missing data') do
|
34
35
|
options[:ignore_missing] = true
|
35
36
|
end
|
36
37
|
o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
|
@@ -57,6 +58,13 @@ opts = OptionParser.new do |o|
|
|
57
58
|
o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
|
58
59
|
options[:efilter_samples] = l
|
59
60
|
end
|
61
|
+
o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
|
62
|
+
options[:add_filter] = name
|
63
|
+
end
|
64
|
+
|
65
|
+
o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
|
66
|
+
options[:bed] = bed
|
67
|
+
end
|
60
68
|
|
61
69
|
o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
|
62
70
|
options[:eval] = cmd
|
@@ -64,6 +72,9 @@ opts = OptionParser.new do |o|
|
|
64
72
|
o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
|
65
73
|
options[:eval_once] = true
|
66
74
|
options[:eval] = cmd
|
75
|
+
# options[:num_threads] = 1
|
76
|
+
# options[:thread_lines] = 1
|
77
|
+
options[:skip_header] = true
|
67
78
|
end
|
68
79
|
o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
|
69
80
|
options[:seval] = cmd
|
@@ -80,7 +91,7 @@ opts = OptionParser.new do |o|
|
|
80
91
|
options[:rdf] = true
|
81
92
|
options[:skip_header] = true
|
82
93
|
end
|
83
|
-
o.on("--num-threads [num]", Integer, "Multi-core version (default
|
94
|
+
o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
|
84
95
|
options[:num_threads] = i
|
85
96
|
end
|
86
97
|
o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
|
@@ -92,8 +103,8 @@ opts = OptionParser.new do |o|
|
|
92
103
|
o.on_tail("--tags list", String, "Add tags") do |s|
|
93
104
|
options[:tags] = s
|
94
105
|
end
|
95
|
-
|
96
|
-
o.on("--skip-header", "Do not output VCF header info") do
|
106
|
+
|
107
|
+
o.on("--skip-header", "Do not output VCF header info") do
|
97
108
|
options[:skip_header] = true
|
98
109
|
end
|
99
110
|
|
@@ -108,9 +119,16 @@ opts = OptionParser.new do |o|
|
|
108
119
|
options[:template] = s
|
109
120
|
options[:skip_header] = true
|
110
121
|
end
|
111
|
-
|
112
|
-
|
113
|
-
|
122
|
+
|
123
|
+
o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
|
124
|
+
options[:tag] = true
|
125
|
+
end
|
126
|
+
|
127
|
+
o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
|
128
|
+
options[:timeout] = i
|
129
|
+
end
|
130
|
+
|
131
|
+
# Uncomment the following when using the bio-logger
|
114
132
|
# o.separator ""
|
115
133
|
# o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
116
134
|
# Bio::Log::CLI.logger(name)
|
@@ -119,7 +137,16 @@ opts = OptionParser.new do |o|
|
|
119
137
|
# o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
|
120
138
|
# Bio::Log::CLI.trace(s)
|
121
139
|
# end
|
122
|
-
#
|
140
|
+
#
|
141
|
+
o.on("--names", "Output sample names") do |q|
|
142
|
+
options[:quiet] = true
|
143
|
+
options[:num_threads] = nil
|
144
|
+
options[:eval_once] = true
|
145
|
+
options[:eval] = 'header.samples.join("\t")'
|
146
|
+
# options[:num_threads] = 1
|
147
|
+
# options[:thread_lines] = 1
|
148
|
+
options[:skip_header] = true
|
149
|
+
end
|
123
150
|
o.on("--statistics", "Output statistics") do |q|
|
124
151
|
options[:statistics] = true
|
125
152
|
options[:num_threads] = nil
|
@@ -128,14 +155,15 @@ opts = OptionParser.new do |o|
|
|
128
155
|
# Bio::Log::CLI.trace('error')
|
129
156
|
options[:quiet] = true
|
130
157
|
end
|
131
|
-
|
158
|
+
|
132
159
|
o.on("-v", "--verbose", "Run verbosely") do |v|
|
133
160
|
options[:verbose] = true
|
134
161
|
end
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
162
|
+
|
163
|
+
o.on("--debug", "Show debug messages and keep intermediate output") do |v|
|
164
|
+
# Bio::Log::CLI.trace('debug')
|
165
|
+
options[:debug] = true
|
166
|
+
end
|
139
167
|
|
140
168
|
o.separator ""
|
141
169
|
o.on_tail('-h', '--help', 'display this help and exit') do
|
@@ -145,10 +173,12 @@ end
|
|
145
173
|
|
146
174
|
opts.parse!(ARGV)
|
147
175
|
|
148
|
-
|
176
|
+
BIOVCF_VERSION=version
|
177
|
+
BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
|
178
|
+
$stderr.print BIOVCF_BANNER if !options[:quiet]
|
149
179
|
|
150
|
-
if options[:show_help]
|
151
|
-
print opts
|
180
|
+
if options[:show_help]
|
181
|
+
print opts
|
152
182
|
print USAGE
|
153
183
|
exit 1
|
154
184
|
end
|
@@ -161,18 +191,11 @@ $stderr.print "Options: ",options,"\n" if !options[:quiet]
|
|
161
191
|
|
162
192
|
if options[:template]
|
163
193
|
include BioVcf::RDF
|
194
|
+
require 'bio-vcf/template'
|
164
195
|
fn = options[:template]
|
165
196
|
raise "No template #{fn}!" if not File.exist?(fn)
|
166
|
-
template = ERB.new(File.read(fn))
|
167
|
-
|
168
|
-
|
169
|
-
if options[:num_threads] != 1
|
170
|
-
begin
|
171
|
-
require 'parallel'
|
172
|
-
rescue LoadError
|
173
|
-
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
174
|
-
options[:num_threads] = 1
|
175
|
-
end
|
197
|
+
# template = ERB.new(File.read(fn))
|
198
|
+
template = Bio::Template.new(fn)
|
176
199
|
end
|
177
200
|
|
178
201
|
stats = nil
|
@@ -185,6 +208,8 @@ end
|
|
185
208
|
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
|
186
209
|
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
|
187
210
|
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
|
211
|
+
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
|
212
|
+
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]
|
188
213
|
|
189
214
|
if options[:samples]
|
190
215
|
samples = options[:samples].map { |s| s.to_i }
|
@@ -192,13 +217,14 @@ end
|
|
192
217
|
|
193
218
|
include BioVcf
|
194
219
|
|
195
|
-
# Parse the header section of a VCF file
|
220
|
+
# Parse the header section of a VCF file (chomping STDIN)
|
196
221
|
def parse_header line, samples, options
|
197
|
-
header = VcfHeader.new
|
222
|
+
header = VcfHeader.new(options[:debug])
|
198
223
|
header.add(line)
|
199
224
|
print line if not options[:skip_header]
|
200
225
|
STDIN.each_line do | headerline |
|
201
226
|
if headerline !~ /^#/
|
227
|
+
# If no records in VCF, we never get here
|
202
228
|
line = headerline
|
203
229
|
break # end of header
|
204
230
|
end
|
@@ -206,12 +232,19 @@ def parse_header line, samples, options
|
|
206
232
|
if not options[:skip_header]
|
207
233
|
if headerline =~ /^#CHR/
|
208
234
|
# The header before actual data contains the sample names, first inject the BioVcf meta information
|
209
|
-
print header.tag(options),"\n" if not options[:skip_header]
|
235
|
+
print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
|
236
|
+
# Then the additional filter(s)
|
237
|
+
# ##FILTER=<ID=LowQual,Description="Low quality">
|
238
|
+
add_filter = options[:add_filter]
|
239
|
+
if add_filter
|
240
|
+
print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
|
241
|
+
end
|
242
|
+
|
210
243
|
selected = header.column_names
|
211
244
|
if samples
|
212
245
|
newfields = selected[0..8]
|
213
246
|
samples.each do |s|
|
214
|
-
newfields << selected[s+9]
|
247
|
+
newfields << selected[s+9]
|
215
248
|
end
|
216
249
|
selected = newfields
|
217
250
|
end
|
@@ -223,11 +256,15 @@ def parse_header line, samples, options
|
|
223
256
|
end
|
224
257
|
print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
|
225
258
|
VcfRdf::header if options[:rdf]
|
259
|
+
if line =~ /^#/
|
260
|
+
# We did not read a record
|
261
|
+
line = nil
|
262
|
+
end
|
226
263
|
return header,line
|
227
264
|
end
|
228
265
|
|
229
|
-
# Parse a VCF line and return the result as a string
|
230
|
-
def parse_line line,header,options,samples,template,stats=nil
|
266
|
+
# Parse a VCF line and return the (template) result as a string buffer
|
267
|
+
def parse_line line,header,options,bedfilter,samples,template,stats=nil
|
231
268
|
fields = VcfLine.parse(line)
|
232
269
|
rec = VcfRecord.new(fields,header)
|
233
270
|
r = rec # alias
|
@@ -236,9 +273,11 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
236
273
|
sfilter = options[:sfilter]
|
237
274
|
efilter = options[:efilter]
|
238
275
|
ifilter = options[:ifilter]
|
276
|
+
add_filter = options[:add_filter] # contains a filter name (soft filter)
|
239
277
|
seval = options[:seval]
|
240
278
|
ignore_missing = options[:ignore_missing]
|
241
279
|
quiet = options[:quiet]
|
280
|
+
set_filter_field = nil
|
242
281
|
|
243
282
|
if sfilter or efilter or ifilter or seval
|
244
283
|
# check for samples
|
@@ -248,15 +287,32 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
248
287
|
|
249
288
|
# --------------------------
|
250
289
|
# Filtering and set analysis
|
251
|
-
|
252
|
-
|
253
|
-
|
290
|
+
if bedfilter
|
291
|
+
bed = bedfilter.contains(rec)
|
292
|
+
return if not bed
|
293
|
+
end
|
294
|
+
|
295
|
+
skip = lambda { |&m|
|
296
|
+
matched = m.call
|
297
|
+
if add_filter
|
298
|
+
set_filter_field = true if matched
|
299
|
+
false # always continue processing with an add-filter
|
300
|
+
else
|
301
|
+
not matched
|
302
|
+
end
|
303
|
+
}
|
304
|
+
|
305
|
+
if filter
|
306
|
+
return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
307
|
+
end
|
308
|
+
|
309
|
+
if sfilter # sample 'or' filter
|
254
310
|
rec.each_sample(options[:sfilter_samples]) do | sample |
|
255
|
-
return if
|
311
|
+
return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
256
312
|
end
|
257
313
|
end
|
258
314
|
|
259
|
-
if ifilter
|
315
|
+
if ifilter # include sample filter
|
260
316
|
found = false
|
261
317
|
rec.each_sample(options[:ifilter_samples]) do | sample |
|
262
318
|
if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
|
@@ -265,12 +321,12 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
265
321
|
end
|
266
322
|
end
|
267
323
|
# Skip if there are no matches
|
268
|
-
return if
|
324
|
+
return if skip.call {found}
|
269
325
|
end
|
270
326
|
|
271
|
-
if efilter
|
327
|
+
if efilter # exclude sample filter
|
272
328
|
rec.each_sample(options[:efilter_samples]) do | sample |
|
273
|
-
return if
|
329
|
+
return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
|
274
330
|
end
|
275
331
|
end
|
276
332
|
|
@@ -278,18 +334,21 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
278
334
|
|
279
335
|
# -----------------------------
|
280
336
|
# From here on decide on output
|
337
|
+
|
338
|
+
rec.add_to_filter_field(add_filter) if set_filter_field
|
339
|
+
|
281
340
|
if samples
|
282
341
|
# Select certain samples for output
|
283
342
|
newfields = fields[0..8]
|
284
343
|
samples.each do |s|
|
285
|
-
newfields << fields[s+9]
|
344
|
+
newfields << fields[s+9]
|
286
345
|
end
|
287
346
|
fields = newfields
|
288
347
|
end
|
289
348
|
if options[:eval] or seval
|
290
349
|
begin
|
291
350
|
results = nil # result string
|
292
|
-
if options[:eval]
|
351
|
+
if options[:eval]
|
293
352
|
res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
|
294
353
|
results = res if res
|
295
354
|
end
|
@@ -307,23 +366,22 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
307
366
|
exit 1
|
308
367
|
end
|
309
368
|
return results.to_s+"\n" if results
|
310
|
-
exit(1) if options[:eval_once] # <--- can this be reached?
|
311
369
|
else
|
312
370
|
if options[:rdf]
|
313
371
|
# Output Turtle RDF
|
314
372
|
VcfRdf::record(options[:id],rec,options[:tags])
|
315
373
|
elsif options[:template]
|
316
|
-
#
|
374
|
+
# Use ERB template
|
317
375
|
begin
|
318
|
-
template.
|
376
|
+
template.body(binding)
|
319
377
|
rescue Exception => e
|
320
378
|
$stderr.print e,": ",fields,"\n"
|
321
379
|
$stderr.print e.backtrace.inspect if options[:verbose]
|
322
|
-
raise
|
380
|
+
raise
|
323
381
|
end
|
324
382
|
elsif options[:rewrite]
|
325
383
|
# Default behaviour prints VCF line, but rewrite info
|
326
|
-
eval(options[:rewrite])
|
384
|
+
eval(options[:rewrite])
|
327
385
|
(fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
|
328
386
|
elsif stats
|
329
387
|
# do nothing
|
@@ -334,88 +392,97 @@ def parse_line line,header,options,samples,template,stats=nil
|
|
334
392
|
end
|
335
393
|
end
|
336
394
|
|
395
|
+
CHUNK_SIZE = options[:thread_lines]
|
396
|
+
|
397
|
+
pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
|
398
|
+
options[:quiet],options[:debug])
|
337
399
|
header = nil
|
338
400
|
header_output_completed = false
|
339
|
-
|
340
|
-
CHUNK_SIZE = options[:thread_lines]
|
341
|
-
CHUNK_NUM = (NUM_THREADS && NUM_THREADS>6 ? NUM_THREADS*4 : 24)
|
342
|
-
chunks = []
|
343
|
-
lines = []
|
401
|
+
chunk_lines = []
|
344
402
|
line_number=0
|
345
403
|
|
404
|
+
if options[:bed]
|
405
|
+
bedfilter = BedFilter.new(options[:bed])
|
406
|
+
end
|
407
|
+
|
346
408
|
begin
|
409
|
+
# Define linear parser function (going through one chunk)
|
347
410
|
process = lambda { | lines |
|
348
411
|
res = []
|
349
412
|
lines.each do | line |
|
350
|
-
res << parse_line(line,header,options,samples,template,stats)
|
413
|
+
res << parse_line(line,header,options,bedfilter,samples,template,stats)
|
351
414
|
end
|
352
415
|
res
|
353
416
|
}
|
354
|
-
output = lambda { |collection|
|
355
|
-
collection.each do | result |
|
356
|
-
result.each { |line| print line }
|
357
|
-
end
|
358
|
-
} # end output
|
359
417
|
|
360
418
|
# ---- Main loop
|
361
419
|
STDIN.each_line do | line |
|
362
420
|
line_number += 1
|
363
|
-
|
421
|
+
|
422
|
+
# ---- Skip embedded headers down the line...
|
364
423
|
next if header_output_completed and line =~ /^#/
|
365
|
-
if line =~ /^##fileformat=/ or line =~ /^#CHR/
|
366
|
-
header,line = parse_header(line,samples,options)
|
367
|
-
end
|
368
|
-
next if line =~ /^##/ # empty file
|
369
|
-
header_output_completed = true
|
370
|
-
if not options[:efilter_samples] and options[:ifilter_samples]
|
371
|
-
# Create exclude set as a complement of include set
|
372
|
-
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
373
|
-
end
|
374
424
|
|
375
|
-
# ---- In
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
425
|
+
# ---- In the following section header information is handled -
|
426
|
+
# this only happens once.
|
427
|
+
|
428
|
+
# ---- Parse the header lines (chomps from STDIN)
|
429
|
+
# and returns header info and the current line
|
430
|
+
if line =~ /^#/
|
431
|
+
header, line = parse_header(line,samples,options)
|
432
|
+
if line.nil?
|
433
|
+
# No line after header, so there are no records to process
|
434
|
+
break
|
382
435
|
end
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
chunks = []
|
392
|
-
# Output is forked to a separate process too
|
393
|
-
fork do
|
394
|
-
output.call out
|
395
|
-
STDOUT.flush
|
396
|
-
STDOUT.close
|
397
|
-
exit 0
|
398
|
-
end
|
399
|
-
end
|
400
|
-
lines = []
|
436
|
+
end
|
437
|
+
# p [line_number,line]
|
438
|
+
# ---- After the header continue processing
|
439
|
+
if not header_output_completed
|
440
|
+
# one-time post-header processing
|
441
|
+
if not options[:efilter_samples] and options[:ifilter_samples]
|
442
|
+
# Create exclude set as a complement of include set
|
443
|
+
options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
|
401
444
|
end
|
445
|
+
print template.header(binding) if template
|
446
|
+
header_output_completed = true
|
447
|
+
end
|
448
|
+
|
449
|
+
if options[:eval_once]
|
450
|
+
# this happens if we only want one line evaluated - say to get
|
451
|
+
# the number of samples
|
452
|
+
print parse_line(line,header,options,bedfilter,samples,template,stats)
|
453
|
+
exit 0
|
454
|
+
end
|
455
|
+
|
456
|
+
# ---- Lines are collected in one buffer and the lines buffer
|
457
|
+
# is added to the chunks list (for the threads)
|
458
|
+
chunk_lines << line
|
459
|
+
|
460
|
+
# ---- In the following section the VCF lines are parsed by chunks
|
461
|
+
# The chunks may go into different threads
|
462
|
+
|
463
|
+
if chunk_lines.size >= CHUNK_SIZE
|
464
|
+
# ---- process one chunk
|
465
|
+
$stderr.print '.' if not options[:quiet]
|
466
|
+
pcows.wait_for_worker_slot()
|
467
|
+
pcows.submit_worker(process,chunk_lines)
|
468
|
+
pcows.process_output()
|
469
|
+
|
470
|
+
chunk_lines = []
|
402
471
|
end
|
403
472
|
end
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
output.call Parallel.map(chunks, :in_processes => NUM_THREADS) { | chunk |
|
410
|
-
process.call(chunk)
|
411
|
-
}
|
412
|
-
end
|
473
|
+
pcows.submit_final_worker(process,chunk_lines)
|
474
|
+
pcows.wait_for_workers()
|
475
|
+
pcows.process_remaining_output()
|
476
|
+
|
477
|
+
print template.footer(binding) if template
|
413
478
|
stats.print if stats
|
414
479
|
|
415
480
|
rescue Exception => e
|
416
|
-
|
417
|
-
|
481
|
+
if e.message != 'exit'
|
482
|
+
$stderr.print "ERROR: "
|
483
|
+
$stderr.print e.message,"\n"
|
484
|
+
end
|
485
|
+
pcows.cleanup()
|
418
486
|
raise if options[:verbose]
|
419
487
|
exit 1
|
420
488
|
end
|
421
|
-
|