bio-vcf 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -3
- data/Gemfile.lock +44 -0
- data/README.md +151 -28
- data/VERSION +1 -1
- data/bin/bio-vcf +47 -15
- data/bio-vcf.gemspec +4 -21
- data/features/#cli.feature# +71 -0
- data/features/cli.feature +3 -3
- data/features/filter.feature +12 -0
- data/features/filter.feature~ +35 -0
- data/features/somaticsniper.feature +2 -0
- data/features/step_definitions/cli-feature.rb +5 -0
- data/features/step_definitions/somaticsniper.rb +8 -0
- data/lib/bio-vcf/pcows.rb +123 -36
- data/lib/bio-vcf/vcfgenotypefield.rb +1 -1
- data/lib/bio-vcf/vcfrecord.rb +21 -0
- data/lib/bio-vcf/vcfsample.rb +13 -0
- data/test/data/regression/eval_once-stderr.new +2 -1
- data/test/data/regression/eval_r.info.dp-stderr.new +8 -4
- data/test/data/regression/ifilter_s.dp-stderr.new +8 -4
- data/test/data/regression/pass1-stderr.new +8 -0
- data/test/data/regression/pass1.new +88 -0
- data/test/data/regression/pass1.ref +88 -0
- data/test/data/regression/r.info.dp-stderr.new +8 -4
- data/test/data/regression/rewrite.info.sample-stderr.new +8 -4
- data/test/data/regression/s.dp-stderr.new +8 -4
- data/test/data/regression/seval_s.dp-stderr.new +8 -4
- data/test/data/regression/sfilter_seval_s.dp-stderr.new +8 -4
- data/test/data/regression/thread4-stderr.new +8 -4
- data/test/data/regression/thread4_4-stderr.new +44 -15
- data/test/data/regression/vcf2json_full_header-stderr.new +8 -4
- data/test/data/regression/vcf2json_use_meta-stderr.new +8 -4
- data/test/stress/stress_test.sh +15 -0
- data/test/stress/stress_test.sh~ +8 -0
- metadata +14 -5
data/bio-vcf.gemspec
CHANGED
@@ -3,12 +3,12 @@
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "bio-vcf"
|
6
|
-
s.version =
|
6
|
+
s.version = File.read("VERSION")
|
7
7
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
9
9
|
s.authors = ["Pjotr Prins"]
|
10
10
|
# s.date = "2015-12-28"
|
11
|
-
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting"
|
11
|
+
s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
|
12
12
|
s.email = "pjotr.public01@thebird.nl"
|
13
13
|
s.executables = ["bio-vcf"]
|
14
14
|
s.extra_rdoc_files = [
|
@@ -35,25 +35,8 @@ Gem::Specification.new do |s|
|
|
35
35
|
s.licenses = ["MIT"]
|
36
36
|
s.require_paths = ["lib"]
|
37
37
|
s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
|
38
|
-
s.rubygems_version = "2.0.3"
|
39
|
-
s.summary = "Fast multi-threaded VCF parser"
|
38
|
+
# s.rubygems_version = "2.0.3"
|
39
|
+
s.summary = "Fast multi-purpose multi-threaded VCF parser"
|
40
40
|
|
41
|
-
# if s.respond_to? :specification_version then
|
42
|
-
# s.specification_version = 4
|
43
|
-
|
44
|
-
# if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
-
# s.add_development_dependency(%q<rspec>, [">= 2.14.0"])
|
46
|
-
# s.add_development_dependency(%q<cucumber>, [">= 1.3.11"])
|
47
|
-
# s.add_development_dependency(%q<regressiontest>, [">= 0.0.3"])
|
48
|
-
# else
|
49
|
-
# s.add_dependency(%q<rspec>, [">= 2.14.0"])
|
50
|
-
# s.add_dependency(%q<cucumber>, [">= 1.3.11"])
|
51
|
-
# s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
|
52
|
-
# end
|
53
|
-
# else
|
54
|
-
# s.add_dependency(%q<rspec>, [">= 2.14.0"])
|
55
|
-
# s.add_dependency(%q<cucumber>, [">= 1.3.11"])
|
56
|
-
# s.add_dependency(%q<regressiontest>, [">= 0.0.3"])
|
57
|
-
# end
|
58
41
|
end
|
59
42
|
|
@@ -0,0 +1,71 @@
|
|
1
|
+
@cli
|
2
|
+
Feature: Command-line interface (CLI)
|
3
|
+
|
4
|
+
bio-vcf has a powerful command line interface. Here we regression test features.
|
5
|
+
|
6
|
+
Scenario: Test the info filter using dp
|
7
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
8
|
+
When I execute "./bin/bio-vcf -i --filter 'r.info.dp>100'"
|
9
|
+
Then I expect the named output to match the named output "r.info.dp"
|
10
|
+
|
11
|
+
Scenario: Test the info filter using dp and threads
|
12
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
13
|
+
When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
|
14
|
+
Then I expect the named output to match the named output "thread4"
|
15
|
+
|
16
|
+
Scenario: Test the info filter using dp and threads with lines
|
17
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
18
|
+
When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
|
19
|
+
Then I expect the named output to match the named output "thread4_4"
|
20
|
+
|
21
|
+
Scenario: Test the sample filter using dp
|
22
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
23
|
+
When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'"
|
24
|
+
Then I expect the named output to match the named output "s.dp"
|
25
|
+
|
26
|
+
Scenario: Test the include sample filter using dp
|
27
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
28
|
+
When I execute "./bin/bio-vcf -i --ifilter 's.dp>100' --seval s.dp"
|
29
|
+
Then I expect the named output to match the named output "ifilter_s.dp"
|
30
|
+
|
31
|
+
Scenario: Test the info eval using dp
|
32
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
33
|
+
When I execute "./bin/bio-vcf -i --eval 'r.info.dp'"
|
34
|
+
Then I expect the named output to match the named output "eval_r.info.dp"
|
35
|
+
|
36
|
+
Scenario: Test the sample eval using dp
|
37
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
38
|
+
When I execute "./bin/bio-vcf -i --seval 's.dp'"
|
39
|
+
Then I expect the named output to match the named output "seval_s.dp"
|
40
|
+
|
41
|
+
Scenario: Test the sample filter + eval using dp
|
42
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
43
|
+
When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
|
44
|
+
Then I expect the named output to match the named output "sfilter_seval_s.dp"
|
45
|
+
|
46
|
+
Scenario: Rewrite an info field
|
47
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
48
|
+
When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
|
49
|
+
Then I expect the named output to match the named output "rewrite.info.sample"
|
50
|
+
|
51
|
+
Scenario: Test eval-once
|
52
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
53
|
+
When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
|
54
|
+
Then I expect the named output to match the named output "eval_once"
|
55
|
+
|
56
|
+
Scenario: Test JSON output with header meta data
|
57
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
58
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
|
59
|
+
Then I expect the named output to match the named output "vcf2json_full_header"
|
60
|
+
|
61
|
+
Scenario: Test JSON output with header meta data and query samples
|
62
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
63
|
+
When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
|
64
|
+
Then I expect the named output to match the named output "vcf2json_use_meta"
|
65
|
+
|
66
|
+
Scenario: Test deadlock on failed filter with threads
|
67
|
+
Given I have input file(s) named "test/data/input/multisample.vcf"
|
68
|
+
When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
69
|
+
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
70
|
+
|
71
|
+
|
data/features/cli.feature
CHANGED
@@ -11,12 +11,12 @@ Feature: Command-line interface (CLI)
|
|
11
11
|
Scenario: Test the info filter using dp and threads
|
12
12
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
13
13
|
When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
|
14
|
-
Then I expect the named output to match the named output "thread4"
|
14
|
+
Then I expect the named output to match the named output "thread4" in under 30 seconds
|
15
15
|
|
16
16
|
Scenario: Test the info filter using dp and threads with lines
|
17
17
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
18
18
|
When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
|
19
|
-
Then I expect the named output to match the named output "thread4_4"
|
19
|
+
Then I expect the named output to match the named output "thread4_4" in under 30 seconds
|
20
20
|
|
21
21
|
Scenario: Test the sample filter using dp
|
22
22
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
@@ -65,7 +65,7 @@ Feature: Command-line interface (CLI)
|
|
65
65
|
|
66
66
|
Scenario: Test deadlock on failed filter with threads
|
67
67
|
Given I have input file(s) named "test/data/input/multisample.vcf"
|
68
|
-
When I execute "./bin/bio-vcf -q --timeout
|
68
|
+
When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
|
69
69
|
Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
|
70
70
|
|
71
71
|
|
@@ -0,0 +1,12 @@
|
|
1
|
+
@filter
|
2
|
+
Feature: Adding filters
|
3
|
+
|
4
|
+
bio-vcf can add soft filters. Rather than removing failing items we can
|
5
|
+
inject filter state into the FILTER field. To add state such as PASS or
|
6
|
+
LowDepth simply use a filter and the --add-filter switch. If a filter already
|
7
|
+
has state, the new one is appended with a semicolon.
|
8
|
+
|
9
|
+
Scenario: Test adding a PASS filter using normal and tumor dp
|
10
|
+
Given I have input file(s) named "test/data/input/somaticsniper.vcf"
|
11
|
+
When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
|
12
|
+
Then I expect the named output to match the named output "pass1"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@meta
|
2
|
+
Feature: Parsing VCF meta information from the header
|
3
|
+
|
4
|
+
Take a header and parse that information as defined by the VCF standard.
|
5
|
+
|
6
|
+
Scenario: When parsing a header line
|
7
|
+
|
8
|
+
Given the VCF header lines
|
9
|
+
"""
|
10
|
+
##fileformat=VCFv4.1
|
11
|
+
##fileDate=20140121
|
12
|
+
##phasing=none
|
13
|
+
##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
|
14
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
15
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
|
16
|
+
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
|
17
|
+
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
|
18
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
|
19
|
+
"""
|
20
|
+
When I parse the VCF header
|
21
|
+
Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
|
22
|
+
And I expect vcf.fileformat to be "VCFv4.1"
|
23
|
+
And I expect vcf.fileDate to be "20140121"
|
24
|
+
And I expect vcf.field['fileDate'] to be "20140121"
|
25
|
+
And I expect vcf.phasing to be "none"
|
26
|
+
And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
|
27
|
+
And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
|
28
|
+
And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
|
29
|
+
And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
|
30
|
+
And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}'
|
31
|
+
And I expect vcf.meta to contain all header meta information
|
32
|
+
|
33
|
+
Scenario: When parsing the header of somatic_sniper.vcf
|
34
|
+
|
35
|
+
Do something
|
@@ -46,6 +46,8 @@ Feature: VCF for Somatic Sniper
|
|
46
46
|
And I expect rec.tumor.amq.to_ary to be [37,37]
|
47
47
|
And I expect rec.tumor.mq to be 37
|
48
48
|
And I expect rec.tumor.ss to be 2
|
49
|
+
And I expect rec.tumor.ssc to be 33
|
50
|
+
And I expect rec.normal.ssc to be nil
|
49
51
|
# The following are additional functions
|
50
52
|
And I expect rec.call_diff to be [-4,2,-2,0]
|
51
53
|
And I expect rec.call_nuc to be "C"
|
@@ -11,6 +11,11 @@ Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
|
|
11
11
|
RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
|
12
12
|
end
|
13
13
|
|
14
|
+
Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
|
15
|
+
RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
|
16
|
+
end
|
17
|
+
|
18
|
+
|
14
19
|
Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
|
15
20
|
RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
|
16
21
|
end
|
@@ -99,6 +99,14 @@ Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
|
|
99
99
|
end
|
100
100
|
|
101
101
|
|
102
|
+
Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
|
103
|
+
expect(@rec.tumor.ssc).to be 33
|
104
|
+
end
|
105
|
+
|
106
|
+
Then(/^I expect rec\.normal\.ssc to be nil$/) do
|
107
|
+
expect(@rec.normal.ssc).to be nil
|
108
|
+
end
|
109
|
+
|
102
110
|
Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
|
103
111
|
expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
|
104
112
|
end
|
data/lib/bio-vcf/pcows.rb
CHANGED
@@ -4,25 +4,31 @@ require 'tempfile'
|
|
4
4
|
|
5
5
|
class PCOWS
|
6
6
|
|
7
|
-
RUNNINGEXT = 'part'
|
7
|
+
RUNNINGEXT = 'part' # file extension
|
8
8
|
|
9
|
-
def initialize(num_threads,name=File.basename(__FILE__),timeout=180)
|
9
|
+
def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
|
10
10
|
num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default
|
11
11
|
# $stderr.print "Using ",num_threads,"threads \n"
|
12
12
|
@num_threads = num_threads
|
13
|
+
@chunk_size = chunk_size
|
13
14
|
@pid_list = []
|
14
15
|
@name = name
|
15
16
|
@timeout = timeout
|
17
|
+
@quiet = quiet
|
18
|
+
@debug = debug
|
19
|
+
if @debug
|
20
|
+
$stderr.print "PCOWS running in DEBUG MODE\n"
|
21
|
+
end
|
16
22
|
if multi_threaded
|
17
23
|
@tmpdir = Dir::mktmpdir(@name+'_')
|
18
24
|
end
|
19
25
|
@last_output = 0 # counter
|
20
|
-
@output_locked =
|
26
|
+
@output_locked = false
|
21
27
|
end
|
22
28
|
|
23
|
-
# Feed the worker func and state to COWS. Note that func is a
|
24
|
-
# closure so it can pick up surrounding scope at invocation
|
25
|
-
# addition to the data captured in 'state'.
|
29
|
+
# Feed the worker 'func and state' to COWS. Note that func is a
|
30
|
+
# lambda closure so it can pick up surrounding scope at invocation
|
31
|
+
# in addition to the data captured in 'state'.
|
26
32
|
|
27
33
|
def submit_worker(func,state)
|
28
34
|
pid = nil
|
@@ -36,17 +42,30 @@ class PCOWS
|
|
36
42
|
func.call(state).each { | line | print line }
|
37
43
|
STDOUT.flush
|
38
44
|
STDOUT.close
|
45
|
+
# sleep 0.1
|
46
|
+
# f.flush
|
47
|
+
# f.close
|
48
|
+
# sleep 0.2 # interval to make sure we are done writing,
|
49
|
+
# otherwise there may be misses at the end of a
|
50
|
+
# block (maybe the f.close fixed it)
|
51
|
+
|
39
52
|
FileUtils::mv(tempfn,fn)
|
40
|
-
exit
|
53
|
+
exit(0)
|
41
54
|
end
|
55
|
+
Process.detach(pid)
|
42
56
|
else
|
43
|
-
# ----
|
57
|
+
# ---- Single threaded: call in main process and output immediately
|
44
58
|
func.call(state).each { | line | print line }
|
45
59
|
end
|
46
60
|
@pid_list << [ pid,count,fn ]
|
47
61
|
return true
|
48
62
|
end
|
49
63
|
|
64
|
+
def submit_final_worker(func,state)
|
65
|
+
@final_worker = true
|
66
|
+
submit_worker(func,state)
|
67
|
+
end
|
68
|
+
|
50
69
|
# Make sure no more than num_threads are running at the same time -
|
51
70
|
# this is achieved by checking the PID table and the running files
|
52
71
|
# in the tmpdir
|
@@ -54,7 +73,7 @@ class PCOWS
|
|
54
73
|
def wait_for_worker_slot()
|
55
74
|
return if single_threaded
|
56
75
|
Timeout.timeout(@timeout) do
|
57
|
-
|
76
|
+
printed_timeout_message = false
|
58
77
|
while true
|
59
78
|
# ---- count running pids
|
60
79
|
running = @pid_list.reduce(0) do | sum, info |
|
@@ -66,9 +85,11 @@ class PCOWS
|
|
66
85
|
end
|
67
86
|
end
|
68
87
|
return if running < @num_threads
|
69
|
-
|
70
|
-
|
71
|
-
|
88
|
+
if not printed_timeout_message
|
89
|
+
$stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
|
90
|
+
printed_timeout_message = true
|
91
|
+
end
|
92
|
+
sleep 0.1
|
72
93
|
end
|
73
94
|
end
|
74
95
|
end
|
@@ -81,8 +102,7 @@ class PCOWS
|
|
81
102
|
#
|
82
103
|
# In this implementation type==:by_line will call func for
|
83
104
|
# each line. Otherwise it is called once with the filename.
|
84
|
-
|
85
|
-
def process_output(func=nil,type = :by_line, blocking=false)
|
105
|
+
def process_output(func=nil,type=:by_line, blocking=false)
|
86
106
|
return if single_threaded
|
87
107
|
output = lambda { |fn|
|
88
108
|
if type == :by_line
|
@@ -92,53 +112,87 @@ class PCOWS
|
|
92
112
|
else
|
93
113
|
func.call(fn)
|
94
114
|
end
|
95
|
-
File.unlink(fn)
|
96
115
|
}
|
97
116
|
if @output_locked
|
117
|
+
# ---- is the other thread still running?
|
98
118
|
(pid,count,fn) = @output_locked
|
99
|
-
|
100
|
-
#
|
101
|
-
|
102
|
-
|
119
|
+
$stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
|
120
|
+
return if File.exist?(fn) # continue because thread still processing
|
121
|
+
# Now we should remove the .keep file
|
122
|
+
if not @debug
|
123
|
+
sleep 0.1 # give it a little time
|
124
|
+
keep = fn+'.keep'
|
125
|
+
if File.exist?(keep)
|
126
|
+
$stderr.print "Removing #{keep}\n" if not @quiet
|
127
|
+
File.unlink(keep)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
@last_output += 1 # get next one in line
|
131
|
+
@output_locked = false
|
103
132
|
end
|
133
|
+
# Still processing
|
104
134
|
if info = @pid_list[@last_output]
|
105
135
|
(pid,count,fn) = info
|
106
|
-
$stderr.print "
|
136
|
+
$stderr.print "Testing for output file ",[info],"\n" if @debug
|
107
137
|
if File.exist?(fn)
|
108
138
|
# Yes! We have the next output, create outputter
|
139
|
+
@output_locked = info
|
140
|
+
$stderr.print "Set lock on ",[info],"\n" if not @quiet
|
109
141
|
if not blocking
|
142
|
+
$stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
|
110
143
|
pid = fork do
|
111
144
|
output.call(fn)
|
145
|
+
FileUtils::mv(fn,fn+'.keep')
|
146
|
+
# if not @debug
|
147
|
+
# $stderr.print "Removing #{fn}\n" if not @quiet
|
148
|
+
# File.unlink(fn)
|
149
|
+
# else
|
150
|
+
# FileUtils::mv(fn,fn+'.keep')
|
151
|
+
# end
|
152
|
+
|
112
153
|
exit(0)
|
113
154
|
end
|
114
|
-
|
155
|
+
Process.detach(pid)
|
115
156
|
else
|
157
|
+
$stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
|
116
158
|
output.call(fn)
|
159
|
+
if not @debug
|
160
|
+
$stderr.print "Removing #{fn}\n" if not @quiet
|
161
|
+
File.unlink(fn)
|
162
|
+
else
|
163
|
+
FileUtils::mv(fn,fn+'.keep')
|
164
|
+
end
|
117
165
|
end
|
166
|
+
else
|
167
|
+
sleep 0.2
|
118
168
|
end
|
119
169
|
end
|
120
170
|
end
|
121
171
|
|
172
|
+
# Wait for a worker slot to appear. While working, the child process is writing
|
173
|
+
# a file with extension .part(ial). After completion the file is
|
174
|
+
# renamed without .part and a slot is free.
|
122
175
|
def wait_for_worker(info)
|
123
176
|
(pid,count,fn) = info
|
124
177
|
if pid_or_file_running?(pid,fn)
|
125
|
-
$stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete\n"
|
178
|
+
$stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
|
126
179
|
begin
|
127
180
|
Timeout.timeout(@timeout) do
|
128
181
|
while not File.exist?(fn) # wait for the result to appear
|
129
182
|
sleep 0.2
|
183
|
+
return if not pid_or_file_running?(pid,fn) # worker is gone
|
130
184
|
end
|
131
185
|
end
|
132
|
-
#
|
133
|
-
raise "FATAL: child process appears to have crashed #{fn}" if not File.exist?(fn)
|
134
|
-
$stderr.print "OK pid=#{pid}, processing #{fn}\n"
|
186
|
+
# Partial file should have been renamed:
|
187
|
+
raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
|
188
|
+
$stderr.print "OK pid=#{pid}, processing output of #{fn}\n" if not @quiet
|
135
189
|
rescue Timeout::Error
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
190
|
+
# Kill it to speed up exit
|
191
|
+
Process.kill 9, pid
|
192
|
+
Process.wait pid
|
193
|
+
$stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
|
194
|
+
$stderr.print "Bailing out"
|
195
|
+
raise
|
142
196
|
end
|
143
197
|
end
|
144
198
|
end
|
@@ -155,15 +209,41 @@ class PCOWS
|
|
155
209
|
|
156
210
|
def process_remaining_output()
|
157
211
|
return if single_threaded
|
212
|
+
$stderr.print "Processing remaining output...\n" if not @quiet
|
158
213
|
while @output_locked
|
159
214
|
sleep 0.2
|
160
|
-
process_output()
|
215
|
+
process_output() # keep trying
|
161
216
|
end
|
162
217
|
@pid_list.each do |info|
|
163
|
-
|
218
|
+
(pid,count,fn) = info
|
219
|
+
while pid_or_file_running?(pid,fn) or File.exist?(fn)
|
220
|
+
$stderr.print "Trying: ",[info],"\n" if not @quiet
|
221
|
+
process_output(nil,:by_line,true)
|
222
|
+
sleep 0.2
|
223
|
+
end
|
224
|
+
end
|
225
|
+
cleanup_tmpdir()
|
226
|
+
end
|
227
|
+
|
228
|
+
def cleanup()
|
229
|
+
@pid_list.each do |info|
|
230
|
+
(pid,count,fn) = info
|
231
|
+
if pid_running?(pid)
|
232
|
+
$stderr.print "Killing child ",[info],"\n"
|
233
|
+
begin
|
234
|
+
Process.kill 9, pid
|
235
|
+
Process.wait pid
|
236
|
+
rescue Errno::ENOENT
|
237
|
+
$stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet
|
238
|
+
rescue Errno::ESRCH
|
239
|
+
$stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet
|
240
|
+
end
|
241
|
+
end
|
242
|
+
File.unlink(fn) if File.exist?(fn)
|
243
|
+
tempfn = fn+'.'+RUNNINGEXT
|
244
|
+
File.unlink(tempfn) if File.exist?(tempfn)
|
164
245
|
end
|
165
|
-
|
166
|
-
Dir.unlink(@tmpdir) if @tmpdir
|
246
|
+
cleanup_tmpdir()
|
167
247
|
end
|
168
248
|
|
169
249
|
private
|
@@ -203,8 +283,15 @@ class PCOWS
|
|
203
283
|
# Count on MAC
|
204
284
|
return Integer `sysctl -n hw.ncpu 2>/dev/null`
|
205
285
|
end
|
206
|
-
$stderr.print "Could not determine number of CPUs"
|
286
|
+
$stderr.print "Could not determine number of CPUs" if not @quiet
|
207
287
|
1
|
208
288
|
end
|
209
289
|
|
290
|
+
def cleanup_tmpdir
|
291
|
+
if not @debug
|
292
|
+
$stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
|
293
|
+
Dir.unlink(@tmpdir) if @tmpdir
|
294
|
+
end
|
295
|
+
end
|
296
|
+
|
210
297
|
end
|