polyrun 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/lib/polyrun/cli/ci_shard_hooks.rb +12 -4
- data/lib/polyrun/cli/ci_shard_run_command.rb +3 -1
- data/lib/polyrun/cli/help.rb +3 -0
- data/lib/polyrun/cli/helpers.rb +22 -0
- data/lib/polyrun/cli/run_shards_parallel_children.rb +26 -34
- data/lib/polyrun/cli/run_shards_parallel_wait.rb +267 -0
- data/lib/polyrun/cli/run_shards_plan_boot_phases.rb +34 -1
- data/lib/polyrun/cli/run_shards_plan_options.rb +6 -2
- data/lib/polyrun/cli/run_shards_run.rb +7 -33
- data/lib/polyrun/cli/run_shards_worker_interrupt.rb +75 -0
- data/lib/polyrun/coverage/collector_finish.rb +3 -2
- data/lib/polyrun/coverage/formatter.rb +2 -1
- data/lib/polyrun/coverage/merge/formatters_html.rb +191 -43
- data/lib/polyrun/coverage/merge/html/_file_list.html.erb +21 -0
- data/lib/polyrun/coverage/merge/html/_file_section.html.erb +26 -0
- data/lib/polyrun/coverage/merge/html/_groups_table.html.erb +18 -0
- data/lib/polyrun/coverage/merge/html/_overview.html.erb +47 -0
- data/lib/polyrun/coverage/merge/html/report.css +147 -0
- data/lib/polyrun/coverage/merge/html/report.js +48 -0
- data/lib/polyrun/coverage/merge/html/template.html.erb +30 -0
- data/lib/polyrun/coverage/track_files.rb +9 -0
- data/lib/polyrun/hooks.rb +9 -1
- data/lib/polyrun/log.rb +16 -0
- data/lib/polyrun/minitest.rb +34 -0
- data/lib/polyrun/quick/example_runner.rb +11 -0
- data/lib/polyrun/rspec.rb +18 -0
- data/lib/polyrun/version.rb +1 -1
- data/lib/polyrun/worker_ping.rb +74 -0
- data/sig/polyrun/minitest.rbs +2 -0
- data/sig/polyrun/rspec.rbs +4 -0
- data/sig/polyrun/worker_ping.rbs +10 -0
- metadata +12 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
---
SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d4fc5867eb97f45848d6da2b7ac8d0c3906de5cd0df849f02042aaaee1e9bbf
+  data.tar.gz: a10e216d02b76c722627ab5d0c73e5fe061c118d14f356bdc620655a7e83454a
SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fd37e0e3c6f3afccb8da9dba32b0b836d6b0ddf25181b0bbc2b3d20be561d93518af54ffb929cb62511259a788ad080f9f7080252231d3554aee5ab056c3c841
+  data.tar.gz: 92cb1e5ded19005ccca5975754525857ae5f1b9561947b9a6d1b736e62c098a61e377e3465dba1add9d265277f44672911be48e27097b9c3813ef188341f3a94
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,26 @@
# CHANGELOG

+## Unreleased
+
+## 1.5.0 (2026-05-04)
+
+- Add `run-shards --worker-timeout SEC` and `POLYRUN_WORKER_TIMEOUT_SEC` (wall time per worker since spawn); stop stuck workers; record exit 124 for that shard.
+- Add `run-shards --worker-idle-timeout SEC` and `POLYRUN_WORKER_IDLE_TIMEOUT_SEC`; parent reads monotonic timestamps from `POLYRUN_WORKER_PING_FILE`; record exit 125 when the last ping is stale. Idle applies only after a valid positive ping (use wall timeout until the first ping).
+- Add `Polyrun::WorkerPing` (`ping!`, `ensure_interval_ping_thread!` when `POLYRUN_WORKER_PING_THREAD`). Add `Polyrun::RSpec.install_worker_ping!` and `Polyrun::Minitest.install_worker_ping!`; Polyrun Quick calls `WorkerPing.ping!` around each example. Parent creates ping paths under `tmp/polyrun/` and unlinks files after workers exit.
+- Poll every live shard worker together when timeouts are enabled so idle and wall limits apply to all children, not only the first waiter.
+- Split parallel worker teardown into `RunShardsParallelWait` and `RunShardsWorkerInterrupt`; keep spawn logic in `RunShardsParallelChildren`.
+- Add `Polyrun::Log.orchestration_warn`; when `POLYRUN_ORCHESTRATION_STDERR=1`, copy one line to process `$stderr` if `Log.stderr` is not the same object (custom/null sinks).
+- Wire `env_worker_timeout_sec` / `env_worker_idle_timeout_sec` into `ci-shard-run` plan context. Rescue `Interrupt` around `after_suite` in `run-shards` and `ci-shard` orchestration where suite hooks run.
+- In `Polyrun::Hooks#run_phase`, rescue `Interrupt` for Ruby DSL and shell hook phases (return 130).
+- Document worker timeout, idle ping, and `POLYRUN_ORCHESTRATION_STDERR` in `polyrun help`. Add `sig/polyrun/worker_ping.rbs` and extend `Polyrun::RSpec` / `Polyrun::Minitest` installer signatures.
+
+## 1.4.2 (2026-04-24)
+
+- Add richer HTML coverage reports: summary cards, group coverage, sortable file tables, project-relative paths, and per-file source detail.
+- Refactor HTML coverage rendering into stdlib `ERB` templates with `_*.html.erb` partials and isolated `report.css` / `report.js` assets; inline assets into final standalone report.
+- Fix `track_files` coverage scope in `Collector.finish`: keep only files matched by tracked globs, drop unrelated loaded runtime files, and add unloaded tracked files only for non-sharded runs.
+- Add coverage specs for divergent `track_under` / `track_files` configs in serial and sharded finish paths; add `TrackFiles.keep_tracked_files`.
+
## 1.4.1 (2026-04-16)

- Add `polyrun merge-failures` and `run-shards --merge-failures` / `--merge-failures-output` / `--merge-failures-format`; merge per-worker JSONL under `tmp/polyrun_failures/polyrun-failure-fragment-*.jsonl` (or RSpec JSON via `-i`). Run merge after all workers exit, including when a shard failed (`--merge-coverage` still runs only after all shards succeed).
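For orientation, the 1.5.0 idle-timeout entries above assume the test suite emits pings. A minimal wiring sketch, using only names this diff introduces (`Polyrun::RSpec.install_worker_ping!`, `Polyrun::Minitest.install_worker_ping!`, `Polyrun::WorkerPing.ping!`); the helper file locations are illustrative, not part of the package:

```ruby
# spec/spec_helper.rb (illustrative location)
require "polyrun/rspec"

# One ping per example lets `run-shards --worker-idle-timeout SEC` distinguish a
# slow-but-alive worker from a hung one (a stale ping records exit 125 for that shard).
Polyrun::RSpec.install_worker_ping!

# Minitest suites: require "polyrun/minitest" and call Polyrun::Minitest.install_worker_ping!
# Polyrun Quick workers need no wiring: the example runner calls
# Polyrun::WorkerPing.ping! around each example.
```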
@@ -49,7 +49,11 @@ module Polyrun
"POLYRUN_SHARD_TOTAL" => ctx[:workers].to_s,
"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
)
-
+begin
+hook_cfg.run_phase_if_enabled(:after_suite, env_after)
+rescue Interrupt
+Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
+end
end
end
end

@@ -109,9 +113,13 @@ module Polyrun
exit_code
ensure
if suite_started
-
-
-
+begin
+hook_cfg.run_phase_if_enabled(:after_suite, env_orch.merge(
+"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
+))
+rescue Interrupt
+Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
+end
end
end
end

@@ -66,7 +66,9 @@ module Polyrun
merge_format: nil,
config_path: config_path,
matrix_shard_index: mx,
-matrix_shard_total: mt
+matrix_shard_total: mt,
+worker_timeout_sec: env_worker_timeout_sec,
+worker_idle_timeout_sec: env_worker_idle_timeout_sec
}
end

data/lib/polyrun/cli/help.rb
CHANGED
@@ -23,6 +23,9 @@ module Polyrun
Warn if merge-coverage wall time exceeds N seconds (default 10): POLYRUN_MERGE_SLOW_WARN_SECONDS (0 disables)
Failure fragments (run-shards --merge-failures): POLYRUN_MERGE_FAILURES=1; parent sets POLYRUN_FAILURE_FRAGMENTS=1 in workers; POLYRUN_FAILURE_FRAGMENT_DIR, POLYRUN_MERGED_FAILURES_OUT, POLYRUN_MERGED_FAILURES_FORMAT; after_suite sets POLYRUN_MERGED_FAILURES_PATH when merge ran
Parallel RSpec workers: POLYRUN_WORKERS default 5, max 10 (run-shards / parallel-rspec / start); distinct from POLYRUN_SHARD_PROCESSES / ci-shard --shard-processes (local processes per CI matrix job)
+Per-worker wall timeout: run-shards --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC (max time since each worker spawn). Parent polls all live workers together. Exit 124; remaining workers stopped.
+Per-worker idle timeout: --worker-idle-timeout SEC or POLYRUN_WORKER_IDLE_TIMEOUT_SEC counts only after a successful ping timestamp (positive float in POLYRUN_WORKER_PING_FILE); empty or unreadable pings do not satisfy idle enforcement—use wall timeout until the first ping. RSpec/Minitest/Quick installers call Polyrun::WorkerPing.ping! per example/suite. Ping files live under tmp/polyrun/ (gitignored via tmp/); parent unlinks each after its worker exits. Exit 125. Optional outer cap: --worker-timeout (exit 124). Optional periodic pings: POLYRUN_WORKER_PING_THREAD=1 (POLYRUN_WORKER_PING_INTERVAL_SEC); WorkerPing.ensure_interval_ping_thread! (installers invoke it—call yourself if wiring workers without install_worker_ping!).
+If Polyrun::Log.stderr is null or redirected away, set POLYRUN_ORCHESTRATION_STDERR=1 to also print timeout/SIGINT summary lines to process stderr.
Partition timing granularity (default file): POLYRUN_TIMING_GRANULARITY=file|example (experimental per-example; see partition.timing_granularity)

commands:
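The idle-timeout help text above hinges on the ping-file contract. `Polyrun::WorkerPing` itself is not shown in this section, so the sketch below only mirrors the format the parent-side reader (`run_shards_read_worker_ping_payload`, later in this diff) accepts: a positive monotonic timestamp on the first line, optionally followed by a `path:line` location on the second. The example spec path is made up.

```ruby
# Illustrative worker-side ping write; not the gem's WorkerPing implementation.
path = ENV["POLYRUN_WORKER_PING_FILE"]
unless path.to_s.empty?
  now = Process.clock_gettime(Process::CLOCK_MONOTONIC) # monotonic, as the parent expects
  File.binwrite(path, "#{now}\nspec/models/widget_spec.rb:7\n")
end
```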
data/lib/polyrun/cli/helpers.rb
CHANGED
@@ -11,6 +11,28 @@ module Polyrun
Polyrun::Config::Resolver.env_int(name, fallback)
end

+# Per-worker wall clock (from spawn) for run-shards / ci-shard fan-out; unset or invalid means no limit.
+def env_worker_timeout_sec
+s = ENV["POLYRUN_WORKER_TIMEOUT_SEC"].to_s.strip
+return nil if s.empty?
+
+f = Float(s, exception: false)
+return nil if f.nil? || f <= 0
+
+f
+end
+
+# Max seconds without a new monotonic timestamp ping in the worker (see +polyrun/worker_ping+).
+def env_worker_idle_timeout_sec
+s = ENV["POLYRUN_WORKER_IDLE_TIMEOUT_SEC"].to_s.strip
+return nil if s.empty?
+
+f = Float(s, exception: false)
+return nil if f.nil? || f <= 0
+
+f
+end
+
def resolve_shard_index(pc)
Polyrun::Config::Resolver.resolve_shard_index(pc)
end
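Both helpers above reduce the environment value to "a positive float or nil". A quick re-statement of that rule, useful for checking what a given value resolves to; `parse_timeout` is a local name for illustration, not a gem method:

```ruby
def parse_timeout(raw)
  s = raw.to_s.strip
  return nil if s.empty?

  f = Float(s, exception: false)
  f && f.positive? ? f : nil
end

["", "0", "-3", "ninety", "90", "1.5"].each { |raw| p raw => parse_timeout(raw) }
# {""=>nil} {"0"=>nil} {"-3"=>nil} {"ninety"=>nil} {"90"=>90.0} {"1.5"=>1.5}
```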
@@ -1,7 +1,13 @@
+require "fileutils"
+
+require_relative "run_shards_parallel_wait"
+
module Polyrun
class CLI
-# Spawns
+# Spawns worker processes for +run-shards+ / +ci-shard-*+ fan-out. See {RunShardsParallelWait} for wait/timeout.
module RunShardsParallelChildren
+include RunShardsParallelWait
+
private

# @return [Array(Array, Integer, nil)] +[pids, spawn_error_code]+; +spawn_error_code+ is +nil+ when all spawns succeeded

@@ -45,9 +51,12 @@ module Polyrun
child_env = child_env.merge("POLYRUN_HOOK_ORCHESTRATOR" => "0")
child_env = hook_cfg.merge_worker_ruby_env(child_env)

+ping_path = run_shards_prepare_worker_ping!(ctx, child_env, shard)
+
Polyrun::Log.warn "polyrun run-shards: shard #{shard} → #{paths.size} file(s)" if @verbose
+spawned_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
pid = run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
-pids << {pid: pid, shard: shard}
+pids << {pid: pid, shard: shard, spawned_at: spawned_at, ping_path: ping_path}
Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.spawn shard=#{shard} child_pid=#{pid} spec_files=#{paths.size}")
Polyrun::Log.warn "polyrun run-shards: started shard #{shard} pid=#{pid} (#{paths.size} file(s))" if parallel
end

@@ -55,6 +64,21 @@ module Polyrun
end
# rubocop:enable Metrics/AbcSize

+def run_shards_prepare_worker_ping!(ctx, child_env, shard)
+idle_sec = ctx[:worker_idle_timeout_sec]
+idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
+return nil unless idle_sec
+
+dir = File.join(Dir.pwd, "tmp", "polyrun")
+FileUtils.mkdir_p(dir)
+path = File.expand_path("worker-ping-#{$$}-#{shard}.txt", dir)
+File.binwrite(path, "")
+child_env["POLYRUN_WORKER_PING_FILE"] = path
+interval = ENV["POLYRUN_WORKER_PING_INTERVAL_SEC"].to_s.strip
+child_env["POLYRUN_WORKER_PING_INTERVAL_SEC"] = interval.empty? ? "15" : interval
+path
+end
+
def run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
if hook_cfg.worker_hooks? && !Polyrun::Hooks.disabled?
Process.spawn(child_env, "sh", "-c", hook_cfg.build_worker_shell_script(cmd, paths))

@@ -62,38 +86,6 @@ module Polyrun
Process.spawn(child_env, *cmd, *paths)
end
end
-
-# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
-def run_shards_wait_all_children(pids, hook_cfg, ctx)
-workers = ctx[:workers]
-shard_results = []
-after_hook_err = 0
-Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
-pids.each do |h|
-Process.wait(h[:pid])
-exitstatus = $?.exitstatus
-ok = $?.success?
-Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
-env_after = ENV.to_h.merge(
-"POLYRUN_HOOK_ORCHESTRATOR" => "1",
-"POLYRUN_SHARD_INDEX" => h[:shard].to_s,
-"POLYRUN_SHARD_TOTAL" => workers.to_s,
-"POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
-)
-rc = hook_cfg.run_phase_if_enabled(:after_shard, env_after)
-after_hook_err = rc if rc != 0 && after_hook_err == 0
-shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
-end
-rescue Interrupt
-# Do not trap SIGINT: Process.wait raises Interrupt; a trap races and prints Interrupt + SystemExit traces.
-run_shards_shutdown_on_signal!(pids, 130)
-rescue SignalException => e
-raise unless e.signm == "SIGTERM"
-
-run_shards_shutdown_on_signal!(pids, 143)
-end
-[shard_results, after_hook_err]
-end
end
end
end
@@ -0,0 +1,267 @@
+# rubocop:disable Polyrun/FileLength, Metrics/ModuleLength -- wait loop + idle/wall flush (kept out of spawn module)
+module Polyrun
+class CLI
+# Wait, wall/idle timeout, and +after_shard+ hooks for parallel workers (+run-shards+ / +ci-shard-*+).
+module RunShardsParallelWait
+WORKER_TIMEOUT_EXIT_STATUS = 124
+WORKER_IDLE_TIMEOUT_EXIT_STATUS = 125
+
+private
+
+# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
+# rubocop:disable Metrics/AbcSize -- wait loop + timeout flush
+def run_shards_wait_all_children(pids, hook_cfg, ctx)
+workers = ctx[:workers]
+shard_results = []
+after_hook_err = 0
+timeout_sec = ctx[:worker_timeout_sec]
+timeout_sec = nil if timeout_sec.is_a?(Numeric) && timeout_sec <= 0
+idle_sec = ctx[:worker_idle_timeout_sec]
+idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
+
+Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
+if timeout_sec || idle_sec
+run_shards_wait_all_children_multiplex(
+pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err
+)
+else
+run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
+end
+rescue Interrupt
+run_shards_shutdown_on_signal!(pids, 130)
+rescue SignalException => e
+raise unless e.signm == "SIGTERM"
+
+run_shards_shutdown_on_signal!(pids, 143)
+end
+end
+# rubocop:enable Metrics/AbcSize
+
+def run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
+pids.each do |h|
+Process.wait(h[:pid])
+st = $?
+after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
+end
+[shard_results, after_hook_err]
+end
+
+# Poll every live PID each tick so wall and idle timeouts apply to all workers, not only the first in wait order.
+def run_shards_wait_all_children_multiplex(pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err)
+pending = pids.dup
+
+loop do
+pending.delete_if do |h|
+wpid = Process.wait(h[:pid], Process::WNOHANG)
+next false unless wpid == h[:pid]
+
+st = $?
+after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
+true
+end
+
+return [shard_results, after_hook_err] if pending.empty?
+
+now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+violation = run_shards_timeout_violation(pids, pending, ctx, now, timeout_sec, idle_sec)
+if violation
+reason, timed_h = violation
+others = pending.reject { |x| x[:pid] == timed_h[:pid] }
+case reason
+when :wall_timeout
+return run_shards_wait_flush_after_worker_timeout!(
+timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err
+)
+when :idle_timeout
+return run_shards_wait_flush_after_worker_idle!(
+timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err
+)
+end
+end
+
+sleep(0.2)
+end
+end
+
+# @return [(Symbol, Hash), nil] e.g. +[:wall_timeout, h]+ when a limit is exceeded
+def run_shards_timeout_violation(pids_order, pending, ctx, now, timeout_sec, idle_sec)
+pids_order.each do |h|
+next unless pending.any? { |p| p[:pid] == h[:pid] }
+
+if timeout_sec && timeout_sec > 0
+spawned_at = h[:spawned_at] || ctx[:run_t0]
+return [:wall_timeout, h] if now >= spawned_at + timeout_sec
+end
+end
+
+pids_order.each do |h|
+next unless pending.any? { |p| p[:pid] == h[:pid] }
+
+next unless idle_sec && idle_sec > 0 && h[:ping_path]
+
+t, = run_shards_read_worker_ping_payload(h[:ping_path])
+return [:idle_timeout, h] if t && (now - t) > idle_sec
+end
+
+nil
+end
+
+def run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
+exitstatus = st.exitstatus
+ok = st.success?
+Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
+rc = run_shards_invoke_after_shard!(hook_cfg, h[:shard], workers, exitstatus)
+after_hook_err = rc if rc != 0 && after_hook_err == 0
+shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
+run_shards_unlink_ping_path(h[:ping_path])
+after_hook_err
+end
+
+def run_shards_unlink_ping_path(path)
+s = path.to_s.strip
+return if s.empty?
+
+File.unlink(s) if File.file?(s)
+rescue SystemCallError
+# best-effort cleanup of tmp/polyrun/worker-ping-*.txt
+end
+
+def run_shards_read_worker_ping_time(path)
+run_shards_read_worker_ping_payload(path)[0]
+end
+
+# @return [Array(Float?, String?)] monotonic time and optional location line (path:line)
+def run_shards_read_worker_ping_payload(path)
+return [nil, nil] unless path && File.file?(path)
+
+s = File.binread(path)
+return [nil, nil] if s.nil? || s.strip.empty?
+
+time_line, rest = s.split("\n", 2)
+first = time_line.to_s.strip
+return [nil, nil] if first.empty?
+
+f = first.to_f
+t = f.positive? ? f : nil
+loc = rest.to_s.strip
+loc = nil if loc.empty?
+[t, loc]
+rescue SystemCallError
+[nil, nil]
+end
+
+def run_shards_wait_flush_after_worker_idle!(timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err)
+run_shards_warn_worker_idle!(timed_h, ctx, idle_sec)
+run_shards_force_stop_pid_status(timed_h[:pid])
+run_shards_unlink_ping_path(timed_h[:ping_path])
+rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_IDLE_TIMEOUT_EXIT_STATUS)
+after_hook_err = rc if rc != 0 && after_hook_err == 0
+shard_results << {shard: timed_h[:shard], exitstatus: WORKER_IDLE_TIMEOUT_EXIT_STATUS, success: false}
+others.each do |h2|
+st2 = run_shards_wait_or_force_stop_status(h2[:pid])
+exit2 = st2&.exitstatus
+ok2 = st2 ? st2.success? : false
+exit2 = WORKER_IDLE_TIMEOUT_EXIT_STATUS if exit2.nil?
+run_shards_unlink_ping_path(h2[:ping_path])
+rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
+after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
+shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
+end
+[shard_results, after_hook_err]
+end
+
+def run_shards_wait_flush_after_worker_timeout!(timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err)
+run_shards_warn_worker_timeout!(timed_h, ctx, timeout_sec)
+run_shards_force_stop_pid_status(timed_h[:pid])
+run_shards_unlink_ping_path(timed_h[:ping_path])
+rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_TIMEOUT_EXIT_STATUS)
+after_hook_err = rc if rc != 0 && after_hook_err == 0
+shard_results << {shard: timed_h[:shard], exitstatus: WORKER_TIMEOUT_EXIT_STATUS, success: false}
+others.each do |h2|
+st2 = run_shards_wait_or_force_stop_status(h2[:pid])
+exit2 = st2&.exitstatus
+ok2 = st2 ? st2.success? : false
+exit2 = WORKER_TIMEOUT_EXIT_STATUS if exit2.nil?
+run_shards_unlink_ping_path(h2[:ping_path])
+rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
+after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
+shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
+end
+[shard_results, after_hook_err]
+end
+
+def run_shards_invoke_after_shard!(hook_cfg, shard, workers, exitstatus)
+env_after = ENV.to_h.merge(
+"POLYRUN_HOOK_ORCHESTRATOR" => "1",
+"POLYRUN_SHARD_INDEX" => shard.to_s,
+"POLYRUN_SHARD_TOTAL" => workers.to_s,
+"POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
+)
+hook_cfg.run_phase_if_enabled(:after_shard, env_after)
+end
+
+def run_shards_warn_worker_idle!(h, ctx, idle_sec)
+paths = ctx[:plan].shard(h[:shard])
+sample = paths.first(5).join(", ")
+suffix =
+if paths.empty?
+" (no paths)"
+elsif paths.size > 5
+" (#{paths.size} files total)"
+else
+""
+end
+_t, loc = run_shards_read_worker_ping_payload(h[:ping_path])
+ping_suffix = (loc && !loc.to_s.strip.empty?) ? "; last ping #{loc.to_s.strip}" : ""
+Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER IDLE TIMEOUT after #{idle_sec}s since last per-example progress ping — shard #{h[:shard]} pid #{h[:pid]}#{ping_suffix}."
+Polyrun::Log.warn "polyrun run-shards: idle shard file sample: #{sample}#{suffix}"
+Polyrun::Log.warn "polyrun run-shards: use Polyrun::RSpec.install_worker_ping! / Polyrun::Minitest.install_worker_ping! (Polyrun Quick calls ping! each example); exit #{WORKER_IDLE_TIMEOUT_EXIT_STATUS}."
+end
+
+def run_shards_wait_or_force_stop_status(pid)
+wpid = Process.wait(pid, Process::WNOHANG)
+return $? if wpid == pid
+
+run_shards_force_stop_pid_status(pid)
+rescue Errno::ECHILD
+nil
+end
+
+def run_shards_force_stop_pid_status(pid)
+Process.kill(:KILL, pid)
+st = nil
+begin
+Process.wait(pid)
+st = $?
+rescue Errno::ECHILD
+# child already reaped
+end
+st
+rescue Errno::ESRCH
+begin
+Process.wait(pid)
+$?
+rescue Errno::ECHILD
+nil
+end
+end
+
+def run_shards_warn_worker_timeout!(h, ctx, timeout_sec)
+paths = ctx[:plan].shard(h[:shard])
+sample = paths.first(5).join(", ")
+suffix =
+if paths.empty?
+" (no paths)"
+elsif paths.size > 5
+" (#{paths.size} files total)"
+else
+""
+end
+Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER TIMEOUT after #{timeout_sec}s (wall time since worker spawn) — shard #{h[:shard]} pid #{h[:pid]}."
+Polyrun::Log.warn "polyrun run-shards: timeout shard includes: #{sample}#{suffix}"
+Polyrun::Log.warn "polyrun run-shards: override with --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC; recorded exit #{WORKER_TIMEOUT_EXIT_STATUS} for this worker."
+end
+end
+end
+end
+# rubocop:enable Polyrun/FileLength, Metrics/ModuleLength
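One way to read the flush methods above: when a wall or idle limit trips, the offending shard is recorded with exit 124 or 125, every other live worker is force-stopped (keeping its own exit status if it had already finished), and `after_shard` still runs once per shard. An illustrative result shape, with made-up shard numbers:

```ruby
# shard_results as built by run_shards_wait_flush_after_worker_timeout! (values illustrative)
[
  {shard: 1, exitstatus: 0,   success: true},  # finished before the limit was hit
  {shard: 2, exitstatus: 124, success: false}, # WORKER_TIMEOUT_EXIT_STATUS for the stuck worker
  {shard: 3, exitstatus: 124, success: false}  # still running, force-stopped with no exit status of its own
]
```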
@@ -19,6 +19,9 @@ module Polyrun
err = run_shards_validate_cmd(cmd)
return [:fail, err] if err

+run_shards_normalize_worker_timeout_option!(o)
+run_shards_normalize_worker_idle_timeout_option!(o)
+
cmd = Shellwords.split(cmd.first) if cmd.size == 1 && cmd.first.include?(" ")

[:ok, o, cmd]

@@ -118,9 +121,39 @@ module Polyrun
merge_failures: run_shards_merge_failures_flag(o, cfg),
merge_failures_output: run_shards_merge_failures_output_opt(o, cfg),
merge_failures_format: run_shards_merge_failures_format_opt(o, cfg),
-config_path: config_path
+config_path: config_path,
+worker_timeout_sec: run_shards_resolved_worker_timeout_sec(o),
+worker_idle_timeout_sec: run_shards_resolved_worker_idle_timeout_sec(o)
}
end
+
+def run_shards_normalize_worker_idle_timeout_option!(o)
+v = o[:worker_idle_timeout_sec]
+return if v.nil?
+
+o[:worker_idle_timeout_sec] = nil if v <= 0
+end
+
+def run_shards_resolved_worker_idle_timeout_sec(o)
+cli = o[:worker_idle_timeout_sec]
+return cli.to_f if cli.is_a?(Numeric) && cli > 0
+
+env_worker_idle_timeout_sec
+end
+
+def run_shards_normalize_worker_timeout_option!(o)
+v = o[:worker_timeout_sec]
+return if v.nil?
+
+o[:worker_timeout_sec] = nil if v <= 0
+end
+
+def run_shards_resolved_worker_timeout_sec(o)
+cli = o[:worker_timeout_sec]
+return cli.to_f if cli.is_a?(Numeric) && cli > 0
+
+env_worker_timeout_sec
+end
end
end
end
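The normalize/resolve pair above makes the CLI flag win over the environment variable and treats non-positive values as unset. A small re-statement for checking a configuration; `resolved_worker_timeout` is a local name for illustration, not a gem method:

```ruby
def resolved_worker_timeout(cli_value, env_value)
  return cli_value.to_f if cli_value.is_a?(Numeric) && cli_value.positive?

  env_value # what env_worker_timeout_sec resolved (nil when unset or non-positive)
end

p resolved_worker_timeout(300, 600.0) # => 300.0  --worker-timeout wins
p resolved_worker_timeout(nil, 600.0) # => 600.0  POLYRUN_WORKER_TIMEOUT_SEC fallback
p resolved_worker_timeout(0, nil)     # => nil    zero or negative means no wall limit
```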
@@ -27,7 +27,9 @@ module Polyrun
merge_format: nil,
merge_failures: false,
merge_failures_output: nil,
-merge_failures_format: nil
+merge_failures_format: nil,
+worker_timeout_sec: nil,
+worker_idle_timeout_sec: nil
}
end


@@ -39,8 +41,10 @@ module Polyrun

# rubocop:disable Metrics/AbcSize -- one argv block for run-shards
def run_shards_plan_options_register!(opts, st)
-opts.banner = "usage: polyrun run-shards [--workers N] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
+opts.banner = "usage: polyrun run-shards [--workers N] [--worker-timeout SEC] [--worker-idle-timeout SEC] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
opts.on("--workers N", Integer) { |v| st[:workers] = v }
+opts.on("--worker-timeout SEC", Float, "Max seconds per worker since spawn (also POLYRUN_WORKER_TIMEOUT_SEC); kills stuck workers (exit 124)") { |v| st[:worker_timeout_sec] = v }
+opts.on("--worker-idle-timeout SEC", Float, "Max seconds since last valid WorkerPing timestamp in POLYRUN_WORKER_PING_FILE (needs prior ping); RSpec/Minitest: install_worker_ping!; Quick: automatic; exit 125") { |v| st[:worker_idle_timeout_sec] = v }
opts.on("--strategy NAME", String) { |v| st[:strategy] = v }
opts.on("--seed VAL") { |v| st[:seed] = v }
opts.on("--paths-file PATH", String) { |v| st[:paths_file] = v }
@@ -2,6 +2,7 @@ require "shellwords"
require "rbconfig"

require_relative "run_shards_planning"
+require_relative "run_shards_worker_interrupt"
require_relative "run_shards_parallel_children"

module Polyrun

@@ -9,6 +10,7 @@ module Polyrun
# Partition + spawn workers for `polyrun run-shards` (keeps {RunShardsCommand} file small).
module RunShardsRun
include RunShardsPlanning
+include RunShardsWorkerInterrupt
include RunShardsParallelChildren

private

@@ -93,7 +95,11 @@ module Polyrun
"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s,
"POLYRUN_MERGED_FAILURES_PATH" => merged_failures_path.to_s
)
-
+begin
+hook_cfg.run_phase_if_enabled(:after_suite, env_after)
+rescue Interrupt
+Polyrun::Log.warn "polyrun run-shards: after_suite hook interrupted; workers are stopped or were not started"
+end
end
end
end

@@ -106,38 +112,6 @@ module Polyrun
Polyrun::Log.warn "polyrun run-shards: each worker prints its own summary line; the last \"N examples\" line is not a total across shards."
end

-# Best-effort worker teardown then exit. Does not return.
-def run_shards_shutdown_on_signal!(pids, code)
-run_shards_terminate_children!(pids)
-exit(code)
-rescue Interrupt
-pids.each do |h|
-Process.kill(:KILL, h[:pid])
-rescue Errno::ESRCH
-# already reaped
-end
-pids.each do |h|
-Process.wait(h[:pid])
-rescue Errno::ESRCH, Errno::ECHILD, Interrupt
-# already reaped or give up
-end
-exit(code)
-end
-
-# Send SIGTERM to each worker PID and wait so Ctrl+C / SIGTERM does not leave orphans.
-def run_shards_terminate_children!(pids)
-pids.each do |h|
-Process.kill(:TERM, h[:pid])
-rescue Errno::ESRCH
-# already reaped
-end
-pids.each do |h|
-Process.wait(h[:pid])
-rescue Errno::ESRCH, Errno::ECHILD
-# already reaped
-end
-end
-
def run_shards_merge_or_hint_coverage(ctx)
if ctx[:merge_coverage]
mo = ctx[:merge_output] || "coverage/merged.json"
|